From 7836a48ad441c03dab4a9251afb6afc317e6b9e5 Mon Sep 17 00:00:00 2001 From: Mahmoud-Emad Date: Wed, 12 Nov 2025 10:38:39 +0200 Subject: [PATCH] feat: Implement container networking and improve lifecycle - Add thread-safe network management for containers - Implement graceful and forceful container stopping - Enhance container creation and deletion logic - Refine image management and metadata handling - Add container name validation for security --- lib/virt/heropods/container.v | 249 ++++++++++++++--- lib/virt/heropods/container_create.v | 66 ++++- lib/virt/heropods/container_image.v | 73 +++-- lib/virt/heropods/heropods_factory_.v | 42 ++- lib/virt/heropods/heropods_model.v | 56 ++-- lib/virt/heropods/instructions.md | 5 - lib/virt/heropods/network.v | 384 ++++++++++++++++++++++++++ lib/virt/heropods/utils.v | 35 +++ 8 files changed, 806 insertions(+), 104 deletions(-) delete mode 100644 lib/virt/heropods/instructions.md create mode 100644 lib/virt/heropods/network.v create mode 100644 lib/virt/heropods/utils.v diff --git a/lib/virt/heropods/container.v b/lib/virt/heropods/container.v index f774a3b5..af15b8cf 100644 --- a/lib/virt/heropods/container.v +++ b/lib/virt/heropods/container.v @@ -8,25 +8,47 @@ import time import incubaid.herolib.builder import json +// Container lifecycle timeout constants +const cleanup_retry_delay_ms = 500 // Time to wait for filesystem cleanup to complete +const sigterm_timeout_ms = 5000 // Time to wait for graceful shutdown (5 seconds) +const sigkill_wait_ms = 500 // Time to wait after SIGKILL +const stop_check_interval_ms = 500 // Interval to check if container stopped + +// Container represents a running or stopped OCI container managed by crun +// +// Thread Safety: +// Container operations that interact with network configuration (start, stop, delete) +// are thread-safe because they delegate to HeroPods.network_* methods which use +// the network_mutex for protection. 
@[heap] pub struct Container { pub mut: - name string - node ?&builder.Node - tmux_pane ?&tmux.Pane - crun_config ?&crun.CrunConfig - factory &HeroPods + name string // Unique container name + node ?&builder.Node // Builder node for executing commands inside container + tmux_pane ?&tmux.Pane // Optional tmux pane for interactive access + crun_config ?&crun.CrunConfig // OCI runtime configuration + factory &HeroPods // Reference to parent HeroPods instance } -// Struct to parse JSON output of `crun state` +// CrunState represents the JSON output of `crun state` command struct CrunState { - id string - status string - pid int - bundle string - created string + id string // Container ID + status string // Container status (running, stopped, paused) + pid int // PID of container init process + bundle string // Path to OCI bundle + created string // Creation timestamp } +// Start the container +// +// This method handles the complete container startup lifecycle: +// 1. Creates the container in crun if it doesn't exist +// 2. Handles leftover state cleanup if creation fails +// 3. Starts the container process +// 4. Sets up networking (thread-safe via network_mutex) +// +// Thread Safety: +// Network setup is thread-safe via HeroPods.network_setup_container() pub fn (mut self Container) start() ! { // Check if container exists in crun container_exists := self.container_exists_in_crun()! @@ -37,7 +59,7 @@ pub fn (mut self Container) start() ! { // Try to create the container, if it fails with "File exists" error, // try to force delete any leftover state and retry crun_root := '${self.factory.base_dir}/runtime' - create_result := osal.exec( + _ := osal.exec( cmd: 'crun --root ${crun_root} create --bundle ${self.factory.base_dir}/configs/${self.name} ${self.name}' stdout: true ) or { @@ -50,7 +72,7 @@ pub fn (mut self Container) start() ! 
{ osal.exec(cmd: 'rm -rf ${crun_root}/${self.name}', stdout: false) or {} osal.exec(cmd: 'rm -rf /run/crun/${self.name}', stdout: false) or {} // Wait a moment for cleanup to complete - time.sleep(500 * time.millisecond) + time.sleep(cleanup_retry_delay_ms * time.millisecond) // Retry creation osal.exec( cmd: 'crun --root ${crun_root} create --bundle ${self.factory.base_dir}/configs/${self.name} ${self.name}' @@ -84,10 +106,34 @@ pub fn (mut self Container) start() ! { // start the container (crun start doesn't have --detach flag) crun_root := '${self.factory.base_dir}/runtime' - osal.exec(cmd: 'crun --root ${crun_root} start ${self.name}', stdout: true)! + // Start the container + osal.exec(cmd: 'crun --root ${crun_root} start ${self.name}', stdout: true) or { + return error('Failed to start container: ${err}') + } + + // Setup network for the container (thread-safe) + // If this fails, stop the container to clean up + self.setup_network() or { + console.print_stderr('Network setup failed, stopping container: ${err}') + // Use stop() method to properly clean up (kills process, cleans network, etc.) + // Ignore errors from stop since we're already in an error path + self.stop() or { console.print_debug('Failed to stop container during cleanup: ${err}') } + return error('Failed to setup network for container: ${err}') + } + console.print_green('Container ${self.name} started') } +// Stop the container gracefully (SIGTERM) or forcefully (SIGKILL) +// +// This method: +// 1. Sends SIGTERM for graceful shutdown +// 2. Waits up to sigterm_timeout_ms for graceful stop +// 3. Sends SIGKILL if still running after timeout +// 4. Cleans up network resources (thread-safe) +// +// Thread Safety: +// Network cleanup is thread-safe via HeroPods.network_cleanup_container() pub fn (mut self Container) stop() ! { status := self.status()! if status == .stopped { @@ -96,28 +142,90 @@ pub fn (mut self Container) stop() ! 
{ } crun_root := '${self.factory.base_dir}/runtime' - osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGTERM', stdout: false) or {} - time.sleep(2 * time.second) - // Force kill if still running - if self.status()! == .running { - osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGKILL', stdout: false) or {} + // Send SIGTERM for graceful shutdown + osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGTERM', stdout: false) or { + console.print_debug('Failed to send SIGTERM (container may already be stopped): ${err}') } + + // Wait up to sigterm_timeout_ms for graceful shutdown + mut attempts := 0 + max_attempts := sigterm_timeout_ms / stop_check_interval_ms + for attempts < max_attempts { + time.sleep(stop_check_interval_ms * time.millisecond) + current_status := self.status() or { + // If we can't get status, assume it's stopped (container may have been deleted) + ContainerStatus.stopped + } + if current_status == .stopped { + console.print_debug('Container ${self.name} stopped gracefully') + self.cleanup_network()! // Thread-safe network cleanup + console.print_green('Container ${self.name} stopped') + return + } + attempts++ + } + + // Force kill if still running after timeout + console.print_debug('Container ${self.name} did not stop gracefully, force killing') + osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGKILL', stdout: false) or { + console.print_debug('Failed to send SIGKILL: ${err}') + } + + // Wait for SIGKILL to take effect + time.sleep(sigkill_wait_ms * time.millisecond) + + // Verify it's actually stopped + final_status := self.status() or { + // If we can't get status, assume it's stopped (container may have been deleted) + ContainerStatus.stopped + } + if final_status != .stopped { + return error('Failed to stop container ${self.name} - status: ${final_status}') + } + + // Cleanup network resources (thread-safe) + self.cleanup_network()! 
+ console.print_green('Container ${self.name} stopped') } +// Delete the container +// +// This method: +// 1. Checks if container exists in crun +// 2. Stops the container (which cleans up network) +// 3. Deletes the container from crun +// 4. Removes from factory's container cache +// +// Thread Safety: +// Network cleanup is thread-safe via stop() -> cleanup_network() pub fn (mut self Container) delete() ! { // Check if container exists before trying to delete if !self.container_exists_in_crun()! { - console.print_debug('Container ${self.name} does not exist, nothing to delete') + console.print_debug('Container ${self.name} does not exist in crun') + // Still cleanup network resources in case they exist (thread-safe) + self.cleanup_network() or { + console.print_debug('Network cleanup failed (may not exist): ${err}') + } + // Remove from factory's container cache only after all cleanup is done + if self.name in self.factory.containers { + self.factory.containers.delete(self.name) + } + console.print_debug('Container ${self.name} removed from cache') return } + // Stop the container (this will cleanup network via stop()) self.stop()! - crun_root := '${self.factory.base_dir}/runtime' - osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {} - // Remove from factory's container cache + // Delete the container from crun + crun_root := '${self.factory.base_dir}/runtime' + osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or { + console.print_debug('Failed to delete container from crun: ${err}') + } + + // Remove from factory's container cache only after all cleanup is complete if self.name in self.factory.containers { self.factory.containers.delete(self.name) } @@ -135,24 +243,92 @@ pub fn (mut self Container) exec(cmd_ osal.Command) !string { // Use the builder node to execute inside container mut node := self.node()! 
console.print_debug('Executing command in container ${self.name}: ${cmd_.cmd}') - return node.exec(cmd: cmd_.cmd, stdout: cmd_.stdout) + + // Execute and provide better error context + return node.exec(cmd: cmd_.cmd, stdout: cmd_.stdout) or { + // Check if container still exists to provide better error message + if !self.container_exists_in_crun()! { + return error('Container ${self.name} was deleted during command execution') + } + return error('Command execution failed in container ${self.name}: ${err}') + } } pub fn (self Container) status() !ContainerStatus { crun_root := '${self.factory.base_dir}/runtime' result := osal.exec(cmd: 'crun --root ${crun_root} state ${self.name}', stdout: false) or { - return .unknown + // Container doesn't exist - this is expected in some cases (e.g., before creation) + // Check error message to distinguish between "not found" and real errors + err_msg := err.msg().to_lower() + if err_msg.contains('does not exist') || err_msg.contains('not found') + || err_msg.contains('no such') { + return .stopped + } + // Real error (permissions, crun not installed, etc.) 
- propagate it + return error('Failed to get container status: ${err}') } // Parse JSON output from crun state - state := json.decode(CrunState, result.output) or { return .unknown } - - return match state.status { - 'running' { .running } - 'stopped' { .stopped } - 'paused' { .paused } - else { .unknown } + state := json.decode(CrunState, result.output) or { + return error('Failed to parse container state JSON: ${err}') } + + status_result := match state.status { + 'running' { + ContainerStatus.running + } + 'stopped' { + ContainerStatus.stopped + } + 'paused' { + ContainerStatus.paused + } + else { + console.print_debug('Unknown container status: ${state.status}') + ContainerStatus.unknown + } + } + return status_result +} + +// Get the PID of the container's init process +pub fn (self Container) pid() !int { + crun_root := '${self.factory.base_dir}/runtime' + result := osal.exec( + cmd: 'crun --root ${crun_root} state ${self.name}' + stdout: false + )! + + // Parse JSON output from crun state + state := json.decode(CrunState, result.output)! + + if state.pid == 0 { + return error('Container ${self.name} has no PID (not running?)') + } + + return state.pid +} + +// Setup network for this container (thread-safe) +// +// Delegates to HeroPods.network_setup_container() which uses network_mutex +// for thread-safe IP allocation and network configuration. +fn (mut self Container) setup_network() ! { + // Get container PID + container_pid := self.pid()! + + // Delegate to factory's network setup (thread-safe) + mut factory := self.factory + factory.network_setup_container(self.name, container_pid)! +} + +// Cleanup network for this container (thread-safe) +// +// Delegates to HeroPods.network_cleanup_container() which uses network_mutex +// for thread-safe IP deallocation and network cleanup. +fn (mut self Container) cleanup_network() ! { + mut factory := self.factory + factory.network_cleanup_container(self.name)! 
} // Check if container exists in crun (regardless of its state) @@ -167,11 +343,12 @@ fn (self Container) container_exists_in_crun() !bool { return result.exit_code == 0 } +// ContainerStatus represents the current state of a container pub enum ContainerStatus { - running - stopped - paused - unknown + running // Container is running + stopped // Container is stopped or doesn't exist + paused // Container is paused + unknown // Unknown status (error case) } // Get CPU usage in percentage diff --git a/lib/virt/heropods/container_create.v b/lib/virt/heropods/container_create.v index c1531f43..a3a98ec4 100644 --- a/lib/virt/heropods/container_create.v +++ b/lib/virt/heropods/container_create.v @@ -6,25 +6,44 @@ import incubaid.herolib.virt.crun import incubaid.herolib.installers.virt.herorunner as herorunner_installer import os -// Updated enum to be more flexible +// ContainerImageType defines the available container base images pub enum ContainerImageType { - alpine_3_20 - ubuntu_24_04 - ubuntu_25_04 - custom // For custom images downloaded via podman + alpine_3_20 // Alpine Linux 3.20 + ubuntu_24_04 // Ubuntu 24.04 LTS + ubuntu_25_04 // Ubuntu 25.04 + custom // Custom image downloaded via podman } +// ContainerNewArgs defines parameters for creating a new container @[params] pub struct ContainerNewArgs { pub: - name string @[required] - image ContainerImageType = .alpine_3_20 + name string @[required] // Unique container name + image ContainerImageType = .alpine_3_20 // Base image type custom_image_name string // Used when image = .custom docker_url string // Docker image URL for new images - reset bool + reset bool // Reset if container already exists } +// Create a new container +// +// This method: +// 1. Validates the container name +// 2. Determines the image to use (built-in or custom) +// 3. Creates crun configuration +// 4. Installs hero binary in rootfs +// 5. 
Configures DNS in rootfs +// +// Note: The actual container creation in crun happens when start() is called. +// This method only prepares the configuration and rootfs. +// +// Thread Safety: +// This method doesn't interact with network_config, so no mutex is needed. +// Network setup happens later in container.start(). pub fn (mut self HeroPods) container_new(args ContainerNewArgs) !&Container { + // Validate container name to prevent shell injection and path traversal + validate_container_name(args.name) or { return error('Invalid container name: ${err}') } + if args.name in self.containers && !args.reset { return self.containers[args.name] or { panic('bug: container should exist') } } @@ -85,13 +104,22 @@ pub fn (mut self HeroPods) container_new(args ContainerNewArgs) !&Container { self.containers[args.name] = container - // Always install hero binary in container rootfs + // Install hero binary in container rootfs self.install_hero_in_rootfs(rootfs_path)! + // Configure DNS in container rootfs (uses network_config but doesn't modify it) + self.network_configure_dns(args.name, rootfs_path)! + return container } -// Create crun configuration using the crun module +// Create crun configuration for a container +// +// This creates an OCI-compliant runtime configuration with: +// - No terminal (background container) +// - Long-running sleep process +// - Standard environment variables +// - Resource limits fn (mut self HeroPods) create_crun_config(container_name string, rootfs_path string) !&crun.CrunConfig { // Create crun configuration using the factory pattern mut config := crun.new(mut self.crun_configs, name: container_name)! 
@@ -107,7 +135,7 @@ fn (mut self HeroPods) create_crun_config(container_name string, rootfs_path str config.set_hostname('container') config.set_no_new_privileges(true) - // Add the specific rlimit for file descriptors + // Add resource limits config.add_rlimit(.rlimit_nofile, 1024, 1024) // Validate the configuration @@ -123,7 +151,13 @@ fn (mut self HeroPods) create_crun_config(container_name string, rootfs_path str return config } -// Use podman to pull image and extract rootfs +// Pull a Docker image using podman and extract its rootfs +// +// This method: +// 1. Pulls the image from Docker registry +// 2. Creates a temporary container from the image +// 3. Exports the container filesystem to rootfs_path +// 4. Cleans up the temporary container fn (self HeroPods) podman_pull_and_export(docker_url string, image_name string, rootfs_path string) ! { // Pull image osal.exec( @@ -156,8 +190,12 @@ fn (self HeroPods) podman_pull_and_export(docker_url string, image_name string, } // Install hero binary into container rootfs -// This copies the hero binary from the host into the container's rootfs -// If the hero binary doesn't exist on the host, it will be compiled first +// +// This method: +// 1. Checks if hero binary already exists in rootfs +// 2. If not, copies from host (~/hero/bin/hero) +// 3. If host binary doesn't exist, compiles it first +// 4. Makes the binary executable fn (mut self HeroPods) install_hero_in_rootfs(rootfs_path string) ! 
{ console.print_debug('Installing hero binary into container rootfs: ${rootfs_path}') diff --git a/lib/virt/heropods/container_image.v b/lib/virt/heropods/container_image.v index ff355f30..e5db636a 100644 --- a/lib/virt/heropods/container_image.v +++ b/lib/virt/heropods/container_image.v @@ -5,40 +5,57 @@ import incubaid.herolib.osal.core as osal import incubaid.herolib.core.texttools import os +// ContainerImage represents a container base image with its rootfs +// +// Thread Safety: +// Image operations are filesystem-based and don't interact with network_config, +// so no special thread safety considerations are needed. @[heap] pub struct ContainerImage { pub mut: - image_name string @[required] // image is located in ${self.factory.base_dir}/images//rootfs - docker_url string // optional docker image URL - rootfs_path string // path to the extracted rootfs - size_mb f64 // size in MB - created_at string // creation timestamp - factory &HeroPods @[skip; str: skip] + image_name string @[required] // Image name (located in ${self.factory.base_dir}/images//rootfs) + docker_url string // Optional Docker registry URL + rootfs_path string // Path to the extracted rootfs + size_mb f64 // Size in MB + created_at string // Creation timestamp + factory &HeroPods @[skip; str: skip] // Reference to parent HeroPods instance } +// ContainerImageArgs defines parameters for creating/managing container images @[params] pub struct ContainerImageArgs { pub mut: - image_name string @[required] // image is located in ${self.factory.base_dir}/images//rootfs - docker_url string // docker image URL like "alpine:3.20" or "ubuntu:24.04" - reset bool + image_name string @[required] // Unique image name (located in ${self.factory.base_dir}/images//rootfs) + docker_url string // Docker image URL like "alpine:3.20" or "ubuntu:24.04" + reset bool // Reset if image already exists } +// ImageExportArgs defines parameters for exporting an image @[params] pub struct ImageExportArgs { pub mut: - 
dest_path string @[required] // destination .tgz file path - compress_level int = 6 // compression level 1-9 + dest_path string @[required] // Destination .tgz file path + compress_level int = 6 // Compression level 1-9 } +// ImageImportArgs defines parameters for importing an image @[params] pub struct ImageImportArgs { pub mut: - source_path string @[required] // source .tgz file path - reset bool // overwrite if exists + source_path string @[required] // Source .tgz file path + reset bool // Overwrite if exists } -// Create new image or get existing +// Create a new image or get existing image +// +// This method: +// 1. Normalizes the image name +// 2. Returns existing image if found (unless reset=true) +// 3. Downloads image from Docker registry if docker_url provided +// 4. Creates image metadata and stores in cache +// +// Thread Safety: +// Image operations are filesystem-based and don't interact with network_config. pub fn (mut self HeroPods) image_new(args ContainerImageArgs) !&ContainerImage { mut image_name := texttools.name_fix(args.image_name) rootfs_path := '${self.base_dir}/images/${image_name}/rootfs' @@ -77,7 +94,13 @@ pub fn (mut self HeroPods) image_new(args ContainerImageArgs) !&ContainerImage { return image } -// Download image from docker registry using podman +// Download image from Docker registry using podman +// +// This method: +// 1. Pulls the image from Docker registry +// 2. Creates a temporary container +// 3. Exports the rootfs to the images directory +// 4. Cleans up the temporary container fn (mut self ContainerImage) download_from_docker(docker_url string, reset bool) ! { console.print_header('Downloading image: ${docker_url}') @@ -119,12 +142,14 @@ fn (mut self ContainerImage) download_from_docker(docker_url string, reset bool) } // Update image metadata (size, creation time, etc.) +// +// Calculates the rootfs size and records creation timestamp fn (mut self ContainerImage) update_metadata() ! 
{ if !os.is_dir(self.rootfs_path) { return error('Rootfs path does not exist: ${self.rootfs_path}') } - // Calculate size + // Calculate size in MB result := osal.exec(cmd: 'du -sm ${self.rootfs_path}', stdout: false)! result_parts := result.output.split_by_space()[0] or { panic('bug') } size_str := result_parts.trim_space() @@ -132,10 +157,12 @@ fn (mut self ContainerImage) update_metadata() ! { // Get creation time info := os.stat(self.rootfs_path) or { return error('stat failed: ${err}') } - self.created_at = info.ctime.str() // or mtime.str(), depending on what you want + self.created_at = info.ctime.str() } // List all available images +// +// Scans the images directory and returns all found images with metadata pub fn (mut self HeroPods) images_list() ![]&ContainerImage { mut images := []&ContainerImage{} @@ -173,6 +200,8 @@ pub fn (mut self HeroPods) images_list() ![]&ContainerImage { } // Export image to .tgz file +// +// Creates a compressed tarball of the image rootfs pub fn (mut self ContainerImage) export(args ImageExportArgs) ! { if !os.is_dir(self.rootfs_path) { return error('Image rootfs not found: ${self.rootfs_path}') @@ -192,6 +221,8 @@ pub fn (mut self ContainerImage) export(args ImageExportArgs) ! { } // Import image from .tgz file +// +// Extracts a compressed tarball into the images directory and creates image metadata pub fn (mut self HeroPods) image_import(args ImageImportArgs) !&ContainerImage { if !os.exists(args.source_path) { return error('Source file not found: ${args.source_path}') @@ -238,6 +269,8 @@ pub fn (mut self HeroPods) image_import(args ImageImportArgs) !&ContainerImage { } // Delete image +// +// Removes the image directory and removes from factory cache pub fn (mut self ContainerImage) delete() ! { console.print_header('Deleting image: ${self.image_name}') @@ -255,6 +288,8 @@ pub fn (mut self ContainerImage) delete() ! 
{ } // Get image info as map +// +// Returns image metadata as a string map for display/serialization pub fn (self ContainerImage) info() map[string]string { return { 'name': self.image_name @@ -265,7 +300,9 @@ pub fn (self ContainerImage) info() map[string]string { } } -// List available docker images that can be downloaded +// List available Docker images that can be downloaded +// +// Returns a curated list of commonly used Docker images pub fn list_available_docker_images() []string { return [ 'alpine:3.20', diff --git a/lib/virt/heropods/heropods_factory_.v b/lib/virt/heropods/heropods_factory_.v index 466dcf0d..7bf064c5 100644 --- a/lib/virt/heropods/heropods_factory_.v +++ b/lib/virt/heropods/heropods_factory_.v @@ -4,6 +4,14 @@ import incubaid.herolib.core.base import incubaid.herolib.core.playbook { PlayBook } import json +// Global state for HeroPods instances +// +// Thread Safety Note: +// heropods_global is not marked as `shared` because it would break compile-time +// reflection in paramsparser. The map operations are generally safe for concurrent +// read access. For write operations, the Redis backend provides the source of truth +// and synchronization. Each HeroPods instance has its own network_mutex for +// protecting network operations. __global ( heropods_global map[string]&HeroPods heropods_default string @@ -31,9 +39,12 @@ pub fn new(args ArgsGet) !&HeroPods { return get(name: args.name)! } +// Get a HeroPods instance by name +// If fromdb is true, loads from Redis; otherwise returns from memory cache pub fn get(args ArgsGet) !&HeroPods { mut context := base.context()! heropods_default = args.name + if args.fromdb || args.name !in heropods_global { mut r := context.redis()! if r.hexists('context:heropods', args.name)! { @@ -52,15 +63,16 @@ pub fn get(args ArgsGet) !&HeroPods { return error("HeroPods with name '${args.name}' does not exist") } } - return get(args)! // no longer from db nor create + return get(args)! 
// Recursive call with fromdb=false } + return heropods_global[args.name] or { print_backtrace() return error('could not get config for heropods with name:${args.name}') } } -// register the config for the future +// Register a HeroPods instance (saves to both memory and Redis) pub fn set(o HeroPods) ! { mut o2 := set_in_mem(o)! heropods_default = o2.name @@ -69,13 +81,14 @@ pub fn set(o HeroPods) ! { r.hset('context:heropods', o2.name, json.encode(o2))! } -// does the config exists? +// Check if a HeroPods instance exists in Redis pub fn exists(args ArgsGet) !bool { mut context := base.context()! mut r := context.redis()! return r.hexists('context:heropods', args.name)! } +// Delete a HeroPods instance from Redis (does not affect memory cache) pub fn delete(args ArgsGet) ! { mut context := base.context()! mut r := context.redis()! @@ -88,33 +101,36 @@ pub mut: fromdb bool // will load from filesystem } -// if fromdb set: load from filesystem, and not from mem, will also reset what is in mem +// List all HeroPods instances +// If fromdb is true, loads from Redis and resets memory cache +// If fromdb is false, returns from memory cache pub fn list(args ArgsList) ![]&HeroPods { mut res := []&HeroPods{} mut context := base.context()! + if args.fromdb { - // reset what is in mem + // Reset memory cache and load from Redis heropods_global = map[string]&HeroPods{} heropods_default = '' - } - if args.fromdb { + mut r := context.redis()! mut l := r.hkeys('context:heropods')! for name in l { res << get(name: name, fromdb: true)! } - return res } else { - // load from memory + // Load from memory cache for _, client in heropods_global { res << client } } + return res } -// only sets in mem, does not set as config +// Set a HeroPods instance in memory cache only (does not persist to Redis) +// Initializes the instance via obj_init before caching fn set_in_mem(o HeroPods) !HeroPods { mut o2 := obj_init(o)! 
heropods_global[o2.name] = &o2 @@ -226,7 +242,11 @@ pub fn play(mut plbook PlayBook) ! { } } -// switch instance to be used for heropods +// Switch the default HeroPods instance +// +// Thread Safety Note: +// String assignment is atomic on most platforms, so no explicit locking is needed. +// If strict thread safety is required in the future, this could be wrapped in a lock. pub fn switch(name string) { heropods_default = name } diff --git a/lib/virt/heropods/heropods_model.v b/lib/virt/heropods/heropods_model.v index cdd898cc..b9a237b5 100644 --- a/lib/virt/heropods/heropods_model.v +++ b/lib/virt/heropods/heropods_model.v @@ -5,24 +5,31 @@ import incubaid.herolib.osal.core as osal import incubaid.herolib.ui.console import incubaid.herolib.virt.crun import os +import sync pub const version = '0.0.0' const singleton = false const default = true -// THIS THE THE SOURCE OF THE INFORMATION OF THIS FILE, HERE WE HAVE THE CONFIG OBJECT CONFIGURED AND MODELLED - +// HeroPods factory for managing containers +// +// Thread Safety: +// The network_config field is protected by network_mutex for thread-safe concurrent access. +// We use a separate mutex instead of marking network_config as `shared` because V's +// compile-time reflection (used by paramsparser) cannot handle shared fields. 
@[heap] pub struct HeroPods { pub mut: - tmux_session string // tmux session name - containers map[string]&Container // name -> container mapping - images map[string]&ContainerImage // name -> image mapping - crun_configs map[string]&crun.CrunConfig // name -> crun config mapping - base_dir string // base directory for all container data - reset bool // will reset the heropods - use_podman bool = true // will use podman for image management - name string // name of the heropods + tmux_session string // tmux session name + containers map[string]&Container // name -> container mapping + images map[string]&ContainerImage // name -> image mapping + crun_configs map[string]&crun.CrunConfig // name -> crun config mapping + base_dir string // base directory for all container data + reset bool // will reset the heropods + use_podman bool = true // will use podman for image management + name string // name of the heropods + network_config NetworkConfig @[skip; str: skip] // network configuration (automatically initialized, not serialized) + network_mutex sync.Mutex @[skip; str: skip] // protects network_config for thread-safe concurrent access } // your checking & initialization code if needed @@ -46,22 +53,31 @@ fn obj_init(mycfg_ HeroPods) !HeroPods { } } + // Initialize HeroPods instance with network configuration + // NOTE(review): network_mutex is left at its zero value here; V's sync.Mutex documents init() as a call + // to make once before first use, so confirm that zero-initialization is valid on every target platform mut heropods := HeroPods{ - tmux_session: args.name - containers: map[string]&Container{} - images: map[string]&ContainerImage{} - crun_configs: map[string]&crun.CrunConfig{} - base_dir: args.base_dir - reset: args.reset - use_podman: args.use_podman - name: args.name + tmux_session: args.name + containers: map[string]&Container{} + images: map[string]&ContainerImage{} + crun_configs: map[string]&crun.CrunConfig{} + base_dir: args.base_dir + reset: args.reset + use_podman: args.use_podman + name: args.name + network_config: NetworkConfig{ + allocated_ips: map[string]string{} + 
} } // Clean up any leftover crun state if reset is requested if args.reset { heropods.cleanup_crun_state()! + heropods.network_cleanup_all(false)! // Keep bridge for reuse } + // Initialize network layer + heropods.network_init()! + // Load existing images into cache heropods.load_existing_images()! @@ -70,7 +86,7 @@ fn obj_init(mycfg_ HeroPods) !HeroPods { heropods.setup_default_images(args.reset)! } - return args + return heropods } /////////////NORMALLY NO NEED TO TOUCH @@ -92,7 +108,7 @@ fn (mut self HeroPods) setup_default_images(reset bool) ! { } if img.str() !in self.images || reset { console.print_debug('Preparing default image: ${img.str()}') - _ = self.image_new(args)! + self.image_new(args)! } } } diff --git a/lib/virt/heropods/instructions.md b/lib/virt/heropods/instructions.md deleted file mode 100644 index 5e6c77c1..00000000 --- a/lib/virt/heropods/instructions.md +++ /dev/null @@ -1,5 +0,0 @@ - - -- use builder... for remote execution inside the container - - make an executor like we have for SSH but then for the container, so we can use this to execute commands inside the container -- \ No newline at end of file diff --git a/lib/virt/heropods/network.v b/lib/virt/heropods/network.v new file mode 100644 index 00000000..dda58f7f --- /dev/null +++ b/lib/virt/heropods/network.v @@ -0,0 +1,384 @@ +module heropods + +import incubaid.herolib.osal.core as osal +import incubaid.herolib.ui.console +import os +import crypto.sha256 + +// Network configuration for HeroPods +// +// This module provides container networking similar to Docker/Podman: +// - Bridge networking with automatic IP allocation +// - NAT for outbound internet access +// - DNS configuration +// - veth pair management +// +// Thread Safety: +// All network_config operations are protected by HeroPods.network_mutex. +// The struct is not marked as `shared` to maintain compatibility with +// paramsparser's compile-time reflection. 
+//
+// Future extension possibilities:
+// - IPv6 support
+// - Custom per-container DNS servers
+// - iptables isolation (firewall per container)
+// - Multiple bridges for isolated networks
+// - Port forwarding/mapping
+// - Network policies and traffic shaping
+
+// NetworkConfig holds network configuration for HeroPods containers
+struct NetworkConfig {
+mut:
+	bridge_name    string   = 'heropods0' // Host bridge interface that container veth ends attach to
+	subnet         string   = '10.10.0.0/24' // Container subnet in CIDR notation
+	gateway_ip     string   = '10.10.0.1' // Address given to the bridge; containers use it as default route
+	dns_servers    []string = ['8.8.8.8', '8.8.4.4'] // Nameservers written to each container's resolv.conf
+	allocated_ips  map[string]string // container_name -> IP address
+	freed_ip_pool  []int // Pool of freed IP offsets for reuse (e.g., [15, 23, 42])
+	next_ip_offset int = 10 // Start allocating from 10.10.0.10 (only used when pool is empty)
+}
+
+// Initialize network configuration in HeroPods factory
+//
+// Currently this only makes sure the host bridge is in place; any error from the
+// bridge setup is propagated to the caller.
+fn (mut self HeroPods) network_init() ! {
+	console.print_debug('Initializing HeroPods network layer...')
+
+	// Setup host bridge if it doesn't exist
+	self.network_setup_bridge()!
+
+	console.print_debug('HeroPods network layer initialized')
+}
+
+// Setup the host bridge network (idempotent, safe to call on every start)
+//
+// Steps:
+// 1. Create the bridge, assign the gateway address and bring it up when
+//    `ip link show` does not find it (errors are propagated)
+// 2. When the bridge already exists, re-apply the address and link state as a
+//    best-effort repair (failures only produce a warning)
+// 3. Enable IPv4 forwarding (best effort, warning only)
+// 4. Ensure the MASQUERADE rule for the subnet exists (best effort, warning only)
+//
+// Steps 3 and 4 run whether or not the bridge already existed, so a bridge left
+// half-configured by an earlier failed run, or a host whose NAT rules or
+// forwarding flag were reset, is repaired instead of being skipped.
+//
+// Returns an error when the configured subnet is not in CIDR notation or when
+// creating a new bridge fails.
+fn (mut self HeroPods) network_setup_bridge() ! {
+	bridge_name := self.network_config.bridge_name
+	subnet := self.network_config.subnet
+
+	// The gateway address borrows the prefix length from the subnet, so the subnet
+	// must be in CIDR form; indexing the split result blindly would panic otherwise
+	subnet_parts := subnet.split('/')
+	if subnet_parts.len != 2 || subnet_parts[1] == '' {
+		return error('Invalid subnet "${subnet}": expected CIDR notation such as 10.10.0.0/24')
+	}
+	gateway_ip := '${self.network_config.gateway_ip}/${subnet_parts[1]}'
+
+	// Check if bridge already exists
+	result := osal.exec(
+		cmd:         'ip link show ${bridge_name}'
+		stdout:      false
+		raise_error: false
+	) or {
+		osal.Job{
+			exit_code: 1
+		}
+	}
+	bridge_existed := result.exit_code == 0
+
+	if bridge_existed {
+		console.print_debug('Bridge ${bridge_name} already exists')
+
+		// Best-effort repair: `ip addr replace` adds the address when it is missing and
+		// leaves it alone when it is already assigned
+		osal.exec(
+			cmd:    'ip addr replace ${gateway_ip} dev ${bridge_name}'
+			stdout: false
+		) or {
+			console.print_stderr('Warning: Could not verify address ${gateway_ip} on bridge ${bridge_name}.')
+		}
+		osal.exec(
+			cmd:    'ip link set ${bridge_name} up'
+			stdout: false
+		) or {
+			console.print_stderr('Warning: Could not bring bridge ${bridge_name} up.')
+		}
+	} else {
+		console.print_debug('Creating bridge ${bridge_name}...')
+
+		// Create bridge
+		osal.exec(
+			cmd:    'ip link add name ${bridge_name} type bridge'
+			stdout: false
+		)!
+
+		// Assign IP to bridge
+		osal.exec(
+			cmd:    'ip addr add ${gateway_ip} dev ${bridge_name}'
+			stdout: false
+		)!
+
+		// Bring bridge up
+		osal.exec(
+			cmd:    'ip link set ${bridge_name} up'
+			stdout: false
+		)!
+	}
+
+	// Enable IP forwarding (with error resilience)
+	osal.exec(
+		cmd:    'sysctl -w net.ipv4.ip_forward=1'
+		stdout: false
+	) or {
+		console.print_stderr('Warning: Failed to enable IPv4 forwarding. Containers may not have internet access.')
+		console.print_debug('You may need to run: sudo sysctl -w net.ipv4.ip_forward=1')
+	}
+
+	// Get primary network interface for NAT
+	primary_iface := self.network_get_primary_interface() or {
+		console.print_stderr('Warning: Could not detect primary network interface. NAT may not work.')
+		'eth0' // fallback
+	}
+
+	// Setup NAT for outbound traffic (with error resilience)
+	// The `-C ... || -A ...` form only appends the rule when it is not present yet
+	console.print_debug('Setting up NAT rules for ${primary_iface}...')
+	osal.exec(
+		cmd:    'iptables -t nat -C POSTROUTING -s ${subnet} -o ${primary_iface} -j MASQUERADE 2>/dev/null || iptables -t nat -A POSTROUTING -s ${subnet} -o ${primary_iface} -j MASQUERADE'
+		stdout: false
+	) or {
+		console.print_stderr('Warning: Failed to setup NAT rules. Containers may not have internet access.')
+		console.print_debug('You may need to run: sudo iptables -t nat -A POSTROUTING -s ${subnet} -o ${primary_iface} -j MASQUERADE')
+	}
+
+	if bridge_existed {
+		console.print_debug('Bridge ${bridge_name} configuration verified')
+	} else {
+		console.print_green('Bridge ${bridge_name} created and configured')
+	}
+}
+
+// Get the primary network interface for NAT
+//
+// Takes the interface column of the first `default` line printed by `ip route`.
+// Returns an error when the command fails or prints no interface name.
+fn (self HeroPods) network_get_primary_interface() !string {
+	// Try to get the default route interface
+	result := osal.exec(
+		cmd:    "ip route | grep default | awk '{print \$5}' | head -n1"
+		stdout: false
+	)!
+
+	iface := result.output.trim_space()
+	if iface == '' {
+		return error('Could not determine primary network interface')
+	}
+
+	return iface
+}
+
+// Allocate an IP address for a container (thread-safe)
+//
+// IP REUSE STRATEGY:
+// 1. First, try to reuse an IP from the freed_ip_pool (recycled IPs from deleted containers)
+// 2. If pool is empty, allocate a new IP by incrementing next_ip_offset
+// 3.
This prevents IP exhaustion in a /24 subnet (254 usable IPs)
+//
+// Thread Safety:
+// This function uses network_mutex to ensure atomic IP allocation.
+// Multiple concurrent container starts will be serialized at the IP allocation step,
+// preventing race conditions where two containers could receive the same IP.
+//
+// Errors:
+// Returns an error when the subnet is not a dotted IPv4 network or when no offset
+// up to 254 is left. The allocator state is not modified in either case.
+fn (mut self HeroPods) network_allocate_ip(container_name string) !string {
+	self.network_mutex.@lock()
+	defer {
+		self.network_mutex.unlock()
+	}
+
+	// Check if already allocated
+	if container_name in self.network_config.allocated_ips {
+		return self.network_config.allocated_ips[container_name]
+	}
+
+	// Extract base IP from subnet (e.g., "10.10.0.0/24" -> "10.10.0")
+	// Only the first three octets are kept, i.e. the allocator assumes a /24 layout
+	subnet_parts := self.network_config.subnet.split('/')
+	base_ip_parts := subnet_parts[0].split('.')
+	if base_ip_parts.len != 4 {
+		return error('Invalid subnet "${self.network_config.subnet}": expected an IPv4 network such as 10.10.0.0/24')
+	}
+	base_ip := '${base_ip_parts[0]}.${base_ip_parts[1]}.${base_ip_parts[2]}'
+
+	// Determine IP offset: reuse from pool first, then increment
+	mut ip_offset := 0
+	if self.network_config.freed_ip_pool.len > 0 {
+		// Reuse a freed IP from the pool (LIFO - pop from end)
+		ip_offset = self.network_config.freed_ip_pool.last()
+		self.network_config.freed_ip_pool.delete_last()
+		console.print_debug('Reusing IP offset ${ip_offset} from freed pool (pool size: ${self.network_config.freed_ip_pool.len})')
+	} else {
+		// Check the subnet limit (254 usable IPs in /24) before touching the counter,
+		// so a failed allocation does not advance next_ip_offset
+		if self.network_config.next_ip_offset > 254 {
+			return error('IP address pool exhausted: subnet ${self.network_config.subnet} has no more available IPs. Consider using a larger subnet or multiple bridges.')
+		}
+
+		// No freed IPs available, allocate a new one
+		// This increment happens while network_mutex is held
+		ip_offset = self.network_config.next_ip_offset
+		self.network_config.next_ip_offset++
+
+		console.print_debug('Allocated new IP offset ${ip_offset} (next: ${self.network_config.next_ip_offset})')
+	}
+
+	// Build the full IP address
+	ip := '${base_ip}.${ip_offset}'
+	self.network_config.allocated_ips[container_name] = ip
+
+	console.print_debug('Allocated IP ${ip} to container ${container_name}')
+	return ip
+}
+
+// Setup network for a container (creates veth pair, assigns IP, configures routing)
+//
+// Parameters:
+// - container_name: key for the IP allocation and source of the veth name hash
+// - container_pid:  PID whose network namespace receives the container end of the pair
+//
+// Every `ip` / `nsenter` step after the initial stale-interface cleanup propagates
+// its error, so a failure aborts the remaining steps.
+fn (mut self HeroPods) network_setup_container(container_name string, container_pid int) ! {
+	console.print_debug('Setting up network for container ${container_name} (PID: ${container_pid})...')
+
+	// Allocate IP address (thread-safe)
+	container_ip := self.network_allocate_ip(container_name)!
+
+	bridge_name := self.network_config.bridge_name
+	// Despite the name this holds the prefix length (the part after '/' in the subnet)
+	subnet_mask := self.network_config.subnet.split('/')[1]
+	gateway_ip := self.network_config.gateway_ip
+
+	// Create veth pair with unique names using hash to avoid collisions
+	// Interface names are limited to 15 chars, so we use a hash suffix
+	short_hash := sha256.hexhash(container_name)[..6]
+	veth_container_short := 'veth-${short_hash}'
+	veth_bridge_short := 'vbr-${short_hash}'
+
+	// Delete veth pair if it already exists (cleanup from previous run)
+	osal.exec(cmd: 'ip link delete ${veth_container_short} 2>/dev/null', stdout: false) or {}
+	osal.exec(cmd: 'ip link delete ${veth_bridge_short} 2>/dev/null', stdout: false) or {}
+
+	// Create veth pair
+	console.print_debug('Creating veth pair: ${veth_container_short} <-> ${veth_bridge_short}')
+	osal.exec(
+		cmd:    'ip link add ${veth_container_short} type veth peer name ${veth_bridge_short}'
+		stdout: false
+	)!
+
+	// Attach bridge end to bridge
+	osal.exec(
+		cmd:    'ip link set ${veth_bridge_short} master ${bridge_name}'
+		stdout: false
+	)!
+
+	osal.exec(
+		cmd:    'ip link set ${veth_bridge_short} up'
+		stdout: false
+	)!
+
+	// Move container end into container's network namespace
+	console.print_debug('Moving ${veth_container_short} into container namespace (PID: ${container_pid})')
+	osal.exec(
+		cmd:    'ip link set ${veth_container_short} netns ${container_pid}'
+		stdout: false
+	)!
+
+	// Configure network inside container
+	console.print_debug('Configuring network inside container: ${container_ip}/${subnet_mask}')
+
+	// Rename veth to eth0 inside container for consistency
+	osal.exec(
+		cmd:    'nsenter -t ${container_pid} -n ip link set ${veth_container_short} name eth0'
+		stdout: false
+	)!
+
+	// Assign IP address
+	osal.exec(
+		cmd:    'nsenter -t ${container_pid} -n ip addr add ${container_ip}/${subnet_mask} dev eth0'
+		stdout: false
+	)!
+
+	// Bring interface up
+	osal.exec(
+		cmd:    'nsenter -t ${container_pid} -n ip link set dev eth0 up'
+		stdout: false
+	)!
+
+	// Add default route using gateway IP
+	osal.exec(
+		cmd:    'nsenter -t ${container_pid} -n ip route add default via ${gateway_ip}'
+		stdout: false
+	)!
+
+	console.print_green('Network configured for container ${container_name}: ${container_ip}')
+}
+
+// Configure DNS inside container by writing resolv.conf
+//
+// Writes one `nameserver` line per entry of network_config.dns_servers to
+// `<rootfs_path>/etc/resolv.conf`, creating the `etc` directory when it is missing.
+// An existing resolv.conf at that path is overwritten.
+fn (self HeroPods) network_configure_dns(container_name string, rootfs_path string) ! {
+	console.print_debug('Configuring DNS for container ${container_name}...')
+
+	resolv_conf_path := '${rootfs_path}/etc/resolv.conf'
+
+	// Ensure /etc directory exists
+	etc_dir := '${rootfs_path}/etc'
+	if !os.exists(etc_dir) {
+		os.mkdir_all(etc_dir)!
+	}
+
+	// Build DNS configuration from configured DNS servers
+	mut dns_lines := []string{}
+	for dns_server in self.network_config.dns_servers {
+		dns_lines << 'nameserver ${dns_server}'
+	}
+	dns_content := dns_lines.join('\n') + '\n'
+
+	os.write_file(resolv_conf_path, dns_content)!
+
+	dns_servers_str := self.network_config.dns_servers.join(', ')
+	console.print_debug('DNS configured: ${dns_servers_str}')
+}
+
+// Cleanup network for a container (removes veth pair and deallocates IP)
+//
+// Thread Safety:
+// IP deallocation is protected by network_mutex to prevent race conditions
+// when multiple containers are being deleted concurrently.
+// The mutex is taken only after the `ip link delete` call, so the shell command
+// runs without holding the lock.
+fn (mut self HeroPods) network_cleanup_container(container_name string) ! {
+	console.print_debug('Cleaning up network for container ${container_name}...')
+
+	// Remove veth interfaces (they should be auto-removed when container stops, but cleanup anyway)
+	// Use same hash logic as setup to ensure we delete the correct interface
+	short_hash := sha256.hexhash(container_name)[..6]
+	veth_bridge_short := 'vbr-${short_hash}'
+
+	osal.exec(
+		cmd:    'ip link delete ${veth_bridge_short} 2>/dev/null'
+		stdout: false
+	) or { console.print_debug('veth interface ${veth_bridge_short} already removed') }
+
+	// Deallocate IP address and return it to the freed pool for reuse (thread-safe)
+	self.network_mutex.@lock()
+	defer {
+		self.network_mutex.unlock()
+	}
+
+	if container_name in self.network_config.allocated_ips {
+		ip := self.network_config.allocated_ips[container_name]
+
+		// Extract the IP offset from the full IP address (e.g., "10.10.0.42" -> 42)
+		ip_parts := ip.split('.')
+		if ip_parts.len == 4 {
+			ip_offset := ip_parts[3].int()
+
+			// Add to freed pool for reuse (avoid duplicates)
+			if ip_offset !in self.network_config.freed_ip_pool {
+				self.network_config.freed_ip_pool << ip_offset
+				console.print_debug('Returned IP offset ${ip_offset} to freed pool (pool size: ${self.network_config.freed_ip_pool.len})')
+			}
+		}
+
+		// Remove from allocated IPs
+		self.network_config.allocated_ips.delete(container_name)
+		console.print_debug('Deallocated IP ${ip} from container ${container_name}')
+	}
+}
+
+// Cleanup all network resources (called on reset)
+//
+// Parameters:
+// - full: if true, also removes the
bridge (for complete teardown)
+//         if false, keeps the bridge for reuse. The parameter has no default
+//         value, so every caller must pass it explicitly.
+//
+// Thread Safety:
+// Uses separate lock/unlock calls for read and write operations to minimize
+// lock contention. The container cleanup loop runs without holding the lock.
+//
+// Notes:
+// - Only containers currently present in network_config.allocated_ips are visited.
+// - Per-container cleanup failures are logged and do not abort the loop.
+// - The iptables MASQUERADE rule is not touched here, even when full is true.
+fn (mut self HeroPods) network_cleanup_all(full bool) ! {
+	console.print_debug('Cleaning up all HeroPods network resources (full=${full})...')
+
+	// Get list of containers to cleanup (thread-safe read)
+	self.network_mutex.@lock()
+	container_names := self.network_config.allocated_ips.keys()
+	self.network_mutex.unlock()
+
+	// Remove all veth interfaces (no lock needed - operates on local copy)
+	// The lock must be released at this point: network_cleanup_container takes
+	// network_mutex itself when it deallocates the IP
+	for container_name in container_names {
+		self.network_cleanup_container(container_name) or {
+			console.print_debug('Failed to cleanup network for ${container_name}: ${err}')
+		}
+	}
+
+	// Clear allocated IPs and freed pool (thread-safe write)
+	// next_ip_offset goes back to 10, the same starting offset NetworkConfig declares
+	self.network_mutex.@lock()
+	self.network_config.allocated_ips.clear()
+	self.network_config.freed_ip_pool.clear()
+	self.network_config.next_ip_offset = 10
+	self.network_mutex.unlock()
+
+	console.print_debug('Cleared IP allocations and freed pool')
+
+	// Optionally remove the bridge for full cleanup
+	if full {
+		bridge_name := self.network_config.bridge_name
+
+		console.print_debug('Removing bridge ${bridge_name}...')
+		// Best effort: a missing bridge is only logged, not reported as an error
+		osal.exec(
+			cmd:    'ip link delete ${bridge_name}'
+			stdout: false
+		) or { console.print_debug('Bridge ${bridge_name} already removed or does not exist') }
+	}
+
+	console.print_debug('Network cleanup complete')
+}
diff --git a/lib/virt/heropods/utils.v b/lib/virt/heropods/utils.v
new file mode 100644
index 00000000..4d4461d2
--- /dev/null
+++ b/lib/virt/heropods/utils.v
@@ -0,0 +1,35 @@
+module heropods
+
+// Validate container name to prevent shell injection and path traversal
+//
+// Security validation that ensures container names:
+// - Are not empty and not too long (max 64 chars)
+// - Contain only alphanumeric characters, dashes, and
underscores
+// - Don't start with dash or underscore
+// - Don't contain path traversal sequences
+//
+// This is critical for preventing command injection attacks since container
+// names are used in shell commands throughout the module.
+//
+// Returns an error describing the first rule that the name violates; the rules
+// are checked in the order listed above.
+fn validate_container_name(name string) ! {
+	// Length limits: reject the empty name and anything longer than 64 bytes
+	if name.len == 0 {
+		return error('Container name cannot be empty')
+	}
+	if name.len > 64 {
+		return error('Container name too long (max 64 characters)')
+	}
+
+	// Whitelist scan: every byte must be an ASCII letter, a digit, '-' or '_'
+	for ch in name {
+		if !(ch.is_alnum() || ch == `-` || ch == `_`) {
+			return error('Container name "${name}" contains invalid characters. Only alphanumeric characters, dashes, and underscores are allowed.')
+		}
+	}
+
+	// The leading character must be a letter or a digit
+	first := name[0]
+	if first == `-` || first == `_` {
+		return error('Container name cannot start with dash or underscore')
+	}
+
+	// Prevent path traversal (already excluded by the whitelist above, kept as an
+	// explicit second line of defence)
+	for forbidden in ['..', '/', '\\'] {
+		if name.contains(forbidden) {
+			return error('Container name cannot contain path separators or ".."')
+		}
+	}
+}