herolib/lib/virt/heropods/container.v
Mahmoud-Emad 76876049be test: Add comprehensive heropods network and container tests
- Add wait_for_process_ready to container start
- Reduce sigterm and stop check timeouts
- Update default container base directory
- Introduce new heropods test suite with multiple tests
- Add tests for initialization and custom network config
- Add tests for Docker image pull and container creation
- Add tests for container lifecycle (start, stop, delete)
- Add tests for container command execution
- Add tests for network IP allocation
- Add tests for IPv4 connectivity
- Add tests for container deletion and IP cleanup
- Add tests for bridge network setup and NAT rules
- Add tests for IP pool management
- Add tests for custom bridge configuration
2025-11-24 14:02:36 +02:00

module heropods
import incubaid.herolib.osal.tmux
import incubaid.herolib.osal.core as osal
import incubaid.herolib.virt.crun
import time
import incubaid.herolib.builder
import json
import os
// Container lifecycle timeout constants
const cleanup_retry_delay_ms = 500 // Time to wait for filesystem cleanup to complete
const sigterm_timeout_ms = 1000 // Time to wait for graceful shutdown (1 second) - reduced from 5s for faster tests
const sigkill_wait_ms = 500 // Time to wait after SIGKILL
const stop_check_interval_ms = 200 // Interval to check if container stopped - reduced from 500ms for faster response
// Container represents a running or stopped OCI container managed by crun
//
// Thread Safety:
// Container operations that interact with network configuration (start, stop, delete)
// are thread-safe because they delegate to HeroPods.network_* methods which use
// the network_mutex for protection.
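//
// Typical lifecycle (illustrative only; how a Container is obtained from the
// HeroPods factory is defined elsewhere, and the `containers` map value type
// shown here is an assumption):
//
//	mut c := pods.containers['web'] or { return error('unknown container') }
//	c.start()!
//	println(c.exec(cmd: 'echo hello')!)
//	c.stop()!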
@[heap]
pub struct Container {
pub mut:
name string // Unique container name
node ?&builder.Node // Builder node for executing commands inside container
tmux_pane ?&tmux.Pane // Optional tmux pane for interactive access
crun_config ?&crun.CrunConfig // OCI runtime configuration
factory &HeroPods // Reference to parent HeroPods instance
}
// CrunState represents the JSON output of the `crun state` command
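//
// Example payload (abridged and illustrative; exact fields can vary by crun version):
//
//	{
//	  "id": "mycontainer",
//	  "status": "running",
//	  "pid": 12345,
//	  "bundle": "/heropods/configs/mycontainer",
//	  "created": "2025-11-24T12:00:00Z"
//	}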
struct CrunState {
id string // Container ID
status string // Container status (running, stopped, paused)
pid int // PID of container init process
bundle string // Path to OCI bundle
created string // Creation timestamp
}
// Start the container
//
// This method handles the complete container startup lifecycle:
// 1. Creates the container in crun if it doesn't exist
// 2. Handles leftover state cleanup if creation fails
// 3. Starts the container process
// 4. Sets up networking (thread-safe via network_mutex)
//
// Thread Safety:
// Network setup is thread-safe via HeroPods.network_setup_container()
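//
// Example (illustrative):
//
//	c.start()! // creates the container if needed, starts it, waits for readiness, sets up networking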
pub fn (mut self Container) start() ! {
// Check if container exists in crun
container_exists := self.container_exists_in_crun()!
if !container_exists {
// Container doesn't exist, create it first
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} does not exist, creating it...'
logtype: .stdout
) or {}
// Try to create the container; if it fails with a "File exists" error,
// force-delete any leftover state and retry
crun_root := '${self.factory.base_dir}/runtime'
_ := osal.exec(
cmd: 'crun --root ${crun_root} create --bundle ${self.factory.base_dir}/configs/${self.name} ${self.name}'
stdout: true
) or {
if err.msg().contains('File exists') {
self.factory.logger.log(
cat: 'container'
log: 'Container creation failed with "File exists", attempting to clean up leftover state...'
logtype: .stdout
) or {}
// Force delete any leftover state - try multiple cleanup approaches
osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {}
osal.exec(cmd: 'crun delete ${self.name}', stdout: false) or {} // Also try default root
// Clean up any leftover runtime directories
osal.exec(cmd: 'rm -rf ${crun_root}/${self.name}', stdout: false) or {}
osal.exec(cmd: 'rm -rf /run/crun/${self.name}', stdout: false) or {}
// Wait a moment for cleanup to complete
time.sleep(cleanup_retry_delay_ms * time.millisecond)
// Retry creation
osal.exec(
cmd: 'crun --root ${crun_root} create --bundle ${self.factory.base_dir}/configs/${self.name} ${self.name}'
stdout: true
)!
} else {
return err
}
}
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} created'
logtype: .stdout
) or {}
}
status := self.status()!
if status == .running {
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} is already running'
logtype: .stdout
) or {}
return
}
// If container exists but is stopped, we need to delete and recreate it
// because crun doesn't allow restarting a stopped container
if container_exists && status != .running {
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} exists but is stopped, recreating...'
logtype: .stdout
) or {}
crun_root := '${self.factory.base_dir}/runtime'
osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {}
osal.exec(
cmd: 'crun --root ${crun_root} create --bundle ${self.factory.base_dir}/configs/${self.name} ${self.name}'
stdout: true
)!
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} recreated'
logtype: .stdout
) or {}
}
// Start the container (crun start does not have a --detach flag)
crun_root := '${self.factory.base_dir}/runtime'
osal.exec(cmd: 'crun --root ${crun_root} start ${self.name}', stdout: true)!
// Wait for container process to be fully ready before setting up network
// Poll for the PID and verify /proc/<pid>/ns/net exists
self.wait_for_process_ready()!
// Setup network for the container (thread-safe)
// If this fails, stop the container to clean up
self.setup_network() or {
self.factory.logger.log(
cat: 'container'
log: 'Network setup failed, stopping container: ${err}'
logtype: .error
) or {}
// Use stop() method to properly clean up (kills process, cleans network, etc.)
// Ignore errors from stop since we're already in an error path
self.stop() or {
self.factory.logger.log(
cat: 'container'
log: 'Failed to stop container during cleanup: ${err}'
logtype: .error
) or {}
}
return error('Failed to setup network for container: ${err}')
}
// Setup Mycelium IPv6 overlay network if enabled
if self.factory.mycelium_enabled {
container_pid := self.pid()!
self.factory.mycelium_setup_container(self.name, container_pid) or {
self.factory.logger.log(
cat: 'container'
log: 'Mycelium setup failed, stopping container: ${err}'
logtype: .error
) or {}
// Stop container to clean up
self.stop() or {
self.factory.logger.log(
cat: 'container'
log: 'Failed to stop container during Mycelium cleanup: ${err}'
logtype: .error
) or {}
}
return error('Failed to setup Mycelium for container: ${err}')
}
}
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} started'
logtype: .stdout
) or {}
}
// Stop the container gracefully (SIGTERM) or forcefully (SIGKILL)
//
// This method:
// 1. Sends SIGTERM for graceful shutdown
// 2. Waits up to sigterm_timeout_ms for graceful stop
// 3. Sends SIGKILL if still running after timeout
// 4. Cleans up network resources (thread-safe)
//
// Thread Safety:
// Network cleanup is thread-safe via HeroPods.network_cleanup_container()
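//
// With the default constants above this gives sigterm_timeout_ms / stop_check_interval_ms
// = 1000 / 200 = 5 status checks before escalating to SIGKILL.
//
// Example (illustrative):
//
//	c.stop()! // SIGTERM, poll for up to ~1s, then SIGKILL if still running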
pub fn (mut self Container) stop() ! {
status := self.status()!
if status == .stopped {
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} is already stopped'
logtype: .stdout
) or {}
return
}
crun_root := '${self.factory.base_dir}/runtime'
// Send SIGTERM for graceful shutdown
osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGTERM', stdout: false) or {
self.factory.logger.log(
cat: 'container'
log: 'Failed to send SIGTERM (container may already be stopped): ${err}'
logtype: .stdout
) or {}
}
// Wait up to sigterm_timeout_ms for graceful shutdown
mut attempts := 0
max_attempts := sigterm_timeout_ms / stop_check_interval_ms
for attempts < max_attempts {
time.sleep(stop_check_interval_ms * time.millisecond)
current_status := self.status() or {
// If we can't get status, assume it's stopped (container may have been deleted)
ContainerStatus.stopped
}
if current_status == .stopped {
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} stopped gracefully'
logtype: .stdout
) or {}
self.cleanup_network()! // Thread-safe network cleanup
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} stopped'
logtype: .stdout
) or {}
return
}
attempts++
}
// Force kill if still running after timeout
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} did not stop gracefully, force killing'
logtype: .stdout
) or {}
osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGKILL', stdout: false) or {
self.factory.logger.log(
cat: 'container'
log: 'Failed to send SIGKILL: ${err}'
logtype: .error
) or {}
}
// Wait for SIGKILL to take effect
time.sleep(sigkill_wait_ms * time.millisecond)
// Verify it's actually stopped
final_status := self.status() or {
// If we can't get status, assume it's stopped (container may have been deleted)
ContainerStatus.stopped
}
if final_status != .stopped {
return error('Failed to stop container ${self.name} - status: ${final_status}')
}
// Cleanup network resources (thread-safe)
self.cleanup_network()!
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} stopped'
logtype: .stdout
) or {}
}
// Delete the container
//
// This method:
// 1. Checks if container exists in crun
// 2. Stops the container (which cleans up network)
// 3. Deletes the container from crun
// 4. Removes from factory's container cache
//
// Thread Safety:
// Network cleanup is thread-safe via stop() -> cleanup_network()
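//
// Example (illustrative):
//
//	c.delete()! // stops the container if needed, removes crun state, drops the cache entry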
pub fn (mut self Container) delete() ! {
// Check if container exists before trying to delete
if !self.container_exists_in_crun()! {
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} does not exist in crun'
logtype: .stdout
) or {}
// Still cleanup network resources in case they exist (thread-safe)
self.cleanup_network() or {
self.factory.logger.log(
cat: 'container'
log: 'Network cleanup failed (may not exist): ${err}'
logtype: .stdout
) or {}
}
// Remove from factory's container cache only after all cleanup is done
if self.name in self.factory.containers {
self.factory.containers.delete(self.name)
}
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} removed from cache'
logtype: .stdout
) or {}
return
}
// Stop the container (this will cleanup network via stop())
self.stop()!
// Delete the container from crun
crun_root := '${self.factory.base_dir}/runtime'
osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {
self.factory.logger.log(
cat: 'container'
log: 'Failed to delete container from crun: ${err}'
logtype: .error
) or {}
}
// Remove from factory's container cache only after all cleanup is complete
if self.name in self.factory.containers {
self.factory.containers.delete(self.name)
}
self.factory.logger.log(
cat: 'container'
log: 'Container ${self.name} deleted'
logtype: .stdout
) or {}
}
// Execute command inside the container
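// Only the `cmd` and `stdout` fields of osal.Command are used; the command runs
// through the builder node's crun executor.
//
// Example (illustrative):
//
//	output := c.exec(cmd: 'cat /etc/os-release', stdout: false)!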
pub fn (mut self Container) exec(cmd_ osal.Command) !string {
// Ensure container is running
if self.status()! != .running {
self.start()!
}
// Use the builder node to execute inside container
mut node := self.node()!
self.factory.logger.log(
cat: 'container'
log: 'Executing command in container ${self.name}: ${cmd_.cmd}'
logtype: .stdout
) or {}
// Execute and provide better error context
return node.exec(cmd: cmd_.cmd, stdout: cmd_.stdout) or {
// Check if container still exists to provide better error message
if !self.container_exists_in_crun()! {
return error('Container ${self.name} was deleted during command execution')
}
return error('Command execution failed in container ${self.name}: ${err}')
}
}
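// Get the container status by querying `crun state` and parsing its JSON output.
// A container that crun does not know about is reported as .stopped.
//
// Example (illustrative):
//
//	if c.status()! == .running {
//		println('container is up')
//	}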
pub fn (self Container) status() !ContainerStatus {
crun_root := '${self.factory.base_dir}/runtime'
result := osal.exec(cmd: 'crun --root ${crun_root} state ${self.name}', stdout: false) or {
// Container doesn't exist - this is expected in some cases (e.g., before creation)
// Check error message to distinguish between "not found" and real errors
err_msg := err.msg().to_lower()
if err_msg.contains('does not exist') || err_msg.contains('not found')
|| err_msg.contains('no such') {
return .stopped
}
// Real error (permissions, crun not installed, etc.) - propagate it
return error('Failed to get container status: ${err}')
}
// Parse JSON output from crun state
state := json.decode(CrunState, result.output) or {
return error('Failed to parse container state JSON: ${err}')
}
status_result := match state.status {
'running' {
ContainerStatus.running
}
'stopped' {
ContainerStatus.stopped
}
'paused' {
ContainerStatus.paused
}
else {
// Unknown status - return unknown (can't log here as function is immutable)
ContainerStatus.unknown
}
}
return status_result
}
// Get the PID of the container's init process
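//
// Example (illustrative):
//
//	pid := c.pid()!
//	println('init process pid: ${pid}')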
pub fn (self Container) pid() !int {
crun_root := '${self.factory.base_dir}/runtime'
result := osal.exec(
cmd: 'crun --root ${crun_root} state ${self.name}'
stdout: false
)!
// Parse JSON output from crun state
state := json.decode(CrunState, result.output)!
if state.pid == 0 {
return error('Container ${self.name} has no PID (not running?)')
}
return state.pid
}
// Wait for container process to be fully ready
//
// After `crun start` returns, the container process may not be fully initialized yet.
// This method polls for the container's PID and verifies that /proc/<pid>/ns/net exists
// before returning. This ensures network setup can proceed without errors.
//
// The method polls in a tight loop, sleeping only briefly after many iterations, to minimize wait time.
fn (self Container) wait_for_process_ready() ! {
crun_root := '${self.factory.base_dir}/runtime'
// Poll for up to 100 iterations (tight loop; a tiny sleep is only added after many iterations)
// Most containers will be ready within the first few iterations
for i in 0 .. 100 {
// Try to get the container state
result := osal.exec(
cmd: 'crun --root ${crun_root} state ${self.name}'
stdout: false
) or {
// Container state not ready yet, continue polling
continue
}
// Parse the state to get PID
state := json.decode(CrunState, result.output) or {
// JSON not ready yet, continue polling
continue
}
// Check if we have a valid PID
if state.pid == 0 {
continue
}
// Verify that /proc/<pid>/ns/net exists (this is what nsenter needs)
ns_net_path := '/proc/${state.pid}/ns/net'
if os.exists(ns_net_path) {
// Process is ready!
return
}
// If we've tried many times, add a tiny yield to avoid busy-waiting
if i > 50 && i % 10 == 0 {
time.sleep(1 * time.millisecond)
}
}
return error('Container process did not become ready in time')
}
// Setup network for this container (thread-safe)
//
// Delegates to HeroPods.network_setup_container() which uses network_mutex
// for thread-safe IP allocation and network configuration.
fn (mut self Container) setup_network() ! {
// Get container PID
container_pid := self.pid()!
// Delegate to factory's network setup (thread-safe)
mut factory := self.factory
factory.network_setup_container(self.name, container_pid)!
}
// Cleanup network for this container (thread-safe)
//
// Delegates to HeroPods.network_cleanup_container() which uses network_mutex
// for thread-safe IP deallocation and network cleanup.
// Also cleans up Mycelium IPv6 overlay network if enabled.
fn (mut self Container) cleanup_network() ! {
mut factory := self.factory
factory.network_cleanup_container(self.name)!
// Cleanup Mycelium IPv6 overlay network if enabled
if factory.mycelium_enabled {
factory.mycelium_cleanup_container(self.name) or {
factory.logger.log(
cat: 'container'
log: 'Warning: Failed to cleanup Mycelium for container ${self.name}: ${err}'
logtype: .error
) or {}
}
}
}
// Check if container exists in crun (regardless of its state)
fn (self Container) container_exists_in_crun() !bool {
// Try to get container state - if it fails, container doesn't exist
crun_root := '${self.factory.base_dir}/runtime'
result := osal.exec(cmd: 'crun --root ${crun_root} state ${self.name}', stdout: false) or {
return false
}
// If we get here, the container exists (even if stopped/paused)
return result.exit_code == 0
}
// ContainerStatus represents the current state of a container
pub enum ContainerStatus {
running // Container is running
stopped // Container is stopped or doesn't exist
paused // Container is paused
unknown // Unknown status (error case)
}
// Get CPU time consumed by the container, in seconds (cumulative, read from the cgroup v2 cpu.stat counter)
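//
// The cgroup v2 cpu.stat file parsed below looks roughly like this (values in
// microseconds; numbers are illustrative):
//
//	usage_usec 2153478
//	user_usec 1890234
//	system_usec 263244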
pub fn (self Container) cpu_usage() !f64 {
// Use cgroup stats to get CPU usage
result := osal.exec(
cmd: 'cat /sys/fs/cgroup/system.slice/crun-${self.name}.scope/cpu.stat'
stdout: false
) or { return 0.0 }
for line in result.output.split_into_lines() {
if line.starts_with('usage_usec') {
usage := line.split(' ')[1].f64()
return usage / 1000000.0 // Convert microseconds to seconds
}
}
return 0.0
}
// Get memory usage in MB
pub fn (self Container) mem_usage() !f64 {
result := osal.exec(
cmd: 'cat /sys/fs/cgroup/system.slice/crun-${self.name}.scope/memory.current'
stdout: false
) or { return 0.0 }
bytes := result.output.trim_space().f64()
return bytes / (1024 * 1024) // Convert to MB
}
pub struct TmuxPaneArgs {
pub mut:
window_name string
pane_nr int
pane_name string // optional
cmd string // optional, will execute this cmd
reset bool // if true will reset everything and restart a cmd
env map[string]string // optional, will set these env vars in the pane
}
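// Attach to (or create) a pane in the shared `herorun` tmux session and optionally
// run a command inside the container via `crun exec`.
//
// Example (illustrative):
//
//	mut pane := c.tmux_pane(window_name: 'web', pane_nr: 0, cmd: 'sh')!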
pub fn (mut self Container) tmux_pane(args TmuxPaneArgs) !&tmux.Pane {
mut t := tmux.new()!
session_name := 'herorun'
mut session := if t.session_exist(session_name) {
t.session_get(session_name)!
} else {
t.session_create(name: session_name)!
}
// Get or create window
mut window := session.window_get(name: args.window_name) or {
session.window_new(name: args.window_name)!
}
// Get existing pane by number, or create a new one
mut pane := window.pane_get(args.pane_nr) or { window.pane_new()! }
if args.reset {
pane.clear()!
}
// Set environment variables if provided
for key, value in args.env {
pane.send_keys('export ${key}="${value}"')!
}
// Execute command if provided
if args.cmd != '' {
crun_root := '${self.factory.base_dir}/runtime'
pane.send_keys('crun --root ${crun_root} exec ${self.name} ${args.cmd}')!
}
self.tmux_pane = pane
return pane
}
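// Get (or lazily initialize) a builder.Node whose executor runs commands inside
// this container through crun.
//
// Example (illustrative):
//
//	mut n := c.node()!
//	println(n.exec(cmd: 'uname -a')!)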
pub fn (mut self Container) node() !&builder.Node {
// If node already initialized, return it
if self.node != none {
return self.node
}
mut b := builder.new()!
mut exec := builder.ExecutorCrun{
container_id: self.name
crun_root: '${self.factory.base_dir}/runtime'
debug: false
}
exec.init() or {
return error('Failed to init ExecutorCrun for container ${self.name}: ${err}')
}
// Create node using the factory method, then override the executor
mut node := b.node_new(name: 'container_${self.name}', ipaddr: 'localhost')!
node.executor = exec
node.platform = .alpine
node.cputype = .intel
node.done = map[string]string{}
node.environment = map[string]string{}
node.hostname = self.name
self.node = node
return node
}
// Get the crun configuration for this container
pub fn (self Container) config() !&crun.CrunConfig {
return self.crun_config or { return error('Container ${self.name} has no crun configuration') }
}
// Container configuration customization methods
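//
// Each setter mutates the in-memory crun config and returns &Container; call
// save_config() (defined below) to write the result back to disk, e.g.:
//
//	c.set_memory_limit(512)! // 512 MB
//	c.add_env('MODE', 'production')!
//	c.set_working_dir('/app')!
//	c.save_config()!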
pub fn (mut self Container) set_memory_limit(limit_mb u64) !&Container {
mut config := self.config()!
config.set_memory_limit(limit_mb * 1024 * 1024) // Convert MB to bytes
return &self
}
pub fn (mut self Container) set_cpu_limits(period u64, quota i64, shares u64) !&Container {
mut config := self.config()!
config.set_cpu_limits(period, quota, shares)
return &self
}
pub fn (mut self Container) add_mount(source string, destination string, mount_type crun.MountType, options []crun.MountOption) !&Container {
mut config := self.config()!
config.add_mount(source, destination, mount_type, options)
return &self
}
pub fn (mut self Container) add_capability(cap crun.Capability) !&Container {
mut config := self.config()!
config.add_capability(cap)
return &self
}
pub fn (mut self Container) remove_capability(cap crun.Capability) !&Container {
mut config := self.config()!
config.remove_capability(cap)
return &self
}
pub fn (mut self Container) add_env(key string, value string) !&Container {
mut config := self.config()!
config.add_env(key, value)
return &self
}
pub fn (mut self Container) set_working_dir(dir string) !&Container {
mut config := self.config()!
config.set_working_dir(dir)
return &self
}
// Save the current configuration to disk
pub fn (self Container) save_config() ! {
config := self.config()!
config_path := '${self.factory.base_dir}/configs/${self.name}/config.json'
config.save_to_file(config_path)!
}