- Replace console logging with logger.log calls - Improve network bridge creation robustness - Enhance network IP allocation and cleanup logic - Refactor network cleanup for better concurrency handling
603 lines
18 KiB
V
603 lines
18 KiB
V
module heropods
|
|
|
|
import incubaid.herolib.osal.tmux
|
|
import incubaid.herolib.osal.core as osal
|
|
import incubaid.herolib.virt.crun
|
|
import time
|
|
import incubaid.herolib.builder
|
|
import json
|
|
|
|
// Container lifecycle timeout constants.
// All values are in milliseconds and are consumed as `x * time.millisecond`
// by start()/stop() below.
const cleanup_retry_delay_ms = 500 // Time to wait for filesystem cleanup to complete

const sigterm_timeout_ms = 5000 // Time to wait for graceful shutdown (5 seconds)

const sigkill_wait_ms = 500 // Time to wait after SIGKILL

const stop_check_interval_ms = 500 // Interval to check if container stopped
|
// Container represents a running or stopped OCI container managed by crun
//
// Thread Safety:
// Container operations that interact with network configuration (start, stop, delete)
// are thread-safe because they delegate to HeroPods.network_* methods which use
// the network_mutex for protection.
@[heap]
pub struct Container {
pub mut:
	name        string            // Unique container name; also used as the crun container id
	node        ?&builder.Node    // Builder node for executing commands inside container (lazily created by node())
	tmux_pane   ?&tmux.Pane       // Optional tmux pane for interactive access (set by tmux_pane())
	crun_config ?&crun.CrunConfig // OCI runtime configuration; none until assigned by the factory
	factory     &HeroPods         // Reference to parent HeroPods instance (owns base_dir, logger, containers map)
}
|
|
|
|
// CrunState represents the JSON output of `crun state` command.
// Only the fields this module reads are declared; json.decode ignores
// any extra keys crun emits.
struct CrunState {
	id      string // Container ID
	status  string // Container status string as reported by crun (running, stopped, paused, ...)
	pid     int    // PID of container init process (0 when there is no live init)
	bundle  string // Path to OCI bundle
	created string // Creation timestamp
}
|
|
|
|
// create_in_crun creates the container in crun from its OCI bundle.
// Shared by the initial creation, the "File exists" retry, and the
// recreation of a stopped container in start().
fn (mut self Container) create_in_crun(crun_root string) ! {
	osal.exec(
		cmd: 'crun --root ${crun_root} create --bundle ${self.factory.base_dir}/configs/${self.name} ${self.name}'
		stdout: true
	)!
}

// Start the container
//
// This method handles the complete container startup lifecycle:
// 1. Creates the container in crun if it doesn't exist
// 2. Handles leftover state cleanup if creation fails with "File exists"
// 3. Recreates the container when it exists but is stopped (crun cannot
//    restart a stopped container)
// 4. Starts the container process
// 5. Sets up networking; on failure the container is stopped again so no
//    half-started container is left behind
//
// Thread Safety:
// Network setup is thread-safe via HeroPods.network_setup_container()
pub fn (mut self Container) start() ! {
	// Single source of truth for the crun state root used by every command below
	crun_root := '${self.factory.base_dir}/runtime'

	// Check if container exists in crun
	container_exists := self.container_exists_in_crun()!

	if !container_exists {
		// Container doesn't exist, create it first
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} does not exist, creating it...'
			logtype: .stdout
		) or {}
		// Try to create the container; if it fails with "File exists" error,
		// force delete any leftover state and retry once.
		self.create_in_crun(crun_root) or {
			if err.msg().contains('File exists') {
				self.factory.logger.log(
					cat: 'container'
					log: 'Container creation failed with "File exists", attempting to clean up leftover state...'
					logtype: .stdout
				) or {}
				// Force delete any leftover state - try multiple cleanup approaches
				osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {}
				osal.exec(cmd: 'crun delete ${self.name}', stdout: false) or {} // Also try default root
				// Clean up any leftover runtime directories
				osal.exec(cmd: 'rm -rf ${crun_root}/${self.name}', stdout: false) or {}
				osal.exec(cmd: 'rm -rf /run/crun/${self.name}', stdout: false) or {}
				// Wait a moment for cleanup to complete
				time.sleep(cleanup_retry_delay_ms * time.millisecond)
				// Retry creation; this time any failure is fatal
				self.create_in_crun(crun_root)!
			} else {
				return err
			}
		}
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} created'
			logtype: .stdout
		) or {}
	}

	status := self.status()!
	if status == .running {
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} is already running'
			logtype: .stdout
		) or {}
		return
	}

	// If container exists but is stopped, we need to delete and recreate it
	// because crun doesn't allow restarting a stopped container
	if container_exists && status != .running {
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} exists but is stopped, recreating...'
			logtype: .stdout
		) or {}
		osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {}
		self.create_in_crun(crun_root)!
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} recreated'
			logtype: .stdout
		) or {}
	}

	// Start the container (crun start doesn't have --detach flag)
	osal.exec(cmd: 'crun --root ${crun_root} start ${self.name}', stdout: true) or {
		return error('Failed to start container: ${err}')
	}

	// Setup network for the container (thread-safe)
	// If this fails, stop the container to clean up
	self.setup_network() or {
		self.factory.logger.log(
			cat: 'container'
			log: 'Network setup failed, stopping container: ${err}'
			logtype: .error
		) or {}
		// Use stop() method to properly clean up (kills process, cleans network, etc.)
		// Ignore errors from stop since we're already in an error path
		self.stop() or {
			self.factory.logger.log(
				cat: 'container'
				log: 'Failed to stop container during cleanup: ${err}'
				logtype: .error
			) or {}
		}
		return error('Failed to setup network for container: ${err}')
	}

	self.factory.logger.log(
		cat: 'container'
		log: 'Container ${self.name} started'
		logtype: .stdout
	) or {}
}
|
|
|
|
// Stop the container gracefully (SIGTERM) or forcefully (SIGKILL)
//
// This method:
// 1. Sends SIGTERM for graceful shutdown
// 2. Polls up to sigterm_timeout_ms (in stop_check_interval_ms steps) for a graceful stop
// 3. Sends SIGKILL if still running after the timeout and verifies the result
// 4. Cleans up network resources (thread-safe) on every successful path
//
// Thread Safety:
// Network cleanup is thread-safe via HeroPods.network_cleanup_container()
pub fn (mut self Container) stop() ! {
	status := self.status()!
	if status == .stopped {
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} is already stopped'
			logtype: .stdout
		) or {}
		return
	}

	crun_root := '${self.factory.base_dir}/runtime'

	// Send SIGTERM for graceful shutdown
	osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGTERM', stdout: false) or {
		self.factory.logger.log(
			cat: 'container'
			log: 'Failed to send SIGTERM (container may already be stopped): ${err}'
			logtype: .stdout
		) or {}
	}

	// Wait up to sigterm_timeout_ms for graceful shutdown
	mut stopped_gracefully := false
	max_attempts := sigterm_timeout_ms / stop_check_interval_ms
	for _ in 0 .. max_attempts {
		time.sleep(stop_check_interval_ms * time.millisecond)
		current_status := self.status() or {
			// If we can't get status, assume it's stopped (container may have been deleted)
			ContainerStatus.stopped
		}
		if current_status == .stopped {
			self.factory.logger.log(
				cat: 'container'
				log: 'Container ${self.name} stopped gracefully'
				logtype: .stdout
			) or {}
			stopped_gracefully = true
			break
		}
	}

	if !stopped_gracefully {
		// Force kill if still running after timeout
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} did not stop gracefully, force killing'
			logtype: .stdout
		) or {}
		osal.exec(cmd: 'crun --root ${crun_root} kill ${self.name} SIGKILL', stdout: false) or {
			self.factory.logger.log(
				cat: 'container'
				log: 'Failed to send SIGKILL: ${err}'
				logtype: .error
			) or {}
		}

		// Wait for SIGKILL to take effect
		time.sleep(sigkill_wait_ms * time.millisecond)

		// Verify it's actually stopped
		final_status := self.status() or {
			// If we can't get status, assume it's stopped (container may have been deleted)
			ContainerStatus.stopped
		}
		if final_status != .stopped {
			return error('Failed to stop container ${self.name} - status: ${final_status}')
		}
	}

	// Cleanup network resources (thread-safe) - single shared exit path for
	// both the graceful and the force-kill outcomes
	self.cleanup_network()!

	self.factory.logger.log(
		cat: 'container'
		log: 'Container ${self.name} stopped'
		logtype: .stdout
	) or {}
}
|
|
|
|
// Delete the container
//
// This method:
// 1. Checks if container exists in crun; when absent it still does a
//    best-effort network cleanup and removes the cache entry
// 2. Stops the container (which cleans up network)
// 3. Deletes the container from crun
// 4. Removes from factory's container cache
//
// Thread Safety:
// Network cleanup is thread-safe via stop() -> cleanup_network()
pub fn (mut self Container) delete() ! {
	// Check if container exists before trying to delete
	if !self.container_exists_in_crun()! {
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} does not exist in crun'
			logtype: .stdout
		) or {}
		// Still cleanup network resources in case they exist (thread-safe)
		self.cleanup_network() or {
			self.factory.logger.log(
				cat: 'container'
				log: 'Network cleanup failed (may not exist): ${err}'
				logtype: .stdout
			) or {}
		}
		// Remove from factory's container cache only after all cleanup is done.
		// map.delete is a no-op when the key is absent, so no membership check needed.
		self.factory.containers.delete(self.name)
		self.factory.logger.log(
			cat: 'container'
			log: 'Container ${self.name} removed from cache'
			logtype: .stdout
		) or {}
		return
	}

	// Stop the container (this will cleanup network via stop())
	self.stop()!

	// Delete the container from crun
	crun_root := '${self.factory.base_dir}/runtime'
	osal.exec(cmd: 'crun --root ${crun_root} delete ${self.name}', stdout: false) or {
		self.factory.logger.log(
			cat: 'container'
			log: 'Failed to delete container from crun: ${err}'
			logtype: .error
		) or {}
	}

	// Remove from factory's container cache only after all cleanup is complete
	self.factory.containers.delete(self.name)

	self.factory.logger.log(
		cat: 'container'
		log: 'Container ${self.name} deleted'
		logtype: .stdout
	) or {}
}
|
|
|
|
// Execute command inside the container
//
// Starts the container first when it is not already running, then runs the
// command through the builder node executor. On failure it checks whether
// the container vanished mid-execution to give a clearer error message.
pub fn (mut self Container) exec(cmd_ osal.Command) !string {
	// Make sure there is a live container to execute in
	current := self.status()!
	if current != .running {
		self.start()!
	}

	// Execution happens through the builder node attached to this container
	mut container_node := self.node()!
	self.factory.logger.log(
		cat: 'container'
		log: 'Executing command in container ${self.name}: ${cmd_.cmd}'
		logtype: .stdout
	) or {}

	// Run the command; on error, distinguish "container deleted" from a
	// plain command failure for better diagnostics
	return container_node.exec(cmd: cmd_.cmd, stdout: cmd_.stdout) or {
		if !self.container_exists_in_crun()! {
			return error('Container ${self.name} was deleted during command execution')
		}
		return error('Command execution failed in container ${self.name}: ${err}')
	}
}
|
|
|
|
// Return the current state of the container as reported by `crun state`.
// A container crun does not know about is reported as .stopped; any other
// failure (permissions, crun missing, bad JSON) is propagated as an error.
pub fn (self Container) status() !ContainerStatus {
	runtime_root := '${self.factory.base_dir}/runtime'
	exec_result := osal.exec(cmd: 'crun --root ${runtime_root} state ${self.name}', stdout: false) or {
		// A missing container is an expected condition and maps to .stopped;
		// anything else is a real error that must reach the caller.
		msg := err.msg().to_lower()
		is_missing := msg.contains('does not exist') || msg.contains('not found')
			|| msg.contains('no such')
		if is_missing {
			return .stopped
		}
		return error('Failed to get container status: ${err}')
	}

	// Parse the JSON document emitted by `crun state`
	state := json.decode(CrunState, exec_result.output) or {
		return error('Failed to parse container state JSON: ${err}')
	}

	// Map crun's textual status onto our enum; anything unexpected is .unknown
	// (cannot log here because the receiver is immutable)
	return match state.status {
		'running' { ContainerStatus.running }
		'stopped' { ContainerStatus.stopped }
		'paused' { ContainerStatus.paused }
		else { ContainerStatus.unknown }
	}
}
|
|
|
|
// Get the PID of the container's init process.
// Errors when crun does not know the container or reports PID 0
// (no live init process).
pub fn (self Container) pid() !int {
	runtime_root := '${self.factory.base_dir}/runtime'
	state_result := osal.exec(
		cmd: 'crun --root ${runtime_root} state ${self.name}'
		stdout: false
	)!

	// `crun state` prints a JSON document that includes the init PID
	state := json.decode(CrunState, state_result.output)!

	// A zero PID means crun tracks the container but nothing is running in it
	if state.pid == 0 {
		return error('Container ${self.name} has no PID (not running?)')
	}
	return state.pid
}
|
|
|
|
// Setup network for this container (thread-safe)
//
// Resolves the container's init PID and hands it to
// HeroPods.network_setup_container(), which holds the network_mutex
// while allocating an IP and wiring up the network namespace.
fn (mut self Container) setup_network() ! {
	pid_of_container := self.pid()!
	mut owner := self.factory
	owner.network_setup_container(self.name, pid_of_container)!
}
|
|
|
|
// Cleanup network for this container (thread-safe)
//
// Hands off to HeroPods.network_cleanup_container(), which holds the
// network_mutex while releasing the IP and tearing down the network config.
fn (mut self Container) cleanup_network() ! {
	mut owner := self.factory
	owner.network_cleanup_container(self.name)!
}
|
|
|
|
// Check if container exists in crun (regardless of its state).
// `crun state` fails for unknown containers, so any exec error is
// treated as "does not exist".
fn (self Container) container_exists_in_crun() !bool {
	runtime_root := '${self.factory.base_dir}/runtime'
	state_result := osal.exec(cmd: 'crun --root ${runtime_root} state ${self.name}', stdout: false) or {
		return false
	}

	// A zero exit code means crun tracks the container (even stopped/paused)
	return state_result.exit_code == 0
}
|
|
|
|
// ContainerStatus represents the current state of a container.
// Values are derived from the `status` string of `crun state` in
// Container.status().
pub enum ContainerStatus {
	running // Container is running
	stopped // Container is stopped or doesn't exist in crun at all
	paused  // Container is paused
	unknown // Unknown status (crun reported a status string we don't map)
}
|
|
|
|
// Get cumulative CPU time consumed by the container, in seconds.
//
// NOTE(review): cgroup v2 `usage_usec` is a cumulative counter, so dividing
// by 1e6 yields total CPU seconds consumed since the container started -
// NOT an instantaneous percentage (the previous comment was misleading).
// Callers wanting a utilization rate must sample this twice and diff.
// Returns 0.0 when the cgroup file is unavailable or has no usage_usec line.
pub fn (self Container) cpu_usage() !f64 {
	// Read cgroup v2 CPU stats for the crun systemd scope
	result := osal.exec(
		cmd: 'cat /sys/fs/cgroup/system.slice/crun-${self.name}.scope/cpu.stat'
		stdout: false
	) or { return 0.0 }

	for line in result.output.split_into_lines() {
		if line.starts_with('usage_usec') {
			fields := line.split(' ')
			if fields.len < 2 {
				// Malformed line: previously indexing [1] here would panic
				continue
			}
			return fields[1].f64() / 1000000.0 // microseconds -> seconds
		}
	}
	return 0.0
}
|
|
|
|
// Get memory usage in MB.
// Reads the cgroup v2 `memory.current` file of the crun systemd scope;
// returns 0.0 when the file is unavailable (container not running, etc.).
pub fn (self Container) mem_usage() !f64 {
	raw := osal.exec(
		cmd: 'cat /sys/fs/cgroup/system.slice/crun-${self.name}.scope/memory.current'
		stdout: false
	) or { return 0.0 }

	byte_count := raw.output.trim_space().f64()
	return byte_count / (1024 * 1024) // bytes -> MB
}
|
|
|
|
// TmuxPaneArgs configures which tmux window/pane to (re)use for a container
// and what to run in it; consumed by Container.tmux_pane().
pub struct TmuxPaneArgs {
pub mut:
	window_name string // tmux window to get or create inside the shared session
	pane_nr     int    // pane number to reuse; a new pane is created when not found
	pane_name   string // optional
	cmd         string // optional, will execute this cmd (via `crun exec`)
	reset       bool   // if true will reset everything and restart a cmd
	env         map[string]string // optional, will set these env vars in the pane
}
|
|
|
|
// Open (or reuse) a tmux pane attached to this container.
//
// Ensures the shared 'herorun' tmux session exists, gets or creates the
// requested window and pane, optionally clears it, exports the given env
// vars, and finally runs `crun exec` with args.cmd when one is provided.
// The pane is cached on the container and returned.
pub fn (mut self Container) tmux_pane(args TmuxPaneArgs) !&tmux.Pane {
	mut tm := tmux.new()!
	session_name := 'herorun'

	// Reuse the shared session when it already exists, otherwise create it
	mut sess := if tm.session_exist(session_name) {
		tm.session_get(session_name)!
	} else {
		tm.session_create(name: session_name)!
	}

	// Window: fetch by name or create a fresh one
	mut win := sess.window_get(name: args.window_name) or {
		sess.window_new(name: args.window_name)!
	}

	// Pane: fetch by number or create a fresh one
	mut target_pane := win.pane_get(args.pane_nr) or { win.pane_new()! }

	if args.reset {
		target_pane.clear()!
	}

	// Export requested environment variables into the pane's shell
	for key, value in args.env {
		target_pane.send_keys('export ${key}="${value}"')!
	}

	// Kick off the command inside the container when one was given
	if args.cmd != '' {
		crun_root := '${self.factory.base_dir}/runtime'
		target_pane.send_keys('crun --root ${crun_root} exec ${self.name} ${args.cmd}')!
	}

	self.tmux_pane = target_pane
	return target_pane
}
|
|
|
|
// Return a builder.Node that executes commands inside this container.
//
// The node is created lazily on first call and cached in self.node;
// subsequent calls return the cached instance. The executor is a crun-exec
// based ExecutorCrun pointed at this factory's runtime root.
//
// NOTE(review): platform/.alpine and cputype/.intel are hard-coded here -
// presumably all heropods images are Alpine on x86; confirm before reusing
// on other images/architectures.
pub fn (mut self Container) node() !&builder.Node {
	// If node already initialized, return it
	if self.node != none {
		return self.node
	}

	mut b := builder.new()!

	// Executor that runs every command via `crun exec` in this container
	mut exec := builder.ExecutorCrun{
		container_id: self.name
		crun_root: '${self.factory.base_dir}/runtime'
		debug: false
	}

	exec.init() or {
		return error('Failed to init ExecutorCrun for container ${self.name}: ${err}')
	}

	// Create node using the factory method, then override the executor
	mut node := b.node_new(name: 'container_${self.name}', ipaddr: 'localhost')!
	node.executor = exec
	node.platform = .alpine
	node.cputype = .intel
	// Start with clean caches so nothing from the host node leaks in
	node.done = map[string]string{}
	node.environment = map[string]string{}
	node.hostname = self.name

	self.node = node
	return node
}
|
|
|
|
// Get the crun configuration for this container.
// Errors when the container was constructed without a configuration.
pub fn (self Container) config() !&crun.CrunConfig {
	if cfg := self.crun_config {
		return cfg
	}
	return error('Container ${self.name} has no crun configuration')
}
|
|
|
|
// Container configuration customization methods

// Set the container memory limit; the value is given in megabytes and
// converted to bytes for the OCI config. Chainable.
pub fn (mut self Container) set_memory_limit(limit_mb u64) !&Container {
	mut cfg := self.config()!
	limit_bytes := limit_mb * 1024 * 1024 // MB -> bytes
	cfg.set_memory_limit(limit_bytes)
	return &self
}
|
|
|
|
// Set CPU period/quota/shares on the OCI config. Chainable.
pub fn (mut self Container) set_cpu_limits(period u64, quota i64, shares u64) !&Container {
	mut cfg := self.config()!
	cfg.set_cpu_limits(period, quota, shares)
	return &self
}
|
|
|
|
// Add a mount entry (source -> destination) to the OCI config. Chainable.
pub fn (mut self Container) add_mount(source string, destination string, mount_type crun.MountType, options []crun.MountOption) !&Container {
	mut cfg := self.config()!
	cfg.add_mount(source, destination, mount_type, options)
	return &self
}
|
|
|
|
// Grant a Linux capability to the container. Chainable.
pub fn (mut self Container) add_capability(cap crun.Capability) !&Container {
	mut cfg := self.config()!
	cfg.add_capability(cap)
	return &self
}
|
|
|
|
// Revoke a Linux capability from the container. Chainable.
pub fn (mut self Container) remove_capability(cap crun.Capability) !&Container {
	mut cfg := self.config()!
	cfg.remove_capability(cap)
	return &self
}
|
|
|
|
// Add an environment variable to the container's OCI config. Chainable.
pub fn (mut self Container) add_env(key string, value string) !&Container {
	mut cfg := self.config()!
	cfg.add_env(key, value)
	return &self
}
|
|
|
|
// Set the working directory for the container's init process. Chainable.
pub fn (mut self Container) set_working_dir(dir string) !&Container {
	mut cfg := self.config()!
	cfg.set_working_dir(dir)
	return &self
}
|
|
|
|
// Save the current configuration to disk.
// Writes the OCI config.json into this container's bundle directory so the
// next `crun create` picks up any customization done via the setters above.
pub fn (self Container) save_config() ! {
	cfg := self.config()!
	target_path := '${self.factory.base_dir}/configs/${self.name}/config.json'
	cfg.save_to_file(target_path)!
}
|