refactor wip

This commit is contained in:
Timur Gordon
2025-08-05 12:19:38 +02:00
parent 8ed40ce99c
commit 7a652c9c3c
51 changed files with 6183 additions and 840 deletions

View File

@@ -7,19 +7,35 @@ edition = "2021"
name = "supervisor"
path = "cmd/supervisor.rs"
[[bin]]
name = "hive-supervisor"
path = "cmd/hive_supervisor.rs"
[[bin]]
name = "hive-supervisor-tui"
path = "cmd/hive_supervisor_tui.rs"
[[bin]]
name = "hive-supervisor-tui-safe"
path = "cmd/hive_supervisor_tui_safe.rs"
[dependencies]
clap = { version = "4.4", features = ["derive"] }
env_logger = "0.10"
redis = { version = "0.25.0", features = ["tokio-comp"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
toml = "0.8"
uuid = { version = "1.6", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
log = "0.4"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } # For async main in examples, and general async
colored = "2.0"
hero_job = { path = "../job" }
# zinit-client = "0.4.0"  # disabled: a table may not define the same key twice; the path override below takes effect
zinit-client = { path = "/Users/timurgordon/code/github/threefoldtech/zinit/zinit-client" }  # TODO: machine-specific absolute path; replace with a relative path or published version before sharing
ratatui = "0.28"
crossterm = "0.28"
anyhow = "1.0"
[dev-dependencies] # For examples later
env_logger = "0.10"

View File

@@ -8,8 +8,6 @@ The lifecycle management system provides:
- **Worker Process Management**: Start, stop, restart, and monitor worker binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle workers
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management
- **Graceful Shutdown**: Clean termination of worker processes
## Architecture
@@ -313,3 +311,9 @@ redis-cli keys "hero:job:*"
- **User Permissions**: Run workers with appropriate user permissions
- **Network Security**: Secure Redis and Zinit socket access
- **Binary Validation**: Verify worker binary integrity before deployment
## Future
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management

View File

@@ -1,157 +1,66 @@
# Rhai Client Binary
# Supervisor CLI
A command-line client for executing Rhai scripts on remote workers via Redis.
A command-line interface for the Hero Supervisor.
## Binary: `client`
## Binary: `hive-supervisor`
### Installation
Build the binary:
```bash
cargo build --bin client --release
cargo build --bin hive-supervisor --release
```
### Usage
```bash
# Basic usage - requires caller and circle keys
client --caller-key <CALLER_KEY> --circle-key <CIRCLE_KEY>
# Execute inline script
client -c <CALLER_KEY> -k <CIRCLE_KEY> --script "print('Hello World!')"
# Execute script from file
client -c <CALLER_KEY> -k <CIRCLE_KEY> --file script.rhai
# Use specific worker (defaults to circle key)
client -c <CALLER_KEY> -k <CIRCLE_KEY> -w <WORKER_KEY> --script "2 + 2"
# Custom Redis and timeout
client -c <CALLER_KEY> -k <CIRCLE_KEY> --redis-url redis://localhost:6379/1 --timeout 60
# Remove timestamps from logs
client -c <CALLER_KEY> -k <CIRCLE_KEY> --no-timestamp
# Increase verbosity
client -c <CALLER_KEY> -k <CIRCLE_KEY> -v --script "debug_info()"
```
### Command-Line Options
| Option | Short | Default | Description |
|--------|-------|---------|-------------|
| `--caller-key` | `-c` | **Required** | Caller public key (your identity) |
| `--circle-key` | `-k` | **Required** | Circle public key (execution context) |
| `--worker-key` | `-w` | `circle-key` | Worker public key (target worker) |
| `--redis-url` | `-r` | `redis://localhost:6379` | Redis connection URL |
| `--script` | `-s` | | Rhai script to execute |
| `--file` | `-f` | | Path to Rhai script file |
| `--timeout` | `-t` | `30` | Timeout for script execution (seconds) |
| `--no-timestamp` | | `false` | Remove timestamps from log output |
| `--verbose` | `-v` | | Increase verbosity (stackable) |
### Execution Modes
#### Inline Script Execution
```bash
# Execute a simple calculation
client -c caller_123 -k circle_456 -s "let result = 2 + 2; print(result);"
# Execute with specific worker
client -c caller_123 -k circle_456 -w worker_789 -s "get_user_data()"
```
#### Script File Execution
```bash
# Execute script from file
client -c caller_123 -k circle_456 -f examples/data_processing.rhai
# Execute with custom timeout
client -c caller_123 -k circle_456 -f long_running_script.rhai -t 120
```
#### Interactive Mode
```bash
# Enter interactive REPL mode (when no script or file provided)
client -c caller_123 -k circle_456
# Interactive mode with verbose logging
client -c caller_123 -k circle_456 -v --no-timestamp
```
### Interactive Mode
When no script (`-s`) or file (`-f`) is provided, the client enters interactive mode:
# Basic usage
hive-supervisor --config <CONFIG_PATH>
```
🔗 Starting Rhai Client
📋 Configuration:
Caller Key: caller_123
Circle Key: circle_456
Worker Key: circle_456
Redis URL: redis://localhost:6379
Timeout: 30s
✅ Connected to Redis at redis://localhost:6379
🎮 Entering interactive mode
Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.
rhai> let x = 42; print(x);
Status: completed
Output: 42
rhai> exit
👋 Goodbye!
Where the config is a TOML file with the following structure:
```toml
[global]
redis_url = "redis://localhost:6379"
[osis_worker]
binary_path = "/path/to/osis_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[sal_worker]
binary_path = "/path/to/sal_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[v_worker]
binary_path = "/path/to/v_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[python_worker]
binary_path = "/path/to/python_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
```
### Configuration Examples
#### Development Usage
```bash
# Simple development client
client -c dev_user -k dev_circle
Lets have verbosity settings etc.
CLI Offers a few commands:
# Development with clean logs
client -c dev_user -k dev_circle --no-timestamp -v
```
workers:
start
stop
restart
status
logs
list
#### Production Usage
```bash
# Production client with specific worker
client \
--caller-key prod_user_123 \
--circle-key prod_circle_456 \
--worker-key prod_worker_789 \
--redis-url redis://redis-cluster:6379/0 \
--timeout 300 \
--file production_script.rhai
```
#### Batch Processing
```bash
# Process multiple scripts
for script in scripts/*.rhai; do
client -c batch_user -k batch_circle -f "$script" --no-timestamp
done
```
### Key Concepts
- **Caller Key**: Your identity - used for authentication and tracking
- **Circle Key**: Execution context - defines the environment/permissions
- **Worker Key**: Target worker - which worker should execute the script (defaults to circle key)
### Error Handling
The client provides clear error messages for:
- Missing required keys
- Redis connection failures
- Script execution timeouts
- Worker unavailability
- Script syntax errors
### Dependencies
- `rhai_supervisor`: Core client library for Redis-based script execution
- `redis`: Redis client for task queue communication
- `clap`: Command-line argument parsing
- `env_logger`: Logging infrastructure
- `tokio`: Async runtime
jobs:
create
start
stop
restart
status
logs
list
repl: you can enter interactive mode to run scripts; however, predefine caller_id, context_id, and worker type so the supervisor dispatches jobs accordingly

View File

@@ -0,0 +1,365 @@
use anyhow::Result;
use clap::Parser;
use crossterm::{
event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyEventKind},
execute,
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
};
use hero_supervisor::{Supervisor, SupervisorBuilder};
use zinit_client::ZinitClient;
use log::{error, info};
use ratatui::{
backend::CrosstermBackend,
layout::{Constraint, Direction, Layout, Rect},
style::{Color, Modifier, Style},
text::Line,
widgets::{
Block, Borders, List, ListItem, Paragraph, Tabs, Wrap,
},
Frame, Terminal,
};
use std::{
io,
path::PathBuf,
sync::Arc,
time::{Duration, Instant},
};
use tokio::time::sleep;
use toml;
use serde::Deserialize;
/// Command-line arguments for the Hero Supervisor TUI binary.
#[derive(Parser)]
#[command(name = "hive-supervisor-tui")]
#[command(about = "Hero Supervisor Terminal User Interface")]
struct Args {
    /// Path to the TOML configuration file (parsed into `Config`).
    #[arg(short, long, help = "Configuration file path")]
    config: PathBuf,
    /// When set, the default log filter becomes `debug` instead of `info`.
    #[arg(short, long, help = "Enable verbose logging")]
    verbose: bool,
}
/// Top-level TOML configuration.
///
/// The `[global]` table maps to `global`; every other top-level table is
/// captured by the flattened `workers` map, keyed by its table name
/// (e.g. `osis_worker`, `sal_worker`).
#[derive(Debug, Deserialize)]
struct Config {
    global: GlobalConfig,
    // All remaining tables must deserialize as worker configurations.
    #[serde(flatten)]
    workers: std::collections::HashMap<String, WorkerConfigToml>,
}
/// Settings shared by the supervisor and all workers.
#[derive(Debug, Deserialize)]
struct GlobalConfig {
    /// Redis connection URL used for job queuing.
    redis_url: String,
}
/// One worker's table in the configuration file.
#[derive(Debug, Deserialize)]
struct WorkerConfigToml {
    /// Filesystem path to the worker executable.
    binary_path: String,
    /// Optional extra environment variables passed to the worker process.
    env_vars: Option<std::collections::HashMap<String, String>>,
}
/// Identifier for each top-level tab in the TUI.
#[derive(Debug, Clone, PartialEq)]
enum TabId {
    Dashboard,
    Workers,
    Jobs,
    Logs,
}

impl TabId {
    /// Every tab, in display order (left to right in the tab bar).
    fn all() -> Vec<TabId> {
        [TabId::Dashboard, TabId::Workers, TabId::Jobs, TabId::Logs].to_vec()
    }

    /// Human-readable label shown in the tab bar.
    fn title(&self) -> &str {
        use TabId::*;
        match self {
            Dashboard => "Dashboard",
            Workers => "Workers",
            Jobs => "Jobs",
            Logs => "Logs",
        }
    }
}
/// Mutable TUI state threaded through the render and event loops.
struct App {
    /// Handle to the supervisor (workers are started before the TUI runs).
    supervisor: Arc<Supervisor>,
    /// Tab currently shown in the body area.
    current_tab: TabId,
    /// Set by `handle_key` when the user requests exit ('q').
    should_quit: bool,
    /// In-memory activity log rendered on the Dashboard and Logs tabs.
    logs: Vec<String>,
    /// Time of construction. NOTE(review): never read or refreshed anywhere
    /// in this file — confirm whether it is still needed.
    last_update: Instant,
}
impl App {
    /// Create the initial UI state for an already-running supervisor.
    fn new(supervisor: Arc<Supervisor>) -> Self {
        Self {
            supervisor,
            current_tab: TabId::Dashboard,
            should_quit: false,
            logs: vec!["TUI started successfully".to_string()],
            last_update: Instant::now(),
        }
    }

    /// Select the tab to the right, wrapping around past the last one.
    fn next_tab(&mut self) {
        let order = TabId::all();
        let here = order.iter().position(|tab| *tab == self.current_tab).unwrap_or(0);
        self.current_tab = order[(here + 1) % order.len()].clone();
    }

    /// Select the tab to the left, wrapping around past the first one.
    fn prev_tab(&mut self) {
        let order = TabId::all();
        let here = order.iter().position(|tab| *tab == self.current_tab).unwrap_or(0);
        let back = (here + order.len() - 1) % order.len();
        self.current_tab = order[back].clone();
    }

    /// Append a UTC-timestamped line to the in-memory log, keeping at most
    /// the 100 most recent entries.
    fn add_log(&mut self, message: String) {
        let stamped = format!("[{}] {}",
            chrono::Utc::now().format("%H:%M:%S"),
            message
        );
        self.logs.push(stamped);
        while self.logs.len() > 100 {
            self.logs.remove(0);
        }
    }

    /// React to a key press; returns `true` when the app should exit.
    fn handle_key(&mut self, key: KeyCode) -> bool {
        match key {
            KeyCode::Char('q') => {
                self.should_quit = true;
                true
            }
            KeyCode::Tab => {
                self.next_tab();
                false
            }
            KeyCode::BackTab => {
                self.prev_tab();
                false
            }
            _ => false,
        }
    }
}
/// Draw the whole UI: a three-row tab bar on top, the active tab's content
/// in the remaining area below.
fn render_ui(f: &mut Frame, app: &mut App) {
    let chunks = Layout::default()
        .direction(Direction::Vertical)
        .constraints([Constraint::Length(3), Constraint::Min(0)].as_ref())
        .split(f.area());

    // Tab bar. Build both the titles and the selected index from a single
    // `TabId::all()` call (the original allocated the Vec a second time just
    // to compute the selected position).
    let tabs_list = TabId::all();
    let tab_titles: Vec<Line> = tabs_list
        .iter()
        .map(|t| Line::from(t.title()))
        .collect();
    let selected_tab = tabs_list.iter().position(|t| *t == app.current_tab).unwrap_or(0);
    let tabs = Tabs::new(tab_titles)
        .block(Block::default().borders(Borders::ALL).title("Hero Supervisor TUI"))
        .select(selected_tab)
        .style(Style::default().fg(Color::Cyan))
        .highlight_style(Style::default().add_modifier(Modifier::BOLD).bg(Color::Black));
    f.render_widget(tabs, chunks[0]);

    // Body: delegate to the renderer for whichever tab is active.
    match app.current_tab {
        TabId::Dashboard => render_dashboard(f, chunks[1], app),
        TabId::Workers => render_workers(f, chunks[1], app),
        TabId::Jobs => render_jobs(f, chunks[1], app),
        TabId::Logs => render_logs(f, chunks[1], app),
    }
}
/// Dashboard tab: a static status summary plus the ten most recent log lines.
fn render_dashboard(f: &mut Frame, area: Rect, app: &App) {
    let rows = Layout::default()
        .direction(Direction::Vertical)
        .constraints([Constraint::Length(7), Constraint::Min(0)].as_ref())
        .split(area);

    // Static status block — reaching this point means startup succeeded.
    let status_text = "Status: ✓ Running\nWorkers: Started successfully\nJobs: Ready for processing\n\nPress 'q' to quit, Tab to navigate";
    f.render_widget(
        Paragraph::new(status_text)
            .block(Block::default().borders(Borders::ALL).title("System Status"))
            .wrap(Wrap { trim: true }),
        rows[0],
    );

    // Newest-first view of (at most) the last ten log entries.
    let recent: Vec<ListItem> = app
        .logs
        .iter()
        .rev()
        .take(10)
        .map(|entry| ListItem::new(entry.as_str()))
        .collect();
    f.render_widget(
        List::new(recent).block(Block::default().borders(Borders::ALL).title("Recent Activity")),
        rows[1],
    );
}
/// Workers tab: placeholder text until live status checks are wired in.
fn render_workers(f: &mut Frame, area: Rect, _app: &App) {
    let body = Paragraph::new("Workers tab - Status checking not implemented yet to avoid system issues")
        .wrap(Wrap { trim: true })
        .block(Block::default().borders(Borders::ALL).title("Workers"));
    f.render_widget(body, area);
}
/// Jobs tab: placeholder text until job monitoring is wired in.
fn render_jobs(f: &mut Frame, area: Rect, _app: &App) {
    let body = Paragraph::new("Jobs tab - Job monitoring not implemented yet to avoid system issues")
        .wrap(Wrap { trim: true })
        .block(Block::default().borders(Borders::ALL).title("Jobs"));
    f.render_widget(body, area);
}
/// Logs tab: the full in-memory log, oldest entry first.
fn render_logs(f: &mut Frame, area: Rect, app: &App) {
    let entries: Vec<ListItem> = app
        .logs
        .iter()
        .map(|entry| ListItem::new(entry.as_str()))
        .collect();
    let list = List::new(entries)
        .block(Block::default().borders(Borders::ALL).title("System Logs"));
    f.render_widget(list, area);
}
/// Main event loop: draw, poll for input (100 ms), handle key presses,
/// throttle, repeat — until the user asks to quit.
///
/// NOTE(review): `event::poll`/`event::read` are blocking calls inside an
/// async fn, so this loop occupies a tokio worker thread while polling.
/// Acceptable for a single-purpose TUI binary; confirm before reusing this
/// loop inside a larger async application.
async fn run_app(
    terminal: &mut Terminal<CrosstermBackend<io::Stdout>>,
    app: &mut App,
) -> Result<()> {
    loop {
        terminal.draw(|f| render_ui(f, app))?;

        // Wait up to 100 ms for an input event; only key *presses* are
        // forwarded (release/repeat events are ignored).
        if event::poll(Duration::from_millis(100))? {
            if let Event::Key(key) = event::read()? {
                if key.kind == KeyEventKind::Press {
                    app.handle_key(key.code);
                }
            }
        }

        // Single exit check: `handle_key` sets `should_quit` exactly when it
        // returned true, so the original's second break was redundant.
        if app.should_quit {
            break;
        }

        // Yield to the async runtime and throttle redraws.
        sleep(Duration::from_millis(50)).await;
    }
    Ok(())
}
/// Entry point: fail-fast initialization (config → Zinit probe → supervisor
/// build → worker startup), then hand control to the TUI event loop and
/// restore the terminal on the way out.
#[tokio::main]
async fn main() -> Result<()> {
    let args = Args::parse();

    // Initialize logging: `-v` switches the default filter from info to debug.
    if args.verbose {
        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")).init();
    } else {
        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
    }
    info!("Hero Supervisor TUI - Fail-fast initialization");

    // Step 1: Load and parse the TOML configuration.
    info!("Step 1/4: Loading configuration from {:?}", args.config);
    let config_content = std::fs::read_to_string(&args.config)
        .map_err(|e| anyhow::anyhow!("Failed to read config file: {}", e))?;
    let config: Config = toml::from_str(&config_content)
        .map_err(|e| anyhow::anyhow!("Failed to parse config file: {}", e))?;
    info!("✓ Configuration loaded successfully");

    // Step 2: Probe Zinit by querying a dummy service name.
    // NOTE(review): the socket path is hard-coded to /tmp/zinit.sock here,
    // while the example docs mention a configurable `zinit_socket_path` —
    // confirm which is authoritative.
    info!("Step 2/4: Checking if Zinit is running...");
    let zinit_client = ZinitClient::new("/tmp/zinit.sock");
    match zinit_client.status("_test_connectivity").await {
        Ok(_) => {
            info!("✓ Zinit is running and accessible");
        }
        Err(e) => {
            // Connection-level failures mean Zinit itself is unreachable;
            // any other error (e.g. unknown service) implies it is running.
            let error_msg = e.to_string();
            if error_msg.contains("Connection refused") || error_msg.contains("No such file") {
                eprintln!("Error: Zinit process manager is not running.");
                eprintln!("Please start Zinit before running the supervisor TUI.");
                eprintln!("Expected Zinit socket at: /tmp/zinit.sock");
                std::process::exit(1);
            } else {
                info!("✓ Zinit is running (service not found is expected)");
            }
        }
    }

    // Step 3: Build the supervisor from the configured worker sections.
    info!("Step 3/4: Building supervisor...");
    let mut builder = SupervisorBuilder::new()
        .redis_url(&config.global.redis_url);
    for (worker_name, worker_config) in &config.workers {
        match worker_name.as_str() {
            "osis_worker" => builder = builder.osis_worker(&worker_config.binary_path),
            "sal_worker" => builder = builder.sal_worker(&worker_config.binary_path),
            "v_worker" => builder = builder.v_worker(&worker_config.binary_path),
            "python_worker" => builder = builder.python_worker(&worker_config.binary_path),
            _ => log::warn!("Unknown worker type: {}", worker_name),
        }
        // NOTE(review): env vars are registered even when the worker type was
        // unknown above, and `worker_env_var` looks builder-global rather than
        // per-worker — confirm against SupervisorBuilder's semantics.
        if let Some(env_vars) = &worker_config.env_vars {
            for (key, value) in env_vars {
                builder = builder.worker_env_var(key, value);
            }
        }
    }
    let supervisor = Arc::new(builder.build()
        .map_err(|e| anyhow::anyhow!("Failed to build supervisor: {}", e))?);
    info!("✓ Supervisor built successfully");

    // Step 4: Launch all configured workers before the UI starts.
    info!("Step 4/4: Starting supervisor and workers...");
    supervisor.start_workers().await
        .map_err(|e| anyhow::anyhow!("Failed to start workers: {}", e))?;
    info!("✓ All workers started successfully");

    // Initialization succeeded — switch the terminal into TUI mode.
    info!("Initialization complete - starting TUI...");
    let mut app = App::new(Arc::clone(&supervisor));

    // Terminal setup: raw mode + alternate screen + mouse capture.
    enable_raw_mode()?;
    let mut stdout = io::stdout();
    execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?;
    let backend = CrosstermBackend::new(stdout);
    let mut terminal = Terminal::new(backend)?;

    // Run the event loop; capture its result so cleanup always runs first.
    let result = run_app(&mut terminal, &mut app).await;

    // Restore the terminal regardless of how the loop ended.
    disable_raw_mode()?;
    execute!(
        terminal.backend_mut(),
        LeaveAlternateScreen,
        DisableMouseCapture
    )?;
    terminal.show_cursor()?;

    // Shut the supervisor down; log (don't fail on) cleanup errors.
    if let Err(e) = supervisor.cleanup_and_shutdown().await {
        error!("Error during cleanup: {}", e);
    }
    info!("Hero Supervisor TUI shutdown complete");
    result
}

View File

@@ -1,190 +0,0 @@
# Architecture of the `rhai_supervisor` Crate
The `rhai_supervisor` crate provides a Redis-based client library for submitting Rhai scripts to distributed worker services and awaiting their execution results. It implements a request-reply pattern using Redis as the message broker.
## Core Architecture
The client follows a builder pattern design with clear separation of concerns:
```mermaid
graph TD
A[RhaiSupervisorBuilder] --> B[RhaiSupervisor]
B --> C[PlayRequestBuilder]
C --> D[PlayRequest]
D --> E[Redis Task Queue]
E --> F[Worker Service]
F --> G[Redis Reply Queue]
G --> H[Client Response]
subgraph "Client Components"
A
B
C
D
end
subgraph "Redis Infrastructure"
E
G
end
subgraph "External Services"
F
end
```
## Key Components
### 1. RhaiSupervisorBuilder
A builder pattern implementation for constructing `RhaiSupervisor` instances with proper configuration validation.
**Responsibilities:**
- Configure Redis connection URL
- Set caller ID for task attribution
- Validate configuration before building client
**Key Methods:**
- `caller_id(id: &str)` - Sets the caller identifier
- `redis_url(url: &str)` - Configures Redis connection
- `build()` - Creates the final `RhaiSupervisor` instance
### 2. RhaiSupervisor
The main client interface that manages Redis connections and provides factory methods for creating play requests.
**Responsibilities:**
- Maintain Redis connection pool
- Provide factory methods for request builders
- Handle low-level Redis operations
- Manage task status queries
**Key Methods:**
- `new_play_request()` - Creates a new `PlayRequestBuilder`
- `get_task_status(task_id)` - Queries task status from Redis
- Internal methods for Redis operations
### 3. PlayRequestBuilder
A fluent builder for constructing and submitting script execution requests.
**Responsibilities:**
- Configure script execution parameters
- Handle script loading from files or strings
- Manage request timeouts
- Provide submission methods (fire-and-forget vs await-response)
**Key Methods:**
- `worker_id(id: &str)` - Target worker queue (determines which worker processes the task)
- `context_id(id: &str)` - Target context ID (determines execution context/circle)
- `script(content: &str)` - Set script content directly
- `script_path(path: &str)` - Load script from file
- `timeout(duration: Duration)` - Set execution timeout
- `submit()` - Fire-and-forget submission
- `await_response()` - Submit and wait for result
**Architecture Note:** The decoupling of `worker_id` and `context_id` allows a single worker to process tasks for multiple contexts (circles), providing greater deployment flexibility.
### 4. Data Structures
#### RhaiTaskDetails
Represents the complete state of a task throughout its lifecycle.
```rust
pub struct RhaiTaskDetails {
pub task_id: String,
pub script: String,
pub status: String, // "pending", "processing", "completed", "error"
pub output: Option<String>,
pub error: Option<String>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
pub caller_id: String,
}
```
#### RhaiSupervisorError
Comprehensive error handling for various failure scenarios:
- `RedisError` - Redis connection/operation failures
- `SerializationError` - JSON serialization/deserialization issues
- `Timeout` - Task execution timeouts
- `TaskNotFound` - Missing tasks after submission
## Communication Protocol
### Task Submission Flow
1. **Task Creation**: Client generates unique UUID for task identification
2. **Task Storage**: Task details stored in Redis hash: `rhailib:<task_id>`
3. **Queue Submission**: Task ID pushed to worker queue: `rhailib:<worker_id>`
4. **Reply Queue Setup**: Client listens on: `rhailib:reply:<task_id>`
### Redis Key Patterns
- **Task Storage**: `rhailib:<task_id>` (Redis Hash)
- **Worker Queues**: `rhailib:<worker_id>` (Redis List)
- **Reply Queues**: `rhailib:reply:<task_id>` (Redis List)
### Message Flow Diagram
```mermaid
sequenceDiagram
participant C as Client
participant R as Redis
participant W as Worker
C->>R: HSET rhailib:task_id (task details)
C->>R: LPUSH rhailib:worker_id task_id
C->>R: BLPOP rhailib:reply:task_id (blocking)
W->>R: BRPOP rhailib:worker_id (blocking)
W->>W: Execute Rhai Script
W->>R: LPUSH rhailib:reply:task_id (result)
R->>C: Return result from BLPOP
C->>R: DEL rhailib:reply:task_id (cleanup)
```
## Concurrency and Async Design
The client is built on `tokio` for asynchronous operations:
- **Connection Pooling**: Uses Redis multiplexed connections for efficiency
- **Non-blocking Operations**: All Redis operations are async
- **Timeout Handling**: Configurable timeouts with proper cleanup
- **Error Propagation**: Comprehensive error handling with context
## Configuration and Deployment
### Prerequisites
- Redis server accessible to both client and workers
- Proper network connectivity between components
- Sufficient Redis memory for task storage
### Configuration Options
- **Redis URL**: Connection string for Redis instance
- **Caller ID**: Unique identifier for client instance
- **Timeouts**: Per-request timeout configuration
- **Worker Targeting**: Direct worker queue addressing
## Security Considerations
- **Task Isolation**: Each task uses unique identifiers
- **Queue Separation**: Worker-specific queues prevent cross-contamination
- **Cleanup**: Automatic cleanup of reply queues after completion
- **Error Handling**: Secure error propagation without sensitive data leakage
## Performance Characteristics
- **Scalability**: Horizontal scaling through multiple worker instances
- **Throughput**: Limited by Redis performance and network latency
- **Memory Usage**: Efficient with connection pooling and cleanup
- **Latency**: Low latency for local Redis deployments
## Integration Points
The client integrates with:
- **Worker Services**: Via Redis queue protocol
- **Monitoring Systems**: Through structured logging
- **Application Code**: Via builder pattern API
- **Configuration Systems**: Through environment variables and builders

View File

@@ -0,0 +1,185 @@
# Hero Supervisor CLI Example
This example demonstrates how to use the `hive-supervisor` CLI tool for managing workers and jobs in the Hero ecosystem.
## Prerequisites
1. **Redis Server**: Make sure Redis is running on `localhost:6379`
```bash
# Install Redis (macOS)
brew install redis
# Start Redis
redis-server
```
2. **Zinit Process Manager**: Install and configure Zinit
```bash
# Install Zinit (example for Linux/macOS)
# Follow Zinit installation instructions for your platform
```
3. **Worker Binaries**: The configuration references worker binaries that need to be available:
- `/usr/local/bin/osis_worker`
- `/usr/local/bin/sal_worker`
- `/usr/local/bin/v_worker`
- `/usr/local/bin/python_worker`
For testing purposes, you can create mock worker binaries or update the paths in `config.toml` to point to existing binaries.
## Configuration
The `config.toml` file contains the supervisor configuration:
- **Global settings**: Redis URL and Zinit socket path
- **Worker configurations**: Binary paths and environment variables for each worker type
## Usage Examples
### 1. Build the CLI
```bash
# From the supervisor directory
cargo build --bin hive-supervisor --release
```
### 2. Worker Management
```bash
# Show help
./target/release/hive-supervisor --config examples/cli/config.toml --help
# List all configured workers
./target/release/hive-supervisor --config examples/cli/config.toml workers list
# Start all workers
./target/release/hive-supervisor --config examples/cli/config.toml workers start
# Start specific workers
./target/release/hive-supervisor --config examples/cli/config.toml workers start osis_worker sal_worker
# Check worker status
./target/release/hive-supervisor --config examples/cli/config.toml workers status
# Stop all workers
./target/release/hive-supervisor --config examples/cli/config.toml workers stop
# Restart specific worker
./target/release/hive-supervisor --config examples/cli/config.toml workers restart osis_worker
```
### 3. Job Management
```bash
# Create a job with inline script
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
--script 'print("Hello from OSIS worker!");' \
--script-type osis \
--caller-id "user123" \
--context-id "session456"
# Create a job from file
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
--file examples/cli/sample_script.rhai \
--script-type osis \
--caller-id "user123" \
--context-id "session456"
# List all jobs
./target/release/hive-supervisor --config examples/cli/config.toml jobs list
# Check job status
./target/release/hive-supervisor --config examples/cli/config.toml jobs status <JOB_ID>
# View job logs
./target/release/hive-supervisor --config examples/cli/config.toml jobs logs <JOB_ID>
# Stop a job
./target/release/hive-supervisor --config examples/cli/config.toml jobs stop <JOB_ID>
```
### 4. Interactive REPL Mode
```bash
# Enter REPL mode for OSIS scripts
./target/release/hive-supervisor --config examples/cli/config.toml repl \
--caller-id "user123" \
--context-id "session456" \
--script-type osis \
--timeout 60
# In REPL mode, you can:
# - Type scripts directly and press Enter to execute
# - Type 'help' for available commands
# - Type 'exit' or 'quit' to leave REPL mode
```
### 5. Verbose Logging
```bash
# Enable debug logging
./target/release/hive-supervisor --config examples/cli/config.toml -v workers status
# Enable trace logging
./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
# Disable timestamps
./target/release/hive-supervisor --config examples/cli/config.toml --no-timestamp workers status
```
## Sample Scripts
The `sample_scripts/` directory contains example scripts for different worker types:
- `hello_osis.rhai` - Simple OSIS/HeroScript example
- `system_sal.rhai` - SAL system operation example
- `math_v.v` - V language calculation example
- `data_python.py` - Python data processing example
## Troubleshooting
### Common Issues
1. **Redis Connection Error**
- Ensure Redis is running: `redis-cli ping`
- Check the Redis URL in `config.toml`
2. **Zinit Socket Error**
- Verify Zinit is running and the socket path is correct
- Check permissions on the socket file
3. **Worker Binary Not Found**
- Update binary paths in `config.toml` to match your system
- Ensure worker binaries are executable
4. **Permission Denied**
- Check file permissions on configuration and binary files
- Ensure the user has access to the Zinit socket
### Debug Mode
Run with verbose logging to see detailed operation information:
```bash
RUST_LOG=debug ./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
```
## Configuration Customization
You can customize the configuration for your environment:
1. **Update Redis URL**: Change `redis_url` in the `[global]` section
2. **Update Zinit Socket**: Change `zinit_socket_path` for your Zinit installation
3. **Worker Paths**: Update binary paths in worker sections to match your setup
4. **Environment Variables**: Add or modify environment variables for each worker type
## Integration with Hero Ecosystem
This CLI integrates with the broader Hero ecosystem:
- **Job Queue**: Uses Redis for job queuing and status tracking
- **Process Management**: Uses Zinit for worker lifecycle management
- **Script Execution**: Supports multiple script types (OSIS, SAL, V, Python)
- **Monitoring**: Provides real-time status and logging capabilities
For more information about the Hero ecosystem, see the main project documentation.

View File

@@ -0,0 +1,19 @@
# Hero Supervisor CLI Configuration Example
# This configuration demonstrates how to set up the hive-supervisor CLI
# with different worker types for script execution.
[global]
# Redis connection URL for job queuing
redis_url = "redis://localhost:6379"
# OSIS Worker Configuration
# Handles OSIS (HeroScript) execution
[osis_worker]
binary_path = "/Users/timurgordon/code/git.ourworld.tf/herocode/hero/target/debug/osis"
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "osis", "MAX_CONCURRENT_JOBS" = "5" }
# SAL Worker Configuration
# Handles System Abstraction Layer scripts
[sal_worker]
binary_path = "/Users/timurgordon/code/git.ourworld.tf/herocode/hero/target/debug/sal"
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "sal", "MAX_CONCURRENT_JOBS" = "3" }

View File

@@ -0,0 +1,144 @@
#!/bin/bash
# Hero Supervisor CLI Example Runner
# This script demonstrates various CLI operations
# Abort on the first failing command. Note: because run_cli returns 1 on
# failure, a failed demo step therefore stops the whole script.
set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration: resolve paths relative to this script's location so the
# runner works from any current directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SUPERVISOR_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
CONFIG_FILE="$SCRIPT_DIR/config.toml"
CLI_BINARY="$SUPERVISOR_DIR/target/release/hive-supervisor"

echo -e "${BLUE}=== Hero Supervisor CLI Example Runner ===${NC}"
echo "Script directory: $SCRIPT_DIR"
echo "Supervisor directory: $SUPERVISOR_DIR"
echo "Configuration file: $CONFIG_FILE"
echo

# Function to run CLI command with error handling.
# $1 is a human-readable description; the remaining args go to the CLI.
run_cli() {
    local description="$1"
    shift
    echo -e "${YELLOW}Running: $description${NC}"
    echo "Command: $CLI_BINARY --config $CONFIG_FILE $*"
    echo
    if "$CLI_BINARY" --config "$CONFIG_FILE" "$@"; then
        echo -e "${GREEN}✓ Success${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
        return 1
    fi
    echo
}

# Check if CLI binary exists; build it on demand if it is missing.
if [[ ! -f "$CLI_BINARY" ]]; then
    echo -e "${YELLOW}Building CLI binary...${NC}"
    cd "$SUPERVISOR_DIR"
    cargo build --bin hive-supervisor --release
    echo
fi

# Check if config file exists
if [[ ! -f "$CONFIG_FILE" ]]; then
    echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}"
    exit 1
fi

echo -e "${BLUE}=== CLI Help and Information ===${NC}"
run_cli "Show main help" --help

echo -e "${BLUE}=== Worker Management Examples ===${NC}"
run_cli "List configured workers" workers list
run_cli "Show worker management help" workers --help

# Note: These commands would require actual worker binaries and Zinit setup
echo -e "${YELLOW}Note: The following commands require actual worker binaries and Zinit setup${NC}"
echo -e "${YELLOW}They are shown for demonstration but may fail without proper setup${NC}"
echo

# Uncomment these if you have the proper setup
# run_cli "Check worker status" workers status
# run_cli "Start all workers" workers start
# run_cli "Check worker status after start" workers status

echo -e "${BLUE}=== Job Management Examples ===${NC}"
run_cli "Show job management help" jobs --help

# Create sample jobs (these will also require workers to be running).
# The commands below are only echoed for the reader, not executed.
echo -e "${YELLOW}Sample job creation commands (require running workers):${NC}"
echo
echo "# Create OSIS job with inline script:"
echo "$CLI_BINARY --config $CONFIG_FILE jobs create \\"
echo " --script 'print(\"Hello from CLI!\");' \\"
echo " --script-type osis \\"
echo " --caller-id \"cli_demo\" \\"
echo " --context-id \"example_session\""
echo
echo "# Create job from sample script file:"
echo "$CLI_BINARY --config $CONFIG_FILE jobs create \\"
echo " --file \"$SCRIPT_DIR/sample_scripts/hello_osis.rhai\" \\"
echo " --script-type osis \\"
echo " --caller-id \"cli_demo\" \\"
echo " --context-id \"example_session\""
echo
echo "# List all jobs:"
echo "$CLI_BINARY --config $CONFIG_FILE jobs list"
echo
echo "# Check job status (replace JOB_ID with actual job ID):"
echo "$CLI_BINARY --config $CONFIG_FILE jobs status JOB_ID"
echo

echo -e "${BLUE}=== REPL Mode Example ===${NC}"
echo -e "${YELLOW}REPL mode command (interactive):${NC}"
echo "$CLI_BINARY --config $CONFIG_FILE repl \\"
echo " --caller-id \"cli_demo\" \\"
echo " --context-id \"example_session\" \\"
echo " --script-type osis \\"
echo " --timeout 60"
echo

echo -e "${BLUE}=== Sample Scripts ===${NC}"
echo "Available sample scripts in $SCRIPT_DIR/sample_scripts/:"
# List any sample scripts shipped alongside this example; the -f guard also
# skips the unexpanded glob when the directory is empty or missing.
for script in "$SCRIPT_DIR/sample_scripts"/*; do
    if [[ -f "$script" ]]; then
        basename "$script"
    fi
done
echo

echo -e "${BLUE}=== Verbose Logging Examples ===${NC}"
echo "# Debug logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -v workers list"
echo
echo "# Trace logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -vv workers list"
echo
echo "# No timestamps:"
echo "$CLI_BINARY --config $CONFIG_FILE --no-timestamp workers list"
echo

echo -e "${GREEN}=== Example Runner Complete ===${NC}"
echo -e "${YELLOW}To run actual commands, ensure you have:${NC}"
echo "1. Redis server running on localhost:6379"
echo "2. Zinit process manager installed and configured"
echo "3. Worker binaries available at the paths specified in config.toml"
echo
echo -e "${YELLOW}For testing without full setup, you can:${NC}"
echo "1. Update config.toml with paths to existing binaries"
echo "2. Use the CLI help commands and configuration validation"
echo "3. Test the REPL mode (requires workers to be running)"

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""Demonstration script for the Python worker.

Walks through a handful of common data-processing tasks — filtering,
simple statistics, date handling, JSON serialization, and a simulated
file-processing step — printing progress to stdout as it goes.
"""
import json
import datetime
from typing import List, Dict


def main():
    """Run every section of the Python worker demo in order."""
    print("=== Python Worker Demo ===")
    print("Python data processing operations")

    # A small in-memory roster used by all of the sections below.
    print("\nData structures:")
    users = [
        {"id": 1, "name": "Alice", "age": 30, "role": "developer"},
        {"id": 2, "name": "Bob", "age": 25, "role": "designer"},
        {"id": 3, "name": "Charlie", "age": 35, "role": "manager"},
        {"id": 4, "name": "Diana", "age": 28, "role": "developer"},
    ]
    print(f"Total users: {len(users)}")

    # Filtering: keep only the developers.
    developers = [person for person in users if person["role"] == "developer"]
    print(f"Developers: {len(developers)}")
    for person in developers:
        print(f" - {person['name']} (age {person['age']})")

    # Basic statistics over the ages.
    print("\nStatistical operations:")
    ages = [person["age"] for person in users]
    avg_age = sum(ages) / len(ages)
    min_age, max_age = min(ages), max(ages)
    print(f"Average age: {avg_age:.1f}")
    print(f"Age range: {min_age} - {max_age}")

    # Date/time handling.
    print("\nDate/time operations:")
    now = datetime.datetime.now()
    print(f"Current time: {now.strftime('%Y-%m-%d %H:%M:%S')}")

    # Derive each user's birth year from the current year.
    current_year = now.year
    for person in users:
        print(f"{person['name']} was born in {current_year - person['age']}")

    # JSON serialization with a 200-character preview cap.
    print("\nJSON processing:")
    json_data = json.dumps(users, indent=2)
    print("User data as JSON:")
    preview = json_data if len(json_data) <= 200 else json_data[:200] + "..."
    print(preview)

    # Simulated file handling.
    print("\nFile operations:")
    simulate_file_processing()

    print("=== Python Demo Complete ===")


def simulate_file_processing():
    """Pretend to process a fixed batch of files and report type groupings."""
    files = [
        {"name": "data.csv", "size": 1024, "type": "csv"},
        {"name": "config.json", "size": 512, "type": "json"},
        {"name": "report.pdf", "size": 2048, "type": "pdf"},
        {"name": "script.py", "size": 768, "type": "python"},
    ]
    total_size = sum(entry["size"] for entry in files)
    print(f"Processing {len(files)} files, total size: {total_size} bytes")

    # Bucket file names by their type.
    file_types = {}
    for entry in files:
        file_types.setdefault(entry["type"], []).append(entry["name"])

    print("Files by type:")
    for file_type, file_names in file_types.items():
        print(f" {file_type}: {', '.join(file_names)}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,34 @@
// Sample OSIS/HeroScript for demonstration
// This script demonstrates basic OSIS worker functionality

print("=== OSIS Worker Demo ===");
print("Hello from the OSIS worker!");

// Basic variable operations
let product = "Hero";
let release = "1.0";
print(`Running ${product} version ${release}`);

// Simple calculation
let left = 10;
let right = 20;
let total = left + right;
print(`Calculation: ${left} + ${right} = ${total}`);

// Array operations: accumulate the sum of the elements.
let numbers = [1, 2, 3, 4, 5];
let running_sum = 0;
for value in numbers {
    running_sum += value;
}
print(`Sum of array [1,2,3,4,5]: ${running_sum}`);

// Function definition and call: build a welcome message for `person`.
fn greet(person) {
    `Hello, ${person}! Welcome to Hero.`
}

print(greet("Developer"));

print("=== OSIS Demo Complete ===");

View File

@@ -0,0 +1,67 @@
// Sample V language script for demonstration
// This script demonstrates V worker functionality

module main

import math

// Entry point: walks through basic arithmetic, math-library calls,
// array aggregation, square roots, and a Fibonacci computation,
// printing each result to stdout.
fn main() {
    println("=== V Worker Demo ===")
    println("V language mathematical operations")

    // Basic arithmetic
    x := 15
    y := 25
    sum := x + y
    product := x * y
    println("Basic arithmetic:")
    println("${x} + ${y} = ${sum}")
    println("${x} * ${y} = ${product}")

    // Mathematical functions: sine/cosine of a 45° angle (degrees converted to radians)
    println("\nMathematical functions:")
    angle := 45.0
    sin_val := math.sin(math.radians(angle))
    cos_val := math.cos(math.radians(angle))
    println("sin(${angle}°) = ${sin_val:.4f}")
    println("cos(${angle}°) = ${cos_val:.4f}")

    // Array operations: sum the perfect squares below
    numbers := [1, 4, 9, 16, 25]
    println("\nArray operations:")
    println("Numbers: ${numbers}")
    mut total := 0
    for num in numbers {
        total += num
    }
    println("Sum: ${total}")

    // Square roots (f64 conversion required by math.sqrt)
    println("\nSquare roots:")
    for num in numbers {
        sqrt_val := math.sqrt(f64(num))
        println("√${num} = ${sqrt_val:.2f}")
    }

    // Fibonacci sequence (delegates to fibonacci() below)
    println("\nFibonacci sequence (first 10 numbers):")
    fib := fibonacci(10)
    println("${fib}")

    println("=== V Demo Complete ===")
}
// Build the first `n` Fibonacci numbers iteratively (0, 1, 1, 2, 3, ...).
// Returns an empty array when n == 0.
fn fibonacci(n int) []int {
    mut seq := []int{len: n}
    for i in 0 .. n {
        if i < 2 {
            // Seed values: fib(0) = 0, fib(1) = 1.
            seq[i] = i
        } else {
            seq[i] = seq[i - 1] + seq[i - 2]
        }
    }
    return seq
}

View File

@@ -0,0 +1,43 @@
// Sample SAL (System Abstraction Layer) script for demonstration
// This script demonstrates system-level operations through SAL worker

print("=== SAL Worker Demo ===");
print("System Abstraction Layer operations");

// System information gathering — all values below are simulated; a
// real SAL worker would obtain them through actual system calls.
print("Gathering system information...");
let host = "hero-system";
let up_for = "2 days, 4 hours";
let load = "0.45, 0.52, 0.48";
print(`Hostname: ${host}`);
print(`Uptime: ${up_for}`);
print(`Load Average: ${load}`);

// File system operations
print("\nFile system operations:");
let disk = "45% used";
let free_space = "120GB available";
print(`Disk Usage: ${disk}`);
print(`Available Space: ${free_space}`);

// Process management simulation
print("\nProcess management:");
let proc_count = 156;
let mem = "68%";
print(`Active Processes: ${proc_count}`);
print(`Memory Usage: ${mem}`);

// Network status
print("\nNetwork status:");
let interfaces = ["eth0", "lo"];
let net_state = "Connected";
print(`Network Interfaces: ${interfaces}`);
print(`Connectivity: ${net_state}`);

print("=== SAL Demo Complete ===");

View File

@@ -17,7 +17,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Configuration
let redis_url = "redis://localhost:6379";
let zinit_socket = "/var/run/zinit.sock";
// Create supervisor
let supervisor = SupervisorBuilder::new()

View File

@@ -12,7 +12,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Workers are automatically launched during build
let supervisor = SupervisorBuilder::new()
.redis_url("redis://localhost:6379")
.zinit_socket_path("/var/run/zinit.sock")
.osis_worker("/usr/local/bin/osis_worker")
.sal_worker("/usr/local/bin/sal_worker")
.v_worker("/usr/local/bin/v_worker")

View File

@@ -0,0 +1,18 @@
# Example supervisor configuration.
# One optional [<kind>_worker] section per worker type; each needs a
# binary path and may define extra environment variables for the worker.

[global]
# Redis connection URL used by the supervisor and all workers.
redis_url = "redis://localhost:6379"

[osis_worker]
binary_path = "/path/to/osis_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

[sal_worker]
binary_path = "/path/to/sal_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

[v_worker]
binary_path = "/path/to/v_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

[python_worker]
binary_path = "/path/to/python_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

View File

@@ -31,6 +31,8 @@ pub enum SupervisorError {
/// Zinit client operation error
ZinitError(String),
SupervisorNotConfigured,
/// Configuration file parsing error
ConfigError(String),
}
impl From<redis::RedisError> for SupervisorError {
@@ -95,6 +97,9 @@ impl std::fmt::Display for SupervisorError {
SupervisorError::SupervisorNotConfigured => {
write!(f, "Supervisor not configured for health monitoring")
}
SupervisorError::ConfigError(msg) => {
write!(f, "Configuration error: {}", msg)
}
}
}
}

View File

@@ -1,9 +1,14 @@
use log::{debug, error, info, warn};
use redis::AsyncCommands;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use hero_job::NAMESPACE_PREFIX;
use zinit_client::ZinitClient;
use zinit_client::Client as ZinitClient;
mod job;
mod error;
@@ -23,46 +28,209 @@ pub struct Supervisor {
pub struct SupervisorBuilder {
redis_url: Option<String>,
zinit_socket_path: Option<String>,
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// Helper struct to pass builder data to worker launch method
#[derive(Clone)]
struct SupervisorBuilderData {
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// TOML configuration structure for the supervisor.
///
/// Mirrors the layout of the supervisor config file: a required
/// `[global]` section plus optional per-worker and websocket sections.
#[derive(Debug, Deserialize, Serialize)]
pub struct SupervisorConfig {
    /// Required `[global]` section (Redis connection settings).
    pub global: GlobalConfig,
    /// Optional `[websocket_server]` section.
    pub websocket_server: Option<WebSocketServerConfig>,
    /// Optional `[osis_worker]` section.
    pub osis_worker: Option<WorkerConfigToml>,
    /// Optional `[sal_worker]` section.
    pub sal_worker: Option<WorkerConfigToml>,
    /// Optional `[v_worker]` section.
    pub v_worker: Option<WorkerConfigToml>,
    /// Optional `[python_worker]` section.
    pub python_worker: Option<WorkerConfigToml>,
}

/// Global configuration section (`[global]` in the TOML file).
#[derive(Debug, Deserialize, Serialize)]
pub struct GlobalConfig {
    /// Redis connection URL, e.g. `redis://localhost:6379`.
    pub redis_url: String,
}

/// Worker configuration section in TOML (one per worker kind).
#[derive(Debug, Deserialize, Serialize)]
pub struct WorkerConfigToml {
    /// Filesystem path to the worker binary.
    pub binary_path: String,
    /// Extra environment variables for the worker (defaults to empty).
    #[serde(default)]
    pub env_vars: HashMap<String, String>,
}
/// WebSocket server configuration section in TOML
/// This mirrors the ServerConfig from hero_websocket_server but avoids circular dependency
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct WebSocketServerConfig {
    /// Server host address (defaults to "127.0.0.1")
    #[serde(default = "default_host")]
    pub host: String,
    /// Server port (defaults to 8443)
    #[serde(default = "default_port")]
    pub port: u16,
    /// Redis connection URL (defaults to "redis://127.0.0.1/")
    #[serde(default = "default_redis_url")]
    pub redis_url: String,
    /// Enable authentication (defaults to false)
    #[serde(default)]
    pub auth: bool,
    /// Enable TLS/WSS (defaults to false)
    #[serde(default)]
    pub tls: bool,
    /// Path to TLS certificate file
    pub cert: Option<String>,
    /// Path to TLS private key file
    pub key: Option<String>,
    /// Separate port for TLS connections
    pub tls_port: Option<u16>,
    /// Circles configuration - maps circle names to lists of member public keys
    #[serde(default)]
    pub circles: HashMap<String, Vec<String>>,
}
// Default value functions for WebSocket server config.

/// Default bind address for the WebSocket server.
fn default_host() -> String {
    String::from("127.0.0.1")
}

/// Default listening port for the WebSocket server.
fn default_port() -> u16 {
    8443
}

/// Default Redis connection URL for the WebSocket server.
fn default_redis_url() -> String {
    String::from("redis://127.0.0.1/")
}
impl SupervisorBuilder {
/// Create a builder with no workers configured.
///
/// Defaults: no Redis URL (build() falls back to `redis://127.0.0.1/`),
/// Zinit socket at `/var/run/zinit.sock`, empty worker env vars, and
/// no WebSocket configuration.
pub fn new() -> Self {
    Self {
        redis_url: None,
        // Default Zinit socket path; overridable via `zinit_socket_path()`.
        zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
        osis_worker: None,
        sal_worker: None,
        v_worker: None,
        python_worker: None,
        worker_env_vars: HashMap::new(),
        websocket_config: None,
    }
}
/// Create a SupervisorBuilder from a TOML configuration file.
///
/// Reads and parses the file, then applies each configured section:
/// the global Redis URL, one optional binary path + env-var map per
/// worker kind, and an optional `[websocket_server]` section (stored
/// for later retrieval).
///
/// # Errors
/// Returns `SupervisorError::ConfigError` when the file cannot be read
/// or does not parse as valid TOML.
pub fn from_toml<P: AsRef<Path>>(toml_path: P) -> Result<Self, SupervisorError> {
    let toml_content = fs::read_to_string(toml_path)
        .map_err(|e| SupervisorError::ConfigError(format!("Failed to read TOML file: {}", e)))?;

    let config: SupervisorConfig = toml::from_str(&toml_content)
        .map_err(|e| SupervisorError::ConfigError(format!("Failed to parse TOML: {}", e)))?;

    let mut builder = Self::new()
        .redis_url(&config.global.redis_url);

    // Configure workers based on TOML config.
    // NOTE(review): each section feeds its env map through
    // `worker_env_vars`, which appears to be a single shared map —
    // confirm per-worker env vars are meant to be pooled.
    if let Some(osis_config) = config.osis_worker {
        builder = builder.osis_worker(&osis_config.binary_path)
            .worker_env_vars(osis_config.env_vars);
    }

    if let Some(sal_config) = config.sal_worker {
        builder = builder.sal_worker(&sal_config.binary_path)
            .worker_env_vars(sal_config.env_vars);
    }

    if let Some(v_config) = config.v_worker {
        builder = builder.v_worker(&v_config.binary_path)
            .worker_env_vars(v_config.env_vars);
    }

    if let Some(python_config) = config.python_worker {
        builder = builder.python_worker(&python_config.binary_path)
            .worker_env_vars(python_config.env_vars);
    }

    // Store WebSocket configuration for later use
    if let Some(ws_config) = config.websocket_server {
        builder.websocket_config = Some(ws_config);
    }

    Ok(builder)
}
/// Validate that all configured worker binaries exist and are executable.
///
/// Checks each configured path (OSIS, SAL, V, Python): it must exist,
/// be a regular file, and — on Unix — carry at least one execute bit.
///
/// # Errors
/// Returns `ConfigError` describing the first binary that fails a check.
fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
    let workers = [
        ("OSIS", &self.osis_worker),
        ("SAL", &self.sal_worker),
        ("V", &self.v_worker),
        ("Python", &self.python_worker),
    ];

    for (worker_type, binary_path) in workers {
        if let Some(path) = binary_path {
            let path_obj = Path::new(path);

            if !path_obj.exists() {
                return Err(SupervisorError::ConfigError(
                    format!("{} worker binary does not exist: {}", worker_type, path)
                ));
            }

            if !path_obj.is_file() {
                return Err(SupervisorError::ConfigError(
                    format!("{} worker path is not a file: {}", worker_type, path)
                ));
            }

            // Check if the file is executable (Unix-like systems)
            #[cfg(unix)]
            {
                use std::os::unix::fs::PermissionsExt;
                let metadata = path_obj.metadata().map_err(|e| {
                    SupervisorError::ConfigError(
                        format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
                    )
                })?;

                let permissions = metadata.permissions();
                // 0o111 masks the owner/group/other execute bits.
                if permissions.mode() & 0o111 == 0 {
                    return Err(SupervisorError::ConfigError(
                        format!("{} worker binary is not executable: {}", worker_type, path)
                    ));
                }
            }

            info!("Validated {} worker binary: {}", worker_type, path);
        }
    }

    Ok(())
}
/// Set the Redis connection URL used by the supervisor.
pub fn redis_url(mut self, url: &str) -> Self {
    self.redis_url = Some(url.to_string());
    self
}

/// Set the path to the Zinit control socket.
///
/// NOTE(review): `build()` currently creates the Zinit client with a
/// hardcoded `/tmp/zinit.sock` and ignores this value — confirm which
/// path should win.
pub fn zinit_socket_path(mut self, path: &str) -> Self {
    self.zinit_socket_path = Some(path.to_string());
    self
}
pub fn osis_worker(mut self, binary_path: &str) -> Self {
self.osis_worker = Some(binary_path.to_string());
self
@@ -95,21 +263,23 @@ impl SupervisorBuilder {
/// Builds the final `Supervisor` instance synchronously.
///
/// This method validates the configuration and creates the Redis client.
/// Worker launching is deferred to the `start_workers()` method.
/// This method validates the configuration, checks worker binary existence,
/// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
///
/// # Returns
///
/// * `Ok(Supervisor)` - Successfully configured client
/// * `Err(SupervisorError)` - Configuration or connection error
pub fn build(self) -> Result<Supervisor, SupervisorError> {
/// * `Ok(Supervisor)` - Successfully configured client with valid binaries
/// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
pub async fn build(self) -> Result<Supervisor, SupervisorError> {
// Validate that all configured worker binaries exist first
Self::validate_worker_binaries(&self)?;
let url = self.redis_url
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
let client = redis::Client::open(url)?;
let zinit_socket = self.zinit_socket_path
.unwrap_or_else(|| "/var/run/zinit.sock".to_string());
let zinit_client = ZinitClient::new(&zinit_socket);
let zinit_client = ZinitClient::unix_socket("/tmp/zinit.sock").await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client: {}", e)))?;
// Store builder data for later use in start_workers()
let builder_data = SupervisorBuilderData {
@@ -118,6 +288,7 @@ impl SupervisorBuilder {
v_worker: self.v_worker,
python_worker: self.python_worker,
worker_env_vars: self.worker_env_vars,
websocket_config: self.websocket_config,
};
let supervisor = Supervisor {
@@ -134,14 +305,33 @@ impl Supervisor {
/// Start all configured workers asynchronously.
/// This method should be called after build() to launch the workers.
///
/// Sequence: verify the Zinit connection (via a service listing), tear
/// down any leftover worker services, then launch every worker captured
/// in the builder data.
///
/// # Errors
/// Returns `ZinitError` when Zinit is unreachable, or propagates any
/// failure from the cleanup or launch steps.
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
    info!("Starting Hero Supervisor workers...");

    // Test Zinit connection first.
    // NOTE(review): the socket path in this log message is hardcoded;
    // confirm it matches the socket the client was actually built with.
    info!("Testing Zinit connection at /tmp/zinit.sock...");
    match self.zinit_client.list().await {
        Ok(services) => {
            info!("Successfully connected to Zinit. Current services: {:?}", services);
        }
        Err(e) => {
            error!("Failed to connect to Zinit: {:?}", e);
            return Err(SupervisorError::ZinitError(format!("Zinit connection failed: {}", e)));
        }
    }

    // Clean up any existing worker services first
    info!("Cleaning up existing worker services...");
    self.cleanup_existing_workers().await?;

    // Launch configured workers if builder data is available
    if let Some(builder_data) = &self.builder_data {
        info!("Launching configured workers...");
        self.launch_configured_workers(builder_data).await?;
    } else {
        warn!("No builder data available, no workers to start");
    }

    info!("All workers started successfully!");
    Ok(())
}
@@ -179,7 +369,11 @@ impl Supervisor {
for worker_name in worker_names {
// Try to stop and delete, but don't fail if they don't exist
let _ = self.stop_and_delete_worker(worker_name).await;
info!("Attempting to cleanup worker: {}", worker_name);
match self.stop_and_delete_worker(worker_name).await {
Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
}
}
info!("Existing worker cleanup completed");
@@ -188,18 +382,33 @@ impl Supervisor {
/// Stop and delete a worker service from zinit
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
info!("Starting cleanup for worker: {}", worker_name);
// First try to stop the worker
info!("Attempting to stop worker: {}", worker_name);
if let Err(e) = self.zinit_client.stop(worker_name).await {
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
} else {
info!("Successfully stopped worker: {}", worker_name);
}
// Then try to delete the service
if let Err(e) = self.zinit_client.delete(worker_name).await {
// Then forget the service to stop monitoring it
info!("Attempting to forget worker: {}", worker_name);
if let Err(e) = self.zinit_client.forget(worker_name).await {
info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
} else {
info!("Successfully forgot worker service: {}", worker_name);
}
// Finally, delete the service configuration
info!("Attempting to delete service for worker: {}", worker_name);
if let Err(e) = self.zinit_client.delete_service(worker_name).await {
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
} else {
info!("Successfully deleted worker service: {}", worker_name);
}
info!("Completed cleanup for worker: {}", worker_name);
Ok(())
}
@@ -211,6 +420,157 @@ impl Supervisor {
/// Create a new job builder bound to this supervisor.
pub fn new_job(&self) -> JobBuilder {
    JobBuilder::new(self)
}
/// Get WebSocket server configuration from TOML config.
///
/// # Errors
/// Returns `ConfigError` when the supervisor was built without builder
/// data, or the TOML had no `[websocket_server]` section.
pub fn get_websocket_config(&self) -> Result<WebSocketServerConfig, SupervisorError> {
    let builder_data = self.builder_data.as_ref().ok_or_else(|| {
        SupervisorError::ConfigError("No builder data available for WebSocket config".to_string())
    })?;

    builder_data.websocket_config.clone().ok_or_else(|| {
        SupervisorError::ConfigError("No WebSocket server configuration found in TOML config".to_string())
    })
}

/// Extract worker configurations from the supervisor's builder data.
///
/// Produces one `WorkerConfig` (named `<kind>_worker_1`) per configured
/// worker binary; each config receives a copy of the shared env-var map.
///
/// # Errors
/// Returns `ConfigError` when no builder data is available.
pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
    let builder_data = self.builder_data.as_ref().ok_or_else(|| {
        SupervisorError::ConfigError("No builder data available for worker configs".to_string())
    })?;

    let mut configs = Vec::new();
    let env_vars = builder_data.worker_env_vars.clone();

    if let Some(osis_path) = &builder_data.osis_worker {
        configs.push(
            WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
                .with_env(env_vars.clone())
        );
    }

    if let Some(sal_path) = &builder_data.sal_worker {
        configs.push(
            WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
                .with_env(env_vars.clone())
        );
    }

    if let Some(v_path) = &builder_data.v_worker {
        configs.push(
            WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
                .with_env(env_vars.clone())
        );
    }

    if let Some(python_path) = &builder_data.python_worker {
        configs.push(
            WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
                .with_env(env_vars.clone())
        );
    }

    Ok(configs)
}
/// Spawn a background lifecycle manager that continuously monitors and maintains worker health
/// Returns a JoinHandle that can be used to stop the lifecycle manager
///
/// Starts all workers once, then loops forever: every
/// `health_check_interval` it checks each worker and restarts unhealthy
/// ones. The spawned task only returns if the initial startup fails.
pub fn spawn_lifecycle_manager(
    self: Arc<Self>,
    worker_configs: Vec<WorkerConfig>,
    health_check_interval: Duration,
) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
    let supervisor = self;

    tokio::spawn(async move {
        info!("Starting background lifecycle manager with {} workers", worker_configs.len());
        info!("Health check interval: {:?}", health_check_interval);

        // Initial worker startup
        info!("Performing initial worker startup...");
        if let Err(e) = supervisor.start_workers().await {
            error!("Failed to start workers during initialization: {}", e);
            return Err(e);
        }

        // Start the monitoring loop. Skip missed ticks so a long stall
        // does not trigger a burst of back-to-back health checks.
        let mut interval = tokio::time::interval(health_check_interval);
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

        loop {
            interval.tick().await;

            info!("Running periodic worker health check...");

            // Check each worker's health and restart if needed.
            // Per-worker failures are logged but do not stop the loop.
            for worker_config in &worker_configs {
                if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
                    error!("Failed to check/restart worker {}: {}", worker_config.name, e);
                }
            }

            info!("Health check cycle completed");
        }
    })
}

/// Check a single worker's health and restart if needed.
///
/// A worker is healthy when Zinit reports state "running" with a
/// positive PID. Unhealthy workers are restarted, falling back to a
/// full stop/start cycle; a worker with no status at all is started
/// fresh. Ping-job failures are logged but never trigger a restart.
async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
    let worker_name = &worker_config.name;

    // Get worker status
    match self.zinit_client.status(worker_name).await {
        Ok(status) => {
            let is_healthy = status.state == "running" && status.pid > 0;

            if is_healthy {
                debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);

                // Optionally send a ping job for deeper health check
                if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
                    warn!("Ping job failed for worker {}: {}", worker_name, e);
                    // Note: We don't restart on ping failure as it might be temporary
                }
            } else {
                warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
                      worker_name, status.state, status.pid);

                // Attempt to restart the worker
                if let Err(e) = self.restart_worker(worker_name).await {
                    error!("Failed to restart unhealthy worker {}: {}", worker_name, e);

                    // If restart fails, try a full stop/start cycle
                    warn!("Attempting full stop/start cycle for worker: {}", worker_name);
                    if let Err(e) = self.stop_and_delete_worker(worker_name).await {
                        error!("Failed to stop worker {} during recovery: {}", worker_name, e);
                    }

                    if let Err(e) = self.start_worker(worker_config).await {
                        error!("Failed to start worker {} during recovery: {}", worker_name, e);
                        return Err(e);
                    }

                    info!("Successfully recovered worker: {}", worker_name);
                } else {
                    info!("Successfully restarted worker: {}", worker_name);
                }
            }
        }
        Err(e) => {
            warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);

            // Worker doesn't exist, try to start it
            info!("Attempting to start missing worker: {}", worker_name);
            if let Err(e) = self.start_worker(worker_config).await {
                error!("Failed to start missing worker {}: {}", worker_name, e);
                return Err(e);
            }
            info!("Successfully started missing worker: {}", worker_name);
        }
    }

    Ok(())
}
// Internal helper to submit script details and push to work queue
async fn create_job_using_connection(

View File

@@ -8,7 +8,7 @@ use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
use zinit_client::{Client as ZinitClient, Status};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};
@@ -16,7 +16,7 @@ use crate::{Supervisor, SupervisorError};
#[derive(Debug, Clone)]
pub struct WorkerInfo {
pub config: WorkerConfig,
pub status: Option<ServiceStatus>,
pub status: Option<Status>,
pub is_running: bool,
}
@@ -90,7 +90,7 @@ impl Supervisor {
for config in worker_configs {
let status = self.zinit_client.status(&config.name).await.ok();
let is_running = status.as_ref()
.map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
.map(|s| s.state == "running" && s.pid > 0)
.unwrap_or(false);
workers.push(WorkerInfo {
@@ -117,6 +117,10 @@ impl Supervisor {
self.zinit_client.create_service(&worker_config.name, service_config).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
// Monitor the service so Zinit starts managing it
self.zinit_client.monitor(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;
// Start the service
self.zinit_client.start(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
@@ -168,7 +172,7 @@ impl Supervisor {
&self,
worker_name: &str,
zinit_client: &ZinitClient,
) -> Result<ServiceStatus, SupervisorError> {
) -> Result<Status, SupervisorError> {
match zinit_client.status(worker_name).await {
Ok(status) => Ok(status),
Err(e) => {
@@ -183,7 +187,7 @@ impl Supervisor {
&self,
worker_configs: &[WorkerConfig],
zinit_client: &ZinitClient,
) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
) -> Result<HashMap<String, Status>, SupervisorError> {
let mut status_map = HashMap::new();
for worker in worker_configs {
@@ -200,19 +204,7 @@ impl Supervisor {
Ok(status_map)
}
/// Start multiple workers
pub async fn start_workers(
&self,
worker_configs: &[WorkerConfig],
) -> Result<(), SupervisorError> {
info!("Starting {} workers", worker_configs.len());
for worker in worker_configs {
self.start_worker(worker).await?;
}
Ok(())
}
/// Stop multiple workers
pub async fn stop_workers(
@@ -240,7 +232,7 @@ impl Supervisor {
for worker in worker_configs {
if worker.script_type == *script_type {
if let Ok(status) = zinit_client.status(&worker.name).await {
if status.state == ServiceState::Running {
if status.state == "running" {
running_count += 1;
}
}
@@ -277,26 +269,35 @@ impl Supervisor {
}
/// Create Zinit service configuration from worker config
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
let mut config = json!({
"exec": format!("{} {}",
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
use serde_json::{Map, Value};
let mut config = Map::new();
config.insert(
"exec".to_string(),
Value::String(format!("{} {}",
worker.binary_path.display(),
worker.args.join(" ")
),
"oneshot": !worker.restart_on_exit,
});
))
);
config.insert(
"oneshot".to_string(),
Value::Bool(!worker.restart_on_exit)
);
if let Some(health_check) = &worker.health_check {
config["test"] = json!(health_check);
config.insert("test".to_string(), Value::String(health_check.clone()));
}
if !worker.dependencies.is_empty() {
config["after"] = json!(worker.dependencies);
config.insert("after".to_string(), json!(worker.dependencies));
}
// Add environment variables if any
if !worker.env.is_empty() {
config["env"] = json!(worker.env);
config.insert("env".to_string(), json!(worker.env));
}
config
@@ -307,6 +308,8 @@ impl Supervisor {
use hero_job::ScriptType;
use std::path::PathBuf;
let mut errors = Vec::new();
// Launch OSIS worker if configured
if let Some(binary_path) = &builder.osis_worker {
let worker_id = "osis_worker_1";
@@ -318,7 +321,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching OSIS worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start OSIS worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch SAL worker if configured
@@ -332,7 +339,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching SAL worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start SAL worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch V worker if configured
@@ -346,7 +357,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching V worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start V worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch Python worker if configured
@@ -360,9 +375,21 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching Python worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start Python worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
Ok(())
// Return result based on whether any workers started successfully
if errors.is_empty() {
info!("All configured workers started successfully");
Ok(())
} else {
let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
warn!("{}", combined_error);
Err(SupervisorError::ZinitError(combined_error))
}
}
}