refactor wip

This commit is contained in:
Timur Gordon
2025-08-05 12:19:38 +02:00
parent 8ed40ce99c
commit 7a652c9c3c
51 changed files with 6183 additions and 840 deletions

View File

@@ -7,19 +7,35 @@ edition = "2021"
name = "supervisor"
path = "cmd/supervisor.rs"
[[bin]]
name = "hive-supervisor"
path = "cmd/hive_supervisor.rs"
[[bin]]
name = "hive-supervisor-tui"
path = "cmd/hive_supervisor_tui.rs"
[[bin]]
name = "hive-supervisor-tui-safe"
path = "cmd/hive_supervisor_tui_safe.rs"
[dependencies]
clap = { version = "4.4", features = ["derive"] }
env_logger = "0.10"
redis = { version = "0.25.0", features = ["tokio-comp"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
toml = "0.8"
uuid = { version = "1.6", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
log = "0.4"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } # For async main in examples, and general async
colored = "2.0"
hero_job = { path = "../job" }
# zinit-client = "0.4.0"  # disabled: a table may not define the same key twice; the path override below takes effect
zinit-client = { path = "/Users/timurgordon/code/github/threefoldtech/zinit/zinit-client" }  # TODO: machine-specific absolute path; replace with a relative path or published version before sharing
ratatui = "0.28"
crossterm = "0.28"
anyhow = "1.0"
[dev-dependencies] # For examples later
env_logger = "0.10"

View File

@@ -8,8 +8,6 @@ The lifecycle management system provides:
- **Worker Process Management**: Start, stop, restart, and monitor worker binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle workers
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management
- **Graceful Shutdown**: Clean termination of worker processes
## Architecture
@@ -313,3 +311,9 @@ redis-cli keys "hero:job:*"
- **User Permissions**: Run workers with appropriate user permissions
- **Network Security**: Secure Redis and Zinit socket access
- **Binary Validation**: Verify worker binary integrity before deployment
## Future
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management

View File

@@ -1,157 +1,66 @@
# Rhai Client Binary
# Supervisor CLI
A command-line client for executing Rhai scripts on remote workers via Redis.
A command-line interface for the Hero Supervisor.
## Binary: `client`
## Binary: `hive-supervisor`
### Installation
Build the binary:
```bash
cargo build --bin client --release
cargo build --bin hive-supervisor --release
```
### Usage
```bash
# Basic usage - requires caller and circle keys
client --caller-key <CALLER_KEY> --circle-key <CIRCLE_KEY>
# Execute inline script
client -c <CALLER_KEY> -k <CIRCLE_KEY> --script "print('Hello World!')"
# Execute script from file
client -c <CALLER_KEY> -k <CIRCLE_KEY> --file script.rhai
# Use specific worker (defaults to circle key)
client -c <CALLER_KEY> -k <CIRCLE_KEY> -w <WORKER_KEY> --script "2 + 2"
# Custom Redis and timeout
client -c <CALLER_KEY> -k <CIRCLE_KEY> --redis-url redis://localhost:6379/1 --timeout 60
# Remove timestamps from logs
client -c <CALLER_KEY> -k <CIRCLE_KEY> --no-timestamp
# Increase verbosity
client -c <CALLER_KEY> -k <CIRCLE_KEY> -v --script "debug_info()"
```
### Command-Line Options
| Option | Short | Default | Description |
|--------|-------|---------|-------------|
| `--caller-key` | `-c` | **Required** | Caller public key (your identity) |
| `--circle-key` | `-k` | **Required** | Circle public key (execution context) |
| `--worker-key` | `-w` | `circle-key` | Worker public key (target worker) |
| `--redis-url` | `-r` | `redis://localhost:6379` | Redis connection URL |
| `--script` | `-s` | | Rhai script to execute |
| `--file` | `-f` | | Path to Rhai script file |
| `--timeout` | `-t` | `30` | Timeout for script execution (seconds) |
| `--no-timestamp` | | `false` | Remove timestamps from log output |
| `--verbose` | `-v` | | Increase verbosity (stackable) |
### Execution Modes
#### Inline Script Execution
```bash
# Execute a simple calculation
client -c caller_123 -k circle_456 -s "let result = 2 + 2; print(result);"
# Execute with specific worker
client -c caller_123 -k circle_456 -w worker_789 -s "get_user_data()"
```
#### Script File Execution
```bash
# Execute script from file
client -c caller_123 -k circle_456 -f examples/data_processing.rhai
# Execute with custom timeout
client -c caller_123 -k circle_456 -f long_running_script.rhai -t 120
```
#### Interactive Mode
```bash
# Enter interactive REPL mode (when no script or file provided)
client -c caller_123 -k circle_456
# Interactive mode with verbose logging
client -c caller_123 -k circle_456 -v --no-timestamp
```
### Interactive Mode
When no script (`-s`) or file (`-f`) is provided, the client enters interactive mode:
# Basic usage
hive-supervisor --config <CONFIG_PATH>
```
🔗 Starting Rhai Client
📋 Configuration:
Caller Key: caller_123
Circle Key: circle_456
Worker Key: circle_456
Redis URL: redis://localhost:6379
Timeout: 30s
✅ Connected to Redis at redis://localhost:6379
🎮 Entering interactive mode
Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.
rhai> let x = 42; print(x);
Status: completed
Output: 42
rhai> exit
👋 Goodbye!
Where the config is a TOML file with the following structure:
```toml
[global]
redis_url = "redis://localhost:6379"
[osis_worker]
binary_path = "/path/to/osis_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[sal_worker]
binary_path = "/path/to/sal_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[v_worker]
binary_path = "/path/to/v_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
[python_worker]
binary_path = "/path/to/python_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }
```
### Configuration Examples
#### Development Usage
```bash
# Simple development client
client -c dev_user -k dev_circle
Lets have verbosity settings etc.
CLI Offers a few commands:
# Development with clean logs
client -c dev_user -k dev_circle --no-timestamp -v
```
workers:
start
stop
restart
status
logs
list
#### Production Usage
```bash
# Production client with specific worker
client \
--caller-key prod_user_123 \
--circle-key prod_circle_456 \
--worker-key prod_worker_789 \
--redis-url redis://redis-cluster:6379/0 \
--timeout 300 \
--file production_script.rhai
```
#### Batch Processing
```bash
# Process multiple scripts
for script in scripts/*.rhai; do
client -c batch_user -k batch_circle -f "$script" --no-timestamp
done
```
### Key Concepts
- **Caller Key**: Your identity - used for authentication and tracking
- **Circle Key**: Execution context - defines the environment/permissions
- **Worker Key**: Target worker - which worker should execute the script (defaults to circle key)
### Error Handling
The client provides clear error messages for:
- Missing required keys
- Redis connection failures
- Script execution timeouts
- Worker unavailability
- Script syntax errors
### Dependencies
- `rhai_supervisor`: Core client library for Redis-based script execution
- `redis`: Redis client for task queue communication
- `clap`: Command-line argument parsing
- `env_logger`: Logging infrastructure
- `tokio`: Async runtime
jobs:
create
start
stop
restart
status
logs
list
repl: you can enter interactive mode to run scripts; however, predefine caller_id, context_id, and worker type so the supervisor dispatches jobs accordingly

View File

@@ -0,0 +1,365 @@
use anyhow::Result;
use clap::Parser;
use crossterm::{
event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyEventKind},
execute,
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
};
use hero_supervisor::{Supervisor, SupervisorBuilder};
use zinit_client::ZinitClient;
use log::{error, info};
use ratatui::{
backend::CrosstermBackend,
layout::{Constraint, Direction, Layout, Rect},
style::{Color, Modifier, Style},
text::Line,
widgets::{
Block, Borders, List, ListItem, Paragraph, Tabs, Wrap,
},
Frame, Terminal,
};
use std::{
io,
path::PathBuf,
sync::Arc,
time::{Duration, Instant},
};
use tokio::time::sleep;
use toml;
use serde::Deserialize;
/// Command-line arguments for the Hero Supervisor TUI binary.
#[derive(Parser)]
#[command(name = "hive-supervisor-tui")]
#[command(about = "Hero Supervisor Terminal User Interface")]
struct Args {
    /// Path to the TOML configuration file (parsed into `Config`).
    #[arg(short, long, help = "Configuration file path")]
    config: PathBuf,
    /// When set, the default log filter becomes `debug` instead of `info`.
    #[arg(short, long, help = "Enable verbose logging")]
    verbose: bool,
}
/// Top-level TOML configuration.
///
/// The `[global]` table maps to `global`; every other top-level table is
/// captured by the flattened `workers` map, keyed by its table name
/// (e.g. `osis_worker`, `sal_worker`).
#[derive(Debug, Deserialize)]
struct Config {
    global: GlobalConfig,
    // All remaining tables must deserialize as worker configurations.
    #[serde(flatten)]
    workers: std::collections::HashMap<String, WorkerConfigToml>,
}
/// Settings shared by the supervisor and all workers.
#[derive(Debug, Deserialize)]
struct GlobalConfig {
    /// Redis connection URL used for job queuing.
    redis_url: String,
}
/// One worker's table in the configuration file.
#[derive(Debug, Deserialize)]
struct WorkerConfigToml {
    /// Filesystem path to the worker executable.
    binary_path: String,
    /// Optional extra environment variables passed to the worker process.
    env_vars: Option<std::collections::HashMap<String, String>>,
}
/// Identifier for each top-level tab in the TUI.
#[derive(Debug, Clone, PartialEq)]
enum TabId {
    Dashboard,
    Workers,
    Jobs,
    Logs,
}

impl TabId {
    /// Every tab, in display order (left to right in the tab bar).
    fn all() -> Vec<TabId> {
        [TabId::Dashboard, TabId::Workers, TabId::Jobs, TabId::Logs].to_vec()
    }

    /// Human-readable label shown in the tab bar.
    fn title(&self) -> &str {
        use TabId::*;
        match self {
            Dashboard => "Dashboard",
            Workers => "Workers",
            Jobs => "Jobs",
            Logs => "Logs",
        }
    }
}
/// Mutable TUI state threaded through the render and event loops.
struct App {
    /// Handle to the supervisor (workers are started before the TUI runs).
    supervisor: Arc<Supervisor>,
    /// Tab currently shown in the body area.
    current_tab: TabId,
    /// Set by `handle_key` when the user requests exit ('q').
    should_quit: bool,
    /// In-memory activity log rendered on the Dashboard and Logs tabs.
    logs: Vec<String>,
    /// Time of construction. NOTE(review): never read or refreshed anywhere
    /// in this file — confirm whether it is still needed.
    last_update: Instant,
}
impl App {
    /// Create the initial UI state for an already-running supervisor.
    fn new(supervisor: Arc<Supervisor>) -> Self {
        Self {
            supervisor,
            current_tab: TabId::Dashboard,
            should_quit: false,
            logs: vec!["TUI started successfully".to_string()],
            last_update: Instant::now(),
        }
    }

    /// Select the tab to the right, wrapping around past the last one.
    fn next_tab(&mut self) {
        let order = TabId::all();
        let here = order.iter().position(|tab| *tab == self.current_tab).unwrap_or(0);
        self.current_tab = order[(here + 1) % order.len()].clone();
    }

    /// Select the tab to the left, wrapping around past the first one.
    fn prev_tab(&mut self) {
        let order = TabId::all();
        let here = order.iter().position(|tab| *tab == self.current_tab).unwrap_or(0);
        let back = (here + order.len() - 1) % order.len();
        self.current_tab = order[back].clone();
    }

    /// Append a UTC-timestamped line to the in-memory log, keeping at most
    /// the 100 most recent entries.
    fn add_log(&mut self, message: String) {
        let stamped = format!("[{}] {}",
            chrono::Utc::now().format("%H:%M:%S"),
            message
        );
        self.logs.push(stamped);
        while self.logs.len() > 100 {
            self.logs.remove(0);
        }
    }

    /// React to a key press; returns `true` when the app should exit.
    fn handle_key(&mut self, key: KeyCode) -> bool {
        match key {
            KeyCode::Char('q') => {
                self.should_quit = true;
                true
            }
            KeyCode::Tab => {
                self.next_tab();
                false
            }
            KeyCode::BackTab => {
                self.prev_tab();
                false
            }
            _ => false,
        }
    }
}
/// Draw the whole UI: a three-row tab bar on top, the active tab's content
/// in the remaining area below.
fn render_ui(f: &mut Frame, app: &mut App) {
    let chunks = Layout::default()
        .direction(Direction::Vertical)
        .constraints([Constraint::Length(3), Constraint::Min(0)].as_ref())
        .split(f.area());

    // Tab bar. Build both the titles and the selected index from a single
    // `TabId::all()` call (the original allocated the Vec a second time just
    // to compute the selected position).
    let tabs_list = TabId::all();
    let tab_titles: Vec<Line> = tabs_list
        .iter()
        .map(|t| Line::from(t.title()))
        .collect();
    let selected_tab = tabs_list.iter().position(|t| *t == app.current_tab).unwrap_or(0);
    let tabs = Tabs::new(tab_titles)
        .block(Block::default().borders(Borders::ALL).title("Hero Supervisor TUI"))
        .select(selected_tab)
        .style(Style::default().fg(Color::Cyan))
        .highlight_style(Style::default().add_modifier(Modifier::BOLD).bg(Color::Black));
    f.render_widget(tabs, chunks[0]);

    // Body: delegate to the renderer for whichever tab is active.
    match app.current_tab {
        TabId::Dashboard => render_dashboard(f, chunks[1], app),
        TabId::Workers => render_workers(f, chunks[1], app),
        TabId::Jobs => render_jobs(f, chunks[1], app),
        TabId::Logs => render_logs(f, chunks[1], app),
    }
}
/// Dashboard tab: a static status summary plus the ten most recent log lines.
fn render_dashboard(f: &mut Frame, area: Rect, app: &App) {
    let rows = Layout::default()
        .direction(Direction::Vertical)
        .constraints([Constraint::Length(7), Constraint::Min(0)].as_ref())
        .split(area);

    // Static status block — reaching this point means startup succeeded.
    let status_text = "Status: ✓ Running\nWorkers: Started successfully\nJobs: Ready for processing\n\nPress 'q' to quit, Tab to navigate";
    f.render_widget(
        Paragraph::new(status_text)
            .block(Block::default().borders(Borders::ALL).title("System Status"))
            .wrap(Wrap { trim: true }),
        rows[0],
    );

    // Newest-first view of (at most) the last ten log entries.
    let recent: Vec<ListItem> = app
        .logs
        .iter()
        .rev()
        .take(10)
        .map(|entry| ListItem::new(entry.as_str()))
        .collect();
    f.render_widget(
        List::new(recent).block(Block::default().borders(Borders::ALL).title("Recent Activity")),
        rows[1],
    );
}
/// Workers tab: placeholder text until live status checks are wired in.
fn render_workers(f: &mut Frame, area: Rect, _app: &App) {
    let body = Paragraph::new("Workers tab - Status checking not implemented yet to avoid system issues")
        .wrap(Wrap { trim: true })
        .block(Block::default().borders(Borders::ALL).title("Workers"));
    f.render_widget(body, area);
}
/// Jobs tab: placeholder text until job monitoring is wired in.
fn render_jobs(f: &mut Frame, area: Rect, _app: &App) {
    let body = Paragraph::new("Jobs tab - Job monitoring not implemented yet to avoid system issues")
        .wrap(Wrap { trim: true })
        .block(Block::default().borders(Borders::ALL).title("Jobs"));
    f.render_widget(body, area);
}
/// Logs tab: the full in-memory log, oldest entry first.
fn render_logs(f: &mut Frame, area: Rect, app: &App) {
    let entries: Vec<ListItem> = app
        .logs
        .iter()
        .map(|entry| ListItem::new(entry.as_str()))
        .collect();
    let list = List::new(entries)
        .block(Block::default().borders(Borders::ALL).title("System Logs"));
    f.render_widget(list, area);
}
/// Main event loop: draw, poll for input (100 ms), handle key presses,
/// throttle, repeat — until the user asks to quit.
///
/// NOTE(review): `event::poll`/`event::read` are blocking calls inside an
/// async fn, so this loop occupies a tokio worker thread while polling.
/// Acceptable for a single-purpose TUI binary; confirm before reusing this
/// loop inside a larger async application.
async fn run_app(
    terminal: &mut Terminal<CrosstermBackend<io::Stdout>>,
    app: &mut App,
) -> Result<()> {
    loop {
        terminal.draw(|f| render_ui(f, app))?;

        // Wait up to 100 ms for an input event; only key *presses* are
        // forwarded (release/repeat events are ignored).
        if event::poll(Duration::from_millis(100))? {
            if let Event::Key(key) = event::read()? {
                if key.kind == KeyEventKind::Press {
                    app.handle_key(key.code);
                }
            }
        }

        // Single exit check: `handle_key` sets `should_quit` exactly when it
        // returned true, so the original's second break was redundant.
        if app.should_quit {
            break;
        }

        // Yield to the async runtime and throttle redraws.
        sleep(Duration::from_millis(50)).await;
    }
    Ok(())
}
/// Entry point: fail-fast initialization (config → Zinit probe → supervisor
/// build → worker startup), then hand control to the TUI event loop and
/// restore the terminal on the way out.
#[tokio::main]
async fn main() -> Result<()> {
    let args = Args::parse();

    // Initialize logging: `-v` switches the default filter from info to debug.
    if args.verbose {
        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")).init();
    } else {
        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
    }
    info!("Hero Supervisor TUI - Fail-fast initialization");

    // Step 1: Load and parse the TOML configuration.
    info!("Step 1/4: Loading configuration from {:?}", args.config);
    let config_content = std::fs::read_to_string(&args.config)
        .map_err(|e| anyhow::anyhow!("Failed to read config file: {}", e))?;
    let config: Config = toml::from_str(&config_content)
        .map_err(|e| anyhow::anyhow!("Failed to parse config file: {}", e))?;
    info!("✓ Configuration loaded successfully");

    // Step 2: Probe Zinit by querying a dummy service name.
    // NOTE(review): the socket path is hard-coded to /tmp/zinit.sock here,
    // while the example docs mention a configurable `zinit_socket_path` —
    // confirm which is authoritative.
    info!("Step 2/4: Checking if Zinit is running...");
    let zinit_client = ZinitClient::new("/tmp/zinit.sock");
    match zinit_client.status("_test_connectivity").await {
        Ok(_) => {
            info!("✓ Zinit is running and accessible");
        }
        Err(e) => {
            // Connection-level failures mean Zinit itself is unreachable;
            // any other error (e.g. unknown service) implies it is running.
            let error_msg = e.to_string();
            if error_msg.contains("Connection refused") || error_msg.contains("No such file") {
                eprintln!("Error: Zinit process manager is not running.");
                eprintln!("Please start Zinit before running the supervisor TUI.");
                eprintln!("Expected Zinit socket at: /tmp/zinit.sock");
                std::process::exit(1);
            } else {
                info!("✓ Zinit is running (service not found is expected)");
            }
        }
    }

    // Step 3: Build the supervisor from the configured worker sections.
    info!("Step 3/4: Building supervisor...");
    let mut builder = SupervisorBuilder::new()
        .redis_url(&config.global.redis_url);
    for (worker_name, worker_config) in &config.workers {
        match worker_name.as_str() {
            "osis_worker" => builder = builder.osis_worker(&worker_config.binary_path),
            "sal_worker" => builder = builder.sal_worker(&worker_config.binary_path),
            "v_worker" => builder = builder.v_worker(&worker_config.binary_path),
            "python_worker" => builder = builder.python_worker(&worker_config.binary_path),
            _ => log::warn!("Unknown worker type: {}", worker_name),
        }
        // NOTE(review): env vars are registered even when the worker type was
        // unknown above, and `worker_env_var` looks builder-global rather than
        // per-worker — confirm against SupervisorBuilder's semantics.
        if let Some(env_vars) = &worker_config.env_vars {
            for (key, value) in env_vars {
                builder = builder.worker_env_var(key, value);
            }
        }
    }
    let supervisor = Arc::new(builder.build()
        .map_err(|e| anyhow::anyhow!("Failed to build supervisor: {}", e))?);
    info!("✓ Supervisor built successfully");

    // Step 4: Launch all configured workers before the UI starts.
    info!("Step 4/4: Starting supervisor and workers...");
    supervisor.start_workers().await
        .map_err(|e| anyhow::anyhow!("Failed to start workers: {}", e))?;
    info!("✓ All workers started successfully");

    // Initialization succeeded — switch the terminal into TUI mode.
    info!("Initialization complete - starting TUI...");
    let mut app = App::new(Arc::clone(&supervisor));

    // Terminal setup: raw mode + alternate screen + mouse capture.
    enable_raw_mode()?;
    let mut stdout = io::stdout();
    execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?;
    let backend = CrosstermBackend::new(stdout);
    let mut terminal = Terminal::new(backend)?;

    // Run the event loop; capture its result so cleanup always runs first.
    let result = run_app(&mut terminal, &mut app).await;

    // Restore the terminal regardless of how the loop ended.
    disable_raw_mode()?;
    execute!(
        terminal.backend_mut(),
        LeaveAlternateScreen,
        DisableMouseCapture
    )?;
    terminal.show_cursor()?;

    // Shut the supervisor down; log (don't fail on) cleanup errors.
    if let Err(e) = supervisor.cleanup_and_shutdown().await {
        error!("Error during cleanup: {}", e);
    }
    info!("Hero Supervisor TUI shutdown complete");
    result
}

View File

@@ -1,190 +0,0 @@
# Architecture of the `rhai_supervisor` Crate
The `rhai_supervisor` crate provides a Redis-based client library for submitting Rhai scripts to distributed worker services and awaiting their execution results. It implements a request-reply pattern using Redis as the message broker.
## Core Architecture
The client follows a builder pattern design with clear separation of concerns:
```mermaid
graph TD
A[RhaiSupervisorBuilder] --> B[RhaiSupervisor]
B --> C[PlayRequestBuilder]
C --> D[PlayRequest]
D --> E[Redis Task Queue]
E --> F[Worker Service]
F --> G[Redis Reply Queue]
G --> H[Client Response]
subgraph "Client Components"
A
B
C
D
end
subgraph "Redis Infrastructure"
E
G
end
subgraph "External Services"
F
end
```
## Key Components
### 1. RhaiSupervisorBuilder
A builder pattern implementation for constructing `RhaiSupervisor` instances with proper configuration validation.
**Responsibilities:**
- Configure Redis connection URL
- Set caller ID for task attribution
- Validate configuration before building client
**Key Methods:**
- `caller_id(id: &str)` - Sets the caller identifier
- `redis_url(url: &str)` - Configures Redis connection
- `build()` - Creates the final `RhaiSupervisor` instance
### 2. RhaiSupervisor
The main client interface that manages Redis connections and provides factory methods for creating play requests.
**Responsibilities:**
- Maintain Redis connection pool
- Provide factory methods for request builders
- Handle low-level Redis operations
- Manage task status queries
**Key Methods:**
- `new_play_request()` - Creates a new `PlayRequestBuilder`
- `get_task_status(task_id)` - Queries task status from Redis
- Internal methods for Redis operations
### 3. PlayRequestBuilder
A fluent builder for constructing and submitting script execution requests.
**Responsibilities:**
- Configure script execution parameters
- Handle script loading from files or strings
- Manage request timeouts
- Provide submission methods (fire-and-forget vs await-response)
**Key Methods:**
- `worker_id(id: &str)` - Target worker queue (determines which worker processes the task)
- `context_id(id: &str)` - Target context ID (determines execution context/circle)
- `script(content: &str)` - Set script content directly
- `script_path(path: &str)` - Load script from file
- `timeout(duration: Duration)` - Set execution timeout
- `submit()` - Fire-and-forget submission
- `await_response()` - Submit and wait for result
**Architecture Note:** The decoupling of `worker_id` and `context_id` allows a single worker to process tasks for multiple contexts (circles), providing greater deployment flexibility.
### 4. Data Structures
#### RhaiTaskDetails
Represents the complete state of a task throughout its lifecycle.
```rust
pub struct RhaiTaskDetails {
pub task_id: String,
pub script: String,
pub status: String, // "pending", "processing", "completed", "error"
pub output: Option<String>,
pub error: Option<String>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
pub caller_id: String,
}
```
#### RhaiSupervisorError
Comprehensive error handling for various failure scenarios:
- `RedisError` - Redis connection/operation failures
- `SerializationError` - JSON serialization/deserialization issues
- `Timeout` - Task execution timeouts
- `TaskNotFound` - Missing tasks after submission
## Communication Protocol
### Task Submission Flow
1. **Task Creation**: Client generates unique UUID for task identification
2. **Task Storage**: Task details stored in Redis hash: `rhailib:<task_id>`
3. **Queue Submission**: Task ID pushed to worker queue: `rhailib:<worker_id>`
4. **Reply Queue Setup**: Client listens on: `rhailib:reply:<task_id>`
### Redis Key Patterns
- **Task Storage**: `rhailib:<task_id>` (Redis Hash)
- **Worker Queues**: `rhailib:<worker_id>` (Redis List)
- **Reply Queues**: `rhailib:reply:<task_id>` (Redis List)
### Message Flow Diagram
```mermaid
sequenceDiagram
participant C as Client
participant R as Redis
participant W as Worker
C->>R: HSET rhailib:task_id (task details)
C->>R: LPUSH rhailib:worker_id task_id
C->>R: BLPOP rhailib:reply:task_id (blocking)
W->>R: BRPOP rhailib:worker_id (blocking)
W->>W: Execute Rhai Script
W->>R: LPUSH rhailib:reply:task_id (result)
R->>C: Return result from BLPOP
C->>R: DEL rhailib:reply:task_id (cleanup)
```
## Concurrency and Async Design
The client is built on `tokio` for asynchronous operations:
- **Connection Pooling**: Uses Redis multiplexed connections for efficiency
- **Non-blocking Operations**: All Redis operations are async
- **Timeout Handling**: Configurable timeouts with proper cleanup
- **Error Propagation**: Comprehensive error handling with context
## Configuration and Deployment
### Prerequisites
- Redis server accessible to both client and workers
- Proper network connectivity between components
- Sufficient Redis memory for task storage
### Configuration Options
- **Redis URL**: Connection string for Redis instance
- **Caller ID**: Unique identifier for client instance
- **Timeouts**: Per-request timeout configuration
- **Worker Targeting**: Direct worker queue addressing
## Security Considerations
- **Task Isolation**: Each task uses unique identifiers
- **Queue Separation**: Worker-specific queues prevent cross-contamination
- **Cleanup**: Automatic cleanup of reply queues after completion
- **Error Handling**: Secure error propagation without sensitive data leakage
## Performance Characteristics
- **Scalability**: Horizontal scaling through multiple worker instances
- **Throughput**: Limited by Redis performance and network latency
- **Memory Usage**: Efficient with connection pooling and cleanup
- **Latency**: Low latency for local Redis deployments
## Integration Points
The client integrates with:
- **Worker Services**: Via Redis queue protocol
- **Monitoring Systems**: Through structured logging
- **Application Code**: Via builder pattern API
- **Configuration Systems**: Through environment variables and builders

View File

@@ -0,0 +1,185 @@
# Hero Supervisor CLI Example
This example demonstrates how to use the `hive-supervisor` CLI tool for managing workers and jobs in the Hero ecosystem.
## Prerequisites
1. **Redis Server**: Make sure Redis is running on `localhost:6379`
```bash
# Install Redis (macOS)
brew install redis
# Start Redis
redis-server
```
2. **Zinit Process Manager**: Install and configure Zinit
```bash
# Install Zinit (example for Linux/macOS)
# Follow Zinit installation instructions for your platform
```
3. **Worker Binaries**: The configuration references worker binaries that need to be available:
- `/usr/local/bin/osis_worker`
- `/usr/local/bin/sal_worker`
- `/usr/local/bin/v_worker`
- `/usr/local/bin/python_worker`
For testing purposes, you can create mock worker binaries or update the paths in `config.toml` to point to existing binaries.
## Configuration
The `config.toml` file contains the supervisor configuration:
- **Global settings**: Redis URL and Zinit socket path
- **Worker configurations**: Binary paths and environment variables for each worker type
## Usage Examples
### 1. Build the CLI
```bash
# From the supervisor directory
cargo build --bin hive-supervisor --release
```
### 2. Worker Management
```bash
# Show help
./target/release/hive-supervisor --config examples/cli/config.toml --help
# List all configured workers
./target/release/hive-supervisor --config examples/cli/config.toml workers list
# Start all workers
./target/release/hive-supervisor --config examples/cli/config.toml workers start
# Start specific workers
./target/release/hive-supervisor --config examples/cli/config.toml workers start osis_worker sal_worker
# Check worker status
./target/release/hive-supervisor --config examples/cli/config.toml workers status
# Stop all workers
./target/release/hive-supervisor --config examples/cli/config.toml workers stop
# Restart specific worker
./target/release/hive-supervisor --config examples/cli/config.toml workers restart osis_worker
```
### 3. Job Management
```bash
# Create a job with inline script
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
--script 'print("Hello from OSIS worker!");' \
--script-type osis \
--caller-id "user123" \
--context-id "session456"
# Create a job from file
./target/release/hive-supervisor --config examples/cli/config.toml jobs create \
--file examples/cli/sample_script.rhai \
--script-type osis \
--caller-id "user123" \
--context-id "session456"
# List all jobs
./target/release/hive-supervisor --config examples/cli/config.toml jobs list
# Check job status
./target/release/hive-supervisor --config examples/cli/config.toml jobs status <JOB_ID>
# View job logs
./target/release/hive-supervisor --config examples/cli/config.toml jobs logs <JOB_ID>
# Stop a job
./target/release/hive-supervisor --config examples/cli/config.toml jobs stop <JOB_ID>
```
### 4. Interactive REPL Mode
```bash
# Enter REPL mode for OSIS scripts
./target/release/hive-supervisor --config examples/cli/config.toml repl \
--caller-id "user123" \
--context-id "session456" \
--script-type osis \
--timeout 60
# In REPL mode, you can:
# - Type scripts directly and press Enter to execute
# - Type 'help' for available commands
# - Type 'exit' or 'quit' to leave REPL mode
```
### 5. Verbose Logging
```bash
# Enable debug logging
./target/release/hive-supervisor --config examples/cli/config.toml -v workers status
# Enable trace logging
./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
# Disable timestamps
./target/release/hive-supervisor --config examples/cli/config.toml --no-timestamp workers status
```
## Sample Scripts
The `sample_scripts/` directory contains example scripts for different worker types:
- `hello_osis.rhai` - Simple OSIS/HeroScript example
- `system_sal.rhai` - SAL system operation example
- `math_v.v` - V language calculation example
- `data_python.py` - Python data processing example
## Troubleshooting
### Common Issues
1. **Redis Connection Error**
- Ensure Redis is running: `redis-cli ping`
- Check the Redis URL in `config.toml`
2. **Zinit Socket Error**
- Verify Zinit is running and the socket path is correct
- Check permissions on the socket file
3. **Worker Binary Not Found**
- Update binary paths in `config.toml` to match your system
- Ensure worker binaries are executable
4. **Permission Denied**
- Check file permissions on configuration and binary files
- Ensure the user has access to the Zinit socket
### Debug Mode
Run with verbose logging to see detailed operation information:
```bash
RUST_LOG=debug ./target/release/hive-supervisor --config examples/cli/config.toml -vv workers status
```
## Configuration Customization
You can customize the configuration for your environment:
1. **Update Redis URL**: Change `redis_url` in the `[global]` section
2. **Update Zinit Socket**: Change `zinit_socket_path` for your Zinit installation
3. **Worker Paths**: Update binary paths in worker sections to match your setup
4. **Environment Variables**: Add or modify environment variables for each worker type
## Integration with Hero Ecosystem
This CLI integrates with the broader Hero ecosystem:
- **Job Queue**: Uses Redis for job queuing and status tracking
- **Process Management**: Uses Zinit for worker lifecycle management
- **Script Execution**: Supports multiple script types (OSIS, SAL, V, Python)
- **Monitoring**: Provides real-time status and logging capabilities
For more information about the Hero ecosystem, see the main project documentation.

View File

@@ -0,0 +1,19 @@
# Hero Supervisor CLI Configuration Example
# This configuration demonstrates how to set up the hive-supervisor CLI
# with different worker types for script execution.
[global]
# Redis connection URL for job queuing
redis_url = "redis://localhost:6379"
# OSIS Worker Configuration
# Handles OSIS (HeroScript) execution
[osis_worker]
binary_path = "/Users/timurgordon/code/git.ourworld.tf/herocode/hero/target/debug/osis"
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "osis", "MAX_CONCURRENT_JOBS" = "5" }
# SAL Worker Configuration
# Handles System Abstraction Layer scripts
[sal_worker]
binary_path = "/Users/timurgordon/code/git.ourworld.tf/herocode/hero/target/debug/sal"
env_vars = { "RUST_LOG" = "info", "WORKER_TYPE" = "sal", "MAX_CONCURRENT_JOBS" = "3" }

View File

@@ -0,0 +1,144 @@
#!/bin/bash
# Hero Supervisor CLI Example Runner
# This script demonstrates various CLI operations
# Abort on the first failing command. Note: because run_cli returns 1 on
# failure, a failed demo step therefore stops the whole script.
set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration: resolve paths relative to this script's location so the
# runner works from any current directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SUPERVISOR_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
CONFIG_FILE="$SCRIPT_DIR/config.toml"
CLI_BINARY="$SUPERVISOR_DIR/target/release/hive-supervisor"

echo -e "${BLUE}=== Hero Supervisor CLI Example Runner ===${NC}"
echo "Script directory: $SCRIPT_DIR"
echo "Supervisor directory: $SUPERVISOR_DIR"
echo "Configuration file: $CONFIG_FILE"
echo

# Function to run CLI command with error handling.
# $1 is a human-readable description; the remaining args go to the CLI.
run_cli() {
    local description="$1"
    shift
    echo -e "${YELLOW}Running: $description${NC}"
    echo "Command: $CLI_BINARY --config $CONFIG_FILE $*"
    echo
    if "$CLI_BINARY" --config "$CONFIG_FILE" "$@"; then
        echo -e "${GREEN}✓ Success${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
        return 1
    fi
    echo
}

# Check if CLI binary exists; build it on demand if it is missing.
if [[ ! -f "$CLI_BINARY" ]]; then
    echo -e "${YELLOW}Building CLI binary...${NC}"
    cd "$SUPERVISOR_DIR"
    cargo build --bin hive-supervisor --release
    echo
fi

# Check if config file exists
if [[ ! -f "$CONFIG_FILE" ]]; then
    echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}"
    exit 1
fi

echo -e "${BLUE}=== CLI Help and Information ===${NC}"
run_cli "Show main help" --help

echo -e "${BLUE}=== Worker Management Examples ===${NC}"
run_cli "List configured workers" workers list
run_cli "Show worker management help" workers --help

# Note: These commands would require actual worker binaries and Zinit setup
echo -e "${YELLOW}Note: The following commands require actual worker binaries and Zinit setup${NC}"
echo -e "${YELLOW}They are shown for demonstration but may fail without proper setup${NC}"
echo

# Uncomment these if you have the proper setup
# run_cli "Check worker status" workers status
# run_cli "Start all workers" workers start
# run_cli "Check worker status after start" workers status

echo -e "${BLUE}=== Job Management Examples ===${NC}"
run_cli "Show job management help" jobs --help

# Create sample jobs (these will also require workers to be running).
# The commands below are only echoed for the reader, not executed.
echo -e "${YELLOW}Sample job creation commands (require running workers):${NC}"
echo
echo "# Create OSIS job with inline script:"
echo "$CLI_BINARY --config $CONFIG_FILE jobs create \\"
echo " --script 'print(\"Hello from CLI!\");' \\"
echo " --script-type osis \\"
echo " --caller-id \"cli_demo\" \\"
echo " --context-id \"example_session\""
echo
echo "# Create job from sample script file:"
echo "$CLI_BINARY --config $CONFIG_FILE jobs create \\"
echo " --file \"$SCRIPT_DIR/sample_scripts/hello_osis.rhai\" \\"
echo " --script-type osis \\"
echo " --caller-id \"cli_demo\" \\"
echo " --context-id \"example_session\""
echo
echo "# List all jobs:"
echo "$CLI_BINARY --config $CONFIG_FILE jobs list"
echo
echo "# Check job status (replace JOB_ID with actual job ID):"
echo "$CLI_BINARY --config $CONFIG_FILE jobs status JOB_ID"
echo

echo -e "${BLUE}=== REPL Mode Example ===${NC}"
echo -e "${YELLOW}REPL mode command (interactive):${NC}"
echo "$CLI_BINARY --config $CONFIG_FILE repl \\"
echo " --caller-id \"cli_demo\" \\"
echo " --context-id \"example_session\" \\"
echo " --script-type osis \\"
echo " --timeout 60"
echo

echo -e "${BLUE}=== Sample Scripts ===${NC}"
echo "Available sample scripts in $SCRIPT_DIR/sample_scripts/:"
# List any sample scripts shipped alongside this example; the -f guard also
# skips the unexpanded glob when the directory is empty or missing.
for script in "$SCRIPT_DIR/sample_scripts"/*; do
    if [[ -f "$script" ]]; then
        basename "$script"
    fi
done
echo

echo -e "${BLUE}=== Verbose Logging Examples ===${NC}"
echo "# Debug logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -v workers list"
echo
echo "# Trace logging:"
echo "$CLI_BINARY --config $CONFIG_FILE -vv workers list"
echo
echo "# No timestamps:"
echo "$CLI_BINARY --config $CONFIG_FILE --no-timestamp workers list"
echo

echo -e "${GREEN}=== Example Runner Complete ===${NC}"
echo -e "${YELLOW}To run actual commands, ensure you have:${NC}"
echo "1. Redis server running on localhost:6379"
echo "2. Zinit process manager installed and configured"
echo "3. Worker binaries available at the paths specified in config.toml"
echo
echo -e "${YELLOW}For testing without full setup, you can:${NC}"
echo "1. Update config.toml with paths to existing binaries"
echo "2. Use the CLI help commands and configuration validation"
echo "3. Test the REPL mode (requires workers to be running)"

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""Demonstration script for the Python worker.

Walks through a handful of common data-processing tasks — filtering,
simple statistics, date handling, JSON serialization, and a simulated
file-processing step — printing progress to stdout as it goes.
"""
import json
import datetime
from typing import List, Dict


def main():
    """Run every section of the Python worker demo in order."""
    print("=== Python Worker Demo ===")
    print("Python data processing operations")

    # A small in-memory roster used by all of the sections below.
    print("\nData structures:")
    users = [
        {"id": 1, "name": "Alice", "age": 30, "role": "developer"},
        {"id": 2, "name": "Bob", "age": 25, "role": "designer"},
        {"id": 3, "name": "Charlie", "age": 35, "role": "manager"},
        {"id": 4, "name": "Diana", "age": 28, "role": "developer"},
    ]
    print(f"Total users: {len(users)}")

    # Filtering: keep only the developers.
    developers = [person for person in users if person["role"] == "developer"]
    print(f"Developers: {len(developers)}")
    for person in developers:
        print(f" - {person['name']} (age {person['age']})")

    # Basic statistics over the ages.
    print("\nStatistical operations:")
    ages = [person["age"] for person in users]
    avg_age = sum(ages) / len(ages)
    min_age, max_age = min(ages), max(ages)
    print(f"Average age: {avg_age:.1f}")
    print(f"Age range: {min_age} - {max_age}")

    # Date/time handling.
    print("\nDate/time operations:")
    now = datetime.datetime.now()
    print(f"Current time: {now.strftime('%Y-%m-%d %H:%M:%S')}")

    # Derive each user's birth year from the current year.
    current_year = now.year
    for person in users:
        print(f"{person['name']} was born in {current_year - person['age']}")

    # JSON serialization with a 200-character preview cap.
    print("\nJSON processing:")
    json_data = json.dumps(users, indent=2)
    print("User data as JSON:")
    preview = json_data if len(json_data) <= 200 else json_data[:200] + "..."
    print(preview)

    # Simulated file handling.
    print("\nFile operations:")
    simulate_file_processing()

    print("=== Python Demo Complete ===")


def simulate_file_processing():
    """Pretend to process a fixed batch of files and report type groupings."""
    files = [
        {"name": "data.csv", "size": 1024, "type": "csv"},
        {"name": "config.json", "size": 512, "type": "json"},
        {"name": "report.pdf", "size": 2048, "type": "pdf"},
        {"name": "script.py", "size": 768, "type": "python"},
    ]
    total_size = sum(entry["size"] for entry in files)
    print(f"Processing {len(files)} files, total size: {total_size} bytes")

    # Bucket file names by their type.
    file_types = {}
    for entry in files:
        file_types.setdefault(entry["type"], []).append(entry["name"])

    print("Files by type:")
    for file_type, file_names in file_types.items():
        print(f" {file_type}: {', '.join(file_names)}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,34 @@
// Sample OSIS/HeroScript for demonstration
// This script demonstrates basic OSIS worker functionality

print("=== OSIS Worker Demo ===");
print("Hello from the OSIS worker!");

// Basic variable operations
let product = "Hero";
let release = "1.0";
print(`Running ${product} version ${release}`);

// Simple calculation
let left = 10;
let right = 20;
let total = left + right;
print(`Calculation: ${left} + ${right} = ${total}`);

// Array operations: accumulate the sum of the elements.
let numbers = [1, 2, 3, 4, 5];
let running_sum = 0;
for value in numbers {
    running_sum += value;
}
print(`Sum of array [1,2,3,4,5]: ${running_sum}`);

// Function definition and call: build a welcome message for `person`.
fn greet(person) {
    `Hello, ${person}! Welcome to Hero.`
}

print(greet("Developer"));

print("=== OSIS Demo Complete ===");

View File

@@ -0,0 +1,67 @@
// Sample V language script for demonstration
// This script demonstrates V worker functionality

module main

import math

// Entry point: walks through basic arithmetic, math-library calls,
// array aggregation, square roots, and a Fibonacci computation,
// printing each result to stdout.
fn main() {
    println("=== V Worker Demo ===")
    println("V language mathematical operations")

    // Basic arithmetic
    x := 15
    y := 25
    sum := x + y
    product := x * y
    println("Basic arithmetic:")
    println("${x} + ${y} = ${sum}")
    println("${x} * ${y} = ${product}")

    // Mathematical functions: sine/cosine of a 45° angle (degrees converted to radians)
    println("\nMathematical functions:")
    angle := 45.0
    sin_val := math.sin(math.radians(angle))
    cos_val := math.cos(math.radians(angle))
    println("sin(${angle}°) = ${sin_val:.4f}")
    println("cos(${angle}°) = ${cos_val:.4f}")

    // Array operations: sum the perfect squares below
    numbers := [1, 4, 9, 16, 25]
    println("\nArray operations:")
    println("Numbers: ${numbers}")
    mut total := 0
    for num in numbers {
        total += num
    }
    println("Sum: ${total}")

    // Square roots (f64 conversion required by math.sqrt)
    println("\nSquare roots:")
    for num in numbers {
        sqrt_val := math.sqrt(f64(num))
        println("√${num} = ${sqrt_val:.2f}")
    }

    // Fibonacci sequence (delegates to fibonacci() below)
    println("\nFibonacci sequence (first 10 numbers):")
    fib := fibonacci(10)
    println("${fib}")

    println("=== V Demo Complete ===")
}
// Build the first `n` Fibonacci numbers iteratively (0, 1, 1, 2, 3, ...).
// Returns an empty array when n == 0.
fn fibonacci(n int) []int {
    mut seq := []int{len: n}
    for i in 0 .. n {
        if i < 2 {
            // Seed values: fib(0) = 0, fib(1) = 1.
            seq[i] = i
        } else {
            seq[i] = seq[i - 1] + seq[i - 2]
        }
    }
    return seq
}

View File

@@ -0,0 +1,43 @@
// Sample SAL (System Abstraction Layer) script for demonstration
// This script demonstrates system-level operations through SAL worker

print("=== SAL Worker Demo ===");
print("System Abstraction Layer operations");

// System information gathering — all values below are simulated; a
// real SAL worker would obtain them through actual system calls.
print("Gathering system information...");
let host = "hero-system";
let up_for = "2 days, 4 hours";
let load = "0.45, 0.52, 0.48";
print(`Hostname: ${host}`);
print(`Uptime: ${up_for}`);
print(`Load Average: ${load}`);

// File system operations
print("\nFile system operations:");
let disk = "45% used";
let free_space = "120GB available";
print(`Disk Usage: ${disk}`);
print(`Available Space: ${free_space}`);

// Process management simulation
print("\nProcess management:");
let proc_count = 156;
let mem = "68%";
print(`Active Processes: ${proc_count}`);
print(`Memory Usage: ${mem}`);

// Network status
print("\nNetwork status:");
let interfaces = ["eth0", "lo"];
let net_state = "Connected";
print(`Network Interfaces: ${interfaces}`);
print(`Connectivity: ${net_state}`);

print("=== SAL Demo Complete ===");

View File

@@ -17,7 +17,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Configuration
let redis_url = "redis://localhost:6379";
let zinit_socket = "/var/run/zinit.sock";
// Create supervisor
let supervisor = SupervisorBuilder::new()

View File

@@ -12,7 +12,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Workers are automatically launched during build
let supervisor = SupervisorBuilder::new()
.redis_url("redis://localhost:6379")
.zinit_socket_path("/var/run/zinit.sock")
.osis_worker("/usr/local/bin/osis_worker")
.sal_worker("/usr/local/bin/sal_worker")
.v_worker("/usr/local/bin/v_worker")

View File

@@ -0,0 +1,18 @@
# Example supervisor configuration.
# One optional [<kind>_worker] section per worker type; each needs a
# binary path and may define extra environment variables for the worker.

[global]
# Redis connection URL used by the supervisor and all workers.
redis_url = "redis://localhost:6379"

[osis_worker]
binary_path = "/path/to/osis_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

[sal_worker]
binary_path = "/path/to/sal_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

[v_worker]
binary_path = "/path/to/v_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

[python_worker]
binary_path = "/path/to/python_worker"
env_vars = { "VAR1" = "value1", "VAR2" = "value2" }

View File

@@ -31,6 +31,8 @@ pub enum SupervisorError {
/// Zinit client operation error
ZinitError(String),
SupervisorNotConfigured,
/// Configuration file parsing error
ConfigError(String),
}
impl From<redis::RedisError> for SupervisorError {
@@ -95,6 +97,9 @@ impl std::fmt::Display for SupervisorError {
SupervisorError::SupervisorNotConfigured => {
write!(f, "Supervisor not configured for health monitoring")
}
SupervisorError::ConfigError(msg) => {
write!(f, "Configuration error: {}", msg)
}
}
}
}

View File

@@ -1,9 +1,14 @@
use log::{debug, error, info, warn};
use redis::AsyncCommands;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use hero_job::NAMESPACE_PREFIX;
use zinit_client::ZinitClient;
use zinit_client::Client as ZinitClient;
mod job;
mod error;
@@ -23,46 +28,209 @@ pub struct Supervisor {
pub struct SupervisorBuilder {
redis_url: Option<String>,
zinit_socket_path: Option<String>,
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// Helper struct to pass builder data to worker launch method
#[derive(Clone)]
struct SupervisorBuilderData {
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// TOML configuration structure for the supervisor.
///
/// Mirrors the layout of the supervisor config file: a required
/// `[global]` section plus optional per-worker and websocket sections.
#[derive(Debug, Deserialize, Serialize)]
pub struct SupervisorConfig {
    /// Required `[global]` section (Redis connection settings).
    pub global: GlobalConfig,
    /// Optional `[websocket_server]` section.
    pub websocket_server: Option<WebSocketServerConfig>,
    /// Optional `[osis_worker]` section.
    pub osis_worker: Option<WorkerConfigToml>,
    /// Optional `[sal_worker]` section.
    pub sal_worker: Option<WorkerConfigToml>,
    /// Optional `[v_worker]` section.
    pub v_worker: Option<WorkerConfigToml>,
    /// Optional `[python_worker]` section.
    pub python_worker: Option<WorkerConfigToml>,
}

/// Global configuration section (`[global]` in the TOML file).
#[derive(Debug, Deserialize, Serialize)]
pub struct GlobalConfig {
    /// Redis connection URL, e.g. `redis://localhost:6379`.
    pub redis_url: String,
}

/// Worker configuration section in TOML (one per worker kind).
#[derive(Debug, Deserialize, Serialize)]
pub struct WorkerConfigToml {
    /// Filesystem path to the worker binary.
    pub binary_path: String,
    /// Extra environment variables for the worker (defaults to empty).
    #[serde(default)]
    pub env_vars: HashMap<String, String>,
}
/// WebSocket server configuration section in TOML
/// This mirrors the ServerConfig from hero_websocket_server but avoids circular dependency
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct WebSocketServerConfig {
    /// Server host address (defaults to "127.0.0.1")
    #[serde(default = "default_host")]
    pub host: String,
    /// Server port (defaults to 8443)
    #[serde(default = "default_port")]
    pub port: u16,
    /// Redis connection URL (defaults to "redis://127.0.0.1/")
    #[serde(default = "default_redis_url")]
    pub redis_url: String,
    /// Enable authentication (defaults to false)
    #[serde(default)]
    pub auth: bool,
    /// Enable TLS/WSS (defaults to false)
    #[serde(default)]
    pub tls: bool,
    /// Path to TLS certificate file
    pub cert: Option<String>,
    /// Path to TLS private key file
    pub key: Option<String>,
    /// Separate port for TLS connections
    pub tls_port: Option<u16>,
    /// Circles configuration - maps circle names to lists of member public keys
    #[serde(default)]
    pub circles: HashMap<String, Vec<String>>,
}
// Default value functions for WebSocket server config.

/// Default bind address for the WebSocket server.
fn default_host() -> String {
    String::from("127.0.0.1")
}

/// Default listening port for the WebSocket server.
fn default_port() -> u16 {
    8443
}

/// Default Redis connection URL for the WebSocket server.
fn default_redis_url() -> String {
    String::from("redis://127.0.0.1/")
}
impl SupervisorBuilder {
/// Create a builder with no workers configured.
///
/// Defaults: no Redis URL (build() falls back to `redis://127.0.0.1/`),
/// Zinit socket at `/var/run/zinit.sock`, empty worker env vars, and
/// no WebSocket configuration.
pub fn new() -> Self {
    Self {
        redis_url: None,
        // Default Zinit socket path; overridable via `zinit_socket_path()`.
        zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
        osis_worker: None,
        sal_worker: None,
        v_worker: None,
        python_worker: None,
        worker_env_vars: HashMap::new(),
        websocket_config: None,
    }
}
/// Create a SupervisorBuilder from a TOML configuration file.
///
/// Reads and parses the file, then applies each configured section:
/// the global Redis URL, one optional binary path + env-var map per
/// worker kind, and an optional `[websocket_server]` section (stored
/// for later retrieval).
///
/// # Errors
/// Returns `SupervisorError::ConfigError` when the file cannot be read
/// or does not parse as valid TOML.
pub fn from_toml<P: AsRef<Path>>(toml_path: P) -> Result<Self, SupervisorError> {
    let toml_content = fs::read_to_string(toml_path)
        .map_err(|e| SupervisorError::ConfigError(format!("Failed to read TOML file: {}", e)))?;

    let config: SupervisorConfig = toml::from_str(&toml_content)
        .map_err(|e| SupervisorError::ConfigError(format!("Failed to parse TOML: {}", e)))?;

    let mut builder = Self::new()
        .redis_url(&config.global.redis_url);

    // Configure workers based on TOML config.
    // NOTE(review): each section feeds its env map through
    // `worker_env_vars`, which appears to be a single shared map —
    // confirm per-worker env vars are meant to be pooled.
    if let Some(osis_config) = config.osis_worker {
        builder = builder.osis_worker(&osis_config.binary_path)
            .worker_env_vars(osis_config.env_vars);
    }

    if let Some(sal_config) = config.sal_worker {
        builder = builder.sal_worker(&sal_config.binary_path)
            .worker_env_vars(sal_config.env_vars);
    }

    if let Some(v_config) = config.v_worker {
        builder = builder.v_worker(&v_config.binary_path)
            .worker_env_vars(v_config.env_vars);
    }

    if let Some(python_config) = config.python_worker {
        builder = builder.python_worker(&python_config.binary_path)
            .worker_env_vars(python_config.env_vars);
    }

    // Store WebSocket configuration for later use
    if let Some(ws_config) = config.websocket_server {
        builder.websocket_config = Some(ws_config);
    }

    Ok(builder)
}
/// Validate that all configured worker binaries exist and are executable.
///
/// Checks each configured path (OSIS, SAL, V, Python): it must exist,
/// be a regular file, and — on Unix — carry at least one execute bit.
///
/// # Errors
/// Returns `ConfigError` describing the first binary that fails a check.
fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
    let workers = [
        ("OSIS", &self.osis_worker),
        ("SAL", &self.sal_worker),
        ("V", &self.v_worker),
        ("Python", &self.python_worker),
    ];

    for (worker_type, binary_path) in workers {
        if let Some(path) = binary_path {
            let path_obj = Path::new(path);

            if !path_obj.exists() {
                return Err(SupervisorError::ConfigError(
                    format!("{} worker binary does not exist: {}", worker_type, path)
                ));
            }

            if !path_obj.is_file() {
                return Err(SupervisorError::ConfigError(
                    format!("{} worker path is not a file: {}", worker_type, path)
                ));
            }

            // Check if the file is executable (Unix-like systems)
            #[cfg(unix)]
            {
                use std::os::unix::fs::PermissionsExt;
                let metadata = path_obj.metadata().map_err(|e| {
                    SupervisorError::ConfigError(
                        format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
                    )
                })?;

                let permissions = metadata.permissions();
                // 0o111 masks the owner/group/other execute bits.
                if permissions.mode() & 0o111 == 0 {
                    return Err(SupervisorError::ConfigError(
                        format!("{} worker binary is not executable: {}", worker_type, path)
                    ));
                }
            }

            info!("Validated {} worker binary: {}", worker_type, path);
        }
    }

    Ok(())
}
/// Set the Redis connection URL used by the supervisor.
pub fn redis_url(mut self, url: &str) -> Self {
    self.redis_url = Some(url.to_string());
    self
}

/// Set the path to the Zinit control socket.
///
/// NOTE(review): `build()` currently creates the Zinit client with a
/// hardcoded `/tmp/zinit.sock` and ignores this value — confirm which
/// path should win.
pub fn zinit_socket_path(mut self, path: &str) -> Self {
    self.zinit_socket_path = Some(path.to_string());
    self
}
pub fn osis_worker(mut self, binary_path: &str) -> Self {
self.osis_worker = Some(binary_path.to_string());
self
@@ -95,21 +263,23 @@ impl SupervisorBuilder {
/// Builds the final `Supervisor` instance synchronously.
///
/// This method validates the configuration and creates the Redis client.
/// Worker launching is deferred to the `start_workers()` method.
/// This method validates the configuration, checks worker binary existence,
/// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
///
/// # Returns
///
/// * `Ok(Supervisor)` - Successfully configured client
/// * `Err(SupervisorError)` - Configuration or connection error
pub fn build(self) -> Result<Supervisor, SupervisorError> {
/// * `Ok(Supervisor)` - Successfully configured client with valid binaries
/// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
pub async fn build(self) -> Result<Supervisor, SupervisorError> {
// Validate that all configured worker binaries exist first
Self::validate_worker_binaries(&self)?;
let url = self.redis_url
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
let client = redis::Client::open(url)?;
let zinit_socket = self.zinit_socket_path
.unwrap_or_else(|| "/var/run/zinit.sock".to_string());
let zinit_client = ZinitClient::new(&zinit_socket);
let zinit_client = ZinitClient::unix_socket("/tmp/zinit.sock").await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client: {}", e)))?;
// Store builder data for later use in start_workers()
let builder_data = SupervisorBuilderData {
@@ -118,6 +288,7 @@ impl SupervisorBuilder {
v_worker: self.v_worker,
python_worker: self.python_worker,
worker_env_vars: self.worker_env_vars,
websocket_config: self.websocket_config,
};
let supervisor = Supervisor {
@@ -134,14 +305,33 @@ impl Supervisor {
/// Start all configured workers asynchronously.
/// This method should be called after build() to launch the workers.
///
/// Sequence: verify the Zinit connection (via a service listing), tear
/// down any leftover worker services, then launch every worker captured
/// in the builder data.
///
/// # Errors
/// Returns `ZinitError` when Zinit is unreachable, or propagates any
/// failure from the cleanup or launch steps.
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
    info!("Starting Hero Supervisor workers...");

    // Test Zinit connection first.
    // NOTE(review): the socket path in this log message is hardcoded;
    // confirm it matches the socket the client was actually built with.
    info!("Testing Zinit connection at /tmp/zinit.sock...");
    match self.zinit_client.list().await {
        Ok(services) => {
            info!("Successfully connected to Zinit. Current services: {:?}", services);
        }
        Err(e) => {
            error!("Failed to connect to Zinit: {:?}", e);
            return Err(SupervisorError::ZinitError(format!("Zinit connection failed: {}", e)));
        }
    }

    // Clean up any existing worker services first
    info!("Cleaning up existing worker services...");
    self.cleanup_existing_workers().await?;

    // Launch configured workers if builder data is available
    if let Some(builder_data) = &self.builder_data {
        info!("Launching configured workers...");
        self.launch_configured_workers(builder_data).await?;
    } else {
        warn!("No builder data available, no workers to start");
    }

    info!("All workers started successfully!");
    Ok(())
}
@@ -179,7 +369,11 @@ impl Supervisor {
for worker_name in worker_names {
// Try to stop and delete, but don't fail if they don't exist
let _ = self.stop_and_delete_worker(worker_name).await;
info!("Attempting to cleanup worker: {}", worker_name);
match self.stop_and_delete_worker(worker_name).await {
Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
}
}
info!("Existing worker cleanup completed");
@@ -188,18 +382,33 @@ impl Supervisor {
/// Stop and delete a worker service from zinit
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
info!("Starting cleanup for worker: {}", worker_name);
// First try to stop the worker
info!("Attempting to stop worker: {}", worker_name);
if let Err(e) = self.zinit_client.stop(worker_name).await {
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
} else {
info!("Successfully stopped worker: {}", worker_name);
}
// Then try to delete the service
if let Err(e) = self.zinit_client.delete(worker_name).await {
// Then forget the service to stop monitoring it
info!("Attempting to forget worker: {}", worker_name);
if let Err(e) = self.zinit_client.forget(worker_name).await {
info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
} else {
info!("Successfully forgot worker service: {}", worker_name);
}
// Finally, delete the service configuration
info!("Attempting to delete service for worker: {}", worker_name);
if let Err(e) = self.zinit_client.delete_service(worker_name).await {
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
} else {
info!("Successfully deleted worker service: {}", worker_name);
}
info!("Completed cleanup for worker: {}", worker_name);
Ok(())
}
@@ -211,6 +420,157 @@ impl Supervisor {
/// Create a new job builder bound to this supervisor.
pub fn new_job(&self) -> JobBuilder {
    JobBuilder::new(self)
}
/// Get WebSocket server configuration from TOML config.
///
/// # Errors
/// Returns `ConfigError` when the supervisor was built without builder
/// data, or the TOML had no `[websocket_server]` section.
pub fn get_websocket_config(&self) -> Result<WebSocketServerConfig, SupervisorError> {
    let builder_data = self.builder_data.as_ref().ok_or_else(|| {
        SupervisorError::ConfigError("No builder data available for WebSocket config".to_string())
    })?;

    builder_data.websocket_config.clone().ok_or_else(|| {
        SupervisorError::ConfigError("No WebSocket server configuration found in TOML config".to_string())
    })
}

/// Extract worker configurations from the supervisor's builder data.
///
/// Produces one `WorkerConfig` (named `<kind>_worker_1`) per configured
/// worker binary; each config receives a copy of the shared env-var map.
///
/// # Errors
/// Returns `ConfigError` when no builder data is available.
pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
    let builder_data = self.builder_data.as_ref().ok_or_else(|| {
        SupervisorError::ConfigError("No builder data available for worker configs".to_string())
    })?;

    let mut configs = Vec::new();
    let env_vars = builder_data.worker_env_vars.clone();

    if let Some(osis_path) = &builder_data.osis_worker {
        configs.push(
            WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
                .with_env(env_vars.clone())
        );
    }

    if let Some(sal_path) = &builder_data.sal_worker {
        configs.push(
            WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
                .with_env(env_vars.clone())
        );
    }

    if let Some(v_path) = &builder_data.v_worker {
        configs.push(
            WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
                .with_env(env_vars.clone())
        );
    }

    if let Some(python_path) = &builder_data.python_worker {
        configs.push(
            WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
                .with_env(env_vars.clone())
        );
    }

    Ok(configs)
}
/// Spawn a background lifecycle manager that continuously monitors and maintains worker health
/// Returns a JoinHandle that can be used to stop the lifecycle manager
///
/// Starts all workers once, then loops forever: every
/// `health_check_interval` it checks each worker and restarts unhealthy
/// ones. The spawned task only returns if the initial startup fails.
pub fn spawn_lifecycle_manager(
    self: Arc<Self>,
    worker_configs: Vec<WorkerConfig>,
    health_check_interval: Duration,
) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
    let supervisor = self;

    tokio::spawn(async move {
        info!("Starting background lifecycle manager with {} workers", worker_configs.len());
        info!("Health check interval: {:?}", health_check_interval);

        // Initial worker startup
        info!("Performing initial worker startup...");
        if let Err(e) = supervisor.start_workers().await {
            error!("Failed to start workers during initialization: {}", e);
            return Err(e);
        }

        // Start the monitoring loop. Skip missed ticks so a long stall
        // does not trigger a burst of back-to-back health checks.
        let mut interval = tokio::time::interval(health_check_interval);
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

        loop {
            interval.tick().await;

            info!("Running periodic worker health check...");

            // Check each worker's health and restart if needed.
            // Per-worker failures are logged but do not stop the loop.
            for worker_config in &worker_configs {
                if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
                    error!("Failed to check/restart worker {}: {}", worker_config.name, e);
                }
            }

            info!("Health check cycle completed");
        }
    })
}

/// Check a single worker's health and restart if needed.
///
/// A worker is healthy when Zinit reports state "running" with a
/// positive PID. Unhealthy workers are restarted, falling back to a
/// full stop/start cycle; a worker with no status at all is started
/// fresh. Ping-job failures are logged but never trigger a restart.
async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
    let worker_name = &worker_config.name;

    // Get worker status
    match self.zinit_client.status(worker_name).await {
        Ok(status) => {
            let is_healthy = status.state == "running" && status.pid > 0;

            if is_healthy {
                debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);

                // Optionally send a ping job for deeper health check
                if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
                    warn!("Ping job failed for worker {}: {}", worker_name, e);
                    // Note: We don't restart on ping failure as it might be temporary
                }
            } else {
                warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
                      worker_name, status.state, status.pid);

                // Attempt to restart the worker
                if let Err(e) = self.restart_worker(worker_name).await {
                    error!("Failed to restart unhealthy worker {}: {}", worker_name, e);

                    // If restart fails, try a full stop/start cycle
                    warn!("Attempting full stop/start cycle for worker: {}", worker_name);
                    if let Err(e) = self.stop_and_delete_worker(worker_name).await {
                        error!("Failed to stop worker {} during recovery: {}", worker_name, e);
                    }

                    if let Err(e) = self.start_worker(worker_config).await {
                        error!("Failed to start worker {} during recovery: {}", worker_name, e);
                        return Err(e);
                    }

                    info!("Successfully recovered worker: {}", worker_name);
                } else {
                    info!("Successfully restarted worker: {}", worker_name);
                }
            }
        }
        Err(e) => {
            warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);

            // Worker doesn't exist, try to start it
            info!("Attempting to start missing worker: {}", worker_name);
            if let Err(e) = self.start_worker(worker_config).await {
                error!("Failed to start missing worker {}: {}", worker_name, e);
                return Err(e);
            }
            info!("Successfully started missing worker: {}", worker_name);
        }
    }

    Ok(())
}
// Internal helper to submit script details and push to work queue
async fn create_job_using_connection(

View File

@@ -8,7 +8,7 @@ use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
use zinit_client::{Client as ZinitClient, Status};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};
@@ -16,7 +16,7 @@ use crate::{Supervisor, SupervisorError};
#[derive(Debug, Clone)]
pub struct WorkerInfo {
pub config: WorkerConfig,
pub status: Option<ServiceStatus>,
pub status: Option<Status>,
pub is_running: bool,
}
@@ -90,7 +90,7 @@ impl Supervisor {
for config in worker_configs {
let status = self.zinit_client.status(&config.name).await.ok();
let is_running = status.as_ref()
.map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
.map(|s| s.state == "running" && s.pid > 0)
.unwrap_or(false);
workers.push(WorkerInfo {
@@ -117,6 +117,10 @@ impl Supervisor {
self.zinit_client.create_service(&worker_config.name, service_config).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
// Monitor the service so Zinit starts managing it
self.zinit_client.monitor(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;
// Start the service
self.zinit_client.start(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
@@ -168,7 +172,7 @@ impl Supervisor {
&self,
worker_name: &str,
zinit_client: &ZinitClient,
) -> Result<ServiceStatus, SupervisorError> {
) -> Result<Status, SupervisorError> {
match zinit_client.status(worker_name).await {
Ok(status) => Ok(status),
Err(e) => {
@@ -183,7 +187,7 @@ impl Supervisor {
&self,
worker_configs: &[WorkerConfig],
zinit_client: &ZinitClient,
) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
) -> Result<HashMap<String, Status>, SupervisorError> {
let mut status_map = HashMap::new();
for worker in worker_configs {
@@ -200,19 +204,7 @@ impl Supervisor {
Ok(status_map)
}
/// Start multiple workers
pub async fn start_workers(
&self,
worker_configs: &[WorkerConfig],
) -> Result<(), SupervisorError> {
info!("Starting {} workers", worker_configs.len());
for worker in worker_configs {
self.start_worker(worker).await?;
}
Ok(())
}
/// Stop multiple workers
pub async fn stop_workers(
@@ -240,7 +232,7 @@ impl Supervisor {
for worker in worker_configs {
if worker.script_type == *script_type {
if let Ok(status) = zinit_client.status(&worker.name).await {
if status.state == ServiceState::Running {
if status.state == "running" {
running_count += 1;
}
}
@@ -277,26 +269,35 @@ impl Supervisor {
}
/// Create Zinit service configuration from worker config
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
let mut config = json!({
"exec": format!("{} {}",
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
use serde_json::{Map, Value};
let mut config = Map::new();
config.insert(
"exec".to_string(),
Value::String(format!("{} {}",
worker.binary_path.display(),
worker.args.join(" ")
),
"oneshot": !worker.restart_on_exit,
});
))
);
config.insert(
"oneshot".to_string(),
Value::Bool(!worker.restart_on_exit)
);
if let Some(health_check) = &worker.health_check {
config["test"] = json!(health_check);
config.insert("test".to_string(), Value::String(health_check.clone()));
}
if !worker.dependencies.is_empty() {
config["after"] = json!(worker.dependencies);
config.insert("after".to_string(), json!(worker.dependencies));
}
// Add environment variables if any
if !worker.env.is_empty() {
config["env"] = json!(worker.env);
config.insert("env".to_string(), json!(worker.env));
}
config
@@ -307,6 +308,8 @@ impl Supervisor {
use hero_job::ScriptType;
use std::path::PathBuf;
let mut errors = Vec::new();
// Launch OSIS worker if configured
if let Some(binary_path) = &builder.osis_worker {
let worker_id = "osis_worker_1";
@@ -318,7 +321,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching OSIS worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start OSIS worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch SAL worker if configured
@@ -332,7 +339,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching SAL worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start SAL worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch V worker if configured
@@ -346,7 +357,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching V worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start V worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch Python worker if configured
@@ -360,9 +375,21 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching Python worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start Python worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
Ok(())
// Return result based on whether any workers started successfully
if errors.is_empty() {
info!("All configured workers started successfully");
Ok(())
} else {
let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
warn!("{}", combined_error);
Err(SupervisorError::ZinitError(combined_error))
}
}
}