This commit is contained in:
Timur Gordon
2025-08-01 00:01:08 +02:00
parent 32c2cbe0cc
commit 8ed40ce99c
57 changed files with 2047 additions and 4113 deletions

core/supervisor/.gitignore vendored Normal file

@@ -0,0 +1 @@
/target

core/supervisor/Cargo.toml Normal file

@@ -0,0 +1,26 @@
[package]
name = "hero_supervisor"
version = "0.1.0"
edition = "2021"
[[bin]]
name = "supervisor"
path = "cmd/supervisor.rs"
[dependencies]
clap = { version = "4.4", features = ["derive"] }
env_logger = "0.10"
redis = { version = "0.25.0", features = ["tokio-comp"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
uuid = { version = "1.6", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
log = "0.4"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } # For async main in examples, and general async
colored = "2.0"
hero_job = { path = "../job" }
zinit-client = "0.4.0"
[dev-dependencies] # For examples later
env_logger = "0.10"
rhai = "1.18.0" # For examples that might need to show engine setup


@@ -0,0 +1,315 @@
# Worker Lifecycle Management
The Hero Supervisor manages the full worker lifecycle using [Zinit](https://github.com/threefoldtech/zinit) as the process manager: it starts and stops worker processes, monitors their health, and balances load across them.
## Overview
The lifecycle management system provides:
- **Worker Process Management**: Start, stop, restart, and monitor worker binaries
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle workers
- **Load Balancing**: Dynamic scaling of workers based on demand
- **Service Dependencies**: Proper startup ordering with dependency management
- **Graceful Shutdown**: Clean termination of worker processes
## Architecture
```
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
│ Supervisor │ │ WorkerLifecycle │ │ Zinit │
│ │◄──►│ Manager │◄──►│ (Process │
│ (Job Dispatch) │ │ │ │ Manager) │
└─────────────────┘ └──────────────────┘ └─────────────────┘
│ │ │
│ │ │
▼ ▼ ▼
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
│ Redis │ │ Health Monitor │ │ Worker Binaries │
│ (Job Queue) │ │ (Ping Jobs) │ │ (OSIS/SAL/V) │
└─────────────────┘ └──────────────────┘ └─────────────────┘
```
## Components
### WorkerConfig
Defines configuration for a worker binary:
```rust
use hero_supervisor::{WorkerConfig, ScriptType};
use std::path::PathBuf;
use std::collections::HashMap;
let config = WorkerConfig::new(
"osis_worker_0".to_string(),
PathBuf::from("/usr/local/bin/osis_worker"),
ScriptType::OSIS,
)
.with_args(vec![
"--redis-url".to_string(),
"redis://localhost:6379".to_string(),
"--worker-id".to_string(),
"osis_worker_0".to_string(),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "osis".to_string());
env
})
.with_health_check("/usr/local/bin/osis_worker --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
```
### WorkerLifecycleManager
Main component for managing worker lifecycles:
```rust
use hero_supervisor::{SupervisorBuilder, WorkerLifecycleManagerBuilder};
let supervisor = SupervisorBuilder::new()
.redis_url("redis://localhost:6379")
.caller_id("my_supervisor")
.context_id("production")
.build()?;
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new("/var/run/zinit.sock".to_string())
.with_supervisor(supervisor.clone())
.add_worker(osis_worker_config)
.add_worker(sal_worker_config)
.add_worker(v_worker_config)
.build();
```
## Supported Script Types
The lifecycle manager supports all Hero script types:
- **OSIS**: Rhai/HeroScript execution workers
- **SAL**: System Abstraction Layer workers
- **V**: HeroScript execution in V language
- **Python**: HeroScript execution in Python
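Each script type is dispatched to its own hardcoded worker queue. As a rough illustration (a sketch; the worker IDs are assumptions here, following the `hero:work_queue:{worker_id}` pattern from the protocol documentation):
```rust
use hero_job::ScriptType;

/// Sketch: map a script type to its worker queue key. The worker IDs used
/// here are assumptions for illustration, not the supervisor's actual names.
fn worker_queue_for(script_type: &ScriptType) -> String {
    let worker_id = match script_type {
        ScriptType::OSIS => "osis",
        ScriptType::SAL => "sal",
        ScriptType::V => "v",
        ScriptType::Python => "python",
    };
    format!("hero:work_queue:{}", worker_id)
}
```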
## Key Features
### 1. Worker Management
```rust
// Start all configured workers
lifecycle_manager.start_all_workers().await?;
// Stop all workers
lifecycle_manager.stop_all_workers().await?;
// Restart specific worker
lifecycle_manager.restart_worker("osis_worker_0").await?;
// Get worker status
let status = lifecycle_manager.get_worker_status("osis_worker_0").await?;
println!("Worker state: {:?}, PID: {}", status.state, status.pid);
```
### 2. Health Monitoring
The system automatically monitors worker health:
- Tracks last job execution time for each worker
- Sends ping jobs to workers idle for 10+ minutes
- Restarts workers that fail ping checks 3 times
- Updates job times when workers receive tasks
```rust
// Manual health check
lifecycle_manager.monitor_worker_health().await?;
// Update job time (called automatically by supervisor)
lifecycle_manager.update_worker_job_time("osis_worker_0");
// Start continuous health monitoring
lifecycle_manager.start_health_monitoring().await; // Runs forever
```
### 3. Dynamic Scaling
Scale workers up or down based on demand:
```rust
// Scale OSIS workers to 5 instances
lifecycle_manager.scale_workers(&ScriptType::OSIS, 5).await?;
// Scale down SAL workers to 1 instance
lifecycle_manager.scale_workers(&ScriptType::SAL, 1).await?;
// Check current running count
let count = lifecycle_manager.get_running_worker_count(&ScriptType::V).await;
println!("Running V workers: {}", count);
```
### 4. Service Dependencies
Workers can depend on other services:
```rust
let config = WorkerConfig::new(name, binary, script_type)
.with_dependencies(vec![
"redis".to_string(),
"database".to_string(),
"auth_service".to_string(),
]);
```
Zinit ensures dependencies start before the worker.
## Integration with Supervisor
The lifecycle manager integrates seamlessly with the supervisor:
```rust
use hero_supervisor::{SupervisorBuilder, WorkerLifecycleManagerBuilder};
// Create supervisor and lifecycle manager
let supervisor = SupervisorBuilder::new().build()?;
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new(zinit_socket)
.with_supervisor(supervisor.clone())
.build();
// Start workers
lifecycle_manager.start_all_workers().await?;
// Create and execute jobs (supervisor automatically routes to workers)
let job = supervisor
.new_job()
.script_type(ScriptType::OSIS)
.script_content("println!(\"Hello World!\");".to_string())
.build()?;
let result = supervisor.run_job_and_await_result(&job).await?;
println!("Job result: {}", result);
```
## Zinit Service Configuration
The lifecycle manager automatically creates Zinit service configurations:
```yaml
# Generated service config for osis_worker_0
exec: "/usr/local/bin/osis_worker --redis-url redis://localhost:6379 --worker-id osis_worker_0"
test: "/usr/local/bin/osis_worker --health-check"
oneshot: false # Restart on exit
after:
- redis
env:
RUST_LOG: "info"
WORKER_TYPE: "osis"
```
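A minimal sketch of how such a config could be rendered before registering the service with Zinit; the `render_zinit_service` helper is hypothetical and simply mirrors the YAML layout above:
```rust
use std::collections::HashMap;

/// Hypothetical helper: render a Zinit service definition as YAML text,
/// following the field layout shown above.
fn render_zinit_service(
    exec: &str,
    test: &str,
    after: &[String],
    env: &HashMap<String, String>,
) -> String {
    let mut yaml = format!("exec: \"{}\"\ntest: \"{}\"\noneshot: false\n", exec, test);
    yaml.push_str("after:\n");
    for dep in after {
        yaml.push_str(&format!("  - {}\n", dep));
    }
    yaml.push_str("env:\n");
    for (key, value) in env {
        yaml.push_str(&format!("  {}: \"{}\"\n", key, value));
    }
    yaml
}
```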
## Error Handling
The system provides comprehensive error handling:
```rust
use hero_supervisor::SupervisorError;
match lifecycle_manager.start_worker(&config).await {
Ok(_) => println!("Worker started successfully"),
Err(SupervisorError::WorkerStartFailed(worker, reason)) => {
eprintln!("Failed to start {}: {}", worker, reason);
}
Err(e) => eprintln!("Other error: {}", e),
}
```
## Example Usage
See `examples/lifecycle_demo.rs` for a comprehensive demonstration:
```bash
# Run the lifecycle demo
cargo run --example lifecycle_demo
# Run with custom Redis URL
REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
```
## Prerequisites
1. **Zinit**: Install and run Zinit process manager
```bash
curl https://raw.githubusercontent.com/threefoldtech/zinit/refs/heads/master/install.sh | bash
zinit init --config /etc/zinit/ --socket /var/run/zinit.sock
```
2. **Redis**: Running Redis instance for job queues
```bash
redis-server
```
3. **Worker Binaries**: Compiled worker binaries for each script type
- `/usr/local/bin/osis_worker`
- `/usr/local/bin/sal_worker`
- `/usr/local/bin/v_worker`
- `/usr/local/bin/python_worker`
## Configuration Best Practices
1. **Resource Limits**: Configure appropriate resource limits in Zinit
2. **Health Checks**: Implement meaningful health check commands
3. **Dependencies**: Define proper service dependencies
4. **Environment**: Set appropriate environment variables
5. **Logging**: Configure structured logging for debugging
6. **Monitoring**: Use health monitoring for production deployments
## Troubleshooting
### Common Issues
1. **Zinit Connection Failed**
- Ensure Zinit is running: `ps aux | grep zinit`
- Check socket permissions: `ls -la /var/run/zinit.sock`
- Verify socket path in configuration
2. **Worker Start Failed**
- Check binary exists and is executable
- Verify dependencies are running
- Review Zinit logs: `zinit logs <service-name>`
3. **Health Check Failures**
- Implement proper health check endpoint in workers
- Verify health check command syntax
- Check worker responsiveness
4. **Redis Connection Issues**
- Ensure Redis is running and accessible
- Verify Redis URL configuration
- Check network connectivity
### Debug Commands
```bash
# Check Zinit status
zinit list
# View service logs
zinit logs osis_worker_0
# Check service status
zinit status osis_worker_0
# Monitor Redis queues
redis-cli keys "hero:job:*"
```
## Performance Considerations
- **Scaling**: Start with minimal workers and scale based on queue depth
- **Health Monitoring**: Adjust ping intervals based on workload patterns
- **Resource Usage**: Monitor CPU/memory usage of worker processes
- **Queue Depth**: Monitor Redis queue lengths for scaling decisions
## Security
- **Process Isolation**: Zinit provides process isolation
- **User Permissions**: Run workers with appropriate user permissions
- **Network Security**: Secure Redis and Zinit socket access
- **Binary Validation**: Verify worker binary integrity before deployment

core/supervisor/README.md Normal file

@@ -0,0 +1,103 @@
# Hero Supervisor
The **Hero Supervisor** is responsible for supervising the lifecycle of workers and dispatching jobs to them via Redis queues.
## Overview
The Supervisor manages four types of workers:
1. **OSIS**: A worker that executes Rhai and HeroScript.
2. **SAL**: A worker that performs system abstraction layer functionalities using Rhai.
3. **V**: A worker that executes HeroScript in the V programming language.
4. **Python**: A worker that executes HeroScript in Python.
The Supervisor utilizes **zinit** to start and monitor these workers, ensuring they are running correctly.
### Key Features
- **Worker Lifecycle Supervision**: Oversee the lifecycle of workers, including starting, stopping, restarting, and load balancing based on job demand.
- **Job Supervision**: API for efficiently managing jobs dispatched to workers over Redis queues.
## Worker Lifecycle Supervision
The Supervisor oversees the lifecycle of the workers, ensuring they are operational and efficiently allocated. Load balancing is implemented to dynamically adjust the number of active workers based on job demand.
Additionally, the Supervisor implements health monitoring for workers: if a worker does not receive a job within 10 minutes, the Supervisor sends it a ping job. The worker must respond promptly; if it fails to do so, the Supervisor restarts it.
### Prerequisites
**Important**: Before running any lifecycle examples or using worker management features, you must start the Zinit daemon:
```bash
# Start Zinit daemon (required for worker lifecycle management)
sudo zinit init
# Or start Zinit with a custom socket path
sudo zinit --socket /var/run/zinit.sock init
```
**Note**: The Supervisor uses Zinit as the process manager for worker lifecycle operations. The default socket path is `/var/run/zinit.sock`, but you can configure a custom path using the `SupervisorBuilder::zinit_socket_path()` method.
**Troubleshooting**: If you get connection errors when running examples, ensure:
1. Zinit daemon is running (`zinit list` should work)
2. The socket path matches between Zinit and your Supervisor configuration
3. You have appropriate permissions to access the Zinit socket
### Supervisor API for Worker Lifecycle
The Supervisor provides the following methods for supervising the worker lifecycle:
- **`start_worker()`**: Initializes and starts a specified worker.
- **`stop_worker()`**: Gracefully stops a specified worker.
- **`restart_worker()`**: Restarts a specified worker to ensure it operates correctly.
- **`get_worker_status()`**: Checks the status of a specific worker.
## Job Supervision
Jobs are dispatched to workers through their designated Redis queues, and the Supervisor provides an API for comprehensive job supervision.
### Supervisor API for Job Supervision
The Supervisor offers the following methods for handling jobs:
- **`new_job()`**: Creates a new `JobBuilder` for configuring a job.
- **`create_job()`**: Stores a job in Redis.
- **`run_job_and_await_result()`**: Executes a job and waits for its completion.
- **`get_job_status()`**: Checks the current execution status of a job.
- **`get_job_output()`**: Retrieves the results of a completed job.
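Put together, a typical round trip looks roughly like this (a sketch; the connection parameters and context ID are placeholders):
```rust
use hero_supervisor::{SupervisorBuilder, ScriptType};
use std::time::Duration;

async fn run_one_job() -> Result<(), Box<dyn std::error::Error>> {
    let supervisor = SupervisorBuilder::new()
        .redis_url("redis://localhost:6379")
        .build()?;

    // Build a job, dispatch it to the OSIS worker queue, and wait
    // (up to the timeout) for the result on the job's reply queue.
    let result = supervisor
        .new_job()
        .context_id("demo_context")
        .script_type(ScriptType::OSIS)
        .script(r#"print("hello");"#)
        .timeout(Duration::from_secs(30))
        .await_response()
        .await?;
    println!("Job result: {}", result);
    Ok(())
}
```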
## Running Examples
The supervisor includes several examples demonstrating lifecycle management:
```bash
# 1. First, start the Zinit daemon
sudo zinit init
# 2. In another terminal, start Redis (if not already running)
redis-server
# 3. Run the lifecycle demo
cargo run --example simple_lifecycle_demo
# Or run the comprehensive lifecycle demo
cargo run --example lifecycle_demo
```
**Example Configuration**: The examples use these default paths:
- Redis: `redis://localhost:6379`
- Zinit socket: `/var/run/zinit.sock`
You can modify these in the example source code if your setup differs.
### Redis Schema for Job Supervision
Jobs are managed within the `hero:` namespace in Redis:
- **`hero:job:{job_id}`**: Stores job parameters as a Redis hash.
- **`hero:work_queue:{worker_id}`**: Contains worker-specific job queues for dispatching jobs.
- **`hero:reply:{job_id}`**: Dedicated queues for job results.
## Prerequisites
- A Redis server must be accessible to both the Supervisor and the workers.


@@ -0,0 +1,157 @@
# Rhai Client Binary
A command-line client for executing Rhai scripts on remote workers via Redis.
## Binary: `client`
### Installation
Build the binary:
```bash
cargo build --bin client --release
```
### Usage
```bash
# Basic usage - requires caller and circle keys
client --caller-key <CALLER_KEY> --circle-key <CIRCLE_KEY>
# Execute inline script
client -c <CALLER_KEY> -k <CIRCLE_KEY> --script "print('Hello World!')"
# Execute script from file
client -c <CALLER_KEY> -k <CIRCLE_KEY> --file script.rhai
# Use specific worker (defaults to circle key)
client -c <CALLER_KEY> -k <CIRCLE_KEY> -w <WORKER_KEY> --script "2 + 2"
# Custom Redis and timeout
client -c <CALLER_KEY> -k <CIRCLE_KEY> --redis-url redis://localhost:6379/1 --timeout 60
# Remove timestamps from logs
client -c <CALLER_KEY> -k <CIRCLE_KEY> --no-timestamp
# Increase verbosity
client -c <CALLER_KEY> -k <CIRCLE_KEY> -v --script "debug_info()"
```
### Command-Line Options
| Option | Short | Default | Description |
|--------|-------|---------|-------------|
| `--caller-key` | `-c` | **Required** | Caller public key (your identity) |
| `--circle-key` | `-k` | **Required** | Circle public key (execution context) |
| `--worker-key` | `-w` | `circle-key` | Worker public key (target worker) |
| `--redis-url` | `-r` | `redis://localhost:6379` | Redis connection URL |
| `--script` | `-s` | | Rhai script to execute |
| `--file` | `-f` | | Path to Rhai script file |
| `--timeout` | `-t` | `30` | Timeout for script execution (seconds) |
| `--no-timestamp` | | `false` | Remove timestamps from log output |
| `--verbose` | `-v` | | Increase verbosity (stackable) |
### Execution Modes
#### Inline Script Execution
```bash
# Execute a simple calculation
client -c caller_123 -k circle_456 -s "let result = 2 + 2; print(result);"
# Execute with specific worker
client -c caller_123 -k circle_456 -w worker_789 -s "get_user_data()"
```
#### Script File Execution
```bash
# Execute script from file
client -c caller_123 -k circle_456 -f examples/data_processing.rhai
# Execute with custom timeout
client -c caller_123 -k circle_456 -f long_running_script.rhai -t 120
```
#### Interactive Mode
```bash
# Enter interactive REPL mode (when no script or file provided)
client -c caller_123 -k circle_456
# Interactive mode with verbose logging
client -c caller_123 -k circle_456 -v --no-timestamp
```
### Interactive Mode
When no script (`-s`) or file (`-f`) is provided, the client enters interactive mode:
```
🔗 Starting Rhai Client
📋 Configuration:
Caller Key: caller_123
Circle Key: circle_456
Worker Key: circle_456
Redis URL: redis://localhost:6379
Timeout: 30s
✅ Connected to Redis at redis://localhost:6379
🎮 Entering interactive mode
Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.
rhai> let x = 42; print(x);
Status: completed
Output: 42
rhai> exit
👋 Goodbye!
```
### Configuration Examples
#### Development Usage
```bash
# Simple development client
client -c dev_user -k dev_circle
# Development with clean logs
client -c dev_user -k dev_circle --no-timestamp -v
```
#### Production Usage
```bash
# Production client with specific worker
client \
--caller-key prod_user_123 \
--circle-key prod_circle_456 \
--worker-key prod_worker_789 \
--redis-url redis://redis-cluster:6379/0 \
--timeout 300 \
--file production_script.rhai
```
#### Batch Processing
```bash
# Process multiple scripts
for script in scripts/*.rhai; do
client -c batch_user -k batch_circle -f "$script" --no-timestamp
done
```
### Key Concepts
- **Caller Key**: Your identity - used for authentication and tracking
- **Circle Key**: Execution context - defines the environment/permissions
- **Worker Key**: Target worker - which worker should execute the script (defaults to circle key)
### Error Handling
The client provides clear error messages for:
- Missing required keys
- Redis connection failures
- Script execution timeouts
- Worker unavailability
- Script syntax errors
### Dependencies
- `rhai_supervisor`: Core client library for Redis-based script execution
- `redis`: Redis client for task queue communication
- `clap`: Command-line argument parsing
- `env_logger`: Logging infrastructure
- `tokio`: Async runtime


@@ -0,0 +1,236 @@
use clap::Parser;
use hero_supervisor::{Supervisor, SupervisorBuilder, ScriptType};
use log::{error, info};
use colored::Colorize;
use std::io::{self, Write};
use std::time::Duration;
#[derive(Parser, Debug)]
#[command(author, version, about = "Rhai Client - Script execution client", long_about = None)]
struct Args {
/// Caller ID (your identity)
#[arg(short = 'c', long = "caller-id", help = "Caller ID (your identity)")]
caller_id: String,
/// Context ID (execution context)
#[arg(short = 'k', long = "context-id", help = "Context ID (execution context)")]
context_id: String,
/// Script type to execute (osis, sal, v, python)
#[arg(short = 'T', long = "script-type", default_value = "osis", help = "Script type: osis, sal, v, or python")]
script_type: String,
/// Redis URL
#[arg(short, long, default_value = "redis://localhost:6379", help = "Redis connection URL")]
redis_url: String,
/// Rhai script to execute
#[arg(short, long, help = "Rhai script to execute")]
script: Option<String>,
/// Path to Rhai script file
#[arg(short, long, help = "Path to Rhai script file")]
file: Option<String>,
/// Timeout for script execution (in seconds)
#[arg(short, long, default_value = "30", help = "Timeout for script execution in seconds")]
timeout: u64,
/// Increase verbosity (can be used multiple times)
#[arg(short, long, action = clap::ArgAction::Count, help = "Increase verbosity (-v for debug, -vv for trace)")]
verbose: u8,
/// Disable timestamps in log output
#[arg(long, help = "Remove timestamps from log output")]
no_timestamp: bool,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
// Configure logging based on verbosity level
let log_config = match args.verbose {
0 => "warn,hero_supervisor=warn",
1 => "info,hero_supervisor=info",
2 => "debug,hero_supervisor=debug",
_ => "trace,hero_supervisor=trace",
};
std::env::set_var("RUST_LOG", log_config);
// Configure env_logger with or without timestamps
if args.no_timestamp {
env_logger::Builder::from_default_env()
.format_timestamp(None)
.init();
} else {
env_logger::init();
}
// Validate script type
match args.script_type.to_lowercase().as_str() {
"osis" | "sal" | "v" | "python" => {
// Valid script types - no worker validation needed since we use hardcoded queues
}
_ => {
error!("❌ Invalid script type: {}. Valid types: osis, sal, v, python", args.script_type);
return Err(format!("Invalid script type: {}", args.script_type).into());
}
}
if args.verbose > 0 {
info!("🔗 Starting Hero Supervisor");
info!("📋 Configuration:");
info!(" Caller ID: {}", args.caller_id);
info!(" Context ID: {}", args.context_id);
info!(" Script Type: {}", args.script_type);
info!(" Redis URL: {}", args.redis_url);
info!(" Timeout: {}s", args.timeout);
info!(" Using hardcoded worker queues for script type: {}", args.script_type);
info!("");
}
// Create the supervisor client
let client = SupervisorBuilder::new()
.redis_url(&args.redis_url)
.build()?;
if args.verbose > 0 {
info!("✅ Connected to Redis at {}", args.redis_url);
}
// Determine execution mode
if let Some(script_content) = args.script {
// Execute inline script
if args.verbose > 0 {
info!("📜 Executing inline script");
}
execute_script(&client, script_content, &args.script_type, args.timeout).await?;
} else if let Some(file_path) = args.file {
// Execute script from file
if args.verbose > 0 {
info!("📁 Loading script from file: {}", file_path);
}
let script_content = std::fs::read_to_string(&file_path)
.map_err(|e| format!("Failed to read script file '{}': {}", file_path, e))?;
execute_script(&client, script_content, &args.script_type, args.timeout).await?;
} else {
// Interactive mode
info!("🎮 Entering interactive mode");
info!("Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.");
run_interactive_mode(&client, &args.script_type, args.timeout, args.verbose).await?;
}
Ok(())
}
async fn execute_script(
client: &Supervisor,
script: String,
script_type_str: &str,
timeout_secs: u64,
) -> Result<(), Box<dyn std::error::Error>> {
info!("⚡ Executing script: {:.50}...", script);
// Parse script type
let script_type = match script_type_str.to_lowercase().as_str() {
"osis" => ScriptType::OSIS,
"sal" => ScriptType::SAL,
"v" => ScriptType::V,
"python" => ScriptType::Python,
_ => {
error!("❌ Invalid script type: {}. Valid types: osis, sal, v, python", script_type_str);
return Err(format!("Invalid script type: {}", script_type_str).into());
}
};
let timeout = Duration::from_secs(timeout_secs);
match client
.new_job()
.script_type(script_type)
.script(&script)
.timeout(timeout)
.await_response()
.await
{
Ok(result) => {
info!("✅ Script execution completed");
println!("{}", "Result:".green().bold());
println!("{}", result);
}
Err(e) => {
error!("❌ Script execution failed: {}", e);
return Err(Box::new(e));
}
}
Ok(())
}
async fn run_interactive_mode(
client: &Supervisor,
script_type_str: &str,
timeout_secs: u64,
verbose: u8,
) -> Result<(), Box<dyn std::error::Error>> {
// Parse script type
let script_type = match script_type_str.to_lowercase().as_str() {
"osis" => ScriptType::OSIS,
"sal" => ScriptType::SAL,
"v" => ScriptType::V,
"python" => ScriptType::Python,
_ => {
error!("❌ Invalid script type: {}. Valid types: osis, sal, v, python", script_type_str);
return Err(format!("Invalid script type: {}", script_type_str).into());
}
};
let timeout = Duration::from_secs(timeout_secs);
loop {
print!("rhai> ");
io::stdout().flush()?;
let mut input = String::new();
io::stdin().read_line(&mut input)?;
let input = input.trim();
if input.is_empty() {
continue;
}
if input == "exit" || input == "quit" {
info!("👋 Goodbye!");
break;
}
if verbose > 0 {
info!("⚡ Executing: {}", input);
}
match client
.new_job()
.script_type(script_type.clone())
.script(input)
.timeout(timeout)
.await_response()
.await
{
Ok(result) => {
println!("{}", result.green());
}
Err(e) => {
println!("{}", format!("error: {}", e).red());
}
}
println!(); // Add blank line for readability
}
Ok(())
}


@@ -0,0 +1,190 @@
# Architecture of the `rhai_supervisor` Crate
The `rhai_supervisor` crate provides a Redis-based client library for submitting Rhai scripts to distributed worker services and awaiting their execution results. It implements a request-reply pattern using Redis as the message broker.
## Core Architecture
The client follows a builder pattern design with clear separation of concerns:
```mermaid
graph TD
A[RhaiSupervisorBuilder] --> B[RhaiSupervisor]
B --> C[PlayRequestBuilder]
C --> D[PlayRequest]
D --> E[Redis Task Queue]
E --> F[Worker Service]
F --> G[Redis Reply Queue]
G --> H[Client Response]
subgraph "Client Components"
A
B
C
D
end
subgraph "Redis Infrastructure"
E
G
end
subgraph "External Services"
F
end
```
## Key Components
### 1. RhaiSupervisorBuilder
A builder pattern implementation for constructing `RhaiSupervisor` instances with proper configuration validation.
**Responsibilities:**
- Configure Redis connection URL
- Set caller ID for task attribution
- Validate configuration before building client
**Key Methods:**
- `caller_id(id: &str)` - Sets the caller identifier
- `redis_url(url: &str)` - Configures Redis connection
- `build()` - Creates the final `RhaiSupervisor` instance
### 2. RhaiSupervisor
The main client interface that manages Redis connections and provides factory methods for creating play requests.
**Responsibilities:**
- Maintain Redis connection pool
- Provide factory methods for request builders
- Handle low-level Redis operations
- Manage task status queries
**Key Methods:**
- `new_play_request()` - Creates a new `PlayRequestBuilder`
- `get_task_status(task_id)` - Queries task status from Redis
- Internal methods for Redis operations
### 3. PlayRequestBuilder
A fluent builder for constructing and submitting script execution requests.
**Responsibilities:**
- Configure script execution parameters
- Handle script loading from files or strings
- Manage request timeouts
- Provide submission methods (fire-and-forget vs await-response)
**Key Methods:**
- `worker_id(id: &str)` - Target worker queue (determines which worker processes the task)
- `context_id(id: &str)` - Target context ID (determines execution context/circle)
- `script(content: &str)` - Set script content directly
- `script_path(path: &str)` - Load script from file
- `timeout(duration: Duration)` - Set execution timeout
- `submit()` - Fire-and-forget submission
- `await_response()` - Submit and wait for result
**Architecture Note:** The decoupling of `worker_id` and `context_id` allows a single worker to process tasks for multiple contexts (circles), providing greater deployment flexibility.
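For example, the same worker queue can serve two different circles (a sketch using the builder methods listed above; identifiers are placeholders):
```rust
use std::time::Duration;
use rhai_supervisor::RhaiSupervisor;

/// Sketch: both requests land on worker_1's queue, but each executes in
/// its own context (circle).
async fn dispatch_two_contexts(client: &RhaiSupervisor) {
    for circle in ["circle_a", "circle_b"] {
        let result = client
            .new_play_request()
            .worker_id("worker_1")
            .context_id(circle)
            .script(r#"print("hello");"#)
            .timeout(Duration::from_secs(10))
            .await_response()
            .await;
        println!("{}: {:?}", circle, result);
    }
}
```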
### 4. Data Structures
#### RhaiTaskDetails
Represents the complete state of a task throughout its lifecycle.
```rust
pub struct RhaiTaskDetails {
pub task_id: String,
pub script: String,
pub status: String, // "pending", "processing", "completed", "error"
pub output: Option<String>,
pub error: Option<String>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
pub caller_id: String,
}
```
#### RhaiSupervisorError
Comprehensive error handling for various failure scenarios:
- `RedisError` - Redis connection/operation failures
- `SerializationError` - JSON serialization/deserialization issues
- `Timeout` - Task execution timeouts
- `TaskNotFound` - Missing tasks after submission
## Communication Protocol
### Task Submission Flow
1. **Task Creation**: Client generates unique UUID for task identification
2. **Task Storage**: Task details stored in Redis hash: `rhailib:<task_id>`
3. **Queue Submission**: Task ID pushed to worker queue: `rhailib:<worker_id>`
4. **Reply Queue Setup**: Client listens on: `rhailib:reply:<task_id>`
### Redis Key Patterns
- **Task Storage**: `rhailib:<task_id>` (Redis Hash)
- **Worker Queues**: `rhailib:<worker_id>` (Redis List)
- **Reply Queues**: `rhailib:reply:<task_id>` (Redis List)
### Message Flow Diagram
```mermaid
sequenceDiagram
participant C as Client
participant R as Redis
participant W as Worker
C->>R: HSET rhailib:task_id (task details)
C->>R: LPUSH rhailib:worker_id task_id
C->>R: BLPOP rhailib:reply:task_id (blocking)
W->>R: BRPOP rhailib:worker_id (blocking)
W->>W: Execute Rhai Script
W->>R: LPUSH rhailib:reply:task_id (result)
R->>C: Return result from BLPOP
C->>R: DEL rhailib:reply:task_id (cleanup)
```
## Concurrency and Async Design
The client is built on `tokio` for asynchronous operations:
- **Connection Pooling**: Uses Redis multiplexed connections for efficiency
- **Non-blocking Operations**: All Redis operations are async
- **Timeout Handling**: Configurable timeouts with proper cleanup
- **Error Propagation**: Comprehensive error handling with context
## Configuration and Deployment
### Prerequisites
- Redis server accessible to both client and workers
- Proper network connectivity between components
- Sufficient Redis memory for task storage
### Configuration Options
- **Redis URL**: Connection string for Redis instance
- **Caller ID**: Unique identifier for client instance
- **Timeouts**: Per-request timeout configuration
- **Worker Targeting**: Direct worker queue addressing
## Security Considerations
- **Task Isolation**: Each task uses unique identifiers
- **Queue Separation**: Worker-specific queues prevent cross-contamination
- **Cleanup**: Automatic cleanup of reply queues after completion
- **Error Handling**: Secure error propagation without sensitive data leakage
## Performance Characteristics
- **Scalability**: Horizontal scaling through multiple worker instances
- **Throughput**: Limited by Redis performance and network latency
- **Memory Usage**: Efficient with connection pooling and cleanup
- **Latency**: Low latency for local Redis deployments
## Integration Points
The client integrates with:
- **Worker Services**: Via Redis queue protocol
- **Monitoring Systems**: Through structured logging
- **Application Code**: Via builder pattern API
- **Configuration Systems**: Through environment variables and builders


@@ -0,0 +1,272 @@
# Hero Supervisor Protocol
This document describes the Redis-based protocol used by the Hero Supervisor for job management and worker communication.
## Overview
The Hero Supervisor uses Redis as a message broker and data store for managing distributed job execution. Jobs are stored as Redis hashes, and communication with workers happens through Redis lists (queues).
## Redis Namespace
All supervisor-related keys use the `hero:` namespace prefix to avoid conflicts with other Redis usage.
## Data Structures
### Job Storage
Jobs are stored as Redis hashes with the following key pattern:
```
hero:job:{job_id}
```
**Job Hash Fields:**
- `id`: Unique job identifier (UUID v4)
- `caller_id`: Identifier of the client that created the job
- `worker_id`: Target worker identifier
- `context_id`: Execution context identifier
- `script`: Script content to execute (Rhai or HeroScript)
- `timeout`: Execution timeout in seconds
- `retries`: Number of retry attempts
- `concurrent`: Whether to execute in separate thread (true/false)
- `log_path`: Optional path to log file for job output
- `created_at`: Job creation timestamp (ISO 8601)
- `updated_at`: Job last update timestamp (ISO 8601)
- `status`: Current job status (dispatched/started/error/finished)
- `env_vars`: Environment variables as JSON object (optional)
- `prerequisites`: JSON array of job IDs that must complete before this job (optional)
- `dependents`: JSON array of job IDs that depend on this job completing (optional)
- `output`: Job execution result (set by worker)
- `error`: Error message if job failed (set by worker)
### Job Dependencies
Jobs can depend on other jobs; these dependencies are stored in the `prerequisites` field. A job is not dispatched until all of its prerequisites have completed successfully, and the inverse relationship is tracked in the `dependents` field.
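On the client side, these relationships are declared through the job builder (a sketch based on `JobBuilder` in this commit; identifiers are placeholders):
```rust
use hero_supervisor::{ScriptType, Supervisor, SupervisorError};

/// Sketch: create a job that is only dispatched once `prepare_job_id`
/// has completed successfully.
async fn create_dependent_job(
    supervisor: &Supervisor,
    prepare_job_id: &str,
) -> Result<(), SupervisorError> {
    let job = supervisor
        .new_job()
        .context_id("demo_context")
        .script_type(ScriptType::OSIS)
        .script(r#"print("runs after prepare");"#)
        .prerequisite(prepare_job_id)
        .build()?;
    supervisor.create_job(&job).await?;
    Ok(())
}
```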
### Work Queues
Jobs are queued for execution using Redis lists:
```
hero:work_queue:{worker_id}
```
Workers listen on their specific queue using `BLPOP` for job IDs to process.
### Stop Queues
Job stop requests are sent through dedicated stop queues:
```
hero:stop_queue:{worker_id}
```
Workers monitor these queues to receive stop requests for running jobs.
### Reply Queues
For synchronous job execution, dedicated reply queues are used:
```
hero:reply:{job_id}
```
Workers send results to these queues when jobs complete.
## Job Lifecycle
### 1. Job Creation
```
Client -> Redis: HSET hero:job:{job_id} {job_fields}
```
### 2. Job Submission
```
Client -> Redis: LPUSH hero:work_queue:{worker_id} {job_id}
```
### 3. Job Processing
```
Worker -> Redis: BLPOP hero:work_queue:{worker_id}
Worker -> Redis: HSET hero:job:{job_id} status "started"
Worker: Execute script
Worker -> Redis: HSET hero:job:{job_id} status "finished" output "{result}"
```
### 4. Job Completion (Async)
```
Worker -> Redis: LPUSH hero:reply:{job_id} {result}
```
## API Operations
### List Jobs
```rust
supervisor.list_jobs() -> Vec<String>
```
**Redis Operations:**
- `KEYS hero:job:*` - Get all job keys
- Extract job IDs from key names
### Stop Job
```rust
supervisor.stop_job(job_id) -> Result<(), SupervisorError>
```
**Redis Operations:**
- `LPUSH hero:stop_queue:{worker_id} {job_id}` - Send stop request
### Get Job Status
```rust
supervisor.get_job_status(job_id) -> Result<JobStatus, SupervisorError>
```
**Redis Operations:**
- `HGETALL hero:job:{job_id}` - Get job data
- Parse `status` field
### Get Job Logs
```rust
supervisor.get_job_logs(job_id) -> Result<Option<String>, SupervisorError>
```
**Redis Operations:**
- `HGETALL hero:job:{job_id}` - Get job data
- Read `log_path` field
- Read log file from filesystem
### Run Job and Await Result
```rust
supervisor.run_job_and_await_result(job, worker_id) -> Result<String, SupervisorError>
```
**Redis Operations:**
1. `HSET hero:job:{job_id} {job_fields}` - Store job
2. `LPUSH hero:work_queue:{worker_id} {job_id}` - Submit job
3. `BLPOP hero:reply:{job_id} {timeout}` - Wait for result
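Expressed directly against Redis with the `redis` crate, the same sequence looks roughly like this (a simplified sketch of the protocol, not the supervisor's actual implementation):
```rust
use redis::AsyncCommands;
use std::time::Duration;

/// Sketch: store a job hash, queue its ID for a worker, then block on the
/// reply queue until a result arrives or the timeout expires.
async fn run_and_await(
    conn: &mut redis::aio::MultiplexedConnection,
    job_id: &str,
    worker_id: &str,
    script: &str,
    timeout: Duration,
) -> redis::RedisResult<Option<String>> {
    let job_key = format!("hero:job:{}", job_id);
    let _: () = conn
        .hset_multiple(&job_key, &[("id", job_id), ("script", script), ("status", "dispatched")])
        .await?;
    let _: () = conn
        .lpush(format!("hero:work_queue:{}", worker_id), job_id)
        .await?;
    // BLPOP returns (queue_name, value), or nil on timeout.
    let reply: Option<(String, String)> = conn
        .blpop(format!("hero:reply:{}", job_id), timeout.as_secs() as f64)
        .await?;
    Ok(reply.map(|(_, result)| result))
}
```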
## Worker Protocol
### Job Processing Loop
```rust
loop {
// 1. Wait for job
job_id = BLPOP hero:work_queue:{worker_id}
// 2. Get job details
job_data = HGETALL hero:job:{job_id}
// 3. Update status
HSET hero:job:{job_id} status "started"
// 4. Check for stop requests
if LLEN hero:stop_queue:{worker_id} > 0 {
stop_job_id = LPOP hero:stop_queue:{worker_id}
if stop_job_id == job_id {
HSET hero:job:{job_id} status "error" error "stopped"
continue
}
}
// 5. Execute script
result = execute_script(job_data.script)
// 6. Update job with result
HSET hero:job:{job_id} status "finished" output result
// 7. Send reply if needed
if reply_queue_exists(hero:reply:{job_id}) {
LPUSH hero:reply:{job_id} result
}
}
```
### Stop Request Handling
Workers should periodically check the stop queue during long-running jobs:
```rust
if LLEN hero:stop_queue:{worker_id} > 0 {
stop_requests = LRANGE hero:stop_queue:{worker_id} 0 -1
if stop_requests.contains(current_job_id) {
// Stop current job execution
HSET hero:job:{current_job_id} status "error" error "stopped_by_request"
// Remove stop request
LREM hero:stop_queue:{worker_id} 1 current_job_id
return
}
}
```
## Error Handling
### Job Timeouts
- Client sets timeout when creating job
- Worker should respect timeout and stop execution
- If timeout exceeded: `HSET hero:job:{job_id} status "error" error "timeout"`
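On the worker side, the timeout can be enforced by wrapping script execution (a sketch using `tokio`; `execute_script` is a hypothetical stand-in for the worker's real engine call):
```rust
use redis::AsyncCommands;
use std::time::Duration;

/// Sketch: run the script under the job's timeout and record the outcome
/// in the job hash, matching the status conventions above.
async fn run_with_timeout(
    conn: &mut redis::aio::MultiplexedConnection,
    job_id: &str,
    script: &str,
    timeout: Duration,
) -> redis::RedisResult<()> {
    let job_key = format!("hero:job:{}", job_id);
    match tokio::time::timeout(timeout, execute_script(script)).await {
        Ok(output) => {
            let _: () = conn
                .hset_multiple(&job_key, &[("status", "finished"), ("output", output.as_str())])
                .await?;
        }
        Err(_elapsed) => {
            let _: () = conn
                .hset_multiple(&job_key, &[("status", "error"), ("error", "timeout")])
                .await?;
        }
    }
    Ok(())
}

async fn execute_script(script: &str) -> String {
    // Hypothetical placeholder for the real script engine.
    format!("executed: {}", script)
}
```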
### Worker Failures
- If worker crashes, job remains in "started" status
- Monitoring systems can detect stale jobs and retry
- Jobs can be requeued: `LPUSH hero:work_queue:{worker_id} {job_id}`
### Redis Connection Issues
- Clients should implement retry logic with exponential backoff
- Workers should reconnect and resume processing
- Use Redis persistence to survive Redis restarts
## Monitoring and Observability
### Queue Monitoring
```bash
# Check work queue length
LLEN hero:work_queue:{worker_id}
# Check stop queue length
LLEN hero:stop_queue:{worker_id}
# List all jobs
KEYS hero:job:*
# Get job details
HGETALL hero:job:{job_id}
```
### Metrics to Track
- Jobs created per second
- Jobs completed per second
- Average job execution time
- Queue depths
- Worker availability
- Error rates by job type
## Security Considerations
### Redis Security
- Use Redis AUTH for authentication
- Enable TLS for Redis connections
- Restrict Redis network access
- Use Redis ACLs to limit worker permissions
### Job Security
- Validate script content before execution
- Sandbox script execution environment
- Limit resource usage (CPU, memory, disk)
- Log all job executions for audit
### Log File Security
- Ensure log paths are within allowed directories
- Validate log file permissions
- Rotate and archive logs regularly
- Sanitize sensitive data in logs
## Performance Considerations
### Redis Optimization
- Use Redis pipelining for batch operations
- Configure appropriate Redis memory limits
- Use Redis clustering for high availability
- Monitor Redis memory usage and eviction
### Job Optimization
- Keep job payloads small
- Use efficient serialization formats
- Batch similar jobs when possible
- Implement job prioritization if needed
### Worker Optimization
- Pool worker connections to Redis
- Use async I/O for Redis operations
- Implement graceful shutdown handling
- Monitor worker resource usage


@@ -0,0 +1,239 @@
use hero_supervisor::{
Supervisor, SupervisorBuilder, WorkerConfig, WorkerLifecycleManager,
WorkerLifecycleManagerBuilder, ScriptType
};
use log::{info, warn, error};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use tokio::time::sleep;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Initialize logging
env_logger::init();
info!("Starting Worker Lifecycle Management Demo");
// Configuration
let redis_url = "redis://localhost:6379";
let zinit_socket = "/var/run/zinit.sock";
// Create supervisor
let supervisor = SupervisorBuilder::new()
.redis_url(redis_url)
.caller_id("lifecycle_demo")
.context_id("demo_context")
.build()?;
// Configure workers for different script types
let mut worker_configs = Vec::new();
// OSIS workers (Rhai/HeroScript)
for i in 0..2 {
let config = WorkerConfig::new(
format!("osis_worker_{}", i),
PathBuf::from("/usr/local/bin/osis_worker"),
ScriptType::OSIS,
)
.with_args(vec![
"--redis-url".to_string(),
redis_url.to_string(),
"--worker-id".to_string(),
format!("osis_worker_{}", i),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "osis".to_string());
env
})
.with_health_check("/usr/local/bin/osis_worker --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
worker_configs.push(config);
}
// SAL workers (System Abstraction Layer)
for i in 0..3 {
let config = WorkerConfig::new(
format!("sal_worker_{}", i),
PathBuf::from("/usr/local/bin/sal_worker"),
ScriptType::SAL,
)
.with_args(vec![
"--redis-url".to_string(),
redis_url.to_string(),
"--worker-id".to_string(),
format!("sal_worker_{}", i),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "sal".to_string());
env
})
.with_health_check("/usr/local/bin/sal_worker --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
worker_configs.push(config);
}
// V workers (HeroScript in V language)
for i in 0..2 {
let config = WorkerConfig::new(
format!("v_worker_{}", i),
PathBuf::from("/usr/local/bin/v_worker"),
ScriptType::V,
)
.with_args(vec![
"--redis-url".to_string(),
redis_url.to_string(),
"--worker-id".to_string(),
format!("v_worker_{}", i),
])
.with_env({
let mut env = HashMap::new();
env.insert("RUST_LOG".to_string(), "info".to_string());
env.insert("WORKER_TYPE".to_string(), "v".to_string());
env
})
.with_health_check("/usr/local/bin/v_worker --health-check".to_string())
.with_dependencies(vec!["redis".to_string()]);
worker_configs.push(config);
}
// Create lifecycle manager
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new(zinit_socket.to_string())
.with_supervisor(supervisor.clone());
// Add all worker configurations
for config in worker_configs {
lifecycle_manager = lifecycle_manager.add_worker(config);
}
let mut lifecycle_manager = lifecycle_manager.build();
// Demonstrate lifecycle operations
info!("=== Starting Worker Lifecycle Demo ===");
// 1. Start all workers
info!("1. Starting all workers...");
match lifecycle_manager.start_all_workers().await {
Ok(_) => info!("✅ All workers started successfully"),
Err(e) => {
error!("❌ Failed to start workers: {}", e);
return Err(e.into());
}
}
// Wait for workers to initialize
sleep(Duration::from_secs(5)).await;
// 2. Check worker status
info!("2. Checking worker status...");
match lifecycle_manager.get_all_worker_status().await {
Ok(status_map) => {
for (worker_name, status) in status_map {
info!(" Worker '{}': State={:?}, PID={}", worker_name, status.state, status.pid);
}
}
Err(e) => warn!("Failed to get worker status: {}", e),
}
// 3. Demonstrate scaling
info!("3. Demonstrating worker scaling...");
// Scale up OSIS workers
info!(" Scaling up OSIS workers to 3...");
if let Err(e) = lifecycle_manager.scale_workers(&ScriptType::OSIS, 3).await {
warn!("Failed to scale OSIS workers: {}", e);
}
sleep(Duration::from_secs(3)).await;
// Scale down SAL workers
info!(" Scaling down SAL workers to 1...");
if let Err(e) = lifecycle_manager.scale_workers(&ScriptType::SAL, 1).await {
warn!("Failed to scale SAL workers: {}", e);
}
sleep(Duration::from_secs(3)).await;
// 4. Check running worker counts
info!("4. Checking running worker counts after scaling...");
for script_type in [ScriptType::OSIS, ScriptType::SAL, ScriptType::V] {
let count = lifecycle_manager.get_running_worker_count(&script_type).await;
info!(" {:?}: {} workers running", script_type, count);
}
// 5. Demonstrate restart functionality
info!("5. Demonstrating worker restart...");
if let Err(e) = lifecycle_manager.restart_worker("osis_worker_0").await {
warn!("Failed to restart worker: {}", e);
} else {
info!(" ✅ Successfully restarted osis_worker_0");
}
sleep(Duration::from_secs(3)).await;
// 6. Simulate job dispatch and health monitoring
info!("6. Simulating job dispatch and health monitoring...");
// Update job time for a worker (simulating job dispatch)
lifecycle_manager.update_worker_job_time("sal_worker_0");
info!(" Updated job time for sal_worker_0");
// Perform health monitoring check
if let Err(e) = lifecycle_manager.monitor_worker_health().await {
warn!("Health monitoring failed: {}", e);
} else {
info!(" ✅ Health monitoring completed");
}
// 7. Create and execute a test job
info!("7. Creating and executing a test job...");
let test_job = supervisor
.new_job()
.script_type(ScriptType::OSIS)
.script_content("println!(\"Hello from worker!\");".to_string())
.timeout(Duration::from_secs(30))
.build()?;
match supervisor.run_job_and_await_result(&test_job).await {
Ok(result) => info!(" ✅ Job executed successfully: {}", result),
Err(e) => warn!(" ❌ Job execution failed: {}", e),
}
// 8. Demonstrate graceful shutdown
info!("8. Demonstrating graceful shutdown...");
// Stop specific workers
info!(" Stopping specific workers...");
for worker_name in ["osis_worker_1", "v_worker_0"] {
if let Err(e) = lifecycle_manager.stop_worker(worker_name).await {
warn!("Failed to stop worker {}: {}", worker_name, e);
} else {
info!(" ✅ Stopped worker: {}", worker_name);
}
}
sleep(Duration::from_secs(2)).await;
// Stop all remaining workers
info!(" Stopping all remaining workers...");
if let Err(e) = lifecycle_manager.stop_all_workers().await {
error!("Failed to stop all workers: {}", e);
} else {
info!(" ✅ All workers stopped successfully");
}
info!("=== Worker Lifecycle Demo Completed ===");
// Optional: Start health monitoring loop (commented out for demo)
// info!("Starting health monitoring loop (Ctrl+C to stop)...");
// lifecycle_manager.start_health_monitoring().await;
Ok(())
}


@@ -0,0 +1,74 @@
use hero_supervisor::SupervisorBuilder;
use tokio::time::{sleep, Duration};
use log::{info, error};
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
env_logger::init();
info!("Starting Hero Supervisor Lifecycle Demo");
// Build supervisor with simplified worker configuration
// Workers are automatically launched during build
let supervisor = SupervisorBuilder::new()
.redis_url("redis://localhost:6379")
.zinit_socket_path("/var/run/zinit.sock")
.osis_worker("/usr/local/bin/osis_worker")
.sal_worker("/usr/local/bin/sal_worker")
.v_worker("/usr/local/bin/v_worker")
.worker_env_var("REDIS_URL", "redis://localhost:6379")
.worker_env_var("LOG_LEVEL", "info")
.build().await?;
info!("Supervisor created and workers launched successfully");
// Wait a moment for workers to start
sleep(Duration::from_secs(2)).await;
// Check worker status using the simplified API
info!("Checking worker status...");
let workers = supervisor.get_workers(&[]).await;
for worker in &workers {
let status_info = if worker.is_running {
format!("Running (PID: {})", worker.status.as_ref().map(|s| s.pid).unwrap_or(0))
} else {
"Stopped".to_string()
};
info!(" Worker '{}' ({:?}): {}", worker.config.name, worker.config.script_type, status_info);
}
// Demonstrate lifecycle operations with simplified API
info!("=== Worker Lifecycle Operations ===");
// 1. Demonstrate restart functionality
info!("1. Demonstrating worker restart...");
if let Err(e) = supervisor.restart_worker("osis_worker_1").await {
error!("Failed to restart worker: {}", e);
} else {
info!(" ✅ Successfully restarted osis_worker_1");
}
sleep(Duration::from_secs(2)).await;
// 2. Send a ping job for health checking
info!("2. Sending ping job for health checking...");
if let Err(e) = supervisor.send_ping_job(hero_job::ScriptType::OSIS).await {
error!("Ping job failed: {}", e);
} else {
info!(" ✅ Ping job completed successfully");
}
// 3. Demonstrate graceful shutdown
info!("3. Demonstrating graceful shutdown...");
// Stop specific workers
if let Err(e) = supervisor.stop_worker("osis_worker_1").await {
error!("Failed to stop worker: {}", e);
} else {
info!(" ✅ Worker stopped successfully");
}
info!("Demo completed successfully!");
Ok(())
}


@@ -0,0 +1,90 @@
use log::info;
use hero_supervisor::{SupervisorBuilder, SupervisorError, ScriptType};
use std::time::{Duration, Instant};
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
env_logger::builder()
.filter_level(log::LevelFilter::Info)
.init();
// Build the client using the new builder pattern
let client = SupervisorBuilder::new()
.caller_id("timeout-example-runner")
.redis_url("redis://127.0.0.1/")
.build()?;
info!("Supervisor created.");
let script_content = r#"
// This script will never be executed by a worker because the recipient does not exist.
let x = 10;
let y = x + 32;
y
"#;
// The worker_id points to a worker queue that doesn't have a worker.
let non_existent_recipient = "non_existent_worker_for_timeout_test";
let very_short_timeout = Duration::from_secs(2);
info!(
"Submitting script to non-existent recipient '{}' with a timeout of {:?}...",
non_existent_recipient, very_short_timeout
);
let start_time = Instant::now();
// Use the new JobBuilder
let result = client
.new_job()
.script_type(ScriptType::OSIS)
.script(script_content)
.timeout(very_short_timeout)
.await_response()
.await;
match result {
Ok(details) => {
log::error!(
"Timeout Example FAILED: Expected a timeout, but got Ok: {:?}",
details
);
Err("Expected timeout, but task completed successfully.".into())
}
Err(e) => {
let elapsed = start_time.elapsed();
info!("Timeout Example: Received error as expected: {}", e);
info!("Elapsed time: {:?}", elapsed);
match e {
SupervisorError::Timeout(task_id) => {
info!("Timeout Example PASSED: Correctly received SupervisorError::Timeout for task_id: {}", task_id);
// Ensure the elapsed time is close to the timeout duration
// Allow for some buffer for processing
assert!(
elapsed >= very_short_timeout
&& elapsed < very_short_timeout + Duration::from_secs(1),
"Elapsed time {:?} should be close to timeout {:?}",
elapsed,
very_short_timeout
);
info!(
"Elapsed time {:?} is consistent with timeout duration {:?}.",
elapsed, very_short_timeout
);
Ok(())
}
other_error => {
log::error!(
"Timeout Example FAILED: Expected SupervisorError::Timeout, but got other error: {:?}",
other_error
);
Err(format!(
"Expected SupervisorError::Timeout, got other error: {:?}",
other_error
)
.into())
}
}
}
}
}


@@ -0,0 +1,102 @@
/// Comprehensive error type for all possible failures in the supervisor client.
///
/// This enum covers all error scenarios that can occur during client operations,
/// from Redis connectivity issues to task execution timeouts.
#[derive(Debug)]
pub enum SupervisorError {
/// Redis connection or operation error
RedisError(redis::RedisError),
/// JSON serialization/deserialization error
SerializationError(serde_json::Error),
/// Task execution timeout - contains the task_id that timed out
Timeout(String),
/// Task not found after submission - contains the task_id (rare occurrence)
TaskNotFound(String),
/// Context ID is missing
ContextIdMissing,
/// Invalid input provided
InvalidInput(String),
/// Job operation error
JobError(hero_job::JobError),
/// Worker lifecycle management errors
WorkerStartFailed(String, String),
WorkerStopFailed(String, String),
WorkerRestartFailed(String, String),
WorkerStatusFailed(String, String),
WorkerNotFound(String),
PingJobFailed(String, String),
/// Zinit client operation error
ZinitError(String),
SupervisorNotConfigured,
}
impl From<redis::RedisError> for SupervisorError {
fn from(err: redis::RedisError) -> Self {
SupervisorError::RedisError(err)
}
}
impl From<serde_json::Error> for SupervisorError {
fn from(err: serde_json::Error) -> Self {
SupervisorError::SerializationError(err)
}
}
impl From<hero_job::JobError> for SupervisorError {
fn from(err: hero_job::JobError) -> Self {
SupervisorError::JobError(err)
}
}
impl std::fmt::Display for SupervisorError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SupervisorError::RedisError(e) => write!(f, "Redis error: {}", e),
SupervisorError::SerializationError(e) => write!(f, "Serialization error: {}", e),
SupervisorError::Timeout(task_id) => {
write!(f, "Timeout waiting for task {} to complete", task_id)
}
SupervisorError::TaskNotFound(task_id) => {
write!(f, "Task {} not found after submission", task_id)
}
SupervisorError::ContextIdMissing => {
write!(f, "Context ID is missing")
}
SupervisorError::InvalidInput(msg) => {
write!(f, "Invalid input: {}", msg)
}
SupervisorError::JobError(e) => {
write!(f, "Job error: {}", e)
}
SupervisorError::WorkerStartFailed(worker, reason) => {
write!(f, "Failed to start worker '{}': {}", worker, reason)
}
SupervisorError::WorkerStopFailed(worker, reason) => {
write!(f, "Failed to stop worker '{}': {}", worker, reason)
}
SupervisorError::WorkerRestartFailed(worker, reason) => {
write!(f, "Failed to restart worker '{}': {}", worker, reason)
}
SupervisorError::WorkerStatusFailed(worker, reason) => {
write!(f, "Failed to get status for worker '{}': {}", worker, reason)
}
SupervisorError::WorkerNotFound(worker) => {
write!(f, "Worker '{}' not found", worker)
}
SupervisorError::PingJobFailed(worker, reason) => {
write!(f, "Ping job failed for worker '{}': {}", worker, reason)
}
SupervisorError::ZinitError(msg) => {
write!(f, "Zinit error: {}", msg)
}
SupervisorError::SupervisorNotConfigured => {
write!(f, "Supervisor not configured for health monitoring")
}
}
}
}
impl std::error::Error for SupervisorError {}

core/supervisor/src/job.rs Normal file

@@ -0,0 +1,261 @@
use chrono::Utc;
use std::collections::HashMap;
use std::time::Duration;
use uuid::Uuid;
use crate::{Supervisor, SupervisorError};
use hero_job::{Job, ScriptType};
/// Builder for constructing and submitting script execution requests.
///
/// This builder provides a fluent interface for configuring script execution
/// parameters and offers two submission modes: fire-and-forget (`submit()`)
/// and request-reply (`await_response()`).
///
/// # Example
///
/// ```rust,no_run
/// use std::time::Duration;
/// use hero_supervisor::ScriptType;
///
/// # async fn example(client: &hero_supervisor::Supervisor) -> Result<String, hero_supervisor::SupervisorError> {
/// let result = client
/// .new_job()
/// .script_type(ScriptType::OSIS)
/// .script(r#"print("Hello, World!");"#)
/// .timeout(Duration::from_secs(30))
/// .await_response()
/// .await?;
/// # Ok(result)
/// # }
/// ```
pub struct JobBuilder<'a> {
client: &'a Supervisor,
request_id: String,
context_id: String,
caller_id: String,
script: String,
script_type: ScriptType,
timeout: Duration,
retries: u32,
concurrent: bool,
log_path: Option<String>,
env_vars: HashMap<String, String>,
prerequisites: Vec<String>,
dependents: Vec<String>
}
impl<'a> JobBuilder<'a> {
pub fn new(client: &'a Supervisor) -> Self {
Self {
client,
request_id: "".to_string(),
context_id: "".to_string(),
caller_id: "".to_string(),
script: "".to_string(),
script_type: ScriptType::OSIS, // Default to OSIS
timeout: Duration::from_secs(5),
retries: 0,
concurrent: false,
log_path: None,
env_vars: HashMap::new(),
prerequisites: Vec::new(),
dependents: Vec::new(),
}
}
pub fn request_id(mut self, request_id: &str) -> Self {
self.request_id = request_id.to_string();
self
}
pub fn script_type(mut self, script_type: ScriptType) -> Self {
self.script_type = script_type;
self
}
pub fn context_id(mut self, context_id: &str) -> Self {
self.context_id = context_id.to_string();
self
}
pub fn script(mut self, script: &str) -> Self {
self.script = script.to_string();
self
}
/// Load the script content from a file. Panics if the file cannot be read;
/// prefer handling the error at the call site in production code.
pub fn script_path(mut self, script_path: &str) -> Self {
self.script = std::fs::read_to_string(script_path)
.expect("failed to read script file");
self
}
pub fn timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
pub fn log_path(mut self, log_path: &str) -> Self {
self.log_path = Some(log_path.to_string());
self
}
/// Set a single environment variable
pub fn env_var(mut self, key: &str, value: &str) -> Self {
self.env_vars.insert(key.to_string(), value.to_string());
self
}
/// Set multiple environment variables from a HashMap
pub fn env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
self.env_vars.extend(env_vars);
self
}
/// Clear all environment variables
pub fn clear_env_vars(mut self) -> Self {
self.env_vars.clear();
self
}
/// Add a prerequisite job ID that must complete before this job can run
pub fn prerequisite(mut self, job_id: &str) -> Self {
self.prerequisites.push(job_id.to_string());
self
}
/// Set multiple prerequisite job IDs
pub fn prerequisites(mut self, job_ids: Vec<String>) -> Self {
self.prerequisites.extend(job_ids);
self
}
/// Add a dependent job ID that depends on this job completing
pub fn dependent(mut self, job_id: &str) -> Self {
self.dependents.push(job_id.to_string());
self
}
/// Set multiple dependent job IDs
pub fn dependents(mut self, job_ids: Vec<String>) -> Self {
self.dependents.extend(job_ids);
self
}
/// Clear all prerequisites
pub fn clear_prerequisites(mut self) -> Self {
self.prerequisites.clear();
self
}
/// Clear all dependents
pub fn clear_dependents(mut self) -> Self {
self.dependents.clear();
self
}
pub fn build(self) -> Result<Job, SupervisorError> {
let request_id = if self.request_id.is_empty() {
// Generate a UUID for the request_id
Uuid::new_v4().to_string()
} else {
self.request_id.clone()
};
if self.context_id.is_empty() {
return Err(SupervisorError::ContextIdMissing);
}
if self.caller_id.is_empty() {
return Err(SupervisorError::InvalidInput("caller_id is missing".to_string()));
}
let now = Utc::now();
Ok(Job {
id: request_id,
caller_id: self.caller_id,
context_id: self.context_id,
script: self.script,
script_type: self.script_type,
timeout: self.timeout,
retries: self.retries as u8,
concurrent: self.concurrent,
log_path: self.log_path.clone(),
env_vars: self.env_vars.clone(),
prerequisites: self.prerequisites.clone(),
dependents: self.dependents.clone(),
created_at: now,
updated_at: now,
})
}
pub async fn submit(self) -> Result<(), SupervisorError> {
// Delegate validation and construction to build(); keep the client
// handle (a shared reference, hence Copy) before build() consumes self.
let client = self.client;
let job = self.build()?;
client.create_job(&job).await?;
Ok(())
}
pub async fn await_response(self) -> Result<String, SupervisorError> {
// Same pattern as submit(): build() validates, then run and await.
let client = self.client;
let job = self.build()?;
let result = client.run_job_and_await_result(&job).await?;
Ok(result)
}
}

596
core/supervisor/src/lib.rs Normal file
View File

@@ -0,0 +1,596 @@
use log::{debug, error, info, warn};
use redis::AsyncCommands;
use std::collections::HashMap;
use std::time::Duration;
use hero_job::NAMESPACE_PREFIX;
use zinit_client::ZinitClient;
mod job;
mod error;
mod lifecycle;
pub use crate::error::SupervisorError;
pub use crate::job::JobBuilder;
pub use crate::lifecycle::WorkerConfig;
// Re-export types from hero_job for public API
pub use hero_job::{Job, JobStatus, ScriptType};
pub struct Supervisor {
redis_client: redis::Client,
zinit_client: ZinitClient,
builder_data: Option<SupervisorBuilderData>,
}
pub struct SupervisorBuilder {
redis_url: Option<String>,
zinit_socket_path: Option<String>,
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
}
/// Helper struct to pass builder data to worker launch method
struct SupervisorBuilderData {
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
}
impl SupervisorBuilder {
pub fn new() -> Self {
Self {
redis_url: None,
zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
osis_worker: None,
sal_worker: None,
v_worker: None,
python_worker: None,
worker_env_vars: HashMap::new(),
}
}
pub fn redis_url(mut self, url: &str) -> Self {
self.redis_url = Some(url.to_string());
self
}
pub fn zinit_socket_path(mut self, path: &str) -> Self {
self.zinit_socket_path = Some(path.to_string());
self
}
pub fn osis_worker(mut self, binary_path: &str) -> Self {
self.osis_worker = Some(binary_path.to_string());
self
}
pub fn sal_worker(mut self, binary_path: &str) -> Self {
self.sal_worker = Some(binary_path.to_string());
self
}
pub fn v_worker(mut self, binary_path: &str) -> Self {
self.v_worker = Some(binary_path.to_string());
self
}
pub fn python_worker(mut self, binary_path: &str) -> Self {
self.python_worker = Some(binary_path.to_string());
self
}
pub fn worker_env_var(mut self, key: &str, value: &str) -> Self {
self.worker_env_vars.insert(key.to_string(), value.to_string());
self
}
pub fn worker_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
self.worker_env_vars.extend(env_vars);
self
}
/// Builds the final `Supervisor` instance synchronously.
///
/// This method validates the configuration and creates the Redis client.
/// Worker launching is deferred to the `start_workers()` method.
///
/// # Returns
///
/// * `Ok(Supervisor)` - Successfully configured supervisor instance
/// * `Err(SupervisorError)` - Configuration or connection error
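///
/// # Example
///
/// A minimal sketch (socket and binary paths are illustrative):
///
/// ```ignore
/// let supervisor = SupervisorBuilder::new()
///     .redis_url("redis://127.0.0.1:6379")
///     .zinit_socket_path("/var/run/zinit.sock")
///     .osis_worker("/usr/local/bin/osis_worker")
///     .worker_env_var("RUST_LOG", "info")
///     .build()?;
/// ```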
pub fn build(self) -> Result<Supervisor, SupervisorError> {
let url = self.redis_url
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
let client = redis::Client::open(url)?;
let zinit_socket = self.zinit_socket_path
.unwrap_or_else(|| "/var/run/zinit.sock".to_string());
let zinit_client = ZinitClient::new(&zinit_socket);
// Store builder data for later use in start_workers()
let builder_data = SupervisorBuilderData {
osis_worker: self.osis_worker,
sal_worker: self.sal_worker,
v_worker: self.v_worker,
python_worker: self.python_worker,
worker_env_vars: self.worker_env_vars,
};
let supervisor = Supervisor {
redis_client: client,
zinit_client,
builder_data: Some(builder_data),
};
Ok(supervisor)
}
}
impl Supervisor {
/// Start all configured workers asynchronously.
/// This method should be called after build() to launch the workers.
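///
/// ```ignore
/// supervisor.start_workers().await?;        // launch configured workers
/// // ... dispatch jobs ...
/// supervisor.cleanup_and_shutdown().await?; // stop and delete services
/// ```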
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
// Clean up any existing worker services first
self.cleanup_existing_workers().await?;
// Launch configured workers if builder data is available
if let Some(builder_data) = &self.builder_data {
self.launch_configured_workers(builder_data).await?;
}
Ok(())
}
/// Clean up all worker services from zinit on program exit
pub async fn cleanup_and_shutdown(&self) -> Result<(), SupervisorError> {
info!("Cleaning up worker services before shutdown...");
let worker_names = vec![
"osis_worker_1",
"sal_worker_1",
"v_worker_1",
"python_worker_1"
];
for worker_name in worker_names {
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
warn!("Failed to cleanup worker {}: {}", worker_name, e);
}
}
info!("Worker cleanup completed");
Ok(())
}
/// Clean up any existing worker services on startup
async fn cleanup_existing_workers(&self) -> Result<(), SupervisorError> {
info!("Cleaning up any existing worker services...");
let worker_names = vec![
"osis_worker_1",
"sal_worker_1",
"v_worker_1",
"python_worker_1"
];
for worker_name in worker_names {
// Try to stop and delete, but don't fail if they don't exist
let _ = self.stop_and_delete_worker(worker_name).await;
}
info!("Existing worker cleanup completed");
Ok(())
}
/// Stop and delete a worker service from zinit
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
// First try to stop the worker
if let Err(e) = self.zinit_client.stop(worker_name).await {
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
}
// Then try to delete the service
if let Err(e) = self.zinit_client.delete(worker_name).await {
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
} else {
info!("Successfully deleted worker service: {}", worker_name);
}
Ok(())
}
/// Get the hardcoded worker queue key for the script type
fn get_worker_queue_key(&self, script_type: &ScriptType) -> String {
format!("{}worker_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix())
}
pub fn new_job(&self) -> JobBuilder {
JobBuilder::new(self)
}
// Internal helper: store job details in Redis
async fn create_job_using_connection(
&self,
conn: &mut redis::aio::MultiplexedConnection,
job: &Job,
) -> Result<(), SupervisorError> {
debug!(
"Storing job {} (script type: {:?}, namespace prefix: {})",
job.id, job.script_type, NAMESPACE_PREFIX
);
// Use the shared Job struct's Redis storage method
job.store_in_redis(conn).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to store job in Redis: {}", e)))?;
Ok(())
}
// Internal helper: push a job ID onto the worker queue for its script type
async fn start_job_using_connection(
&self,
conn: &mut redis::aio::MultiplexedConnection,
job_id: String,
script_type: &ScriptType
) -> Result<(), SupervisorError> {
let worker_queue_key = self.get_worker_queue_key(script_type);
// LPUSH returns the new list length; propagate Redis errors instead of
// silently discarding them.
let _list_len: i64 = conn.lpush(&worker_queue_key, job_id).await?;
Ok(())
}
// Internal helper to await response from worker
async fn await_response_from_connection(
&self,
conn: &mut redis::aio::MultiplexedConnection,
job_key: &String,
reply_queue_key: &String,
timeout: Duration,
) -> Result<String, SupervisorError> {
// BLPOP on the reply queue. redis-rs takes the timeout in seconds, and a
// timeout of 0 blocks indefinitely, so clamp to at least one second.
let blpop_timeout_secs = timeout.as_secs().max(1);
match conn
.blpop::<&String, Option<(String, String)>>(reply_queue_key, blpop_timeout_secs as f64)
.await
{
Ok(Some((_queue, result_message_str))) => {
Ok(result_message_str)
}
Ok(None) => {
// BLPOP timed out
warn!(
"Timeout waiting for result on reply queue {} for job {}",
reply_queue_key, job_key
);
// Optionally, delete the reply queue
let _: redis::RedisResult<i32> = conn.del(&reply_queue_key).await;
Err(SupervisorError::Timeout(job_key.clone()))
}
Err(e) => {
// Redis error
error!(
"Redis error on BLPOP for reply queue {}: {}",
reply_queue_key, e
);
// Optionally, delete the reply queue
let _: redis::RedisResult<i32> = conn.del(&reply_queue_key).await;
Err(SupervisorError::RedisError(e))
}
}
}
/// Store a job in Redis without dispatching it to a worker queue.
pub async fn create_job(
&self,
job: &Job,
) -> Result<(), SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
self.create_job_using_connection(&mut conn, job).await?;
Ok(())
}
/// Dispatch a previously created job to its worker queue.
pub async fn start_job(
&self,
job_id: &str,
) -> Result<(), SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Load the job to get its script type
let job = Job::load_from_redis(&mut conn, job_id).await?;
self.start_job_using_connection(&mut conn, job_id.to_string(), &job.script_type).await?;
Ok(())
}
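/// Create a job, dispatch it to its worker queue, and block until a result
/// arrives on the job's dedicated reply queue or the job's timeout elapses.
///
/// A minimal sketch (field values are illustrative):
///
/// ```ignore
/// let job = supervisor.new_job()
///     .context_id("production")
///     .caller_id("cli")
///     .script("40 + 2")
///     .build()?;
/// let output = supervisor.run_job_and_await_result(&job).await?;
/// ```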
pub async fn run_job_and_await_result(
&self,
job: &Job
) -> Result<String, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Each job gets a dedicated reply queue, keyed by its ID
let reply_queue_key = format!("{}:reply:{}", NAMESPACE_PREFIX, job.id);
self.create_job_using_connection(&mut conn, job).await?;
self.start_job_using_connection(&mut conn, job.id.clone(), &job.script_type).await?;
info!(
"Task {} submitted. Waiting for result on queue {} with timeout {:?}...",
job.id, // This is the UUID
reply_queue_key,
job.timeout
);
self.await_response_from_connection(
&mut conn,
&job.id,
&reply_queue_key,
job.timeout,
)
.await
}
/// Fetch a job's current status from its Redis hash.
pub async fn get_job_status(
&self,
job_id: &str,
) -> Result<JobStatus, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
let job_key = format!("{}{}", NAMESPACE_PREFIX, job_id);
let result_map: Option<std::collections::HashMap<String, String>> =
conn.hgetall(&job_key).await?;
match result_map {
Some(map) => {
let status_str = map.get("status").cloned().unwrap_or_else(|| {
warn!("Task {}: 'status' field missing from Redis hash, defaulting to empty.", job_id);
String::new()
});
let status = match status_str.as_str() {
"dispatched" => JobStatus::Dispatched,
// NOTE: assumes hero_job serializes this variant as "waiting_for_prerequisites"
"waiting_for_prerequisites" => JobStatus::WaitingForPrerequisites,
"started" => JobStatus::Started,
"error" => JobStatus::Error,
"finished" => JobStatus::Finished,
_ => JobStatus::Dispatched, // default for unknown status strings
};
Ok(status)
}
None => {
warn!("Job {} not found in Redis", job_id);
Ok(JobStatus::Dispatched) // default for missing jobs
}
}
}
/// Fetch a job's output from its Redis hash, if any.
pub async fn get_job_output(
&self,
job_id: &str,
) -> Result<Option<String>, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
let job_key = format!("{}{}", NAMESPACE_PREFIX, job_id);
let result_map: Option<std::collections::HashMap<String, String>> =
conn.hgetall(&job_key).await?;
match result_map {
Some(map) => {
Ok(map.get("output").cloned())
}
None => {
warn!("Job {} not found in Redis", job_id);
Ok(None)
}
}
}
/// List all jobs in Redis
pub async fn list_jobs(&self) -> Result<Vec<String>, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Use the shared Job struct's list method
Job::list_all_job_ids(&mut conn).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to list jobs: {}", e)))
}
/// Stop a job by pushing its ID to the stop queue
pub async fn stop_job(&self, job_id: &str) -> Result<(), SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Get job details to determine script type and appropriate worker
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
if job_data.is_empty() {
return Err(SupervisorError::InvalidInput(format!("Job {} not found", job_id)));
}
// Parse script type from job data
let script_type_str = job_data.get("script_type")
.ok_or_else(|| SupervisorError::InvalidInput("Job missing script_type field".to_string()))?;
let script_type: ScriptType = serde_json::from_str(&format!("\"{}\"", script_type_str))
.map_err(|e| SupervisorError::InvalidInput(format!("Invalid script type: {}", e)))?;
// Use hardcoded stop queue key for this script type
let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix());
// Push job ID to the stop queue
conn.lpush::<_, _, ()>(&stop_queue_key, job_id).await?;
info!("Job {} added to stop queue {} for script type {:?}", job_id, stop_queue_key, script_type);
Ok(())
}
/// Get logs for a job by reading from its log file
pub async fn get_job_logs(&self, job_id: &str) -> Result<Option<String>, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
// Get the job data to find the log path
let result_map: Option<std::collections::HashMap<String, String>> =
conn.hgetall(&job_key).await?;
match result_map {
Some(map) => {
if let Some(log_path) = map.get("log_path") {
// Try to read the log file
match std::fs::read_to_string(log_path) {
Ok(contents) => Ok(Some(contents)),
Err(e) => {
warn!("Failed to read log file {}: {}", log_path, e);
Ok(None)
}
}
} else {
// No log path configured for this job
Ok(None)
}
}
None => {
warn!("Job {} not found in Redis", job_id);
Ok(None)
}
}
}
/// Delete a specific job by ID
pub async fn delete_job(&self, job_id: &str) -> Result<(), SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Use the shared Job struct's delete method
Job::delete_from_redis(&mut conn, job_id).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to delete job: {}", e)))?;
info!("Job {} deleted successfully", job_id);
Ok(())
}
/// Clear all jobs from Redis
pub async fn clear_all_jobs(&self) -> Result<usize, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Get all job IDs first
let job_ids = Job::list_all_job_ids(&mut conn).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to list jobs: {}", e)))?;
let count = job_ids.len();
// Delete each job using the shared method
for job_id in job_ids {
Job::delete_from_redis(&mut conn, &job_id).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to delete job {}: {}", job_id, e)))?;
}
Ok(count)
}
/// Check if all prerequisites for a job are completed
pub async fn check_prerequisites_completed(&self, job_id: &str) -> Result<bool, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Load the job using the shared Job struct
let job = Job::load_from_redis(&mut conn, job_id).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to load job: {}", e)))?;
// Check each prerequisite job status
for prereq_id in &job.prerequisites {
let status = Job::get_status(&mut conn, prereq_id).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to get prerequisite status: {}", e)))?;
if status != JobStatus::Finished {
return Ok(false); // Prerequisite not completed
}
}
Ok(true) // All prerequisites completed (or no prerequisites)
}
/// Update job status and check dependent jobs for readiness
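///
/// A minimal sketch of the dependency flow (job B listed job A as a
/// prerequisite, so B is dispatched once A finishes):
///
/// ```ignore
/// let ready = supervisor
///     .update_job_status_and_check_dependents(&job_a_id, JobStatus::Finished)
///     .await?;
/// supervisor.dispatch_ready_jobs(ready).await?;
/// ```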
pub async fn update_job_status_and_check_dependents(&self, job_id: &str, new_status: JobStatus) -> Result<Vec<String>, SupervisorError> {
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
// Update job status using shared Job method
Job::update_status(&mut conn, job_id, new_status.clone()).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to update job status: {}", e)))?;
let mut ready_jobs = Vec::new();
// If job finished, check dependent jobs
if new_status == JobStatus::Finished {
// Load the job to get its dependents
let job = Job::load_from_redis(&mut conn, job_id).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to load job: {}", e)))?;
// Check each dependent job
for dependent_id in &job.dependents {
let dependent_status = Job::get_status(&mut conn, dependent_id).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to get dependent status: {}", e)))?;
// Only check jobs that are waiting for prerequisites
if dependent_status == JobStatus::WaitingForPrerequisites {
// Check if all prerequisites are now completed
if self.check_prerequisites_completed(dependent_id).await? {
// Update status to dispatched and add to ready jobs
Job::update_status(&mut conn, dependent_id, JobStatus::Dispatched).await
.map_err(|e| SupervisorError::InvalidInput(format!("Failed to update dependent status: {}", e)))?;
ready_jobs.push(dependent_id.clone());
}
}
}
}
Ok(ready_jobs)
}
/// Dispatch jobs that are ready (have all prerequisites completed)
pub async fn dispatch_ready_jobs(&self, ready_job_ids: Vec<String>) -> Result<(), SupervisorError> {
for job_id in ready_job_ids {
// Get job data to determine script type and select worker
let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;
if let Some(script_type_str) = job_data.get("script_type") {
// Parse script type (stored as Debug format, e.g., "OSIS")
let script_type = match script_type_str.as_str() {
"OSIS" => ScriptType::OSIS,
"SAL" => ScriptType::SAL,
"V" => ScriptType::V,
"Python" => ScriptType::Python,
_ => return Err(SupervisorError::InvalidInput(format!("Unknown script type: {}", script_type_str))),
};
// Dispatch job using hardcoded queue
self.start_job_using_connection(&mut conn, job_id, &script_type).await?;
}
}
Ok(())
}
}

368
core/supervisor/src/lifecycle.rs Normal file
View File

@@ -0,0 +1,368 @@
//! Worker lifecycle management functionality for the Hero Supervisor
//!
//! This module provides worker process lifecycle management using Zinit as the process manager.
//! All functionality is implemented as methods on the Supervisor struct for a clean API.
use log::{debug, error, info, warn};
use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};
/// Information about a worker including its configuration and current status
#[derive(Debug, Clone)]
pub struct WorkerInfo {
pub config: WorkerConfig,
pub status: Option<ServiceStatus>,
pub is_running: bool,
}
/// Configuration for a worker binary
#[derive(Debug, Clone)]
pub struct WorkerConfig {
/// Name of the worker service
pub name: String,
/// Path to the worker binary
pub binary_path: PathBuf,
/// Script type this worker handles
pub script_type: ScriptType,
/// Command line arguments for the worker
pub args: Vec<String>,
/// Environment variables for the worker
pub env: HashMap<String, String>,
/// Whether this worker should restart on exit
pub restart_on_exit: bool,
/// Health check command (optional)
pub health_check: Option<String>,
/// Dependencies that must be running first
pub dependencies: Vec<String>,
}
impl WorkerConfig {
pub fn new(name: String, binary_path: PathBuf, script_type: ScriptType) -> Self {
Self {
name,
binary_path,
script_type,
args: Vec::new(),
env: HashMap::new(),
restart_on_exit: true,
health_check: None,
dependencies: Vec::new(),
}
}
pub fn with_args(mut self, args: Vec<String>) -> Self {
self.args = args;
self
}
pub fn with_env(mut self, env: HashMap<String, String>) -> Self {
self.env = env;
self
}
pub fn with_health_check(mut self, health_check: String) -> Self {
self.health_check = Some(health_check);
self
}
pub fn with_dependencies(mut self, dependencies: Vec<String>) -> Self {
self.dependencies = dependencies;
self
}
pub fn no_restart(mut self) -> Self {
self.restart_on_exit = false;
self
}
}
/// Worker lifecycle management methods for Supervisor
impl Supervisor {
/// Get all workers with their configuration and current status.
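///
/// ```ignore
/// let workers = supervisor.get_workers(&worker_configs).await;
/// for w in &workers {
///     println!("{}: running={}", w.config.name, w.is_running);
/// }
/// ```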
pub async fn get_workers(&self, worker_configs: &[WorkerConfig]) -> Vec<WorkerInfo> {
let mut workers = Vec::new();
for config in worker_configs {
let status = self.zinit_client.status(&config.name).await.ok();
let is_running = status.as_ref()
.map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
.unwrap_or(false);
workers.push(WorkerInfo {
config: config.clone(),
status,
is_running,
});
}
workers
}
/// Start a worker using Zinit
pub async fn start_worker(
&self,
worker_config: &WorkerConfig,
) -> Result<(), SupervisorError> {
info!("Starting worker: {}", worker_config.name);
// Create service configuration for Zinit
let service_config = self.create_service_config(worker_config);
// Create the service in Zinit
self.zinit_client.create_service(&worker_config.name, service_config).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
// Start the service
self.zinit_client.start(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
info!("Successfully started worker: {}", worker_config.name);
Ok(())
}
/// Stop a worker using Zinit
pub async fn stop_worker(
&self,
worker_name: &str,
) -> Result<(), SupervisorError> {
info!("Stopping worker: {}", worker_name);
match self.zinit_client.stop(worker_name).await {
Ok(_) => {
info!("Successfully stopped worker: {}", worker_name);
Ok(())
}
Err(e) => {
error!("Failed to stop worker {}: {}", worker_name, e);
Err(SupervisorError::WorkerStopFailed(worker_name.to_string(), e.to_string()))
}
}
}
/// Restart a worker using Zinit
pub async fn restart_worker(
&self,
worker_name: &str,
) -> Result<(), SupervisorError> {
info!("Restarting worker: {}", worker_name);
match self.zinit_client.restart(worker_name).await {
Ok(_) => {
info!("Successfully restarted worker: {}", worker_name);
Ok(())
}
Err(e) => {
error!("Failed to restart worker {}: {}", worker_name, e);
Err(SupervisorError::WorkerRestartFailed(worker_name.to_string(), e.to_string()))
}
}
}
/// Get status of a worker using Zinit
pub async fn get_worker_status(
&self,
worker_name: &str,
zinit_client: &ZinitClient,
) -> Result<ServiceStatus, SupervisorError> {
match zinit_client.status(worker_name).await {
Ok(status) => Ok(status),
Err(e) => {
error!("Failed to get status for worker {}: {}", worker_name, e);
Err(SupervisorError::WorkerStatusFailed(worker_name.to_string(), e.to_string()))
}
}
}
/// Get status of all workers
pub async fn get_all_worker_status(
&self,
worker_configs: &[WorkerConfig],
zinit_client: &ZinitClient,
) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
let mut status_map = HashMap::new();
for worker in worker_configs {
match zinit_client.status(&worker.name).await {
Ok(status) => {
status_map.insert(worker.name.clone(), status);
}
Err(e) => {
warn!("Failed to get status for worker {}: {}", worker.name, e);
}
}
}
Ok(status_map)
}
/// Start multiple workers
pub async fn start_workers(
&self,
worker_configs: &[WorkerConfig],
) -> Result<(), SupervisorError> {
info!("Starting {} workers", worker_configs.len());
for worker in worker_configs {
self.start_worker(worker).await?;
}
Ok(())
}
/// Stop multiple workers
pub async fn stop_workers(
&self,
worker_names: &[String],
) -> Result<(), SupervisorError> {
info!("Stopping {} workers", worker_names.len());
for worker_name in worker_names {
self.stop_worker(worker_name).await?;
}
Ok(())
}
/// Get count of running workers for a script type
pub async fn get_running_worker_count(
&self,
worker_configs: &[WorkerConfig],
script_type: &ScriptType,
zinit_client: &ZinitClient,
) -> usize {
let mut running_count = 0;
for worker in worker_configs {
if worker.script_type == *script_type {
if let Ok(status) = zinit_client.status(&worker.name).await {
if status.state == ServiceState::Running {
running_count += 1;
}
}
}
}
running_count
}
/// Send a ping job to a worker for health checking
pub async fn send_ping_job(
&self,
script_type: ScriptType,
) -> Result<(), SupervisorError> {
// Build a ping job. Job validation requires non-empty context and caller
// IDs; the values below are internal health-check markers (illustrative).
let ping_job = self
.new_job()
.script_type(script_type.clone())
.context_id("health_check")
.caller_id("supervisor")
.script("ping") // Simple ping script
.timeout(Duration::from_secs(30))
.build()?;
// Execute the ping job with a short timeout
match self.run_job_and_await_result(&ping_job).await {
Ok(_) => {
debug!("Ping job successful for script type: {:?}", script_type);
Ok(())
}
Err(e) => {
warn!("Ping job failed for script type {:?}: {}", script_type, e);
Err(SupervisorError::PingJobFailed(format!("{:?}", script_type), e.to_string()))
}
}
}
/// Create Zinit service configuration from worker config
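///
/// For a worker with a health check and one dependency, the generated config
/// is shaped roughly like this (illustrative, not exhaustive):
///
/// ```ignore
/// {
///   "exec": "/usr/local/bin/osis_worker --redis-url redis://localhost:6379",
///   "oneshot": false,
///   "test": "/usr/local/bin/osis_worker --health-check",
///   "after": ["redis"],
///   "env": {"RUST_LOG": "info"}
/// }
/// ```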
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
let mut config = json!({
"exec": format!("{} {}",
worker.binary_path.display(),
worker.args.join(" ")
),
"oneshot": !worker.restart_on_exit,
});
if let Some(health_check) = &worker.health_check {
config["test"] = json!(health_check);
}
if !worker.dependencies.is_empty() {
config["after"] = json!(worker.dependencies);
}
// Add environment variables if any
if !worker.env.is_empty() {
config["env"] = json!(worker.env);
}
config
}
/// Launch workers based on SupervisorBuilder configuration
pub(crate) async fn launch_configured_workers(&self, builder: &crate::SupervisorBuilderData) -> Result<(), SupervisorError> {
// Launch OSIS worker if configured
if let Some(binary_path) = &builder.osis_worker {
let worker_id = "osis_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
PathBuf::from(binary_path),
ScriptType::OSIS
);
config.env.extend(builder.worker_env_vars.clone());
info!("Launching OSIS worker: {}", worker_id);
self.start_worker(&config).await?;
}
// Launch SAL worker if configured
if let Some(binary_path) = &builder.sal_worker {
let worker_id = "sal_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
PathBuf::from(binary_path),
ScriptType::SAL
);
config.env.extend(builder.worker_env_vars.clone());
info!("Launching SAL worker: {}", worker_id);
self.start_worker(&config).await?;
}
// Launch V worker if configured
if let Some(binary_path) = &builder.v_worker {
let worker_id = "v_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
PathBuf::from(binary_path),
ScriptType::V
);
config.env.extend(builder.worker_env_vars.clone());
info!("Launching V worker: {}", worker_id);
self.start_worker(&config).await?;
}
// Launch Python worker if configured
if let Some(binary_path) = &builder.python_worker {
let worker_id = "python_worker_1";
let mut config = WorkerConfig::new(
worker_id.to_string(),
PathBuf::from(binary_path),
ScriptType::Python
);
config.env.extend(builder.worker_env_vars.clone());
info!("Launching Python worker: {}", worker_id);
self.start_worker(&config).await?;
}
Ok(())
}
}