wip
This commit is contained in:
1
core/supervisor/.gitignore
vendored
Normal file
1
core/supervisor/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/target
|
26
core/supervisor/Cargo.toml
Normal file
26
core/supervisor/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "hero_supervisor"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[[bin]]
|
||||
name = "supervisor"
|
||||
path = "cmd/supervisor.rs"
|
||||
|
||||
[dependencies]
|
||||
clap = { version = "4.4", features = ["derive"] }
|
||||
env_logger = "0.10"
|
||||
redis = { version = "0.25.0", features = ["tokio-comp"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
uuid = { version = "1.6", features = ["v4", "serde"] }
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
log = "0.4"
|
||||
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } # For async main in examples, and general async
|
||||
colored = "2.0"
|
||||
hero_job = { path = "../job" }
|
||||
zinit-client = "0.4.0"
|
||||
|
||||
[dev-dependencies] # For examples later
|
||||
env_logger = "0.10"
|
||||
rhai = "1.18.0" # For examples that might need to show engine setup
|
315
core/supervisor/LIFECYCLE.md
Normal file
315
core/supervisor/LIFECYCLE.md
Normal file
@@ -0,0 +1,315 @@
|
||||
# Worker Lifecycle Management
|
||||
|
||||
The Hero Supervisor includes comprehensive worker lifecycle management functionality using [Zinit](https://github.com/threefoldtech/zinit) as the process manager. This enables the supervisor to manage worker processes, perform health monitoring, and implement load balancing.
|
||||
|
||||
## Overview
|
||||
|
||||
The lifecycle management system provides:
|
||||
|
||||
- **Worker Process Management**: Start, stop, restart, and monitor worker binaries
|
||||
- **Health Monitoring**: Automatic ping jobs every 10 minutes for idle workers
|
||||
- **Load Balancing**: Dynamic scaling of workers based on demand
|
||||
- **Service Dependencies**: Proper startup ordering with dependency management
|
||||
- **Graceful Shutdown**: Clean termination of worker processes
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||
│ Supervisor │ │ WorkerLifecycle │ │ Zinit │
|
||||
│ │◄──►│ Manager │◄──►│ (Process │
|
||||
│ (Job Dispatch) │ │ │ │ Manager) │
|
||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||
│ │ │
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||
│ Redis │ │ Health Monitor │ │ Worker Binaries │
|
||||
│ (Job Queue) │ │ (Ping Jobs) │ │ (OSIS/SAL/V) │
|
||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
## Components
|
||||
|
||||
### WorkerConfig
|
||||
|
||||
Defines configuration for a worker binary:
|
||||
|
||||
```rust
|
||||
use hero_supervisor::{WorkerConfig, ScriptType};
|
||||
use std::path::PathBuf;
|
||||
use std::collections::HashMap;
|
||||
|
||||
let config = WorkerConfig::new(
|
||||
"osis_worker_0".to_string(),
|
||||
PathBuf::from("/usr/local/bin/osis_worker"),
|
||||
ScriptType::OSIS,
|
||||
)
|
||||
.with_args(vec![
|
||||
"--redis-url".to_string(),
|
||||
"redis://localhost:6379".to_string(),
|
||||
"--worker-id".to_string(),
|
||||
"osis_worker_0".to_string(),
|
||||
])
|
||||
.with_env({
|
||||
let mut env = HashMap::new();
|
||||
env.insert("RUST_LOG".to_string(), "info".to_string());
|
||||
env.insert("WORKER_TYPE".to_string(), "osis".to_string());
|
||||
env
|
||||
})
|
||||
.with_health_check("/usr/local/bin/osis_worker --health-check".to_string())
|
||||
.with_dependencies(vec!["redis".to_string()]);
|
||||
```
|
||||
|
||||
### WorkerLifecycleManager
|
||||
|
||||
Main component for managing worker lifecycles:
|
||||
|
||||
```rust
|
||||
use hero_supervisor::{WorkerLifecycleManagerBuilder, Supervisor};
|
||||
|
||||
let supervisor = SupervisorBuilder::new()
|
||||
.redis_url("redis://localhost:6379")
|
||||
.caller_id("my_supervisor")
|
||||
.context_id("production")
|
||||
.build()?;
|
||||
|
||||
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new("/var/run/zinit.sock".to_string())
|
||||
.with_supervisor(supervisor.clone())
|
||||
.add_worker(osis_worker_config)
|
||||
.add_worker(sal_worker_config)
|
||||
.add_worker(v_worker_config)
|
||||
.build();
|
||||
```
|
||||
|
||||
## Supported Script Types
|
||||
|
||||
The lifecycle manager supports all Hero script types:
|
||||
|
||||
- **OSIS**: Rhai/HeroScript execution workers
|
||||
- **SAL**: System Abstraction Layer workers
|
||||
- **V**: HeroScript execution in V language
|
||||
- **Python**: HeroScript execution in Python
|
||||
|
||||
## Key Features
|
||||
|
||||
### 1. Worker Management
|
||||
|
||||
```rust
|
||||
// Start all configured workers
|
||||
lifecycle_manager.start_all_workers().await?;
|
||||
|
||||
// Stop all workers
|
||||
lifecycle_manager.stop_all_workers().await?;
|
||||
|
||||
// Restart specific worker
|
||||
lifecycle_manager.restart_worker("osis_worker_0").await?;
|
||||
|
||||
// Get worker status
|
||||
let status = lifecycle_manager.get_worker_status("osis_worker_0").await?;
|
||||
println!("Worker state: {:?}, PID: {}", status.state, status.pid);
|
||||
```
|
||||
|
||||
### 2. Health Monitoring
|
||||
|
||||
The system automatically monitors worker health:
|
||||
|
||||
- Tracks last job execution time for each worker
|
||||
- Sends ping jobs to workers idle for 10+ minutes
|
||||
- Restarts workers that fail ping checks 3 times
|
||||
- Updates job times when workers receive tasks
|
||||
|
||||
```rust
|
||||
// Manual health check
|
||||
lifecycle_manager.monitor_worker_health().await?;
|
||||
|
||||
// Update job time (called automatically by supervisor)
|
||||
lifecycle_manager.update_worker_job_time("osis_worker_0");
|
||||
|
||||
// Start continuous health monitoring
|
||||
lifecycle_manager.start_health_monitoring().await; // Runs forever
|
||||
```
|
||||
|
||||
### 3. Dynamic Scaling
|
||||
|
||||
Scale workers up or down based on demand:
|
||||
|
||||
```rust
|
||||
// Scale OSIS workers to 5 instances
|
||||
lifecycle_manager.scale_workers(&ScriptType::OSIS, 5).await?;
|
||||
|
||||
// Scale down SAL workers to 1 instance
|
||||
lifecycle_manager.scale_workers(&ScriptType::SAL, 1).await?;
|
||||
|
||||
// Check current running count
|
||||
let count = lifecycle_manager.get_running_worker_count(&ScriptType::V).await;
|
||||
println!("Running V workers: {}", count);
|
||||
```
|
||||
|
||||
### 4. Service Dependencies
|
||||
|
||||
Workers can depend on other services:
|
||||
|
||||
```rust
|
||||
let config = WorkerConfig::new(name, binary, script_type)
|
||||
.with_dependencies(vec![
|
||||
"redis".to_string(),
|
||||
"database".to_string(),
|
||||
"auth_service".to_string(),
|
||||
]);
|
||||
```
|
||||
|
||||
Zinit ensures dependencies start before the worker.
|
||||
|
||||
## Integration with Supervisor
|
||||
|
||||
The lifecycle manager integrates seamlessly with the supervisor:
|
||||
|
||||
```rust
|
||||
use hero_supervisor::{Supervisor, WorkerLifecycleManager};
|
||||
|
||||
// Create supervisor and lifecycle manager
|
||||
let supervisor = SupervisorBuilder::new().build()?;
|
||||
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new(zinit_socket)
|
||||
.with_supervisor(supervisor.clone())
|
||||
.build();
|
||||
|
||||
// Start workers
|
||||
lifecycle_manager.start_all_workers().await?;
|
||||
|
||||
// Create and execute jobs (supervisor automatically routes to workers)
|
||||
let job = supervisor
|
||||
.new_job()
|
||||
.script_type(ScriptType::OSIS)
|
||||
.script_content("println!(\"Hello World!\");".to_string())
|
||||
.build()?;
|
||||
|
||||
let result = supervisor.run_job_and_await_result(&job).await?;
|
||||
println!("Job result: {}", result);
|
||||
```
|
||||
|
||||
## Zinit Service Configuration
|
||||
|
||||
The lifecycle manager automatically creates Zinit service configurations:
|
||||
|
||||
```yaml
|
||||
# Generated service config for osis_worker_0
|
||||
exec: "/usr/local/bin/osis_worker --redis-url redis://localhost:6379 --worker-id osis_worker_0"
|
||||
test: "/usr/local/bin/osis_worker --health-check"
|
||||
oneshot: false # Restart on exit
|
||||
after:
|
||||
- redis
|
||||
env:
|
||||
RUST_LOG: "info"
|
||||
WORKER_TYPE: "osis"
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The system provides comprehensive error handling:
|
||||
|
||||
```rust
|
||||
use hero_supervisor::SupervisorError;
|
||||
|
||||
match lifecycle_manager.start_worker(&config).await {
|
||||
Ok(_) => println!("Worker started successfully"),
|
||||
Err(SupervisorError::WorkerStartFailed(worker, reason)) => {
|
||||
eprintln!("Failed to start {}: {}", worker, reason);
|
||||
}
|
||||
Err(e) => eprintln!("Other error: {}", e),
|
||||
}
|
||||
```
|
||||
|
||||
## Example Usage
|
||||
|
||||
See `examples/lifecycle_demo.rs` for a comprehensive demonstration:
|
||||
|
||||
```bash
|
||||
# Run the lifecycle demo
|
||||
cargo run --example lifecycle_demo
|
||||
|
||||
# Run with custom Redis URL
|
||||
REDIS_URL=redis://localhost:6379 cargo run --example lifecycle_demo
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Zinit**: Install and run Zinit process manager
|
||||
```bash
|
||||
curl https://raw.githubusercontent.com/threefoldtech/zinit/refs/heads/master/install.sh | bash
|
||||
zinit init --config /etc/zinit/ --socket /var/run/zinit.sock
|
||||
```
|
||||
|
||||
2. **Redis**: Running Redis instance for job queues
|
||||
```bash
|
||||
redis-server
|
||||
```
|
||||
|
||||
3. **Worker Binaries**: Compiled worker binaries for each script type
|
||||
- `/usr/local/bin/osis_worker`
|
||||
- `/usr/local/bin/sal_worker`
|
||||
- `/usr/local/bin/v_worker`
|
||||
- `/usr/local/bin/python_worker`
|
||||
|
||||
## Configuration Best Practices
|
||||
|
||||
1. **Resource Limits**: Configure appropriate resource limits in Zinit
|
||||
2. **Health Checks**: Implement meaningful health check commands
|
||||
3. **Dependencies**: Define proper service dependencies
|
||||
4. **Environment**: Set appropriate environment variables
|
||||
5. **Logging**: Configure structured logging for debugging
|
||||
6. **Monitoring**: Use health monitoring for production deployments
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Zinit Connection Failed**
|
||||
- Ensure Zinit is running: `ps aux | grep zinit`
|
||||
- Check socket permissions: `ls -la /var/run/zinit.sock`
|
||||
- Verify socket path in configuration
|
||||
|
||||
2. **Worker Start Failed**
|
||||
- Check binary exists and is executable
|
||||
- Verify dependencies are running
|
||||
- Review Zinit logs: `zinit logs <service-name>`
|
||||
|
||||
3. **Health Check Failures**
|
||||
- Implement proper health check endpoint in workers
|
||||
- Verify health check command syntax
|
||||
- Check worker responsiveness
|
||||
|
||||
4. **Redis Connection Issues**
|
||||
- Ensure Redis is running and accessible
|
||||
- Verify Redis URL configuration
|
||||
- Check network connectivity
|
||||
|
||||
### Debug Commands
|
||||
|
||||
```bash
|
||||
# Check Zinit status
|
||||
zinit list
|
||||
|
||||
# View service logs
|
||||
zinit logs osis_worker_0
|
||||
|
||||
# Check service status
|
||||
zinit status osis_worker_0
|
||||
|
||||
# Monitor Redis queues
|
||||
redis-cli keys "hero:job:*"
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- **Scaling**: Start with minimal workers and scale based on queue depth
|
||||
- **Health Monitoring**: Adjust ping intervals based on workload patterns
|
||||
- **Resource Usage**: Monitor CPU/memory usage of worker processes
|
||||
- **Queue Depth**: Monitor Redis queue lengths for scaling decisions
|
||||
|
||||
## Security
|
||||
|
||||
- **Process Isolation**: Zinit provides process isolation
|
||||
- **User Permissions**: Run workers with appropriate user permissions
|
||||
- **Network Security**: Secure Redis and Zinit socket access
|
||||
- **Binary Validation**: Verify worker binary integrity before deployment
|
103
core/supervisor/README.md
Normal file
103
core/supervisor/README.md
Normal file
@@ -0,0 +1,103 @@
|
||||
# Hero Supervisor
|
||||
|
||||
The **Hero Supervisor** is responsible for supervising the lifecycle of workers and dispatching jobs to them via Redis queues.
|
||||
|
||||
## Overview
|
||||
|
||||
The system involves four primary actors:
|
||||
|
||||
1. **OSIS**: A worker that executes Rhai and HeroScript.
|
||||
2. **SAL**: A worker that performs system abstraction layer functionalities using Rhai.
|
||||
3. **V**: A worker that executes HeroScript in the V programming language.
|
||||
4. **Python**: A worker that executes HeroScript in Python.
|
||||
|
||||
The Supervisor utilizes **zinit** to start and monitor these workers, ensuring they are running correctly.
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Worker Lifecycle Supervision**: Oversee the lifecycle of workers, including starting, stopping, restarting, and load balancing based on job demand.
|
||||
- **Job Supervision**: API for efficiently managing jobs dispatched to workers over Redis queues.
|
||||
|
||||
## Worker Lifecycle Supervision
|
||||
|
||||
The Supervisor oversees the lifecycle of the workers, ensuring they are operational and efficiently allocated. Load balancing is implemented to dynamically adjust the number of active workers based on job demand.
|
||||
|
||||
Additionally, the Supervisor implements health monitoring for worker engines: if a worker engine does not receive a job within 10 minutes, the Supervisor sends a ping job. The engine must respond immediately; if it fails to do so, the Supervisor restarts that worker engine.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
**Important**: Before running any lifecycle examples or using worker management features, you must start the Zinit daemon:
|
||||
|
||||
```bash
|
||||
# Start Zinit daemon (required for worker lifecycle management)
|
||||
sudo zinit init
|
||||
|
||||
# Or start Zinit with a custom socket path
|
||||
sudo zinit --socket /var/run/zinit.sock init
|
||||
```
|
||||
|
||||
**Note**: The Supervisor uses Zinit as the process manager for worker lifecycle operations. The default socket path is `/var/run/zinit.sock`, but you can configure a custom path using the `SupervisorBuilder::zinit_socket_path()` method.
|
||||
|
||||
**Troubleshooting**: If you get connection errors when running examples, ensure:
|
||||
1. Zinit daemon is running (`zinit list` should work)
|
||||
2. The socket path matches between Zinit and your Supervisor configuration
|
||||
3. You have appropriate permissions to access the Zinit socket
|
||||
|
||||
### Supervisor API for Worker Lifecycle
|
||||
|
||||
The Supervisor provides the following methods for supervising the worker lifecycle:
|
||||
|
||||
- **`start_worker()`**: Initializes and starts a specified worker.
|
||||
- **`stop_worker()`**: Gracefully stops a specified worker.
|
||||
- **`restart_worker()`**: Restarts a specified worker to ensure it operates correctly.
|
||||
- **`get_worker_status()`**: Checks the status of a specific worker.
|
||||
|
||||
## Job Supervision
|
||||
|
||||
Jobs are dispatched to workers through their designated Redis queues, and the Supervisor provides an API for comprehensive job supervision.
|
||||
|
||||
### Supervisor API for Job Supervision
|
||||
|
||||
The Supervisor offers the following methods for handling jobs:
|
||||
|
||||
- **`new_job()`**: Creates a new `JobBuilder` for configuring a job.
|
||||
- **`create_job()`**: Stores a job in Redis.
|
||||
- **`run_job_and_await_result()`**: Executes a job and waits for its completion.
|
||||
- **`get_job_status()`**: Checks the current execution status of a job.
|
||||
- **`get_job_output()`**: Retrieves the results of a completed job.
|
||||
|
||||
## Running Examples
|
||||
|
||||
The supervisor includes several examples demonstrating lifecycle management:
|
||||
|
||||
```bash
|
||||
# 1. First, start the Zinit daemon
|
||||
sudo zinit init
|
||||
|
||||
# 2. In another terminal, start Redis (if not already running)
|
||||
redis-server
|
||||
|
||||
# 3. Run the lifecycle demo
|
||||
cargo run --example simple_lifecycle_demo
|
||||
|
||||
# Or run the comprehensive lifecycle demo
|
||||
cargo run --example lifecycle_demo
|
||||
```
|
||||
|
||||
**Example Configuration**: The examples use these default paths:
|
||||
- Redis: `redis://localhost:6379`
|
||||
- Zinit socket: `/var/run/zinit.sock`
|
||||
|
||||
You can modify these in the example source code if your setup differs.
|
||||
|
||||
### Redis Schema for Job Supervision
|
||||
|
||||
Jobs are managed within the `hero:` namespace in Redis:
|
||||
|
||||
- **`hero:job:{job_id}`**: Stores job parameters as a Redis hash.
|
||||
- **`hero:work_queue:{worker_id}`**: Contains worker-specific job queues for dispatching jobs.
|
||||
- **`hero:reply:{job_id}`**: Dedicated queues for job results.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A Redis server must be accessible to both the Supervisor and the workers.
|
157
core/supervisor/cmd/README.md
Normal file
157
core/supervisor/cmd/README.md
Normal file
@@ -0,0 +1,157 @@
|
||||
# Rhai Client Binary
|
||||
|
||||
A command-line client for executing Rhai scripts on remote workers via Redis.
|
||||
|
||||
## Binary: `client`
|
||||
|
||||
### Installation
|
||||
|
||||
Build the binary:
|
||||
```bash
|
||||
cargo build --bin client --release
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Basic usage - requires caller and circle keys
|
||||
client --caller-key <CALLER_KEY> --circle-key <CIRCLE_KEY>
|
||||
|
||||
# Execute inline script
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --script "print('Hello World!')"
|
||||
|
||||
# Execute script from file
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --file script.rhai
|
||||
|
||||
# Use specific worker (defaults to circle key)
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> -w <WORKER_KEY> --script "2 + 2"
|
||||
|
||||
# Custom Redis and timeout
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --redis-url redis://localhost:6379/1 --timeout 60
|
||||
|
||||
# Remove timestamps from logs
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> --no-timestamp
|
||||
|
||||
# Increase verbosity
|
||||
client -c <CALLER_KEY> -k <CIRCLE_KEY> -v --script "debug_info()"
|
||||
```
|
||||
|
||||
### Command-Line Options
|
||||
|
||||
| Option | Short | Default | Description |
|
||||
|--------|-------|---------|-------------|
|
||||
| `--caller-key` | `-c` | **Required** | Caller public key (your identity) |
|
||||
| `--circle-key` | `-k` | **Required** | Circle public key (execution context) |
|
||||
| `--worker-key` | `-w` | `circle-key` | Worker public key (target worker) |
|
||||
| `--redis-url` | `-r` | `redis://localhost:6379` | Redis connection URL |
|
||||
| `--script` | `-s` | | Rhai script to execute |
|
||||
| `--file` | `-f` | | Path to Rhai script file |
|
||||
| `--timeout` | `-t` | `30` | Timeout for script execution (seconds) |
|
||||
| `--no-timestamp` | | `false` | Remove timestamps from log output |
|
||||
| `--verbose` | `-v` | | Increase verbosity (stackable) |
|
||||
|
||||
### Execution Modes
|
||||
|
||||
#### Inline Script Execution
|
||||
```bash
|
||||
# Execute a simple calculation
|
||||
client -c caller_123 -k circle_456 -s "let result = 2 + 2; print(result);"
|
||||
|
||||
# Execute with specific worker
|
||||
client -c caller_123 -k circle_456 -w worker_789 -s "get_user_data()"
|
||||
```
|
||||
|
||||
#### Script File Execution
|
||||
```bash
|
||||
# Execute script from file
|
||||
client -c caller_123 -k circle_456 -f examples/data_processing.rhai
|
||||
|
||||
# Execute with custom timeout
|
||||
client -c caller_123 -k circle_456 -f long_running_script.rhai -t 120
|
||||
```
|
||||
|
||||
#### Interactive Mode
|
||||
```bash
|
||||
# Enter interactive REPL mode (when no script or file provided)
|
||||
client -c caller_123 -k circle_456
|
||||
|
||||
# Interactive mode with verbose logging
|
||||
client -c caller_123 -k circle_456 -v --no-timestamp
|
||||
```
|
||||
|
||||
### Interactive Mode
|
||||
|
||||
When no script (`-s`) or file (`-f`) is provided, the client enters interactive mode:
|
||||
|
||||
```
|
||||
🔗 Starting Rhai Client
|
||||
📋 Configuration:
|
||||
Caller Key: caller_123
|
||||
Circle Key: circle_456
|
||||
Worker Key: circle_456
|
||||
Redis URL: redis://localhost:6379
|
||||
Timeout: 30s
|
||||
|
||||
✅ Connected to Redis at redis://localhost:6379
|
||||
🎮 Entering interactive mode
|
||||
Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.
|
||||
rhai> let x = 42; print(x);
|
||||
Status: completed
|
||||
Output: 42
|
||||
rhai> exit
|
||||
👋 Goodbye!
|
||||
```
|
||||
|
||||
### Configuration Examples
|
||||
|
||||
#### Development Usage
|
||||
```bash
|
||||
# Simple development client
|
||||
client -c dev_user -k dev_circle
|
||||
|
||||
# Development with clean logs
|
||||
client -c dev_user -k dev_circle --no-timestamp -v
|
||||
```
|
||||
|
||||
#### Production Usage
|
||||
```bash
|
||||
# Production client with specific worker
|
||||
client \
|
||||
--caller-key prod_user_123 \
|
||||
--circle-key prod_circle_456 \
|
||||
--worker-key prod_worker_789 \
|
||||
--redis-url redis://redis-cluster:6379/0 \
|
||||
--timeout 300 \
|
||||
--file production_script.rhai
|
||||
```
|
||||
|
||||
#### Batch Processing
|
||||
```bash
|
||||
# Process multiple scripts
|
||||
for script in scripts/*.rhai; do
|
||||
client -c batch_user -k batch_circle -f "$script" --no-timestamp
|
||||
done
|
||||
```
|
||||
|
||||
### Key Concepts
|
||||
|
||||
- **Caller Key**: Your identity - used for authentication and tracking
|
||||
- **Circle Key**: Execution context - defines the environment/permissions
|
||||
- **Worker Key**: Target worker - which worker should execute the script (defaults to circle key)
|
||||
|
||||
### Error Handling
|
||||
|
||||
The client provides clear error messages for:
|
||||
- Missing required keys
|
||||
- Redis connection failures
|
||||
- Script execution timeouts
|
||||
- Worker unavailability
|
||||
- Script syntax errors
|
||||
|
||||
### Dependencies
|
||||
|
||||
- `rhai_supervisor`: Core client library for Redis-based script execution
|
||||
- `redis`: Redis client for task queue communication
|
||||
- `clap`: Command-line argument parsing
|
||||
- `env_logger`: Logging infrastructure
|
||||
- `tokio`: Async runtime
|
236
core/supervisor/cmd/supervisor.rs
Normal file
236
core/supervisor/cmd/supervisor.rs
Normal file
@@ -0,0 +1,236 @@
|
||||
use clap::Parser;
|
||||
use hero_supervisor::{Supervisor, SupervisorBuilder, ScriptType};
|
||||
use log::{error, info};
|
||||
use colored::Colorize;
|
||||
use std::io::{self, Write};
|
||||
use std::time::Duration;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(author, version, about = "Rhai Client - Script execution client", long_about = None)]
|
||||
struct Args {
|
||||
/// Caller ID (your identity)
|
||||
#[arg(short = 'c', long = "caller-id", help = "Caller ID (your identity)")]
|
||||
caller_id: String,
|
||||
|
||||
/// Context ID (execution context)
|
||||
#[arg(short = 'k', long = "context-id", help = "Context ID (execution context)")]
|
||||
context_id: String,
|
||||
|
||||
/// Script type to execute (osis, sal, v, python)
|
||||
#[arg(short = 'T', long = "script-type", default_value = "osis", help = "Script type: osis, sal, v, or python")]
|
||||
script_type: String,
|
||||
|
||||
/// Redis URL
|
||||
#[arg(short, long, default_value = "redis://localhost:6379", help = "Redis connection URL")]
|
||||
redis_url: String,
|
||||
|
||||
/// Rhai script to execute
|
||||
#[arg(short, long, help = "Rhai script to execute")]
|
||||
script: Option<String>,
|
||||
|
||||
/// Path to Rhai script file
|
||||
#[arg(short, long, help = "Path to Rhai script file")]
|
||||
file: Option<String>,
|
||||
|
||||
/// Timeout for script execution (in seconds)
|
||||
#[arg(short, long, default_value = "30", help = "Timeout for script execution in seconds")]
|
||||
timeout: u64,
|
||||
|
||||
/// Increase verbosity (can be used multiple times)
|
||||
#[arg(short, long, action = clap::ArgAction::Count, help = "Increase verbosity (-v for debug, -vv for trace)")]
|
||||
verbose: u8,
|
||||
|
||||
/// Disable timestamps in log output
|
||||
#[arg(long, help = "Remove timestamps from log output")]
|
||||
no_timestamp: bool,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let args = Args::parse();
|
||||
|
||||
// Configure logging based on verbosity level
|
||||
let log_config = match args.verbose {
|
||||
0 => "warn,hero_supervisor=warn",
|
||||
1 => "info,hero_supervisor=info",
|
||||
2 => "debug,hero_supervisor=debug",
|
||||
_ => "trace,hero_supervisor=trace",
|
||||
};
|
||||
|
||||
std::env::set_var("RUST_LOG", log_config);
|
||||
|
||||
// Configure env_logger with or without timestamps
|
||||
if args.no_timestamp {
|
||||
env_logger::Builder::from_default_env()
|
||||
.format_timestamp(None)
|
||||
.init();
|
||||
} else {
|
||||
env_logger::init();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Validate script type
|
||||
match args.script_type.to_lowercase().as_str() {
|
||||
"osis" | "sal" | "v" | "python" => {
|
||||
// Valid script types - no worker validation needed since we use hardcoded queues
|
||||
}
|
||||
_ => {
|
||||
error!("❌ Invalid script type: {}. Valid types: osis, sal, v, python", args.script_type);
|
||||
return Err(format!("Invalid script type: {}", args.script_type).into());
|
||||
}
|
||||
}
|
||||
|
||||
if args.verbose > 0 {
|
||||
info!("🔗 Starting Hero Supervisor");
|
||||
info!("📋 Configuration:");
|
||||
info!(" Caller ID: {}", args.caller_id);
|
||||
info!(" Context ID: {}", args.context_id);
|
||||
info!(" Script Type: {}", args.script_type);
|
||||
info!(" Redis URL: {}", args.redis_url);
|
||||
info!(" Timeout: {}s", args.timeout);
|
||||
info!(" Using hardcoded worker queues for script type: {}", args.script_type);
|
||||
info!("");
|
||||
}
|
||||
|
||||
// Create the supervisor client
|
||||
let client = SupervisorBuilder::new()
|
||||
.redis_url(&args.redis_url)
|
||||
.build()?;
|
||||
|
||||
if args.verbose > 0 {
|
||||
info!("✅ Connected to Redis at {}", args.redis_url);
|
||||
}
|
||||
|
||||
// Determine execution mode
|
||||
if let Some(script_content) = args.script {
|
||||
// Execute inline script
|
||||
if args.verbose > 0 {
|
||||
info!("📜 Executing inline script");
|
||||
}
|
||||
execute_script(&client, script_content, &args.script_type, args.timeout).await?;
|
||||
} else if let Some(file_path) = args.file {
|
||||
// Execute script from file
|
||||
if args.verbose > 0 {
|
||||
info!("📁 Loading script from file: {}", file_path);
|
||||
}
|
||||
let script_content = std::fs::read_to_string(&file_path)
|
||||
.map_err(|e| format!("Failed to read script file '{}': {}", file_path, e))?;
|
||||
execute_script(&client, script_content, &args.script_type, args.timeout).await?;
|
||||
} else {
|
||||
// Interactive mode
|
||||
info!("🎮 Entering interactive mode");
|
||||
info!("Type Rhai scripts and press Enter to execute. Type 'exit' or 'quit' to close.");
|
||||
run_interactive_mode(&client, &args.script_type, args.timeout, args.verbose).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn execute_script(
|
||||
client: &Supervisor,
|
||||
script: String,
|
||||
script_type_str: &str,
|
||||
timeout_secs: u64,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
info!("⚡ Executing script: {:.50}...", script);
|
||||
|
||||
// Parse script type
|
||||
let script_type = match script_type_str.to_lowercase().as_str() {
|
||||
"osis" => ScriptType::OSIS,
|
||||
"sal" => ScriptType::SAL,
|
||||
"v" => ScriptType::V,
|
||||
"python" => ScriptType::Python,
|
||||
_ => {
|
||||
error!("❌ Invalid script type: {}. Valid types: osis, sal, v, python", script_type_str);
|
||||
return Err(format!("Invalid script type: {}", script_type_str).into());
|
||||
}
|
||||
};
|
||||
|
||||
let timeout = Duration::from_secs(timeout_secs);
|
||||
|
||||
match client
|
||||
.new_job()
|
||||
.script_type(script_type)
|
||||
.script(&script)
|
||||
.timeout(timeout)
|
||||
.await_response()
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
info!("✅ Script execution completed");
|
||||
println!("{}", "Result:".green().bold());
|
||||
println!("{}", result);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("❌ Script execution failed: {}", e);
|
||||
return Err(Box::new(e));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_interactive_mode(
|
||||
client: &Supervisor,
|
||||
script_type_str: &str,
|
||||
timeout_secs: u64,
|
||||
verbose: u8,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Parse script type
|
||||
let script_type = match script_type_str.to_lowercase().as_str() {
|
||||
"osis" => ScriptType::OSIS,
|
||||
"sal" => ScriptType::SAL,
|
||||
"v" => ScriptType::V,
|
||||
"python" => ScriptType::Python,
|
||||
_ => {
|
||||
error!("❌ Invalid script type: {}. Valid types: osis, sal, v, python", script_type_str);
|
||||
return Err(format!("Invalid script type: {}", script_type_str).into());
|
||||
}
|
||||
};
|
||||
|
||||
let timeout = Duration::from_secs(timeout_secs);
|
||||
|
||||
loop {
|
||||
print!("rhai> ");
|
||||
io::stdout().flush()?;
|
||||
|
||||
let mut input = String::new();
|
||||
io::stdin().read_line(&mut input)?;
|
||||
|
||||
let input = input.trim();
|
||||
|
||||
if input.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if input == "exit" || input == "quit" {
|
||||
info!("👋 Goodbye!");
|
||||
break;
|
||||
}
|
||||
|
||||
if verbose > 0 {
|
||||
info!("⚡ Executing: {}", input);
|
||||
}
|
||||
|
||||
match client
|
||||
.new_job()
|
||||
.script_type(script_type.clone())
|
||||
.script(input)
|
||||
.timeout(timeout)
|
||||
.await_response()
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
println!("{}", result.green());
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{}", format!("error: {}", e).red());
|
||||
}
|
||||
}
|
||||
|
||||
println!(); // Add blank line for readability
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
190
core/supervisor/docs/ARCHITECTURE.md
Normal file
190
core/supervisor/docs/ARCHITECTURE.md
Normal file
@@ -0,0 +1,190 @@
|
||||
# Architecture of the `rhai_supervisor` Crate
|
||||
|
||||
The `rhai_supervisor` crate provides a Redis-based client library for submitting Rhai scripts to distributed worker services and awaiting their execution results. It implements a request-reply pattern using Redis as the message broker.
|
||||
|
||||
## Core Architecture
|
||||
|
||||
The client follows a builder pattern design with clear separation of concerns:
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
A[RhaiSupervisorBuilder] --> B[RhaiSupervisor]
|
||||
B --> C[PlayRequestBuilder]
|
||||
C --> D[PlayRequest]
|
||||
D --> E[Redis Task Queue]
|
||||
E --> F[Worker Service]
|
||||
F --> G[Redis Reply Queue]
|
||||
G --> H[Client Response]
|
||||
|
||||
subgraph "Client Components"
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
end
|
||||
|
||||
subgraph "Redis Infrastructure"
|
||||
E
|
||||
G
|
||||
end
|
||||
|
||||
subgraph "External Services"
|
||||
F
|
||||
end
|
||||
```
|
||||
|
||||
## Key Components
|
||||
|
||||
### 1. RhaiSupervisorBuilder
|
||||
|
||||
A builder pattern implementation for constructing `RhaiSupervisor` instances with proper configuration validation.
|
||||
|
||||
**Responsibilities:**
|
||||
- Configure Redis connection URL
|
||||
- Set caller ID for task attribution
|
||||
- Validate configuration before building client
|
||||
|
||||
**Key Methods:**
|
||||
- `caller_id(id: &str)` - Sets the caller identifier
|
||||
- `redis_url(url: &str)` - Configures Redis connection
|
||||
- `build()` - Creates the final `RhaiSupervisor` instance
|
||||
|
||||
### 2. RhaiSupervisor
|
||||
|
||||
The main client interface that manages Redis connections and provides factory methods for creating play requests.
|
||||
|
||||
**Responsibilities:**
|
||||
- Maintain Redis connection pool
|
||||
- Provide factory methods for request builders
|
||||
- Handle low-level Redis operations
|
||||
- Manage task status queries
|
||||
|
||||
**Key Methods:**
|
||||
- `new_play_request()` - Creates a new `PlayRequestBuilder`
|
||||
- `get_task_status(task_id)` - Queries task status from Redis
|
||||
- Internal methods for Redis operations
|
||||
|
||||
### 3. PlayRequestBuilder
|
||||
|
||||
A fluent builder for constructing and submitting script execution requests.
|
||||
|
||||
**Responsibilities:**
|
||||
- Configure script execution parameters
|
||||
- Handle script loading from files or strings
|
||||
- Manage request timeouts
|
||||
- Provide submission methods (fire-and-forget vs await-response)
|
||||
|
||||
**Key Methods:**
|
||||
- `worker_id(id: &str)` - Target worker queue (determines which worker processes the task)
|
||||
- `context_id(id: &str)` - Target context ID (determines execution context/circle)
|
||||
- `script(content: &str)` - Set script content directly
|
||||
- `script_path(path: &str)` - Load script from file
|
||||
- `timeout(duration: Duration)` - Set execution timeout
|
||||
- `submit()` - Fire-and-forget submission
|
||||
- `await_response()` - Submit and wait for result
|
||||
|
||||
**Architecture Note:** The decoupling of `worker_id` and `context_id` allows a single worker to process tasks for multiple contexts (circles), providing greater deployment flexibility.
|
||||
|
||||
### 4. Data Structures
|
||||
|
||||
#### RhaiTaskDetails
|
||||
Represents the complete state of a task throughout its lifecycle.
|
||||
|
||||
```rust
|
||||
pub struct RhaiTaskDetails {
|
||||
pub task_id: String,
|
||||
pub script: String,
|
||||
pub status: String, // "pending", "processing", "completed", "error"
|
||||
pub output: Option<String>,
|
||||
pub error: Option<String>,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub caller_id: String,
|
||||
}
|
||||
```
|
||||
|
||||
#### RhaiSupervisorError
|
||||
Comprehensive error handling for various failure scenarios:
|
||||
- `RedisError` - Redis connection/operation failures
|
||||
- `SerializationError` - JSON serialization/deserialization issues
|
||||
- `Timeout` - Task execution timeouts
|
||||
- `TaskNotFound` - Missing tasks after submission
|
||||
|
||||
## Communication Protocol
|
||||
|
||||
### Task Submission Flow
|
||||
|
||||
1. **Task Creation**: Client generates unique UUID for task identification
|
||||
2. **Task Storage**: Task details stored in Redis hash: `rhailib:<task_id>`
|
||||
3. **Queue Submission**: Task ID pushed to worker queue: `rhailib:<worker_id>`
|
||||
4. **Reply Queue Setup**: Client listens on: `rhailib:reply:<task_id>`
|
||||
|
||||
### Redis Key Patterns
|
||||
|
||||
- **Task Storage**: `rhailib:<task_id>` (Redis Hash)
|
||||
- **Worker Queues**: `rhailib:<worker_id>` (Redis List)
|
||||
- **Reply Queues**: `rhailib:reply:<task_id>` (Redis List)
|
||||
|
||||
### Message Flow Diagram
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant C as Client
|
||||
participant R as Redis
|
||||
participant W as Worker
|
||||
|
||||
C->>R: HSET rhailib:task_id (task details)
|
||||
C->>R: LPUSH rhailib:worker_id task_id
|
||||
C->>R: BLPOP rhailib:reply:task_id (blocking)
|
||||
|
||||
W->>R: BRPOP rhailib:worker_id (blocking)
|
||||
W->>W: Execute Rhai Script
|
||||
W->>R: LPUSH rhailib:reply:task_id (result)
|
||||
|
||||
R->>C: Return result from BLPOP
|
||||
C->>R: DEL rhailib:reply:task_id (cleanup)
|
||||
```
|
||||
|
||||
## Concurrency and Async Design
|
||||
|
||||
The client is built on `tokio` for asynchronous operations:
|
||||
|
||||
- **Connection Pooling**: Uses Redis multiplexed connections for efficiency
|
||||
- **Non-blocking Operations**: All Redis operations are async
|
||||
- **Timeout Handling**: Configurable timeouts with proper cleanup
|
||||
- **Error Propagation**: Comprehensive error handling with context
|
||||
|
||||
## Configuration and Deployment
|
||||
|
||||
### Prerequisites
|
||||
- Redis server accessible to both client and workers
|
||||
- Proper network connectivity between components
|
||||
- Sufficient Redis memory for task storage
|
||||
|
||||
### Configuration Options
|
||||
- **Redis URL**: Connection string for Redis instance
|
||||
- **Caller ID**: Unique identifier for client instance
|
||||
- **Timeouts**: Per-request timeout configuration
|
||||
- **Worker Targeting**: Direct worker queue addressing
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- **Task Isolation**: Each task uses unique identifiers
|
||||
- **Queue Separation**: Worker-specific queues prevent cross-contamination
|
||||
- **Cleanup**: Automatic cleanup of reply queues after completion
|
||||
- **Error Handling**: Secure error propagation without sensitive data leakage
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
- **Scalability**: Horizontal scaling through multiple worker instances
|
||||
- **Throughput**: Limited by Redis performance and network latency
|
||||
- **Memory Usage**: Efficient with connection pooling and cleanup
|
||||
- **Latency**: Low latency for local Redis deployments
|
||||
|
||||
## Integration Points
|
||||
|
||||
The client integrates with:
|
||||
- **Worker Services**: Via Redis queue protocol
|
||||
- **Monitoring Systems**: Through structured logging
|
||||
- **Application Code**: Via builder pattern API
|
||||
- **Configuration Systems**: Through environment variables and builders
|
272
core/supervisor/docs/protocol.md
Normal file
272
core/supervisor/docs/protocol.md
Normal file
@@ -0,0 +1,272 @@
|
||||
# Hero Supervisor Protocol
|
||||
|
||||
This document describes the Redis-based protocol used by the Hero Supervisor for job management and worker communication.
|
||||
|
||||
## Overview
|
||||
|
||||
The Hero Supervisor uses Redis as a message broker and data store for managing distributed job execution. Jobs are stored as Redis hashes, and communication with workers happens through Redis lists (queues).
|
||||
|
||||
## Redis Namespace
|
||||
|
||||
All supervisor-related keys use the `hero:` namespace prefix to avoid conflicts with other Redis usage.
|
||||
|
||||
## Data Structures
|
||||
|
||||
### Job Storage
|
||||
|
||||
Jobs are stored as Redis hashes with the following key pattern:
|
||||
```
|
||||
hero:job:{job_id}
|
||||
```
|
||||
|
||||
**Job Hash Fields:**
|
||||
- `id`: Unique job identifier (UUID v4)
|
||||
- `caller_id`: Identifier of the client that created the job
|
||||
- `worker_id`: Target worker identifier
|
||||
- `context_id`: Execution context identifier
|
||||
- `script`: Script content to execute (Rhai or HeroScript)
|
||||
- `timeout`: Execution timeout in seconds
|
||||
- `retries`: Number of retry attempts
|
||||
- `concurrent`: Whether to execute in separate thread (true/false)
|
||||
- `log_path`: Optional path to log file for job output
|
||||
- `created_at`: Job creation timestamp (ISO 8601)
|
||||
- `updated_at`: Job last update timestamp (ISO 8601)
|
||||
- `status`: Current job status (dispatched/started/error/finished)
|
||||
- `env_vars`: Environment variables as JSON object (optional)
|
||||
- `prerequisites`: JSON array of job IDs that must complete before this job (optional)
|
||||
- `dependents`: JSON array of job IDs that depend on this job completing (optional)
|
||||
- `output`: Job execution result (set by worker)
|
||||
- `error`: Error message if job failed (set by worker)
|
||||
- `dependencies`: JSON array of job IDs that this job depends on (see "Job Dependencies" below; note this overlaps with the `prerequisites` field above — verify which field the implementation actually reads)
|
||||
|
||||
### Job Dependencies
|
||||
|
||||
Jobs can have dependencies on other jobs, which are stored in the `dependencies` field. A job will not be dispatched until all its dependencies have completed successfully.
|
||||
|
||||
### Work Queues
|
||||
|
||||
Jobs are queued for execution using Redis lists:
|
||||
```
|
||||
hero:work_queue:{worker_id}
|
||||
```
|
||||
|
||||
Workers listen on their specific queue using `BLPOP` for job IDs to process.
|
||||
|
||||
### Stop Queues
|
||||
|
||||
Job stop requests are sent through dedicated stop queues:
|
||||
```
|
||||
hero:stop_queue:{worker_id}
|
||||
```
|
||||
|
||||
Workers monitor these queues to receive stop requests for running jobs.
|
||||
|
||||
### Reply Queues
|
||||
|
||||
For synchronous job execution, dedicated reply queues are used:
|
||||
```
|
||||
hero:reply:{job_id}
|
||||
```
|
||||
|
||||
Workers send results to these queues when jobs complete.
|
||||
|
||||
## Job Lifecycle
|
||||
|
||||
### 1. Job Creation
|
||||
```
|
||||
Client -> Redis: HSET hero:job:{job_id} {job_fields}
|
||||
```
|
||||
|
||||
### 2. Job Submission
|
||||
```
|
||||
Client -> Redis: LPUSH hero:work_queue:{worker_id} {job_id}
|
||||
```
|
||||
|
||||
### 3. Job Processing
|
||||
```
|
||||
Worker -> Redis: BLPOP hero:work_queue:{worker_id}
|
||||
Worker -> Redis: HSET hero:job:{job_id} status "started"
|
||||
Worker: Execute script
|
||||
Worker -> Redis: HSET hero:job:{job_id} status "finished" output "{result}"
|
||||
```
|
||||
|
||||
### 4. Job Completion (Async)
|
||||
```
|
||||
Worker -> Redis: LPUSH hero:reply:{job_id} {result}
|
||||
```
|
||||
|
||||
## API Operations
|
||||
|
||||
### List Jobs
|
||||
```rust
|
||||
supervisor.list_jobs() -> Vec<String>
|
||||
```
|
||||
**Redis Operations:**
|
||||
- `KEYS hero:job:*` - Get all job keys
|
||||
- Extract job IDs from key names
|
||||
|
||||
### Stop Job
|
||||
```rust
|
||||
supervisor.stop_job(job_id) -> Result<(), SupervisorError>
|
||||
```
|
||||
**Redis Operations:**
|
||||
- `LPUSH hero:stop_queue:{worker_id} {job_id}` - Send stop request
|
||||
|
||||
### Get Job Status
|
||||
```rust
|
||||
supervisor.get_job_status(job_id) -> Result<JobStatus, SupervisorError>
|
||||
```
|
||||
**Redis Operations:**
|
||||
- `HGETALL hero:job:{job_id}` - Get job data
|
||||
- Parse `status` field
|
||||
|
||||
### Get Job Logs
|
||||
```rust
|
||||
supervisor.get_job_logs(job_id) -> Result<Option<String>, SupervisorError>
|
||||
```
|
||||
**Redis Operations:**
|
||||
- `HGETALL hero:job:{job_id}` - Get job data
|
||||
- Read `log_path` field
|
||||
- Read log file from filesystem
|
||||
|
||||
### Run Job and Await Result
|
||||
```rust
|
||||
supervisor.run_job_and_await_result(job, worker_id) -> Result<String, SupervisorError>
|
||||
```
|
||||
**Redis Operations:**
|
||||
1. `HSET hero:job:{job_id} {job_fields}` - Store job
|
||||
2. `LPUSH hero:work_queue:{worker_id} {job_id}` - Submit job
|
||||
3. `BLPOP hero:reply:{job_id} {timeout}` - Wait for result
|
||||
|
||||
## Worker Protocol
|
||||
|
||||
### Job Processing Loop
|
||||
```rust
|
||||
loop {
|
||||
// 1. Wait for job
|
||||
job_id = BLPOP hero:work_queue:{worker_id}
|
||||
|
||||
// 2. Get job details
|
||||
job_data = HGETALL hero:job:{job_id}
|
||||
|
||||
// 3. Update status
|
||||
HSET hero:job:{job_id} status "started"
|
||||
|
||||
// 4. Check for stop requests
|
||||
if LLEN hero:stop_queue:{worker_id} > 0 {
|
||||
stop_job_id = LPOP hero:stop_queue:{worker_id}
|
||||
if stop_job_id == job_id {
|
||||
HSET hero:job:{job_id} status "error" error "stopped"
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Execute script
|
||||
result = execute_script(job_data.script)
|
||||
|
||||
// 6. Update job with result
|
||||
HSET hero:job:{job_id} status "finished" output result
|
||||
|
||||
// 7. Send reply if needed
|
||||
if reply_queue_exists(hero:reply:{job_id}) {
|
||||
LPUSH hero:reply:{job_id} result
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Stop Request Handling
|
||||
Workers should periodically check the stop queue during long-running jobs:
|
||||
```rust
|
||||
if LLEN hero:stop_queue:{worker_id} > 0 {
|
||||
stop_requests = LRANGE hero:stop_queue:{worker_id} 0 -1
|
||||
if stop_requests.contains(current_job_id) {
|
||||
// Stop current job execution
|
||||
HSET hero:job:{current_job_id} status "error" error "stopped_by_request"
|
||||
// Remove stop request
|
||||
LREM hero:stop_queue:{worker_id} 1 current_job_id
|
||||
return
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Job Timeouts
|
||||
- Client sets timeout when creating job
|
||||
- Worker should respect timeout and stop execution
|
||||
- If timeout exceeded: `HSET hero:job:{job_id} status "error" error "timeout"`
|
||||
|
||||
### Worker Failures
|
||||
- If worker crashes, job remains in "started" status
|
||||
- Monitoring systems can detect stale jobs and retry
|
||||
- Jobs can be requeued: `LPUSH hero:work_queue:{worker_id} {job_id}`
|
||||
|
||||
### Redis Connection Issues
|
||||
- Clients should implement retry logic with exponential backoff
|
||||
- Workers should reconnect and resume processing
|
||||
- Use Redis persistence to survive Redis restarts
|
||||
|
||||
## Monitoring and Observability
|
||||
|
||||
### Queue Monitoring
|
||||
```bash
|
||||
# Check work queue length
|
||||
LLEN hero:work_queue:{worker_id}
|
||||
|
||||
# Check stop queue length
|
||||
LLEN hero:stop_queue:{worker_id}
|
||||
|
||||
# List all jobs
|
||||
KEYS hero:job:*
|
||||
|
||||
# Get job details
|
||||
HGETALL hero:job:{job_id}
|
||||
```
|
||||
|
||||
### Metrics to Track
|
||||
- Jobs created per second
|
||||
- Jobs completed per second
|
||||
- Average job execution time
|
||||
- Queue depths
|
||||
- Worker availability
|
||||
- Error rates by job type
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### Redis Security
|
||||
- Use Redis AUTH for authentication
|
||||
- Enable TLS for Redis connections
|
||||
- Restrict Redis network access
|
||||
- Use Redis ACLs to limit worker permissions
|
||||
|
||||
### Job Security
|
||||
- Validate script content before execution
|
||||
- Sandbox script execution environment
|
||||
- Limit resource usage (CPU, memory, disk)
|
||||
- Log all job executions for audit
|
||||
|
||||
### Log File Security
|
||||
- Ensure log paths are within allowed directories
|
||||
- Validate log file permissions
|
||||
- Rotate and archive logs regularly
|
||||
- Sanitize sensitive data in logs
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Redis Optimization
|
||||
- Use Redis pipelining for batch operations
|
||||
- Configure appropriate Redis memory limits
|
||||
- Use Redis clustering for high availability
|
||||
- Monitor Redis memory usage and eviction
|
||||
|
||||
### Job Optimization
|
||||
- Keep job payloads small
|
||||
- Use efficient serialization formats
|
||||
- Batch similar jobs when possible
|
||||
- Implement job prioritization if needed
|
||||
|
||||
### Worker Optimization
|
||||
- Pool worker connections to Redis
|
||||
- Use async I/O for Redis operations
|
||||
- Implement graceful shutdown handling
|
||||
- Monitor worker resource usage
|
239
core/supervisor/examples/lifecycle_demo.rs
Normal file
239
core/supervisor/examples/lifecycle_demo.rs
Normal file
@@ -0,0 +1,239 @@
|
||||
use hero_supervisor::{
|
||||
Supervisor, SupervisorBuilder, WorkerConfig, WorkerLifecycleManager,
|
||||
WorkerLifecycleManagerBuilder, ScriptType
|
||||
};
|
||||
use log::{info, warn, error};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use tokio::time::sleep;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Initialize logging
|
||||
env_logger::init();
|
||||
|
||||
info!("Starting Worker Lifecycle Management Demo");
|
||||
|
||||
// Configuration
|
||||
let redis_url = "redis://localhost:6379";
|
||||
let zinit_socket = "/var/run/zinit.sock";
|
||||
|
||||
// Create supervisor
|
||||
let supervisor = SupervisorBuilder::new()
|
||||
.redis_url(redis_url)
|
||||
.caller_id("lifecycle_demo")
|
||||
.context_id("demo_context")
|
||||
.build()?;
|
||||
|
||||
// Configure workers for different script types
|
||||
let mut worker_configs = Vec::new();
|
||||
|
||||
// OSIS workers (Rhai/HeroScript)
|
||||
for i in 0..2 {
|
||||
let config = WorkerConfig::new(
|
||||
format!("osis_worker_{}", i),
|
||||
PathBuf::from("/usr/local/bin/osis_worker"),
|
||||
ScriptType::OSIS,
|
||||
)
|
||||
.with_args(vec![
|
||||
"--redis-url".to_string(),
|
||||
redis_url.to_string(),
|
||||
"--worker-id".to_string(),
|
||||
format!("osis_worker_{}", i),
|
||||
])
|
||||
.with_env({
|
||||
let mut env = HashMap::new();
|
||||
env.insert("RUST_LOG".to_string(), "info".to_string());
|
||||
env.insert("WORKER_TYPE".to_string(), "osis".to_string());
|
||||
env
|
||||
})
|
||||
.with_health_check("/usr/local/bin/osis_worker --health-check".to_string())
|
||||
.with_dependencies(vec!["redis".to_string()]);
|
||||
|
||||
worker_configs.push(config);
|
||||
}
|
||||
|
||||
// SAL workers (System Abstraction Layer)
|
||||
for i in 0..3 {
|
||||
let config = WorkerConfig::new(
|
||||
format!("sal_worker_{}", i),
|
||||
PathBuf::from("/usr/local/bin/sal_worker"),
|
||||
ScriptType::SAL,
|
||||
)
|
||||
.with_args(vec![
|
||||
"--redis-url".to_string(),
|
||||
redis_url.to_string(),
|
||||
"--worker-id".to_string(),
|
||||
format!("sal_worker_{}", i),
|
||||
])
|
||||
.with_env({
|
||||
let mut env = HashMap::new();
|
||||
env.insert("RUST_LOG".to_string(), "info".to_string());
|
||||
env.insert("WORKER_TYPE".to_string(), "sal".to_string());
|
||||
env
|
||||
})
|
||||
.with_health_check("/usr/local/bin/sal_worker --health-check".to_string())
|
||||
.with_dependencies(vec!["redis".to_string()]);
|
||||
|
||||
worker_configs.push(config);
|
||||
}
|
||||
|
||||
// V workers (HeroScript in V language)
|
||||
for i in 0..2 {
|
||||
let config = WorkerConfig::new(
|
||||
format!("v_worker_{}", i),
|
||||
PathBuf::from("/usr/local/bin/v_worker"),
|
||||
ScriptType::V,
|
||||
)
|
||||
.with_args(vec![
|
||||
"--redis-url".to_string(),
|
||||
redis_url.to_string(),
|
||||
"--worker-id".to_string(),
|
||||
format!("v_worker_{}", i),
|
||||
])
|
||||
.with_env({
|
||||
let mut env = HashMap::new();
|
||||
env.insert("RUST_LOG".to_string(), "info".to_string());
|
||||
env.insert("WORKER_TYPE".to_string(), "v".to_string());
|
||||
env
|
||||
})
|
||||
.with_health_check("/usr/local/bin/v_worker --health-check".to_string())
|
||||
.with_dependencies(vec!["redis".to_string()]);
|
||||
|
||||
worker_configs.push(config);
|
||||
}
|
||||
|
||||
// Create lifecycle manager
|
||||
let mut lifecycle_manager = WorkerLifecycleManagerBuilder::new(zinit_socket.to_string())
|
||||
.with_supervisor(supervisor.clone());
|
||||
|
||||
// Add all worker configurations
|
||||
for config in worker_configs {
|
||||
lifecycle_manager = lifecycle_manager.add_worker(config);
|
||||
}
|
||||
|
||||
let mut lifecycle_manager = lifecycle_manager.build();
|
||||
|
||||
// Demonstrate lifecycle operations
|
||||
info!("=== Starting Worker Lifecycle Demo ===");
|
||||
|
||||
// 1. Start all workers
|
||||
info!("1. Starting all workers...");
|
||||
match lifecycle_manager.start_all_workers().await {
|
||||
Ok(_) => info!("✅ All workers started successfully"),
|
||||
Err(e) => {
|
||||
error!("❌ Failed to start workers: {}", e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for workers to initialize
|
||||
sleep(Duration::from_secs(5)).await;
|
||||
|
||||
// 2. Check worker status
|
||||
info!("2. Checking worker status...");
|
||||
match lifecycle_manager.get_all_worker_status().await {
|
||||
Ok(status_map) => {
|
||||
for (worker_name, status) in status_map {
|
||||
info!(" Worker '{}': State={:?}, PID={}", worker_name, status.state, status.pid);
|
||||
}
|
||||
}
|
||||
Err(e) => warn!("Failed to get worker status: {}", e),
|
||||
}
|
||||
|
||||
// 3. Demonstrate scaling
|
||||
info!("3. Demonstrating worker scaling...");
|
||||
|
||||
// Scale up OSIS workers
|
||||
info!(" Scaling up OSIS workers to 3...");
|
||||
if let Err(e) = lifecycle_manager.scale_workers(&ScriptType::OSIS, 3).await {
|
||||
warn!("Failed to scale OSIS workers: {}", e);
|
||||
}
|
||||
|
||||
sleep(Duration::from_secs(3)).await;
|
||||
|
||||
// Scale down SAL workers
|
||||
info!(" Scaling down SAL workers to 1...");
|
||||
if let Err(e) = lifecycle_manager.scale_workers(&ScriptType::SAL, 1).await {
|
||||
warn!("Failed to scale SAL workers: {}", e);
|
||||
}
|
||||
|
||||
sleep(Duration::from_secs(3)).await;
|
||||
|
||||
// 4. Check running worker counts
|
||||
info!("4. Checking running worker counts after scaling...");
|
||||
for script_type in [ScriptType::OSIS, ScriptType::SAL, ScriptType::V] {
|
||||
let count = lifecycle_manager.get_running_worker_count(&script_type).await;
|
||||
info!(" {:?}: {} workers running", script_type, count);
|
||||
}
|
||||
|
||||
// 5. Demonstrate restart functionality
|
||||
info!("5. Demonstrating worker restart...");
|
||||
if let Err(e) = lifecycle_manager.restart_worker("osis_worker_0").await {
|
||||
warn!("Failed to restart worker: {}", e);
|
||||
} else {
|
||||
info!(" ✅ Successfully restarted osis_worker_0");
|
||||
}
|
||||
|
||||
sleep(Duration::from_secs(3)).await;
|
||||
|
||||
// 6. Simulate job dispatch and health monitoring
|
||||
info!("6. Simulating job dispatch and health monitoring...");
|
||||
|
||||
// Update job time for a worker (simulating job dispatch)
|
||||
lifecycle_manager.update_worker_job_time("sal_worker_0");
|
||||
info!(" Updated job time for sal_worker_0");
|
||||
|
||||
// Perform health monitoring check
|
||||
if let Err(e) = lifecycle_manager.monitor_worker_health().await {
|
||||
warn!("Health monitoring failed: {}", e);
|
||||
} else {
|
||||
info!(" ✅ Health monitoring completed");
|
||||
}
|
||||
|
||||
// 7. Create and execute a test job
|
||||
info!("7. Creating and executing a test job...");
|
||||
let test_job = supervisor
|
||||
.new_job()
|
||||
.script_type(ScriptType::OSIS)
|
||||
.script_content("println!(\"Hello from worker!\");".to_string())
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()?;
|
||||
|
||||
match supervisor.run_job_and_await_result(&test_job).await {
|
||||
Ok(result) => info!(" ✅ Job executed successfully: {}", result),
|
||||
Err(e) => warn!(" ❌ Job execution failed: {}", e),
|
||||
}
|
||||
|
||||
// 8. Demonstrate graceful shutdown
|
||||
info!("8. Demonstrating graceful shutdown...");
|
||||
|
||||
// Stop specific workers
|
||||
info!(" Stopping specific workers...");
|
||||
for worker_name in ["osis_worker_1", "v_worker_0"] {
|
||||
if let Err(e) = lifecycle_manager.stop_worker(worker_name).await {
|
||||
warn!("Failed to stop worker {}: {}", worker_name, e);
|
||||
} else {
|
||||
info!(" ✅ Stopped worker: {}", worker_name);
|
||||
}
|
||||
}
|
||||
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
|
||||
// Stop all remaining workers
|
||||
info!(" Stopping all remaining workers...");
|
||||
if let Err(e) = lifecycle_manager.stop_all_workers().await {
|
||||
error!("Failed to stop all workers: {}", e);
|
||||
} else {
|
||||
info!(" ✅ All workers stopped successfully");
|
||||
}
|
||||
|
||||
info!("=== Worker Lifecycle Demo Completed ===");
|
||||
|
||||
// Optional: Start health monitoring loop (commented out for demo)
|
||||
// info!("Starting health monitoring loop (Ctrl+C to stop)...");
|
||||
// lifecycle_manager.start_health_monitoring().await;
|
||||
|
||||
Ok(())
|
||||
}
|
74
core/supervisor/examples/simple_lifecycle_demo.rs
Normal file
74
core/supervisor/examples/simple_lifecycle_demo.rs
Normal file
@@ -0,0 +1,74 @@
|
||||
use hero_supervisor::SupervisorBuilder;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use log::{info, error};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
env_logger::init();
|
||||
|
||||
info!("Starting Hero Supervisor Lifecycle Demo");
|
||||
|
||||
// Build supervisor with simplified worker configuration
|
||||
// Workers are automatically launched during build
|
||||
let supervisor = SupervisorBuilder::new()
|
||||
.redis_url("redis://localhost:6379")
|
||||
.zinit_socket_path("/var/run/zinit.sock")
|
||||
.osis_worker("/usr/local/bin/osis_worker")
|
||||
.sal_worker("/usr/local/bin/sal_worker")
|
||||
.v_worker("/usr/local/bin/v_worker")
|
||||
.worker_env_var("REDIS_URL", "redis://localhost:6379")
|
||||
.worker_env_var("LOG_LEVEL", "info")
|
||||
.build().await?;
|
||||
|
||||
info!("Supervisor created and workers launched successfully");
|
||||
|
||||
// Wait a moment for workers to start
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
|
||||
// Check worker status using the simplified API
|
||||
info!("Checking worker status...");
|
||||
let workers = supervisor.get_workers(&[]).await;
|
||||
|
||||
for worker in &workers {
|
||||
let status_info = if worker.is_running {
|
||||
format!("Running (PID: {})", worker.status.as_ref().map(|s| s.pid).unwrap_or(0))
|
||||
} else {
|
||||
"Stopped".to_string()
|
||||
};
|
||||
info!(" Worker '{}' ({:?}): {}", worker.config.name, worker.config.script_type, status_info);
|
||||
}
|
||||
|
||||
// Demonstrate lifecycle operations with simplified API
|
||||
info!("=== Worker Lifecycle Operations ===");
|
||||
|
||||
// 1. Demonstrate restart functionality
|
||||
info!("1. Demonstrating worker restart...");
|
||||
if let Err(e) = supervisor.restart_worker("osis_worker_1").await {
|
||||
error!("Failed to restart worker: {}", e);
|
||||
} else {
|
||||
info!(" ✅ Successfully restarted osis_worker_1");
|
||||
}
|
||||
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
|
||||
// 2. Send a ping job for health checking
|
||||
info!("2. Sending ping job for health checking...");
|
||||
if let Err(e) = supervisor.send_ping_job(hero_job::ScriptType::OSIS).await {
|
||||
error!("Ping job failed: {}", e);
|
||||
} else {
|
||||
info!(" ✅ Ping job completed successfully");
|
||||
}
|
||||
|
||||
// 3. Demonstrate graceful shutdown
|
||||
info!("3. Demonstrating graceful shutdown...");
|
||||
|
||||
// Stop specific workers
|
||||
if let Err(e) = supervisor.stop_worker("osis_worker_1").await {
|
||||
error!("Failed to stop worker: {}", e);
|
||||
} else {
|
||||
info!(" ✅ Worker stopped successfully");
|
||||
}
|
||||
|
||||
info!("Demo completed successfully!");
|
||||
Ok(())
|
||||
}
|
90
core/supervisor/examples/timeout_example.rs
Normal file
90
core/supervisor/examples/timeout_example.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use log::info;
|
||||
use hero_supervisor::{SupervisorBuilder, SupervisorError, ScriptType};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
env_logger::builder()
|
||||
.filter_level(log::LevelFilter::Info)
|
||||
.init();
|
||||
|
||||
// Build the client using the new builder pattern
|
||||
let client = SupervisorBuilder::new()
|
||||
.caller_id("timeout-example-runner")
|
||||
.redis_url("redis://127.0.0.1/")
|
||||
.build()?;
|
||||
info!("Supervisor created.");
|
||||
|
||||
let script_content = r#"
|
||||
// This script will never be executed by a worker because the recipient does not exist.
|
||||
let x = 10;
|
||||
let y = x + 32;
|
||||
y
|
||||
"#;
|
||||
|
||||
// The worker_id points to a worker queue that doesn't have a worker.
|
||||
let non_existent_recipient = "non_existent_worker_for_timeout_test";
|
||||
let very_short_timeout = Duration::from_secs(2);
|
||||
|
||||
info!(
|
||||
"Submitting script to non-existent recipient '{}' with a timeout of {:?}...",
|
||||
non_existent_recipient, very_short_timeout
|
||||
);
|
||||
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Use the new JobBuilder
|
||||
let result = client
|
||||
.new_job()
|
||||
.script_type(ScriptType::HeroScript)
|
||||
.script(script_content)
|
||||
.timeout(very_short_timeout)
|
||||
.await_response()
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(details) => {
|
||||
log::error!(
|
||||
"Timeout Example FAILED: Expected a timeout, but got Ok: {:?}",
|
||||
details
|
||||
);
|
||||
Err("Expected timeout, but task completed successfully.".into())
|
||||
}
|
||||
Err(e) => {
|
||||
let elapsed = start_time.elapsed();
|
||||
info!("Timeout Example: Received error as expected: {}", e);
|
||||
info!("Elapsed time: {:?}", elapsed);
|
||||
|
||||
match e {
|
||||
SupervisorError::Timeout(task_id) => {
|
||||
info!("Timeout Example PASSED: Correctly received SupervisorError::Timeout for task_id: {}", task_id);
|
||||
// Ensure the elapsed time is close to the timeout duration
|
||||
// Allow for some buffer for processing
|
||||
assert!(
|
||||
elapsed >= very_short_timeout
|
||||
&& elapsed < very_short_timeout + Duration::from_secs(1),
|
||||
"Elapsed time {:?} should be close to timeout {:?}",
|
||||
elapsed,
|
||||
very_short_timeout
|
||||
);
|
||||
info!(
|
||||
"Elapsed time {:?} is consistent with timeout duration {:?}.",
|
||||
elapsed, very_short_timeout
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
other_error => {
|
||||
log::error!(
|
||||
"Timeout Example FAILED: Expected SupervisorError::Timeout, but got other error: {:?}",
|
||||
other_error
|
||||
);
|
||||
Err(format!(
|
||||
"Expected SupervisorError::Timeout, got other error: {:?}",
|
||||
other_error
|
||||
)
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
102
core/supervisor/src/error.rs
Normal file
102
core/supervisor/src/error.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
// Added error
|
||||
// Duration is still used, Instant and sleep were removed
|
||||
|
||||
/// Comprehensive error type for all possible failures in the Rhai client.
///
/// This enum covers all error scenarios that can occur during client operations,
/// from Redis connectivity issues to task execution timeouts.
#[derive(Debug)]
pub enum SupervisorError {
    /// Redis connection or operation error
    RedisError(redis::RedisError),
    /// JSON serialization/deserialization error
    SerializationError(serde_json::Error),
    /// Task execution timeout - contains the task_id that timed out
    Timeout(String),
    /// Task not found after submission - contains the task_id (rare occurrence)
    TaskNotFound(String),
    /// Context ID is missing
    ContextIdMissing,
    /// Invalid input provided
    InvalidInput(String),
    /// Job operation error
    JobError(hero_job::JobError),
    // Worker lifecycle management errors.
    // Each (String, String) pair is (worker name, failure reason),
    // per the corresponding Display arms below.
    /// Starting a worker failed - (worker name, reason)
    WorkerStartFailed(String, String),
    /// Stopping a worker failed - (worker name, reason)
    WorkerStopFailed(String, String),
    /// Restarting a worker failed - (worker name, reason)
    WorkerRestartFailed(String, String),
    /// Querying a worker's status failed - (worker name, reason)
    WorkerStatusFailed(String, String),
    /// No worker with the given name is known to the supervisor
    WorkerNotFound(String),
    /// A health-check ping job failed - presumably (worker/script-type, reason);
    /// TODO confirm against the Display impl
    PingJobFailed(String, String),
    /// Zinit client operation error - contains the underlying error message
    ZinitError(String),
    /// An operation required a configured supervisor but none was set
    SupervisorNotConfigured,
}
|
||||
|
||||
/// Allow `?` on Redis operations by wrapping the failure in [`SupervisorError`].
impl From<redis::RedisError> for SupervisorError {
    fn from(e: redis::RedisError) -> Self {
        Self::RedisError(e)
    }
}
|
||||
|
||||
impl From<serde_json::Error> for SupervisorError {
|
||||
fn from(err: serde_json::Error) -> Self {
|
||||
SupervisorError::SerializationError(err)
|
||||
}
|
||||
}
|
||||
|
||||
/// Allow `?` on job-store operations by wrapping the failure in [`SupervisorError`].
impl From<hero_job::JobError> for SupervisorError {
    fn from(e: hero_job::JobError) -> Self {
        Self::JobError(e)
    }
}
|
||||
|
||||
impl std::fmt::Display for SupervisorError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
SupervisorError::RedisError(e) => write!(f, "Redis error: {}", e),
|
||||
SupervisorError::SerializationError(e) => write!(f, "Serialization error: {}", e),
|
||||
SupervisorError::Timeout(task_id) => {
|
||||
write!(f, "Timeout waiting for task {} to complete", task_id)
|
||||
}
|
||||
SupervisorError::TaskNotFound(task_id) => {
|
||||
write!(f, "Task {} not found after submission", task_id)
|
||||
}
|
||||
SupervisorError::ContextIdMissing => {
|
||||
write!(f, "Context ID is missing")
|
||||
}
|
||||
SupervisorError::InvalidInput(msg) => {
|
||||
write!(f, "Invalid input: {}", msg)
|
||||
}
|
||||
SupervisorError::JobError(e) => {
|
||||
write!(f, "Job error: {}", e)
|
||||
}
|
||||
SupervisorError::WorkerStartFailed(worker, reason) => {
|
||||
write!(f, "Failed to start worker '{}': {}", worker, reason)
|
||||
}
|
||||
SupervisorError::WorkerStopFailed(worker, reason) => {
|
||||
write!(f, "Failed to stop worker '{}': {}", worker, reason)
|
||||
}
|
||||
SupervisorError::WorkerRestartFailed(worker, reason) => {
|
||||
write!(f, "Failed to restart worker '{}': {}", worker, reason)
|
||||
}
|
||||
SupervisorError::WorkerStatusFailed(worker, reason) => {
|
||||
write!(f, "Failed to get status for worker '{}': {}", worker, reason)
|
||||
}
|
||||
SupervisorError::WorkerNotFound(worker) => {
|
||||
write!(f, "Worker '{}' not found", worker)
|
||||
}
|
||||
SupervisorError::PingJobFailed(worker, reason) => {
|
||||
write!(f, "Ping job failed for worker '{}': {}", worker, reason)
|
||||
}
|
||||
SupervisorError::ZinitError(msg) => {
|
||||
write!(f, "Zinit error: {}", msg)
|
||||
}
|
||||
SupervisorError::SupervisorNotConfigured => {
|
||||
write!(f, "Supervisor not configured for health monitoring")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: `Display` + `Debug` above satisfy the `std::error::Error`
// contract. `source()` keeps its default (`None`); causes are embedded in the
// Display text instead of being exposed as a chained error.
impl std::error::Error for SupervisorError {}
|
261
core/supervisor/src/job.rs
Normal file
261
core/supervisor/src/job.rs
Normal file
@@ -0,0 +1,261 @@
|
||||
use chrono::Utc;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{Supervisor, SupervisorError};
|
||||
use hero_job::{Job, ScriptType};
|
||||
|
||||
/// Builder for constructing and submitting script execution requests.
|
||||
///
|
||||
/// This builder provides a fluent interface for configuring script execution
|
||||
/// parameters and offers two submission modes: fire-and-forget (`submit()`)
|
||||
/// and request-reply (`await_response()`).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use std::time::Duration;
|
||||
/// use hero_supervisor::ScriptType;
|
||||
///
|
||||
/// # async fn example(client: &hero_supervisor::Supervisor) -> Result<String, hero_supervisor::SupervisorError> {
|
||||
/// let result = client
|
||||
/// .new_job()
|
||||
/// .script_type(ScriptType::OSIS)
|
||||
/// .script(r#"print("Hello, World!");"#)
|
||||
/// .timeout(Duration::from_secs(30))
|
||||
/// .await_response()
|
||||
/// .await?;
|
||||
/// # Ok(result)
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct JobBuilder<'a> {
    // Supervisor that will receive the built job; borrowed for the builder's lifetime
    client: &'a Supervisor,
    // Optional caller-supplied id; a UUID is generated when left empty
    request_id: String,
    // Required: execution context id (build fails when empty)
    context_id: String,
    // Required: identity of the submitting caller (build fails when empty)
    caller_id: String,
    // Script source text to execute
    script: String,
    // Which worker type executes the script (defaults to OSIS)
    script_type: ScriptType,
    // How long to wait for the job result (defaults to 5 s)
    timeout: Duration,
    // Retry count; narrowed to u8 when the Job is built
    retries: u32,
    // Whether the worker may run this job concurrently with others
    concurrent: bool,
    // Optional file path where the worker writes job logs
    log_path: Option<String>,
    // Environment variables passed to the worker for this job
    env_vars: HashMap<String, String>,
    // Job ids that must finish before this job may run
    prerequisites: Vec<String>,
    // Job ids that wait on this job's completion
    dependents: Vec<String>
}
|
||||
|
||||
impl<'a> JobBuilder<'a> {
|
||||
pub fn new(client: &'a Supervisor) -> Self {
|
||||
Self {
|
||||
client,
|
||||
request_id: "".to_string(),
|
||||
context_id: "".to_string(),
|
||||
caller_id: "".to_string(),
|
||||
script: "".to_string(),
|
||||
script_type: ScriptType::OSIS, // Default to OSIS
|
||||
timeout: Duration::from_secs(5),
|
||||
retries: 0,
|
||||
concurrent: false,
|
||||
log_path: None,
|
||||
env_vars: HashMap::new(),
|
||||
prerequisites: Vec::new(),
|
||||
dependents: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn request_id(mut self, request_id: &str) -> Self {
|
||||
self.request_id = request_id.to_string();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn script_type(mut self, script_type: ScriptType) -> Self {
|
||||
self.script_type = script_type;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn context_id(mut self, context_id: &str) -> Self {
|
||||
self.context_id = context_id.to_string();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn script(mut self, script: &str) -> Self {
|
||||
self.script = script.to_string();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn script_path(mut self, script_path: &str) -> Self {
|
||||
self.script = std::fs::read_to_string(script_path).unwrap();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn timeout(mut self, timeout: Duration) -> Self {
|
||||
self.timeout = timeout;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn log_path(mut self, log_path: &str) -> Self {
|
||||
self.log_path = Some(log_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set a single environment variable
|
||||
pub fn env_var(mut self, key: &str, value: &str) -> Self {
|
||||
self.env_vars.insert(key.to_string(), value.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set multiple environment variables from a HashMap
|
||||
pub fn env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
|
||||
self.env_vars.extend(env_vars);
|
||||
self
|
||||
}
|
||||
|
||||
/// Clear all environment variables
|
||||
pub fn clear_env_vars(mut self) -> Self {
|
||||
self.env_vars.clear();
|
||||
self
|
||||
}
|
||||
|
||||
/// Add a prerequisite job ID that must complete before this job can run
|
||||
pub fn prerequisite(mut self, job_id: &str) -> Self {
|
||||
self.prerequisites.push(job_id.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set multiple prerequisite job IDs
|
||||
pub fn prerequisites(mut self, job_ids: Vec<String>) -> Self {
|
||||
self.prerequisites.extend(job_ids);
|
||||
self
|
||||
}
|
||||
|
||||
/// Add a dependent job ID that depends on this job completing
|
||||
pub fn dependent(mut self, job_id: &str) -> Self {
|
||||
self.dependents.push(job_id.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set multiple dependent job IDs
|
||||
pub fn dependents(mut self, job_ids: Vec<String>) -> Self {
|
||||
self.dependents.extend(job_ids);
|
||||
self
|
||||
}
|
||||
|
||||
/// Clear all prerequisites
|
||||
pub fn clear_prerequisites(mut self) -> Self {
|
||||
self.prerequisites.clear();
|
||||
self
|
||||
}
|
||||
|
||||
/// Clear all dependents
|
||||
pub fn clear_dependents(mut self) -> Self {
|
||||
self.dependents.clear();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Job, SupervisorError> {
|
||||
let request_id = if self.request_id.is_empty() {
|
||||
// Generate a UUID for the request_id
|
||||
Uuid::new_v4().to_string()
|
||||
} else {
|
||||
self.request_id.clone()
|
||||
};
|
||||
|
||||
if self.context_id.is_empty() {
|
||||
return Err(SupervisorError::ContextIdMissing);
|
||||
}
|
||||
|
||||
if self.caller_id.is_empty() {
|
||||
return Err(SupervisorError::ContextIdMissing);
|
||||
}
|
||||
|
||||
let now = Utc::now();
|
||||
|
||||
Ok(Job {
|
||||
id: request_id,
|
||||
caller_id: self.caller_id,
|
||||
context_id: self.context_id,
|
||||
script: self.script,
|
||||
script_type: self.script_type,
|
||||
timeout: self.timeout,
|
||||
retries: self.retries as u8,
|
||||
concurrent: self.concurrent,
|
||||
log_path: self.log_path.clone(),
|
||||
env_vars: self.env_vars.clone(),
|
||||
prerequisites: self.prerequisites.clone(),
|
||||
dependents: self.dependents.clone(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn submit(self) -> Result<(), SupervisorError> {
|
||||
// Create job first, then use client reference
|
||||
let request_id = if self.request_id.is_empty() {
|
||||
Uuid::new_v4().to_string()
|
||||
} else {
|
||||
self.request_id
|
||||
};
|
||||
|
||||
if self.context_id.is_empty() {
|
||||
return Err(SupervisorError::ContextIdMissing);
|
||||
}
|
||||
|
||||
let now = Utc::now();
|
||||
|
||||
let job = Job {
|
||||
id: request_id,
|
||||
caller_id: self.caller_id,
|
||||
context_id: self.context_id,
|
||||
script: self.script,
|
||||
script_type: self.script_type.clone(),
|
||||
timeout: self.timeout,
|
||||
retries: self.retries as u8,
|
||||
concurrent: self.concurrent,
|
||||
log_path: self.log_path.clone(),
|
||||
env_vars: self.env_vars.clone(),
|
||||
prerequisites: self.prerequisites.clone(),
|
||||
dependents: self.dependents.clone(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
};
|
||||
|
||||
self.client.create_job(&job).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn await_response(self) -> Result<String, SupervisorError> {
|
||||
// Create job first, then use client reference
|
||||
let request_id = if self.request_id.is_empty() {
|
||||
Uuid::new_v4().to_string()
|
||||
} else {
|
||||
self.request_id
|
||||
};
|
||||
|
||||
if self.context_id.is_empty() {
|
||||
return Err(SupervisorError::ContextIdMissing);
|
||||
}
|
||||
|
||||
let now = Utc::now();
|
||||
|
||||
let job = Job {
|
||||
id: request_id,
|
||||
caller_id: self.caller_id.clone(),
|
||||
context_id: self.context_id,
|
||||
script: self.script,
|
||||
script_type: self.script_type.clone(),
|
||||
timeout: self.timeout,
|
||||
retries: self.retries as u8,
|
||||
concurrent: self.concurrent,
|
||||
log_path: self.log_path.clone(),
|
||||
env_vars: self.env_vars.clone(),
|
||||
prerequisites: self.prerequisites.clone(),
|
||||
dependents: self.dependents.clone(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
};
|
||||
|
||||
let result = self.client.run_job_and_await_result(&job).await?;
|
||||
Ok(result)
|
||||
}
|
||||
}
|
596
core/supervisor/src/lib.rs
Normal file
596
core/supervisor/src/lib.rs
Normal file
@@ -0,0 +1,596 @@
|
||||
use log::{debug, error, info, warn};
|
||||
use redis::AsyncCommands;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use hero_job::NAMESPACE_PREFIX;
|
||||
use zinit_client::ZinitClient;
|
||||
|
||||
mod job;
|
||||
mod error;
|
||||
mod lifecycle;
|
||||
|
||||
pub use crate::error::SupervisorError;
|
||||
pub use crate::job::JobBuilder;
|
||||
pub use crate::lifecycle::WorkerConfig;
|
||||
// Re-export types from hero_job for public API
|
||||
pub use hero_job::{Job, JobStatus, ScriptType};
|
||||
|
||||
/// Client/manager for Hero jobs and worker processes: stores and dispatches
/// jobs through Redis and manages worker services through Zinit.
pub struct Supervisor {
    // Redis client used to open multiplexed async connections per operation
    redis_client: redis::Client,
    // Zinit process-manager client for worker lifecycle operations
    zinit_client: ZinitClient,
    // Worker configuration captured by the builder; consumed by start_workers()
    builder_data: Option<SupervisorBuilderData>,
}
|
||||
|
||||
/// Fluent builder for [`Supervisor`]; configures Redis, the Zinit socket,
/// and up to one worker binary per script type.
pub struct SupervisorBuilder {
    // Redis URL; defaults to redis://127.0.0.1/ when unset
    redis_url: Option<String>,
    // Path to the Zinit control socket; defaults to /var/run/zinit.sock
    zinit_socket_path: Option<String>,
    // Optional worker binary path per script type
    osis_worker: Option<String>,
    sal_worker: Option<String>,
    v_worker: Option<String>,
    python_worker: Option<String>,
    // Environment variables applied to launched workers
    worker_env_vars: HashMap<String, String>,
}
|
||||
|
||||
/// Helper struct to pass builder data to worker launch method
struct SupervisorBuilderData {
    // Optional worker binary path per script type (copied from the builder)
    osis_worker: Option<String>,
    sal_worker: Option<String>,
    v_worker: Option<String>,
    python_worker: Option<String>,
    // Environment variables applied to launched workers
    worker_env_vars: HashMap<String, String>,
}
|
||||
|
||||
impl SupervisorBuilder {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
redis_url: None,
|
||||
zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
|
||||
osis_worker: None,
|
||||
sal_worker: None,
|
||||
v_worker: None,
|
||||
python_worker: None,
|
||||
worker_env_vars: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn redis_url(mut self, url: &str) -> Self {
|
||||
self.redis_url = Some(url.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn zinit_socket_path(mut self, path: &str) -> Self {
|
||||
self.zinit_socket_path = Some(path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn osis_worker(mut self, binary_path: &str) -> Self {
|
||||
self.osis_worker = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn sal_worker(mut self, binary_path: &str) -> Self {
|
||||
self.sal_worker = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn v_worker(mut self, binary_path: &str) -> Self {
|
||||
self.v_worker = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn python_worker(mut self, binary_path: &str) -> Self {
|
||||
self.python_worker = Some(binary_path.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn worker_env_var(mut self, key: &str, value: &str) -> Self {
|
||||
self.worker_env_vars.insert(key.to_string(), value.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn worker_env_vars(mut self, env_vars: HashMap<String, String>) -> Self {
|
||||
self.worker_env_vars.extend(env_vars);
|
||||
self
|
||||
}
|
||||
|
||||
/// Builds the final `Supervisor` instance synchronously.
|
||||
///
|
||||
/// This method validates the configuration and creates the Redis client.
|
||||
/// Worker launching is deferred to the `start_workers()` method.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `Ok(Supervisor)` - Successfully configured client
|
||||
/// * `Err(SupervisorError)` - Configuration or connection error
|
||||
pub fn build(self) -> Result<Supervisor, SupervisorError> {
|
||||
let url = self.redis_url
|
||||
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
|
||||
let client = redis::Client::open(url)?;
|
||||
|
||||
let zinit_socket = self.zinit_socket_path
|
||||
.unwrap_or_else(|| "/var/run/zinit.sock".to_string());
|
||||
let zinit_client = ZinitClient::new(&zinit_socket);
|
||||
|
||||
// Store builder data for later use in start_workers()
|
||||
let builder_data = SupervisorBuilderData {
|
||||
osis_worker: self.osis_worker,
|
||||
sal_worker: self.sal_worker,
|
||||
v_worker: self.v_worker,
|
||||
python_worker: self.python_worker,
|
||||
worker_env_vars: self.worker_env_vars,
|
||||
};
|
||||
|
||||
let supervisor = Supervisor {
|
||||
redis_client: client,
|
||||
zinit_client,
|
||||
builder_data: Some(builder_data),
|
||||
};
|
||||
|
||||
Ok(supervisor)
|
||||
}
|
||||
}
|
||||
|
||||
impl Supervisor {
    /// Start all configured workers asynchronously.
    /// This method should be called after build() to launch the workers.
    pub async fn start_workers(&self) -> Result<(), SupervisorError> {
        // Clean up any existing worker services first
        self.cleanup_existing_workers().await?;

        // Launch configured workers if builder data is available
        // (launch_configured_workers lives in the lifecycle module)
        if let Some(builder_data) = &self.builder_data {
            self.launch_configured_workers(builder_data).await?;
        }

        Ok(())
    }

    /// Clean up all worker services from zinit on program exit.
    /// Best-effort: individual failures are logged as warnings, not returned.
    pub async fn cleanup_and_shutdown(&self) -> Result<(), SupervisorError> {
        info!("Cleaning up worker services before shutdown...");

        // NOTE(review): these names must stay in sync with the service names
        // used when launching workers — confirm against lifecycle.rs.
        let worker_names = vec![
            "osis_worker_1",
            "sal_worker_1",
            "v_worker_1",
            "python_worker_1"
        ];

        for worker_name in worker_names {
            if let Err(e) = self.stop_and_delete_worker(worker_name).await {
                warn!("Failed to cleanup worker {}: {}", worker_name, e);
            }
        }

        info!("Worker cleanup completed");
        Ok(())
    }

    /// Clean up any existing worker services on startup
    async fn cleanup_existing_workers(&self) -> Result<(), SupervisorError> {
        info!("Cleaning up any existing worker services...");

        let worker_names = vec![
            "osis_worker_1",
            "sal_worker_1",
            "v_worker_1",
            "python_worker_1"
        ];

        for worker_name in worker_names {
            // Try to stop and delete, but don't fail if they don't exist
            let _ = self.stop_and_delete_worker(worker_name).await;
        }

        info!("Existing worker cleanup completed");
        Ok(())
    }

    /// Stop and delete a worker service from zinit.
    /// Both steps are tolerant of the service not existing.
    async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
        // First try to stop the worker
        if let Err(e) = self.zinit_client.stop(worker_name).await {
            debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
        }

        // Then try to delete the service
        if let Err(e) = self.zinit_client.delete(worker_name).await {
            debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
        } else {
            info!("Successfully deleted worker service: {}", worker_name);
        }

        Ok(())
    }

    /// Get the hardcoded worker queue key for the script type
    fn get_worker_queue_key(&self, script_type: &ScriptType) -> String {
        format!("{}worker_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix())
    }

    /// Start building a new job bound to this supervisor.
    pub fn new_job(&self) -> JobBuilder {
        JobBuilder::new(self)
    }

    // Internal helper to submit script details and push to work queue
    async fn create_job_using_connection(
        &self,
        conn: &mut redis::aio::MultiplexedConnection,
        job: &Job,
    ) -> Result<(), SupervisorError> {
        debug!(
            "Submitting play request: {} for script type: {:?} with namespace prefix: {}",
            job.id, job.script_type, NAMESPACE_PREFIX
        );

        // Use the shared Job struct's Redis storage method
        job.store_in_redis(conn).await
            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to store job in Redis: {}", e)))?;

        Ok(())
    }

    // Internal helper to submit script details and push to work queue
    async fn start_job_using_connection(
        &self,
        conn: &mut redis::aio::MultiplexedConnection,
        job_id: String,
        script_type: &ScriptType
    ) -> Result<(), SupervisorError> {
        let worker_queue_key = self.get_worker_queue_key(script_type);

        // lpush also infers its types, RV is typically i64 (length of list) or () depending on exact command variant
        // For `redis::AsyncCommands::lpush`, it's `RedisResult<R>` where R: FromRedisValue
        // Often this is the length of the list. Let's allow inference or specify if needed.
        // NOTE(review): the lpush result (including errors) is discarded here,
        // so a failed dispatch is silent — confirm this is intentional.
        let _: redis::RedisResult<i64> =
            conn.lpush(&worker_queue_key, job_id.clone()).await;

        Ok(())
    }

    // Internal helper to await response from worker.
    // Blocks (server-side) on the reply queue until a result arrives or the
    // timeout elapses; on timeout/error the reply queue is best-effort deleted.
    async fn await_response_from_connection(
        &self,
        conn: &mut redis::aio::MultiplexedConnection,
        job_key: &String,
        reply_queue_key: &String,
        timeout: Duration,
    ) -> Result<String, SupervisorError> {
        // BLPOP on the reply queue
        // The timeout for BLPOP is in seconds (integer)
        let blpop_timeout_secs = timeout.as_secs().max(1); // Ensure at least 1 second for BLPOP timeout

        match conn
            .blpop::<&String, Option<(String, String)>>(reply_queue_key, blpop_timeout_secs as f64)
            .await
        {
            Ok(Some((_queue, result_message_str))) => {
                Ok(result_message_str)
            }
            Ok(None) => {
                // BLPOP timed out
                warn!(
                    "Timeout waiting for result on reply queue {} for job {}",
                    reply_queue_key, job_key
                );
                // Optionally, delete the reply queue
                let _: redis::RedisResult<i32> = conn.del(&reply_queue_key).await;
                Err(SupervisorError::Timeout(job_key.clone()))
            }
            Err(e) => {
                // Redis error
                error!(
                    "Redis error on BLPOP for reply queue {}: {}",
                    reply_queue_key, e
                );
                // Optionally, delete the reply queue
                let _: redis::RedisResult<i32> = conn.del(&reply_queue_key).await;
                Err(SupervisorError::RedisError(e))
            }
        }
    }

    // New method using dedicated reply queue.
    /// Store a job in Redis without dispatching it to a worker queue.
    pub async fn create_job(
        &self,
        job: &Job,
    ) -> Result<(), SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        self.create_job_using_connection(
            &mut conn,
            &job, // Pass the job_id parameter
        )
        .await?;
        Ok(())
    }

    // Method to start a previously created job.
    /// Dispatch an already-stored job to its script type's worker queue.
    pub async fn start_job(
        &self,
        job_id: &str,
    ) -> Result<(), SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Load the job to get its script type
        let job = Job::load_from_redis(&mut conn, job_id).await?;

        self.start_job_using_connection(&mut conn, job_id.to_string(), &job.script_type).await?;
        Ok(())
    }

    // New method using dedicated reply queue with automatic worker selection.
    /// Store, dispatch, and await a job's result on its dedicated reply queue.
    pub async fn run_job_and_await_result(
        &self,
        job: &Job
    ) -> Result<String, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // NOTE(review): reply key uses "{prefix}:reply:{id}" while other keys
        // use "{prefix}..." without the extra colon — confirm the worker side
        // builds the same key.
        let reply_queue_key = format!("{}:reply:{}", NAMESPACE_PREFIX, job.id); // Derived from the passed job_id

        self.create_job_using_connection(
            &mut conn,
            &job, // Pass the job_id parameter
        )
        .await?;

        self.start_job_using_connection(&mut conn, job.id.clone(), &job.script_type).await?;

        info!(
            "Task {} submitted. Waiting for result on queue {} with timeout {:?}...",
            job.id, // This is the UUID
            reply_queue_key,
            job.timeout
        );

        self.await_response_from_connection(
            &mut conn,
            &job.id,
            &reply_queue_key,
            job.timeout,
        )
        .await
    }

    // Method to get job status.
    /// Read a job's status field from its Redis hash; missing jobs or
    /// unknown status strings fall back to `Dispatched`.
    pub async fn get_job_status(
        &self,
        job_id: &str,
    ) -> Result<JobStatus, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
        // NOTE(review): key is "{prefix}{id}" here, but stop_job/get_job_logs
        // use "{prefix}job:{id}" — one of these formats is likely wrong;
        // verify against hero_job's store_in_redis key scheme.
        let job_key = format!("{}{}", NAMESPACE_PREFIX, job_id);

        let result_map: Option<std::collections::HashMap<String, String>> =
            conn.hgetall(&job_key).await?;

        match result_map {
            Some(map) => {
                let status_str = map.get("status").cloned().unwrap_or_else(|| {
                    warn!("Task {}: 'status' field missing from Redis hash, defaulting to empty.", job_id);
                    String::new()
                });

                let status = match status_str.as_str() {
                    "dispatched" => JobStatus::Dispatched,
                    "started" => JobStatus::Started,
                    "error" => JobStatus::Error,
                    "finished" => JobStatus::Finished,
                    _ => JobStatus::Dispatched, // default
                };

                Ok(status)
            }
            None => {
                warn!("Job {} not found in Redis", job_id);
                Ok(JobStatus::Dispatched) // default for missing jobs
            }
        }
    }

    // Method to get job output.
    /// Read a job's "output" field from its Redis hash, if present.
    pub async fn get_job_output(
        &self,
        job_id: &str,
    ) -> Result<Option<String>, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
        // NOTE(review): same key-format question as get_job_status above.
        let job_key = format!("{}{}", NAMESPACE_PREFIX, job_id);

        let result_map: Option<std::collections::HashMap<String, String>> =
            conn.hgetall(&job_key).await?;

        match result_map {
            Some(map) => {
                Ok(map.get("output").cloned())
            }
            None => {
                warn!("Job {} not found in Redis", job_id);
                Ok(None)
            }
        }
    }

    /// List all jobs in Redis
    pub async fn list_jobs(&self) -> Result<Vec<String>, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Use the shared Job struct's list method
        Job::list_all_job_ids(&mut conn).await
            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to list jobs: {}", e)))
    }

    /// Stop a job by pushing its ID to the stop queue
    pub async fn stop_job(&self, job_id: &str) -> Result<(), SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Get job details to determine script type and appropriate worker
        let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
        let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;

        // HGETALL on a missing key yields an empty map
        if job_data.is_empty() {
            return Err(SupervisorError::InvalidInput(format!("Job {} not found", job_id)));
        }

        // Parse script type from job data
        let script_type_str = job_data.get("script_type")
            .ok_or_else(|| SupervisorError::InvalidInput("Job missing script_type field".to_string()))?;

        // Wrap in quotes so serde_json parses the bare string as a JSON string
        let script_type: ScriptType = serde_json::from_str(&format!("\"{}\"", script_type_str))
            .map_err(|e| SupervisorError::InvalidInput(format!("Invalid script type: {}", e)))?;

        // Use hardcoded stop queue key for this script type
        let stop_queue_key = format!("{}stop_queue:{}", NAMESPACE_PREFIX, script_type.worker_queue_suffix());

        // Push job ID to the stop queue
        conn.lpush::<_, _, ()>(&stop_queue_key, job_id).await?;

        info!("Job {} added to stop queue {} for script type {:?}", job_id, stop_queue_key, script_type);
        Ok(())
    }

    /// Get logs for a job by reading from its log file
    pub async fn get_job_logs(&self, job_id: &str) -> Result<Option<String>, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
        let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);

        // Get the job data to find the log path
        let result_map: Option<std::collections::HashMap<String, String>> =
            conn.hgetall(&job_key).await?;

        match result_map {
            Some(map) => {
                if let Some(log_path) = map.get("log_path") {
                    // Try to read the log file
                    match std::fs::read_to_string(log_path) {
                        Ok(contents) => Ok(Some(contents)),
                        Err(e) => {
                            warn!("Failed to read log file {}: {}", log_path, e);
                            Ok(None)
                        }
                    }
                } else {
                    // No log path configured for this job
                    Ok(None)
                }
            }
            None => {
                warn!("Job {} not found in Redis", job_id);
                Ok(None)
            }
        }
    }

    /// Delete a specific job by ID
    pub async fn delete_job(&self, job_id: &str) -> Result<(), SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Use the shared Job struct's delete method
        Job::delete_from_redis(&mut conn, job_id).await
            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to delete job: {}", e)))?;

        info!("Job {} deleted successfully", job_id);
        Ok(())
    }

    /// Clear all jobs from Redis
    pub async fn clear_all_jobs(&self) -> Result<usize, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Get all job IDs first
        let job_ids = Job::list_all_job_ids(&mut conn).await
            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to list jobs: {}", e)))?;

        let count = job_ids.len();

        // Delete each job using the shared method
        for job_id in job_ids {
            Job::delete_from_redis(&mut conn, &job_id).await
                .map_err(|e| SupervisorError::InvalidInput(format!("Failed to delete job {}: {}", job_id, e)))?;
        }

        Ok(count)
    }

    /// Check if all prerequisites for a job are completed
    pub async fn check_prerequisites_completed(&self, job_id: &str) -> Result<bool, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Load the job using the shared Job struct
        let job = Job::load_from_redis(&mut conn, job_id).await
            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to load job: {}", e)))?;

        // Check each prerequisite job status
        for prereq_id in &job.prerequisites {
            let status = Job::get_status(&mut conn, prereq_id).await
                .map_err(|e| SupervisorError::InvalidInput(format!("Failed to get prerequisite status: {}", e)))?;

            if status != JobStatus::Finished {
                return Ok(false); // Prerequisite not completed
            }
        }

        Ok(true) // All prerequisites completed (or no prerequisites)
    }

    /// Update job status and check dependent jobs for readiness.
    /// Returns the ids of dependents that became ready (moved from
    /// WaitingForPrerequisites to Dispatched).
    pub async fn update_job_status_and_check_dependents(&self, job_id: &str, new_status: JobStatus) -> Result<Vec<String>, SupervisorError> {
        let mut conn = self.redis_client.get_multiplexed_async_connection().await?;

        // Update job status using shared Job method
        Job::update_status(&mut conn, job_id, new_status.clone()).await
            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to update job status: {}", e)))?;

        let mut ready_jobs = Vec::new();

        // If job finished, check dependent jobs
        if new_status == JobStatus::Finished {
            // Load the job to get its dependents
            let job = Job::load_from_redis(&mut conn, job_id).await
                .map_err(|e| SupervisorError::InvalidInput(format!("Failed to load job: {}", e)))?;

            // Check each dependent job
            for dependent_id in &job.dependents {
                let dependent_status = Job::get_status(&mut conn, dependent_id).await
                    .map_err(|e| SupervisorError::InvalidInput(format!("Failed to get dependent status: {}", e)))?;

                // Only check jobs that are waiting for prerequisites
                if dependent_status == JobStatus::WaitingForPrerequisites {
                    // Check if all prerequisites are now completed
                    if self.check_prerequisites_completed(dependent_id).await? {
                        // Update status to dispatched and add to ready jobs
                        Job::update_status(&mut conn, dependent_id, JobStatus::Dispatched).await
                            .map_err(|e| SupervisorError::InvalidInput(format!("Failed to update dependent status: {}", e)))?;
                        ready_jobs.push(dependent_id.clone());
                    }
                }
            }
        }

        Ok(ready_jobs)
    }

    /// Dispatch jobs that are ready (have all prerequisites completed)
    pub async fn dispatch_ready_jobs(&self, ready_job_ids: Vec<String>) -> Result<(), SupervisorError> {
        for job_id in ready_job_ids {
            // Get job data to determine script type and select worker
            let mut conn = self.redis_client.get_multiplexed_async_connection().await?;
            let job_key = format!("{}job:{}", NAMESPACE_PREFIX, job_id);
            let job_data: std::collections::HashMap<String, String> = conn.hgetall(&job_key).await?;

            if let Some(script_type_str) = job_data.get("script_type") {
                // Parse script type (stored as Debug format, e.g., "OSIS")
                let script_type = match script_type_str.as_str() {
                    "OSIS" => ScriptType::OSIS,
                    "SAL" => ScriptType::SAL,
                    "V" => ScriptType::V,
                    "Python" => ScriptType::Python,
                    _ => return Err(SupervisorError::InvalidInput(format!("Unknown script type: {}", script_type_str))),
                };

                // Dispatch job using hardcoded queue
                self.start_job_using_connection(&mut conn, job_id, &script_type).await?;
            }
        }
        Ok(())
    }
}
|
||||
|
368
core/supervisor/src/lifecycle.rs
Normal file
368
core/supervisor/src/lifecycle.rs
Normal file
@@ -0,0 +1,368 @@
|
||||
//! Worker lifecycle management functionality for the Hero Supervisor
|
||||
//!
|
||||
//! This module provides worker process lifecycle management using Zinit as the process manager.
|
||||
//! All functionality is implemented as methods on the Supervisor struct for a clean API.
|
||||
|
||||
use log::{debug, error, info, warn};
|
||||
use serde_json::json;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
|
||||
use hero_job::ScriptType;
|
||||
use crate::{Supervisor, SupervisorError};
|
||||
|
||||
/// Information about a worker including its configuration and current status
#[derive(Debug, Clone)]
pub struct WorkerInfo {
    // Static configuration the worker was (or will be) launched with.
    pub config: WorkerConfig,
    // Most recent status reported by Zinit; `None` when the status query
    // failed (e.g. the service does not exist yet).
    pub status: Option<ServiceStatus>,
    // Convenience flag: true when Zinit reports the service state as
    // `Running` with a live PID (> 0).
    pub is_running: bool,
}
|
||||
|
||||
/// Configuration for a worker binary.
///
/// Construct with `WorkerConfig::new` and refine via the builder-style
/// `with_*` methods; `create_service_config` translates this into a
/// Zinit service definition.
#[derive(Debug, Clone)]
pub struct WorkerConfig {
    /// Name of the worker service (also used as the Zinit service name)
    pub name: String,
    /// Path to the worker binary
    pub binary_path: PathBuf,
    /// Script type this worker handles
    pub script_type: ScriptType,
    /// Command line arguments for the worker (appended to the exec line)
    pub args: Vec<String>,
    /// Environment variables for the worker
    pub env: HashMap<String, String>,
    /// Whether this worker should restart on exit
    /// (mapped to the inverse of Zinit's "oneshot" flag)
    pub restart_on_exit: bool,
    /// Health check command (optional; mapped to Zinit's "test" field)
    pub health_check: Option<String>,
    /// Dependencies that must be running first (mapped to Zinit's "after")
    pub dependencies: Vec<String>,
}
|
||||
|
||||
impl WorkerConfig {
|
||||
pub fn new(name: String, binary_path: PathBuf, script_type: ScriptType) -> Self {
|
||||
Self {
|
||||
name,
|
||||
binary_path,
|
||||
script_type,
|
||||
args: Vec::new(),
|
||||
env: HashMap::new(),
|
||||
restart_on_exit: true,
|
||||
health_check: None,
|
||||
dependencies: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_args(mut self, args: Vec<String>) -> Self {
|
||||
self.args = args;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_env(mut self, env: HashMap<String, String>) -> Self {
|
||||
self.env = env;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_health_check(mut self, health_check: String) -> Self {
|
||||
self.health_check = Some(health_check);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_dependencies(mut self, dependencies: Vec<String>) -> Self {
|
||||
self.dependencies = dependencies;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn no_restart(mut self) -> Self {
|
||||
self.restart_on_exit = false;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Worker lifecycle management methods for Supervisor
|
||||
impl Supervisor {
|
||||
/// Get all workers with their configuration and status - unified method
|
||||
pub async fn get_workers(&self, worker_configs: &[WorkerConfig]) -> Vec<WorkerInfo> {
|
||||
let mut workers = Vec::new();
|
||||
|
||||
for config in worker_configs {
|
||||
let status = self.zinit_client.status(&config.name).await.ok();
|
||||
let is_running = status.as_ref()
|
||||
.map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
|
||||
.unwrap_or(false);
|
||||
|
||||
workers.push(WorkerInfo {
|
||||
config: config.clone(),
|
||||
status,
|
||||
is_running,
|
||||
});
|
||||
}
|
||||
|
||||
workers
|
||||
}
|
||||
|
||||
/// Start a worker using Zinit
|
||||
pub async fn start_worker(
|
||||
&self,
|
||||
worker_config: &WorkerConfig,
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Starting worker: {}", worker_config.name);
|
||||
|
||||
// Create service configuration for Zinit
|
||||
let service_config = self.create_service_config(worker_config);
|
||||
|
||||
// Create the service in Zinit
|
||||
self.zinit_client.create_service(&worker_config.name, service_config).await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
|
||||
|
||||
// Start the service
|
||||
self.zinit_client.start(&worker_config.name).await
|
||||
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
|
||||
|
||||
info!("Successfully started worker: {}", worker_config.name);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop a worker using Zinit
|
||||
pub async fn stop_worker(
|
||||
&self,
|
||||
worker_name: &str,
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Stopping worker: {}", worker_name);
|
||||
|
||||
match self.zinit_client.stop(worker_name).await {
|
||||
Ok(_) => {
|
||||
info!("Successfully stopped worker: {}", worker_name);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to stop worker {}: {}", worker_name, e);
|
||||
Err(SupervisorError::WorkerStopFailed(worker_name.to_string(), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Restart a worker using Zinit
|
||||
pub async fn restart_worker(
|
||||
&self,
|
||||
worker_name: &str,
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Restarting worker: {}", worker_name);
|
||||
|
||||
match self.zinit_client.restart(worker_name).await {
|
||||
Ok(_) => {
|
||||
info!("Successfully restarted worker: {}", worker_name);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to restart worker {}: {}", worker_name, e);
|
||||
Err(SupervisorError::WorkerRestartFailed(worker_name.to_string(), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get status of a worker using Zinit
|
||||
pub async fn get_worker_status(
|
||||
&self,
|
||||
worker_name: &str,
|
||||
zinit_client: &ZinitClient,
|
||||
) -> Result<ServiceStatus, SupervisorError> {
|
||||
match zinit_client.status(worker_name).await {
|
||||
Ok(status) => Ok(status),
|
||||
Err(e) => {
|
||||
error!("Failed to get status for worker {}: {}", worker_name, e);
|
||||
Err(SupervisorError::WorkerStatusFailed(worker_name.to_string(), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get status of all workers
|
||||
pub async fn get_all_worker_status(
|
||||
&self,
|
||||
worker_configs: &[WorkerConfig],
|
||||
zinit_client: &ZinitClient,
|
||||
) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
|
||||
let mut status_map = HashMap::new();
|
||||
|
||||
for worker in worker_configs {
|
||||
match zinit_client.status(&worker.name).await {
|
||||
Ok(status) => {
|
||||
status_map.insert(worker.name.clone(), status);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to get status for worker {}: {}", worker.name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(status_map)
|
||||
}
|
||||
|
||||
/// Start multiple workers
|
||||
pub async fn start_workers(
|
||||
&self,
|
||||
worker_configs: &[WorkerConfig],
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Starting {} workers", worker_configs.len());
|
||||
|
||||
for worker in worker_configs {
|
||||
self.start_worker(worker).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop multiple workers
|
||||
pub async fn stop_workers(
|
||||
&self,
|
||||
worker_names: &[String],
|
||||
) -> Result<(), SupervisorError> {
|
||||
info!("Stopping {} workers", worker_names.len());
|
||||
|
||||
for worker_name in worker_names {
|
||||
self.stop_worker(worker_name).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get count of running workers for a script type
|
||||
pub async fn get_running_worker_count(
|
||||
&self,
|
||||
worker_configs: &[WorkerConfig],
|
||||
script_type: &ScriptType,
|
||||
zinit_client: &ZinitClient,
|
||||
) -> usize {
|
||||
let mut running_count = 0;
|
||||
|
||||
for worker in worker_configs {
|
||||
if worker.script_type == *script_type {
|
||||
if let Ok(status) = zinit_client.status(&worker.name).await {
|
||||
if status.state == ServiceState::Running {
|
||||
running_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
running_count
|
||||
}
|
||||
|
||||
/// Send a ping job to a worker for health checking
|
||||
pub async fn send_ping_job(
|
||||
&self,
|
||||
script_type: ScriptType,
|
||||
) -> Result<(), SupervisorError> {
|
||||
// Create a ping job
|
||||
let ping_job = self
|
||||
.new_job()
|
||||
.script_type(script_type.clone())
|
||||
.script("ping") // Simple ping script
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()?;
|
||||
|
||||
// Execute the ping job with a short timeout
|
||||
match self.run_job_and_await_result(&ping_job).await {
|
||||
Ok(_) => {
|
||||
debug!("Ping job successful for script type: {:?}", script_type);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Ping job failed for script type {:?}: {}", script_type, e);
|
||||
Err(SupervisorError::PingJobFailed(format!("{:?}", script_type), e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create Zinit service configuration from worker config
|
||||
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
|
||||
let mut config = json!({
|
||||
"exec": format!("{} {}",
|
||||
worker.binary_path.display(),
|
||||
worker.args.join(" ")
|
||||
),
|
||||
"oneshot": !worker.restart_on_exit,
|
||||
});
|
||||
|
||||
if let Some(health_check) = &worker.health_check {
|
||||
config["test"] = json!(health_check);
|
||||
}
|
||||
|
||||
if !worker.dependencies.is_empty() {
|
||||
config["after"] = json!(worker.dependencies);
|
||||
}
|
||||
|
||||
// Add environment variables if any
|
||||
if !worker.env.is_empty() {
|
||||
config["env"] = json!(worker.env);
|
||||
}
|
||||
|
||||
config
|
||||
}
|
||||
|
||||
/// Launch workers based on SupervisorBuilder configuration
|
||||
pub(crate) async fn launch_configured_workers(&self, builder: &crate::SupervisorBuilderData) -> Result<(), SupervisorError> {
|
||||
use hero_job::ScriptType;
|
||||
use std::path::PathBuf;
|
||||
|
||||
// Launch OSIS worker if configured
|
||||
if let Some(binary_path) = &builder.osis_worker {
|
||||
let worker_id = "osis_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::OSIS
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
|
||||
info!("Launching OSIS worker: {}", worker_id);
|
||||
self.start_worker(&config).await?;
|
||||
}
|
||||
|
||||
// Launch SAL worker if configured
|
||||
if let Some(binary_path) = &builder.sal_worker {
|
||||
let worker_id = "sal_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::SAL
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
|
||||
info!("Launching SAL worker: {}", worker_id);
|
||||
self.start_worker(&config).await?;
|
||||
}
|
||||
|
||||
// Launch V worker if configured
|
||||
if let Some(binary_path) = &builder.v_worker {
|
||||
let worker_id = "v_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::V
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
|
||||
info!("Launching V worker: {}", worker_id);
|
||||
self.start_worker(&config).await?;
|
||||
}
|
||||
|
||||
// Launch Python worker if configured
|
||||
if let Some(binary_path) = &builder.python_worker {
|
||||
let worker_id = "python_worker_1";
|
||||
let mut config = WorkerConfig::new(
|
||||
worker_id.to_string(),
|
||||
PathBuf::from(binary_path),
|
||||
ScriptType::Python
|
||||
);
|
||||
config.env.extend(builder.worker_env_vars.clone());
|
||||
|
||||
info!("Launching Python worker: {}", worker_id);
|
||||
self.start_worker(&config).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user