refactor wip

This commit is contained in:
Timur Gordon
2025-08-05 12:19:38 +02:00
parent 8ed40ce99c
commit 7a652c9c3c
51 changed files with 6183 additions and 840 deletions

View File

@@ -31,6 +31,8 @@ pub enum SupervisorError {
/// Zinit client operation error
ZinitError(String),
SupervisorNotConfigured,
/// Configuration file parsing or validation error
ConfigError(String),
}
impl From<redis::RedisError> for SupervisorError {
@@ -95,6 +97,9 @@ impl std::fmt::Display for SupervisorError {
SupervisorError::SupervisorNotConfigured => {
write!(f, "Supervisor not configured for health monitoring")
}
SupervisorError::ConfigError(msg) => {
write!(f, "Configuration error: {}", msg)
}
}
}
}
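The new ConfigError variant sits alongside ZinitError and SupervisorNotConfigured, letting callers separate configuration mistakes from runtime failures. A minimal sketch of branching on it, with the crate path and helper name assumed for illustration (not part of this commit):

use hero_supervisor::SupervisorError; // crate path assumed for illustration

// Illustrative helper: report configuration mistakes differently from runtime errors.
fn report(err: &SupervisorError) {
    match err {
        SupervisorError::ConfigError(msg) => eprintln!("check the TOML config: {}", msg),
        SupervisorError::ZinitError(msg) => eprintln!("zinit operation failed: {}", msg),
        SupervisorError::SupervisorNotConfigured => eprintln!("health monitoring is not configured"),
        other => eprintln!("{}", other), // remaining variants fall back to their Display output
    }
}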

View File

@@ -1,9 +1,14 @@
use log::{debug, error, info, warn};
use redis::AsyncCommands;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use hero_job::NAMESPACE_PREFIX;
use zinit_client::ZinitClient;
use zinit_client::Client as ZinitClient;
mod job;
mod error;
@@ -23,46 +28,209 @@ pub struct Supervisor {
pub struct SupervisorBuilder {
redis_url: Option<String>,
zinit_socket_path: Option<String>,
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// Helper struct to pass builder data to the worker launch method
#[derive(Clone)]
struct SupervisorBuilderData {
osis_worker: Option<String>,
sal_worker: Option<String>,
v_worker: Option<String>,
python_worker: Option<String>,
worker_env_vars: HashMap<String, String>,
websocket_config: Option<WebSocketServerConfig>,
}
/// TOML configuration structure for the supervisor
#[derive(Debug, Deserialize, Serialize)]
pub struct SupervisorConfig {
pub global: GlobalConfig,
pub websocket_server: Option<WebSocketServerConfig>,
pub osis_worker: Option<WorkerConfigToml>,
pub sal_worker: Option<WorkerConfigToml>,
pub v_worker: Option<WorkerConfigToml>,
pub python_worker: Option<WorkerConfigToml>,
}
/// Global configuration section
#[derive(Debug, Deserialize, Serialize)]
pub struct GlobalConfig {
pub redis_url: String,
}
/// Worker configuration section in TOML
#[derive(Debug, Deserialize, Serialize)]
pub struct WorkerConfigToml {
pub binary_path: String,
#[serde(default)]
pub env_vars: HashMap<String, String>,
}
/// WebSocket server configuration section in TOML
/// This mirrors the ServerConfig from hero_websocket_server but avoids a circular dependency
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct WebSocketServerConfig {
/// Server host address
#[serde(default = "default_host")]
pub host: String,
/// Server port
#[serde(default = "default_port")]
pub port: u16,
/// Redis connection URL
#[serde(default = "default_redis_url")]
pub redis_url: String,
/// Enable authentication
#[serde(default)]
pub auth: bool,
/// Enable TLS/WSS
#[serde(default)]
pub tls: bool,
/// Path to TLS certificate file
pub cert: Option<String>,
/// Path to TLS private key file
pub key: Option<String>,
/// Separate port for TLS connections
pub tls_port: Option<u16>,
/// Circles configuration - maps circle names to lists of member public keys
#[serde(default)]
pub circles: HashMap<String, Vec<String>>,
}
// Default value functions for WebSocket server config
fn default_host() -> String {
"127.0.0.1".to_string()
}
fn default_port() -> u16 {
8443
}
fn default_redis_url() -> String {
"redis://127.0.0.1/".to_string()
}
impl SupervisorBuilder {
pub fn new() -> Self {
Self {
redis_url: None,
zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
osis_worker: None,
sal_worker: None,
v_worker: None,
python_worker: None,
worker_env_vars: HashMap::new(),
websocket_config: None,
}
}
/// Create a SupervisorBuilder from a TOML configuration file
pub fn from_toml<P: AsRef<Path>>(toml_path: P) -> Result<Self, SupervisorError> {
let toml_content = fs::read_to_string(toml_path)
.map_err(|e| SupervisorError::ConfigError(format!("Failed to read TOML file: {}", e)))?;
let config: SupervisorConfig = toml::from_str(&toml_content)
.map_err(|e| SupervisorError::ConfigError(format!("Failed to parse TOML: {}", e)))?;
let mut builder = Self::new()
.redis_url(&config.global.redis_url);
// Configure workers based on TOML config
if let Some(osis_config) = config.osis_worker {
builder = builder.osis_worker(&osis_config.binary_path)
.worker_env_vars(osis_config.env_vars);
}
if let Some(sal_config) = config.sal_worker {
builder = builder.sal_worker(&sal_config.binary_path)
.worker_env_vars(sal_config.env_vars);
}
if let Some(v_config) = config.v_worker {
builder = builder.v_worker(&v_config.binary_path)
.worker_env_vars(v_config.env_vars);
}
if let Some(python_config) = config.python_worker {
builder = builder.python_worker(&python_config.binary_path)
.worker_env_vars(python_config.env_vars);
}
// Store WebSocket configuration for later use
if let Some(ws_config) = config.websocket_server {
builder.websocket_config = Some(ws_config);
}
Ok(builder)
}
/// Validate that all configured worker binaries exist and are executable
fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
let workers = [
("OSIS", &self.osis_worker),
("SAL", &self.sal_worker),
("V", &self.v_worker),
("Python", &self.python_worker),
];
for (worker_type, binary_path) in workers {
if let Some(path) = binary_path {
let path_obj = Path::new(path);
if !path_obj.exists() {
return Err(SupervisorError::ConfigError(
format!("{} worker binary does not exist: {}", worker_type, path)
));
}
if !path_obj.is_file() {
return Err(SupervisorError::ConfigError(
format!("{} worker path is not a file: {}", worker_type, path)
));
}
// Check if the file is executable (Unix-like systems)
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
let metadata = path_obj.metadata().map_err(|e| {
SupervisorError::ConfigError(
format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
)
})?;
let permissions = metadata.permissions();
if permissions.mode() & 0o111 == 0 {
return Err(SupervisorError::ConfigError(
format!("{} worker binary is not executable: {}", worker_type, path)
));
}
}
info!("Validated {} worker binary: {}", worker_type, path);
}
}
Ok(())
}
pub fn redis_url(mut self, url: &str) -> Self {
self.redis_url = Some(url.to_string());
self
}
pub fn zinit_socket_path(mut self, path: &str) -> Self {
self.zinit_socket_path = Some(path.to_string());
self
}
pub fn osis_worker(mut self, binary_path: &str) -> Self {
self.osis_worker = Some(binary_path.to_string());
self
@@ -95,21 +263,23 @@ impl SupervisorBuilder {
/// Builds the final `Supervisor` instance synchronously.
///
/// This method validates the configuration and creates the Redis client.
/// Worker launching is deferred to the `start_workers()` method.
/// This method validates the configuration, checks worker binary existence,
/// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
///
/// # Returns
///
/// * `Ok(Supervisor)` - Successfully configured client
/// * `Err(SupervisorError)` - Configuration or connection error
pub fn build(self) -> Result<Supervisor, SupervisorError> {
/// * `Ok(Supervisor)` - Successfully configured client with valid binaries
/// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
pub async fn build(self) -> Result<Supervisor, SupervisorError> {
// Validate that all configured worker binaries exist first
self.validate_worker_binaries()?;
let url = self.redis_url
.unwrap_or_else(|| "redis://127.0.0.1/".to_string());
let client = redis::Client::open(url)?;
let zinit_socket = self.zinit_socket_path
.unwrap_or_else(|| "/var/run/zinit.sock".to_string());
let zinit_client = ZinitClient::new(&zinit_socket);
let zinit_client = ZinitClient::unix_socket(&zinit_socket).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client at {}: {}", zinit_socket, e)))?;
// Store builder data for later use in start_workers()
let builder_data = SupervisorBuilderData {
@@ -118,6 +288,7 @@ impl SupervisorBuilder {
v_worker: self.v_worker,
python_worker: self.python_worker,
worker_env_vars: self.worker_env_vars,
websocket_config: self.websocket_config,
};
let supervisor = Supervisor {
@@ -134,14 +305,33 @@ impl Supervisor {
/// Start all configured workers asynchronously.
/// This method should be called after build() to launch the workers.
pub async fn start_workers(&self) -> Result<(), SupervisorError> {
info!("Starting Hero Supervisor workers...");
// Test Zinit connection first
info!("Testing Zinit connection at /tmp/zinit.sock...");
match self.zinit_client.list().await {
Ok(services) => {
info!("Successfully connected to Zinit. Current services: {:?}", services);
}
Err(e) => {
error!("Failed to connect to Zinit: {:?}", e);
return Err(SupervisorError::ZinitError(format!("Zinit connection failed: {}", e)));
}
}
// Clean up any existing worker services first
info!("Cleaning up existing worker services...");
self.cleanup_existing_workers().await?;
// Launch configured workers if builder data is available
if let Some(builder_data) = &self.builder_data {
info!("Launching configured workers...");
self.launch_configured_workers(builder_data).await?;
} else {
warn!("No builder data available, no workers to start");
}
info!("All workers started successfully!");
Ok(())
}
@@ -179,7 +369,11 @@ impl Supervisor {
for worker_name in worker_names {
// Try to stop and delete, but don't fail if they don't exist
let _ = self.stop_and_delete_worker(worker_name).await;
info!("Attempting to cleanup worker: {}", worker_name);
match self.stop_and_delete_worker(worker_name).await {
Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
}
}
info!("Existing worker cleanup completed");
@@ -188,18 +382,33 @@ impl Supervisor {
/// Stop and delete a worker service from zinit
async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
info!("Starting cleanup for worker: {}", worker_name);
// First try to stop the worker
info!("Attempting to stop worker: {}", worker_name);
if let Err(e) = self.zinit_client.stop(worker_name).await {
debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
} else {
info!("Successfully stopped worker: {}", worker_name);
}
// Then try to delete the service
if let Err(e) = self.zinit_client.delete(worker_name).await {
// Then forget the service to stop monitoring it
info!("Attempting to forget worker: {}", worker_name);
if let Err(e) = self.zinit_client.forget(worker_name).await {
info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
} else {
info!("Successfully forgot worker service: {}", worker_name);
}
// Finally, delete the service configuration
info!("Attempting to delete service for worker: {}", worker_name);
if let Err(e) = self.zinit_client.delete_service(worker_name).await {
debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
} else {
info!("Successfully deleted worker service: {}", worker_name);
}
info!("Completed cleanup for worker: {}", worker_name);
Ok(())
}
@@ -211,6 +420,157 @@ impl Supervisor {
pub fn new_job(&self) -> JobBuilder {
JobBuilder::new(self)
}
/// Get WebSocket server configuration from TOML config
pub fn get_websocket_config(&self) -> Result<WebSocketServerConfig, SupervisorError> {
let builder_data = self.builder_data.as_ref().ok_or_else(|| {
SupervisorError::ConfigError("No builder data available for WebSocket config".to_string())
})?;
builder_data.websocket_config.clone().ok_or_else(|| {
SupervisorError::ConfigError("No WebSocket server configuration found in TOML config".to_string())
})
}
/// Extract worker configurations from the supervisor's builder data
pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
let builder_data = self.builder_data.as_ref().ok_or_else(|| {
SupervisorError::ConfigError("No builder data available for worker configs".to_string())
})?;
let mut configs = Vec::new();
let env_vars = builder_data.worker_env_vars.clone();
if let Some(osis_path) = &builder_data.osis_worker {
configs.push(
WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
.with_env(env_vars.clone())
);
}
if let Some(sal_path) = &builder_data.sal_worker {
configs.push(
WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
.with_env(env_vars.clone())
);
}
if let Some(v_path) = &builder_data.v_worker {
configs.push(
WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
.with_env(env_vars.clone())
);
}
if let Some(python_path) = &builder_data.python_worker {
configs.push(
WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
.with_env(env_vars.clone())
);
}
Ok(configs)
}
/// Spawn a background lifecycle manager that continuously monitors and maintains worker health
/// Returns a JoinHandle that can be used to stop the lifecycle manager
pub fn spawn_lifecycle_manager(
self: Arc<Self>,
worker_configs: Vec<WorkerConfig>,
health_check_interval: Duration,
) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
let supervisor = self;
tokio::spawn(async move {
info!("Starting background lifecycle manager with {} workers", worker_configs.len());
info!("Health check interval: {:?}", health_check_interval);
// Initial worker startup
info!("Performing initial worker startup...");
if let Err(e) = supervisor.start_workers().await {
error!("Failed to start workers during initialization: {}", e);
return Err(e);
}
// Start the monitoring loop
let mut interval = tokio::time::interval(health_check_interval);
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
interval.tick().await;
info!("Running periodic worker health check...");
// Check each worker's health and restart if needed
for worker_config in &worker_configs {
if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
error!("Failed to check/restart worker {}: {}", worker_config.name, e);
}
}
info!("Health check cycle completed");
}
})
}
/// Check a single worker's health and restart if needed
async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
let worker_name = &worker_config.name;
// Get worker status
match self.zinit_client.status(worker_name).await {
Ok(status) => {
let is_healthy = status.state == "running" && status.pid > 0;
if is_healthy {
debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);
// Optionally send a ping job for deeper health check
if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
warn!("Ping job failed for worker {}: {}", worker_name, e);
// Note: We don't restart on ping failure as it might be temporary
}
} else {
warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
worker_name, status.state, status.pid);
// Attempt to restart the worker
if let Err(e) = self.restart_worker(worker_name).await {
error!("Failed to restart unhealthy worker {}: {}", worker_name, e);
// If restart fails, try a full stop/start cycle
warn!("Attempting full stop/start cycle for worker: {}", worker_name);
if let Err(e) = self.stop_and_delete_worker(worker_name).await {
error!("Failed to stop worker {} during recovery: {}", worker_name, e);
}
if let Err(e) = self.start_worker(worker_config).await {
error!("Failed to start worker {} during recovery: {}", worker_name, e);
return Err(e);
}
info!("Successfully recovered worker: {}", worker_name);
} else {
info!("Successfully restarted worker: {}", worker_name);
}
}
}
Err(e) => {
warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);
// Worker doesn't exist, try to start it
info!("Attempting to start missing worker: {}", worker_name);
if let Err(e) = self.start_worker(worker_config).await {
error!("Failed to start missing worker {}: {}", worker_name, e);
return Err(e);
}
info!("Successfully started missing worker: {}", worker_name);
}
}
Ok(())
}
// Internal helper to submit script details and push to work queue
async fn create_job_using_connection(

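For orientation, the shape of TOML that SupervisorConfig and its sections above accept is sketched below as an inline string; the crate path, binary path, and values are placeholders, and any [websocket_server] fields left out fall back to the serde defaults defined earlier:

// Sketch only: crate path and all values are placeholders.
use hero_supervisor::SupervisorConfig;

fn main() {
    let toml_src = r#"
[global]
redis_url = "redis://127.0.0.1/"

[websocket_server]
port = 8443
auth = true

[osis_worker]
binary_path = "/usr/local/bin/osis_worker"

[osis_worker.env_vars]
RUST_LOG = "info"
"#;

    let config: SupervisorConfig = toml::from_str(toml_src)
        .expect("placeholder config should parse");
    println!("redis: {}", config.global.redis_url);
}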
View File

@@ -8,7 +8,7 @@ use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
use zinit_client::{Client as ZinitClient, Status};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};
@@ -16,7 +16,7 @@ use crate::{Supervisor, SupervisorError};
#[derive(Debug, Clone)]
pub struct WorkerInfo {
pub config: WorkerConfig,
pub status: Option<ServiceStatus>,
pub status: Option<Status>,
pub is_running: bool,
}
@@ -90,7 +90,7 @@ impl Supervisor {
for config in worker_configs {
let status = self.zinit_client.status(&config.name).await.ok();
let is_running = status.as_ref()
.map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
.map(|s| s.state == "running" && s.pid > 0)
.unwrap_or(false);
workers.push(WorkerInfo {
@@ -117,6 +117,10 @@ impl Supervisor {
self.zinit_client.create_service(&worker_config.name, service_config).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;
// Monitor the service so Zinit starts managing it
self.zinit_client.monitor(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;
// Start the service
self.zinit_client.start(&worker_config.name).await
.map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
@@ -168,7 +172,7 @@ impl Supervisor {
&self,
worker_name: &str,
zinit_client: &ZinitClient,
) -> Result<ServiceStatus, SupervisorError> {
) -> Result<Status, SupervisorError> {
match zinit_client.status(worker_name).await {
Ok(status) => Ok(status),
Err(e) => {
@@ -183,7 +187,7 @@ impl Supervisor {
&self,
worker_configs: &[WorkerConfig],
zinit_client: &ZinitClient,
) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
) -> Result<HashMap<String, Status>, SupervisorError> {
let mut status_map = HashMap::new();
for worker in worker_configs {
@@ -200,19 +204,7 @@ impl Supervisor {
Ok(status_map)
}
/// Start multiple workers
pub async fn start_workers(
&self,
worker_configs: &[WorkerConfig],
) -> Result<(), SupervisorError> {
info!("Starting {} workers", worker_configs.len());
for worker in worker_configs {
self.start_worker(worker).await?;
}
Ok(())
}
/// Stop multiple workers
pub async fn stop_workers(
@@ -240,7 +232,7 @@ impl Supervisor {
for worker in worker_configs {
if worker.script_type == *script_type {
if let Ok(status) = zinit_client.status(&worker.name).await {
if status.state == ServiceState::Running {
if status.state == "running" {
running_count += 1;
}
}
@@ -277,26 +269,35 @@ impl Supervisor {
}
/// Create Zinit service configuration from worker config
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
let mut config = json!({
"exec": format!("{} {}",
fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
use serde_json::{Map, Value};
let mut config = Map::new();
config.insert(
"exec".to_string(),
Value::String(format!("{} {}",
worker.binary_path.display(),
worker.args.join(" ")
),
"oneshot": !worker.restart_on_exit,
});
))
);
config.insert(
"oneshot".to_string(),
Value::Bool(!worker.restart_on_exit)
);
if let Some(health_check) = &worker.health_check {
config["test"] = json!(health_check);
config.insert("test".to_string(), Value::String(health_check.clone()));
}
if !worker.dependencies.is_empty() {
config["after"] = json!(worker.dependencies);
config.insert("after".to_string(), json!(worker.dependencies));
}
// Add environment variables if any
if !worker.env.is_empty() {
config["env"] = json!(worker.env);
config.insert("env".to_string(), json!(worker.env));
}
config
@@ -307,6 +308,8 @@ impl Supervisor {
use hero_job::ScriptType;
use std::path::PathBuf;
let mut errors = Vec::new();
// Launch OSIS worker if configured
if let Some(binary_path) = &builder.osis_worker {
let worker_id = "osis_worker_1";
@@ -318,7 +321,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching OSIS worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start OSIS worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch SAL worker if configured
@@ -332,7 +339,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching SAL worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start SAL worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch V worker if configured
@@ -346,7 +357,11 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching V worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start V worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
// Launch Python worker if configured
@@ -360,9 +375,21 @@ impl Supervisor {
config.env.extend(builder.worker_env_vars.clone());
info!("Launching Python worker: {}", worker_id);
self.start_worker(&config).await?;
if let Err(e) = self.start_worker(&config).await {
let error_msg = format!("Failed to start Python worker: {}", e);
warn!("{}", error_msg);
errors.push(error_msg);
}
}
Ok(())
// Return result based on whether any workers started successfully
if errors.is_empty() {
info!("All configured workers started successfully");
Ok(())
} else {
let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
warn!("{}", combined_error);
Err(SupervisorError::ZinitError(combined_error))
}
}
}
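Taken together, the pieces added in this commit would be driven roughly as sketched below, assuming a tokio runtime; the crate path, config file location, and health-check interval are placeholders, not part of this commit:

use std::sync::Arc;
use std::time::Duration;
use hero_supervisor::SupervisorBuilder; // crate path assumed for illustration

#[tokio::main]
async fn main() {
    // Placeholder path; from_toml reads and parses the file, build() validates binaries
    // and connects to Redis and Zinit.
    let supervisor = SupervisorBuilder::from_toml("config/supervisor.toml")
        .expect("read and parse supervisor TOML")
        .build()
        .await
        .expect("validate binaries and connect to Redis/Zinit");

    // Derived from the same TOML data captured by the builder.
    let worker_configs = supervisor.get_worker_configs()
        .expect("worker configs from TOML");
    let _ws_config = supervisor.get_websocket_config().ok(); // present only when [websocket_server] is configured

    // The lifecycle manager starts the workers, then loops on periodic health checks.
    let handle = Arc::new(supervisor)
        .spawn_lifecycle_manager(worker_configs, Duration::from_secs(30));

    // The task normally runs forever; an Err here means startup or recovery failed.
    if let Err(e) = handle.await.expect("lifecycle manager task panicked") {
        eprintln!("lifecycle manager stopped: {}", e);
    }
}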