refactor wip
@@ -31,6 +31,8 @@ pub enum SupervisorError {
    /// Zinit client operation error
    ZinitError(String),
    SupervisorNotConfigured,
    /// Configuration file parsing error
    ConfigError(String),
}

impl From<redis::RedisError> for SupervisorError {
@@ -95,6 +97,9 @@ impl std::fmt::Display for SupervisorError {
            SupervisorError::SupervisorNotConfigured => {
                write!(f, "Supervisor not configured for health monitoring")
            }
            SupervisorError::ConfigError(msg) => {
                write!(f, "Configuration error: {}", msg)
            }
        }
    }
}

@@ -1,9 +1,14 @@
use log::{debug, error, info, warn};
use redis::AsyncCommands;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use hero_job::NAMESPACE_PREFIX;
use zinit_client::ZinitClient;
use zinit_client::Client as ZinitClient;

mod job;
mod error;
@@ -23,46 +28,209 @@ pub struct Supervisor {

pub struct SupervisorBuilder {
    redis_url: Option<String>,
    zinit_socket_path: Option<String>,
    osis_worker: Option<String>,
    sal_worker: Option<String>,
    v_worker: Option<String>,
    python_worker: Option<String>,
    worker_env_vars: HashMap<String, String>,
    websocket_config: Option<WebSocketServerConfig>,
}

/// Helper struct to pass builder data to worker launch method
#[derive(Clone)]
struct SupervisorBuilderData {
    osis_worker: Option<String>,
    sal_worker: Option<String>,
    v_worker: Option<String>,
    python_worker: Option<String>,
    worker_env_vars: HashMap<String, String>,
    websocket_config: Option<WebSocketServerConfig>,
}

/// TOML configuration structure for the supervisor
#[derive(Debug, Deserialize, Serialize)]
pub struct SupervisorConfig {
    pub global: GlobalConfig,
    pub websocket_server: Option<WebSocketServerConfig>,
    pub osis_worker: Option<WorkerConfigToml>,
    pub sal_worker: Option<WorkerConfigToml>,
    pub v_worker: Option<WorkerConfigToml>,
    pub python_worker: Option<WorkerConfigToml>,
}

/// Global configuration section
#[derive(Debug, Deserialize, Serialize)]
pub struct GlobalConfig {
    pub redis_url: String,
}

/// Worker configuration section in TOML
#[derive(Debug, Deserialize, Serialize)]
pub struct WorkerConfigToml {
    pub binary_path: String,
    #[serde(default)]
    pub env_vars: HashMap<String, String>,
}

/// WebSocket server configuration section in TOML
/// This mirrors the ServerConfig from hero_websocket_server but avoids circular dependency
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct WebSocketServerConfig {
    /// Server host address
    #[serde(default = "default_host")]
    pub host: String,

    /// Server port
    #[serde(default = "default_port")]
    pub port: u16,

    /// Redis connection URL
    #[serde(default = "default_redis_url")]
    pub redis_url: String,

    /// Enable authentication
    #[serde(default)]
    pub auth: bool,

    /// Enable TLS/WSS
    #[serde(default)]
    pub tls: bool,

    /// Path to TLS certificate file
    pub cert: Option<String>,

    /// Path to TLS private key file
    pub key: Option<String>,

    /// Separate port for TLS connections
    pub tls_port: Option<u16>,

    /// Circles configuration - maps circle names to lists of member public keys
    #[serde(default)]
    pub circles: HashMap<String, Vec<String>>,
}

// Default value functions for WebSocket server config
fn default_host() -> String {
    "127.0.0.1".to_string()
}

fn default_port() -> u16 {
    8443
}

fn default_redis_url() -> String {
    "redis://127.0.0.1/".to_string()
}
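For orientation, a minimal supervisor.toml that these structures (and from_toml below) would accept might look like the following; the file name, binary paths, and circle members are illustrative assumptions, not values from this commit:

# supervisor.toml (hypothetical example)
[global]
redis_url = "redis://127.0.0.1/"

[websocket_server]
host = "127.0.0.1"
port = 8443
auth = false
tls = false

[websocket_server.circles]
ops = ["pubkey_one", "pubkey_two"]

[osis_worker]
binary_path = "/usr/local/bin/osis_worker"

[osis_worker.env_vars]
RUST_LOG = "info"
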
impl SupervisorBuilder {
    pub fn new() -> Self {
        Self {
            redis_url: None,
            zinit_socket_path: Some("/var/run/zinit.sock".to_string()),
            osis_worker: None,
            sal_worker: None,
            v_worker: None,
            python_worker: None,
            worker_env_vars: HashMap::new(),
            websocket_config: None,
        }
    }

    /// Create a SupervisorBuilder from a TOML configuration file
    pub fn from_toml<P: AsRef<Path>>(toml_path: P) -> Result<Self, SupervisorError> {
        let toml_content = fs::read_to_string(toml_path)
            .map_err(|e| SupervisorError::ConfigError(format!("Failed to read TOML file: {}", e)))?;

        let config: SupervisorConfig = toml::from_str(&toml_content)
            .map_err(|e| SupervisorError::ConfigError(format!("Failed to parse TOML: {}", e)))?;

        let mut builder = Self::new()
            .redis_url(&config.global.redis_url);

        // Configure workers based on TOML config
        if let Some(osis_config) = config.osis_worker {
            builder = builder.osis_worker(&osis_config.binary_path)
                .worker_env_vars(osis_config.env_vars);
        }

        if let Some(sal_config) = config.sal_worker {
            builder = builder.sal_worker(&sal_config.binary_path)
                .worker_env_vars(sal_config.env_vars);
        }

        if let Some(v_config) = config.v_worker {
            builder = builder.v_worker(&v_config.binary_path)
                .worker_env_vars(v_config.env_vars);
        }

        if let Some(python_config) = config.python_worker {
            builder = builder.python_worker(&python_config.binary_path)
                .worker_env_vars(python_config.env_vars);
        }

        // Store WebSocket configuration for later use
        if let Some(ws_config) = config.websocket_server {
            builder.websocket_config = Some(ws_config);
        }

        Ok(builder)
    }

    /// Validate that all configured worker binaries exist and are executable
    fn validate_worker_binaries(&self) -> Result<(), SupervisorError> {
        let workers = [
            ("OSIS", &self.osis_worker),
            ("SAL", &self.sal_worker),
            ("V", &self.v_worker),
            ("Python", &self.python_worker),
        ];

        for (worker_type, binary_path) in workers {
            if let Some(path) = binary_path {
                let path_obj = Path::new(path);

                if !path_obj.exists() {
                    return Err(SupervisorError::ConfigError(
                        format!("{} worker binary does not exist: {}", worker_type, path)
                    ));
                }

                if !path_obj.is_file() {
                    return Err(SupervisorError::ConfigError(
                        format!("{} worker path is not a file: {}", worker_type, path)
                    ));
                }

                // Check if the file is executable (Unix-like systems)
                #[cfg(unix)]
                {
                    use std::os::unix::fs::PermissionsExt;
                    let metadata = path_obj.metadata().map_err(|e| {
                        SupervisorError::ConfigError(
                            format!("Failed to read metadata for {} worker binary {}: {}", worker_type, path, e)
                        )
                    })?;

                    let permissions = metadata.permissions();
                    if permissions.mode() & 0o111 == 0 {
                        return Err(SupervisorError::ConfigError(
                            format!("{} worker binary is not executable: {}", worker_type, path)
                        ));
                    }
                }

                info!("Validated {} worker binary: {}", worker_type, path);
            }
        }

        Ok(())
    }

    pub fn redis_url(mut self, url: &str) -> Self {
        self.redis_url = Some(url.to_string());
        self
    }

    pub fn zinit_socket_path(mut self, path: &str) -> Self {
        self.zinit_socket_path = Some(path.to_string());
        self
    }

    pub fn osis_worker(mut self, binary_path: &str) -> Self {
        self.osis_worker = Some(binary_path.to_string());
        self
@@ -95,21 +263,23 @@ impl SupervisorBuilder {

    /// Builds the final `Supervisor` instance synchronously.
    ///
    /// This method validates the configuration and creates the Redis client.
    /// Worker launching is deferred to the `start_workers()` method.
    /// This method validates the configuration, checks worker binary existence,
    /// and creates the Redis client. Worker launching is deferred to the `start_workers()` method.
    ///
    /// # Returns
    ///
    /// * `Ok(Supervisor)` - Successfully configured client
    /// * `Err(SupervisorError)` - Configuration or connection error
    pub fn build(self) -> Result<Supervisor, SupervisorError> {
    /// * `Ok(Supervisor)` - Successfully configured client with valid binaries
    /// * `Err(SupervisorError)` - Configuration, binary validation, or connection error
    pub async fn build(self) -> Result<Supervisor, SupervisorError> {
        // Validate that all configured worker binaries exist first
        Self::validate_worker_binaries(&self)?;

        let url = self.redis_url
            .unwrap_or_else(|| "redis://127.0.0.1/".to_string());
        let client = redis::Client::open(url)?;

        let zinit_socket = self.zinit_socket_path
            .unwrap_or_else(|| "/var/run/zinit.sock".to_string());
        let zinit_client = ZinitClient::new(&zinit_socket);
        let zinit_client = ZinitClient::unix_socket("/tmp/zinit.sock").await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to create Zinit client: {}", e)))?;

        // Store builder data for later use in start_workers()
        let builder_data = SupervisorBuilderData {
@@ -118,6 +288,7 @@ impl SupervisorBuilder {
            v_worker: self.v_worker,
            python_worker: self.python_worker,
            worker_env_vars: self.worker_env_vars,
            websocket_config: self.websocket_config,
        };

        let supervisor = Supervisor {
@@ -134,14 +305,33 @@ impl Supervisor {
    /// Start all configured workers asynchronously.
    /// This method should be called after build() to launch the workers.
    pub async fn start_workers(&self) -> Result<(), SupervisorError> {
        info!("Starting Hero Supervisor workers...");

        // Test Zinit connection first
        info!("Testing Zinit connection at /tmp/zinit.sock...");
        match self.zinit_client.list().await {
            Ok(services) => {
                info!("Successfully connected to Zinit. Current services: {:?}", services);
            }
            Err(e) => {
                error!("Failed to connect to Zinit: {:?}", e);
                return Err(SupervisorError::ZinitError(format!("Zinit connection failed: {}", e)));
            }
        }

        // Clean up any existing worker services first
        info!("Cleaning up existing worker services...");
        self.cleanup_existing_workers().await?;

        // Launch configured workers if builder data is available
        if let Some(builder_data) = &self.builder_data {
            info!("Launching configured workers...");
            self.launch_configured_workers(builder_data).await?;
        } else {
            warn!("No builder data available, no workers to start");
        }

        info!("All workers started successfully!");
        Ok(())
    }

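A minimal usage sketch of the new flow, assuming the async build() and start_workers() shown in the hunks above (the config path is illustrative, not from this commit):

// Hypothetical usage sketch: build the supervisor from TOML, then launch workers.
async fn run_supervisor() -> Result<(), SupervisorError> {
    let supervisor = SupervisorBuilder::from_toml("config/supervisor.toml")?
        .build()
        .await?;
    supervisor.start_workers().await?;
    Ok(())
}
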
@@ -179,7 +369,11 @@ impl Supervisor {

        for worker_name in worker_names {
            // Try to stop and delete, but don't fail if they don't exist
            let _ = self.stop_and_delete_worker(worker_name).await;
            info!("Attempting to cleanup worker: {}", worker_name);
            match self.stop_and_delete_worker(worker_name).await {
                Ok(_) => info!("Successfully cleaned up worker: {}", worker_name),
                Err(e) => debug!("Failed to cleanup worker {}: {}", worker_name, e),
            }
        }

        info!("Existing worker cleanup completed");
@@ -188,18 +382,33 @@ impl Supervisor {

    /// Stop and delete a worker service from zinit
    async fn stop_and_delete_worker(&self, worker_name: &str) -> Result<(), SupervisorError> {
        info!("Starting cleanup for worker: {}", worker_name);

        // First try to stop the worker
        info!("Attempting to stop worker: {}", worker_name);
        if let Err(e) = self.zinit_client.stop(worker_name).await {
            debug!("Worker {} was not running or failed to stop: {}", worker_name, e);
        } else {
            info!("Successfully stopped worker: {}", worker_name);
        }

        // Then try to delete the service
        if let Err(e) = self.zinit_client.delete(worker_name).await {
        // Then forget the service to stop monitoring it
        info!("Attempting to forget worker: {}", worker_name);
        if let Err(e) = self.zinit_client.forget(worker_name).await {
            info!("Worker {} was not being monitored or failed to forget: {}", worker_name, e);
        } else {
            info!("Successfully forgot worker service: {}", worker_name);
        }

        // Finally, delete the service configuration
        info!("Attempting to delete service for worker: {}", worker_name);
        if let Err(e) = self.zinit_client.delete_service(worker_name).await {
            debug!("Worker {} service did not exist or failed to delete: {}", worker_name, e);
        } else {
            info!("Successfully deleted worker service: {}", worker_name);
        }

        info!("Completed cleanup for worker: {}", worker_name);
        Ok(())
    }

@@ -211,6 +420,157 @@ impl Supervisor {
    pub fn new_job(&self) -> JobBuilder {
        JobBuilder::new(self)
    }

    /// Get WebSocket server configuration from TOML config
    pub fn get_websocket_config(&self) -> Result<WebSocketServerConfig, SupervisorError> {
        let builder_data = self.builder_data.as_ref().ok_or_else(|| {
            SupervisorError::ConfigError("No builder data available for WebSocket config".to_string())
        })?;

        builder_data.websocket_config.clone().ok_or_else(|| {
            SupervisorError::ConfigError("No WebSocket server configuration found in TOML config".to_string())
        })
    }

    /// Extract worker configurations from the supervisor's builder data
    pub fn get_worker_configs(&self) -> Result<Vec<WorkerConfig>, SupervisorError> {
        let builder_data = self.builder_data.as_ref().ok_or_else(|| {
            SupervisorError::ConfigError("No builder data available for worker configs".to_string())
        })?;

        let mut configs = Vec::new();
        let env_vars = builder_data.worker_env_vars.clone();

        if let Some(osis_path) = &builder_data.osis_worker {
            configs.push(
                WorkerConfig::new("osis_worker_1".to_string(), PathBuf::from(osis_path), ScriptType::OSIS)
                    .with_env(env_vars.clone())
            );
        }

        if let Some(sal_path) = &builder_data.sal_worker {
            configs.push(
                WorkerConfig::new("sal_worker_1".to_string(), PathBuf::from(sal_path), ScriptType::SAL)
                    .with_env(env_vars.clone())
            );
        }

        if let Some(v_path) = &builder_data.v_worker {
            configs.push(
                WorkerConfig::new("v_worker_1".to_string(), PathBuf::from(v_path), ScriptType::V)
                    .with_env(env_vars.clone())
            );
        }

        if let Some(python_path) = &builder_data.python_worker {
            configs.push(
                WorkerConfig::new("python_worker_1".to_string(), PathBuf::from(python_path), ScriptType::Python)
                    .with_env(env_vars.clone())
            );
        }

        Ok(configs)
    }

    /// Spawn a background lifecycle manager that continuously monitors and maintains worker health
    /// Returns a JoinHandle that can be used to stop the lifecycle manager
    pub fn spawn_lifecycle_manager(
        self: Arc<Self>,
        worker_configs: Vec<WorkerConfig>,
        health_check_interval: Duration,
    ) -> tokio::task::JoinHandle<Result<(), SupervisorError>> {
        let supervisor = self;

        tokio::spawn(async move {
            info!("Starting background lifecycle manager with {} workers", worker_configs.len());
            info!("Health check interval: {:?}", health_check_interval);

            // Initial worker startup
            info!("Performing initial worker startup...");
            if let Err(e) = supervisor.start_workers().await {
                error!("Failed to start workers during initialization: {}", e);
                return Err(e);
            }

            // Start the monitoring loop
            let mut interval = tokio::time::interval(health_check_interval);
            interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

            loop {
                interval.tick().await;

                info!("Running periodic worker health check...");

                // Check each worker's health and restart if needed
                for worker_config in &worker_configs {
                    if let Err(e) = supervisor.check_and_restart_worker(worker_config).await {
                        error!("Failed to check/restart worker {}: {}", worker_config.name, e);
                    }
                }

                info!("Health check cycle completed");
            }
        })
    }

    /// Check a single worker's health and restart if needed
    async fn check_and_restart_worker(&self, worker_config: &WorkerConfig) -> Result<(), SupervisorError> {
        let worker_name = &worker_config.name;

        // Get worker status
        match self.zinit_client.status(worker_name).await {
            Ok(status) => {
                let is_healthy = status.state == "running" && status.pid > 0;

                if is_healthy {
                    debug!("Worker {} is healthy (state: {}, pid: {})", worker_name, status.state, status.pid);

                    // Optionally send a ping job for deeper health check
                    if let Err(e) = self.send_ping_job(worker_config.script_type.clone()).await {
                        warn!("Ping job failed for worker {}: {}", worker_name, e);
                        // Note: We don't restart on ping failure as it might be temporary
                    }
                } else {
                    warn!("Worker {} is unhealthy (state: {}, pid: {}), restarting...",
                          worker_name, status.state, status.pid);

                    // Attempt to restart the worker
                    if let Err(e) = self.restart_worker(worker_name).await {
                        error!("Failed to restart unhealthy worker {}: {}", worker_name, e);

                        // If restart fails, try a full stop/start cycle
                        warn!("Attempting full stop/start cycle for worker: {}", worker_name);
                        if let Err(e) = self.stop_and_delete_worker(worker_name).await {
                            error!("Failed to stop worker {} during recovery: {}", worker_name, e);
                        }

                        if let Err(e) = self.start_worker(worker_config).await {
                            error!("Failed to start worker {} during recovery: {}", worker_name, e);
                            return Err(e);
                        }

                        info!("Successfully recovered worker: {}", worker_name);
                    } else {
                        info!("Successfully restarted worker: {}", worker_name);
                    }
                }
            }
            Err(e) => {
                warn!("Could not get status for worker {} (may not exist): {}", worker_name, e);

                // Worker doesn't exist, try to start it
                info!("Attempting to start missing worker: {}", worker_name);
                if let Err(e) = self.start_worker(worker_config).await {
                    error!("Failed to start missing worker {}: {}", worker_name, e);
                    return Err(e);
                }

                info!("Successfully started missing worker: {}", worker_name);
            }
        }

        Ok(())
    }

    // Internal helper to submit script details and push to work queue
    async fn create_job_using_connection(

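A sketch of how the background lifecycle manager might be driven, based on the get_worker_configs() and spawn_lifecycle_manager() signatures above; the 30-second interval and task handling are illustrative assumptions:

// Hypothetical usage sketch: run the health-check loop until the task is stopped.
async fn run_lifecycle(supervisor: Supervisor) -> Result<(), SupervisorError> {
    let supervisor = std::sync::Arc::new(supervisor);
    let worker_configs = supervisor.get_worker_configs()?;
    // Spawns a task that starts the workers and then checks them on every tick.
    let handle = supervisor
        .clone()
        .spawn_lifecycle_manager(worker_configs, std::time::Duration::from_secs(30));
    // The returned JoinHandle can be aborted to stop monitoring; awaiting it blocks
    // until the task ends (the loop only exits early on startup failure).
    handle.await.expect("lifecycle manager task panicked")
}
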
@@ -8,7 +8,7 @@ use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use zinit_client::{ZinitClient, ServiceStatus, ServiceState};
use zinit_client::{Client as ZinitClient, Status};
use hero_job::ScriptType;
use crate::{Supervisor, SupervisorError};

@@ -16,7 +16,7 @@ use crate::{Supervisor, SupervisorError};
#[derive(Debug, Clone)]
pub struct WorkerInfo {
    pub config: WorkerConfig,
    pub status: Option<ServiceStatus>,
    pub status: Option<Status>,
    pub is_running: bool,
}

@@ -90,7 +90,7 @@ impl Supervisor {
        for config in worker_configs {
            let status = self.zinit_client.status(&config.name).await.ok();
            let is_running = status.as_ref()
                .map(|s| matches!(s.state, ServiceState::Running) && s.pid > 0)
                .map(|s| s.state == "running" && s.pid > 0)
                .unwrap_or(false);

            workers.push(WorkerInfo {
@@ -117,6 +117,10 @@ impl Supervisor {
        self.zinit_client.create_service(&worker_config.name, service_config).await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to create service: {}", e)))?;

        // Monitor the service so Zinit starts managing it
        self.zinit_client.monitor(&worker_config.name).await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to monitor service: {}", e)))?;

        // Start the service
        self.zinit_client.start(&worker_config.name).await
            .map_err(|e| SupervisorError::ZinitError(format!("Failed to start worker: {}", e)))?;
@@ -168,7 +172,7 @@ impl Supervisor {
        &self,
        worker_name: &str,
        zinit_client: &ZinitClient,
    ) -> Result<ServiceStatus, SupervisorError> {
    ) -> Result<Status, SupervisorError> {
        match zinit_client.status(worker_name).await {
            Ok(status) => Ok(status),
            Err(e) => {
@@ -183,7 +187,7 @@ impl Supervisor {
        &self,
        worker_configs: &[WorkerConfig],
        zinit_client: &ZinitClient,
    ) -> Result<HashMap<String, ServiceStatus>, SupervisorError> {
    ) -> Result<HashMap<String, Status>, SupervisorError> {
        let mut status_map = HashMap::new();

        for worker in worker_configs {
@@ -200,19 +204,7 @@ impl Supervisor {
        Ok(status_map)
    }

    /// Start multiple workers
    pub async fn start_workers(
        &self,
        worker_configs: &[WorkerConfig],
    ) -> Result<(), SupervisorError> {
        info!("Starting {} workers", worker_configs.len());

        for worker in worker_configs {
            self.start_worker(worker).await?;
        }

        Ok(())
    }

    /// Stop multiple workers
    pub async fn stop_workers(
@@ -240,7 +232,7 @@ impl Supervisor {
        for worker in worker_configs {
            if worker.script_type == *script_type {
                if let Ok(status) = zinit_client.status(&worker.name).await {
                    if status.state == ServiceState::Running {
                    if status.state == "running" {
                        running_count += 1;
                    }
                }
@@ -277,26 +269,35 @@ impl Supervisor {
    }

    /// Create Zinit service configuration from worker config
    fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Value {
        let mut config = json!({
            "exec": format!("{} {}",
    fn create_service_config(&self, worker: &WorkerConfig) -> serde_json::Map<String, serde_json::Value> {
        use serde_json::{Map, Value};

        let mut config = Map::new();

        config.insert(
            "exec".to_string(),
            Value::String(format!("{} {}",
                worker.binary_path.display(),
                worker.args.join(" ")
            ),
            "oneshot": !worker.restart_on_exit,
        });
            ))
        );

        config.insert(
            "oneshot".to_string(),
            Value::Bool(!worker.restart_on_exit)
        );

        if let Some(health_check) = &worker.health_check {
            config["test"] = json!(health_check);
            config.insert("test".to_string(), Value::String(health_check.clone()));
        }

        if !worker.dependencies.is_empty() {
            config["after"] = json!(worker.dependencies);
            config.insert("after".to_string(), json!(worker.dependencies));
        }

        // Add environment variables if any
        if !worker.env.is_empty() {
            config["env"] = json!(worker.env);
            config.insert("env".to_string(), json!(worker.env));
        }

        config
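For reference, the map built above would serialize to roughly the following Zinit service definition; the exec path, test command, dependency, and env values here are invented for illustration:

// Illustrative only: approximate shape of the generated service config
// for a hypothetical worker (values are assumptions, not from this commit).
fn example_service_config() -> serde_json::Value {
    serde_json::json!({
        "exec": "/usr/local/bin/osis_worker",
        "oneshot": false,
        "test": "redis-cli ping",
        "after": ["redis"],
        "env": { "RUST_LOG": "info" }
    })
}
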
@@ -307,6 +308,8 @@ impl Supervisor {
        use hero_job::ScriptType;
        use std::path::PathBuf;

        let mut errors = Vec::new();

        // Launch OSIS worker if configured
        if let Some(binary_path) = &builder.osis_worker {
            let worker_id = "osis_worker_1";
@@ -318,7 +321,11 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching OSIS worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start OSIS worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        // Launch SAL worker if configured
@@ -332,7 +339,11 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching SAL worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start SAL worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        // Launch V worker if configured
@@ -346,7 +357,11 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching V worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start V worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        // Launch Python worker if configured
@@ -360,9 +375,21 @@ impl Supervisor {
            config.env.extend(builder.worker_env_vars.clone());

            info!("Launching Python worker: {}", worker_id);
            self.start_worker(&config).await?;
            if let Err(e) = self.start_worker(&config).await {
                let error_msg = format!("Failed to start Python worker: {}", e);
                warn!("{}", error_msg);
                errors.push(error_msg);
            }
        }

        Ok(())
        // Return result based on whether any workers started successfully
        if errors.is_empty() {
            info!("All configured workers started successfully");
            Ok(())
        } else {
            let combined_error = format!("Some workers failed to start: {}", errors.join("; "));
            warn!("{}", combined_error);
            Err(SupervisorError::ZinitError(combined_error))
        }
    }
}