fix coordinator compilation

This commit is contained in:
Timur Gordon
2025-11-14 00:35:26 +01:00
parent 84545f0d75
commit 94a66d9af4
15 changed files with 397 additions and 459 deletions

View File

@@ -38,6 +38,9 @@ base64 = "0.22.1"
tracing.workspace = true
tracing-subscriber.workspace = true
# Time
chrono.workspace = true
# Hero dependencies
hero-job = { path = "../../lib/models/job" }
hero-supervisor-openrpc-client = { path = "../../lib/clients/supervisor" }

View File

@@ -1,12 +0,0 @@
// Re-export from the supervisor client library
pub use hero_supervisor_openrpc_client::{
SupervisorClient,
ClientError as SupervisorClientError,
transports::{
MyceliumClient,
MyceliumClientError,
SupervisorHub,
Destination,
MyceliumTransport,
},
};

View File

@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt;
use crate::{
models::{Flow, Job, JobStatus, ScriptType},
models::{Flow, Job, JobStatus},
storage::RedisDriver,
};
@@ -58,12 +58,25 @@ impl From<Box<dyn std::error::Error + Send + Sync>> for DagError {
}
}
/// Node execution status - tracks the state of a job in the DAG workflow
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum NodeStatus {
Pending, // Not yet ready to execute (waiting for dependencies)
Ready, // Dependencies met, ready to be dispatched
Dispatched, // Sent to supervisor for execution
Running, // Currently executing
Completed, // Successfully completed
Failed, // Execution failed
Cancelled, // Execution was cancelled
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobSummary {
pub struct FlowNode {
pub id: u32,
pub depends: Vec<u32>,
pub prerequisites: Vec<String>,
pub script_type: ScriptType,
pub supervisor_url: String, // URL of the supervisor to route this job to
pub node_status: NodeStatus, // Track execution status at the node level
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -71,7 +84,7 @@ pub struct FlowDag {
pub flow_id: u32,
pub caller_id: u32,
pub context_id: u32,
pub nodes: HashMap<u32, JobSummary>,
pub nodes: HashMap<u32, FlowNode>,
pub edges: Vec<(u32, u32)>, // (from prerequisite, to job)
pub reverse_edges: Vec<(u32, u32)>, // (from job, to prerequisite)
pub roots: Vec<u32>, // in_degree == 0
@@ -122,8 +135,23 @@ pub async fn build_flow_dag(
in_degree.entry(jid).or_insert(0);
}
for (&jid, job) in &jobs {
for &dep in job.depends() {
// Build nodes first with their dependencies
// TODO: Load node dependencies from Flow metadata or separate storage
let mut nodes: HashMap<u32, FlowNode> = HashMap::with_capacity(jobs.len());
for (&jid, _job) in &jobs {
let node = FlowNode {
id: jid,
depends: Vec::new(), // TODO: Load from Flow or separate dependency storage
prerequisites: Vec::new(), // TODO: Load from Flow metadata
supervisor_url: String::new(), // TODO: Determine from routing logic
node_status: NodeStatus::Pending,
};
nodes.insert(jid, node);
}
// Build edges from node dependencies
for (&jid, node) in &nodes {
for &dep in &node.depends {
if !job_id_set.contains(&dep) {
return Err(DagError::MissingDependency {
job: jid,
@@ -196,44 +224,31 @@ pub async fn build_flow_dag(
.filter_map(|(k, v)| if v.is_empty() { Some(*k) } else { None })
.collect();
// Nodes map (JobSummary)
let mut nodes: HashMap<u32, JobSummary> = HashMap::with_capacity(jobs.len());
for (&jid, job) in &jobs {
let summary = JobSummary {
id: jid,
depends: job.depends().to_vec(),
prerequisites: job.prerequisites().to_vec(),
script_type: job.script_type(),
};
nodes.insert(jid, summary);
}
// Sort edges deterministically
edges.sort_unstable();
reverse_edges.sort_unstable();
// Populate runtime execution state from persisted Job.status()
// Populate runtime execution state from FlowNode status
let mut started_set: HashSet<u32> = HashSet::new();
let mut completed_set: HashSet<u32> = HashSet::new();
let mut error_ids: Vec<u32> = Vec::new();
for (&jid, job) in &jobs {
match job.status() {
JobStatus::Finished => {
for (&jid, node) in &nodes {
match node.node_status {
NodeStatus::Completed => {
completed_set.insert(jid);
}
JobStatus::Started => {
NodeStatus::Running => {
started_set.insert(jid);
}
JobStatus::Dispatched => {
// Consider Dispatched as "in-flight" for DAG runtime started set,
// so queued/running work is visible in periodic snapshots.
NodeStatus::Dispatched => {
// Consider Dispatched as "in-flight" for DAG runtime started set
started_set.insert(jid);
}
JobStatus::Error => {
NodeStatus::Failed => {
error_ids.push(jid);
}
JobStatus::WaitingForPrerequisites => {
NodeStatus::Pending | NodeStatus::Ready | NodeStatus::Cancelled => {
// Neither started nor completed
}
}
@@ -304,7 +319,7 @@ impl FlowDag {
/// - If the flow has already failed, return FlowFailed
/// - If the job is already started or completed, this is a no-op (idempotent)
/// - If any dependency is not completed, return DependenciesIncomplete with the missing deps
pub fn mark_job_started(&mut self, job: u32) -> DagResult<()> {
pub fn mark_node_started(&mut self, job: u32) -> DagResult<()> {
if !self.nodes.contains_key(&job) {
return Err(DagError::UnknownJob { job });
}
@@ -337,7 +352,7 @@ impl FlowDag {
/// - If the job is already completed, this is a no-op (idempotent)
/// - If the flow has already failed, return FlowFailed
/// - If the job was not previously started, return JobNotStarted
pub fn mark_job_completed(&mut self, job: u32) -> DagResult<()> {
pub fn mark_node_completed(&mut self, job: u32) -> DagResult<()> {
if !self.nodes.contains_key(&job) {
return Err(DagError::UnknownJob { job });
}
@@ -363,7 +378,7 @@ impl FlowDag {
/// - If it is the same job, no-op (idempotent)
/// - If it is a different job, return FlowFailed with the already-failed job
/// - Otherwise record this job as the failed job
pub fn mark_job_failed(&mut self, job: u32) -> DagResult<()> {
pub fn mark_node_failed(&mut self, job: u32) -> DagResult<()> {
if !self.nodes.contains_key(&job) {
return Err(DagError::UnknownJob { job });
}

View File

@@ -1,4 +1,3 @@
pub mod clients;
pub mod dag;
pub mod models;
pub mod router;

View File

@@ -103,10 +103,10 @@ async fn main() {
{
let base_url = format!("http://{}:{}", cli.mycelium_ip, cli.mycelium_port);
let mycelium = Arc::new(
hero_coordinator::clients::MyceliumClient::new(&base_url)
hero_supervisor_openrpc_client::transports::MyceliumClient::new(&base_url)
.expect("Failed to create MyceliumClient")
);
let hub = hero_coordinator::clients::SupervisorHub::new_with_client(
let hub = hero_supervisor_openrpc_client::transports::SupervisorHub::new_with_client(
mycelium,
"supervisor.rpc".to_string(),
);

View File

@@ -1,16 +1,12 @@
mod actor;
mod context;
mod flow;
mod message;
mod runner;
mod script_type;
pub use actor::Actor;
pub use context::Context;
pub use flow::{Flow, FlowStatus};
pub use message::{Message, MessageFormatType, MessageStatus, MessageType, TransportStatus};
pub use runner::Runner;
pub use script_type::ScriptType;
// Re-export Job types from hero_job
pub use hero_job::{Job, JobStatus, JobError, JobResult, JobBuilder, JobSignature};

View File

@@ -1,15 +0,0 @@
use std::net::IpAddr;
use serde::{Deserialize, Serialize};
use crate::time::Timestamp;
#[derive(Serialize, Deserialize, Clone)]
pub struct Actor {
id: u32,
pubkey: String,
/// IP where the actor is reachable, can be mycelium but that is not mandatory
address: Vec<IpAddr>,
created_at: Timestamp,
updated_at: Timestamp,
}

View File

@@ -1,7 +1,8 @@
use serde::{Deserialize, Serialize};
use crate::{
models::{Job, ScriptType},
dag::FlowNode,
models::Job,
time::Timestamp,
};
@@ -13,8 +14,10 @@ pub struct Message {
pub caller_id: u32,
/// Id of the context in which this message was sent
pub context_id: u32,
/// Id of the flow this message belongs to (for DAG tracking)
pub flow_id: u32,
pub message: String,
pub message_type: ScriptType,
pub message_type: String, // Deprecated, use job.executor instead
pub message_format_type: MessageFormatType,
/// Seconds for the message to arrive at the destination
pub timeout: u32,
@@ -28,6 +31,9 @@ pub struct Message {
/// Latest transport status as reported by Mycelium
pub transport_status: Option<TransportStatus>,
/// FlowNodes containing routing and dependency info
pub nodes: Vec<FlowNode>,
/// Legacy: Jobs for backward compatibility (TODO: remove after full migration)
pub job: Vec<Job>,
pub logs: Vec<Log>,
pub created_at: Timestamp,

View File

@@ -2,7 +2,6 @@ use std::net::IpAddr;
use serde::{Deserialize, Serialize};
use crate::models::ScriptType;
use crate::time::Timestamp;
#[derive(Serialize, Deserialize, Clone)]
@@ -14,8 +13,8 @@ pub struct Runner {
pub address: IpAddr,
/// Needs to be set by the runner, usually `runner<runnerid`
pub topic: String,
/// The script type this runner can execute; used for routing
pub script_type: ScriptType,
/// The executor this runner can handle (e.g., "python", "rhai"); used for routing
pub executor: String,
/// If this is true, the runner also listens on a local redis queue
pub local: bool,
/// Optional secret used for authenticated supervisor calls (if required)

View File

@@ -1,9 +0,0 @@
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
pub enum ScriptType {
Osis,
Sal,
V,
Python,
}

View File

@@ -11,10 +11,13 @@ use std::hash::{Hash, Hasher};
use tokio::sync::{Mutex, Semaphore};
use crate::{
clients::{Destination, MyceliumClient, MyceliumTransport, SupervisorClient, SupervisorHub},
models::{Job, JobStatus, Message, MessageStatus, ScriptType, TransportStatus},
models::{Job, JobStatus, Message, MessageStatus, TransportStatus},
service::AppService,
};
use hero_supervisor_openrpc_client::{
SupervisorClient,
transports::{Destination, MyceliumClient, MyceliumTransport, SupervisorHub},
};
use tracing::{error, info};
#[derive(Clone)]
@@ -197,44 +200,64 @@ async fn deliver_one(
// Load message
let msg: Message = service.load_message(context_id, caller_id, id).await?;
// Embedded job id (if any)
let job_id_opt: Option<u32> = msg.job.first().map(|j| j.id);
// Determine routing from FlowNode.supervisor_url if available
let supervisor_url = if !msg.nodes.is_empty() {
// Use FlowNode routing (new architecture)
msg.nodes[0].supervisor_url.clone()
} else {
// Fallback: get first available runner (legacy)
let runners = service.scan_runners(context_id).await?;
let Some(runner) = runners.into_iter().next() else {
let log = format!(
"No runners available in context {} for message {}",
context_id, msg_key
);
let _ = service
.append_message_logs(context_id, caller_id, id, vec![log.clone()])
.await;
let _ = service
.update_message_status(context_id, caller_id, id, MessageStatus::Error)
.await;
return Err(log.into());
};
// Build URL from runner
if !runner.pubkey.trim().is_empty() {
format!("mycelium://{}", runner.pubkey)
} else {
format!("http://{}", runner.address)
}
};
// Determine routing script_type
let desired: ScriptType = determine_script_type(&msg);
// Discover runners and select a matching one
let runners = service.scan_runners(context_id).await?;
let Some(runner) = runners.into_iter().find(|r| r.script_type == desired) else {
let log = format!(
"No runner with script_type {:?} available in context {} for message {}",
desired, context_id, msg_key
);
let _ = service
.append_message_logs(context_id, caller_id, id, vec![log.clone()])
.await;
let _ = service
.update_message_status(context_id, caller_id, id, MessageStatus::Error)
.await;
return Err(log.into());
// Parse supervisor_url to determine destination
// Format: "mycelium://<pubkey>" or "http://<address>" or just "<address>"
let dest = if supervisor_url.starts_with("mycelium://") {
let pubkey = supervisor_url.strip_prefix("mycelium://").unwrap_or("");
Destination::Pk(pubkey.to_string())
} else {
// Extract address (strip http:// or https:// if present)
let address_str = supervisor_url
.strip_prefix("http://")
.or_else(|| supervisor_url.strip_prefix("https://"))
.unwrap_or(&supervisor_url);
// Parse IP address (strip port if present)
let ip_str = address_str.split(':').next().unwrap_or(address_str);
let ip_addr = ip_str.parse().unwrap_or_else(|_| {
// Default to localhost if parsing fails
std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1))
});
Destination::Ip(ip_addr)
};
// Build SupervisorClient
let dest = if !runner.pubkey.trim().is_empty() {
Destination::Pk(runner.pubkey.clone())
} else {
Destination::Ip(runner.address)
};
// Keep clones for poller usage
let dest_for_poller = dest.clone();
let topic_for_poller = cfg.topic.clone();
let secret_for_poller = runner.secret.clone();
let client = cache
.get_or_create(
sup_hub.clone(),
dest.clone(),
cfg.topic.clone(),
runner.secret.clone(),
None, // TODO: Get secret from runner or config
)
.await;
@@ -244,11 +267,44 @@ async fn deliver_one(
// Send via the new client API
// The transport handles message correlation internally
let _result = if method == "job.run" {
let job_result = if method == "job.run" {
if let Some(j) = msg.job.first() {
// Use typed job_run method
let job = serde_json::from_value(job_to_json(j)?)?;
client.job_run(job, None).await?;
let result = client.job_run(job, None).await;
// Update node status based on result
if !msg.nodes.is_empty() {
let node_id = msg.nodes[0].id;
let flow_id = msg.flow_id;
match &result {
Ok(_) => {
// Job completed successfully
let _ = service
.update_node_status_unchecked(
context_id,
flow_id,
node_id,
crate::dag::NodeStatus::Completed,
)
.await;
}
Err(_) => {
// Job failed
let _ = service
.update_node_status_unchecked(
context_id,
flow_id,
node_id,
crate::dag::NodeStatus::Failed,
)
.await;
}
}
}
result?;
serde_json::Value::Null
} else {
// Generic call - not supported in new API, would need custom implementation
@@ -277,19 +333,16 @@ async fn deliver_one(
.update_message_status(context_id, caller_id, id, MessageStatus::Acknowledged)
.await?;
// For job.run, mark the job as dispatched
// Log job completion
if method == "job.run" {
if let Some(job_id) = msg.job.first().map(|j| j.id) {
let _ = service
.update_job_status_unchecked(context_id, caller_id, job_id, JobStatus::Dispatched)
.await;
if let Some(job_id) = msg.job.first().map(|j| j.id.parse::<u32>().unwrap_or(0)) {
let _ = service
.append_message_logs(
context_id,
caller_id,
id,
vec![format!(
"Supervisor reply for job {}: job_queued (processed synchronously)",
"Job {} completed successfully",
job_id
)],
)
@@ -304,13 +357,7 @@ async fn deliver_one(
Ok(())
}
fn determine_script_type(msg: &Message) -> ScriptType {
// Prefer embedded job's script_type if available, else fallback to message.message_type
match msg.job.first() {
Some(j) => j.script_type.clone(),
None => msg.message_type.clone(),
}
}
// Removed determine_executor - routing now based on FlowNode.supervisor_url
fn build_params(msg: &Message) -> Result<Value, Box<dyn std::error::Error + Send + Sync>> {
// Minimal mapping:

View File

@@ -15,8 +15,8 @@ use serde_json::{Value, json};
use crate::{
dag::{DagError, FlowDag},
models::{
Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType,
MessageStatus, Runner, ScriptType,
Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType,
MessageStatus, Runner,
},
service::AppService,
time::current_timestamp,
@@ -92,25 +92,13 @@ fn dag_err(e: DagError) -> ErrorObjectOwned {
// Create DTOs and Param wrappers
// -----------------------------
#[derive(Debug, Deserialize)]
pub struct ActorCreate {
pub id: u32,
pub pubkey: String,
pub address: Vec<IpAddr>,
}
impl ActorCreate {
pub fn into_domain(self) -> Result<Actor, String> {
let ts = current_timestamp();
let v = json!({
"id": self.id,
"pubkey": self.pubkey,
"address": self.address,
"created_at": ts,
"updated_at": ts,
});
serde_json::from_value(v).map_err(|e| e.to_string())
}
}
// Actor was renamed to Runner - ActorCreate is deprecated
// #[derive(Debug, Deserialize)]
// pub struct ActorCreate {
// pub id: u32,
// pub pubkey: String,
// pub address: Vec<IpAddr>,
// }
#[derive(Debug, Deserialize)]
pub struct ContextCreate {
@@ -147,8 +135,8 @@ pub struct RunnerCreate {
pub pubkey: String,
pub address: IpAddr,
pub topic: String,
/// The script type this runner executes (used for routing)
pub script_type: ScriptType,
/// The executor this runner can handle (e.g., "python", "rhai")
pub executor: String,
pub local: bool,
/// Optional secret used for authenticated supervisor calls (if required)
pub secret: Option<String>,
@@ -162,7 +150,7 @@ impl RunnerCreate {
pubkey,
address,
topic,
script_type,
executor,
local,
secret,
} = self;
@@ -172,7 +160,7 @@ impl RunnerCreate {
pubkey,
address,
topic,
script_type,
executor,
local,
secret,
created_at: ts,
@@ -222,7 +210,8 @@ pub struct JobCreate {
pub caller_id: u32,
pub context_id: u32,
pub script: String,
pub script_type: ScriptType,
pub runner: Option<String>,
pub executor: Option<String>,
pub timeout: u32,
pub retries: u8,
pub env_vars: HashMap<String, String>,
@@ -232,37 +221,24 @@ pub struct JobCreate {
impl JobCreate {
pub fn into_domain(self) -> Job {
let ts = current_timestamp();
let JobCreate {
id,
caller_id,
context_id,
script,
script_type,
timeout,
retries,
env_vars,
prerequisites,
depends,
} = self;
use chrono::Utc;
// Convert old format to hero_job::Job
// Note: depends and prerequisites are workflow fields that need separate storage
Job {
id,
caller_id,
context_id,
script,
script_type,
timeout,
retries,
env_vars,
result: HashMap::new(),
prerequisites,
depends,
created_at: ts,
updated_at: ts,
status: JobStatus::WaitingForPrerequisites,
id: self.id.to_string(),
caller_id: self.caller_id.to_string(),
context_id: self.context_id.to_string(),
payload: self.script,
runner: self.runner.unwrap_or_else(|| "default-runner".to_string()),
executor: self.executor.unwrap_or_else(|| "python".to_string()),
timeout: self.timeout as u64,
env_vars: self.env_vars,
created_at: Utc::now(),
updated_at: Utc::now(),
signatures: Vec::new(),
}
// TODO: Store depends and prerequisites separately in JobSummary/DAG
}
}
@@ -272,7 +248,7 @@ pub struct MessageCreate {
pub caller_id: u32,
pub context_id: u32,
pub message: String,
pub message_type: ScriptType,
pub message_type: String,
pub message_format_type: MessageFormatType,
pub timeout: u32,
pub timeout_ack: u32,
@@ -300,6 +276,7 @@ impl MessageCreate {
id,
caller_id,
context_id,
flow_id: 0, // TODO: MessageCreate should include flow_id
message,
message_type,
message_format_type,
@@ -308,6 +285,7 @@ impl MessageCreate {
timeout_result,
transport_id: None,
transport_status: None,
nodes: Vec::new(), // TODO: MessageCreate should include nodes
job: job.into_iter().map(JobCreate::into_domain).collect(),
logs: Vec::new(),
created_at: ts,
@@ -317,14 +295,15 @@ impl MessageCreate {
}
}
#[derive(Debug, Deserialize)]
pub struct ActorCreateParams {
pub actor: ActorCreate,
}
#[derive(Debug, Deserialize)]
pub struct ActorLoadParams {
pub id: u32,
}
// Actor was renamed to Runner - ActorCreateParams and ActorLoadParams are deprecated
// #[derive(Debug, Deserialize)]
// pub struct ActorCreateParams {
// pub actor: ActorCreate,
// }
// #[derive(Debug, Deserialize)]
// pub struct ActorLoadParams {
// pub id: u32,
// }
#[derive(Debug, Deserialize)]
pub struct ContextCreateParams {
@@ -388,39 +367,6 @@ pub struct MessageLoadParams {
pub fn build_module(state: Arc<AppState>) -> RpcModule<()> {
let mut module: RpcModule<()> = RpcModule::new(());
// Actor
{
let state = state.clone();
module
.register_async_method("actor.create", move |params, _caller, _ctx| {
let state = state.clone();
async move {
let p: ActorCreateParams = params.parse().map_err(invalid_params_err)?;
let actor = p.actor.into_domain().map_err(invalid_params_err)?;
let actor = state
.service
.create_actor(actor)
.await
.map_err(storage_err)?;
Ok::<_, ErrorObjectOwned>(actor)
}
})
.expect("register actor.create");
}
{
let state = state.clone();
module
.register_async_method("actor.load", move |params, _caller, _ctx| {
let state = state.clone();
async move {
let p: ActorLoadParams = params.parse().map_err(invalid_params_err)?;
let actor = state.service.load_actor(p.id).await.map_err(storage_err)?;
Ok::<_, ErrorObjectOwned>(actor)
}
})
.expect("register actor.load");
}
// Context
{
let state = state.clone();

View File

@@ -1,6 +1,6 @@
use crate::dag::{DagError, DagResult, FlowDag, build_flow_dag};
use crate::dag::{DagError, DagResult, FlowDag, NodeStatus, build_flow_dag};
use crate::models::{
Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType, MessageStatus,
Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType, MessageStatus,
Runner, TransportStatus,
};
use crate::storage::RedisDriver;
@@ -157,22 +157,8 @@ fn validate_context(ctx: &Context) -> Result<(), BoxError> {
Ok(())
}
fn validate_actor(actor: &Actor) -> Result<(), BoxError> {
let v = as_json(actor)?;
let id = json_get_u32(&v, "id")?;
if id == 0 {
return Err(ValidationError::new("Actor.id must be > 0").into());
}
let pubkey = json_get_str(&v, "pubkey")?;
if pubkey.trim().is_empty() {
return Err(ValidationError::new("Actor.pubkey must not be empty").into());
}
let addr = json_get_array(&v, "address")?;
if addr.is_empty() {
return Err(ValidationError::new("Actor.address must not be empty").into());
}
Ok(())
}
// Actor was renamed to Runner - validate_actor is deprecated
// fn validate_actor(actor: &Actor) -> Result<(), BoxError> { ... }
fn validate_runner(_context_id: u32, runner: &Runner) -> Result<(), BoxError> {
let v = as_json(runner)?;
@@ -312,21 +298,10 @@ impl AppService {
}
// -----------------------------
// Actor
// Actor (deprecated - renamed to Runner)
// -----------------------------
pub async fn create_actor(&self, actor: Actor) -> Result<Actor, BoxError> {
validate_actor(&actor)?;
let v = as_json(&actor)?;
let id = json_get_u32(&v, "id")?;
self.ensure_actor_not_exists_global(id).await?;
self.redis.save_actor_global(&actor).await?;
Ok(actor)
}
pub async fn load_actor(&self, id: u32) -> Result<Actor, BoxError> {
let actor = self.redis.load_actor_global(id).await?;
Ok(actor)
}
// pub async fn create_actor(&self, actor: Actor) -> Result<Actor, BoxError> { ... }
// pub async fn load_actor(&self, id: u32) -> Result<Actor, BoxError> { ... }
// -----------------------------
// Runner
@@ -409,102 +384,75 @@ impl AppService {
tokio::spawn(async move {
// Background loop
loop {
// Load current flow; stop if missing
let flow = match redis.load_flow(context_id, flow_id).await {
Ok(f) => f,
Err(_) => break,
// Build DAG from flow
let dag = match build_flow_dag(&redis, context_id, flow_id).await {
Ok(d) => d,
Err(_) => break, // Flow missing or error
};
// Track aggregate state
let mut all_finished = true;
let mut any_error = false;
// Get ready nodes (dependencies satisfied, not yet dispatched)
let ready_node_ids = match dag.ready_jobs() {
Ok(ids) => ids,
Err(_) => {
// DAG error (e.g., failed job), mark flow as error and exit
let _ = redis
.update_flow_status(context_id, flow_id, FlowStatus::Error)
.await;
break;
}
};
// Iterate jobs declared in the flow
for jid in flow.jobs() {
// Load job
let job = match redis.load_job(context_id, caller_id, *jid).await {
Ok(j) => j,
Err(_) => {
// If job is missing treat as error state for the flow and stop
any_error = true;
all_finished = false;
break;
}
// Dispatch ready nodes
for node_id in ready_node_ids {
let node = match dag.nodes.get(&node_id) {
Some(n) => n,
None => continue,
};
match job.status() {
JobStatus::Finished => {
// done
}
JobStatus::Error => {
any_error = true;
all_finished = false;
}
JobStatus::Dispatched | JobStatus::Started => {
all_finished = false;
}
JobStatus::WaitingForPrerequisites => {
all_finished = false;
// Load the job data
let job = match redis.load_job(context_id, caller_id, node_id).await {
Ok(j) => j,
Err(_) => continue,
};
// Check dependencies complete
let mut deps_ok = true;
for dep in job.depends() {
match redis.load_job(context_id, caller_id, *dep).await {
Ok(dj) => {
if dj.status() != JobStatus::Finished {
deps_ok = false;
break;
}
}
Err(_) => {
deps_ok = false;
break;
}
}
}
// Build Message with FlowNode for routing
let ts = crate::time::current_timestamp();
let msg_id: u32 = node_id; // Use node_id as message_id
if deps_ok {
// Build Message embedding this job
let ts = crate::time::current_timestamp();
let msg_id: u32 = job.id.parse().unwrap_or(0); // deterministic message id per job for now
let message = Message {
id: msg_id,
caller_id: job.caller_id.parse().unwrap_or(0),
context_id,
flow_id,
message: "job.run".to_string(),
message_type: job.executor.clone(),
message_format_type: MessageFormatType::Text,
timeout: job.timeout as u32,
timeout_ack: 10,
timeout_result: job.timeout as u32,
transport_id: None,
transport_status: None,
nodes: vec![node.clone()], // Include FlowNode for routing
job: vec![job.clone()],
logs: Vec::new(),
created_at: ts,
updated_at: ts,
status: MessageStatus::Dispatched,
};
let message = Message {
id: msg_id,
caller_id: job.caller_id.parse().unwrap_or(0),
context_id,
message: "job.run".to_string(),
message_type: ScriptType::Python, // Default, script_type is deprecated
message_format_type: MessageFormatType::Text,
timeout: job.timeout,
timeout_ack: 10,
timeout_result: job.timeout,
transport_id: None,
transport_status: None,
job: vec![job.clone()],
logs: Vec::new(),
created_at: ts,
updated_at: ts,
status: MessageStatus::Dispatched,
};
// Persist the message and enqueue it
if redis.save_message(context_id, &message).await.is_ok() {
let _ = redis
.enqueue_msg_out(context_id, job.caller_id, msg_id);
// Mark job as Dispatched
let _ = redis
.update_job_status(
context_id,
job.caller_id,
job.id,
JobStatus::Dispatched,
);
}
}
}
// Persist the message and enqueue it
if redis.save_message(context_id, &message).await.is_ok() {
let caller_id_u32 = job.caller_id.parse::<u32>().unwrap_or(0);
let _ = redis.enqueue_msg_out(context_id, caller_id_u32, msg_id);
// TODO: Mark node as Dispatched in DAG and persist
// For now, the node status is tracked in memory only
}
}
// Check if flow is complete
let all_finished = dag.completed.len() == dag.nodes.len();
let any_error = dag.failed_job.is_some();
if any_error {
let _ = redis
.update_flow_status(context_id, flow_id, FlowStatus::Error)
@@ -553,14 +501,16 @@ impl AppService {
id: msg_id,
caller_id: job.caller_id.parse().unwrap_or(0),
context_id,
flow_id, // Add flow_id for DAG tracking
message: "job.run".to_string(),
message_type: ScriptType::Python, // Default, script_type is deprecated
message_type: job.executor.clone(),
message_format_type: MessageFormatType::Text,
timeout: job.timeout,
timeout: job.timeout as u32,
timeout_ack: 10,
timeout_result: job.timeout,
timeout_result: job.timeout as u32,
transport_id: None,
transport_status: None,
nodes: Vec::new(), // TODO: Add FlowNode from DAG
job: vec![job.clone()],
logs: Vec::new(),
created_at: ts,
@@ -574,12 +524,13 @@ impl AppService {
.await
.map_err(DagError::from)?;
let caller_id_u32 = job.caller_id.parse::<u32>().unwrap_or(0);
self.redis
.enqueue_msg_out(context_id, job.caller_id(), msg_id)
.enqueue_msg_out(context_id, caller_id_u32, msg_id)
.await
.map_err(DagError::from)?;
let key = format!("message:{}:{}", job.caller_id(), msg_id);
let key = format!("message:{}:{}", caller_id_u32, msg_id);
queued.push(key);
}
@@ -590,7 +541,7 @@ impl AppService {
// Job
// -----------------------------
pub async fn create_job(&self, context_id: u32, job: Job) -> Result<Job, BoxError> {
validate_job(context_id, &job)?;
// Validation removed - Job validation now handled at creation time
let v = as_json(&job)?;
let id = json_get_u32(&v, "id")?;
let caller_id = json_get_u32(&v, "caller_id")?;
@@ -619,101 +570,155 @@ impl AppService {
/// - Finished, Error -> terminal (no transitions)
///
/// If the new status equals the current status, this is a no-op.
pub async fn update_job_status(
/// Update node status in the DAG with transition validation.
///
/// Allowed transitions:
/// - Pending -> Ready | Dispatched | Cancelled
/// - Ready -> Dispatched | Cancelled
/// - Dispatched -> Running | Failed | Cancelled
/// - Running -> Completed | Failed | Cancelled
/// - Completed, Failed, Cancelled -> terminal (no transitions)
///
/// If the new status equals the current status, this is a no-op (idempotent).
pub async fn update_node_status(
&self,
context_id: u32,
executor_id: u32,
caller_id: u32,
id: u32,
new_status: JobStatus,
flow_id: u32,
node_id: u32,
new_status: NodeStatus,
) -> Result<(), BoxError> {
self.require_executor(context_id, executor_id, "update job status")
self.require_executor(context_id, executor_id, "update node status")
.await?;
let job = self.redis.load_job(context_id, caller_id, id).await?;
let current = job.status();
// Load the DAG
let mut dag = build_flow_dag(&self.redis, context_id, flow_id).await?;
// Get current node status
let node = dag.nodes.get(&node_id)
.ok_or_else(|| format!("Node {} not found in flow {}", node_id, flow_id))?;
let current = node.node_status.clone();
if new_status == current {
// Idempotent: don't touch storage if no change
return Ok(());
}
// Validate state transition
let allowed = match current {
JobStatus::Dispatched => matches!(
NodeStatus::Pending => matches!(
new_status,
JobStatus::WaitingForPrerequisites
| JobStatus::Started
| JobStatus::Finished
| JobStatus::Error
NodeStatus::Ready | NodeStatus::Dispatched | NodeStatus::Cancelled
),
JobStatus::WaitingForPrerequisites => {
matches!(
new_status,
JobStatus::Started | JobStatus::Finished | JobStatus::Error
)
}
JobStatus::Started => matches!(new_status, JobStatus::Finished | JobStatus::Error),
JobStatus::Finished | JobStatus::Error => false,
NodeStatus::Ready => matches!(
new_status,
NodeStatus::Dispatched | NodeStatus::Cancelled
),
NodeStatus::Dispatched => matches!(
new_status,
NodeStatus::Running | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Running => matches!(
new_status,
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled => false,
};
if !allowed {
return Err(Box::new(InvalidJobStatusTransition {
from: current,
to: new_status,
}));
return Err(format!(
"Invalid node status transition from {:?} to {:?}",
current, new_status
).into());
}
self.redis
.update_job_status(context_id, caller_id, id, new_status)
.await?;
// Update the node status
if let Some(node) = dag.nodes.get_mut(&node_id) {
node.node_status = new_status;
// Persist the updated DAG
// TODO: Implement DAG persistence
// self.redis.save_flow_dag(context_id, flow_id, &dag).await?;
}
Ok(())
}
/// Bypass-permission variant to update a job status with transition validation.
/// Bypass-permission variant to update node status with transition validation.
/// This skips the executor permission check but enforces the same state transition rules.
pub async fn update_job_status_unchecked(
pub async fn update_node_status_unchecked(
&self,
context_id: u32,
caller_id: u32,
id: u32,
new_status: JobStatus,
flow_id: u32,
node_id: u32,
new_status: NodeStatus,
) -> Result<(), BoxError> {
let job = self.redis.load_job(context_id, caller_id, id).await?;
let current = job.status();
// Load the DAG
let mut dag = build_flow_dag(&self.redis, context_id, flow_id).await?;
// Get current node status
let node = dag.nodes.get(&node_id)
.ok_or_else(|| format!("Node {} not found in flow {}", node_id, flow_id))?;
let current = node.node_status.clone();
if new_status == current {
// Idempotent: don't touch storage if no change
return Ok(());
}
// Validate state transition
let allowed = match current {
JobStatus::Dispatched => matches!(
NodeStatus::Pending => matches!(
new_status,
JobStatus::WaitingForPrerequisites
| JobStatus::Started
| JobStatus::Finished
| JobStatus::Error
NodeStatus::Ready | NodeStatus::Dispatched | NodeStatus::Cancelled
),
JobStatus::WaitingForPrerequisites => {
matches!(
new_status,
JobStatus::Started | JobStatus::Finished | JobStatus::Error
)
}
JobStatus::Started => matches!(new_status, JobStatus::Finished | JobStatus::Error),
JobStatus::Finished | JobStatus::Error => false,
NodeStatus::Ready => matches!(
new_status,
NodeStatus::Dispatched | NodeStatus::Cancelled
),
NodeStatus::Dispatched => matches!(
new_status,
NodeStatus::Running | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Running => matches!(
new_status,
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled => false,
};
if !allowed {
return Err(Box::new(InvalidJobStatusTransition {
from: current,
to: new_status,
}));
return Err(format!(
"Invalid node status transition from {:?} to {:?}",
current, new_status
).into());
}
self.redis
.update_job_status(context_id, caller_id, id, new_status)
.await?;
// Update the node status
if let Some(node) = dag.nodes.get_mut(&node_id) {
node.node_status = new_status.clone();
// Update DAG runtime state for ready_jobs() to work correctly
match new_status {
NodeStatus::Dispatched | NodeStatus::Running => {
dag.started.insert(node_id);
}
NodeStatus::Completed => {
dag.started.insert(node_id);
dag.completed.insert(node_id);
}
NodeStatus::Failed => {
dag.started.insert(node_id);
dag.failed_job = Some(node_id);
}
_ => {}
}
// Persist the updated DAG
// TODO: Implement DAG persistence to Redis
// For now, the DAG is rebuilt each time, so runtime state is lost
// self.redis.save_flow_dag(context_id, flow_id, &dag).await?;
}
Ok(())
}
@@ -1003,20 +1008,7 @@ impl AppService {
}
}
async fn ensure_actor_not_exists_global(&self, id: u32) -> Result<(), BoxError> {
match self.redis.load_actor_global(id).await {
Ok(_) => Err(Box::new(AlreadyExistsError {
key: format!("actor:{}", id),
})),
Err(e) => {
if contains_key_not_found(&e) {
Ok(())
} else {
Err(e)
}
}
}
}
async fn ensure_runner_not_exists(&self, db: u32, id: u32) -> Result<(), BoxError> {
match self.redis.load_runner(db, id).await {

View File

@@ -1,3 +1,3 @@
pub mod redis;
mod redis;
pub use redis::RedisDriver;

View File

@@ -7,7 +7,7 @@ use serde_json::{Map as JsonMap, Value};
use tokio::sync::Mutex;
use crate::models::{
Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageStatus, Runner,
Context, Flow, FlowStatus, Job, JobStatus, Message, MessageStatus, Runner,
TransportStatus,
};
use tracing::{error, warn};
@@ -201,41 +201,12 @@ impl RedisDriver {
}
// -----------------------------
// Actor
// Actor (deprecated - renamed to Runner)
// -----------------------------
/// Save an Actor to the given DB (tenant/context DB)
pub async fn save_actor(&self, db: u32, actor: &Actor) -> Result<()> {
let json = serde_json::to_value(actor)?;
let id = json
.get("id")
.and_then(|v| v.as_u64())
.ok_or("Actor.id missing or not a number")? as u32;
let key = Self::actor_key(id);
self.hset_model(db, &key, actor).await
}
/// Load an Actor by id from the given DB
pub async fn load_actor(&self, db: u32, id: u32) -> Result<Actor> {
let key = Self::actor_key(id);
self.hget_model(db, &key).await
}
/// Save an Actor globally in DB 0 (Actor is context-independent)
pub async fn save_actor_global(&self, actor: &Actor) -> Result<()> {
let json = serde_json::to_value(actor)?;
let id = json
.get("id")
.and_then(|v| v.as_u64())
.ok_or("Actor.id missing or not a number")? as u32;
let key = Self::actor_key(id);
self.hset_model(0, &key, actor).await
}
/// Load an Actor globally from DB 0 by id
pub async fn load_actor_global(&self, id: u32) -> Result<Actor> {
let key = Self::actor_key(id);
self.hget_model(0, &key).await
}
// pub async fn save_actor(&self, db: u32, actor: &Actor) -> Result<()> { ... }
// pub async fn load_actor(&self, db: u32, id: u32) -> Result<Actor> { ... }
// pub async fn save_actor_global(&self, actor: &Actor) -> Result<()> { ... }
// pub async fn load_actor_global(&self, id: u32) -> Result<Actor> { ... }
// -----------------------------
// Runner