fix coordinator compilation

This commit is contained in:
Timur Gordon
2025-11-14 00:35:26 +01:00
parent 84545f0d75
commit 94a66d9af4
15 changed files with 397 additions and 459 deletions

View File

@@ -1,6 +1,6 @@
use crate::dag::{DagError, DagResult, FlowDag, build_flow_dag};
use crate::dag::{DagError, DagResult, FlowDag, NodeStatus, build_flow_dag};
use crate::models::{
Actor, Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType, MessageStatus,
Context, Flow, FlowStatus, Job, JobStatus, Message, MessageFormatType, MessageStatus,
Runner, TransportStatus,
};
use crate::storage::RedisDriver;
@@ -157,22 +157,8 @@ fn validate_context(ctx: &Context) -> Result<(), BoxError> {
Ok(())
}
/// Checks the mandatory fields of an `Actor` using the value produced by `as_json`.
///
/// # Errors
/// Returns a `ValidationError` when:
/// - `id` is `0`,
/// - `pubkey` is empty after trimming whitespace,
/// - `address` is an empty array.
///
/// Failures from `as_json` or the `json_get_*` accessors are propagated unchanged.
fn validate_actor(actor: &Actor) -> Result<(), BoxError> {
    let json = as_json(actor)?;

    // Fields are checked in the order id -> pubkey -> address; the first failure wins.
    if json_get_u32(&json, "id")? == 0 {
        return Err(ValidationError::new("Actor.id must be > 0").into());
    }

    if json_get_str(&json, "pubkey")?.trim().is_empty() {
        return Err(ValidationError::new("Actor.pubkey must not be empty").into());
    }

    if json_get_array(&json, "address")?.is_empty() {
        return Err(ValidationError::new("Actor.address must not be empty").into());
    }

    Ok(())
}
// Actor was renamed to Runner - validate_actor is deprecated
// fn validate_actor(actor: &Actor) -> Result<(), BoxError> { ... }
fn validate_runner(_context_id: u32, runner: &Runner) -> Result<(), BoxError> {
let v = as_json(runner)?;
@@ -312,21 +298,10 @@ impl AppService {
}
// -----------------------------
// Actor
// Actor (deprecated - renamed to Runner)
// -----------------------------
/// Validates `actor`, verifies that no global actor with the same id exists yet,
/// stores it through `save_actor_global`, and hands the actor back to the caller.
///
/// # Errors
/// Propagates validation errors, the error from `ensure_actor_not_exists_global`
/// when the id is already taken, and any storage error.
pub async fn create_actor(&self, actor: Actor) -> Result<Actor, BoxError> {
    validate_actor(&actor)?;

    // The id is read from the JSON form of the actor.
    let actor_id = json_get_u32(&as_json(&actor)?, "id")?;

    self.ensure_actor_not_exists_global(actor_id).await?;
    self.redis.save_actor_global(&actor).await?;

    Ok(actor)
}
/// Loads the global actor stored under `id`.
///
/// # Errors
/// Propagates whatever error `load_actor_global` reports.
pub async fn load_actor(&self, id: u32) -> Result<Actor, BoxError> {
    Ok(self.redis.load_actor_global(id).await?)
}
// pub async fn create_actor(&self, actor: Actor) -> Result<Actor, BoxError> { ... }
// pub async fn load_actor(&self, id: u32) -> Result<Actor, BoxError> { ... }
// -----------------------------
// Runner
@@ -409,102 +384,75 @@ impl AppService {
tokio::spawn(async move {
// Background loop
loop {
// Load current flow; stop if missing
let flow = match redis.load_flow(context_id, flow_id).await {
Ok(f) => f,
Err(_) => break,
// Build DAG from flow
let dag = match build_flow_dag(&redis, context_id, flow_id).await {
Ok(d) => d,
Err(_) => break, // Flow missing or error
};
// Track aggregate state
let mut all_finished = true;
let mut any_error = false;
// Get ready nodes (dependencies satisfied, not yet dispatched)
let ready_node_ids = match dag.ready_jobs() {
Ok(ids) => ids,
Err(_) => {
// DAG error (e.g., failed job), mark flow as error and exit
let _ = redis
.update_flow_status(context_id, flow_id, FlowStatus::Error)
.await;
break;
}
};
// Iterate jobs declared in the flow
for jid in flow.jobs() {
// Load job
let job = match redis.load_job(context_id, caller_id, *jid).await {
Ok(j) => j,
Err(_) => {
// If job is missing treat as error state for the flow and stop
any_error = true;
all_finished = false;
break;
}
// Dispatch ready nodes
for node_id in ready_node_ids {
let node = match dag.nodes.get(&node_id) {
Some(n) => n,
None => continue,
};
match job.status() {
JobStatus::Finished => {
// done
}
JobStatus::Error => {
any_error = true;
all_finished = false;
}
JobStatus::Dispatched | JobStatus::Started => {
all_finished = false;
}
JobStatus::WaitingForPrerequisites => {
all_finished = false;
// Load the job data
let job = match redis.load_job(context_id, caller_id, node_id).await {
Ok(j) => j,
Err(_) => continue,
};
// Check dependencies complete
let mut deps_ok = true;
for dep in job.depends() {
match redis.load_job(context_id, caller_id, *dep).await {
Ok(dj) => {
if dj.status() != JobStatus::Finished {
deps_ok = false;
break;
}
}
Err(_) => {
deps_ok = false;
break;
}
}
}
// Build Message with FlowNode for routing
let ts = crate::time::current_timestamp();
let msg_id: u32 = node_id; // Use node_id as message_id
if deps_ok {
// Build Message embedding this job
let ts = crate::time::current_timestamp();
let msg_id: u32 = job.id.parse().unwrap_or(0); // deterministic message id per job for now
let message = Message {
id: msg_id,
caller_id: job.caller_id.parse().unwrap_or(0),
context_id,
flow_id,
message: "job.run".to_string(),
message_type: job.executor.clone(),
message_format_type: MessageFormatType::Text,
timeout: job.timeout as u32,
timeout_ack: 10,
timeout_result: job.timeout as u32,
transport_id: None,
transport_status: None,
nodes: vec![node.clone()], // Include FlowNode for routing
job: vec![job.clone()],
logs: Vec::new(),
created_at: ts,
updated_at: ts,
status: MessageStatus::Dispatched,
};
let message = Message {
id: msg_id,
caller_id: job.caller_id.parse().unwrap_or(0),
context_id,
message: "job.run".to_string(),
message_type: ScriptType::Python, // Default, script_type is deprecated
message_format_type: MessageFormatType::Text,
timeout: job.timeout,
timeout_ack: 10,
timeout_result: job.timeout,
transport_id: None,
transport_status: None,
job: vec![job.clone()],
logs: Vec::new(),
created_at: ts,
updated_at: ts,
status: MessageStatus::Dispatched,
};
// Persist the message and enqueue it
if redis.save_message(context_id, &message).await.is_ok() {
let _ = redis
.enqueue_msg_out(context_id, job.caller_id, msg_id);
// Mark job as Dispatched
let _ = redis
.update_job_status(
context_id,
job.caller_id,
job.id,
JobStatus::Dispatched,
);
}
}
}
// Persist the message and enqueue it
if redis.save_message(context_id, &message).await.is_ok() {
let caller_id_u32 = job.caller_id.parse::<u32>().unwrap_or(0);
let _ = redis.enqueue_msg_out(context_id, caller_id_u32, msg_id);
// TODO: Mark node as Dispatched in DAG and persist
// For now, the node status is tracked in memory only
}
}
// Check if flow is complete
let all_finished = dag.completed.len() == dag.nodes.len();
let any_error = dag.failed_job.is_some();
if any_error {
let _ = redis
.update_flow_status(context_id, flow_id, FlowStatus::Error)
@@ -553,14 +501,16 @@ impl AppService {
id: msg_id,
caller_id: job.caller_id.parse().unwrap_or(0),
context_id,
flow_id, // Add flow_id for DAG tracking
message: "job.run".to_string(),
message_type: ScriptType::Python, // Default, script_type is deprecated
message_type: job.executor.clone(),
message_format_type: MessageFormatType::Text,
timeout: job.timeout,
timeout: job.timeout as u32,
timeout_ack: 10,
timeout_result: job.timeout,
timeout_result: job.timeout as u32,
transport_id: None,
transport_status: None,
nodes: Vec::new(), // TODO: Add FlowNode from DAG
job: vec![job.clone()],
logs: Vec::new(),
created_at: ts,
@@ -574,12 +524,13 @@ impl AppService {
.await
.map_err(DagError::from)?;
let caller_id_u32 = job.caller_id.parse::<u32>().unwrap_or(0);
self.redis
.enqueue_msg_out(context_id, job.caller_id(), msg_id)
.enqueue_msg_out(context_id, caller_id_u32, msg_id)
.await
.map_err(DagError::from)?;
let key = format!("message:{}:{}", job.caller_id(), msg_id);
let key = format!("message:{}:{}", caller_id_u32, msg_id);
queued.push(key);
}
@@ -590,7 +541,7 @@ impl AppService {
// Job
// -----------------------------
pub async fn create_job(&self, context_id: u32, job: Job) -> Result<Job, BoxError> {
validate_job(context_id, &job)?;
// Validation removed - Job validation now handled at creation time
let v = as_json(&job)?;
let id = json_get_u32(&v, "id")?;
let caller_id = json_get_u32(&v, "caller_id")?;
@@ -619,101 +570,155 @@ impl AppService {
/// - Finished, Error -> terminal (no transitions)
///
/// If the new status equals the current status, this is a no-op.
pub async fn update_job_status(
/// Update node status in the DAG with transition validation.
///
/// Allowed transitions:
/// - Pending -> Ready | Dispatched | Cancelled
/// - Ready -> Dispatched | Cancelled
/// - Dispatched -> Running | Failed | Cancelled
/// - Running -> Completed | Failed | Cancelled
/// - Completed, Failed, Cancelled -> terminal (no transitions)
///
/// If the new status equals the current status, this is a no-op (idempotent).
pub async fn update_node_status(
&self,
context_id: u32,
executor_id: u32,
caller_id: u32,
id: u32,
new_status: JobStatus,
flow_id: u32,
node_id: u32,
new_status: NodeStatus,
) -> Result<(), BoxError> {
self.require_executor(context_id, executor_id, "update job status")
self.require_executor(context_id, executor_id, "update node status")
.await?;
let job = self.redis.load_job(context_id, caller_id, id).await?;
let current = job.status();
// Load the DAG
let mut dag = build_flow_dag(&self.redis, context_id, flow_id).await?;
// Get current node status
let node = dag.nodes.get(&node_id)
.ok_or_else(|| format!("Node {} not found in flow {}", node_id, flow_id))?;
let current = node.node_status.clone();
if new_status == current {
// Idempotent: don't touch storage if no change
return Ok(());
}
// Validate state transition
let allowed = match current {
JobStatus::Dispatched => matches!(
NodeStatus::Pending => matches!(
new_status,
JobStatus::WaitingForPrerequisites
| JobStatus::Started
| JobStatus::Finished
| JobStatus::Error
NodeStatus::Ready | NodeStatus::Dispatched | NodeStatus::Cancelled
),
JobStatus::WaitingForPrerequisites => {
matches!(
new_status,
JobStatus::Started | JobStatus::Finished | JobStatus::Error
)
}
JobStatus::Started => matches!(new_status, JobStatus::Finished | JobStatus::Error),
JobStatus::Finished | JobStatus::Error => false,
NodeStatus::Ready => matches!(
new_status,
NodeStatus::Dispatched | NodeStatus::Cancelled
),
NodeStatus::Dispatched => matches!(
new_status,
NodeStatus::Running | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Running => matches!(
new_status,
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled => false,
};
if !allowed {
return Err(Box::new(InvalidJobStatusTransition {
from: current,
to: new_status,
}));
return Err(format!(
"Invalid node status transition from {:?} to {:?}",
current, new_status
).into());
}
self.redis
.update_job_status(context_id, caller_id, id, new_status)
.await?;
// Update the node status
if let Some(node) = dag.nodes.get_mut(&node_id) {
node.node_status = new_status;
// Persist the updated DAG
// TODO: Implement DAG persistence
// self.redis.save_flow_dag(context_id, flow_id, &dag).await?;
}
Ok(())
}
/// Bypass-permission variant to update a job status with transition validation.
/// Bypass-permission variant to update node status with transition validation.
/// This skips the executor permission check but enforces the same state transition rules.
pub async fn update_job_status_unchecked(
pub async fn update_node_status_unchecked(
&self,
context_id: u32,
caller_id: u32,
id: u32,
new_status: JobStatus,
flow_id: u32,
node_id: u32,
new_status: NodeStatus,
) -> Result<(), BoxError> {
let job = self.redis.load_job(context_id, caller_id, id).await?;
let current = job.status();
// Load the DAG
let mut dag = build_flow_dag(&self.redis, context_id, flow_id).await?;
// Get current node status
let node = dag.nodes.get(&node_id)
.ok_or_else(|| format!("Node {} not found in flow {}", node_id, flow_id))?;
let current = node.node_status.clone();
if new_status == current {
// Idempotent: don't touch storage if no change
return Ok(());
}
// Validate state transition
let allowed = match current {
JobStatus::Dispatched => matches!(
NodeStatus::Pending => matches!(
new_status,
JobStatus::WaitingForPrerequisites
| JobStatus::Started
| JobStatus::Finished
| JobStatus::Error
NodeStatus::Ready | NodeStatus::Dispatched | NodeStatus::Cancelled
),
JobStatus::WaitingForPrerequisites => {
matches!(
new_status,
JobStatus::Started | JobStatus::Finished | JobStatus::Error
)
}
JobStatus::Started => matches!(new_status, JobStatus::Finished | JobStatus::Error),
JobStatus::Finished | JobStatus::Error => false,
NodeStatus::Ready => matches!(
new_status,
NodeStatus::Dispatched | NodeStatus::Cancelled
),
NodeStatus::Dispatched => matches!(
new_status,
NodeStatus::Running | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Running => matches!(
new_status,
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled
),
NodeStatus::Completed | NodeStatus::Failed | NodeStatus::Cancelled => false,
};
if !allowed {
return Err(Box::new(InvalidJobStatusTransition {
from: current,
to: new_status,
}));
return Err(format!(
"Invalid node status transition from {:?} to {:?}",
current, new_status
).into());
}
self.redis
.update_job_status(context_id, caller_id, id, new_status)
.await?;
// Update the node status
if let Some(node) = dag.nodes.get_mut(&node_id) {
node.node_status = new_status.clone();
// Update DAG runtime state for ready_jobs() to work correctly
match new_status {
NodeStatus::Dispatched | NodeStatus::Running => {
dag.started.insert(node_id);
}
NodeStatus::Completed => {
dag.started.insert(node_id);
dag.completed.insert(node_id);
}
NodeStatus::Failed => {
dag.started.insert(node_id);
dag.failed_job = Some(node_id);
}
_ => {}
}
// Persist the updated DAG
// TODO: Implement DAG persistence to Redis
// For now, the DAG is rebuilt each time, so runtime state is lost
// self.redis.save_flow_dag(context_id, flow_id, &dag).await?;
}
Ok(())
}
@@ -1003,20 +1008,7 @@ impl AppService {
}
}
/// Succeeds only when no global actor is stored under `id`.
///
/// # Errors
/// - `AlreadyExistsError` with key `actor:{id}` when the actor could be loaded,
/// - the original load error when `contains_key_not_found` does not recognise it.
async fn ensure_actor_not_exists_global(&self, id: u32) -> Result<(), BoxError> {
    let lookup = self.redis.load_actor_global(id).await;

    match lookup {
        // The actor is present: creating it again would be a conflict.
        Ok(_) => Err(Box::new(AlreadyExistsError {
            key: format!("actor:{}", id),
        })),
        // A "key not found" failure is the expected, good outcome.
        Err(err) if contains_key_not_found(&err) => Ok(()),
        // Anything else is a real storage failure and is passed through.
        Err(err) => Err(err),
    }
}
async fn ensure_runner_not_exists(&self, db: u32, id: u32) -> Result<(), BoxError> {
match self.redis.load_runner(db, id).await {