This commit is contained in:
2025-08-22 13:18:50 +02:00
parent bc0d90d41a
commit e8d09164ff
4 changed files with 114 additions and 197 deletions

View File

@@ -1,28 +1,6 @@
import os from typing import Dict, List, Optional
import time from dataclasses import dataclass, field
import re
import sys
import toml
import libtmux
from libtmux.pane import Pane
from libtmux.window import Window
from libtmux.session import Session
import psutil
from typing import Dict, List, Optional, Any, Set, Tuple
from dataclasses import dataclass, field, asdict
from datetime import datetime from datetime import datetime
import uuid
from pathlib import Path
import asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import threading
# Configuration
WAITING_MESSAGE = "WAITING FOR JOBS"
HPY_SH_PATH = "/root/heromonkey/functions/hpy.sh" # Path to hpy.sh
@dataclass @dataclass
class ProcessMetrics: class ProcessMetrics:

View File

@@ -1,25 +1,8 @@
import os
import time
import re
import sys
import toml
import libtmux
from libtmux.pane import Pane
from libtmux.window import Window
from libtmux.session import Session
import psutil import psutil
from typing import Dict, List, Optional, Any, Set, Tuple from typing import List, Optional, Tuple
from dataclasses import dataclass, field, asdict
from datetime import datetime from datetime import datetime
import uuid from libtmux.pane import Pane
from pathlib import Path from .model import ProcessMetrics
import asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import threading
class ProcessMonitor: class ProcessMonitor:
"""Monitor processes running in tmux panes using psutil.""" """Monitor processes running in tmux panes using psutil."""

View File

@@ -1,25 +1,21 @@
import os import os
import time import time
import re import re
import sys
import toml import toml
import libtmux import libtmux
from libtmux.pane import Pane from libtmux.pane import Pane
from libtmux.window import Window from libtmux.window import Window
from libtmux.session import Session from libtmux.session import Session
import psutil from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Any, Set, Tuple from dataclasses import asdict
from dataclasses import dataclass, field, asdict
from datetime import datetime from datetime import datetime
import uuid import uuid
from pathlib import Path from pathlib import Path
import asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import threading import threading
from .model import DAGStructure, DirectoryStatus, TaskStatus, MetaData
from .process_monitor import ProcessMonitor
# Configuration # Configuration
WAITING_MESSAGE = "WAITING FOR JOBS" WAITING_MESSAGE = "WAITING FOR JOBS"
HPY_SH_PATH = "/root/heromonkey/functions/hpy.sh" # Path to hpy.sh HPY_SH_PATH = "/root/heromonkey/functions/hpy.sh" # Path to hpy.sh
@@ -230,48 +226,48 @@ class TaskRunner:
return all_success return all_success
def run(self): # def run(self):
"""Enhanced run method with DAG tracking.""" # """Enhanced run method with DAG tracking."""
print(f"Starting enhanced task orchestration for '{self.run_name}'") # print(f"Starting enhanced task orchestration for '{self.run_name}'")
print(f"Run ID: {self.run_id}") # print(f"Run ID: {self.run_id}")
print(f"DAG file: {self.dag_file_path}") # print(f"DAG file: {self.dag_file_path}")
self.dag.state = "RUNNING" # self.dag.state = "RUNNING"
self._save_dag() # self._save_dag()
# Initialize windows and panes (similar to original) # # Initialize windows and panes (similar to original)
self._setup_windows_and_panes() # self._setup_windows_and_panes()
# Process directories sequentially # # Process directories sequentially
overall_success = True # overall_success = True
for dir_idx in range(len(self.dag.directories)): # for dir_idx in range(len(self.dag.directories)):
directory = self.dag.directories[dir_idx] # directory = self.dag.directories[dir_idx]
print(f"\n--- Processing Directory {directory.directory_num} ---") # print(f"\n--- Processing Directory {directory.directory_num} ---")
# Start tasks if not the first directory # # Start tasks if not the first directory
if dir_idx > 0: # if dir_idx > 0:
self._start_directory_tasks(dir_idx) # self._start_directory_tasks(dir_idx)
# Monitor tasks # # Monitor tasks
success = self._monitor_directory_tasks( # success = self._monitor_directory_tasks(
dir_idx, # dir_idx,
directory.timeout # directory.timeout
) # )
if not success: # if not success:
overall_success = False # overall_success = False
# Update final DAG state # # Update final DAG state
self.dag.state = "COMPLETED" if overall_success else "FAILED" # self.dag.state = "COMPLETED" if overall_success else "FAILED"
self.dag.end_time = datetime.now().isoformat() # self.dag.end_time = datetime.now().isoformat()
if self.dag.start_time: # if self.dag.start_time:
start = datetime.fromisoformat(self.dag.start_time) # start = datetime.fromisoformat(self.dag.start_time)
end = datetime.fromisoformat(self.dag.end_time) # end = datetime.fromisoformat(self.dag.end_time)
self.dag.duration_seconds = (end - start).total_seconds() # self.dag.duration_seconds = (end - start).total_seconds()
self._save_dag() # self._save_dag()
print(f"\nTask orchestration completed: {self.dag.state}") # print(f"\nTask orchestration completed: {self.dag.state}")
print(f"Total duration: {self.dag.duration_seconds:.2f} seconds") # print(f"Total duration: {self.dag.duration_seconds:.2f} seconds")
def reset(self): def reset(self):
"""Kills all processes and panes inside task windows, removes windows, and deletes .done/.error/.ok files.""" """Kills all processes and panes inside task windows, removes windows, and deletes .done/.error/.ok files."""
@@ -452,7 +448,13 @@ class TaskRunner:
print(f" Pane {pane_idx}: Command sent: 'cd {script_dir}' and 'source {HPY_SH_PATH} && hpy {script_basename}'") print(f" Pane {pane_idx}: Command sent: 'cd {script_dir}' and 'source {HPY_SH_PATH} && hpy {script_basename}'")
def run(self): def run(self):
print(f"Starting task orchestration for run '{self.run_name}' from {self.tasks_root_dir}") """Enhanced run method with DAG tracking."""
print(f"Starting enhanced task orchestration for '{self.run_name}'")
print(f"Run ID: {self.run_id}")
print(f"DAG file: {self.dag_file_path}")
self.dag.state = "RUNNING"
self._save_dag()
if not self.all_tasks_with_meta: if not self.all_tasks_with_meta:
print("No tasks found to execute.") print("No tasks found to execute.")
@@ -464,81 +466,40 @@ class TaskRunner:
for script in scripts: for script in scripts:
print(f" - {script}") print(f" - {script}")
all_dir_nums = sorted(self.all_tasks_with_meta.keys()) # Initialize windows and panes
# Phase 1: Initial setup - create all windows and panes,
# execute first window's tasks, set others to WAITING.
self._setup_windows_and_panes() self._setup_windows_and_panes()
# Phase 2: Sequential execution and monitoring # Process directories sequentially
print("\n--- Starting Sequential Task Execution ---") overall_success = True
for window_idx, dir_num in enumerate(all_dir_nums): for dir_idx in range(len(self.dag.directories)):
scripts, metadata = self.all_tasks_with_meta[dir_num] directory = self.dag.directories[dir_idx]
window_name = f"{self.run_name}_{dir_num}" print(f"\n--- Processing Directory {directory.directory_num} ---")
panes_in_current_window = self.window_panes[window_idx]
if window_idx > 0: # Start tasks if not the first directory
self._start_directory_tasks(window_idx) if dir_idx > 0:
self._start_directory_tasks(dir_idx)
print(f" Monitoring tasks in window '{window_name}' (Timeout: {metadata.timeout}s)...") # Monitor tasks
start_time = time.time() success = self._monitor_directory_tasks(
script_states = {os.path.basename(s): "INITIAL" for s in scripts} # Track state for each script dir_idx,
directory.timeout
)
while True: if not success:
all_tasks_finished_or_failed = True overall_success = False
current_status_report = []
for pane_idx, script_path in enumerate(scripts): # Update final DAG state
script_basename = os.path.basename(script_path) self.dag.state = "COMPLETED" if overall_success else "FAILED"
pane = panes_in_current_window[pane_idx] self.dag.end_time = datetime.now().isoformat()
done_file = f"{script_path}.done" if self.dag.start_time:
error_file = f"{script_path}.error" start = datetime.fromisoformat(self.dag.start_time)
ok_file = f"{script_path}.ok" # Added .ok file check end = datetime.fromisoformat(self.dag.end_time)
self.dag.duration_seconds = (end - start).total_seconds()
self._save_dag()
current_script_state = script_states[script_basename] print(f"\nTask orchestration completed: {self.dag.state}")
new_script_state = current_script_state print(f"Total duration: {self.dag.duration_seconds:.2f} seconds")
print(f"You can attach to the tmux session to review: tmux attach -t {self.session.name}")
if os.path.exists(done_file) or os.path.exists(ok_file): # Check for .ok file
new_script_state = "DONE"
elif os.path.exists(error_file):
new_script_state = "ERROR"
elif self._is_pane_running(pane):
new_script_state = "RUNNING"
all_tasks_finished_or_failed = False
elif current_script_state == "RUNNING":
# If it was running but now isn't, and no done/error file, it crashed
new_script_state = "CRASHED"
if new_script_state != current_script_state:
script_states[script_basename] = new_script_state
current_status_report.append(f" - {script_basename}: {new_script_state}")
if current_status_report:
print(f" Status update for window '{window_name}' (Elapsed: {int(time.time() - start_time)}s):")
for report_line in current_status_report:
print(report_line)
if all_tasks_finished_or_failed:
print(f" All tasks in window '{window_name}' completed or failed.")
break
elapsed_time = time.time() - start_time
if elapsed_time > metadata.timeout:
print(f" ERROR: Tasks in window '{window_name}' timed out after {metadata.timeout}s.")
for pane_idx, script_path in enumerate(scripts):
script_basename = os.path.basename(script_path)
pane = panes_in_current_window[pane_idx]
if script_states[script_basename] == "RUNNING":
script_states[script_basename] = "TIMED_OUT"
print(f" - {script_basename}: TIMED_OUT (Killing pane)")
pane.send_keys("C-c", literal=True)
time.sleep(1)
break
time.sleep(5)
print("\nAll task directories processed. You can attach to the tmux session to review:")
print(f"tmux attach -t {self.session.name}")
def cleanup(self): def cleanup(self):
"""Removes all tmux windows created by this run.""" """Removes all tmux windows created by this run."""
@@ -570,25 +531,29 @@ class TaskRunner:
print("Cleanup complete.") print("Cleanup complete.")
def reset(self): def reset(self):
"""Kills all processes and panes inside task windows, removes windows, and deletes .done/.error files.""" """Kills all processes and panes inside task windows, removes windows, and deletes .done/.error/.ok files."""
print(f"\n--- Resetting run '{self.run_name}' ---") print(f"\n--- Resetting run '{self.run_name}' ---")
self.cleanup() # First, kill all associated tmux windows self.cleanup() # First, kill all associated tmux windows
# Then, remove all .done and .error files # Then, remove all .done, .error, and .ok files
print(" Removing .done and .error files...") print(" Removing .done, .error, and .ok files...")
for dir_num, (scripts, metadata) in self.all_tasks_with_meta.items(): for dir_num, (scripts, metadata) in self.all_tasks_with_meta.items():
for script_path in scripts: for script_path in scripts:
done_file = f"{script_path}.done" done_file = f"{script_path}.done"
error_file = f"{script_path}.error" error_file = f"{script_path}.error"
ok_file = f"{script_path}.ok" # Added .ok file to be removed ok_file = f"{script_path}.ok"
if os.path.exists(done_file): if os.path.exists(done_file):
os.remove(done_file) os.remove(done_file)
print(f" Removed: {done_file}") print(f" Removed: {done_file}")
if os.path.exists(error_file): if os.path.exists(error_file):
os.remove(error_file) os.remove(error_file)
print(f" Removed: {error_file}") print(f" Removed: {error_file}")
if os.path.exists(ok_file): # Remove .ok file if os.path.exists(ok_file):
os.remove(ok_file) os.remove(ok_file)
print(f" Removed: {ok_file}") print(f" Removed: {ok_file}")
print("Reset complete.")
# Also remove the .dag.toml file if it exists
if hasattr(self, 'dag_file_path') and self.dag_file_path.exists():
os.remove(self.dag_file_path)
print(f" Removed: {self.dag_file_path}")
print("Reset complete.")

View File

@@ -1,31 +1,19 @@
import os import threading
import time import time
import re from typing import Dict, List, Optional
import sys from dataclasses import asdict
import toml
import libtmux
from libtmux.pane import Pane
from libtmux.window import Window
from libtmux.session import Session
import psutil
from typing import Dict, List, Optional, Any, Set, Tuple
from dataclasses import dataclass, field, asdict
from datetime import datetime from datetime import datetime
import uuid
from pathlib import Path
import asyncio
import uvicorn import uvicorn
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import threading
from .task_runner import TaskRunner
class TaskRunnerAPI: class TaskRunnerAPI:
"""FastAPI interface for the task runner.""" """FastAPI interface for the task runner."""
def __init__(self, runner: EnhancedTaskRunner): def __init__(self, runner: TaskRunner):
self.runner = runner self.runner = runner
self.app = FastAPI(title="Task Runner API", version="1.0.0") self.app = FastAPI(title="Task Runner API", version="1.0.0")
@@ -132,13 +120,15 @@ class TaskRunnerAPI:
"""Start the FastAPI server.""" """Start the FastAPI server."""
uvicorn.run(self.app, host=host, port=port) uvicorn.run(self.app, host=host, port=port)
class TaskOrchestrator: class TaskOrchestrator:
"""Main orchestrator that runs tasks and API server.""" """Main orchestrator that runs tasks and API server."""
def __init__(self, tasks_dir: str, api_port: int = 8000): def __init__(self, tasks_dir: str, api_port: int = 8000):
self.runner = EnhancedTaskRunner(tasks_dir) self.runner = TaskRunner(tasks_dir)
self.api = TaskRunnerAPI(self.runner) self.api = TaskRunnerAPI(self.runner)
self.api_thread = None self.api_thread = None
self.api_port = api_port
def start_api_server(self, port: int = 8000): def start_api_server(self, port: int = 8000):
"""Start API server in a separate thread.""" """Start API server in a separate thread."""
@@ -153,7 +143,7 @@ class TaskOrchestrator:
def run(self): def run(self):
"""Run the task orchestration.""" """Run the task orchestration."""
# Start API server # Start API server
self.start_api_server() self.start_api_server(self.api_port)
# Reset and run tasks # Reset and run tasks
self.runner.reset() self.runner.reset()
@@ -166,6 +156,7 @@ class TaskOrchestrator:
self.runner._save_dag() self.runner._save_dag()
print("\nExecution completed. API server still running.") print("\nExecution completed. API server still running.")
print(f"Access API at: http://localhost:{self.api_port}")
print("Press Ctrl+C to stop the API server.") print("Press Ctrl+C to stop the API server.")
try: try: