fixed cloud hypervisor issues + updated test script (working now)

2025-08-21 13:32:03 +02:00
parent d735316b7f
commit aab2b6f128
2 changed files with 100 additions and 8 deletions
--- a/packages/system/virt/src/cloudhv/mod.rs
+++ b/packages/system/virt/src/cloudhv/mod.rs
@@ -8,6 +8,7 @@ use std::time::Duration;

 use sal_os;
 use sal_process;
+use crate::qcow2;

 /// Error type for Cloud Hypervisor operations
 #[derive(Debug)]
@@ -216,6 +217,45 @@ pub fn vm_start(id: &str) -> Result<(), CloudHvError> {
    };
    let log_file = vm_log_path(id).to_string_lossy().into_owned();

+    // Ensure API socket directory exists and remove any stale socket file
+    let api_path = Path::new(&api_socket);
+    if let Some(parent) = api_path.parent() {
+        fs::create_dir_all(parent).map_err(|e| CloudHvError::IoError(e.to_string()))?;
+    }
+    // Best-effort removal of stale socket
+    let _ = fs::remove_file(&api_path);
+
+    // Preflight disk: if source is qcow2, convert to raw to avoid CH "Compressed blocks not supported"
+    // Format detection is best-effort (skipped if qemu-img is unavailable or info fails); a failed conversion aborts the start.
+    let mut disk_to_use = rec.spec.disk_path.clone();
+    if let Ok(info) = qcow2::info(&disk_to_use) {
+        if info.get("format").and_then(|v| v.as_str()) == Some("qcow2") {
+            let dest = vm_dir(id).join("disk.raw").to_string_lossy().into_owned();
+            let cmd = format!(
+                "qemu-img convert -O raw {} {}",
+                shell_escape(&disk_to_use),
+                shell_escape(&dest)
+            );
+            match sal_process::run(&cmd).silent(true).execute() {
+                Ok(res) if res.success => {
+                    disk_to_use = dest;
+                }
+                Ok(res) => {
+                    return Err(CloudHvError::CommandFailed(format!(
+                        "Failed converting qcow2 to raw: {}",
+                        res.stderr
+                    )));
+                }
+                Err(e) => {
+                    return Err(CloudHvError::CommandFailed(format!(
+                        "Failed converting qcow2 to raw: {}",
+                        e
+                    )));
+                }
+            }
+        }
+    }
+
    // Build command (minimal args for Phase 2)
    // We redirect all output to log_file via shell and keep process in background with nohup

@@ -249,7 +289,7 @@ pub fn vm_start(id: &str) -> Result<(), CloudHvError> {
    }

    parts.push("--disk".into());
-    parts.push(format!("path={}", rec.spec.disk_path));
+    parts.push(format!("path={}", disk_to_use));
    parts.push("--cpus".into());
    parts.push(format!("boot={}", rec.spec.vcpus));
    parts.push("--memory".into());
@@ -342,20 +382,27 @@ pub fn vm_stop(id: &str, force: bool) -> Result<(), CloudHvError> {
        let _ = sal_process::run(&cmd).die(false).silent(true).execute();
    }

-    // Wait a bit for process to exit
+    // Wait for process to exit (up to ~10s)
    if let Some(pid) = rec.runtime.pid {
-        for _ in 0..20 {
+        for _ in 0..50 {
            if !proc_exists(pid) {
                break;
            }
            thread::sleep(Duration::from_millis(200));
        }
-        // If still alive and force, kill -9
+        // If still alive and force, kill -9 and wait again (up to ~10s)
        if proc_exists(pid) && force {
+            // Send SIGKILL without extra shell layers; suppress errors/noise
            let _ = sal_process::run(&format!("kill -9 {}", pid))
                .die(false)
                .silent(true)
                .execute();
+            for _ in 0..50 {
+                if !proc_exists(pid) {
+                    break;
+                }
+                thread::sleep(Duration::from_millis(200));
+            }
        }
    }

@@ -380,12 +427,22 @@ pub fn vm_delete(id: &str, delete_disks: bool) -> Result<(), CloudHvError> {
    let rec: VmRecord = serde_json::from_value(read_json(&p)?)
        .map_err(|e| CloudHvError::JsonError(e.to_string()))?;

-    // Refuse to delete if still running
+    // If the VM appears to be running, attempt a force stop first (best-effort)
    if let Some(pid) = rec.runtime.pid {
        if proc_exists(pid) {
-            return Err(CloudHvError::CommandFailed(
-                "VM appears to be running; stop it first".into(),
-            ));
+            let _ = vm_stop(id, true);
+            // Re-check original PID for liveness (up to ~5s)
+            for _ in 0..25 {
+                if !proc_exists(pid) {
+                    break;
+                }
+                thread::sleep(Duration::from_millis(200));
+            }
+            if proc_exists(pid) {
+                return Err(CloudHvError::CommandFailed(
+                    "VM appears to be running; stop it first".into(),
+                ));
+            }
        }
    }

--- a/packages/system/virt/tests/rhai/05_cloudhv_basic.rhai
+++ b/packages/system/virt/tests/rhai/05_cloudhv_basic.rhai
@@ -105,6 +105,41 @@ if !missing {
        print(`⚠️  VM start failed (this can happen if kernel/cmdline are incompatible): ${err}`);
    }

+    print("\n waiting for VM to be ready...");
+
+    // Discover API socket and PID from SAL
+    let info1 = cloudhv_vm_info(vm_id);
+    let api_sock = info1.spec.api_socket;
+    let pid = info1.runtime.pid;
+
+    // 1) Wait for API socket to appear (up to ~50s)
+    let sock_ok = false;
+    for x in 0..50 {
+        if exist(api_sock) { sock_ok = true; break; }
+        sleep(1);
+    }
+    print(`api_sock_exists=${sock_ok} path=${api_sock}`);
+
+    // 2) Probe ch-remote info with retries (up to ~20s)
+    if sock_ok {
+        let info_ok = false;
+        for x in 0..20 {
+            let r = run_silent(`ch-remote-static --api-socket ${api_sock} info`);
+            if r.success {
+                info_ok = true;
+                break;
+            }
+            sleep(1);
+        }
+        if info_ok {
+            print("VM API is ready (ch-remote info OK)");
+        } else {
+            print("⚠️ VM API did not become ready in time (continuing)");
+        }
+    } else {
+        print("⚠️ API socket not found (continuing)");
+    }
+
    print("\n--- Test 5: Stop VM (graceful) ---");
    try {
        cloudhv_vm_stop(vm_id, false);