diff --git a/crates/orchestrator/Cargo.toml b/crates/orchestrator/Cargo.toml index d68603d..57af83b 100644 --- a/crates/orchestrator/Cargo.toml +++ b/crates/orchestrator/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "orchestrator" -version = "0.1.5" +version = "0.1.6" edition = "2024" build = "build.rs" diff --git a/crates/orchestrator/src/scheduler.rs b/crates/orchestrator/src/scheduler.rs index 2d69024..ce70da6 100644 --- a/crates/orchestrator/src/scheduler.rs +++ b/crates/orchestrator/src/scheduler.rs @@ -217,17 +217,13 @@ impl Scheduler { warn!(request_id = %item.ctx.request_id, label = %label_key, "SSH execution not supported on this platform/backend; skipping"); } - // Stop and destroy VM after attempting execution + // Stop VM after attempting execution if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await { error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM"); } let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await; - if let Err(e) = hv.destroy(h.clone()).await { - error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM"); - } - let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await; - // Stop console tailer + // Stop console tailer before we snapshot the file to avoid races if let Some(t) = tailer_opt.take() { t.abort(); } // If no logs were captured (e.g., SSH never connected), snapshot the final console log @@ -238,6 +234,12 @@ impl Scheduler { } } + // Destroy VM and then persist final destroyed state + if let Err(e) = hv.destroy(h.clone()).await { + error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM"); + } + let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await; + // Persist final state and publish result let final_state = if success { JobState::Succeeded } else { JobState::Failed }; let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await; @@ -339,7 +341,7 @@ async fn snapshot_console_to_joblog(persist: Arc, request_id: Uuid, con async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option { use tokio::{task, time::{sleep, Instant, Duration}}; use std::process::Command; - use tracing::debug; + use tracing::{debug, warn}; fn parse_ipv4_from_text(s: &str) -> Option { for line in s.lines() { @@ -362,33 +364,66 @@ async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option Option { + fn preview_bytes(b: &[u8]) -> String { + let s = String::from_utf8_lossy(b); + let s = s.trim(); + let mut out = s.to_string(); + if out.len() > 800 { out.truncate(800); out.push_str("…"); } + out + } + + #[derive(Debug, Clone)] + struct Attempt { + cmd: String, + ok: bool, + status: Option, + stdout: String, + stderr: String, + } + + async fn run_cmd(args: &[&str]) -> Attempt { let args_vec = args.iter().map(|s| s.to_string()).collect::>(); - task::spawn_blocking(move || { + let cmd_desc = format!("virsh {}", args_vec.join(" ")); + match task::spawn_blocking(move || { Command::new("virsh").args(&args_vec).output() - }).await.ok()?.ok().and_then(|out| { - if out.status.success() { - Some(String::from_utf8_lossy(&out.stdout).to_string()) - } else { None } - }) + }).await { + Ok(Ok(out)) => { + let ok = out.status.success(); + let status = out.status.code(); + let stdout = preview_bytes(&out.stdout); + let stderr = preview_bytes(&out.stderr); + Attempt { cmd: cmd_desc, ok, status, stdout, stderr } + } + other => { + // spawn or io error + Attempt { cmd: cmd_desc, ok: false, status: None, stdout: String::new(), stderr: format!("spawn error: {:?}", other) } + } + } } let deadline = Instant::now() + timeout; + let mut last_attempts: Vec = Vec::new(); while Instant::now() < deadline { + last_attempts.clear(); // 1) Try domifaddr via agent then lease then default for source in [Some("agent"), Some("lease"), None] { let mut args = vec!["domifaddr", domain]; if let Some(src) = source { args.push("--source"); args.push(src); } - if let Some(out) = run_cmd(&args).await { - if let Some(ip) = parse_ipv4_from_text(&out) { debug!(domain=%domain, method=%format!("domifaddr/{:?}", source), ip=%ip, "discovered IP"); return Some(ip); } + let att = run_cmd(&args).await; + debug!(domain=%domain, method=%format!("domifaddr/{:?}", source), ok=att.ok, status=?att.status, stdout=%att.stdout, stderr=%att.stderr, cmd=%att.cmd, "virsh attempt"); + if att.ok { + if let Some(ip) = parse_ipv4_from_text(&att.stdout) { debug!(domain=%domain, method=%format!("domifaddr/{:?}", source), ip=%ip, "discovered IP"); return Some(ip); } } + last_attempts.push(att); } // 2) Try domiflist to get MAC and possibly network name let mut mac: Option = None; let mut net_name: Option = None; - if let Some(out) = run_cmd(&["domiflist", domain]).await { - for line in out.lines().skip(2) { // skip header lines + let att_domiflist = run_cmd(&["domiflist", domain]).await; + debug!(domain=%domain, method="domiflist", ok=att_domiflist.ok, status=?att_domiflist.status, stdout=%att_domiflist.stdout, stderr=%att_domiflist.stderr, cmd=%att_domiflist.cmd, "virsh attempt"); + if att_domiflist.ok { + for line in att_domiflist.stdout.lines().skip(2) { // skip header lines let cols: Vec<&str> = line.split_whitespace().collect(); if cols.len() >= 5 { // columns: Interface Type Source Model MAC @@ -398,6 +433,7 @@ async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option Option - if let Some(net) = net_name { - if let Some(out) = run_cmd(&["net-dhcp-leases", &net]).await { + if let Some(net) = net_name.clone() { + let att_leases = run_cmd(&["net-dhcp-leases", &net]).await; + debug!(domain=%domain, method="net-dhcp-leases", ok=att_leases.ok, status=?att_leases.status, stdout=%att_leases.stdout, stderr=%att_leases.stderr, cmd=%att_leases.cmd, "virsh attempt"); + if att_leases.ok { if let Some(ref mac_s) = mac { - for line in out.lines() { + for line in att_leases.stdout.lines() { if line.to_ascii_lowercase().contains(mac_s.as_str()) { if let Some(ip) = parse_ipv4_from_text(line) { debug!(domain=%domain, method="net-dhcp-leases", ip=%ip, "discovered IP"); return Some(ip); } } } } - if let Some(ip) = parse_ipv4_from_text(&out) { debug!(domain=%domain, method="net-dhcp-leases-any", ip=%ip, "discovered IP"); return Some(ip); } + if let Some(ip) = parse_ipv4_from_text(&att_leases.stdout) { debug!(domain=%domain, method="net-dhcp-leases-any", ip=%ip, "discovered IP"); return Some(ip); } } + last_attempts.push(att_leases); } sleep(Duration::from_secs(1)).await; } + + // No IP found; emit a summary at warn level with the last attempts' details + if !last_attempts.is_empty() { + for att in last_attempts { + warn!(domain=%domain, cmd=%att.cmd, ok=att.ok, status=?att.status, stdout=%att.stdout, stderr=%att.stderr, "virsh attempt summary"); + } + } None }