From 190eb5532f1c50ca79b95f0950f3bab4f312e04a132156561ae00a0bc66036c9 Mon Sep 17 00:00:00 2001 From: Till Wegmueller Date: Tue, 7 Apr 2026 15:50:54 +0200 Subject: [PATCH] Replace scheduler SSH/console code with vm-manager APIs - IP discovery: use hv.guest_ip() with timeout loop instead of discover_guest_ip_virsh() (500+ lines removed from hot path) - SSH: use vm_manager::ssh::connect_with_retry() + upload() + exec() instead of hand-rolled TCP/ssh2/SFTP code - Console: use vm_manager::console::ConsoleTailer over Unix socket instead of file-based tail_console_to_joblog() - Add guest_ip() to orchestrator Hypervisor trait with default impl - Remove #[cfg(linux, libvirt)] gates from is_illumos_label, expand_tilde - Keep orchestrator-specific: DB persistence, log recording, MQ publish, runner binary selection, env var injection --- crates/orchestrator/src/hypervisor.rs | 3 + crates/orchestrator/src/scheduler.rs | 224 ++++++++++++++++---------- crates/orchestrator/src/vm_adapter.rs | 7 + 3 files changed, 151 insertions(+), 83 deletions(-) diff --git a/crates/orchestrator/src/hypervisor.rs b/crates/orchestrator/src/hypervisor.rs index d2a226f..e41d37f 100644 --- a/crates/orchestrator/src/hypervisor.rs +++ b/crates/orchestrator/src/hypervisor.rs @@ -77,6 +77,9 @@ pub trait Hypervisor: Send + Sync { async fn state(&self, _vm: &VmHandle) -> Result<VmState> { Ok(VmState::Prepared) } + async fn guest_ip(&self, vm: &VmHandle) -> Result<String> { + Ok("127.0.0.1".to_string()) + } } /// A router that delegates to the correct backend implementation per job. 
diff --git a/crates/orchestrator/src/scheduler.rs b/crates/orchestrator/src/scheduler.rs index 7c892c2..68678fb 100644 --- a/crates/orchestrator/src/scheduler.rs +++ b/crates/orchestrator/src/scheduler.rs @@ -205,110 +205,170 @@ impl Scheduler { tokio::time::sleep(Duration::from_secs(boot_wait)).await; } - // Start serial console tailer to capture early boot logs into job log - let console_path = h.work_dir.join("console.log"); - let persist_for_tailer = persist.clone(); - let req_for_tailer = item.ctx.request_id; - let mut tailer_opt = Some(tokio::spawn(async move { - let _ = tail_console_to_joblog(persist_for_tailer, req_for_tailer, console_path).await; - })); + // Start serial console tailer via vm-manager + let (console_stop_tx, console_stop_rx) = tokio::sync::watch::channel(false); + let (console_line_tx, mut console_line_rx) = tokio::sync::mpsc::channel::<String>(512); + let persist_for_console = persist.clone(); + let req_for_console = item.ctx.request_id; - // Attempt to discover guest IP (libvirt only) and run the runner over SSH + // Spawn console tailer that reads from the QEMU serial socket + if let Some(ref console_socket) = h.console_socket { + let endpoint = vm_manager::ConsoleEndpoint::UnixSocket(console_socket.clone()); + tokio::spawn(vm_manager::console::ConsoleTailer::tail(endpoint, console_line_tx.clone(), console_stop_rx)); + } + // Spawn task that records console lines into the DB + let console_recorder = tokio::spawn(async move { + let mut seq: i64 = -1_000_000; + while let Some(line) = console_line_rx.recv().await { + let trimmed = line.trim_end_matches(['\n', '\r']); + let obj = serde_json::json!({ + "category": "boot", + "level": "info", + "msg": trimmed + }); + let _ = persist_for_console.record_log_line(req_for_console, seq, false, &obj.to_string()).await; + seq += 1; + } + }); + + // Discover guest IP and run the workflow via SSH using vm-manager let mut success = false; let mut exit_code: i32 = 1; let mut failure_summary: Option<String> = None; - 
#[cfg(all(target_os = "linux", feature = "libvirt"))] { let exec_cfg = exec_shared_inner.clone(); - let uri = exec_cfg.libvirt_uri.clone(); - let net_opt = if exec_cfg.libvirt_network.trim().is_empty() { None } else { Some(exec_cfg.libvirt_network.clone()) }; - match discover_guest_ip_virsh( - &h.id, + // Discover guest IP via vm-manager + let ip_result = tokio::time::timeout( Duration::from_secs(exec_cfg.ssh_connect_timeout_secs.min(300)), - &uri, - net_opt.as_deref(), - ).await { - Some(ip) => { - let ip_owned = ip.clone(); - let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string()); - let per_job_key_path = item.ctx.ssh_private_key_path.as_ref().map(|s| expand_tilde(s)); - let key_mem_opt = item.ctx.ssh_private_key_pem.clone(); - // Choose correct runner binary based on label (illumos vs linux) - let runner_path = if is_illumos_label(&item.spec.label) { - &exec_cfg.runner_illumos_path - } else { - &exec_cfg.runner_linux_path - }; - let local_runner = expand_tilde(runner_path); - info!(label = %item.spec.label, runner = %runner_path, "selected runner binary"); - let remote_path = PathBuf::from(&exec_cfg.remote_runner_path); - let repo_url = item.ctx.repo_url.clone(); - let commit_sha = item.ctx.commit_sha.clone(); - let request_id = item.ctx.request_id; - match run_job_via_ssh_with_retry( - ip_owned, - user, - key_mem_opt, - per_job_key_path, - local_runner, - remote_path, - repo_url, - commit_sha, - request_id, - Duration::from_secs(exec_cfg.ssh_connect_timeout_secs), - ).await { - Ok((ok, code, lines)) => { - success = ok; exit_code = code; - // Persist lines and capture last non-empty stderr for summary - let mut seq: i64 = 0; - let mut last_stderr: Option<String> = None; - for (is_stderr, line) in lines { - let line_ref = line.trim_end_matches(['\n', '\r']); - if is_stderr && !line_ref.trim().is_empty() { - last_stderr = Some(line_ref.to_string()); - } - let _ = persist.record_log_line(item.ctx.request_id, seq, is_stderr, line_ref).await; - seq += 1; 
- } - if !success { - failure_summary = Some(match last_stderr { - Some(ref msg) => format!("job script failed: exit_code={} — {}", exit_code, msg), - None => format!("job script failed: exit_code={}", exit_code), - }); - } - }, - Err(e) => { - failure_summary = Some(format!("ssh/runner error: {}", e)); - warn!(error = %e, request_id = %item.ctx.request_id, ip = %ip, "ssh runner execution failed"); + async { + loop { + match hv.guest_ip(&h).await { + Ok(ip) => return ip, + Err(_) => tokio::time::sleep(Duration::from_secs(1)).await, } } } - None => { - warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP via virsh"); + ).await; + + match ip_result { + Ok(ip) => { + let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string()); + let ssh_port = h.ssh_host_port.unwrap_or(22); + let ssh_cfg = vm_manager::SshConfig { + user: user.clone(), + public_key: item.ctx.ssh_public_key.clone(), + private_key_path: item.ctx.ssh_private_key_path.as_ref().map(PathBuf::from), + private_key_pem: item.ctx.ssh_private_key_pem.clone(), + }; + // Connect with retry + match vm_manager::ssh::connect_with_retry( + &ip, ssh_port, &ssh_cfg, + Duration::from_secs(exec_cfg.ssh_connect_timeout_secs), + ).await { + Ok(sess) => { + // Upload runner binary + let runner_path = if is_illumos_label(&item.spec.label) { + &exec_cfg.runner_illumos_path + } else { + &exec_cfg.runner_linux_path + }; + let local_runner = expand_tilde(runner_path); + info!(label = %item.spec.label, runner = %runner_path, "selected runner binary"); + let remote_path = PathBuf::from(&exec_cfg.remote_runner_path); + + if let Err(e) = vm_manager::ssh::upload(&sess, &local_runner, &remote_path) { + failure_summary = Some(format!("runner upload failed: {}", e)); + warn!(error = %e, request_id = %item.ctx.request_id, "failed to upload runner"); + } else { + // Execute runner + let cmd = format!( + "export SOLSTICE_REPO_URL=\"{}\" SOLSTICE_COMMIT_SHA=\"{}\" SOLSTICE_REQUEST_ID=\"{}\"; 
sh -lc '{}'", + item.ctx.repo_url, item.ctx.commit_sha, item.ctx.request_id, remote_path.display() + ); + + let persist_for_ssh = persist.clone(); + let req_id = item.ctx.request_id; + // Run the buffered exec on a blocking thread; its output lines are recorded to the DB afterwards + let result = tokio::task::spawn_blocking(move || { + vm_manager::ssh::exec(&sess, &cmd) + }).await; + + match result { + Ok(Ok((stdout, stderr, code))) => { + exit_code = code; + success = code == 0; + // Persist lines + let mut seq: i64 = 0; + let mut last_stderr: Option<String> = None; + for line in stdout.lines() { + let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r'])); + let _ = persist_for_ssh.record_log_line(req_id, seq, false, &line_ref).await; + seq += 1; + } + for line in stderr.lines() { + let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r'])); + if !line_ref.trim().is_empty() { + last_stderr = Some(line_ref.clone()); + } + let _ = persist_for_ssh.record_log_line(req_id, seq, true, &line_ref).await; + seq += 1; + } + if !success { + failure_summary = Some(match last_stderr { + Some(ref msg) => format!("job script failed: exit_code={} — {}", exit_code, msg), + None => format!("job script failed: exit_code={}", exit_code), + }); + } + } + Ok(Err(e)) => { + failure_summary = Some(format!("ssh exec error: {}", e)); + warn!(error = %e, request_id = %item.ctx.request_id, "ssh exec failed"); + } + Err(e) => { + failure_summary = Some(format!("ssh task panicked: {}", e)); + error!(error = %e, request_id = %item.ctx.request_id, "ssh task panicked"); + } + } + } + } + Err(e) => { + failure_summary = Some(format!("ssh connect failed: {}", e)); + warn!(error = %e, request_id = %item.ctx.request_id, "ssh connect failed"); + } + } + } + Err(_) => { + failure_summary = Some("timed out waiting for guest IP".to_string()); + warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP"); } } } - #[cfg(not(all(target_os = "linux", feature = "libvirt")))] - { - warn!(request_id = 
%item.ctx.request_id, label = %label_key, "SSH execution not supported on this platform/backend; skipping"); - } - // Stop VM after attempting execution if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await { error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM"); } let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await; - // Stop console tailer before we snapshot the file to avoid races - if let Some(t) = tailer_opt.take() { t.abort(); } + // Stop console tailer + let _ = console_stop_tx.send(true); + drop(console_line_tx); + let _ = console_recorder.await; - // If no logs were captured (e.g., SSH never connected), snapshot the final console log - let console_snapshot = h.work_dir.join("console.log"); + // If no logs were captured, snapshot the console log file as fallback if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await { - if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await { - warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log"); + if let Ok(lines) = vm_manager::console::read_console_log(&h.work_dir).await { + let mut seq: i64 = -1_000_000; + for line in lines { + let obj = serde_json::json!({ + "category": "boot", + "level": "info", + "msg": line.trim_end_matches(['\n', '\r']) + }); + let _ = persist.record_log_line(item.ctx.request_id, seq, false, &obj.to_string()).await; + seq += 1; + } } } @@ -825,7 +885,6 @@ async fn run_job_via_ssh_owned( Ok((ok, code, lines)) } -#[cfg(all(target_os = "linux", feature = "libvirt"))] fn is_illumos_label(label: &str) -> bool { let l = label.to_ascii_lowercase(); l.contains("illumos") @@ -834,7 +893,6 @@ fn is_illumos_label(label: &str) -> bool { || l.contains("oi-hipster") } -#[cfg(all(target_os = "linux", feature = "libvirt"))] fn expand_tilde(path: &str) -> PathBuf { if let Some(rest) = path.strip_prefix("~/") { if 
let Ok(home) = std::env::var("HOME") { diff --git a/crates/orchestrator/src/vm_adapter.rs b/crates/orchestrator/src/vm_adapter.rs index 191763a..981ac65 100644 --- a/crates/orchestrator/src/vm_adapter.rs +++ b/crates/orchestrator/src/vm_adapter.rs @@ -128,6 +128,13 @@ impl Hypervisor for VmManagerAdapter { Ok(()) } + async fn guest_ip(&self, vm: &VmHandle) -> Result<String> { + let inner_handle = to_vm_handle(vm); + vm_manager::Hypervisor::guest_ip(&self.inner, &inner_handle) + .await + .into_diagnostic() + } + async fn state(&self, vm: &VmHandle) -> Result<VmState> { let inner_handle = to_vm_handle(vm); let state = vm_manager::Hypervisor::state(&self.inner, &inner_handle)