From d5faf319ab191b378f74b717058e0ced7c707737be42b8471c6ab086de3a7ed0 Mon Sep 17 00:00:00 2001 From: Till Wegmueller Date: Mon, 17 Nov 2025 21:12:54 +0100 Subject: [PATCH] Add boot wait configuration and improve VM startup logging, bump version to 0.1.5 - Introduce `boot_wait_secs` configuration to delay IP discovery/SSH after VM startup. - Capture console logs when no SSH logs are available for better debugging during failures. - Add a utility function to snapshot and persist console logs into job logs. - Update CLI and environment variable support for the `boot_wait_secs` parameter. - Bump orchestrator version to 0.1.5. Signed-off-by: Till Wegmueller --- crates/orchestrator/Cargo.toml | 2 +- crates/orchestrator/src/main.rs | 5 ++++ crates/orchestrator/src/scheduler.rs | 39 ++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/crates/orchestrator/Cargo.toml b/crates/orchestrator/Cargo.toml index 00afb39..d68603d 100644 --- a/crates/orchestrator/Cargo.toml +++ b/crates/orchestrator/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "orchestrator" -version = "0.1.4" +version = "0.1.5" edition = "2024" build = "build.rs" diff --git a/crates/orchestrator/src/main.rs b/crates/orchestrator/src/main.rs index ffee08b..f9ad46d 100644 --- a/crates/orchestrator/src/main.rs +++ b/crates/orchestrator/src/main.rs @@ -104,6 +104,10 @@ struct Opts { /// SSH connect timeout (seconds) #[arg(long, env = "SSH_CONNECT_TIMEOUT_SECS", default_value_t = 300)] ssh_connect_timeout_secs: u64, + + /// Delay after VM start before attempting IP discovery/SSH (seconds) + #[arg(long, env = "BOOT_WAIT_SECS", default_value_t = 10)] + boot_wait_secs: u64, } #[tokio::main(flavor = "multi_thread")] @@ -155,6 +159,7 @@ async fn main() -> Result<()> { runner_linux_path: opts.runner_linux_path.clone(), runner_illumos_path: opts.runner_illumos_path.clone(), ssh_connect_timeout_secs: opts.ssh_connect_timeout_secs, + boot_wait_secs: opts.boot_wait_secs, }; let sched = Scheduler::new( diff --git a/crates/orchestrator/src/scheduler.rs b/crates/orchestrator/src/scheduler.rs index 4c4716b..2d69024 100644 --- a/crates/orchestrator/src/scheduler.rs +++ b/crates/orchestrator/src/scheduler.rs @@ -39,6 +39,7 @@ pub struct ExecConfig { pub runner_linux_path: String, pub runner_illumos_path: String, pub ssh_connect_timeout_secs: u64, + pub boot_wait_secs: u64, } impl Scheduler { @@ -141,6 +142,13 @@ impl Scheduler { let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Running).await; info!(request_id = %item.ctx.request_id, label = %label_key, "vm started; establishing SSH session to run workflow"); + // Optional delay to allow DHCP/guest services to assign an IP before discovery + let boot_wait = exec_shared_inner.boot_wait_secs; + if boot_wait > 0 { + info!(request_id = %item.ctx.request_id, seconds = boot_wait, "waiting before attempting IP discovery/SSH"); + tokio::time::sleep(Duration::from_secs(boot_wait)).await; + } + // Start serial console tailer to capture early boot logs into job log let console_path = h.work_dir.join("console.log"); let persist_for_tailer = persist.clone(); @@ -222,6 +230,14 @@ impl Scheduler { // Stop console tailer if let Some(t) = tailer_opt.take() { t.abort(); } + // If no logs were captured (e.g., SSH never connected), snapshot the final console log + let console_snapshot = h.work_dir.join("console.log"); + if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await { + if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await { + warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log"); + } + } + // Persist final state and publish result let final_state = if success { JobState::Succeeded } else { JobState::Failed }; let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await; @@ -298,6 +314,27 @@ async fn tail_console_to_joblog(persist: Arc, request_id: Uuid, console } } +// Snapshot the entire console log file once and persist its lines with negative seq numbers. +async fn snapshot_console_to_joblog(persist: Arc, request_id: Uuid, console_path: PathBuf) -> miette::Result<()> { + use miette::IntoDiagnostic as _; + match tokio::fs::read_to_string(&console_path).await { + Ok(content) => { + let mut seq: i64 = -1_000_000; // keep consistent ordering before runner logs + for raw in content.lines() { + let trimmed = raw.trim_end_matches(['\n', '\r']); + let line = if trimmed.is_empty() { String::from("[boot]") } else { format!("[boot] {}", trimmed) }; + let _ = persist.record_log_line(request_id, seq, false, &line).await; + seq += 1; + } + Ok(()) + } + Err(e) => { + // If file missing or unreadable, just return error up to caller for logging + Err(e).into_diagnostic() + } + } +} + #[cfg(all(target_os = "linux", feature = "libvirt"))] async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option { use tokio::{task, time::{sleep, Instant, Duration}}; @@ -694,6 +731,7 @@ mod tests { runner_linux_path: "/tmp/runner-linux".into(), runner_illumos_path: "/tmp/runner-illumos".into(), ssh_connect_timeout_secs: 30, + boot_wait_secs: 0, }; let sched = Scheduler::new(hv, 2, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec); let tx = sched.sender(); @@ -738,6 +776,7 @@ mod tests { runner_linux_path: "/tmp/runner-linux".into(), runner_illumos_path: "/tmp/runner-illumos".into(), ssh_connect_timeout_secs: 30, + boot_wait_secs: 0, }; let sched = Scheduler::new(hv, 4, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec); let tx = sched.sender();