Add boot wait configuration and improve VM startup logging, bump version to 0.1.5

- Introduce `boot_wait_secs` configuration to delay IP discovery/SSH after VM startup.
- Capture console logs when no SSH logs are available for better debugging during failures.
- Add a utility function to snapshot and persist console logs into job logs.
- Update CLI and environment variable support for the `boot_wait_secs` parameter.
- Bump orchestrator version to 0.1.5.

Signed-off-by: Till Wegmueller <toasterson@gmail.com>
This commit is contained in:
Till Wegmueller 2025-11-17 21:12:54 +01:00
parent 5d8e79c8d4
commit d5faf319ab
No known key found for this signature in database
3 changed files with 45 additions and 1 deletions

View file

@ -1,6 +1,6 @@
[package]
name = "orchestrator"
version = "0.1.4"
version = "0.1.5"
edition = "2024"
build = "build.rs"

View file

@ -104,6 +104,10 @@ struct Opts {
/// SSH connect timeout (seconds)
#[arg(long, env = "SSH_CONNECT_TIMEOUT_SECS", default_value_t = 300)]
ssh_connect_timeout_secs: u64,
/// Delay after VM start before attempting IP discovery/SSH (seconds)
#[arg(long, env = "BOOT_WAIT_SECS", default_value_t = 10)]
boot_wait_secs: u64,
}
#[tokio::main(flavor = "multi_thread")]
@ -155,6 +159,7 @@ async fn main() -> Result<()> {
runner_linux_path: opts.runner_linux_path.clone(),
runner_illumos_path: opts.runner_illumos_path.clone(),
ssh_connect_timeout_secs: opts.ssh_connect_timeout_secs,
boot_wait_secs: opts.boot_wait_secs,
};
let sched = Scheduler::new(

View file

@ -39,6 +39,7 @@ pub struct ExecConfig {
pub runner_linux_path: String,
pub runner_illumos_path: String,
pub ssh_connect_timeout_secs: u64,
pub boot_wait_secs: u64,
}
impl<H: Hypervisor + 'static> Scheduler<H> {
@ -141,6 +142,13 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Running).await;
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started; establishing SSH session to run workflow");
// Optional delay to allow DHCP/guest services to assign an IP before discovery
let boot_wait = exec_shared_inner.boot_wait_secs;
if boot_wait > 0 {
info!(request_id = %item.ctx.request_id, seconds = boot_wait, "waiting before attempting IP discovery/SSH");
tokio::time::sleep(Duration::from_secs(boot_wait)).await;
}
// Start serial console tailer to capture early boot logs into job log
let console_path = h.work_dir.join("console.log");
let persist_for_tailer = persist.clone();
@ -222,6 +230,14 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
// Stop console tailer
if let Some(t) = tailer_opt.take() { t.abort(); }
// If no logs were captured (e.g., SSH never connected), snapshot the final console log
let console_snapshot = h.work_dir.join("console.log");
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await {
warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log");
}
}
// Persist final state and publish result
let final_state = if success { JobState::Succeeded } else { JobState::Failed };
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await;
@ -298,6 +314,27 @@ async fn tail_console_to_joblog(persist: Arc<Persist>, request_id: Uuid, console
}
}
/// Snapshot the entire console log file once and persist its lines into the job log.
///
/// Every line of the file at `console_path` is recorded for `request_id` with a `[boot]`
/// prefix (an empty line becomes just `[boot]`). Sequence numbers start at `-1_000_000` and
/// increase by one per line; the negative range is meant to keep these lines ordered before
/// runner logs.
///
/// The file is read as raw bytes and decoded lossily: serial console output is not guaranteed
/// to be valid UTF-8, and a single stray byte must not discard the whole snapshot. Invalid
/// sequences are replaced with U+FFFD.
///
/// # Errors
///
/// Returns the underlying I/O error (as a `miette` diagnostic) when the file is missing or
/// unreadable, so the caller can log it. Failures to persist an individual line are ignored
/// on purpose: the snapshot is best-effort debugging output.
async fn snapshot_console_to_joblog(persist: Arc<Persist>, request_id: Uuid, console_path: PathBuf) -> miette::Result<()> {
    use miette::IntoDiagnostic as _;

    // Read bytes rather than a `String` so that non-UTF-8 console output does not fail the read.
    let bytes = tokio::fs::read(&console_path).await.into_diagnostic()?;
    let content = String::from_utf8_lossy(&bytes);

    let mut seq: i64 = -1_000_000; // keep consistent ordering before runner logs
    for raw in content.lines() {
        // `lines()` already removes one line terminator; this also drops any leftover `\r`.
        let trimmed = raw.trim_end_matches(['\n', '\r']);
        let line = if trimmed.is_empty() {
            String::from("[boot]")
        } else {
            format!("[boot] {}", trimmed)
        };
        // Best effort: a failed write of one line must not abort the rest of the snapshot.
        let _ = persist.record_log_line(request_id, seq, false, &line).await;
        seq += 1;
    }
    Ok(())
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option<String> {
use tokio::{task, time::{sleep, Instant, Duration}};
@ -694,6 +731,7 @@ mod tests {
runner_linux_path: "/tmp/runner-linux".into(),
runner_illumos_path: "/tmp/runner-illumos".into(),
ssh_connect_timeout_secs: 30,
boot_wait_secs: 0,
};
let sched = Scheduler::new(hv, 2, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec);
let tx = sched.sender();
@ -738,6 +776,7 @@ mod tests {
runner_linux_path: "/tmp/runner-linux".into(),
runner_illumos_path: "/tmp/runner-illumos".into(),
ssh_connect_timeout_secs: 30,
boot_wait_secs: 0,
};
let sched = Scheduler::new(hv, 4, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec);
let tx = sched.sender();