mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 21:30:41 +00:00
Add boot wait configuration and improve VM startup logging, bump version to 0.1.5
- Introduce `boot_wait_secs` configuration to delay IP discovery/SSH after VM startup.
- Capture console logs when no SSH logs are available for better debugging during failures.
- Add a utility function to snapshot and persist console logs into job logs.
- Update CLI and environment variable support for the `boot_wait_secs` parameter.
- Bump orchestrator version to 0.1.5.

Signed-off-by: Till Wegmueller <toasterson@gmail.com>
This commit is contained in:
parent
5d8e79c8d4
commit
d5faf319ab
3 changed files with 45 additions and 1 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "orchestrator"
|
name = "orchestrator"
|
||||||
version = "0.1.4"
|
version = "0.1.5"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
build = "build.rs"
|
build = "build.rs"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,10 @@ struct Opts {
|
||||||
/// SSH connect timeout (seconds)
|
/// SSH connect timeout (seconds)
|
||||||
#[arg(long, env = "SSH_CONNECT_TIMEOUT_SECS", default_value_t = 300)]
|
#[arg(long, env = "SSH_CONNECT_TIMEOUT_SECS", default_value_t = 300)]
|
||||||
ssh_connect_timeout_secs: u64,
|
ssh_connect_timeout_secs: u64,
|
||||||
|
|
||||||
|
/// Delay after VM start before attempting IP discovery/SSH (seconds)
|
||||||
|
#[arg(long, env = "BOOT_WAIT_SECS", default_value_t = 10)]
|
||||||
|
boot_wait_secs: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main(flavor = "multi_thread")]
|
#[tokio::main(flavor = "multi_thread")]
|
||||||
|
|
@ -155,6 +159,7 @@ async fn main() -> Result<()> {
|
||||||
runner_linux_path: opts.runner_linux_path.clone(),
|
runner_linux_path: opts.runner_linux_path.clone(),
|
||||||
runner_illumos_path: opts.runner_illumos_path.clone(),
|
runner_illumos_path: opts.runner_illumos_path.clone(),
|
||||||
ssh_connect_timeout_secs: opts.ssh_connect_timeout_secs,
|
ssh_connect_timeout_secs: opts.ssh_connect_timeout_secs,
|
||||||
|
boot_wait_secs: opts.boot_wait_secs,
|
||||||
};
|
};
|
||||||
|
|
||||||
let sched = Scheduler::new(
|
let sched = Scheduler::new(
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,7 @@ pub struct ExecConfig {
|
||||||
pub runner_linux_path: String,
|
pub runner_linux_path: String,
|
||||||
pub runner_illumos_path: String,
|
pub runner_illumos_path: String,
|
||||||
pub ssh_connect_timeout_secs: u64,
|
pub ssh_connect_timeout_secs: u64,
|
||||||
|
pub boot_wait_secs: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<H: Hypervisor + 'static> Scheduler<H> {
|
impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
|
|
@ -141,6 +142,13 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Running).await;
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Running).await;
|
||||||
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started; establishing SSH session to run workflow");
|
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started; establishing SSH session to run workflow");
|
||||||
|
|
||||||
|
// Optional delay to allow DHCP/guest services to assign an IP before discovery
|
||||||
|
let boot_wait = exec_shared_inner.boot_wait_secs;
|
||||||
|
if boot_wait > 0 {
|
||||||
|
info!(request_id = %item.ctx.request_id, seconds = boot_wait, "waiting before attempting IP discovery/SSH");
|
||||||
|
tokio::time::sleep(Duration::from_secs(boot_wait)).await;
|
||||||
|
}
|
||||||
|
|
||||||
// Start serial console tailer to capture early boot logs into job log
|
// Start serial console tailer to capture early boot logs into job log
|
||||||
let console_path = h.work_dir.join("console.log");
|
let console_path = h.work_dir.join("console.log");
|
||||||
let persist_for_tailer = persist.clone();
|
let persist_for_tailer = persist.clone();
|
||||||
|
|
@ -222,6 +230,14 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
// Stop console tailer
|
// Stop console tailer
|
||||||
if let Some(t) = tailer_opt.take() { t.abort(); }
|
if let Some(t) = tailer_opt.take() { t.abort(); }
|
||||||
|
|
||||||
|
// If no logs were captured (e.g., SSH never connected), snapshot the final console log
|
||||||
|
let console_snapshot = h.work_dir.join("console.log");
|
||||||
|
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
|
||||||
|
if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await {
|
||||||
|
warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Persist final state and publish result
|
// Persist final state and publish result
|
||||||
let final_state = if success { JobState::Succeeded } else { JobState::Failed };
|
let final_state = if success { JobState::Succeeded } else { JobState::Failed };
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await;
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await;
|
||||||
|
|
@ -298,6 +314,27 @@ async fn tail_console_to_joblog(persist: Arc<Persist>, request_id: Uuid, console
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Snapshot the entire console log file once and persist its lines with negative seq numbers.
|
||||||
|
async fn snapshot_console_to_joblog(persist: Arc<Persist>, request_id: Uuid, console_path: PathBuf) -> miette::Result<()> {
|
||||||
|
use miette::IntoDiagnostic as _;
|
||||||
|
match tokio::fs::read_to_string(&console_path).await {
|
||||||
|
Ok(content) => {
|
||||||
|
let mut seq: i64 = -1_000_000; // keep consistent ordering before runner logs
|
||||||
|
for raw in content.lines() {
|
||||||
|
let trimmed = raw.trim_end_matches(['\n', '\r']);
|
||||||
|
let line = if trimmed.is_empty() { String::from("[boot]") } else { format!("[boot] {}", trimmed) };
|
||||||
|
let _ = persist.record_log_line(request_id, seq, false, &line).await;
|
||||||
|
seq += 1;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// If file missing or unreadable, just return error up to caller for logging
|
||||||
|
Err(e).into_diagnostic()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
||||||
async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option<String> {
|
async fn discover_guest_ip_virsh(domain: &str, timeout: Duration) -> Option<String> {
|
||||||
use tokio::{task, time::{sleep, Instant, Duration}};
|
use tokio::{task, time::{sleep, Instant, Duration}};
|
||||||
|
|
@ -694,6 +731,7 @@ mod tests {
|
||||||
runner_linux_path: "/tmp/runner-linux".into(),
|
runner_linux_path: "/tmp/runner-linux".into(),
|
||||||
runner_illumos_path: "/tmp/runner-illumos".into(),
|
runner_illumos_path: "/tmp/runner-illumos".into(),
|
||||||
ssh_connect_timeout_secs: 30,
|
ssh_connect_timeout_secs: 30,
|
||||||
|
boot_wait_secs: 0,
|
||||||
};
|
};
|
||||||
let sched = Scheduler::new(hv, 2, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec);
|
let sched = Scheduler::new(hv, 2, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec);
|
||||||
let tx = sched.sender();
|
let tx = sched.sender();
|
||||||
|
|
@ -738,6 +776,7 @@ mod tests {
|
||||||
runner_linux_path: "/tmp/runner-linux".into(),
|
runner_linux_path: "/tmp/runner-linux".into(),
|
||||||
runner_illumos_path: "/tmp/runner-illumos".into(),
|
runner_illumos_path: "/tmp/runner-illumos".into(),
|
||||||
ssh_connect_timeout_secs: 30,
|
ssh_connect_timeout_secs: 30,
|
||||||
|
boot_wait_secs: 0,
|
||||||
};
|
};
|
||||||
let sched = Scheduler::new(hv, 4, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec);
|
let sched = Scheduler::new(hv, 4, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()), exec);
|
||||||
let tx = sched.sender();
|
let tx = sched.sender();
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue