From 4ca78144f2814721030ac12bfe77f2d7aeccf256491bfd1db90680829c503de7 Mon Sep 17 00:00:00 2001 From: Till Wegmueller Date: Sun, 26 Oct 2025 21:59:55 +0100 Subject: [PATCH] Add VM state monitoring and graceful shutdown enhancements This commit enhances the `Scheduler` to monitor VM states for completion, enabling more accurate termination detection. It introduces periodic polling combined with shutdown signals to halt operations gracefully. Additionally, VM lifecycle management in the hypervisor is updated with `state` retrieval for precise status assessments. The VM domain configuration now includes serial console support. --- crates/orchestrator/src/hypervisor.rs | 28 ++++++++++++++++++++++++++- crates/orchestrator/src/main.rs | 7 +++++++ crates/orchestrator/src/scheduler.rs | 23 ++++++++++++++++------ 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/crates/orchestrator/src/hypervisor.rs b/crates/orchestrator/src/hypervisor.rs index 426900c..cc6149b 100644 --- a/crates/orchestrator/src/hypervisor.rs +++ b/crates/orchestrator/src/hypervisor.rs @@ -138,6 +138,19 @@ impl Hypervisor for RouterHypervisor { _ => self.noop.destroy(vm).await, } } + async fn state(&self, vm: &VmHandle) -> Result<VmState> { // Report the VM's state by delegating to the backend recorded in vm.backend. + match vm.backend { + #[cfg(all(target_os = "linux", feature = "libvirt"))] + BackendTag::Libvirt => { + if let Some(ref hv) = self.libvirt { hv.state(vm).await } else { Ok(VmState::Prepared) } // self.libvirt is None: report Prepared + } + #[cfg(target_os = "illumos")] + BackendTag::Zones => { + if let Some(ref hv) = self.zones { hv.state(vm).await } else { Ok(VmState::Prepared) } // self.zones is None: report Prepared + } + _ => Ok(VmState::Prepared), // every other backend tag is reported as Prepared + } + } } /// No-op hypervisor for development on hosts without privileges. 
@@ -299,7 +312,7 @@ impl Hypervisor for LibvirtHypervisor { let seed_str = seed_iso.as_ref().map(|p| p.display().to_string()); let net = self.network.clone(); let cdrom = seed_str.map(|p| format!("\n \n \n \n \n", p)).unwrap_or_default(); - format!("\n{}\n{}\n{}\n\n hvm\n \n\n\n\n \n \n \n \n \n {}\n \n \n \n \n \n \n\ndestroy\ndestroy\n", + format!("\n{}\n{}\n{}\n\n hvm\n \n\n\n\n \n \n \n \n \n {}\n \n \n \n \n \n \n \n \n \n \n \n\ndestroy\ndestroy\n", id, mem, vcpus, overlay_str, cdrom, net) }; @@ -375,6 +388,19 @@ impl Hypervisor for LibvirtHypervisor { info!(domain = %id, "libvirt destroyed"); Ok(()) } + + async fn state(&self, vm: &VmHandle) -> Result<VmState> { // Ask libvirt, on a blocking thread, whether the domain named vm.id is active. + let id = vm.id.clone(); + let uri = self.uri.clone(); + let active = tokio::task::spawn_blocking(move || -> miette::Result<bool> { + use virt::{connect::Connect, domain::Domain}; + let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?; + let dom = Domain::lookup_by_name(&conn, &id).map_err(|e| miette::miette!("lookup domain failed: {e}"))?; + let active = dom.is_active().map_err(|e| miette::miette!("query domain state failed: {e}"))?; // propagate query failures instead of reporting them as "stopped" + Ok(active) + }).await.into_diagnostic()??; + Ok(if active { VmState::Running } else { VmState::Stopped }) // only active vs. inactive is queried, so an inactive domain is reported as Stopped + } } #[cfg(target_os = "illumos")] diff --git a/crates/orchestrator/src/main.rs b/crates/orchestrator/src/main.rs index 2540a0c..1f0eba5 100644 --- a/crates/orchestrator/src/main.rs +++ b/crates/orchestrator/src/main.rs @@ -221,6 +221,11 @@ write_files: content: | repo_url: {repo} commit_sha: {sha} +runcmd: + - [ sh, -c, "echo 'Solstice: preparing workspace for {sha}' | tee /dev/console" ] + - [ sh, -c, "mkdir -p /root/work && cd /root/work && if command -v git >/dev/null 2>&1; then git init && git remote add origin {repo} && git fetch --depth=1 origin {sha} && git checkout -q FETCH_HEAD || true; else echo 'git not installed'; fi" ] + - [ sh, -c, "if [ -f /root/work/.solstice/job.sh ]; then chmod +x /root/work/.solstice/job.sh && cd /root/work && /root/work/.solstice/job.sh || 
true; else echo 'No .solstice/job.sh found in repo'; fi" ] + - [ sh, -c, "echo 'Solstice: job complete, powering off' | tee /dev/console; (command -v poweroff >/dev/null 2>&1 && poweroff) || (command -v shutdown >/dev/null 2>&1 && shutdown -y -i5 -g0) || true" ] "#, repo = repo_url, sha = commit_sha); s.into_bytes() } @@ -250,5 +255,7 @@ mod tests { assert!(s.contains("commit_sha: deadbeef")); assert!(s.contains("write_files:")); assert!(s.contains("/etc/solstice/job.yaml")); + assert!(s.contains("runcmd:")); + assert!(s.contains("powering off")); } } diff --git a/crates/orchestrator/src/scheduler.rs b/crates/orchestrator/src/scheduler.rs index ae4e0b2..7a14b2b 100644 --- a/crates/orchestrator/src/scheduler.rs +++ b/crates/orchestrator/src/scheduler.rs @@ -93,12 +93,23 @@ impl Scheduler { } let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Running).await; let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Running).await; - info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (workload execution placeholder)"); - // Placeholder job runtime (configurable), but end early on shutdown - tokio::select! 
{ - _ = tokio::time::sleep(placeholder_runtime) => {}, - _ = shutdown.notified() => { - info!(request_id = %item.ctx.request_id, label = %label_key, "shutdown: ending placeholder early"); + info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (monitoring for completion)"); + // Monitor VM state until it stops or until placeholder_runtime elapses; end early on shutdown + let start_time = std::time::Instant::now(); + loop { + // Check current state first; a failed state query is not treated as "stopped", we simply poll again + if let Ok(crate::hypervisor::VmState::Stopped) = hv.state(&h).await { + info!(request_id = %item.ctx.request_id, label = %label_key, "vm reported stopped"); + break; + } + if start_time.elapsed() >= placeholder_runtime { break; } + // Wait for the shutdown signal or the next poll; the delay is capped at the time remaining so the loop never sleeps past placeholder_runtime + tokio::select! { + _ = shutdown.notified() => { + info!(request_id = %item.ctx.request_id, label = %label_key, "shutdown: ending early"); + break; + } + _ = tokio::time::sleep(Duration::from_secs(2).min(placeholder_runtime.saturating_sub(start_time.elapsed()))) => {} } } // Stop and destroy