Add VM state monitoring and graceful shutdown enhancements

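This commit enhances the `Scheduler` to detect job completion by monitoring VM state instead of only sleeping for a fixed placeholder runtime. After a VM starts, the scheduler polls the hypervisor every two seconds and stops waiting as soon as the VM reports `Stopped`, the placeholder runtime elapses, or a shutdown signal arrives, so shutdown remains graceful. To support this, the hypervisor backends gain a `state` method that reports a VM's current state, with the libvirt backend deriving it from whether the domain is active. The cloud-init user data now includes `runcmd` steps that check out the requested commit, run `.solstice/job.sh` if present, and power the guest off when the job finishes. Additionally, the libvirt domain configuration now includes a serial device wired to the console.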
This commit is contained in:
Till Wegmueller 2025-10-26 21:59:55 +01:00
parent bddd36b16f
commit 4ca78144f2
No known key found for this signature in database
3 changed files with 51 additions and 7 deletions

View file

@ -138,6 +138,19 @@ impl Hypervisor for RouterHypervisor {
_ => self.noop.destroy(vm).await,
}
}
/// Reports the state of `vm` by delegating to the backend recorded in `vm.backend`.
///
/// The libvirt and zones arms are only compiled on their respective
/// targets/features. When the selected backend is not configured on this
/// router, or the tag matches no compiled-in backend, the VM is reported as
/// `VmState::Prepared`.
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
    match vm.backend {
        #[cfg(all(target_os = "linux", feature = "libvirt"))]
        BackendTag::Libvirt => match self.libvirt.as_ref() {
            Some(backend) => backend.state(vm).await,
            // Backend tag present but no libvirt hypervisor configured.
            None => Ok(VmState::Prepared),
        },
        #[cfg(target_os = "illumos")]
        BackendTag::Zones => match self.zones.as_ref() {
            Some(backend) => backend.state(vm).await,
            // Backend tag present but no zones hypervisor configured.
            None => Ok(VmState::Prepared),
        },
        // Any other backend has no state to query here.
        _ => Ok(VmState::Prepared),
    }
}
}
/// No-op hypervisor for development on hosts without privileges.
@ -299,7 +312,7 @@ impl Hypervisor for LibvirtHypervisor {
let seed_str = seed_iso.as_ref().map(|p| p.display().to_string());
let net = self.network.clone();
let cdrom = seed_str.map(|p| format!("<disk type='file' device='cdrom'>\n <driver name='qemu' type='raw'/>\n <source file='{}'/>\n <target dev='hdb' bus='ide'/>\n <readonly/>\n</disk>", p)).unwrap_or_default();
format!("<domain type='kvm'>\n<name>{}</name>\n<memory unit='MiB'>{}</memory>\n<vcpu>{}</vcpu>\n<os>\n <type arch='x86_64' machine='pc'>hvm</type>\n <boot dev='hd'/>\n</os>\n<features><acpi/></features>\n<devices>\n <disk type='file' device='disk'>\n <driver name='qemu' type='qcow2' cache='none'/>\n <source file='{}'/>\n <target dev='vda' bus='virtio'/>\n </disk>\n {}\n <interface type='network'>\n <source network='{}'/>\n <model type='virtio'/>\n </interface>\n <graphics type='vnc' autoport='yes' listen='127.0.0.1'/>\n <console type='pty'/>\n</devices>\n<on_poweroff>destroy</on_poweroff>\n<on_crash>destroy</on_crash>\n</domain>",
format!("<domain type='kvm'>\n<name>{}</name>\n<memory unit='MiB'>{}</memory>\n<vcpu>{}</vcpu>\n<os>\n <type arch='x86_64' machine='pc'>hvm</type>\n <boot dev='hd'/>\n</os>\n<features><acpi/></features>\n<devices>\n <disk type='file' device='disk'>\n <driver name='qemu' type='qcow2' cache='none'/>\n <source file='{}'/>\n <target dev='vda' bus='virtio'/>\n </disk>\n {}\n <interface type='network'>\n <source network='{}'/>\n <model type='virtio'/>\n </interface>\n <graphics type='vnc' autoport='yes' listen='127.0.0.1'/>\n <serial type='pty'>\n <target port='0'/>\n </serial>\n <console type='pty'>\n <target type='serial' port='0'/>\n </console>\n</devices>\n<on_poweroff>destroy</on_poweroff>\n<on_crash>destroy</on_crash>\n</domain>",
id, mem, vcpus, overlay_str, cdrom, net)
};
@ -375,6 +388,19 @@ impl Hypervisor for LibvirtHypervisor {
info!(domain = %id, "libvirt destroyed");
Ok(())
}
/// Reports whether the libvirt domain backing `vm` is currently active.
///
/// Connects to `self.uri`, looks the domain up by `vm.id`, and maps an active
/// domain to `VmState::Running` and an inactive one to `VmState::Stopped`.
/// The libvirt calls are synchronous, so they run on the blocking thread pool.
///
/// # Errors
///
/// Returns an error if the connection cannot be opened, the domain cannot be
/// found, the active-state query fails, or the blocking task cannot be joined.
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
    let id = vm.id.clone();
    let uri = self.uri.clone();
    let active = tokio::task::spawn_blocking(move || -> miette::Result<bool> {
        use virt::{connect::Connect, domain::Domain};
        let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
        let dom = Domain::lookup_by_name(&conn, &id).map_err(|e| miette::miette!("lookup domain failed: {e}"))?;
        // Propagate a failed query instead of defaulting to "inactive": a
        // transient libvirt error must not be reported as `Stopped`, because
        // callers that treat `Stopped` as job completion would tear down a VM
        // that may still be running.
        dom.is_active().map_err(|e| miette::miette!("query domain active state failed: {e}"))
    }).await.into_diagnostic()??;
    Ok(if active { VmState::Running } else { VmState::Stopped })
}
}
#[cfg(target_os = "illumos")]

View file

@ -221,6 +221,11 @@ write_files:
content: |
repo_url: {repo}
commit_sha: {sha}
runcmd:
- [ sh, -c, "echo 'Solstice: preparing workspace for {sha}' | tee /dev/console" ]
- [ sh, -c, "mkdir -p /root/work && cd /root/work && if command -v git >/dev/null 2>&1; then git init && git remote add origin {repo} && git fetch --depth=1 origin {sha} && git checkout -q FETCH_HEAD || true; else echo 'git not installed'; fi" ]
- [ sh, -c, "if [ -f /root/work/.solstice/job.sh ]; then chmod +x /root/work/.solstice/job.sh && cd /root/work && /root/work/.solstice/job.sh || true; else echo 'No .solstice/job.sh found in repo'; fi" ]
- [ sh, -c, "echo 'Solstice: job complete, powering off' | tee /dev/console; (command -v poweroff >/dev/null 2>&1 && poweroff) || (command -v shutdown >/dev/null 2>&1 && shutdown -y -i5 -g0) || true" ]
"#, repo = repo_url, sha = commit_sha);
s.into_bytes()
}
@ -250,5 +255,7 @@ mod tests {
assert!(s.contains("commit_sha: deadbeef"));
assert!(s.contains("write_files:"));
assert!(s.contains("/etc/solstice/job.yaml"));
assert!(s.contains("runcmd:"));
assert!(s.contains("powering off"));
}
}

View file

@ -93,12 +93,23 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
}
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Running).await;
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Running).await;
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (workload execution placeholder)");
// Placeholder job runtime (configurable), but end early on shutdown
tokio::select! {
_ = tokio::time::sleep(placeholder_runtime) => {},
_ = shutdown.notified() => {
info!(request_id = %item.ctx.request_id, label = %label_key, "shutdown: ending placeholder early");
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (monitoring for completion)");
// Monitor VM state until it stops or until placeholder_runtime elapses; end early on shutdown
let start_time = std::time::Instant::now();
loop {
// Check current state first
if let Ok(crate::hypervisor::VmState::Stopped) = hv.state(&h).await {
info!(request_id = %item.ctx.request_id, label = %label_key, "vm reported stopped");
break;
}
if start_time.elapsed() >= placeholder_runtime { break; }
// Wait either for shutdown signal or a short delay before next poll
tokio::select! {
_ = shutdown.notified() => {
info!(request_id = %item.ctx.request_id, label = %label_key, "shutdown: ending early");
break;
}
_ = tokio::time::sleep(Duration::from_secs(2)) => {}
}
}
// Stop and destroy