// solstice-ci/crates/orchestrator/src/hypervisor.rs
//
// Commit 4ca78144f2 (Till Wegmueller, 2025-10-26 21:59:55 +01:00):
// "Add VM state monitoring and graceful shutdown enhancements"
// Enhances the `Scheduler` to monitor VM states for completion, enabling more
// accurate termination detection. Introduces periodic polling combined with
// shutdown signals to halt operations gracefully. VM lifecycle management in
// the hypervisor gains `state` retrieval for precise status assessments, and
// the VM domain configuration now includes serial console support.
use std::{path::PathBuf, time::Duration};
use async_trait::async_trait;
use miette::{Result, IntoDiagnostic as _};
use tracing::{info, warn};
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
// Backend tag is used internally to remember which backend handled this VM,
// so lifecycle calls (start/stop/destroy/state) can be routed back to it.
#[derive(Debug, Clone, Copy)]
pub enum BackendTag {
    /// Development stub backend; performs no real virtualization.
    Noop,
    /// Linux KVM backend driven through libvirt.
    #[cfg(all(target_os = "linux", feature = "libvirt"))]
    Libvirt,
    /// illumos zones/bhyve backend.
    #[cfg(target_os = "illumos")]
    Zones,
}
/// Resource and image description for a VM to be provisioned.
#[derive(Debug, Clone)]
pub struct VmSpec {
/// Human-readable label used in logs.
pub label: String,
/// Path to the base disk image the VM boots from.
pub image_path: PathBuf,
/// Number of virtual CPUs.
pub cpu: u16,
/// RAM size in MiB.
pub ram_mb: u32,
/// Disk (overlay) size in GiB.
pub disk_gb: u32,
/// Optional network name — NOTE(review): backends visible here use their own
/// configured network instead of this field; confirm intended consumer.
pub network: Option<String>,
/// Whether NoCloud seeding is requested — NOTE(review): seed creation in the
/// libvirt backend keys off `user_data`, not this flag; confirm.
pub nocloud: bool,
/// Optional user-data (cloud-init NoCloud). If provided, backend will attach seed.
pub user_data: Option<Vec<u8>>,
}
/// Identifying information for the CI job a VM is being provisioned for.
#[derive(Debug, Clone)]
pub struct JobContext {
/// Unique id of the provisioning request; embedded in VM/domain names.
pub request_id: uuid::Uuid,
/// Repository the job checks out.
pub repo_url: String,
/// Commit under test.
pub commit_sha: String,
/// Upstream workflow job id, when known.
pub workflow_job_id: Option<String>,
}
/// Handle to a prepared/running VM, returned by [`Hypervisor::prepare`] and
/// passed back into the other lifecycle methods.
#[derive(Debug, Clone)]
pub struct VmHandle {
/// Backend-specific VM identifier (doubles as the libvirt domain name).
pub id: String,
/// Which backend created this VM; used by the router to dispatch calls.
pub backend: BackendTag,
/// Per-VM scratch directory holding overlay/seed artifacts.
pub work_dir: PathBuf,
/// Path to the qcow2 overlay (libvirt) or raw disk (zones), if any.
pub overlay_path: Option<PathBuf>,
/// Path to the NoCloud seed ISO, if one was generated.
pub seed_iso_path: Option<PathBuf>,
}
/// Coarse VM lifecycle state as reported by [`Hypervisor::state`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VmState {
/// Defined/allocated but not known to be running (also the default for
/// backends that do not track runtime state).
Prepared,
/// The domain is currently active.
Running,
/// The domain is not active (shut down or powered off).
Stopped,
}
/// Common VM lifecycle interface implemented by each virtualization backend.
#[async_trait]
pub trait Hypervisor: Send + Sync {
/// Allocate everything the VM needs (work dir, disk overlay, seed media,
/// domain definition) without starting it.
async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle>;
/// Boot a previously prepared VM.
async fn start(&self, vm: &VmHandle) -> Result<()>;
/// Request a graceful shutdown, escalating to a forced stop after
/// `graceful_timeout`.
async fn stop(&self, vm: &VmHandle, graceful_timeout: Duration) -> Result<()>;
/// Tear down the VM and release its resources; consumes the handle.
async fn destroy(&self, vm: VmHandle) -> Result<()>;
/// Report the current lifecycle state. Defaults to `Prepared` for
/// backends that do not track runtime state.
async fn state(&self, _vm: &VmHandle) -> Result<VmState> { Ok(VmState::Prepared) }
}
/// A router that delegates to the correct backend implementation per job.
pub struct RouterHypervisor {
/// Always-available fallback backend.
pub noop: NoopHypervisor,
/// Linux KVM backend via libvirt, when compiled in and configured.
#[cfg(all(target_os = "linux", feature = "libvirt"))]
pub libvirt: Option<LibvirtHypervisor>,
/// illumos zones backend, when compiled in and configured.
#[cfg(target_os = "illumos")]
pub zones: Option<ZonesHypervisor>,
}
impl RouterHypervisor {
/// Construct a router with whichever backend this build supports.
///
/// The three cfg blocks are mutually exclusive, so exactly one `return`
/// survives compilation for any target/feature combination. The libvirt
/// parameters are unused on non-libvirt builds, hence the allow.
#[allow(unused_variables)]
pub fn build(libvirt_uri: String, libvirt_network: String) -> Self {
#[cfg(all(target_os = "linux", feature = "libvirt"))]
{
return RouterHypervisor {
noop: NoopHypervisor::default(),
libvirt: Some(LibvirtHypervisor { uri: libvirt_uri, network: libvirt_network }),
};
}
#[cfg(target_os = "illumos")]
{
return RouterHypervisor { noop: NoopHypervisor::default(), zones: Some(ZonesHypervisor) };
}
#[cfg(all(not(target_os = "illumos"), not(all(target_os = "linux", feature = "libvirt"))))]
{
return RouterHypervisor { noop: NoopHypervisor::default() };
}
}
}
#[async_trait]
impl Hypervisor for RouterHypervisor {
/// Prepare with the preferred compiled-in backend; fall back to the no-op
/// backend when no real backend is configured.
async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
#[cfg(all(target_os = "linux", feature = "libvirt"))]
{
if let Some(ref hv) = self.libvirt { return hv.prepare(spec, ctx).await; }
}
#[cfg(target_os = "illumos")]
{
if let Some(ref hv) = self.zones { return hv.prepare(spec, ctx).await; }
}
self.noop.prepare(spec, ctx).await
}
/// Route `start` to the backend recorded in the handle; noop fallback when
/// that backend is not configured in this process.
async fn start(&self, vm: &VmHandle) -> Result<()> {
match vm.backend {
#[cfg(all(target_os = "linux", feature = "libvirt"))]
BackendTag::Libvirt => {
if let Some(ref hv) = self.libvirt { hv.start(vm).await } else { self.noop.start(vm).await }
}
#[cfg(target_os = "illumos")]
BackendTag::Zones => {
if let Some(ref hv) = self.zones { hv.start(vm).await } else { self.noop.start(vm).await }
}
// Catch-all also covers tags whose backend was compiled out.
_ => self.noop.start(vm).await,
}
}
/// Route `stop` like `start`, passing through the graceful timeout.
async fn stop(&self, vm: &VmHandle, t: Duration) -> Result<()> {
match vm.backend {
#[cfg(all(target_os = "linux", feature = "libvirt"))]
BackendTag::Libvirt => {
if let Some(ref hv) = self.libvirt { hv.stop(vm, t).await } else { self.noop.stop(vm, t).await }
}
#[cfg(target_os = "illumos")]
BackendTag::Zones => {
if let Some(ref hv) = self.zones { hv.stop(vm, t).await } else { self.noop.stop(vm, t).await }
}
_ => self.noop.stop(vm, t).await,
}
}
/// Route `destroy` like `start`; the handle is consumed either way.
async fn destroy(&self, vm: VmHandle) -> Result<()> {
match vm.backend {
#[cfg(all(target_os = "linux", feature = "libvirt"))]
BackendTag::Libvirt => {
if let Some(ref hv) = self.libvirt { hv.destroy(vm).await } else { self.noop.destroy(vm).await }
}
#[cfg(target_os = "illumos")]
BackendTag::Zones => {
if let Some(ref hv) = self.zones { hv.destroy(vm).await } else { self.noop.destroy(vm).await }
}
_ => self.noop.destroy(vm).await,
}
}
/// Route `state`; backends that are not configured report `Prepared`,
/// matching the trait's default.
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
match vm.backend {
#[cfg(all(target_os = "linux", feature = "libvirt"))]
BackendTag::Libvirt => {
if let Some(ref hv) = self.libvirt { hv.state(vm).await } else { Ok(VmState::Prepared) }
}
#[cfg(target_os = "illumos")]
BackendTag::Zones => {
if let Some(ref hv) = self.zones { hv.state(vm).await } else { Ok(VmState::Prepared) }
}
_ => Ok(VmState::Prepared),
}
}
}
/// No-op hypervisor for development on hosts without privileges.
///
/// Logs each lifecycle call and succeeds without touching any virtualization
/// API; `prepare` still creates a scratch directory so callers can write
/// artifacts.
#[derive(Debug, Clone, Default)]
pub struct NoopHypervisor;
#[async_trait]
impl Hypervisor for NoopHypervisor {
    /// "Prepare" a VM by allocating a scratch directory under the system temp
    /// dir; no virtualization resources are touched.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        let id = format!("noop-{}", ctx.request_id);
        let work_dir = std::env::temp_dir().join("solstice-noop").join(&id);
        tokio::fs::create_dir_all(&work_dir).await.into_diagnostic()?;
        info!(id = %id, label = %spec.label, image = ?spec.image_path, "noop prepare");
        Ok(VmHandle { id, backend: BackendTag::Noop, work_dir, overlay_path: None, seed_iso_path: None })
    }
    /// Log and succeed.
    async fn start(&self, vm: &VmHandle) -> Result<()> {
        info!(id = %vm.id, "noop start");
        Ok(())
    }
    /// Log and succeed; the timeout is irrelevant for a no-op VM.
    async fn stop(&self, vm: &VmHandle, _t: Duration) -> Result<()> {
        info!(id = %vm.id, "noop stop");
        Ok(())
    }
    /// Log and remove the scratch directory created by `prepare`.
    async fn destroy(&self, vm: VmHandle) -> Result<()> {
        // Best-effort cleanup so repeated dev runs don't accumulate temp
        // dirs (the libvirt backend cleans its work_dir the same way).
        let _ = tokio::fs::remove_dir_all(&vm.work_dir).await;
        info!(id = %vm.id, "noop destroy");
        Ok(())
    }
}
/// Linux KVM backend driven through libvirt.
#[cfg(all(target_os = "linux", feature = "libvirt"))]
pub struct LibvirtHypervisor {
/// libvirt connection URI (e.g. `qemu:///system`).
pub uri: String,
/// Name of the libvirt network to attach VMs to.
pub network: String,
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
impl LibvirtHypervisor {
    /// Pick and create a per-VM working directory.
    ///
    /// Prefers `/var/lib/solstice-ci/<id>` and verifies writability by
    /// actually creating the directory there (a bare existence check does
    /// not prove we may write); falls back to the temp dir otherwise. The
    /// directory is chmod'd 0700 because it may later hold user-data secrets.
    fn mk_work_dir(&self, id: &str) -> std::path::PathBuf {
        let base = std::path::Path::new("/var/lib/solstice-ci");
        let preferred = base.join(id);
        let dir = if base.is_dir() && std::fs::create_dir_all(&preferred).is_ok() {
            preferred
        } else {
            let fallback = std::env::temp_dir().join("solstice-libvirt").join(id);
            let _ = std::fs::create_dir_all(&fallback);
            fallback
        };
        // Best-effort permission tightening; failure is non-fatal.
        let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700));
        dir
    }
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
#[async_trait]
impl Hypervisor for LibvirtHypervisor {
    /// Prepare a libvirt/KVM domain: activate the configured network
    /// (best-effort), create a qcow2 overlay on top of the base image,
    /// optionally build a NoCloud seed ISO, and define the domain XML.
    /// The domain is defined but not started.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        use std::process::Command;
        let id = format!("job-{}", ctx.request_id);
        let work_dir = self.mk_work_dir(&id);
        // Ensure network is active via virt crate; best-effort.
        let uri = self.uri.clone();
        let net_name = self.network.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            use virt::{connect::Connect, network::Network};
            let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
            if let Ok(net) = Network::lookup_by_name(&conn, &net_name) {
                // If not active, try to create (activate). Then set autostart.
                if !net.is_active().unwrap_or(false) {
                    let _ = net.create();
                }
                let _ = net.set_autostart(true);
            }
            Ok(())
        }).await.into_diagnostic()??;
        // Create a qcow2 overlay backed by the base image so the base stays pristine.
        let overlay = work_dir.join("overlay.qcow2");
        let size_arg = format!("{}G", spec.disk_gb);
        tokio::task::spawn_blocking({
            let base = spec.image_path.clone();
            let overlay = overlay.clone();
            move || -> miette::Result<()> {
                // Detect base image format to set -F accordingly (raw or qcow2).
                let base_fmt_out = std::process::Command::new("qemu-img")
                    .args(["info", "--output=json"])
                    .arg(&base)
                    .output()
                    .map_err(|e| miette::miette!("qemu-img not found or failed: {e}"))?;
                if !base_fmt_out.status.success() {
                    return Err(miette::miette!(
                        "qemu-img info failed: {}",
                        String::from_utf8_lossy(&base_fmt_out.stderr)
                    ));
                }
                let base_fmt: String = {
                    let v: serde_json::Value = serde_json::from_slice(&base_fmt_out.stdout)
                        .map_err(|e| miette::miette!("parse qemu-img info json failed: {e}"))?;
                    v.get("format")
                        .and_then(|f| f.as_str())
                        .unwrap_or("raw")
                        .to_string()
                };
                let out = Command::new("qemu-img")
                    .args(["create","-f","qcow2","-F"])
                    .arg(&base_fmt)
                    .args(["-b"])
                    .arg(&base)
                    .arg(&overlay)
                    .arg(&size_arg)
                    .output()
                    .map_err(|e| miette::miette!("qemu-img not found or failed: {e}"))?;
                if !out.status.success() { return Err(miette::miette!("qemu-img create failed: {}", String::from_utf8_lossy(&out.stderr))); }
                Ok(())
            }
        }).await.into_diagnostic()??;
        // Build NoCloud seed ISO if user_data provided.
        let mut seed_iso: Option<PathBuf> = None;
        if let Some(ref user_data) = spec.user_data {
            let seed_dir = work_dir.join("seed");
            tokio::fs::create_dir_all(&seed_dir).await.into_diagnostic()?;
            let ud_path = seed_dir.join("user-data");
            let md_path = seed_dir.join("meta-data");
            tokio::fs::write(&ud_path, user_data).await.into_diagnostic()?;
            let meta = format!("instance-id: {}\nlocal-hostname: {}\n", id, id);
            tokio::fs::write(&md_path, meta.as_bytes()).await.into_diagnostic()?;
            // Build the ISO with mkisofs or genisoimage, whichever exists.
            let iso_path = work_dir.join("seed.iso");
            tokio::task::spawn_blocking({
                let iso_path = iso_path.clone();
                let seed_dir = seed_dir.clone();
                move || -> miette::Result<()> {
                    let try_mk = |bin: &str| -> std::io::Result<std::process::Output> {
                        Command::new(bin)
                            .args(["-V","cidata","-J","-R","-o"])
                            .arg(&iso_path)
                            .arg(&seed_dir)
                            .output()
                    };
                    let out = try_mk("mkisofs").or_else(|_| try_mk("genisoimage"))
                        .map_err(|e| miette::miette!("mkisofs/genisoimage not found: {e}"))?;
                    if !out.status.success() { return Err(miette::miette!("mkisofs failed: {}", String::from_utf8_lossy(&out.stderr))); }
                    Ok(())
                }
            }).await.into_diagnostic()??;
            seed_iso = Some(iso_path);
        }
        // Domain XML: virtio disk + NIC, VNC on loopback, serial console on a pty.
        let xml = {
            let mem = spec.ram_mb;
            let vcpus = spec.cpu;
            let overlay_str = overlay.display().to_string();
            let seed_str = seed_iso.as_ref().map(|p| p.display().to_string());
            let net = self.network.clone();
            let cdrom = seed_str.map(|p| format!("<disk type='file' device='cdrom'>\n <driver name='qemu' type='raw'/>\n <source file='{}'/>\n <target dev='hdb' bus='ide'/>\n <readonly/>\n</disk>", p)).unwrap_or_default();
            format!("<domain type='kvm'>\n<name>{}</name>\n<memory unit='MiB'>{}</memory>\n<vcpu>{}</vcpu>\n<os>\n <type arch='x86_64' machine='pc'>hvm</type>\n <boot dev='hd'/>\n</os>\n<features><acpi/></features>\n<devices>\n <disk type='file' device='disk'>\n <driver name='qemu' type='qcow2' cache='none'/>\n <source file='{}'/>\n <target dev='vda' bus='virtio'/>\n </disk>\n {}\n <interface type='network'>\n <source network='{}'/>\n <model type='virtio'/>\n </interface>\n <graphics type='vnc' autoport='yes' listen='127.0.0.1'/>\n <serial type='pty'>\n <target port='0'/>\n </serial>\n <console type='pty'>\n <target type='serial' port='0'/>\n </console>\n</devices>\n<on_poweroff>destroy</on_poweroff>\n<on_crash>destroy</on_crash>\n</domain>",
                id, mem, vcpus, overlay_str, cdrom, net)
        };
        // Define (persist) the domain via the virt crate; it is started later.
        let uri2 = self.uri.clone();
        let xml_clone = xml.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            use virt::{connect::Connect, domain::Domain};
            let conn = Connect::open(Some(&uri2)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
            let _dom = Domain::define_xml(&conn, &xml_clone).map_err(|e| miette::miette!("define domain failed: {e}"))?;
            Ok(())
        }).await.into_diagnostic()??;
        info!(domain = %id, image = ?spec.image_path, cpu = spec.cpu, ram_mb = spec.ram_mb, "libvirt prepared");
        Ok(VmHandle { id, backend: BackendTag::Libvirt, work_dir, overlay_path: Some(overlay), seed_iso_path: seed_iso })
    }
    /// Start the defined domain (libvirt "create" == boot).
    async fn start(&self, vm: &VmHandle) -> Result<()> {
        let id = vm.id.clone();
        let uri = self.uri.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            use virt::{connect::Connect, domain::Domain};
            let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
            // Lookup domain by name and start.
            let dom = Domain::lookup_by_name(&conn, &id).map_err(|e| miette::miette!("lookup domain failed: {e}"))?;
            dom.create().map_err(|e| miette::miette!("domain start failed: {e}"))?;
            Ok(())
        }).await.into_diagnostic()??;
        info!(domain = %vm.id, "libvirt started");
        Ok(())
    }
    /// Request an ACPI shutdown, poll for up to `t`, then force power-off
    /// only if the guest is still active after the grace period.
    async fn stop(&self, vm: &VmHandle, t: Duration) -> Result<()> {
        let id = vm.id.clone();
        let uri = self.uri.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            use virt::{connect::Connect, domain::Domain};
            let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
            let dom = Domain::lookup_by_name(&conn, &id).map_err(|e| miette::miette!("lookup domain failed: {e}"))?;
            // Best-effort graceful shutdown; the guest may ignore ACPI.
            let _ = dom.shutdown();
            let start = std::time::Instant::now();
            while start.elapsed() < t {
                if matches!(dom.is_active(), Ok(false)) {
                    break;
                }
                // Blocking sleep is fine here: we are on a spawn_blocking thread.
                std::thread::sleep(std::time::Duration::from_millis(500));
            }
            // Force destroy only if the graceful shutdown did not finish.
            if dom.is_active().unwrap_or(false) {
                let _ = dom.destroy();
            }
            Ok(())
        }).await.into_diagnostic()??;
        info!(domain = %vm.id, "libvirt stopped");
        Ok(())
    }
    /// Undefine the domain (force-stopping it first if still running) and
    /// remove all on-disk artifacts for this VM.
    async fn destroy(&self, vm: VmHandle) -> Result<()> {
        let id = vm.id.clone();
        let uri = self.uri.clone();
        let id_for_task = id.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            use virt::{connect::Connect, domain::Domain};
            let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
            if let Ok(dom) = Domain::lookup_by_name(&conn, &id_for_task) {
                // Ensure the domain is stopped before undefining; undefining a
                // running domain would leave it running as a transient guest.
                if dom.is_active().unwrap_or(false) {
                    let _ = dom.destroy();
                }
                let _ = dom.undefine();
            }
            Ok(())
        }).await.into_diagnostic()??;
        // Cleanup files (best-effort; the dir removal catches stragglers).
        if let Some(p) = vm.overlay_path.as_ref() { let _ = tokio::fs::remove_file(p).await; }
        if let Some(p) = vm.seed_iso_path.as_ref() { let _ = tokio::fs::remove_file(p).await; }
        let _ = tokio::fs::remove_dir_all(&vm.work_dir).await;
        info!(domain = %id, "libvirt destroyed");
        Ok(())
    }
    /// Map libvirt's active flag onto our coarse state: active => Running,
    /// otherwise Stopped. Lookup failure is surfaced as an error.
    async fn state(&self, vm: &VmHandle) -> Result<VmState> {
        let id = vm.id.clone();
        let uri = self.uri.clone();
        let active = tokio::task::spawn_blocking(move || -> miette::Result<bool> {
            use virt::{connect::Connect, domain::Domain};
            let conn = Connect::open(Some(&uri)).map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
            let dom = Domain::lookup_by_name(&conn, &id).map_err(|e| miette::miette!("lookup domain failed: {e}"))?;
            Ok(dom.is_active().unwrap_or(false))
        }).await.into_diagnostic()??;
        Ok(if active { VmState::Running } else { VmState::Stopped })
    }
}
/// illumos zones/bhyve backend (lifecycle operations are still stubs).
#[cfg(target_os = "illumos")]
pub struct ZonesHypervisor;
#[cfg(target_os = "illumos")]
#[async_trait]
impl Hypervisor for ZonesHypervisor {
    /// Prepare an illumos zone/bhyve VM: allocate a working directory and
    /// ensure the boot disk is raw-format (bhyve cannot boot qcow2).
    /// The resulting disk path is returned in `overlay_path`.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        use std::process::Command;
        let id = format!("zone-{}", ctx.request_id);
        // Prefer /var/lib/solstice-ci, verifying writability by actually
        // creating the per-VM dir there; fall back to the temp dir otherwise.
        let work_dir = {
            let base = std::path::Path::new("/var/lib/solstice-ci");
            let preferred = base.join(&id);
            let dir = if base.is_dir() && std::fs::create_dir_all(&preferred).is_ok() {
                preferred
            } else {
                let fallback = std::env::temp_dir().join("solstice-zones").join(&id);
                let _ = std::fs::create_dir_all(&fallback);
                fallback
            };
            // Best-effort tightening; the dir may later hold secrets.
            #[cfg(unix)]
            let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700));
            dir
        };
        // Detect the base image format with qemu-img.
        let base = spec.image_path.clone();
        let base_fmt = tokio::task::spawn_blocking(move || -> miette::Result<String> {
            let out = Command::new("qemu-img")
                .args(["info", "--output=json"]).arg(&base)
                .output()
                .map_err(|e| miette::miette!("qemu-img not found or failed: {e}"))?;
            if !out.status.success() {
                return Err(miette::miette!("qemu-img info failed: {}", String::from_utf8_lossy(&out.stderr)));
            }
            let v: serde_json::Value = serde_json::from_slice(&out.stdout)
                .map_err(|e| miette::miette!("parse qemu-img info json failed: {e}"))?;
            Ok(v.get("format").and_then(|f| f.as_str()).unwrap_or("raw").to_string())
        }).await.into_diagnostic()??;
        // Ensure raw image for bhyve: convert if needed.
        let raw_path = if base_fmt != "raw" {
            let out_path = work_dir.join("disk.raw");
            let src = spec.image_path.clone();
            let dst = out_path.clone();
            tokio::task::spawn_blocking(move || -> miette::Result<()> {
                let out = Command::new("qemu-img")
                    .args(["convert", "-O", "raw"])
                    .arg(&src)
                    .arg(&dst)
                    .output()
                    .map_err(|e| miette::miette!("qemu-img convert failed to start: {e}"))?;
                if !out.status.success() {
                    return Err(miette::miette!("qemu-img convert failed: {}", String::from_utf8_lossy(&out.stderr)));
                }
                Ok(())
            }).await.into_diagnostic()??;
            info!(label = %spec.label, src = ?spec.image_path, out = ?out_path, "converted image to raw for bhyve");
            out_path
        } else {
            spec.image_path.clone()
        };
        // Seed ISO creation left to future; for now, return handle with path in overlay_path
        Ok(VmHandle { id, backend: BackendTag::Zones, work_dir, overlay_path: Some(raw_path), seed_iso_path: None })
    }
    // Zone lifecycle management is not implemented yet; these stubs succeed
    // so the scheduler can exercise the flow end-to-end.
    async fn start(&self, _vm: &VmHandle) -> Result<()> { Ok(()) }
    async fn stop(&self, _vm: &VmHandle, _t: Duration) -> Result<()> { Ok(()) }
    async fn destroy(&self, _vm: VmHandle) -> Result<()> { Ok(()) }
}