mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 13:20:41 +00:00
Add VM suspend handling, persistence updates, and orchestrator enhancements
This commit introduces:
- VM suspend support for timeout scenarios, allowing investigation of frozen states.
- Enhanced orchestrator persistence initialization with skip option for faster startup.
- Improvements to orchestrator logging, job state tracking, and VM runtime monitoring.
- Updates to CI tasks for capturing job request IDs and tracking completion statuses.
- Extended hypervisor capabilities, including libvirt console logging configuration.
This commit is contained in:
parent
f753265a79
commit
9597bbf64d
7 changed files with 172 additions and 63 deletions
|
|
@ -72,6 +72,8 @@ ILLUMOS_URL="http://$HOST_IP:$SOL_RUNNER_PORT_ILLUMOS/solstice-runner-illumos"
|
||||||
export SOLSTICE_RUNNER_URLS="$LINUX_URL $ILLUMOS_URL"
|
export SOLSTICE_RUNNER_URLS="$LINUX_URL $ILLUMOS_URL"
|
||||||
|
|
||||||
# Start orchestrator in background (inherits env including SOLSTICE_RUNNER_URLS/ORCH_CONTACT_ADDR)
|
# Start orchestrator in background (inherits env including SOLSTICE_RUNNER_URLS/ORCH_CONTACT_ADDR)
|
||||||
|
# Speed up startup by skipping persistence unless explicitly disabled
|
||||||
|
export ORCH_SKIP_PERSIST=${ORCH_SKIP_PERSIST:-true}
|
||||||
LOGFILE=${SOL_ORCH_LOG:-"$ROOT_DIR/target/orchestrator.vm-build.log"}
|
LOGFILE=${SOL_ORCH_LOG:-"$ROOT_DIR/target/orchestrator.vm-build.log"}
|
||||||
echo "Starting orchestrator... (logs: $LOGFILE)" >&2
|
echo "Starting orchestrator... (logs: $LOGFILE)" >&2
|
||||||
(
|
(
|
||||||
|
|
@ -84,10 +86,14 @@ echo " Linux: $LINUX_URL" >&2
|
||||||
echo " Illumos: $ILLUMOS_URL" >&2
|
echo " Illumos: $ILLUMOS_URL" >&2
|
||||||
echo "Orchestrator contact: $ORCH_CONTACT_ADDR" >&2
|
echo "Orchestrator contact: $ORCH_CONTACT_ADDR" >&2
|
||||||
|
|
||||||
# Give it a moment to start
|
echo "Waiting for orchestrator to start..." >&2
|
||||||
sleep 3
|
# shellcheck disable=SC2034
|
||||||
|
for i in {1..20}; do
|
||||||
|
if grep -q "orchestrator starting" "$LOGFILE" 2>/dev/null; then break; fi
|
||||||
|
sleep 0.5
|
||||||
|
done
|
||||||
|
|
||||||
# Enqueue two jobs: one Linux, one Illumos
|
# Enqueue two jobs: one Linux, one Illumos and capture their request IDs
|
||||||
SOL_REPO_URL=${SOL_REPO_URL:-$(git -C "$ROOT_DIR" remote get-url origin 2>/dev/null || true)}
|
SOL_REPO_URL=${SOL_REPO_URL:-$(git -C "$ROOT_DIR" remote get-url origin 2>/dev/null || true)}
|
||||||
SOL_COMMIT_SHA=${SOL_COMMIT_SHA:-$(git -C "$ROOT_DIR" rev-parse HEAD 2>/dev/null || true)}
|
SOL_COMMIT_SHA=${SOL_COMMIT_SHA:-$(git -C "$ROOT_DIR" rev-parse HEAD 2>/dev/null || true)}
|
||||||
if [[ -z "${SOL_REPO_URL}" || -z "${SOL_COMMIT_SHA}" ]]; then
|
if [[ -z "${SOL_REPO_URL}" || -z "${SOL_COMMIT_SHA}" ]]; then
|
||||||
|
|
@ -95,22 +101,51 @@ if [[ -z "${SOL_REPO_URL}" || -z "${SOL_COMMIT_SHA}" ]]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Linux (Ubuntu image in example config)
|
# Linux (Ubuntu image in example config)
|
||||||
SOL_RUNS_ON=ubuntu-22.04 "$ROOT_DIR/.mise/tasks/run/forge-enqueue"
|
REQ_LINUX=$(SOL_RUNS_ON=ubuntu-22.04 "$ROOT_DIR/.mise/tasks/run/forge-enqueue")
|
||||||
# Illumos (default label / alias)
|
# Illumos (default label / alias)
|
||||||
SOL_RUNS_ON=illumos-latest "$ROOT_DIR/.mise/tasks/run/forge-enqueue"
|
REQ_ILLUMOS=$(SOL_RUNS_ON=illumos-latest "$ROOT_DIR/.mise/tasks/run/forge-enqueue")
|
||||||
|
|
||||||
# Tail orchestrator logs for a while
|
echo "Enqueued request IDs:" >&2
|
||||||
TAIL_SECS=${SOL_TAIL_SECS:-30}
|
echo " Linux: $REQ_LINUX" >&2
|
||||||
echo "Tailing orchestrator logs for ${TAIL_SECS}s..." >&2
|
echo " Illumos: $REQ_ILLUMOS" >&2
|
||||||
if command -v timeout >/dev/null 2>&1; then
|
|
||||||
(timeout ${TAIL_SECS}s tail -f "$LOGFILE" || true) 2>/dev/null
|
# Wait for both jobs to finish by watching orchestrator logs
|
||||||
elif command -v gtimeout >/dev/null 2>&1; then
|
TIMEOUT_SECS=${SOL_WAIT_TIMEOUT_SECS:-3600}
|
||||||
(gtimeout ${TAIL_SECS}s tail -f "$LOGFILE" || true) 2>/dev/null
|
DEADLINE=$(( $(date +%s) + TIMEOUT_SECS ))
|
||||||
else
|
DONE_LINUX=""
|
||||||
tail -f "$LOGFILE" &
|
DONE_ILLUMOS=""
|
||||||
TAIL_PID=$!
|
STATUS_LINUX=""
|
||||||
sleep "$TAIL_SECS" || true
|
STATUS_ILLUMOS=""
|
||||||
kill "$TAIL_PID" 2>/dev/null || true
|
|
||||||
|
echo "Waiting up to ${TIMEOUT_SECS}s for jobs to finish..." >&2
|
||||||
|
while :; do
|
||||||
|
now=$(date +%s)
|
||||||
|
if [[ $now -ge $DEADLINE ]]; then
|
||||||
|
echo "Timeout waiting for jobs to finish" >&2
|
||||||
|
exit 124
|
||||||
|
fi
|
||||||
|
if [[ -z "$DONE_LINUX" ]]; then
|
||||||
|
if grep -q "job finished: ${REQ_LINUX} succeeded" "$LOGFILE"; then DONE_LINUX=1; STATUS_LINUX=ok; fi
|
||||||
|
if grep -q "job finished: ${REQ_LINUX} failed" "$LOGFILE"; then DONE_LINUX=1; STATUS_LINUX=failed; fi
|
||||||
|
fi
|
||||||
|
if [[ -z "$DONE_ILLUMOS" ]]; then
|
||||||
|
if grep -q "job finished: ${REQ_ILLUMOS} succeeded" "$LOGFILE"; then DONE_ILLUMOS=1; STATUS_ILLUMOS=ok; fi
|
||||||
|
if grep -q "job finished: ${REQ_ILLUMOS} failed" "$LOGFILE"; then DONE_ILLUMOS=1; STATUS_ILLUMOS=failed; fi
|
||||||
|
fi
|
||||||
|
if [[ -n "$DONE_LINUX" && -n "$DONE_ILLUMOS" ]]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Job results:" >&2
|
||||||
|
echo " Linux: $STATUS_LINUX" >&2
|
||||||
|
echo " Illumos: $STATUS_ILLUMOS" >&2
|
||||||
|
|
||||||
|
# Exit non-zero if any failed
|
||||||
|
if [[ "$STATUS_LINUX" != "ok" || "$STATUS_ILLUMOS" != "ok" ]]; then
|
||||||
|
echo "One or more jobs failed" >&2
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Done. Logs at $LOGFILE" >&2
|
echo "Done. Logs at $LOGFILE" >&2
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,6 @@ export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1}
|
||||||
export AMQP_PREFETCH=${AMQP_PREFETCH:-2}
|
export AMQP_PREFETCH=${AMQP_PREFETCH:-2}
|
||||||
export GRPC_ADDR=${GRPC_ADDR:-0.0.0.0:50051}
|
export GRPC_ADDR=${GRPC_ADDR:-0.0.0.0:50051}
|
||||||
# For Linux + libvirt users, customize via LIBVIRT_URI and LIBVIRT_NETWORK
|
# For Linux + libvirt users, customize via LIBVIRT_URI and LIBVIRT_NETWORK
|
||||||
exec cargo run -p orchestrator -- \
|
exec cargo run -p orchestrator --features libvirt -- \
|
||||||
--config "$ORCH_CONFIG" \
|
--config "$ORCH_CONFIG" \
|
||||||
--grpc-addr "$GRPC_ADDR"
|
--grpc-addr "$GRPC_ADDR"
|
||||||
|
|
@ -113,6 +113,8 @@ async fn main() -> Result<()> {
|
||||||
let mut jr = common::JobRequest::new(common::SourceSystem::Manual, repo_url, commit_sha);
|
let mut jr = common::JobRequest::new(common::SourceSystem::Manual, repo_url, commit_sha);
|
||||||
jr.runs_on = runs_on;
|
jr.runs_on = runs_on;
|
||||||
common::publish_job(&mq_cfg, &jr).await?;
|
common::publish_job(&mq_cfg, &jr).await?;
|
||||||
|
// Print just the request_id on stdout so scripts can capture it reliably.
|
||||||
|
println!("{}", jr.request_id);
|
||||||
info!(request_id = %jr.request_id, "enqueued job request");
|
info!(request_id = %jr.request_id, "enqueued job request");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,7 @@ pub trait Hypervisor: Send + Sync {
|
||||||
async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle>;
|
async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle>;
|
||||||
async fn start(&self, vm: &VmHandle) -> Result<()>;
|
async fn start(&self, vm: &VmHandle) -> Result<()>;
|
||||||
async fn stop(&self, vm: &VmHandle, graceful_timeout: Duration) -> Result<()>;
|
async fn stop(&self, vm: &VmHandle, graceful_timeout: Duration) -> Result<()>;
|
||||||
|
async fn suspend(&self, vm: &VmHandle) -> Result<()>;
|
||||||
async fn destroy(&self, vm: VmHandle) -> Result<()>;
|
async fn destroy(&self, vm: VmHandle) -> Result<()>;
|
||||||
async fn state(&self, _vm: &VmHandle) -> Result<VmState> {
|
async fn state(&self, _vm: &VmHandle) -> Result<VmState> {
|
||||||
Ok(VmState::Prepared)
|
Ok(VmState::Prepared)
|
||||||
|
|
@ -162,6 +163,27 @@ impl Hypervisor for RouterHypervisor {
|
||||||
_ => self.noop.stop(vm, t).await,
|
_ => self.noop.stop(vm, t).await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
async fn suspend(&self, vm: &VmHandle) -> Result<()> {
|
||||||
|
match vm.backend {
|
||||||
|
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
||||||
|
BackendTag::Libvirt => {
|
||||||
|
if let Some(ref hv) = self.libvirt {
|
||||||
|
hv.suspend(vm).await
|
||||||
|
} else {
|
||||||
|
self.noop.suspend(vm).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#[cfg(target_os = "illumos")]
|
||||||
|
BackendTag::Zones => {
|
||||||
|
if let Some(ref hv) = self.zones {
|
||||||
|
hv.suspend(vm).await
|
||||||
|
} else {
|
||||||
|
self.noop.suspend(vm).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => self.noop.suspend(vm).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
async fn destroy(&self, vm: VmHandle) -> Result<()> {
|
async fn destroy(&self, vm: VmHandle) -> Result<()> {
|
||||||
match vm.backend {
|
match vm.backend {
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
||||||
|
|
@ -235,6 +257,10 @@ impl Hypervisor for NoopHypervisor {
|
||||||
info!(id = %vm.id, "noop stop");
|
info!(id = %vm.id, "noop stop");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
async fn suspend(&self, vm: &VmHandle) -> Result<()> {
|
||||||
|
info!(id = %vm.id, "noop suspend");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
async fn destroy(&self, vm: VmHandle) -> Result<()> {
|
async fn destroy(&self, vm: VmHandle) -> Result<()> {
|
||||||
info!(id = %vm.id, "noop destroy");
|
info!(id = %vm.id, "noop destroy");
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
@ -389,6 +415,11 @@ impl Hypervisor for LibvirtHypervisor {
|
||||||
seed_iso = Some(iso_path);
|
seed_iso = Some(iso_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Serial console log file path
|
||||||
|
let console_log = work_dir.join("console.log");
|
||||||
|
let console_log_str = console_log.display().to_string();
|
||||||
|
info!(domain = %id, console = %console_log_str, "serial console will be logged to file");
|
||||||
|
|
||||||
// Domain XML
|
// Domain XML
|
||||||
let xml = {
|
let xml = {
|
||||||
let mem = spec.ram_mb;
|
let mem = spec.ram_mb;
|
||||||
|
|
@ -398,8 +429,8 @@ impl Hypervisor for LibvirtHypervisor {
|
||||||
let net = self.network.clone();
|
let net = self.network.clone();
|
||||||
let cdrom = seed_str.map(|p| format!("<disk type='file' device='cdrom'>\n <driver name='qemu' type='raw'/>\n <source file='{}'/>\n <target dev='hdb' bus='ide'/>\n <readonly/>\n</disk>", p)).unwrap_or_default();
|
let cdrom = seed_str.map(|p| format!("<disk type='file' device='cdrom'>\n <driver name='qemu' type='raw'/>\n <source file='{}'/>\n <target dev='hdb' bus='ide'/>\n <readonly/>\n</disk>", p)).unwrap_or_default();
|
||||||
format!(
|
format!(
|
||||||
"<domain type='kvm'>\n<name>{}</name>\n<memory unit='MiB'>{}</memory>\n<vcpu>{}</vcpu>\n<os>\n <type arch='x86_64' machine='pc'>hvm</type>\n <boot dev='hd'/>\n</os>\n<features><acpi/></features>\n<devices>\n <disk type='file' device='disk'>\n <driver name='qemu' type='qcow2' cache='none'/>\n <source file='{}'/>\n <target dev='vda' bus='virtio'/>\n </disk>\n {}\n <interface type='network'>\n <source network='{}'/>\n <model type='virtio'/>\n </interface>\n <graphics type='vnc' autoport='yes' listen='127.0.0.1'/>\n <serial type='pty'>\n <target port='0'/>\n </serial>\n <console type='pty'>\n <target type='serial' port='0'/>\n </console>\n</devices>\n<on_poweroff>destroy</on_poweroff>\n<on_crash>destroy</on_crash>\n</domain>",
|
"<domain type='kvm'>\n<name>{}</name>\n<memory unit='MiB'>{}</memory>\n<vcpu>{}</vcpu>\n<os>\n <type arch='x86_64' machine='pc'>hvm</type>\n <boot dev='hd'/>\n</os>\n<features><acpi/></features>\n<devices>\n <disk type='file' device='disk'>\n <driver name='qemu' type='qcow2' cache='none'/>\n <source file='{}'/>\n <target dev='vda' bus='virtio'/>\n </disk>\n {}\n <interface type='network'>\n <source network='{}'/>\n <model type='virtio'/>\n </interface>\n <graphics type='vnc' autoport='yes' listen='127.0.0.1'/>\n <serial type='pty'>\n <target port='0'/>\n </serial>\n <serial type='file'>\n <source path='{}'/>\n <target port='1'/>\n </serial>\n <console type='pty'>\n <target type='serial' port='0'/>\n </console>\n</devices>\n<on_poweroff>destroy</on_poweroff>\n<on_crash>destroy</on_crash>\n</domain>",
|
||||||
id, mem, vcpus, overlay_str, cdrom, net
|
id, mem, vcpus, overlay_str, cdrom, net, console_log_str
|
||||||
)
|
)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -474,6 +505,23 @@ impl Hypervisor for LibvirtHypervisor {
|
||||||
info!(domain = %vm.id, "libvirt stopped");
|
info!(domain = %vm.id, "libvirt stopped");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
async fn suspend(&self, vm: &VmHandle) -> Result<()> {
|
||||||
|
let id = vm.id.clone();
|
||||||
|
let uri = self.uri.clone();
|
||||||
|
tokio::task::spawn_blocking(move || -> miette::Result<()> {
|
||||||
|
use virt::{connect::Connect, domain::Domain};
|
||||||
|
let conn = Connect::open(Some(&uri))
|
||||||
|
.map_err(|e| miette::miette!("libvirt connect failed: {e}"))?;
|
||||||
|
let dom = Domain::lookup_by_name(&conn, &id)
|
||||||
|
.map_err(|e| miette::miette!("lookup domain failed: {e}"))?;
|
||||||
|
dom.suspend().map_err(|e| miette::miette!("domain suspend failed: {e}"))?;
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.into_diagnostic()??;
|
||||||
|
info!(domain = %vm.id, "libvirt suspended");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
async fn destroy(&self, vm: VmHandle) -> Result<()> {
|
async fn destroy(&self, vm: VmHandle) -> Result<()> {
|
||||||
let id = vm.id.clone();
|
let id = vm.id.clone();
|
||||||
|
|
@ -617,4 +665,7 @@ impl Hypervisor for ZonesHypervisor {
|
||||||
async fn destroy(&self, _vm: VmHandle) -> Result<()> {
|
async fn destroy(&self, _vm: VmHandle) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
async fn suspend(&self, _vm: &VmHandle) -> Result<()> {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,10 @@ struct Opts {
|
||||||
#[arg(long, env = "MAX_CONCURRENCY", default_value_t = 2)]
|
#[arg(long, env = "MAX_CONCURRENCY", default_value_t = 2)]
|
||||||
max_concurrency: usize,
|
max_concurrency: usize,
|
||||||
|
|
||||||
|
/// Skip persistence initialization (faster startup; disables DB writes)
|
||||||
|
#[arg(long = "skip-persistence", env = "ORCH_SKIP_PERSIST", default_value_t = false)]
|
||||||
|
skip_persistence: bool,
|
||||||
|
|
||||||
/// Per-label capacity map (e.g., illumos-latest=2,ubuntu-22.04=4)
|
/// Per-label capacity map (e.g., illumos-latest=2,ubuntu-22.04=4)
|
||||||
#[arg(long, env = "CAPACITY_MAP")]
|
#[arg(long, env = "CAPACITY_MAP")]
|
||||||
capacity_map: Option<String>,
|
capacity_map: Option<String>,
|
||||||
|
|
@ -40,11 +44,11 @@ struct Opts {
|
||||||
#[arg(long, env = "GRPC_ADDR", default_value = "0.0.0.0:50051")]
|
#[arg(long, env = "GRPC_ADDR", default_value = "0.0.0.0:50051")]
|
||||||
grpc_addr: String,
|
grpc_addr: String,
|
||||||
|
|
||||||
/// Postgres connection string
|
/// Postgres connection string (if empty, persistence is disabled)
|
||||||
#[arg(
|
#[arg(
|
||||||
long,
|
long,
|
||||||
env = "DATABASE_URL",
|
env = "DATABASE_URL",
|
||||||
default_value = "postgres://user:pass@localhost:5432/solstice"
|
default_value = ""
|
||||||
)]
|
)]
|
||||||
database_url: String,
|
database_url: String,
|
||||||
|
|
||||||
|
|
@ -81,7 +85,7 @@ struct Opts {
|
||||||
otlp: Option<String>,
|
otlp: Option<String>,
|
||||||
|
|
||||||
/// Placeholder VM run time in seconds (temporary until agent wiring)
|
/// Placeholder VM run time in seconds (temporary until agent wiring)
|
||||||
#[arg(long, env = "VM_PLACEHOLDER_RUN_SECS", default_value_t = 300)]
|
#[arg(long, env = "VM_PLACEHOLDER_RUN_SECS", default_value_t = 3600)]
|
||||||
vm_placeholder_run_secs: u64,
|
vm_placeholder_run_secs: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -101,8 +105,12 @@ async fn main() -> Result<()> {
|
||||||
// Build hypervisor router
|
// Build hypervisor router
|
||||||
let router = RouterHypervisor::build(opts.libvirt_uri.clone(), opts.libvirt_network.clone());
|
let router = RouterHypervisor::build(opts.libvirt_uri.clone(), opts.libvirt_network.clone());
|
||||||
|
|
||||||
// Initialize persistence
|
// Initialize persistence (optional). Skip when requested for faster startup.
|
||||||
let persist = Arc::new(Persist::new(Some(opts.database_url.clone())).await?);
|
let persist = if opts.skip_persistence {
|
||||||
|
Arc::new(Persist::new(None).await?)
|
||||||
|
} else {
|
||||||
|
Arc::new(Persist::new(Some(opts.database_url.clone())).await?)
|
||||||
|
};
|
||||||
|
|
||||||
// Build MQ config and start consumer
|
// Build MQ config and start consumer
|
||||||
let mq_cfg = common::MqConfig {
|
let mq_cfg = common::MqConfig {
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,8 @@ use chrono::Utc;
|
||||||
use miette::{IntoDiagnostic as _, Result};
|
use miette::{IntoDiagnostic as _, Result};
|
||||||
use sea_orm::sea_query::{Expr, OnConflict};
|
use sea_orm::sea_query::{Expr, OnConflict};
|
||||||
use sea_orm::{
|
use sea_orm::{
|
||||||
ActiveModelTrait, ColumnTrait, Database, DatabaseConnection, QueryFilter, Set,
|
entity::prelude::*, ActiveModelTrait, ColumnTrait, Database, DatabaseConnection, QueryFilter,
|
||||||
entity::prelude::*,
|
Set,
|
||||||
};
|
};
|
||||||
use sea_orm_migration::MigratorTrait;
|
use sea_orm_migration::MigratorTrait;
|
||||||
use tracing::{debug, info, warn};
|
use tracing::{debug, info, warn};
|
||||||
|
|
@ -104,9 +104,10 @@ mod vms {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Persist {
|
impl Persist {
|
||||||
/// Initialize persistence. If `database_url` is provided and the connection
|
/// Initialize persistence.
|
||||||
/// succeeds, migrations are applied and the connection is retained.
|
/// - If `database_url` is Some(non-empty), attempt to connect and run migrations.
|
||||||
/// If not provided or connection fails, persistence is disabled (no-op).
|
/// On any failure, return an error to fail-fast so operators notice misconfiguration.
|
||||||
|
/// - If `database_url` is None or empty, persistence is disabled (no-op).
|
||||||
pub async fn new(database_url: Option<String>) -> Result<Self> {
|
pub async fn new(database_url: Option<String>) -> Result<Self> {
|
||||||
if let Some(url) = database_url.filter(|s| !s.trim().is_empty()) {
|
if let Some(url) = database_url.filter(|s| !s.trim().is_empty()) {
|
||||||
// Use a single connection for SQLite in-memory to avoid separate empty DBs per checkout
|
// Use a single connection for SQLite in-memory to avoid separate empty DBs per checkout
|
||||||
|
|
@ -115,30 +116,18 @@ impl Persist {
|
||||||
opts.max_connections(1)
|
opts.max_connections(1)
|
||||||
.min_connections(1)
|
.min_connections(1)
|
||||||
.sqlx_logging(false);
|
.sqlx_logging(false);
|
||||||
match Database::connect(opts).await.into_diagnostic() {
|
let db = Database::connect(opts).await.into_diagnostic()
|
||||||
Ok(db) => {
|
.map_err(|e| miette::miette!("failed to connect to database: {e}"))?;
|
||||||
// Apply migrations idempotently
|
migration::Migrator::up(&db, None)
|
||||||
if let Err(e) = migration::Migrator::up(&db, None).await.into_diagnostic() {
|
.await
|
||||||
#[cfg(test)]
|
.into_diagnostic()
|
||||||
{
|
.map_err(|e| miette::miette!("failed to apply migrations: {e}"))?;
|
||||||
// In tests, surface the migration failure to help debugging
|
info!("persistence initialized (connected and migrated)");
|
||||||
return Err(miette::miette!("migration failed: {e}"));
|
Ok(Self { db: Some(db) })
|
||||||
}
|
|
||||||
#[cfg(not(test))]
|
|
||||||
{
|
|
||||||
warn!(error = %e, "failed to apply migrations; proceeding without persistence");
|
|
||||||
return Ok(Self { db: None });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
info!("persistence initialized (connected and migrated)");
|
|
||||||
return Ok(Self { db: Some(db) });
|
|
||||||
}
|
|
||||||
Err(e) => warn!(error = %e, "failed to connect to database; persistence disabled"),
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
warn!("DATABASE_URL not set; persistence disabled");
|
warn!("persistence disabled (no DATABASE_URL or skipped)");
|
||||||
|
Ok(Self { db: None })
|
||||||
}
|
}
|
||||||
Ok(Self { db: None })
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_enabled(&self) -> bool {
|
pub fn is_enabled(&self) -> bool {
|
||||||
|
|
@ -335,7 +324,6 @@ mod tests {
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.expect("destroyed");
|
.expect("destroyed");
|
||||||
use sea_orm::QuerySelect;
|
|
||||||
let rows = vms::Entity::find()
|
let rows = vms::Entity::find()
|
||||||
.filter(vms::Column::RequestId.eq(req))
|
.filter(vms::Column::RequestId.eq(req))
|
||||||
.filter(vms::Column::DomainName.eq(domain.clone()))
|
.filter(vms::Column::DomainName.eq(domain.clone()))
|
||||||
|
|
|
||||||
|
|
@ -105,6 +105,7 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
if let Err(e) = hv.start(&h).await {
|
if let Err(e) = hv.start(&h).await {
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to start VM");
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to start VM");
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
||||||
|
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Running).await;
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Running).await;
|
||||||
|
|
@ -112,13 +113,19 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (monitoring for completion)");
|
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (monitoring for completion)");
|
||||||
// Monitor VM state until it stops or until placeholder_runtime elapses; end early on shutdown
|
// Monitor VM state until it stops or until placeholder_runtime elapses; end early on shutdown
|
||||||
let start_time = std::time::Instant::now();
|
let start_time = std::time::Instant::now();
|
||||||
|
let mut timed_out = false;
|
||||||
|
let mut natural_stop = false;
|
||||||
loop {
|
loop {
|
||||||
// Check current state first
|
// Check current state first
|
||||||
if let Ok(crate::hypervisor::VmState::Stopped) = hv.state(&h).await {
|
if let Ok(crate::hypervisor::VmState::Stopped) = hv.state(&h).await {
|
||||||
info!(request_id = %item.ctx.request_id, label = %label_key, "vm reported stopped");
|
info!(request_id = %item.ctx.request_id, label = %label_key, "vm reported stopped");
|
||||||
|
natural_stop = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if start_time.elapsed() >= placeholder_runtime {
|
||||||
|
timed_out = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if start_time.elapsed() >= placeholder_runtime { break; }
|
|
||||||
// Wait either for shutdown signal or a short delay before next poll
|
// Wait either for shutdown signal or a short delay before next poll
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = shutdown.notified() => {
|
_ = shutdown.notified() => {
|
||||||
|
|
@ -128,20 +135,35 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
_ = tokio::time::sleep(Duration::from_secs(2)) => {}
|
_ = tokio::time::sleep(Duration::from_secs(2)) => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Stop and destroy
|
if timed_out {
|
||||||
if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await {
|
// Freeze/suspend VM for debugging; keep artifacts and domain defined
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
if let Err(e) = hv.suspend(&h).await {
|
||||||
|
warn!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to suspend VM after timeout");
|
||||||
|
}
|
||||||
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
||||||
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
||||||
|
// Log where to find console log for libvirt guests
|
||||||
|
let console_hint = h.work_dir.join("console.log");
|
||||||
|
info!(request_id = %item.ctx.request_id, label = %label_key, domain = %h.id, console = %console_hint.display(), "vm timeout: suspended and kept for debugging");
|
||||||
|
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
||||||
|
} else {
|
||||||
|
// Stop and destroy on natural completion
|
||||||
|
if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await {
|
||||||
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
||||||
|
}
|
||||||
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
||||||
|
if let Err(e) = hv.destroy(h.clone()).await {
|
||||||
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM");
|
||||||
|
}
|
||||||
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await;
|
||||||
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Succeeded).await;
|
||||||
|
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "succeeded");
|
||||||
}
|
}
|
||||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
|
||||||
if let Err(e) = hv.destroy(h.clone()).await {
|
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM");
|
|
||||||
}
|
|
||||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await;
|
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Succeeded).await;
|
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to prepare VM");
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to prepare VM");
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
||||||
|
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -243,6 +265,9 @@ mod tests {
|
||||||
async fn stop(&self, _vm: &VmHandle, _t: Duration) -> miette::Result<()> {
|
async fn stop(&self, _vm: &VmHandle, _t: Duration) -> miette::Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
async fn suspend(&self, _vm: &VmHandle) -> miette::Result<()> {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
async fn destroy(&self, vm: VmHandle) -> miette::Result<()> {
|
async fn destroy(&self, vm: VmHandle) -> miette::Result<()> {
|
||||||
// decrement overall current
|
// decrement overall current
|
||||||
self.active_all.fetch_sub(1, Ordering::SeqCst);
|
self.active_all.fetch_sub(1, Ordering::SeqCst);
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue