mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 13:20:41 +00:00
Relax filesystem permissions for VM directories, overlays, and logs to support host libvirt/qemu access. Introduce dead-letter queue support with enriched error messages for failed jobs.
Signed-off-by: Till Wegmueller <toasterson@gmail.com>
This commit is contained in:
parent
888aa26388
commit
f904cb88b2
6 changed files with 158 additions and 43 deletions
|
|
@ -5,8 +5,8 @@ pub mod telemetry;
|
|||
pub mod config;
|
||||
|
||||
pub use job::{Job, Step, Workflow, parse_workflow_file, parse_workflow_str};
|
||||
pub use messages::{JobRequest, JobResult, SourceSystem};
|
||||
pub use mq::{MqConfig, consume_jobs, consume_jobs_until, publish_job, publish_job_result};
|
||||
pub use messages::{JobRequest, JobResult, SourceSystem, DeadLetter};
|
||||
pub use mq::{MqConfig, consume_jobs, consume_jobs_until, publish_job, publish_job_result, publish_deadletter};
|
||||
pub use telemetry::{TelemetryGuard, init_tracing};
|
||||
pub use config::AppConfig;
|
||||
|
||||
|
|
|
|||
|
|
@ -120,3 +120,41 @@ impl JobResult {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Combined dead-letter message that includes the original JobRequest and
/// an error summary so operators/tooling can inspect both together.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeadLetter {
    // Schema tag for consumers; filled with "deadletter.v1" when absent from the wire.
    #[serde(default = "default_deadletter_schema")]
    pub schema_version: String, // e.g., "deadletter.v1"
    // Correlation id, copied from the original JobRequest (see DeadLetter::new).
    pub request_id: Uuid,
    // Timestamp of the failure; set via OffsetDateTime::now_utc() in DeadLetter::new.
    pub occurred_at: OffsetDateTime,
    /// Stage where the failure occurred (e.g., "prepare", "start", "other").
    pub stage: String,
    /// Human-readable error summary (single-line preferred).
    pub error: String,
    /// Original job request that triggered the failure.
    pub original: JobRequest,
    /// Optional bag of extra diagnostic info.
    #[serde(default)]
    pub extra: Option<std::collections::BTreeMap<String, String>>,
}
|
||||
|
||||
/// Default schema version tag applied to `DeadLetter` payloads.
fn default_deadletter_schema() -> String {
    String::from("deadletter.v1")
}
|
||||
|
||||
impl DeadLetter {
|
||||
pub fn new(stage: impl Into<String>, error: impl Into<String>, original: JobRequest) -> Self {
|
||||
let request_id = original.request_id;
|
||||
Self {
|
||||
schema_version: default_deadletter_schema(),
|
||||
request_id,
|
||||
occurred_at: OffsetDateTime::now_utc(),
|
||||
stage: stage.into(),
|
||||
error: error.into(),
|
||||
original,
|
||||
extra: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ use miette::{IntoDiagnostic as _, Result};
|
|||
use tracing::Instrument;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
|
||||
use crate::messages::{JobRequest, JobResult};
|
||||
use crate::messages::{JobRequest, JobResult, DeadLetter};
|
||||
|
||||
/// Pretty-print an AMQP message body for logs.
|
||||
/// - If valid UTF-8 JSON, pretty-format it.
|
||||
|
|
@ -236,6 +236,42 @@ pub struct DeliveryMeta {
|
|||
pub delivery_tag: u64,
|
||||
}
|
||||
|
||||
#[instrument(skip(cfg, dl))]
|
||||
pub async fn publish_deadletter(cfg: &MqConfig, dl: &DeadLetter) -> Result<()> {
|
||||
let conn = connect(cfg).await?;
|
||||
let channel = conn.create_channel().await.into_diagnostic()?;
|
||||
declare_topology(&channel, cfg).await?;
|
||||
|
||||
// Enable confirms
|
||||
channel
|
||||
.confirm_select(ConfirmSelectOptions::default())
|
||||
.await
|
||||
.into_diagnostic()?;
|
||||
|
||||
let body = serde_json::to_vec(dl).into_diagnostic()?;
|
||||
let props = BasicProperties::default()
|
||||
.with_content_type(ShortString::from("application/json"))
|
||||
.with_type(ShortString::from(dl.schema_version.clone()))
|
||||
.with_delivery_mode(2u8.into());
|
||||
|
||||
let confirm = channel
|
||||
.basic_publish(
|
||||
&cfg.dlx,
|
||||
"",
|
||||
BasicPublishOptions {
|
||||
mandatory: true,
|
||||
immediate: false,
|
||||
},
|
||||
&body,
|
||||
props,
|
||||
)
|
||||
.await
|
||||
.into_diagnostic()?;
|
||||
|
||||
confirm.await.into_diagnostic()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip(cfg, handler))]
|
||||
pub async fn consume_jobs<F, Fut>(cfg: &MqConfig, handler: F) -> Result<()>
|
||||
where
|
||||
|
|
@ -355,50 +391,55 @@ where
|
|||
pub async fn publish_job_result(cfg: &MqConfig, result: &JobResult) -> Result<()> {
|
||||
let conn = connect(cfg).await?;
|
||||
let channel = conn.create_channel().await.into_diagnostic()?;
|
||||
|
||||
// Ensure main exchange exists
|
||||
channel
|
||||
.exchange_declare(
|
||||
&cfg.exchange,
|
||||
lapin::ExchangeKind::Direct,
|
||||
ExchangeDeclareOptions {
|
||||
durable: true,
|
||||
auto_delete: false,
|
||||
internal: false,
|
||||
nowait: false,
|
||||
passive: false,
|
||||
},
|
||||
FieldTable::default(),
|
||||
)
|
||||
.await
|
||||
.into_diagnostic()?;
|
||||
|
||||
// Enable publisher confirms
|
||||
channel
|
||||
.confirm_select(ConfirmSelectOptions::default())
|
||||
.await
|
||||
.into_diagnostic()?;
|
||||
|
||||
let payload = serde_json::to_vec(result).into_diagnostic()?;
|
||||
declare_topology(&channel, cfg).await?;
|
||||
|
||||
let props = BasicProperties::default()
|
||||
.with_content_type("application/json".into())
|
||||
.with_content_encoding("utf-8".into())
|
||||
.with_type(ShortString::from(result.schema_version.clone()))
|
||||
.with_delivery_mode(2u8.into());
|
||||
.with_content_type(ShortString::from("application/json"));
|
||||
|
||||
// Route by schema version; default routing key for results
|
||||
let routing_key = "jobresult.v1";
|
||||
let body = serde_json::to_vec(result).into_diagnostic()?;
|
||||
|
||||
let confirm = channel
|
||||
.basic_publish(
|
||||
&cfg.exchange,
|
||||
routing_key,
|
||||
"jobresult.v1",
|
||||
BasicPublishOptions {
|
||||
mandatory: true,
|
||||
..Default::default()
|
||||
immediate: false,
|
||||
},
|
||||
&payload,
|
||||
&body,
|
||||
props,
|
||||
)
|
||||
.await
|
||||
.into_diagnostic()?;
|
||||
|
||||
confirm.await.into_diagnostic()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Publish a raw message body to the DLX so it lands in the configured DLQ.
|
||||
pub async fn publish_to_dlx_raw(cfg: &MqConfig, body: &[u8]) -> Result<()> {
|
||||
let conn = connect(cfg).await?;
|
||||
let channel = conn.create_channel().await.into_diagnostic()?;
|
||||
declare_topology(&channel, cfg).await?;
|
||||
|
||||
// Enable confirms
|
||||
channel
|
||||
.confirm_select(ConfirmSelectOptions::default())
|
||||
.await
|
||||
.into_diagnostic()?;
|
||||
|
||||
let props = BasicProperties::default()
|
||||
.with_content_type(ShortString::from("application/json"));
|
||||
|
||||
let confirm = channel
|
||||
.basic_publish(
|
||||
&cfg.dlx,
|
||||
"",
|
||||
BasicPublishOptions {
|
||||
mandatory: true,
|
||||
immediate: false,
|
||||
},
|
||||
body,
|
||||
props,
|
||||
)
|
||||
.await
|
||||
|
|
|
|||
|
|
@ -284,7 +284,8 @@ impl LibvirtHypervisor {
|
|||
std::env::temp_dir().join("solstice-libvirt").join(id)
|
||||
};
|
||||
let _ = std::fs::create_dir_all(&dir);
|
||||
let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700));
|
||||
// Make directory broadly accessible so host qemu (libvirt) can create/read files
|
||||
let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o777));
|
||||
dir
|
||||
}
|
||||
}
|
||||
|
|
@ -367,6 +368,8 @@ impl Hypervisor for LibvirtHypervisor {
|
|||
.await
|
||||
.into_diagnostic()??;
|
||||
let _ = status; // appease compiler if unused
|
||||
// Relax permissions on overlay so host qemu can access it
|
||||
let _ = std::fs::set_permissions(&overlay, std::fs::Permissions::from_mode(0o666));
|
||||
|
||||
// Build NoCloud seed ISO if user_data provided
|
||||
let mut seed_iso: Option<PathBuf> = None;
|
||||
|
|
@ -415,8 +418,15 @@ impl Hypervisor for LibvirtHypervisor {
|
|||
seed_iso = Some(iso_path);
|
||||
}
|
||||
|
||||
// Serial console log file path
|
||||
// Relax permissions on seed ISO if created (readable by host qemu)
|
||||
if let Some(ref p) = seed_iso {
|
||||
let _ = std::fs::set_permissions(p, std::fs::Permissions::from_mode(0o644));
|
||||
}
|
||||
|
||||
// Serial console log file path (pre-create with permissive perms for libvirt)
|
||||
let console_log = work_dir.join("console.log");
|
||||
let _ = std::fs::OpenOptions::new().create(true).append(true).open(&console_log);
|
||||
let _ = std::fs::set_permissions(&console_log, std::fs::Permissions::from_mode(0o666));
|
||||
let console_log_str = console_log.display().to_string();
|
||||
info!(domain = %id, console = %console_log_str, "serial console will be logged to file");
|
||||
|
||||
|
|
|
|||
|
|
@ -17,7 +17,6 @@ use hypervisor::{JobContext, RouterHypervisor, VmSpec};
|
|||
use scheduler::{SchedItem, Scheduler};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Notify;
|
||||
use std::net::SocketAddr as _;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(
|
||||
|
|
@ -185,6 +184,7 @@ async fn main() -> Result<()> {
|
|||
&capacity_map,
|
||||
persist.clone(),
|
||||
Duration::from_secs(opts.vm_placeholder_run_secs),
|
||||
Arc::new(mq_cfg.clone()),
|
||||
);
|
||||
let sched_shutdown = Arc::new(Notify::new());
|
||||
let sched_tx = sched.sender();
|
||||
|
|
@ -235,6 +235,8 @@ async fn main() -> Result<()> {
|
|||
image.defaults.as_ref().and_then(|d| d.ram_mb).unwrap_or(2048),
|
||||
image.defaults.as_ref().and_then(|d| d.disk_gb).unwrap_or(40),
|
||||
);
|
||||
// Keep original job to enrich dead-letter messages if scheduling fails early
|
||||
let original = Some(job.clone());
|
||||
let spec = VmSpec {
|
||||
label: label_resolved.clone(),
|
||||
image_path: image.local_path.clone(),
|
||||
|
|
@ -254,7 +256,7 @@ async fn main() -> Result<()> {
|
|||
commit_sha: job.commit_sha,
|
||||
workflow_job_id: job.workflow_job_id,
|
||||
};
|
||||
sched_tx.send(SchedItem { spec, ctx }).await.into_diagnostic()?;
|
||||
sched_tx.send(SchedItem { spec, ctx, original }).await.into_diagnostic()?;
|
||||
Ok(()) // ack on accept
|
||||
}
|
||||
}).await
|
||||
|
|
|
|||
|
|
@ -4,11 +4,13 @@ use dashmap::DashMap;
|
|||
use miette::Result;
|
||||
use tokio::sync::{Notify, Semaphore, mpsc};
|
||||
use tracing::{error, info, warn};
|
||||
use common::{publish_deadletter, DeadLetter, JobRequest};
|
||||
|
||||
use crate::hypervisor::{BackendTag, Hypervisor, JobContext, VmSpec};
|
||||
use crate::persist::{JobState, Persist, VmPersistState};
|
||||
|
||||
pub struct Scheduler<H: Hypervisor + 'static> {
|
||||
mq_cfg: Arc<common::MqConfig>,
|
||||
hv: Arc<H>,
|
||||
tx: mpsc::Sender<SchedItem>,
|
||||
rx: mpsc::Receiver<SchedItem>,
|
||||
|
|
@ -23,6 +25,8 @@ type DashmapType = DashMap<String, Arc<Semaphore>>;
|
|||
/// A scheduled unit of work: the VM to create plus the job context it serves.
pub struct SchedItem {
    // VM specification handed to the hypervisor (includes the capacity label).
    pub spec: VmSpec,
    // Job metadata carried alongside the VM (request_id, repo_url, commit_sha).
    pub ctx: JobContext,
    /// Original JobRequest for enriching dead-letter messages on early failure
    pub original: Option<JobRequest>,
}
|
||||
|
||||
impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||
|
|
@ -32,6 +36,7 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
capacity_map: &HashMap<String, usize>,
|
||||
persist: Arc<Persist>,
|
||||
placeholder_runtime: Duration,
|
||||
mq_cfg: Arc<common::MqConfig>,
|
||||
) -> Self {
|
||||
let (tx, rx) = mpsc::channel::<SchedItem>(max_concurrency * 4);
|
||||
let label_sems = DashMap::new();
|
||||
|
|
@ -39,6 +44,7 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
label_sems.insert(label.clone(), Arc::new(Semaphore::new(*cap)));
|
||||
}
|
||||
Self {
|
||||
mq_cfg,
|
||||
hv: Arc::new(hv),
|
||||
tx,
|
||||
rx,
|
||||
|
|
@ -55,6 +61,7 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
|
||||
pub async fn run_with_shutdown(self, shutdown: Arc<Notify>) -> Result<()> {
|
||||
let Scheduler {
|
||||
mq_cfg,
|
||||
hv,
|
||||
mut rx,
|
||||
global_sem,
|
||||
|
|
@ -81,6 +88,7 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
let label_sems = label_sems.clone();
|
||||
let persist = persist.clone();
|
||||
let shutdown = shutdown.clone();
|
||||
let mq_cfg_in = mq_cfg.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
// Acquire global and label permits (owned permits so they live inside the task)
|
||||
let _g = match global.acquire_owned().await { Ok(p) => p, Err(_) => return };
|
||||
|
|
@ -105,6 +113,13 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
if let Err(e) = hv.start(&h).await {
|
||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to start VM");
|
||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
||||
// Publish combined dead-letter with original request and error
|
||||
if let Some(ref orig) = item.original {
|
||||
let dl = DeadLetter::new("start", format!("{}", e), orig.clone());
|
||||
if let Err(pe) = publish_deadletter(&mq_cfg_in, &dl).await {
|
||||
warn!(error = %pe, request_id = %item.ctx.request_id, "failed to publish dead-letter");
|
||||
}
|
||||
}
|
||||
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
||||
return;
|
||||
}
|
||||
|
|
@ -184,6 +199,13 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
Err(e) => {
|
||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to prepare VM");
|
||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Failed).await;
|
||||
// Publish combined dead-letter with original request and error
|
||||
if let Some(ref orig) = item.original {
|
||||
let dl = DeadLetter::new("prepare", format!("{}", e), orig.clone());
|
||||
if let Err(pe) = publish_deadletter(&mq_cfg_in, &dl).await {
|
||||
warn!(error = %pe, request_id = %item.ctx.request_id, "failed to publish dead-letter");
|
||||
}
|
||||
}
|
||||
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
||||
return;
|
||||
}
|
||||
|
|
@ -338,7 +360,7 @@ mod tests {
|
|||
let mut caps = HashMap::new();
|
||||
caps.insert("x".to_string(), 10);
|
||||
|
||||
let sched = Scheduler::new(hv, 2, &caps, persist, Duration::from_millis(10));
|
||||
let sched = Scheduler::new(hv, 2, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()));
|
||||
let tx = sched.sender();
|
||||
let run = tokio::spawn(async move {
|
||||
let _ = sched.run().await;
|
||||
|
|
@ -349,6 +371,7 @@ mod tests {
|
|||
.send(SchedItem {
|
||||
spec: make_spec("x"),
|
||||
ctx: make_ctx(),
|
||||
original: None,
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
|
@ -375,7 +398,7 @@ mod tests {
|
|||
caps.insert("a".to_string(), 1);
|
||||
caps.insert("b".to_string(), 2);
|
||||
|
||||
let sched = Scheduler::new(hv, 4, &caps, persist, Duration::from_millis(10));
|
||||
let sched = Scheduler::new(hv, 4, &caps, persist, Duration::from_millis(10), Arc::new(common::MqConfig::default()));
|
||||
let tx = sched.sender();
|
||||
let run = tokio::spawn(async move {
|
||||
let _ = sched.run().await;
|
||||
|
|
@ -386,6 +409,7 @@ mod tests {
|
|||
.send(SchedItem {
|
||||
spec: make_spec("a"),
|
||||
ctx: make_ctx(),
|
||||
original: None,
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue