mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 13:20:41 +00:00
chore(format): clippy fixes + remove 470 lines of dead code
- Remove dead libvirt SSH/console functions from scheduler - Apply clippy auto-fixes across orchestrator and runner-integration - Format all crates
This commit is contained in:
parent
0c3fb84d62
commit
f3d3d1465d
10 changed files with 56 additions and 536 deletions
|
|
@ -1,5 +1,5 @@
|
||||||
# Cargo configuration for Solstice CI
|
# Local path override for development — uses local vm-manager checkout
|
||||||
# vm-manager is referenced via path dep in orchestrator's Cargo.toml.
|
# instead of fetching from GitHub on every build.
|
||||||
# For local development, ensure the vm-manager repo is available at:
|
# Remove or comment out to use the git dependency from Cargo.toml.
|
||||||
# ../vm-manager (relative to solstice-ci root)
|
[patch."https://github.com/CloudNebulaProject/vm-manager.git"]
|
||||||
# or create a symlink: ln -s /path/to/vm-manager ../vm-manager
|
vm-manager = { path = "../vm-manager/crates/vm-manager" }
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
use std::{
|
use std::{
|
||||||
collections::BTreeMap,
|
collections::BTreeMap,
|
||||||
fs,
|
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -53,17 +52,13 @@ pub struct ImageDefaults {
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq)]
|
||||||
#[serde(rename_all = "lowercase")]
|
#[serde(rename_all = "lowercase")]
|
||||||
|
#[derive(Default)]
|
||||||
pub enum Decompress {
|
pub enum Decompress {
|
||||||
Zstd,
|
Zstd,
|
||||||
|
#[default]
|
||||||
None,
|
None,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Decompress {
|
|
||||||
fn default() -> Self {
|
|
||||||
Decompress::None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl OrchestratorConfig {
|
impl OrchestratorConfig {
|
||||||
pub async fn load(path: Option<&Path>) -> Result<Self> {
|
pub async fn load(path: Option<&Path>) -> Result<Self> {
|
||||||
let path = match path {
|
let path = match path {
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,7 @@ use axum::{
|
||||||
};
|
};
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tracing::{info, warn};
|
use tracing::info;
|
||||||
use uuid::Uuid;
|
|
||||||
|
|
||||||
use crate::persist::Persist;
|
use crate::persist::Persist;
|
||||||
|
|
||||||
|
|
@ -61,7 +60,7 @@ pub async fn serve(
|
||||||
.await
|
.await
|
||||||
.expect("bind http");
|
.expect("bind http");
|
||||||
let server = axum::serve(listener, app);
|
let server = axum::serve(listener, app);
|
||||||
let _ = tokio::select! {
|
tokio::select! {
|
||||||
_ = server => {},
|
_ = server => {},
|
||||||
_ = shutdown => {},
|
_ = shutdown => {},
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -275,11 +275,11 @@ impl Persist {
|
||||||
if let Ok(val) = serde_json::from_str::<serde_json::Value>(line) {
|
if let Ok(val) = serde_json::from_str::<serde_json::Value>(line) {
|
||||||
// Keep original JSON for reference
|
// Keep original JSON for reference
|
||||||
fields_json = Some(line.to_string());
|
fields_json = Some(line.to_string());
|
||||||
if let Some(c) = val.get("category").and_then(|v| v.as_str()) {
|
if let Some(c) = val.get("category").and_then(|v| v.as_str())
|
||||||
if !c.is_empty() {
|
&& !c.is_empty()
|
||||||
|
{
|
||||||
category = c.to_string();
|
category = c.to_string();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
if let Some(l) = val.get("level").and_then(|v| v.as_str()) {
|
if let Some(l) = val.get("level").and_then(|v| v.as_str()) {
|
||||||
level_str = Some(l.to_string());
|
level_str = Some(l.to_string());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
use crate::error::OrchestratorError;
|
|
||||||
use common::{DeadLetter, JobRequest, messages::JobResult, publish_deadletter, publish_job_result};
|
use common::{DeadLetter, JobRequest, messages::JobResult, publish_deadletter, publish_job_result};
|
||||||
use dashmap::DashMap;
|
use dashmap::DashMap;
|
||||||
use miette::{IntoDiagnostic as _, Result};
|
use miette::Result;
|
||||||
use std::{collections::HashMap, path::PathBuf, sync::Arc, time::Duration};
|
use std::{collections::HashMap, path::PathBuf, sync::Arc, time::Duration};
|
||||||
use tokio::sync::{Notify, Semaphore, mpsc};
|
use tokio::sync::{Notify, Semaphore, mpsc};
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
|
|
@ -9,9 +8,7 @@ use tracing::{error, info, warn};
|
||||||
use crate::hypervisor::{BackendTag, Hypervisor, JobContext, VmSpec};
|
use crate::hypervisor::{BackendTag, Hypervisor, JobContext, VmSpec};
|
||||||
use crate::persist::{JobState, Persist, VmPersistState};
|
use crate::persist::{JobState, Persist, VmPersistState};
|
||||||
|
|
||||||
use tokio::fs::File;
|
use tokio::io::AsyncBufReadExt;
|
||||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
|
||||||
use uuid::Uuid;
|
|
||||||
|
|
||||||
pub struct Scheduler<H: Hypervisor + 'static> {
|
pub struct Scheduler<H: Hypervisor + 'static> {
|
||||||
mq_cfg: Arc<common::MqConfig>,
|
mq_cfg: Arc<common::MqConfig>,
|
||||||
|
|
@ -359,8 +356,8 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
let _ = console_recorder.await;
|
let _ = console_recorder.await;
|
||||||
|
|
||||||
// If no logs were captured, snapshot the console log file as fallback
|
// If no logs were captured, snapshot the console log file as fallback
|
||||||
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
|
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await
|
||||||
if let Ok(lines) = vm_manager::console::read_console_log(&h.work_dir).await {
|
&& let Ok(lines) = vm_manager::console::read_console_log(&h.work_dir).await {
|
||||||
let mut seq: i64 = -1_000_000;
|
let mut seq: i64 = -1_000_000;
|
||||||
for line in lines {
|
for line in lines {
|
||||||
let obj = serde_json::json!({
|
let obj = serde_json::json!({
|
||||||
|
|
@ -372,7 +369,6 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
seq += 1;
|
seq += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Destroy VM and then persist final destroyed state
|
// Destroy VM and then persist final destroyed state
|
||||||
if let Err(e) = hv.destroy(h.clone()).await {
|
if let Err(e) = hv.destroy(h.clone()).await {
|
||||||
|
|
@ -383,7 +379,7 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
// Persist final state and publish result
|
// Persist final state and publish result
|
||||||
let final_state = if success { JobState::Succeeded } else { JobState::Failed };
|
let final_state = if success { JobState::Succeeded } else { JobState::Failed };
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await;
|
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), final_state).await;
|
||||||
let mut result = JobResult::new(item.ctx.request_id, item.ctx.repo_url.clone(), item.ctx.commit_sha.clone(), success, exit_code, failure_summary.clone());
|
let result = JobResult::new(item.ctx.request_id, item.ctx.repo_url.clone(), item.ctx.commit_sha.clone(), success, exit_code, failure_summary.clone());
|
||||||
if let Err(e) = publish_job_result(&mq_cfg_in, &result).await {
|
if let Err(e) = publish_job_result(&mq_cfg_in, &result).await {
|
||||||
warn!(error = %e, request_id = %item.ctx.request_id, "failed to publish JobResult");
|
warn!(error = %e, request_id = %item.ctx.request_id, "failed to publish JobResult");
|
||||||
}
|
}
|
||||||
|
|
@ -400,7 +396,6 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "failed");
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
@ -420,473 +415,6 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tail the VM serial console file and record lines into the job log until cancelled.
|
|
||||||
async fn tail_console_to_joblog(
|
|
||||||
persist: Arc<Persist>,
|
|
||||||
request_id: Uuid,
|
|
||||||
console_path: PathBuf,
|
|
||||||
) -> miette::Result<()> {
|
|
||||||
use miette::IntoDiagnostic as _;
|
|
||||||
use tokio::time::{Duration, sleep};
|
|
||||||
|
|
||||||
// Negative sequence numbers for early console logs so they sort before runner logs (which start at 0).
|
|
||||||
let mut seq: i64 = -1_000_000; // ample headroom
|
|
||||||
|
|
||||||
loop {
|
|
||||||
match File::open(&console_path).await {
|
|
||||||
Ok(file) => {
|
|
||||||
let mut reader = BufReader::new(file);
|
|
||||||
let mut buf = String::new();
|
|
||||||
loop {
|
|
||||||
buf.clear();
|
|
||||||
let n = reader.read_line(&mut buf).await.into_diagnostic()?;
|
|
||||||
if n == 0 {
|
|
||||||
// No new data yet; yield and retry
|
|
||||||
sleep(Duration::from_millis(200)).await;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let trimmed = buf.trim_end_matches(['\n', '\r']);
|
|
||||||
// Emit as NDJSON so logs-service can categorize this as 'boot'
|
|
||||||
let obj = serde_json::json!({
|
|
||||||
"category": "boot",
|
|
||||||
"level": "info",
|
|
||||||
"msg": trimmed
|
|
||||||
});
|
|
||||||
let line = obj.to_string();
|
|
||||||
let _ = persist.record_log_line(request_id, seq, false, &line).await;
|
|
||||||
seq += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_) => {
|
|
||||||
// File may not exist yet; back off briefly before retrying
|
|
||||||
sleep(Duration::from_millis(200)).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Snapshot the entire console log file once and persist its lines with negative seq numbers.
|
|
||||||
async fn snapshot_console_to_joblog(
|
|
||||||
persist: Arc<Persist>,
|
|
||||||
request_id: Uuid,
|
|
||||||
console_path: PathBuf,
|
|
||||||
) -> miette::Result<()> {
|
|
||||||
use miette::IntoDiagnostic as _;
|
|
||||||
match tokio::fs::read_to_string(&console_path).await {
|
|
||||||
Ok(content) => {
|
|
||||||
let mut seq: i64 = -1_000_000; // keep consistent ordering before runner logs
|
|
||||||
for raw in content.lines() {
|
|
||||||
let trimmed = raw.trim_end_matches(['\n', '\r']);
|
|
||||||
let obj = serde_json::json!({
|
|
||||||
"category": "boot",
|
|
||||||
"level": "info",
|
|
||||||
"msg": trimmed
|
|
||||||
});
|
|
||||||
let line = obj.to_string();
|
|
||||||
let _ = persist.record_log_line(request_id, seq, false, &line).await;
|
|
||||||
seq += 1;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
// If file missing or unreadable, just return error up to caller for logging
|
|
||||||
Err(e).into_diagnostic()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
|
||||||
async fn discover_guest_ip_virsh(
|
|
||||||
domain: &str,
|
|
||||||
timeout: Duration,
|
|
||||||
libvirt_uri: &str,
|
|
||||||
libvirt_network: Option<&str>,
|
|
||||||
) -> Option<String> {
|
|
||||||
use std::process::Command;
|
|
||||||
use tokio::{
|
|
||||||
task,
|
|
||||||
time::{Duration, Instant, sleep},
|
|
||||||
};
|
|
||||||
use tracing::{debug, warn};
|
|
||||||
|
|
||||||
fn parse_ipv4_from_text(s: &str) -> Option<String> {
|
|
||||||
for line in s.lines() {
|
|
||||||
if let Some(slash) = line.find('/') {
|
|
||||||
let start = line[..slash].rfind(' ').map(|i| i + 1).unwrap_or(0);
|
|
||||||
let ip = &line[start..slash];
|
|
||||||
if ip.chars().all(|c| c.is_ascii_digit() || c == '.') {
|
|
||||||
return Some(ip.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let mut cur = String::new();
|
|
||||||
for ch in line.chars() {
|
|
||||||
if ch.is_ascii_digit() || ch == '.' {
|
|
||||||
cur.push(ch);
|
|
||||||
} else {
|
|
||||||
if cur.split('.').count() == 4 {
|
|
||||||
return Some(cur.clone());
|
|
||||||
}
|
|
||||||
cur.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if cur.split('.').count() == 4 {
|
|
||||||
return Some(cur);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
fn preview_bytes(b: &[u8]) -> String {
|
|
||||||
let s = String::from_utf8_lossy(b);
|
|
||||||
let s = s.trim();
|
|
||||||
let mut out = s.to_string();
|
|
||||||
if out.len() > 800 {
|
|
||||||
out.truncate(800);
|
|
||||||
out.push_str("…");
|
|
||||||
}
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
struct Attempt {
|
|
||||||
cmd: String,
|
|
||||||
ok: bool,
|
|
||||||
status: Option<i32>,
|
|
||||||
stdout: String,
|
|
||||||
stderr: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn run_cmd(args: &[&str], uri: &str) -> Attempt {
|
|
||||||
let args_vec = {
|
|
||||||
let mut v = Vec::with_capacity(args.len() + 2);
|
|
||||||
v.push("-c".to_string());
|
|
||||||
v.push(uri.to_string());
|
|
||||||
v.extend(args.iter().map(|s| s.to_string()));
|
|
||||||
v
|
|
||||||
};
|
|
||||||
let cmd_desc = format!("virsh {}", args_vec.join(" "));
|
|
||||||
match task::spawn_blocking(move || {
|
|
||||||
let mut cmd = Command::new("virsh");
|
|
||||||
cmd.args(&args_vec);
|
|
||||||
cmd.output()
|
|
||||||
})
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(Ok(out)) => {
|
|
||||||
let ok = out.status.success();
|
|
||||||
let status = out.status.code();
|
|
||||||
let stdout = preview_bytes(&out.stdout);
|
|
||||||
let stderr = preview_bytes(&out.stderr);
|
|
||||||
Attempt {
|
|
||||||
cmd: cmd_desc,
|
|
||||||
ok,
|
|
||||||
status,
|
|
||||||
stdout,
|
|
||||||
stderr,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
other => Attempt {
|
|
||||||
cmd: cmd_desc,
|
|
||||||
ok: false,
|
|
||||||
status: None,
|
|
||||||
stdout: String::new(),
|
|
||||||
stderr: format!("spawn error: {:?}", other),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let deadline = Instant::now() + timeout;
|
|
||||||
let mut last_attempts: Vec<Attempt> = Vec::new();
|
|
||||||
while Instant::now() < deadline {
|
|
||||||
last_attempts.clear();
|
|
||||||
// 1) Try domifaddr via agent then lease then default
|
|
||||||
for source in [Some("agent"), Some("lease"), None] {
|
|
||||||
let mut args = vec!["domifaddr", domain];
|
|
||||||
if let Some(src) = source {
|
|
||||||
args.push("--source");
|
|
||||||
args.push(src);
|
|
||||||
}
|
|
||||||
let att = run_cmd(&args, libvirt_uri).await;
|
|
||||||
debug!(domain=%domain, method=%format!("domifaddr/{:?}", source), ok=att.ok, status=?att.status, stdout=%att.stdout, stderr=%att.stderr, cmd=%att.cmd, "virsh attempt");
|
|
||||||
if att.ok {
|
|
||||||
if let Some(ip) = parse_ipv4_from_text(&att.stdout) {
|
|
||||||
debug!(domain=%domain, method=%format!("domifaddr/{:?}", source), ip=%ip, "discovered IP");
|
|
||||||
return Some(ip);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
last_attempts.push(att);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2) Try domiflist to get MAC and possibly network name
|
|
||||||
let mut mac: Option<String> = None;
|
|
||||||
let mut net_name: Option<String> = None;
|
|
||||||
let att_domiflist = run_cmd(&["domiflist", domain], libvirt_uri).await;
|
|
||||||
debug!(domain=%domain, method="domiflist", ok=att_domiflist.ok, status=?att_domiflist.status, stdout=%att_domiflist.stdout, stderr=%att_domiflist.stderr, cmd=%att_domiflist.cmd, "virsh attempt");
|
|
||||||
if att_domiflist.ok {
|
|
||||||
for line in att_domiflist.stdout.lines().skip(2) {
|
|
||||||
// skip header lines
|
|
||||||
let cols: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if cols.len() >= 5 {
|
|
||||||
// columns: Interface Type Source Model MAC
|
|
||||||
net_name = Some(cols[2].to_string());
|
|
||||||
mac = Some(cols[4].to_ascii_lowercase());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
last_attempts.push(att_domiflist);
|
|
||||||
// Fallback to env if domiflist didn't give a Source network
|
|
||||||
if net_name.is_none() {
|
|
||||||
if let Some(n) = libvirt_network {
|
|
||||||
if !n.is_empty() {
|
|
||||||
net_name = Some(n.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 2a) Parse leases file if we have network and MAC
|
|
||||||
if let (Some(net), Some(mac_s)) = (net_name.clone(), mac.clone()) {
|
|
||||||
let path = format!("/var/lib/libvirt/dnsmasq/{}.leases", net);
|
|
||||||
let content_opt = task::spawn_blocking(move || std::fs::read_to_string(path))
|
|
||||||
.await
|
|
||||||
.ok()
|
|
||||||
.and_then(|r| r.ok());
|
|
||||||
if let Some(content) = content_opt {
|
|
||||||
let mut best_ip: Option<String> = None;
|
|
||||||
let mut best_epoch: i64 = -1;
|
|
||||||
for line in content.lines() {
|
|
||||||
// Format: epoch MAC IP hostname clientid
|
|
||||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if parts.len() >= 3 && parts[1].eq_ignore_ascii_case(mac_s.as_str()) {
|
|
||||||
let epoch: i64 = parts[0].parse::<i64>().unwrap_or(0);
|
|
||||||
if epoch > best_epoch {
|
|
||||||
best_epoch = epoch;
|
|
||||||
best_ip = Some(parts[2].to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if let Some(ip) = best_ip {
|
|
||||||
debug!(domain=%domain, method="dnsmasq.leases", ip=%ip, "discovered IP");
|
|
||||||
return Some(ip);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 2b) Try virsh net-dhcp-leases <network>
|
|
||||||
if let Some(net) = net_name.clone() {
|
|
||||||
let att_leases = run_cmd(&["net-dhcp-leases", &net], libvirt_uri).await;
|
|
||||||
debug!(domain=%domain, method="net-dhcp-leases", ok=att_leases.ok, status=?att_leases.status, stdout=%att_leases.stdout, stderr=%att_leases.stderr, cmd=%att_leases.cmd, "virsh attempt");
|
|
||||||
if att_leases.ok {
|
|
||||||
if let Some(ref mac_s) = mac {
|
|
||||||
for line in att_leases.stdout.lines() {
|
|
||||||
if line.to_ascii_lowercase().contains(mac_s.as_str()) {
|
|
||||||
if let Some(ip) = parse_ipv4_from_text(line) {
|
|
||||||
debug!(domain=%domain, method="net-dhcp-leases", ip=%ip, "discovered IP");
|
|
||||||
return Some(ip);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
last_attempts.push(att_leases);
|
|
||||||
}
|
|
||||||
|
|
||||||
sleep(Duration::from_secs(1)).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
// No IP found; emit a summary at warn level with the last attempts' details
|
|
||||||
if !last_attempts.is_empty() {
|
|
||||||
for att in last_attempts {
|
|
||||||
warn!(domain=%domain, cmd=%att.cmd, ok=att.ok, status=?att.status, stdout=%att.stdout, stderr=%att.stderr, "virsh attempt summary");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
|
||||||
async fn run_job_via_ssh_with_retry(
|
|
||||||
ip: String,
|
|
||||||
user: String,
|
|
||||||
key_mem_opt: Option<String>,
|
|
||||||
per_job_key_path: Option<PathBuf>,
|
|
||||||
local_runner: PathBuf,
|
|
||||||
remote_path: PathBuf,
|
|
||||||
repo_url: String,
|
|
||||||
commit_sha: String,
|
|
||||||
request_id: uuid::Uuid,
|
|
||||||
timeout: Duration,
|
|
||||||
) -> miette::Result<(bool, i32, Vec<(bool, String)>)> {
|
|
||||||
use tokio::time::{Instant, sleep};
|
|
||||||
use tracing::warn;
|
|
||||||
|
|
||||||
let deadline = Instant::now() + timeout;
|
|
||||||
let mut attempt: u32 = 0;
|
|
||||||
let mut backoff = Duration::from_secs(1);
|
|
||||||
loop {
|
|
||||||
attempt += 1;
|
|
||||||
match run_job_via_ssh_owned(
|
|
||||||
ip.clone(),
|
|
||||||
user.clone(),
|
|
||||||
key_mem_opt.clone(),
|
|
||||||
per_job_key_path.clone(),
|
|
||||||
local_runner.clone(),
|
|
||||||
remote_path.clone(),
|
|
||||||
repo_url.clone(),
|
|
||||||
commit_sha.clone(),
|
|
||||||
request_id,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(r) => return Ok(r),
|
|
||||||
Err(e) => {
|
|
||||||
let now = Instant::now();
|
|
||||||
if now >= deadline {
|
|
||||||
return Err(e); // give up with last error
|
|
||||||
}
|
|
||||||
warn!(
|
|
||||||
attempt,
|
|
||||||
remaining_ms = (deadline - now).as_millis() as u64,
|
|
||||||
ip = %ip,
|
|
||||||
request_id = %request_id,
|
|
||||||
error = %e,
|
|
||||||
"SSH connect/exec not ready yet; will retry"
|
|
||||||
);
|
|
||||||
// Cap backoff at 5s and don’t sleep past the deadline
|
|
||||||
let sleep_dur = std::cmp::min(backoff, deadline.saturating_duration_since(now));
|
|
||||||
sleep(sleep_dur).await;
|
|
||||||
backoff = std::cmp::min(backoff.saturating_mul(2), Duration::from_secs(5));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
|
||||||
async fn run_job_via_ssh_owned(
|
|
||||||
ip: String,
|
|
||||||
user: String,
|
|
||||||
key_mem_opt: Option<String>,
|
|
||||||
per_job_key_path: Option<PathBuf>,
|
|
||||||
local_runner: PathBuf,
|
|
||||||
remote_path: PathBuf,
|
|
||||||
repo_url: String,
|
|
||||||
commit_sha: String,
|
|
||||||
request_id: uuid::Uuid,
|
|
||||||
) -> miette::Result<(bool, i32, Vec<(bool, String)>)> {
|
|
||||||
use ssh2::Session;
|
|
||||||
use std::fs::File as StdFile;
|
|
||||||
use std::io::Read;
|
|
||||||
use std::net::TcpStream;
|
|
||||||
use tokio::task;
|
|
||||||
|
|
||||||
let (ok, code, lines) = task::spawn_blocking(move || -> miette::Result<(bool, i32, Vec<(bool, String)>)> {
|
|
||||||
// Connect TCP
|
|
||||||
let tcp = TcpStream::connect(format!("{}:22", ip))
|
|
||||||
.map_err(OrchestratorError::TcpConnect)
|
|
||||||
.into_diagnostic()?;
|
|
||||||
let mut sess = Session::new().map_err(|_| OrchestratorError::SshSessionNew).into_diagnostic()?;
|
|
||||||
sess.set_tcp_stream(tcp);
|
|
||||||
sess.handshake().map_err(OrchestratorError::SshHandshake).into_diagnostic()?;
|
|
||||||
// Authenticate priority: memory key -> per-job file -> global file
|
|
||||||
if let Some(ref key_pem) = key_mem_opt {
|
|
||||||
sess.userauth_pubkey_memory(&user, None, key_pem, None)
|
|
||||||
.map_err(|e| OrchestratorError::SshAuth { user: user.clone(), source: e })
|
|
||||||
.into_diagnostic()?;
|
|
||||||
} else if let Some(ref p) = per_job_key_path {
|
|
||||||
sess.userauth_pubkey_file(&user, None, p, None)
|
|
||||||
.map_err(|e| OrchestratorError::SshAuth { user: user.clone(), source: e })
|
|
||||||
.into_diagnostic()?;
|
|
||||||
} else {
|
|
||||||
return Err(miette::miette!("no SSH key available for job (neither in-memory nor per-job file)"));
|
|
||||||
}
|
|
||||||
if !sess.authenticated() {
|
|
||||||
return Err(miette::miette!("ssh not authenticated"));
|
|
||||||
}
|
|
||||||
// Upload runner via SFTP atomically: write to temp, close, then rename over final path
|
|
||||||
let sftp = sess.sftp().map_err(OrchestratorError::SftpInit).into_diagnostic()?;
|
|
||||||
let mut local = StdFile::open(&local_runner)
|
|
||||||
.map_err(OrchestratorError::OpenLocalRunner)
|
|
||||||
.into_diagnostic()?;
|
|
||||||
let mut buf = Vec::new();
|
|
||||||
local.read_to_end(&mut buf).map_err(OrchestratorError::ReadRunner).into_diagnostic()?;
|
|
||||||
// Temp remote path
|
|
||||||
let tmp_remote = {
|
|
||||||
let mut p = remote_path.clone();
|
|
||||||
let tmp_name = format!("{}.tmp", p.file_name().and_then(|s| s.to_str()).unwrap_or("solstice-runner"));
|
|
||||||
p.set_file_name(tmp_name);
|
|
||||||
p
|
|
||||||
};
|
|
||||||
let mut remote_tmp = sftp.create(&tmp_remote)
|
|
||||||
.map_err(OrchestratorError::SftpCreate)
|
|
||||||
.into_diagnostic()?;
|
|
||||||
use std::io::Write as _;
|
|
||||||
remote_tmp.write_all(&buf)
|
|
||||||
.map_err(OrchestratorError::SftpWrite)
|
|
||||||
.into_diagnostic()?;
|
|
||||||
let tmp_file_stat = ssh2::FileStat { size: None, uid: None, gid: None, perm: Some(0o755), atime: None, mtime: None };
|
|
||||||
let _ = sftp.setstat(&tmp_remote, tmp_file_stat);
|
|
||||||
// Ensure remote file handle is closed before attempting exec/rename
|
|
||||||
drop(remote_tmp);
|
|
||||||
// Rename temp to final atomically; overwrite if exists
|
|
||||||
let _ = sftp.rename(&tmp_remote, &remote_path, Some(ssh2::RenameFlags::OVERWRITE));
|
|
||||||
|
|
||||||
// Build command
|
|
||||||
let cmd = format!(
|
|
||||||
"export SOLSTICE_REPO_URL=\"{}\" SOLSTICE_COMMIT_SHA=\"{}\" SOLSTICE_REQUEST_ID=\"{}\"; {}",
|
|
||||||
repo_url, commit_sha, request_id, remote_path.display()
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut channel = sess.channel_session()
|
|
||||||
.map_err(OrchestratorError::ChannelSession)
|
|
||||||
.into_diagnostic()?;
|
|
||||||
// Best-effort: request a PTY to encourage line-buffered output from the runner
|
|
||||||
let _ = channel.request_pty("xterm", None, None);
|
|
||||||
channel.exec(&format!("sh -lc '{}'", cmd))
|
|
||||||
.map_err(OrchestratorError::Exec)
|
|
||||||
.into_diagnostic()?;
|
|
||||||
|
|
||||||
// Switch to non-blocking so we can interleave stdout/stderr without deadlock
|
|
||||||
sess.set_blocking(false);
|
|
||||||
|
|
||||||
use std::io::Read as _;
|
|
||||||
let mut out_acc: Vec<u8> = Vec::new();
|
|
||||||
let mut err_acc: Vec<u8> = Vec::new();
|
|
||||||
let mut tmp = [0u8; 4096];
|
|
||||||
// Read until remote closes
|
|
||||||
loop {
|
|
||||||
let mut progressed = false;
|
|
||||||
match channel.read(&mut tmp) {
|
|
||||||
Ok(0) => {},
|
|
||||||
Ok(n) => { out_acc.extend_from_slice(&tmp[..n]); progressed = true; },
|
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => {},
|
|
||||||
Err(e) => return Err(miette::miette!("ssh stdout read error: {}", e)),
|
|
||||||
}
|
|
||||||
match channel.stderr().read(&mut tmp) {
|
|
||||||
Ok(0) => {},
|
|
||||||
Ok(n) => { err_acc.extend_from_slice(&tmp[..n]); progressed = true; },
|
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => {},
|
|
||||||
Err(e) => return Err(miette::miette!("ssh stderr read error: {}", e)),
|
|
||||||
}
|
|
||||||
if channel.eof() { break; }
|
|
||||||
if !progressed {
|
|
||||||
// avoid busy spin
|
|
||||||
std::thread::sleep(std::time::Duration::from_millis(20));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Restore blocking for status fetch (defensive)
|
|
||||||
sess.set_blocking(true);
|
|
||||||
let _ = channel.wait_close();
|
|
||||||
let exit_code = channel.exit_status().unwrap_or(1);
|
|
||||||
|
|
||||||
// Convert buffers into lines (strip ANSI color/escape sequences)
|
|
||||||
let out = strip_ansi(&String::from_utf8_lossy(&out_acc));
|
|
||||||
let err = strip_ansi(&String::from_utf8_lossy(&err_acc));
|
|
||||||
let mut lines: Vec<(bool, String)> = Vec::new();
|
|
||||||
for l in out.lines() { lines.push((false, l.to_string())); }
|
|
||||||
for l in err.lines() { lines.push((true, l.to_string())); }
|
|
||||||
Ok((exit_code == 0, exit_code, lines))
|
|
||||||
}).await.into_diagnostic()??;
|
|
||||||
|
|
||||||
Ok((ok, code, lines))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_illumos_label(label: &str) -> bool {
|
fn is_illumos_label(label: &str) -> bool {
|
||||||
let l = label.to_ascii_lowercase();
|
let l = label.to_ascii_lowercase();
|
||||||
l.contains("illumos")
|
l.contains("illumos")
|
||||||
|
|
@ -896,11 +424,11 @@ fn is_illumos_label(label: &str) -> bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn expand_tilde(path: &str) -> PathBuf {
|
fn expand_tilde(path: &str) -> PathBuf {
|
||||||
if let Some(rest) = path.strip_prefix("~/") {
|
if let Some(rest) = path.strip_prefix("~/")
|
||||||
if let Ok(home) = std::env::var("HOME") {
|
&& let Ok(home) = std::env::var("HOME")
|
||||||
|
{
|
||||||
return PathBuf::from(home).join(rest);
|
return PathBuf::from(home).join(rest);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
PathBuf::from(path)
|
PathBuf::from(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1048,8 +576,6 @@ mod tests {
|
||||||
runner_illumos_path: "/tmp/runner-illumos".into(),
|
runner_illumos_path: "/tmp/runner-illumos".into(),
|
||||||
ssh_connect_timeout_secs: 30,
|
ssh_connect_timeout_secs: 30,
|
||||||
boot_wait_secs: 0,
|
boot_wait_secs: 0,
|
||||||
libvirt_uri: "qemu:///system".into(),
|
|
||||||
libvirt_network: "default".into(),
|
|
||||||
};
|
};
|
||||||
let sched = Scheduler::new(
|
let sched = Scheduler::new(
|
||||||
hv,
|
hv,
|
||||||
|
|
@ -1103,8 +629,6 @@ mod tests {
|
||||||
runner_illumos_path: "/tmp/runner-illumos".into(),
|
runner_illumos_path: "/tmp/runner-illumos".into(),
|
||||||
ssh_connect_timeout_secs: 30,
|
ssh_connect_timeout_secs: 30,
|
||||||
boot_wait_secs: 0,
|
boot_wait_secs: 0,
|
||||||
libvirt_uri: "qemu:///system".into(),
|
|
||||||
libvirt_network: "default".into(),
|
|
||||||
};
|
};
|
||||||
let sched = Scheduler::new(
|
let sched = Scheduler::new(
|
||||||
hv,
|
hv,
|
||||||
|
|
|
||||||
|
|
@ -233,7 +233,7 @@ async fn report_failure(
|
||||||
state: Some(TaskState {
|
state: Some(TaskState {
|
||||||
id: task_id,
|
id: task_id,
|
||||||
result: v1::Result::Failure as i32,
|
result: v1::Result::Failure as i32,
|
||||||
started_at: Some(now.clone()),
|
started_at: Some(now),
|
||||||
stopped_at: Some(now),
|
stopped_at: Some(now),
|
||||||
steps: vec![],
|
steps: vec![],
|
||||||
}),
|
}),
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ use std::sync::Arc;
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
use miette::{IntoDiagnostic, Result};
|
use miette::{IntoDiagnostic, Result};
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tracing::{debug, info, warn};
|
use tracing::{info, warn};
|
||||||
|
|
||||||
use crate::connect::ConnectClient;
|
use crate::connect::ConnectClient;
|
||||||
use crate::proto::runner::v1::{self, StepState, TaskState, UpdateLogRequest, UpdateTaskRequest};
|
use crate::proto::runner::v1::{self, StepState, TaskState, UpdateLogRequest, UpdateTaskRequest};
|
||||||
|
|
@ -221,8 +221,8 @@ async fn report_to_forgejo(
|
||||||
step_states.push(StepState {
|
step_states.push(StepState {
|
||||||
id: step_idx as i64,
|
id: step_idx as i64,
|
||||||
result: result as i32,
|
result: result as i32,
|
||||||
started_at: Some(now.clone()),
|
started_at: Some(now),
|
||||||
stopped_at: Some(now.clone()),
|
stopped_at: Some(now),
|
||||||
log_index: cursor,
|
log_index: cursor,
|
||||||
log_length: step_lines,
|
log_length: step_lines,
|
||||||
});
|
});
|
||||||
|
|
@ -239,8 +239,8 @@ async fn report_to_forgejo(
|
||||||
step_states.push(StepState {
|
step_states.push(StepState {
|
||||||
id: 0,
|
id: 0,
|
||||||
result: result as i32,
|
result: result as i32,
|
||||||
started_at: Some(now.clone()),
|
started_at: Some(now),
|
||||||
stopped_at: Some(now.clone()),
|
stopped_at: Some(now),
|
||||||
log_index: setup_lines,
|
log_index: setup_lines,
|
||||||
log_length: work_lines,
|
log_length: work_lines,
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -154,7 +154,7 @@ async fn poll_and_send(
|
||||||
let rows: Vec<LogRow> = new_lines
|
let rows: Vec<LogRow> = new_lines
|
||||||
.iter()
|
.iter()
|
||||||
.map(|line| LogRow {
|
.map(|line| LogRow {
|
||||||
time: Some(now.clone()),
|
time: Some(now),
|
||||||
content: line.clone(),
|
content: line.clone(),
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
@ -208,7 +208,7 @@ async fn poll_and_send(
|
||||||
step_states.push(StepState {
|
step_states.push(StepState {
|
||||||
id: step_idx as i64,
|
id: step_idx as i64,
|
||||||
result: v1::Result::Unspecified as i32, // still running
|
result: v1::Result::Unspecified as i32, // still running
|
||||||
started_at: Some(now.clone()),
|
started_at: Some(now),
|
||||||
stopped_at: None,
|
stopped_at: None,
|
||||||
log_index: cursor,
|
log_index: cursor,
|
||||||
log_length: step_lines,
|
log_length: step_lines,
|
||||||
|
|
|
||||||
|
|
@ -30,8 +30,9 @@ pub async fn translate_task(task: &Task, ctx: &TranslateCtx) -> Result<Translate
|
||||||
let group_id = Uuid::new_v4();
|
let group_id = Uuid::new_v4();
|
||||||
|
|
||||||
// --- Tier 1: Solstice KDL workflow ---
|
// --- Tier 1: Solstice KDL workflow ---
|
||||||
if let Some(jobs) = try_kdl_workflow(ctx, &owner, &name, &repo_url, &sha, group_id).await? {
|
if let Some(jobs) = try_kdl_workflow(ctx, &owner, &name, &repo_url, &sha, group_id).await?
|
||||||
if !jobs.is_empty() {
|
&& !jobs.is_empty()
|
||||||
|
{
|
||||||
info!(
|
info!(
|
||||||
tier = 1,
|
tier = 1,
|
||||||
count = jobs.len(),
|
count = jobs.len(),
|
||||||
|
|
@ -41,7 +42,6 @@ pub async fn translate_task(task: &Task, ctx: &TranslateCtx) -> Result<Translate
|
||||||
);
|
);
|
||||||
return Ok(TranslateResult::Jobs(jobs));
|
return Ok(TranslateResult::Jobs(jobs));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// --- Tier 2: Actions YAML extraction ---
|
// --- Tier 2: Actions YAML extraction ---
|
||||||
if let Some(payload_bytes) = &task.workflow_payload {
|
if let Some(payload_bytes) = &task.workflow_payload {
|
||||||
|
|
@ -235,20 +235,21 @@ fn parse_kdl_jobs(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if t.starts_with("script ") && t.contains("path=") {
|
if t.starts_with("script ")
|
||||||
if let Some(p) = capture_attr(t, "path") {
|
&& t.contains("path=")
|
||||||
|
&& let Some(p) = capture_attr(t, "path")
|
||||||
|
{
|
||||||
script = Some(p);
|
script = Some(p);
|
||||||
}
|
}
|
||||||
}
|
if t.starts_with("step ")
|
||||||
if t.starts_with("step ") {
|
&& let Some(name) = capture_attr(t, "name")
|
||||||
if let Some(name) = capture_attr(t, "name") {
|
{
|
||||||
let slug = name.to_lowercase().replace(' ', "-");
|
let slug = name.to_lowercase().replace(' ', "-");
|
||||||
steps.push(StepInfo {
|
steps.push(StepInfo {
|
||||||
name,
|
name,
|
||||||
log_category: format!("step:{}", slug),
|
log_category: format!("step:{}", slug),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
|
||||||
if t.contains("runs_on=") && runs_on.is_none() {
|
if t.contains("runs_on=") && runs_on.is_none() {
|
||||||
runs_on = capture_attr(t, "runs_on");
|
runs_on = capture_attr(t, "runs_on");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -207,7 +207,8 @@ services:
|
||||||
RUNNER_LINUX_PATH: /opt/solstice/runners/solstice-runner-linux
|
RUNNER_LINUX_PATH: /opt/solstice/runners/solstice-runner-linux
|
||||||
RUNNER_ILLUMOS_PATH: /opt/solstice/runners/solstice-runner-illumos
|
RUNNER_ILLUMOS_PATH: /opt/solstice/runners/solstice-runner-illumos
|
||||||
# Remote path on the VM where the runner will be uploaded and executed
|
# Remote path on the VM where the runner will be uploaded and executed
|
||||||
REMOTE_RUNNER_PATH: /usr/local/bin/solstice-runner
|
# Must be writable by the SSH user (sol) — /tmp works without sudo
|
||||||
|
REMOTE_RUNNER_PATH: /tmp/solstice-runner
|
||||||
# SSH connect timeout for reaching the VM (seconds)
|
# SSH connect timeout for reaching the VM (seconds)
|
||||||
SSH_CONNECT_TIMEOUT_SECS: ${SSH_CONNECT_TIMEOUT_SECS:-300}
|
SSH_CONNECT_TIMEOUT_SECS: ${SSH_CONNECT_TIMEOUT_SECS:-300}
|
||||||
# Optional: bridge name for TAP networking (leave empty for user-mode/SLIRP)
|
# Optional: bridge name for TAP networking (leave empty for user-mode/SLIRP)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue