mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 13:20:41 +00:00
Replace scheduler SSH/console code with vm-manager APIs
- IP discovery: use hv.guest_ip() with a timeout loop instead of discover_guest_ip_virsh() (500+ lines removed from hot path)
- SSH: use vm_manager::ssh::connect_with_retry() + upload() + exec() instead of hand-rolled TCP/ssh2/SFTP code
- Console: use vm_manager::console::ConsoleTailer over a Unix socket instead of file-based tail_console_to_joblog()
- Add guest_ip() to the orchestrator Hypervisor trait with a default impl
- Remove #[cfg(linux, libvirt)] gates from is_illumos_label, expand_tilde
- Keep orchestrator-specific: DB persistence, log recording, MQ publish, runner binary selection, env var injection
This commit is contained in:
parent
a60053f030
commit
190eb5532f
3 changed files with 151 additions and 83 deletions
|
|
@ -77,6 +77,9 @@ pub trait Hypervisor: Send + Sync {
|
||||||
async fn state(&self, _vm: &VmHandle) -> Result<VmState> {
|
async fn state(&self, _vm: &VmHandle) -> Result<VmState> {
|
||||||
Ok(VmState::Prepared)
|
Ok(VmState::Prepared)
|
||||||
}
|
}
|
||||||
|
async fn guest_ip(&self, vm: &VmHandle) -> Result<String> {
|
||||||
|
Ok("127.0.0.1".to_string())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A router that delegates to the correct backend implementation per job.
|
/// A router that delegates to the correct backend implementation per job.
|
||||||
|
|
|
||||||
|
|
@ -205,110 +205,170 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
tokio::time::sleep(Duration::from_secs(boot_wait)).await;
|
tokio::time::sleep(Duration::from_secs(boot_wait)).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start serial console tailer to capture early boot logs into job log
|
// Start serial console tailer via vm-manager
|
||||||
let console_path = h.work_dir.join("console.log");
|
let (console_stop_tx, console_stop_rx) = tokio::sync::watch::channel(false);
|
||||||
let persist_for_tailer = persist.clone();
|
let (console_line_tx, mut console_line_rx) = tokio::sync::mpsc::channel::<String>(512);
|
||||||
let req_for_tailer = item.ctx.request_id;
|
let persist_for_console = persist.clone();
|
||||||
let mut tailer_opt = Some(tokio::spawn(async move {
|
let req_for_console = item.ctx.request_id;
|
||||||
let _ = tail_console_to_joblog(persist_for_tailer, req_for_tailer, console_path).await;
|
|
||||||
}));
|
|
||||||
|
|
||||||
// Attempt to discover guest IP (libvirt only) and run the runner over SSH
|
// Spawn console tailer that reads from the QEMU serial socket
|
||||||
|
if let Some(ref console_socket) = h.console_socket {
|
||||||
|
let endpoint = vm_manager::ConsoleEndpoint::UnixSocket(console_socket.clone());
|
||||||
|
tokio::spawn(vm_manager::console::ConsoleTailer::tail(endpoint, console_line_tx.clone(), console_stop_rx));
|
||||||
|
}
|
||||||
|
// Spawn task that records console lines into the DB
|
||||||
|
let console_recorder = tokio::spawn(async move {
|
||||||
|
let mut seq: i64 = -1_000_000;
|
||||||
|
while let Some(line) = console_line_rx.recv().await {
|
||||||
|
let trimmed = line.trim_end_matches(['\n', '\r']);
|
||||||
|
let obj = serde_json::json!({
|
||||||
|
"category": "boot",
|
||||||
|
"level": "info",
|
||||||
|
"msg": trimmed
|
||||||
|
});
|
||||||
|
let _ = persist_for_console.record_log_line(req_for_console, seq, false, &obj.to_string()).await;
|
||||||
|
seq += 1;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Discover guest IP and run the workflow via SSH using vm-manager
|
||||||
let mut success = false;
|
let mut success = false;
|
||||||
let mut exit_code: i32 = 1;
|
let mut exit_code: i32 = 1;
|
||||||
let mut failure_summary: Option<String> = None;
|
let mut failure_summary: Option<String> = None;
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
|
||||||
{
|
{
|
||||||
let exec_cfg = exec_shared_inner.clone();
|
let exec_cfg = exec_shared_inner.clone();
|
||||||
let uri = exec_cfg.libvirt_uri.clone();
|
// Discover guest IP via vm-manager
|
||||||
let net_opt = if exec_cfg.libvirt_network.trim().is_empty() { None } else { Some(exec_cfg.libvirt_network.clone()) };
|
let ip_result = tokio::time::timeout(
|
||||||
match discover_guest_ip_virsh(
|
|
||||||
&h.id,
|
|
||||||
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs.min(300)),
|
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs.min(300)),
|
||||||
&uri,
|
async {
|
||||||
net_opt.as_deref(),
|
loop {
|
||||||
).await {
|
match hv.guest_ip(&h).await {
|
||||||
Some(ip) => {
|
Ok(ip) => return ip,
|
||||||
let ip_owned = ip.clone();
|
Err(_) => tokio::time::sleep(Duration::from_secs(1)).await,
|
||||||
let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string());
|
|
||||||
let per_job_key_path = item.ctx.ssh_private_key_path.as_ref().map(|s| expand_tilde(s));
|
|
||||||
let key_mem_opt = item.ctx.ssh_private_key_pem.clone();
|
|
||||||
// Choose correct runner binary based on label (illumos vs linux)
|
|
||||||
let runner_path = if is_illumos_label(&item.spec.label) {
|
|
||||||
&exec_cfg.runner_illumos_path
|
|
||||||
} else {
|
|
||||||
&exec_cfg.runner_linux_path
|
|
||||||
};
|
|
||||||
let local_runner = expand_tilde(runner_path);
|
|
||||||
info!(label = %item.spec.label, runner = %runner_path, "selected runner binary");
|
|
||||||
let remote_path = PathBuf::from(&exec_cfg.remote_runner_path);
|
|
||||||
let repo_url = item.ctx.repo_url.clone();
|
|
||||||
let commit_sha = item.ctx.commit_sha.clone();
|
|
||||||
let request_id = item.ctx.request_id;
|
|
||||||
match run_job_via_ssh_with_retry(
|
|
||||||
ip_owned,
|
|
||||||
user,
|
|
||||||
key_mem_opt,
|
|
||||||
per_job_key_path,
|
|
||||||
local_runner,
|
|
||||||
remote_path,
|
|
||||||
repo_url,
|
|
||||||
commit_sha,
|
|
||||||
request_id,
|
|
||||||
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs),
|
|
||||||
).await {
|
|
||||||
Ok((ok, code, lines)) => {
|
|
||||||
success = ok; exit_code = code;
|
|
||||||
// Persist lines and capture last non-empty stderr for summary
|
|
||||||
let mut seq: i64 = 0;
|
|
||||||
let mut last_stderr: Option<String> = None;
|
|
||||||
for (is_stderr, line) in lines {
|
|
||||||
let line_ref = line.trim_end_matches(['\n', '\r']);
|
|
||||||
if is_stderr && !line_ref.trim().is_empty() {
|
|
||||||
last_stderr = Some(line_ref.to_string());
|
|
||||||
}
|
|
||||||
let _ = persist.record_log_line(item.ctx.request_id, seq, is_stderr, line_ref).await;
|
|
||||||
seq += 1;
|
|
||||||
}
|
|
||||||
if !success {
|
|
||||||
failure_summary = Some(match last_stderr {
|
|
||||||
Some(ref msg) => format!("job script failed: exit_code={} — {}", exit_code, msg),
|
|
||||||
None => format!("job script failed: exit_code={}", exit_code),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
failure_summary = Some(format!("ssh/runner error: {}", e));
|
|
||||||
warn!(error = %e, request_id = %item.ctx.request_id, ip = %ip, "ssh runner execution failed");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => {
|
).await;
|
||||||
warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP via virsh");
|
|
||||||
|
match ip_result {
|
||||||
|
Ok(ip) => {
|
||||||
|
let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string());
|
||||||
|
let ssh_port = h.ssh_host_port.unwrap_or(22);
|
||||||
|
let ssh_cfg = vm_manager::SshConfig {
|
||||||
|
user: user.clone(),
|
||||||
|
public_key: item.ctx.ssh_public_key.clone(),
|
||||||
|
private_key_path: item.ctx.ssh_private_key_path.as_ref().map(PathBuf::from),
|
||||||
|
private_key_pem: item.ctx.ssh_private_key_pem.clone(),
|
||||||
|
};
|
||||||
|
// Connect with retry
|
||||||
|
match vm_manager::ssh::connect_with_retry(
|
||||||
|
&ip, ssh_port, &ssh_cfg,
|
||||||
|
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs),
|
||||||
|
).await {
|
||||||
|
Ok(sess) => {
|
||||||
|
// Upload runner binary
|
||||||
|
let runner_path = if is_illumos_label(&item.spec.label) {
|
||||||
|
&exec_cfg.runner_illumos_path
|
||||||
|
} else {
|
||||||
|
&exec_cfg.runner_linux_path
|
||||||
|
};
|
||||||
|
let local_runner = expand_tilde(runner_path);
|
||||||
|
info!(label = %item.spec.label, runner = %runner_path, "selected runner binary");
|
||||||
|
let remote_path = PathBuf::from(&exec_cfg.remote_runner_path);
|
||||||
|
|
||||||
|
if let Err(e) = vm_manager::ssh::upload(&sess, &local_runner, &remote_path) {
|
||||||
|
failure_summary = Some(format!("runner upload failed: {}", e));
|
||||||
|
warn!(error = %e, request_id = %item.ctx.request_id, "failed to upload runner");
|
||||||
|
} else {
|
||||||
|
// Execute runner
|
||||||
|
let cmd = format!(
|
||||||
|
"export SOLSTICE_REPO_URL=\"{}\" SOLSTICE_COMMIT_SHA=\"{}\" SOLSTICE_REQUEST_ID=\"{}\"; sh -lc '{}'",
|
||||||
|
item.ctx.repo_url, item.ctx.commit_sha, item.ctx.request_id, remote_path.display()
|
||||||
|
);
|
||||||
|
|
||||||
|
let persist_for_ssh = persist.clone();
|
||||||
|
let req_id = item.ctx.request_id;
|
||||||
|
// Run the command with the blocking vm_manager::ssh::exec() on a worker thread; captured stdout/stderr is recorded to the DB after it returns
|
||||||
|
let result = tokio::task::spawn_blocking(move || {
|
||||||
|
vm_manager::ssh::exec(&sess, &cmd)
|
||||||
|
}).await;
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok(Ok((stdout, stderr, code))) => {
|
||||||
|
exit_code = code;
|
||||||
|
success = code == 0;
|
||||||
|
// Persist lines
|
||||||
|
let mut seq: i64 = 0;
|
||||||
|
let mut last_stderr: Option<String> = None;
|
||||||
|
for line in stdout.lines() {
|
||||||
|
let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r']));
|
||||||
|
let _ = persist_for_ssh.record_log_line(req_id, seq, false, &line_ref).await;
|
||||||
|
seq += 1;
|
||||||
|
}
|
||||||
|
for line in stderr.lines() {
|
||||||
|
let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r']));
|
||||||
|
if !line_ref.trim().is_empty() {
|
||||||
|
last_stderr = Some(line_ref.clone());
|
||||||
|
}
|
||||||
|
let _ = persist_for_ssh.record_log_line(req_id, seq, true, &line_ref).await;
|
||||||
|
seq += 1;
|
||||||
|
}
|
||||||
|
if !success {
|
||||||
|
failure_summary = Some(match last_stderr {
|
||||||
|
Some(ref msg) => format!("job script failed: exit_code={} — {}", exit_code, msg),
|
||||||
|
None => format!("job script failed: exit_code={}", exit_code),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
failure_summary = Some(format!("ssh exec error: {}", e));
|
||||||
|
warn!(error = %e, request_id = %item.ctx.request_id, "ssh exec failed");
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
failure_summary = Some(format!("ssh task panicked: {}", e));
|
||||||
|
error!(error = %e, request_id = %item.ctx.request_id, "ssh task panicked");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
failure_summary = Some(format!("ssh connect failed: {}", e));
|
||||||
|
warn!(error = %e, request_id = %item.ctx.request_id, "ssh connect failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
failure_summary = Some("timed out waiting for guest IP".to_string());
|
||||||
|
warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(all(target_os = "linux", feature = "libvirt")))]
|
|
||||||
{
|
|
||||||
warn!(request_id = %item.ctx.request_id, label = %label_key, "SSH execution not supported on this platform/backend; skipping");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop VM after attempting execution
|
// Stop VM after attempting execution
|
||||||
if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await {
|
if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await {
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
||||||
}
|
}
|
||||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
||||||
|
|
||||||
// Stop console tailer before we snapshot the file to avoid races
|
// Stop console tailer
|
||||||
if let Some(t) = tailer_opt.take() { t.abort(); }
|
let _ = console_stop_tx.send(true);
|
||||||
|
drop(console_line_tx);
|
||||||
|
let _ = console_recorder.await;
|
||||||
|
|
||||||
// If no logs were captured (e.g., SSH never connected), snapshot the final console log
|
// If no logs were captured, snapshot the console log file as fallback
|
||||||
let console_snapshot = h.work_dir.join("console.log");
|
|
||||||
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
|
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
|
||||||
if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await {
|
if let Ok(lines) = vm_manager::console::read_console_log(&h.work_dir).await {
|
||||||
warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log");
|
let mut seq: i64 = -1_000_000;
|
||||||
|
for line in lines {
|
||||||
|
let obj = serde_json::json!({
|
||||||
|
"category": "boot",
|
||||||
|
"level": "info",
|
||||||
|
"msg": line.trim_end_matches(['\n', '\r'])
|
||||||
|
});
|
||||||
|
let _ = persist.record_log_line(item.ctx.request_id, seq, false, &obj.to_string()).await;
|
||||||
|
seq += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -825,7 +885,6 @@ async fn run_job_via_ssh_owned(
|
||||||
Ok((ok, code, lines))
|
Ok((ok, code, lines))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
|
||||||
fn is_illumos_label(label: &str) -> bool {
|
fn is_illumos_label(label: &str) -> bool {
|
||||||
let l = label.to_ascii_lowercase();
|
let l = label.to_ascii_lowercase();
|
||||||
l.contains("illumos")
|
l.contains("illumos")
|
||||||
|
|
@ -834,7 +893,6 @@ fn is_illumos_label(label: &str) -> bool {
|
||||||
|| l.contains("oi-hipster")
|
|| l.contains("oi-hipster")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
|
||||||
fn expand_tilde(path: &str) -> PathBuf {
|
fn expand_tilde(path: &str) -> PathBuf {
|
||||||
if let Some(rest) = path.strip_prefix("~/") {
|
if let Some(rest) = path.strip_prefix("~/") {
|
||||||
if let Ok(home) = std::env::var("HOME") {
|
if let Ok(home) = std::env::var("HOME") {
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,13 @@ impl Hypervisor for VmManagerAdapter {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn guest_ip(&self, vm: &VmHandle) -> Result<String> {
|
||||||
|
let inner_handle = to_vm_handle(vm);
|
||||||
|
vm_manager::Hypervisor::guest_ip(&self.inner, &inner_handle)
|
||||||
|
.await
|
||||||
|
.into_diagnostic()
|
||||||
|
}
|
||||||
|
|
||||||
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
|
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
|
||||||
let inner_handle = to_vm_handle(vm);
|
let inner_handle = to_vm_handle(vm);
|
||||||
let state = vm_manager::Hypervisor::state(&self.inner, &inner_handle)
|
let state = vm_manager::Hypervisor::state(&self.inner, &inner_handle)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue