mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 13:20:41 +00:00
Replace scheduler SSH/console code with vm-manager APIs
- IP discovery: use hv.guest_ip() with timeout loop instead of discover_guest_ip_virsh() (500+ lines removed from hot path)
- SSH: use vm_manager::ssh::connect_with_retry() + upload() + exec() instead of hand-rolled TCP/ssh2/SFTP code
- Console: use vm_manager::console::ConsoleTailer over Unix socket instead of file-based tail_console_to_joblog()
- Add guest_ip() to orchestrator Hypervisor trait with default impl
- Remove #[cfg(linux, libvirt)] gates from is_illumos_label, expand_tilde
- Keep orchestrator-specific: DB persistence, log recording, MQ publish, runner binary selection, env var injection
This commit is contained in:
parent
a60053f030
commit
190eb5532f
3 changed files with 151 additions and 83 deletions
|
|
@ -77,6 +77,9 @@ pub trait Hypervisor: Send + Sync {
|
|||
/// Reports the state of `_vm`; this default implementation ignores the handle
/// and always returns `VmState::Prepared`.
async fn state(&self, _vm: &VmHandle) -> Result<VmState> {
|
||||
Ok(VmState::Prepared)
|
||||
}
|
||||
/// Returns the address at which the guest of the given VM can be reached.
///
/// This default implementation ignores the handle and always reports the
/// loopback address `127.0.0.1`. The parameter is underscore-prefixed, like
/// the one in `state`, because the default body never reads it.
async fn guest_ip(&self, _vm: &VmHandle) -> Result<String> {
    Ok("127.0.0.1".to_string())
}
|
||||
}
|
||||
|
||||
/// A router that delegates to the correct backend implementation per job.
|
||||
|
|
|
|||
|
|
@ -205,110 +205,170 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
|||
tokio::time::sleep(Duration::from_secs(boot_wait)).await;
|
||||
}
|
||||
|
||||
// Start serial console tailer to capture early boot logs into job log
|
||||
let console_path = h.work_dir.join("console.log");
|
||||
let persist_for_tailer = persist.clone();
|
||||
let req_for_tailer = item.ctx.request_id;
|
||||
let mut tailer_opt = Some(tokio::spawn(async move {
|
||||
let _ = tail_console_to_joblog(persist_for_tailer, req_for_tailer, console_path).await;
|
||||
}));
|
||||
// Start serial console tailer via vm-manager
|
||||
let (console_stop_tx, console_stop_rx) = tokio::sync::watch::channel(false);
|
||||
let (console_line_tx, mut console_line_rx) = tokio::sync::mpsc::channel::<String>(512);
|
||||
let persist_for_console = persist.clone();
|
||||
let req_for_console = item.ctx.request_id;
|
||||
|
||||
// Attempt to discover guest IP (libvirt only) and run the runner over SSH
|
||||
// Spawn console tailer that reads from the QEMU serial socket
|
||||
if let Some(ref console_socket) = h.console_socket {
|
||||
let endpoint = vm_manager::ConsoleEndpoint::UnixSocket(console_socket.clone());
|
||||
tokio::spawn(vm_manager::console::ConsoleTailer::tail(endpoint, console_line_tx.clone(), console_stop_rx));
|
||||
}
|
||||
// Spawn task that records console lines into the DB
|
||||
let console_recorder = tokio::spawn(async move {
|
||||
let mut seq: i64 = -1_000_000;
|
||||
while let Some(line) = console_line_rx.recv().await {
|
||||
let trimmed = line.trim_end_matches(['\n', '\r']);
|
||||
let obj = serde_json::json!({
|
||||
"category": "boot",
|
||||
"level": "info",
|
||||
"msg": trimmed
|
||||
});
|
||||
let _ = persist_for_console.record_log_line(req_for_console, seq, false, &obj.to_string()).await;
|
||||
seq += 1;
|
||||
}
|
||||
});
|
||||
|
||||
// Discover guest IP and run the workflow via SSH using vm-manager
|
||||
let mut success = false;
|
||||
let mut exit_code: i32 = 1;
|
||||
let mut failure_summary: Option<String> = None;
|
||||
|
||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
||||
{
|
||||
let exec_cfg = exec_shared_inner.clone();
|
||||
let uri = exec_cfg.libvirt_uri.clone();
|
||||
let net_opt = if exec_cfg.libvirt_network.trim().is_empty() { None } else { Some(exec_cfg.libvirt_network.clone()) };
|
||||
match discover_guest_ip_virsh(
|
||||
&h.id,
|
||||
// Discover guest IP via vm-manager
|
||||
let ip_result = tokio::time::timeout(
|
||||
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs.min(300)),
|
||||
&uri,
|
||||
net_opt.as_deref(),
|
||||
).await {
|
||||
Some(ip) => {
|
||||
let ip_owned = ip.clone();
|
||||
let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string());
|
||||
let per_job_key_path = item.ctx.ssh_private_key_path.as_ref().map(|s| expand_tilde(s));
|
||||
let key_mem_opt = item.ctx.ssh_private_key_pem.clone();
|
||||
// Choose correct runner binary based on label (illumos vs linux)
|
||||
let runner_path = if is_illumos_label(&item.spec.label) {
|
||||
&exec_cfg.runner_illumos_path
|
||||
} else {
|
||||
&exec_cfg.runner_linux_path
|
||||
};
|
||||
let local_runner = expand_tilde(runner_path);
|
||||
info!(label = %item.spec.label, runner = %runner_path, "selected runner binary");
|
||||
let remote_path = PathBuf::from(&exec_cfg.remote_runner_path);
|
||||
let repo_url = item.ctx.repo_url.clone();
|
||||
let commit_sha = item.ctx.commit_sha.clone();
|
||||
let request_id = item.ctx.request_id;
|
||||
match run_job_via_ssh_with_retry(
|
||||
ip_owned,
|
||||
user,
|
||||
key_mem_opt,
|
||||
per_job_key_path,
|
||||
local_runner,
|
||||
remote_path,
|
||||
repo_url,
|
||||
commit_sha,
|
||||
request_id,
|
||||
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs),
|
||||
).await {
|
||||
Ok((ok, code, lines)) => {
|
||||
success = ok; exit_code = code;
|
||||
// Persist lines and capture last non-empty stderr for summary
|
||||
let mut seq: i64 = 0;
|
||||
let mut last_stderr: Option<String> = None;
|
||||
for (is_stderr, line) in lines {
|
||||
let line_ref = line.trim_end_matches(['\n', '\r']);
|
||||
if is_stderr && !line_ref.trim().is_empty() {
|
||||
last_stderr = Some(line_ref.to_string());
|
||||
}
|
||||
let _ = persist.record_log_line(item.ctx.request_id, seq, is_stderr, line_ref).await;
|
||||
seq += 1;
|
||||
}
|
||||
if !success {
|
||||
failure_summary = Some(match last_stderr {
|
||||
Some(ref msg) => format!("job script failed: exit_code={} — {}", exit_code, msg),
|
||||
None => format!("job script failed: exit_code={}", exit_code),
|
||||
});
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
failure_summary = Some(format!("ssh/runner error: {}", e));
|
||||
warn!(error = %e, request_id = %item.ctx.request_id, ip = %ip, "ssh runner execution failed");
|
||||
async {
|
||||
loop {
|
||||
match hv.guest_ip(&h).await {
|
||||
Ok(ip) => return ip,
|
||||
Err(_) => tokio::time::sleep(Duration::from_secs(1)).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP via virsh");
|
||||
).await;
|
||||
|
||||
match ip_result {
|
||||
Ok(ip) => {
|
||||
let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string());
|
||||
let ssh_port = h.ssh_host_port.unwrap_or(22);
|
||||
let ssh_cfg = vm_manager::SshConfig {
|
||||
user: user.clone(),
|
||||
public_key: item.ctx.ssh_public_key.clone(),
|
||||
private_key_path: item.ctx.ssh_private_key_path.as_ref().map(PathBuf::from),
|
||||
private_key_pem: item.ctx.ssh_private_key_pem.clone(),
|
||||
};
|
||||
// Connect with retry
|
||||
match vm_manager::ssh::connect_with_retry(
|
||||
&ip, ssh_port, &ssh_cfg,
|
||||
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs),
|
||||
).await {
|
||||
Ok(sess) => {
|
||||
// Upload runner binary
|
||||
let runner_path = if is_illumos_label(&item.spec.label) {
|
||||
&exec_cfg.runner_illumos_path
|
||||
} else {
|
||||
&exec_cfg.runner_linux_path
|
||||
};
|
||||
let local_runner = expand_tilde(runner_path);
|
||||
info!(label = %item.spec.label, runner = %runner_path, "selected runner binary");
|
||||
let remote_path = PathBuf::from(&exec_cfg.remote_runner_path);
|
||||
|
||||
if let Err(e) = vm_manager::ssh::upload(&sess, &local_runner, &remote_path) {
|
||||
failure_summary = Some(format!("runner upload failed: {}", e));
|
||||
warn!(error = %e, request_id = %item.ctx.request_id, "failed to upload runner");
|
||||
} else {
|
||||
// Execute runner
|
||||
let cmd = format!(
|
||||
"export SOLSTICE_REPO_URL=\"{}\" SOLSTICE_COMMIT_SHA=\"{}\" SOLSTICE_REQUEST_ID=\"{}\"; sh -lc '{}'",
|
||||
item.ctx.repo_url, item.ctx.commit_sha, item.ctx.request_id, remote_path.display()
|
||||
);
|
||||
|
||||
let persist_for_ssh = persist.clone();
|
||||
let req_id = item.ctx.request_id;
|
||||
// Run the command on a blocking thread; its buffered stdout/stderr are recorded to the DB once it returns
|
||||
let result = tokio::task::spawn_blocking(move || {
|
||||
vm_manager::ssh::exec(&sess, &cmd)
|
||||
}).await;
|
||||
|
||||
match result {
|
||||
Ok(Ok((stdout, stderr, code))) => {
|
||||
exit_code = code;
|
||||
success = code == 0;
|
||||
// Persist lines
|
||||
let mut seq: i64 = 0;
|
||||
let mut last_stderr: Option<String> = None;
|
||||
for line in stdout.lines() {
|
||||
let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r']));
|
||||
let _ = persist_for_ssh.record_log_line(req_id, seq, false, &line_ref).await;
|
||||
seq += 1;
|
||||
}
|
||||
for line in stderr.lines() {
|
||||
let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r']));
|
||||
if !line_ref.trim().is_empty() {
|
||||
last_stderr = Some(line_ref.clone());
|
||||
}
|
||||
let _ = persist_for_ssh.record_log_line(req_id, seq, true, &line_ref).await;
|
||||
seq += 1;
|
||||
}
|
||||
if !success {
|
||||
failure_summary = Some(match last_stderr {
|
||||
Some(ref msg) => format!("job script failed: exit_code={} — {}", exit_code, msg),
|
||||
None => format!("job script failed: exit_code={}", exit_code),
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
failure_summary = Some(format!("ssh exec error: {}", e));
|
||||
warn!(error = %e, request_id = %item.ctx.request_id, "ssh exec failed");
|
||||
}
|
||||
Err(e) => {
|
||||
failure_summary = Some(format!("ssh task panicked: {}", e));
|
||||
error!(error = %e, request_id = %item.ctx.request_id, "ssh task panicked");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
failure_summary = Some(format!("ssh connect failed: {}", e));
|
||||
warn!(error = %e, request_id = %item.ctx.request_id, "ssh connect failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
failure_summary = Some("timed out waiting for guest IP".to_string());
|
||||
warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(all(target_os = "linux", feature = "libvirt")))]
|
||||
{
|
||||
warn!(request_id = %item.ctx.request_id, label = %label_key, "SSH execution not supported on this platform/backend; skipping");
|
||||
}
|
||||
|
||||
// Stop VM after attempting execution
|
||||
if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await {
|
||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
||||
}
|
||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
||||
|
||||
// Stop console tailer before we snapshot the file to avoid races
|
||||
if let Some(t) = tailer_opt.take() { t.abort(); }
|
||||
// Stop console tailer
|
||||
let _ = console_stop_tx.send(true);
|
||||
drop(console_line_tx);
|
||||
let _ = console_recorder.await;
|
||||
|
||||
// If no logs were captured (e.g., SSH never connected), snapshot the final console log
|
||||
let console_snapshot = h.work_dir.join("console.log");
|
||||
// If no logs were captured, snapshot the console log file as fallback
|
||||
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
|
||||
if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await {
|
||||
warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log");
|
||||
if let Ok(lines) = vm_manager::console::read_console_log(&h.work_dir).await {
|
||||
let mut seq: i64 = -1_000_000;
|
||||
for line in lines {
|
||||
let obj = serde_json::json!({
|
||||
"category": "boot",
|
||||
"level": "info",
|
||||
"msg": line.trim_end_matches(['\n', '\r'])
|
||||
});
|
||||
let _ = persist.record_log_line(item.ctx.request_id, seq, false, &obj.to_string()).await;
|
||||
seq += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -825,7 +885,6 @@ async fn run_job_via_ssh_owned(
|
|||
Ok((ok, code, lines))
|
||||
}
|
||||
|
||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
||||
fn is_illumos_label(label: &str) -> bool {
|
||||
let l = label.to_ascii_lowercase();
|
||||
l.contains("illumos")
|
||||
|
|
@ -834,7 +893,6 @@ fn is_illumos_label(label: &str) -> bool {
|
|||
|| l.contains("oi-hipster")
|
||||
}
|
||||
|
||||
#[cfg(all(target_os = "linux", feature = "libvirt"))]
|
||||
fn expand_tilde(path: &str) -> PathBuf {
|
||||
if let Some(rest) = path.strip_prefix("~/") {
|
||||
if let Ok(home) = std::env::var("HOME") {
|
||||
|
|
|
|||
|
|
@ -128,6 +128,13 @@ impl Hypervisor for VmManagerAdapter {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn guest_ip(&self, vm: &VmHandle) -> Result<String> {
|
||||
let inner_handle = to_vm_handle(vm);
|
||||
vm_manager::Hypervisor::guest_ip(&self.inner, &inner_handle)
|
||||
.await
|
||||
.into_diagnostic()
|
||||
}
|
||||
|
||||
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
|
||||
let inner_handle = to_vm_handle(vm);
|
||||
let state = vm_manager::Hypervisor::state(&self.inner, &inner_handle)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue