Replace scheduler SSH/console code with vm-manager APIs

- IP discovery: use hv.guest_ip() with timeout loop instead of
  discover_guest_ip_virsh() (500+ lines removed from hot path)
- SSH: use vm_manager::ssh::connect_with_retry() + upload() + exec()
  instead of hand-rolled TCP/ssh2/SFTP code
- Console: use vm_manager::console::ConsoleTailer over Unix socket
  instead of file-based tail_console_to_joblog()
- Add guest_ip() to orchestrator Hypervisor trait with default impl
- Remove #[cfg(linux, libvirt)] gates from is_illumos_label, expand_tilde
- Keep orchestrator-specific: DB persistence, log recording, MQ publish,
  runner binary selection, env var injection
This commit is contained in:
Till Wegmueller 2026-04-07 15:50:54 +02:00
parent a60053f030
commit 190eb5532f
3 changed files with 151 additions and 83 deletions

View file

@ -77,6 +77,9 @@ pub trait Hypervisor: Send + Sync {
/// Report the lifecycle state of a VM.
///
/// Default implementation for backends that do not track runtime
/// state: unconditionally reports `VmState::Prepared`.
async fn state(&self, _vm: &VmHandle) -> Result<VmState> {
    let fallback = VmState::Prepared;
    Ok(fallback)
}
/// Resolve an IP address at which the guest can be reached.
///
/// Default implementation returns the loopback address as a stand-in;
/// backends that boot real guests are expected to override this.
/// The handle is unused here (`_vm`), matching the sibling `state`
/// default above and avoiding an unused-variable warning.
async fn guest_ip(&self, _vm: &VmHandle) -> Result<String> {
    Ok("127.0.0.1".to_string())
}
}
/// A router that delegates to the correct backend implementation per job.

View file

@ -205,110 +205,170 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
tokio::time::sleep(Duration::from_secs(boot_wait)).await;
}
// Start serial console tailer to capture early boot logs into job log
let console_path = h.work_dir.join("console.log");
let persist_for_tailer = persist.clone();
let req_for_tailer = item.ctx.request_id;
let mut tailer_opt = Some(tokio::spawn(async move {
let _ = tail_console_to_joblog(persist_for_tailer, req_for_tailer, console_path).await;
}));
// Start serial console tailer via vm-manager
let (console_stop_tx, console_stop_rx) = tokio::sync::watch::channel(false);
let (console_line_tx, mut console_line_rx) = tokio::sync::mpsc::channel::<String>(512);
let persist_for_console = persist.clone();
let req_for_console = item.ctx.request_id;
// Attempt to discover guest IP (libvirt only) and run the runner over SSH
// Spawn console tailer that reads from the QEMU serial socket
if let Some(ref console_socket) = h.console_socket {
let endpoint = vm_manager::ConsoleEndpoint::UnixSocket(console_socket.clone());
tokio::spawn(vm_manager::console::ConsoleTailer::tail(endpoint, console_line_tx.clone(), console_stop_rx));
}
// Spawn task that records console lines into the DB
let console_recorder = tokio::spawn(async move {
let mut seq: i64 = -1_000_000;
while let Some(line) = console_line_rx.recv().await {
let trimmed = line.trim_end_matches(['\n', '\r']);
let obj = serde_json::json!({
"category": "boot",
"level": "info",
"msg": trimmed
});
let _ = persist_for_console.record_log_line(req_for_console, seq, false, &obj.to_string()).await;
seq += 1;
}
});
// Discover guest IP and run the workflow via SSH using vm-manager
let mut success = false;
let mut exit_code: i32 = 1;
let mut failure_summary: Option<String> = None;
#[cfg(all(target_os = "linux", feature = "libvirt"))]
{
let exec_cfg = exec_shared_inner.clone();
let uri = exec_cfg.libvirt_uri.clone();
let net_opt = if exec_cfg.libvirt_network.trim().is_empty() { None } else { Some(exec_cfg.libvirt_network.clone()) };
match discover_guest_ip_virsh(
&h.id,
// Discover guest IP via vm-manager
let ip_result = tokio::time::timeout(
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs.min(300)),
&uri,
net_opt.as_deref(),
).await {
Some(ip) => {
let ip_owned = ip.clone();
let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string());
let per_job_key_path = item.ctx.ssh_private_key_path.as_ref().map(|s| expand_tilde(s));
let key_mem_opt = item.ctx.ssh_private_key_pem.clone();
// Choose correct runner binary based on label (illumos vs linux)
let runner_path = if is_illumos_label(&item.spec.label) {
&exec_cfg.runner_illumos_path
} else {
&exec_cfg.runner_linux_path
};
let local_runner = expand_tilde(runner_path);
info!(label = %item.spec.label, runner = %runner_path, "selected runner binary");
let remote_path = PathBuf::from(&exec_cfg.remote_runner_path);
let repo_url = item.ctx.repo_url.clone();
let commit_sha = item.ctx.commit_sha.clone();
let request_id = item.ctx.request_id;
match run_job_via_ssh_with_retry(
ip_owned,
user,
key_mem_opt,
per_job_key_path,
local_runner,
remote_path,
repo_url,
commit_sha,
request_id,
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs),
).await {
Ok((ok, code, lines)) => {
success = ok; exit_code = code;
// Persist lines and capture last non-empty stderr for summary
let mut seq: i64 = 0;
let mut last_stderr: Option<String> = None;
for (is_stderr, line) in lines {
let line_ref = line.trim_end_matches(['\n', '\r']);
if is_stderr && !line_ref.trim().is_empty() {
last_stderr = Some(line_ref.to_string());
}
let _ = persist.record_log_line(item.ctx.request_id, seq, is_stderr, line_ref).await;
seq += 1;
}
if !success {
failure_summary = Some(match last_stderr {
Some(ref msg) => format!("job script failed: exit_code={}{}", exit_code, msg),
None => format!("job script failed: exit_code={}", exit_code),
});
}
},
Err(e) => {
failure_summary = Some(format!("ssh/runner error: {}", e));
warn!(error = %e, request_id = %item.ctx.request_id, ip = %ip, "ssh runner execution failed");
async {
loop {
match hv.guest_ip(&h).await {
Ok(ip) => return ip,
Err(_) => tokio::time::sleep(Duration::from_secs(1)).await,
}
}
}
None => {
warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP via virsh");
).await;
match ip_result {
Ok(ip) => {
let user = item.ctx.ssh_user.clone().unwrap_or_else(|| "sol".to_string());
let ssh_port = h.ssh_host_port.unwrap_or(22);
let ssh_cfg = vm_manager::SshConfig {
user: user.clone(),
public_key: item.ctx.ssh_public_key.clone(),
private_key_path: item.ctx.ssh_private_key_path.as_ref().map(PathBuf::from),
private_key_pem: item.ctx.ssh_private_key_pem.clone(),
};
// Connect with retry
match vm_manager::ssh::connect_with_retry(
&ip, ssh_port, &ssh_cfg,
Duration::from_secs(exec_cfg.ssh_connect_timeout_secs),
).await {
Ok(sess) => {
// Upload runner binary
let runner_path = if is_illumos_label(&item.spec.label) {
&exec_cfg.runner_illumos_path
} else {
&exec_cfg.runner_linux_path
};
let local_runner = expand_tilde(runner_path);
info!(label = %item.spec.label, runner = %runner_path, "selected runner binary");
let remote_path = PathBuf::from(&exec_cfg.remote_runner_path);
if let Err(e) = vm_manager::ssh::upload(&sess, &local_runner, &remote_path) {
failure_summary = Some(format!("runner upload failed: {}", e));
warn!(error = %e, request_id = %item.ctx.request_id, "failed to upload runner");
} else {
// Execute runner
let cmd = format!(
"export SOLSTICE_REPO_URL=\"{}\" SOLSTICE_COMMIT_SHA=\"{}\" SOLSTICE_REQUEST_ID=\"{}\"; sh -lc '{}'",
item.ctx.repo_url, item.ctx.commit_sha, item.ctx.request_id, remote_path.display()
);
let persist_for_ssh = persist.clone();
let req_id = item.ctx.request_id;
// Use exec_streaming with writers that record to DB
let result = tokio::task::spawn_blocking(move || {
vm_manager::ssh::exec(&sess, &cmd)
}).await;
match result {
Ok(Ok((stdout, stderr, code))) => {
exit_code = code;
success = code == 0;
// Persist lines
let mut seq: i64 = 0;
let mut last_stderr: Option<String> = None;
for line in stdout.lines() {
let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r']));
let _ = persist_for_ssh.record_log_line(req_id, seq, false, &line_ref).await;
seq += 1;
}
for line in stderr.lines() {
let line_ref = strip_ansi(line.trim_end_matches(['\n', '\r']));
if !line_ref.trim().is_empty() {
last_stderr = Some(line_ref.clone());
}
let _ = persist_for_ssh.record_log_line(req_id, seq, true, &line_ref).await;
seq += 1;
}
if !success {
failure_summary = Some(match last_stderr {
Some(ref msg) => format!("job script failed: exit_code={}{}", exit_code, msg),
None => format!("job script failed: exit_code={}", exit_code),
});
}
}
Ok(Err(e)) => {
failure_summary = Some(format!("ssh exec error: {}", e));
warn!(error = %e, request_id = %item.ctx.request_id, "ssh exec failed");
}
Err(e) => {
failure_summary = Some(format!("ssh task panicked: {}", e));
error!(error = %e, request_id = %item.ctx.request_id, "ssh task panicked");
}
}
}
}
Err(e) => {
failure_summary = Some(format!("ssh connect failed: {}", e));
warn!(error = %e, request_id = %item.ctx.request_id, "ssh connect failed");
}
}
}
Err(_) => {
failure_summary = Some("timed out waiting for guest IP".to_string());
warn!(request_id = %item.ctx.request_id, label = %label_key, "failed to determine guest IP");
}
}
}
#[cfg(not(all(target_os = "linux", feature = "libvirt")))]
{
warn!(request_id = %item.ctx.request_id, label = %label_key, "SSH execution not supported on this platform/backend; skipping");
}
// Stop VM after attempting execution
if let Err(e) = hv.stop(&h, Duration::from_secs(10)).await {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
}
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
// Stop console tailer before we snapshot the file to avoid races
if let Some(t) = tailer_opt.take() { t.abort(); }
// Stop console tailer
let _ = console_stop_tx.send(true);
drop(console_line_tx);
let _ = console_recorder.await;
// If no logs were captured (e.g., SSH never connected), snapshot the final console log
let console_snapshot = h.work_dir.join("console.log");
// If no logs were captured, snapshot the console log file as fallback
if let Ok(None) = persist.get_logs_text(item.ctx.request_id).await {
if let Err(e) = snapshot_console_to_joblog(persist.clone(), item.ctx.request_id, console_snapshot).await {
warn!(error = %e, request_id = %item.ctx.request_id, "failed to snapshot console log");
if let Ok(lines) = vm_manager::console::read_console_log(&h.work_dir).await {
let mut seq: i64 = -1_000_000;
for line in lines {
let obj = serde_json::json!({
"category": "boot",
"level": "info",
"msg": line.trim_end_matches(['\n', '\r'])
});
let _ = persist.record_log_line(item.ctx.request_id, seq, false, &obj.to_string()).await;
seq += 1;
}
}
}
@ -825,7 +885,6 @@ async fn run_job_via_ssh_owned(
Ok((ok, code, lines))
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
fn is_illumos_label(label: &str) -> bool {
let l = label.to_ascii_lowercase();
l.contains("illumos")
@ -834,7 +893,6 @@ fn is_illumos_label(label: &str) -> bool {
|| l.contains("oi-hipster")
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
fn expand_tilde(path: &str) -> PathBuf {
if let Some(rest) = path.strip_prefix("~/") {
if let Ok(home) = std::env::var("HOME") {

View file

@ -128,6 +128,13 @@ impl Hypervisor for VmManagerAdapter {
Ok(())
}
/// Delegate guest-IP discovery to the wrapped vm-manager hypervisor,
/// converting the orchestrator handle and error type at the boundary.
async fn guest_ip(&self, vm: &VmHandle) -> Result<String> {
    let handle = to_vm_handle(vm);
    let lookup = vm_manager::Hypervisor::guest_ip(&self.inner, &handle).await;
    lookup.into_diagnostic()
}
async fn state(&self, vm: &VmHandle) -> Result<VmState> {
let inner_handle = to_vm_handle(vm);
let state = vm_manager::Hypervisor::state(&self.inner, &inner_handle)