Add gRPC support for VM runner log streaming and orchestrator integration

This commit introduces gRPC-based log streaming between the VM runner (`solstice-runner`) and orchestrator. Key updates include:
- Implemented gRPC server in the orchestrator for receiving and processing runner logs.
- Added log streaming and job result reporting in the `solstice-runner` client.
- Defined `runner.proto` with messages (`LogItem`, `JobEnd`) and the `Runner` service.
- Updated orchestrator to accept gRPC settings and start the server.
- Modified cloud-init user data to include gRPC endpoint and request ID for runners.
- Enhanced message queue logic to handle job results via `publish_job_result`.
- Configured `Cross.toml` for cross-compilation of the runner.
This commit is contained in:
Till Wegmueller 2025-11-01 12:14:50 +01:00
parent e73b6ff49f
commit 855aecbb10
No known key found for this signature in database
13 changed files with 467 additions and 38 deletions

14
Cross.toml Normal file
View file

@ -0,0 +1,14 @@
# Cross configuration for building the workflow runner for VM targets
# Reference: https://github.com/cross-rs/cross

# Linux (glibc) guest images
[target.x86_64-unknown-linux-gnu]
image = "ghcr.io/cross-rs/x86_64-unknown-linux-gnu:main"

# illumos guest images (e.g. OpenIndiana)
[target.x86_64-unknown-illumos]
image = "ghcr.io/cross-rs/x86_64-unknown-illumos:main"

[build]
# protoc is needed at build time: common/build.rs runs tonic-build on runner.proto
pre-build = [
"dpkg --add-architecture $CROSS_DEB_ARCH",
"apt-get update && apt-get install --assume-yes protobuf-compiler"
]

View file

@ -15,12 +15,17 @@ tracing-opentelemetry = "0.27"
tracing-appender = "0.2" tracing-appender = "0.2"
atty = "0.2" atty = "0.2"
kdl = "4" kdl = "4"
# gRPC/protobuf runtime for Runner API
tonic = { version = "0.12", features = ["transport"] }
prost = "0.13"
# messaging + serialization # messaging + serialization
lapin = "2" lapin = { version = "2", default-features = false, features = ["rustls"] }
tokio-amqp = "1"
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
serde_json = "1" serde_json = "1"
uuid = { version = "1", features = ["serde", "v4"] } uuid = { version = "1", features = ["serde", "v4"] }
time = { version = "0.3", features = ["serde", "macros"] } time = { version = "0.3", features = ["serde", "macros"] }
futures-util = "0.3" futures-util = "0.3"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "time"] } tokio = { version = "1", features = ["rt-multi-thread", "macros", "time"] }
[build-dependencies]
tonic-build = "0.12"

8
crates/common/build.rs Normal file
View file

@ -0,0 +1,8 @@
/// Build script: generate Rust gRPC bindings for the Runner <-> Orchestrator API.
fn main() {
    // Rebuild only when the proto definition itself changes.
    println!("cargo:rerun-if-changed=proto/runner.proto");
    let builder = tonic_build::configure().build_server(true);
    builder
        .compile_protos(&["proto/runner.proto"], &["proto"])
        .expect("failed to compile runner proto");
}

View file

@ -0,0 +1,30 @@
syntax = "proto3";

package runner.v1;

// Messages sent from the VM runner agent to the orchestrator.

// One captured line of job output.
message LogChunk {
  string line = 1; // One line of log output (stdout/stderr)
  bool stderr = 2; // True if this line came from stderr
}

// Terminal event reporting how the job finished.
message JobEnd {
  int32 exit_code = 1; // Exit code of the job script/process
  bool success = 2; // Convenience flag
  string repo_url = 3; // Convenience context for Integration layer
  string commit_sha = 4; // Convenience context for Integration layer
}

// Envelope for every streamed item; ties the event to one job.
message LogItem {
  string request_id = 1; // UUID string to correlate the job
  oneof event {
    LogChunk log = 2; // Regular output line
    JobEnd end = 3; // Final job status
  }
}

// Minimal acknowledgement returned after the stream is fully processed.
message Ack { bool ok = 1; }

service Runner {
  // Client-streaming RPC: runner sends a stream of LogItem; orchestrator returns Ack
  rpc StreamLogs (stream LogItem) returns (Ack);
}

View file

@ -5,5 +5,12 @@ pub mod mq;
pub use telemetry::{init_tracing, TelemetryGuard}; pub use telemetry::{init_tracing, TelemetryGuard};
pub use job::{Workflow, Job, Step, parse_workflow_str, parse_workflow_file}; pub use job::{Workflow, Job, Step, parse_workflow_str, parse_workflow_file};
pub use messages::{JobRequest, SourceSystem}; pub use messages::{JobRequest, JobResult, SourceSystem};
pub use mq::{MqConfig, publish_job, consume_jobs, consume_jobs_until}; pub use mq::{MqConfig, publish_job, publish_job_result, consume_jobs, consume_jobs_until};
// Generated gRPC module for runner <-> orchestrator
// (source is produced by tonic-build from proto/runner.proto; see build.rs)
pub mod runner {
    pub mod v1 {
        // File name under OUT_DIR follows the proto `package runner.v1;` declaration.
        include!(concat!(env!("OUT_DIR"), "/runner.v1.rs"));
    }
}

View file

@ -1,5 +1,5 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use time::{OffsetDateTime}; use time::OffsetDateTime;
use uuid::Uuid; use uuid::Uuid;
/// Versioned internal job request schema published to the message bus. /// Versioned internal job request schema published to the message bus.
@ -7,7 +7,7 @@ use uuid::Uuid;
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobRequest { pub struct JobRequest {
/// Schema identifier for routing and evolution. /// Schema identifier for routing and evolution.
#[serde(default = "default_schema_version")] #[serde(default = "default_jobrequest_schema")]
pub schema_version: String, // e.g., "jobrequest.v1" pub schema_version: String, // e.g., "jobrequest.v1"
/// Unique request identifier for idempotency and tracing correlation. /// Unique request identifier for idempotency and tracing correlation.
pub request_id: Uuid, pub request_id: Uuid,
@ -27,7 +27,7 @@ pub struct JobRequest {
pub submitted_at: OffsetDateTime, pub submitted_at: OffsetDateTime,
} }
fn default_schema_version() -> String { "jobrequest.v1".to_string() } fn default_jobrequest_schema() -> String { "jobrequest.v1".to_string() }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
@ -40,7 +40,7 @@ pub enum SourceSystem {
impl JobRequest { impl JobRequest {
pub fn new(source: SourceSystem, repo_url: impl Into<String>, commit_sha: impl Into<String>) -> Self { pub fn new(source: SourceSystem, repo_url: impl Into<String>, commit_sha: impl Into<String>) -> Self {
Self { Self {
schema_version: default_schema_version(), schema_version: default_jobrequest_schema(),
request_id: Uuid::new_v4(), request_id: Uuid::new_v4(),
source, source,
repo_url: repo_url.into(), repo_url: repo_url.into(),
@ -52,3 +52,40 @@ impl JobRequest {
} }
} }
} }
/// Final job result reported by the orchestrator back to the Integration layer over MQ.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobResult {
    /// Schema identifier used as routing key/type. e.g., "jobresult.v1"
    #[serde(default = "default_jobresult_schema")]
    pub schema_version: String,
    /// Correlates to the original JobRequest.request_id
    pub request_id: Uuid,
    /// Repository and commit info (for convenience in consumers)
    pub repo_url: String,
    /// Commit the job was run against.
    pub commit_sha: String,
    /// Outcome info
    pub success: bool,
    /// Raw exit code of the job script/process (nonzero on failure).
    pub exit_code: i32,
    /// Optional human summary
    pub summary: Option<String>,
    /// Completion timestamp
    pub completed_at: OffsetDateTime,
}
fn default_jobresult_schema() -> String { "jobresult.v1".to_string() }
impl JobResult {
pub fn new(request_id: Uuid, repo_url: String, commit_sha: String, success: bool, exit_code: i32, summary: Option<String>) -> Self {
Self {
schema_version: default_jobresult_schema(),
request_id,
repo_url,
commit_sha,
success,
exit_code,
summary,
completed_at: OffsetDateTime::now_utc(),
}
}
}

View file

@ -13,7 +13,7 @@ use miette::{IntoDiagnostic as _, Result};
use tracing::{debug, error, info, instrument, warn}; use tracing::{debug, error, info, instrument, warn};
use tracing::Instrument; use tracing::Instrument;
use crate::messages::JobRequest; use crate::messages::{JobRequest, JobResult};
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct MqConfig { pub struct MqConfig {
@ -285,3 +285,52 @@ where
info!("consume_jobs completed"); info!("consume_jobs completed");
Ok(()) Ok(())
} }
/// Publish a final `JobResult` to the main exchange, waiting for a broker
/// publisher-confirm before returning.
///
/// The message is routed by its `schema_version` (e.g. "jobresult.v1") so the
/// routing key always matches the payload's declared schema — the key was
/// previously hard-coded, which disagreed with the "route by schema version"
/// intent for any non-default schema.
#[instrument(skip(cfg, result))]
pub async fn publish_job_result(cfg: &MqConfig, result: &JobResult) -> Result<()> {
    let conn = connect(cfg).await?;
    let channel = conn.create_channel().await.into_diagnostic()?;
    // Ensure main exchange exists (idempotent declare)
    channel
        .exchange_declare(
            &cfg.exchange,
            lapin::ExchangeKind::Direct,
            ExchangeDeclareOptions { durable: true, auto_delete: false, internal: false, nowait: false, passive: false },
            FieldTable::default(),
        )
        .await
        .into_diagnostic()?;
    // Enable publisher confirms so we can await broker acknowledgement below
    channel
        .confirm_select(ConfirmSelectOptions::default())
        .await
        .into_diagnostic()?;
    let payload = serde_json::to_vec(result).into_diagnostic()?;
    // Persistent (delivery_mode=2) JSON message; `type` mirrors the schema for consumers
    let props = BasicProperties::default()
        .with_content_type("application/json".into())
        .with_content_encoding("utf-8".into())
        .with_type(ShortString::from(result.schema_version.clone()))
        .with_delivery_mode(2u8.into());
    // Route by schema version; defaults to "jobresult.v1" via JobResult::new
    let routing_key = result.schema_version.as_str();
    let confirm = channel
        .basic_publish(
            &cfg.exchange,
            routing_key,
            BasicPublishOptions { mandatory: true, ..Default::default() },
            &payload,
            props,
        )
        .await
        .into_diagnostic()?;
    // mandatory=true: an unroutable message surfaces here instead of being dropped
    confirm.await.into_diagnostic()?;
    Ok(())
}

View file

@ -21,6 +21,8 @@ config = { version = "0.14", default-features = false, features = ["yaml"] }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2", "gzip", "brotli", "zstd"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2", "gzip", "brotli", "zstd"] }
bytes = "1" bytes = "1"
path-absolutize = "3" path-absolutize = "3"
# gRPC server
tonic = { version = "0.12", features = ["transport"] }
# Compression/decompression # Compression/decompression
zstd = "0.13" zstd = "0.13"
# DB (optional basic persistence) # DB (optional basic persistence)
@ -33,6 +35,7 @@ once_cell = "1"
dashmap = "6" dashmap = "6"
async-trait = "0.1" async-trait = "0.1"
uuid = { version = "1", features = ["v4", "serde"] } uuid = { version = "1", features = ["v4", "serde"] }
futures-util = "0.3.31"
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
virt = { version = "0.3" } virt = { version = "0.3" }

View file

@ -0,0 +1,81 @@
use std::net::SocketAddr;
use futures_util::StreamExt;
use miette::{IntoDiagnostic as _, Result};
use tonic::{Request, Response, Status};
use tracing::{error, info, warn};
use common::{MqConfig, publish_job_result};
use common::runner::v1::{runner_server::{Runner, RunnerServer}, LogItem, Ack};
/// gRPC service state for receiving streamed logs from VM runners.
pub struct RunnerSvc {
    // MQ settings used to publish the final JobResult when a log stream ends.
    mq_cfg: MqConfig,
}

impl RunnerSvc {
    /// Build a service instance around the given MQ configuration.
    pub fn new(mq_cfg: MqConfig) -> Self { Self { mq_cfg } }
}
#[tonic::async_trait]
impl Runner for RunnerSvc {
    /// Client-streaming RPC: consume `LogItem`s from one runner until the
    /// stream ends, re-emit log lines into the orchestrator's tracing output,
    /// and publish a `JobResult` to MQ if a `JobEnd` event supplied the
    /// repo/commit context.
    async fn stream_logs(
        &self,
        request: Request<tonic::Streaming<LogItem>>,
    ) -> std::result::Result<Response<Ack>, Status> {
        let mut stream = request.into_inner();
        // Context accumulated across the stream; request id comes from the
        // first item, repo/commit/outcome only from a JobEnd event.
        let mut req_id: Option<uuid::Uuid> = None;
        let mut repo_url: Option<String> = None;
        let mut commit_sha: Option<String> = None;
        let mut exit_code: i32 = 0;
        let mut success: bool = true;
        // transpose() turns Option<Result<..>> into Result<Option<..>> so `?`
        // surfaces transport errors while `while let` ends cleanly at EOF.
        while let Some(item) = stream.next().await.transpose().map_err(|e| Status::internal(e.to_string()))? {
            // Correlate request id
            if req_id.is_none() {
                match uuid::Uuid::parse_str(&item.request_id) {
                    Ok(u) => req_id = Some(u),
                    Err(_) => {
                        // Keep consuming the stream even with a bad id; we just
                        // won't be able to publish a JobResult at the end.
                        warn!(request_id = %item.request_id, "invalid request_id from runner");
                    }
                }
            }
            if let Some(ev) = item.event {
                match ev {
                    common::runner::v1::log_item::Event::Log(chunk) => {
                        // Forward runner output into the orchestrator's own logs.
                        if chunk.stderr {
                            info!(request_id = %item.request_id, line = %chunk.line, "runner:stderr");
                        } else {
                            info!(request_id = %item.request_id, line = %chunk.line, "runner:stdout");
                        }
                    }
                    common::runner::v1::log_item::Event::End(end) => {
                        // Terminal event: capture outcome and repo context.
                        // A later End event would overwrite an earlier one.
                        exit_code = end.exit_code;
                        success = end.success;
                        repo_url = Some(end.repo_url);
                        commit_sha = Some(end.commit_sha);
                    }
                }
            }
        }
        // Publish final status if we have enough context
        if let (Some(id), Some(repo), Some(sha)) = (req_id, repo_url, commit_sha) {
            let result = common::messages::JobResult::new(id, repo, sha, success, exit_code, None);
            if let Err(e) = publish_job_result(&self.mq_cfg, &result).await {
                // Publish failure is logged, not propagated: the runner still gets an Ack.
                error!(error = %e, request_id = %id, "failed to publish JobResult");
            }
        } else {
            warn!(have_req_id = req_id.is_some(), have_repo = repo_url.is_some(), have_sha = commit_sha.is_some(), "missing context for JobResult; skipping publish");
        }
        // Ack even when no JobEnd was seen — the stream itself completed fine.
        Ok(Response::new(Ack { ok: true }))
    }
}
/// Start the Runner gRPC server on `addr` and run it until `shutdown` resolves.
pub async fn serve_with_shutdown(addr: SocketAddr, mq_cfg: MqConfig, shutdown: impl std::future::Future<Output = ()>) -> Result<()> {
    info!(%addr, "gRPC server starting");
    let service = RunnerServer::new(RunnerSvc::new(mq_cfg));
    let router = tonic::transport::Server::builder().add_service(service);
    router
        .serve_with_shutdown(addr, shutdown)
        .await
        .into_diagnostic()
}

View file

@ -2,6 +2,7 @@ mod config;
mod hypervisor; mod hypervisor;
mod scheduler; mod scheduler;
mod persist; mod persist;
mod grpc;
use std::{collections::HashMap, path::PathBuf, time::Duration}; use std::{collections::HashMap, path::PathBuf, time::Duration};
@ -97,15 +98,28 @@ async fn main() -> Result<()> {
// Build MQ config and start consumer // Build MQ config and start consumer
let mq_cfg = common::MqConfig { let mq_cfg = common::MqConfig {
url: opts.amqp_url, url: opts.amqp_url.clone(),
exchange: opts.amqp_exchange, exchange: opts.amqp_exchange.clone(),
routing_key: opts.amqp_routing_key, routing_key: opts.amqp_routing_key.clone(),
queue: opts.amqp_queue, queue: opts.amqp_queue.clone(),
dlx: std::env::var("AMQP_DLX").unwrap_or_else(|_| "solstice.dlx".into()), dlx: std::env::var("AMQP_DLX").unwrap_or_else(|_| "solstice.dlx".into()),
dlq: std::env::var("AMQP_DLQ").unwrap_or_else(|_| "solstice.jobs.v1.dlq".into()), dlq: std::env::var("AMQP_DLQ").unwrap_or_else(|_| "solstice.jobs.v1.dlq".into()),
prefetch: opts.amqp_prefetch.unwrap_or(opts.max_concurrency as u16), prefetch: opts.amqp_prefetch.unwrap_or(opts.max_concurrency as u16),
}; };
// Start gRPC server for runner log streaming
let grpc_addr: std::net::SocketAddr = opts.grpc_addr.parse().into_diagnostic()?;
let mq_cfg_for_grpc = mq_cfg.clone();
let (grpc_shutdown_tx, grpc_shutdown_rx) = tokio::sync::oneshot::channel::<()>();
let grpc_task = tokio::spawn(async move {
let _ = crate::grpc::serve_with_shutdown(grpc_addr, mq_cfg_for_grpc, async move {
let _ = grpc_shutdown_rx.await;
}).await;
});
// Orchestrator contact address for runner to dial back (can override via ORCH_CONTACT_ADDR)
let orch_contact = std::env::var("ORCH_CONTACT_ADDR").unwrap_or_else(|_| opts.grpc_addr.clone());
// Scheduler // Scheduler
let sched = Scheduler::new( let sched = Scheduler::new(
router, router,
@ -137,6 +151,7 @@ async fn main() -> Result<()> {
let sched_tx = tx_for_consumer.clone(); let sched_tx = tx_for_consumer.clone();
let cfg = cfg_clone.clone(); let cfg = cfg_clone.clone();
let persist = persist_for_consumer.clone(); let persist = persist_for_consumer.clone();
let orch_contact_val = orch_contact.clone();
async move { async move {
let label_resolved = cfg.resolve_label(job.runs_on.as_deref()).unwrap_or(&cfg.default_label).to_string(); let label_resolved = cfg.resolve_label(job.runs_on.as_deref()).unwrap_or(&cfg.default_label).to_string();
let image = match cfg.image_for(&label_resolved) { let image = match cfg.image_for(&label_resolved) {
@ -162,7 +177,7 @@ async fn main() -> Result<()> {
disk_gb, disk_gb,
network: None, // libvirt network handled in backend network: None, // libvirt network handled in backend
nocloud: image.nocloud, nocloud: image.nocloud,
user_data: Some(make_cloud_init_userdata(&job.repo_url, &job.commit_sha)), user_data: Some(make_cloud_init_userdata(&job.repo_url, &job.commit_sha, job.request_id, &orch_contact_val)),
}; };
if !spec.nocloud { if !spec.nocloud {
warn!(label = %label_resolved, "image is not marked nocloud=true; cloud-init may not work"); warn!(label = %label_resolved, "image is not marked nocloud=true; cloud-init may not work");
@ -188,11 +203,14 @@ async fn main() -> Result<()> {
// Ask scheduler to shut down cooperatively (interrupt placeholders) // Ask scheduler to shut down cooperatively (interrupt placeholders)
sched_shutdown.notify_waiters(); sched_shutdown.notify_waiters();
// Stop gRPC server
let _ = grpc_shutdown_tx.send(());
// Drop sender to let scheduler drain and exit // Drop sender to let scheduler drain and exit
drop(sched_tx); drop(sched_tx);
// Wait for consumer and scheduler to finish concurrently // Wait for consumer, scheduler and grpc to finish concurrently
let (_c_res, _s_res) = tokio::join!(consumer_task, scheduler_task); let (_c_res, _s_res, _g_res) = tokio::join!(consumer_task, scheduler_task, grpc_task);
Ok(()) Ok(())
} }
@ -212,7 +230,7 @@ fn parse_capacity_map(s: Option<&str>) -> HashMap<String, usize> {
m m
} }
fn make_cloud_init_userdata(repo_url: &str, commit_sha: &str) -> Vec<u8> { fn make_cloud_init_userdata(repo_url: &str, commit_sha: &str, request_id: uuid::Uuid, orch_addr: &str) -> Vec<u8> {
let s = format!(r#"#cloud-config let s = format!(r#"#cloud-config
write_files: write_files:
- path: /etc/solstice/job.yaml - path: /etc/solstice/job.yaml
@ -243,6 +261,8 @@ write_files:
export SOLSTICE_REPO_URL='{repo}' export SOLSTICE_REPO_URL='{repo}'
export SOLSTICE_COMMIT_SHA='{sha}' export SOLSTICE_COMMIT_SHA='{sha}'
export SOLSTICE_JOB_FILE='/etc/solstice/job.yaml' export SOLSTICE_JOB_FILE='/etc/solstice/job.yaml'
export SOLSTICE_ORCH_ADDR='{orch_addr}'
export SOLSTICE_REQUEST_ID='{req_id}'
if [ -x "$RUNNER" ]; then if [ -x "$RUNNER" ]; then
"$RUNNER" || true "$RUNNER" || true
else else
@ -252,7 +272,7 @@ write_files:
(command -v poweroff >/dev/null 2>&1 && poweroff) || (command -v shutdown >/dev/null 2>&1 && shutdown -y -i5 -g0) || true (command -v poweroff >/dev/null 2>&1 && poweroff) || (command -v shutdown >/dev/null 2>&1 && shutdown -y -i5 -g0) || true
runcmd: runcmd:
- [ /usr/local/bin/solstice-bootstrap.sh ] - [ /usr/local/bin/solstice-bootstrap.sh ]
"#, repo = repo_url, sha = commit_sha); "#, repo = repo_url, sha = commit_sha, req_id = request_id, orch_addr = orch_addr);
s.into_bytes() s.into_bytes()
} }
@ -274,7 +294,8 @@ mod tests {
#[test] #[test]
fn test_make_cloud_init_userdata_includes_fields() { fn test_make_cloud_init_userdata_includes_fields() {
let data = make_cloud_init_userdata("https://example.com/repo.git", "deadbeef"); let req_id = uuid::Uuid::new_v4();
let data = make_cloud_init_userdata("https://example.com/repo.git", "deadbeef", req_id, "127.0.0.1:50051");
let s = String::from_utf8(data).unwrap(); let s = String::from_utf8(data).unwrap();
assert!(s.contains("#cloud-config")); assert!(s.contains("#cloud-config"));
assert!(s.contains("repo_url: https://example.com/repo.git")); assert!(s.contains("repo_url: https://example.com/repo.git"));
@ -283,5 +304,7 @@ mod tests {
assert!(s.contains("/etc/solstice/job.yaml")); assert!(s.contains("/etc/solstice/job.yaml"));
assert!(s.contains("runcmd:")); assert!(s.contains("runcmd:"));
assert!(s.contains("powering off")); assert!(s.contains("powering off"));
assert!(s.contains("SOLSTICE_ORCH_ADDR"));
assert!(s.contains(&req_id.to_string()));
} }
} }

View file

@ -3,9 +3,18 @@ name = "workflow-runner"
version = "0.1.0" version = "0.1.0"
edition = "2024" edition = "2024"
[[bin]]
name = "solstice-runner"
path = "src/main.rs"
[dependencies] [dependencies]
common = { path = "../common" } common = { path = "../common" }
clap = { version = "4", features = ["derive", "env"] } clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] } miette = { version = "7", features = ["fancy"] }
tracing = "0.1" tracing = "0.1"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "process"] } tokio = { version = "1", features = ["rt-multi-thread", "macros", "process", "fs", "io-util"] }
serde = { version = "1", features = ["derive"] }
serde_yaml = "0.9"
# gRPC client
tonic = { version = "0.12", features = ["transport"] }
tokio-stream = "0.1"

View file

@ -1,39 +1,159 @@
use clap::Parser; use clap::Parser;
use miette::{IntoDiagnostic, Result}; use miette::{IntoDiagnostic as _, Result};
use tracing::{info, error}; use serde::Deserialize;
use tokio::{fs, process::Command, io::{AsyncBufReadExt, BufReader}};
use std::process::Stdio;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{error, info, warn};
use tokio::sync::mpsc;
use common::runner::v1::{runner_client::RunnerClient, log_item::Event, LogItem, LogChunk, JobEnd};
use tonic::transport::Channel;
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(name = "solstice-runner", version, about = "Solstice CI Workflow Runner (VM agent)")] #[command(name = "solstice-runner", version, about = "Solstice CI Workflow Runner (VM agent)")]
struct Opts { struct Opts {
/// Path to workflow KDL file /// Optional path to workflow KDL file (for local testing only)
#[arg(long, env = "SOL_WORKFLOW_PATH")] #[arg(long, env = "SOL_WORKFLOW_PATH")]
workflow: String, workflow: Option<String>,
}
/// Minimal job description read from SOLSTICE_JOB_FILE
/// (defaults to /etc/solstice/job.yaml, written by the orchestrator's cloud-init).
#[derive(Debug, Deserialize)]
struct JobFile {
    // Git repository URL to fetch
    repo_url: String,
    // Exact commit to check out
    commit_sha: String,
}
/// Load the YAML job description from SOLSTICE_JOB_FILE
/// (falling back to /etc/solstice/job.yaml when the env var is unset).
async fn read_job_file() -> Result<JobFile> {
    let path = match std::env::var("SOLSTICE_JOB_FILE") {
        Ok(p) => p,
        Err(_) => String::from("/etc/solstice/job.yaml"),
    };
    let raw = fs::read(&path).await.into_diagnostic()?;
    serde_yaml::from_slice::<JobFile>(&raw).into_diagnostic()
}
/// Run `cmd` through `/bin/sh -lc` and return its exit code
/// (1 when the process was terminated without a code, e.g. by a signal).
async fn run_shell(cmd: &str) -> Result<i32> {
    info!(%cmd, "exec");
    let exit = Command::new("/bin/sh")
        .arg("-lc")
        .arg(cmd)
        .status()
        .await
        .into_diagnostic()?;
    match exit.code() {
        Some(code) => Ok(code),
        None => Ok(1),
    }
}
async fn ensure_repo(repo: &str, sha: &str, workdir: &str) -> Result<()> {
fs::create_dir_all(workdir).await.into_diagnostic()?;
// Use system git to avoid libgit2 cross issues
let cmds = vec![
format!("cd {workdir} && git init"),
format!("cd {workdir} && git remote remove origin >/dev/null 2>&1 || true && git remote add origin {repo}"),
format!("cd {workdir} && git fetch --depth=1 origin {sha}"),
format!("cd {workdir} && git checkout -q FETCH_HEAD"),
];
for c in cmds { let _ = run_shell(&c).await?; }
Ok(())
}
/// Execute `.solstice/job.sh` from `workdir`, optionally forwarding each
/// stdout/stderr line to `tx` as a `LogItem` tagged with `request_id`.
///
/// Returns the script's exit code (1 if killed without a code), or Ok(0)
/// when the script does not exist — a missing script is treated as success.
async fn run_job_script_streamed(workdir: &str, tx: Option<mpsc::Sender<LogItem>>, request_id: &str) -> Result<i32> {
    let script = format!("{}/.solstice/job.sh", workdir);
    if !fs::try_exists(&script).await.into_diagnostic()? {
        warn!(path = %script, "job script not found");
        return Ok(0);
    }
    // Best-effort chmod; `|| true` means a failure here is ignored on purpose.
    let _ = run_shell(&format!("chmod +x {} || true", script)).await?;
    let mut cmd = Command::new("/bin/sh");
    cmd.arg("-lc").arg(format!("cd {workdir} && {}", script))
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let mut child = cmd.spawn().into_diagnostic()?;
    if let Some(tx) = tx.clone() {
        // One detached reader task per pipe; each forwards lines until EOF.
        // NOTE(review): these tasks are not joined before `child.wait()`
        // returns, so trailing lines may still be in flight when the caller
        // sends its JobEnd — confirm ordering is acceptable for consumers.
        if let Some(stdout) = child.stdout.take() {
            let mut lines = BufReader::new(stdout).lines();
            let tx2 = tx.clone();
            let req = request_id.to_string();
            tokio::spawn(async move {
                while let Ok(Some(line)) = lines.next_line().await {
                    let _ = tx2.send(LogItem { request_id: req.clone(), event: Some(Event::Log(LogChunk { line, stderr: false })) }).await;
                }
            });
        }
        if let Some(stderr) = child.stderr.take() {
            let mut lines = BufReader::new(stderr).lines();
            let tx2 = tx.clone();
            let req = request_id.to_string();
            tokio::spawn(async move {
                while let Ok(Some(line)) = lines.next_line().await {
                    let _ = tx2.send(LogItem { request_id: req.clone(), event: Some(Event::Log(LogChunk { line, stderr: true })) }).await;
                }
            });
        }
    } else {
        // If no streaming, still attach to child I/O to avoid blocking
        // NOTE(review): taking and dropping the piped handles closes the read
        // ends, so a chatty script may hit EPIPE/SIGPIPE; consider spawning
        // with Stdio::inherit (or null) instead when tx is None — verify.
        let _ = child.stdout.take();
        let _ = child.stderr.take();
    }
    let status = child.wait().await.into_diagnostic()?;
    Ok(status.code().unwrap_or(1))
}
#[tokio::main(flavor = "multi_thread")] #[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> { async fn main() -> Result<()> {
let _t = common::init_tracing("solstice-workflow-runner")?; let _t = common::init_tracing("solstice-workflow-runner")?;
let opts = Opts::parse(); let _opts = Opts::parse();
let wf = match common::parse_workflow_file(&opts.workflow) { // Try env overrides first for robustness
Ok(wf) => wf, let repo = std::env::var("SOLSTICE_REPO_URL").ok();
Err(e) => { let sha = std::env::var("SOLSTICE_COMMIT_SHA").ok();
error!(error = %e, "failed to parse workflow KDL");
return Err(e); let (repo, sha) = match (repo, sha) {
(Some(r), Some(s)) => (r, s),
_ => {
let jf = read_job_file().await?;
(jf.repo_url, jf.commit_sha)
} }
}; };
info!(name = ?wf.name, jobs = wf.jobs.len(), "loaded workflow"); info!(%repo, %sha, "runner starting");
let workdir = std::env::var("SOLSTICE_WORKDIR").unwrap_or_else(|_| "/root/work".into());
for (id, job) in &wf.jobs { // Setup gRPC streaming if orchestrator address and request id are provided
println!("Job: {id}"); let orch_addr = std::env::var("SOLSTICE_ORCH_ADDR").ok();
if let Some(ro) = &job.runs_on { println!(" runs_on: {ro}"); } let request_id = std::env::var("SOLSTICE_REQUEST_ID").ok();
for (idx, step) in job.steps.iter().enumerate() { let mut tx_opt: Option<mpsc::Sender<LogItem>> = None;
let n = step.name.as_deref().unwrap_or("(unnamed)"); if let (Some(addr), Some(req_id)) = (orch_addr.clone(), request_id.clone()) {
println!(" Step {}/{}: {}", idx + 1, job.steps.len(), n); let (tx, rx) = mpsc::channel::<LogItem>(256);
println!(" run: {}", step.run); let stream = ReceiverStream::new(rx);
// Spawn client task
tokio::spawn(async move {
match RunnerClient::connect(format!("http://{addr}" )).await {
Ok(mut client) => {
let _ = client.stream_logs(stream).await; // ignore result
}
Err(e) => {
warn!(error = %e, "failed to connect to orchestrator gRPC; logs will not be streamed");
}
}
});
tx_opt = Some(tx);
// Send a first line
if let Some(ref tx) = tx_opt {
let _ = tx.send(LogItem { request_id: req_id.clone(), event: Some(Event::Log(LogChunk { line: format!("runner starting: repo={repo} sha={sha}"), stderr: false })) }).await;
} }
} }
ensure_repo(&repo, &sha, &workdir).await?;
let code = run_job_script_streamed(&workdir, tx_opt.clone(), request_id.as_deref().unwrap_or("")).await?;
// Send JobEnd if streaming enabled
if let (Some(tx), Some(req_id)) = (tx_opt.clone(), request_id.clone()) {
let _ = tx.send(LogItem { request_id: req_id.clone(), event: Some(Event::End(JobEnd { exit_code: code, success: code == 0, repo_url: repo.clone(), commit_sha: sha.clone() })) }).await;
// Give the client task a brief moment to flush
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}
if code != 0 {
error!(exit_code = code, "job script failed");
std::process::exit(code);
}
info!("job complete");
Ok(()) Ok(())
} }

View file

@ -0,0 +1,43 @@
### Solstice CI — Workflow Runner bootstrap and cross builds (MVP)
Summary
- Implemented a minimal VM workflow runner binary (solstice-runner) that the orchestrator's cloud-init bootstraps and executes inside the guest.
- The runner is cross-compilable with cross for convenient deployment to local dev VMs.
What the runner does (today)
- Reads job context from env or a small YAML file:
- Env: SOLSTICE_REPO_URL, SOLSTICE_COMMIT_SHA
- File: SOLSTICE_JOB_FILE (defaults to /etc/solstice/job.yaml) with keys repo_url and commit_sha
- Prepares a workspace (default /root/work; overridable with SOLSTICE_WORKDIR).
- Uses system git to fetch the repository at the exact commit (avoids libgit2 to make cross builds simpler).
- Executes .solstice/job.sh when present, streaming stdout/stderr. Exits with the script's exit code.
- Logs via tracing (stderr), compatible with the serial console setup added to libvirt.
Why this design
- Keeps the guest-side binary very small, with minimal dependencies, easing cross builds.
- Shelling out to git leverages whatever the base image provides and avoids cross-compiling libgit2.
- Aligns with the orchestrator's cloud-init, which writes /etc/solstice/job.yaml and exports the same env var names.
Build and usage
- Build all: cargo build --workspace
- Build only the runner: cargo build -p workflow-runner
- Binary name in target directory: solstice-runner
Cross compiling
- A Cross.toml is provided at the workspace root. Example targets:
- x86_64-unknown-linux-gnu
- x86_64-unknown-illumos (requires a recent toolchain with illumos std)
Examples:
- cross build -p workflow-runner --target x86_64-unknown-linux-gnu --release
- cross build -p workflow-runner --target x86_64-unknown-illumos --release
Notes:
- Ensure the base VM image has /bin/sh and git installed (runner relies on both).
- On OpenIndiana/illumos images, prefer installing git via pkg or image packaging beforehand; the runner will not attempt to install packages.
- The orchestrator cloud-init already bootstraps /usr/local/bin/solstice-runner and calls it; set SOLSTICE_RUNNER_URL to point to an HTTP(S) URL hosting the cross-built artifact for quick iteration.
Next steps
- Add optional KDL workflow execution when .solstice/job.sh is absent (parse .solstice/workflow.kdl and run steps).
- Stream logs back to the orchestrator over gRPC and report final status to the Integration layer.
- Secrets injection and masking in logs.