From c00ce54112a70eb99028cf78b2634f8b773eac937416a3f23c20727185e61859 Mon Sep 17 00:00:00 2001 From: Till Wegmueller Date: Mon, 3 Nov 2025 22:36:31 +0100 Subject: [PATCH] Add heuristic failure detection and improve runner URL configuration This commit introduces: - A heuristic to mark jobs as failed if VMs stop quickly without generating logs. - Improved configuration for runner URLs, including auto-detection of host IPs and default multi-OS runner URLs. - Updates to the orchestrator's HTTP routing for consistency. - New task scripts for Forge integration and updates to environment defaults for local development. --- .idea/data_source_mapping.xml | 6 ++++++ .mise/tasks/run/forge-integration | 31 ++++++++++++++++++++++++++++ .mise/tasks/run/orchestrator | 31 ++++++++++++++++++++++++++++ TODO.txt | 6 ++++++ crates/orchestrator/src/http.rs | 2 +- crates/orchestrator/src/scheduler.rs | 25 ++++++++++++++++++++-- fnox.toml | 1 + 7 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 .idea/data_source_mapping.xml create mode 100755 .mise/tasks/run/forge-integration create mode 100644 TODO.txt diff --git a/.idea/data_source_mapping.xml b/.idea/data_source_mapping.xml new file mode 100644 index 0000000..b53bc61 --- /dev/null +++ b/.idea/data_source_mapping.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.mise/tasks/run/forge-integration b/.mise/tasks/run/forge-integration new file mode 100755 index 0000000..35c4052 --- /dev/null +++ b/.mise/tasks/run/forge-integration @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail +# Run the Solstice Forge Integration (webhook listener) with local defaults. +# Intended for local development with Hookdeck CLI forwarding. 
+# Example (in another terminal): +# hookdeck listen http://localhost:8080/webhooks/forgejo + +command -v cargo >/dev/null 2>&1 || { echo "cargo is required" >&2; exit 127; } + +export RUST_LOG=${RUST_LOG:-info} + +# HTTP listener configuration +export HTTP_ADDR=${HTTP_ADDR:-0.0.0.0:8080} +export WEBHOOK_PATH=${WEBHOOK_PATH:-/webhooks/forgejo} +# Optional shared secret used to verify webhook signatures. Leave empty for dev. +export WEBHOOK_SECRET=${WEBHOOK_SECRET:-} + +# AMQP defaults for local dev (RabbitMQ) +export AMQP_URL=${AMQP_URL:-amqp://127.0.0.1:5672/%2f} +export AMQP_EXCHANGE=${AMQP_EXCHANGE:-solstice.jobs} +export AMQP_QUEUE=${AMQP_QUEUE:-solstice.jobs.v1} +export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1} +export SOL_RUNS_ON=ubuntu-22.04 + +# Note: Other optional envs are respected by the binary if set: +# FORGEJO_BASE_URL, FORGEJO_TOKEN, FORGE_CONTEXT +# ORCH_HTTP_BASE, S3_ENDPOINT, S3_BUCKET, RUNS_ON_DEFAULT, RUNS_ON_MAP + +exec cargo run -p forge-integration -- \ + --http-addr "$HTTP_ADDR" \ + --webhook-path "$WEBHOOK_PATH" diff --git a/.mise/tasks/run/orchestrator b/.mise/tasks/run/orchestrator index 7043921..96616bd 100755 --- a/.mise/tasks/run/orchestrator +++ b/.mise/tasks/run/orchestrator @@ -9,6 +9,37 @@ export AMQP_QUEUE=${AMQP_QUEUE:-solstice.jobs.v1} export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1} export AMQP_PREFETCH=${AMQP_PREFETCH:-2} export GRPC_ADDR=${GRPC_ADDR:-0.0.0.0:50051} +export DATABASE_URL=${DATABASE_URL:-postgres://solstice:solstice@127.0.0.1:5432/solstice} + +# Detect a host IP reachable from VMs (prefer virbr0, fallback to 127.0.0.1) +if command -v ip >/dev/null 2>&1 && ip addr show virbr0 >/dev/null 2>&1; then + HOST_IP=$(ip -o -4 addr show virbr0 | awk '{print $4}' | cut -d/ -f1 | head -n1) +else + HOST_IP=${HOST_IP_OVERRIDE:-127.0.0.1} +fi +# Contact address for gRPC log streaming from guests (used in cloud-init) +export ORCH_CONTACT_ADDR=${ORCH_CONTACT_ADDR:-$HOST_IP:50051} + +# Auto-compose runner 
URLs if not provided, to match ci:vm-build behavior +# You must run a runner server in another terminal first: +# mise run run:runner-serve-multi (serves on 8090/8091 by default) +# or: mise run run:runner-serve (serves on 8089 by default) +if [[ -z "${SOLSTICE_RUNNER_URLS:-}" && -z "${SOLSTICE_RUNNER_URL:-}" ]]; then + # Multi-OS defaults + SOL_RUNNER_PORT_LINUX=${SOL_RUNNER_PORT_LINUX:-8090} + SOL_RUNNER_PORT_ILLUMOS=${SOL_RUNNER_PORT_ILLUMOS:-8091} + LINUX_URL="http://$HOST_IP:$SOL_RUNNER_PORT_LINUX/solstice-runner-linux" + ILLUMOS_URL="http://$HOST_IP:$SOL_RUNNER_PORT_ILLUMOS/solstice-runner-illumos" + export SOLSTICE_RUNNER_URLS="$LINUX_URL $ILLUMOS_URL" + # Also set single-runner URL fallback if someone uses run:runner-serve + SOL_RUNNER_PORT=${SOL_RUNNER_PORT:-8089} + export SOLSTICE_RUNNER_URL=${SOLSTICE_RUNNER_URL:-"http://$HOST_IP:$SOL_RUNNER_PORT/solstice-runner"} + echo "Using default runner URLs:" >&2 + echo " SOLSTICE_RUNNER_URLS=$SOLSTICE_RUNNER_URLS" >&2 + echo " SOLSTICE_RUNNER_URL=$SOLSTICE_RUNNER_URL" >&2 + echo "Override by exporting SOLSTICE_RUNNER_URLS or SOLSTICE_RUNNER_URL before running this task." 
>&2 +fi + # For Linux + libvirt users, customize via LIBVIRT_URI and LIBVIRT_NETWORK exec cargo run -p orchestrator --features libvirt -- \ --config "$ORCH_CONFIG" \ diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000..12cd241 --- /dev/null +++ b/TODO.txt @@ -0,0 +1,6 @@ +- Return status to codeberg: Extract repo from Webhook and keep in messages +- Make RabbitMQ Messages Print nicely +- move runner logs to debug level so they can be logged in the CI job but don't spam the deployed version +- Make Orchestrator serve the runner binaries so no external server is needed +- Make orchestrator detect the address it will be reachable by checking the libvirt config or on illumos use its external IP +- Make VM reachable IP of the orchestrator configurable in case the setup on illumos gets more complicated (via config file) \ No newline at end of file diff --git a/crates/orchestrator/src/http.rs b/crates/orchestrator/src/http.rs index 3b61878..16ffc02 100644 --- a/crates/orchestrator/src/http.rs +++ b/crates/orchestrator/src/http.rs @@ -14,7 +14,7 @@ pub struct HttpState { pub fn build_router(persist: Arc) -> Router { let state = HttpState { persist }; Router::new() - .route("/jobs/:request_id/logs", get(get_logs)) + .route("/jobs/{request_id}/logs", get(get_logs)) .with_state(state) } diff --git a/crates/orchestrator/src/scheduler.rs b/crates/orchestrator/src/scheduler.rs index 9f8fb6b..5acdb0c 100644 --- a/crates/orchestrator/src/scheduler.rs +++ b/crates/orchestrator/src/scheduler.rs @@ -152,12 +152,33 @@ impl Scheduler { error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM"); } let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await; + // Heuristic: if the VM stopped very quickly and we have no runner logs, treat as failure + let ran_secs = start_time.elapsed().as_secs(); + let logs_text = match persist.get_logs_text(item.ctx.request_id).await { + Ok(opt) 
=> opt.unwrap_or_default(), + Err(e) => { warn!(error = %e, request_id = %item.ctx.request_id, "failed to fetch logs for completion check"); String::new() } + }; + let logs_nonempty = logs_text.trim().chars().next().is_some(); + let mut succeeded = true; + let mut reason: &str = "succeeded"; + if ran_secs < 15 && !logs_nonempty { + // Likely cloud-init/runner never started; mark failed + succeeded = false; + reason = "failed"; + warn!(request_id = %item.ctx.request_id, label = %label_key, ran_secs, "vm stopped quickly with no logs; marking job failed"); + } if let Err(e) = hv.destroy(h.clone()).await { error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM"); } let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await; - let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Succeeded).await; - info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "succeeded"); + let _ = persist.record_job_state( + item.ctx.request_id, + &item.ctx.repo_url, + &item.ctx.commit_sha, + Some(&item.spec.label), + if succeeded { JobState::Succeeded } else { JobState::Failed } + ).await; + info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, reason); } } Err(e) => { diff --git a/fnox.toml b/fnox.toml index 0072665..0441afe 100644 --- a/fnox.toml +++ b/fnox.toml @@ -8,3 +8,4 @@ vault = "Development" [secrets] OP_SERVICE_ACCOUNT_TOKEN= { provider = "age", value = 
"YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBKZnh1bW5rRzV5N0FNcTgrOWVIS1VSSmtqUmZ6b0k1RHJXR0Qyek14cEdBClZzdlY3Q3VWSFZydnJqMWNLSTc5V3BqUFZaUkRMSnFib3JUZHhxU2JpQXcKLS0tIGJvNnRrTHEvYTIxaEFMVFNxVkZlVTlYUEV5TmFCK0ExdDBZNFRpSkNUeUkKPKHB+as1NIejSD81EtZYsj2csqJ3hd9PHjQU39PBr5PZZD0efWeJmU67/Esen5xFfBJ9V0OY9Ola0hKZqMux4MAe+7DHuR7FKhaT9Ttfv25HA4QN/F8BLckA6kX42m0jcF8+IQBasiVmaLd+LtZXd+fNZm/S36pHBFvyZwbuCiW7ZSzi2cl27IbIxbo3bUr07p0JntqF+LOKt8Qu67il5C4T3eslaIs9QkFzrjXuVHsmrKRv+/b7LvSK8aCRJIxtDkXgppcH5CHPktIWuTeixwf1znW7UC5gm8w1I5FWQP7jRenjBrR3iV3erbSQPJk3RDAAXKTIptKVoVgikv0EMjI9Bn1K9Z8HSalc6gjyvZihOOsqnvLHsI3nFheuVVwl+G2p/lHwTrb74z+TWKZBrsR3jDlR56jwh4Au6nnv3IPa3lvd3nQ7SL6MRQfTknqyT0hDaH/2+rFv8hHA4dwFhV4nLrbfse3U1jsyLqE8EL5nLAFKOwaJfPfGnadmsaAq9xtuOffKHVcX3mBH9cKv6yvLXJldUZc+v3AFAu0N/KKdyfWe6I0q37GC1/0gJWymH5uJ59cYmSR3xJ/6mfwKg2y67m9se1o2q6qWzUe7ouuN3PNKM6NDKuAg7TUIcajZlylTyMIPUaWJR+RiZnbzAQB1BnMXQ0eAYcElfpOFP5baVc7v8nOZXycvBFXvY+IXYtN1FcvlxSCFv/icD3q4mMtWhtTcoEYpi8bmf40SEcFHXT4mM+gp57Fx6TakpwA9+r/avQoQwyi6Z3HZc6BaCUW3NMrDV39igbuNcOOF4rSE3ppZetkniZFq8apdCbj7Sy4yHp8zkczv7eJGaWHwOTjdcA2m3dOGBfraH6sYrddtvoLF7NPQQYprLsDTbp4j1sHwz1ZtwdH76cz5JmzaluHxUy8XirsmHX+Hw+GUGe/uIy0IYnQrjJuiKEIid1eptoTqfCk9olXM5lxbR50YZlgbtNxcH9E0gLm3TbmQ7quxfTS3f90RBaWPzz65DC4iFo9OBxj6dCK5ZYOQZrwK1OBuwdNlYoE+haZg6Ct0/ZcAolQQtN1AEGDfXIwocfe8IPcyEhHCKLTj6GBt4ayxD7Ajo/ZOktyLKVcNytA1vF44WjVBP3StZE0I+QDpupDJR19KHO03t9Sapq9GdpcGWA9IbO8=" } +WEBHOOK_SECRET= { default = "KVR1fxg6vaw1dfb.wpa", value = "KVR1fxg6vaw1dfb.wpa" }