Add heuristic failure detection and improve runner URL configuration

This commit introduces:
- A heuristic to mark jobs as failed if VMs stop quickly without generating logs.
- Improved configuration for runner URLs, including auto-detection of host IPs and default multi-OS runner URLs.
- Updates to the orchestrator's HTTP routing for consistency.
- New task scripts for Forge integration and updates to environment defaults for local development.
This commit is contained in:
Till Wegmueller 2025-11-03 22:36:31 +01:00
parent 81a93ef1a7
commit c00ce54112
No known key found for this signature in database
7 changed files with 99 additions and 3 deletions

6
.idea/data_source_mapping.xml generated Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourcePerFileMappings">
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/8be7b89e-6fc5-4e5d-a84a-57129d3a04d4/console.sql" value="8be7b89e-6fc5-4e5d-a84a-57129d3a04d4" />
</component>
</project>

View file

@ -0,0 +1,31 @@
#!/usr/bin/env bash
set -euo pipefail
# Run the Solstice Forge Integration (webhook listener) with local defaults.
# Intended for local development with Hookdeck CLI forwarding.
# Example (in another terminal):
#   hookdeck listen http://localhost:8080/webhooks/forgejo
#
# Every setting below uses ${VAR:-default}, so a value already present in the
# environment always wins over the local-dev default.

# Fail early with the conventional "command not found" status if cargo is missing.
command -v cargo >/dev/null 2>&1 || { echo "cargo is required" >&2; exit 127; }

export RUST_LOG=${RUST_LOG:-info}

# HTTP listener configuration
export HTTP_ADDR=${HTTP_ADDR:-0.0.0.0:8080}
export WEBHOOK_PATH=${WEBHOOK_PATH:-/webhooks/forgejo}
# Optional shared secret used to verify webhook signatures. Leave empty for dev.
export WEBHOOK_SECRET=${WEBHOOK_SECRET:-}

# AMQP defaults for local dev (RabbitMQ)
export AMQP_URL=${AMQP_URL:-amqp://127.0.0.1:5672/%2f}
export AMQP_EXCHANGE=${AMQP_EXCHANGE:-solstice.jobs}
export AMQP_QUEUE=${AMQP_QUEUE:-solstice.jobs.v1}
export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1}

# Default runs-on label for local dev. Overridable like the other settings
# (previously this was exported unconditionally and clobbered the caller's value).
export SOL_RUNS_ON=${SOL_RUNS_ON:-ubuntu-22.04}

# Note: Other optional envs are respected by the binary if set:
# FORGEJO_BASE_URL, FORGEJO_TOKEN, FORGE_CONTEXT
# ORCH_HTTP_BASE, S3_ENDPOINT, S3_BUCKET, RUNS_ON_DEFAULT, RUNS_ON_MAP

# exec replaces this shell so signals (Ctrl-C) go straight to cargo/the binary.
exec cargo run -p forge-integration -- \
  --http-addr "$HTTP_ADDR" \
  --webhook-path "$WEBHOOK_PATH"

View file

@ -9,6 +9,37 @@ export AMQP_QUEUE=${AMQP_QUEUE:-solstice.jobs.v1}
export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1}
export AMQP_PREFETCH=${AMQP_PREFETCH:-2}
export GRPC_ADDR=${GRPC_ADDR:-0.0.0.0:50051}
export DATABASE_URL=${DATABASE_URL:-postgres://solstice:solstice@127.0.0.1:5432/solstice}
# Detect a host IP reachable from VMs.
# Precedence: HOST_IP_OVERRIDE (if set and non-empty) > first IPv4 address on
# virbr0 > 127.0.0.1.
HOST_IP=${HOST_IP_OVERRIDE:-}
if [[ -z "$HOST_IP" ]] && command -v ip >/dev/null 2>&1 && ip addr show virbr0 >/dev/null 2>&1; then
  # `|| true`: a failing pipeline must not abort the script if errexit/pipefail are active.
  HOST_IP=$(ip -o -4 addr show virbr0 | awk '{print $4}' | cut -d/ -f1 | head -n1 || true)
fi
# virbr0 can exist without an IPv4 address; never leave HOST_IP empty, otherwise
# the URLs composed from it would be malformed (e.g. "http://:8090/...").
HOST_IP=${HOST_IP:-127.0.0.1}
# Contact address for gRPC log streaming from guests (used in cloud-init)
export ORCH_CONTACT_ADDR=${ORCH_CONTACT_ADDR:-$HOST_IP:50051}
# Compose default runner URLs when the caller supplied neither variable, to
# match ci:vm-build behavior. A runner server must already be running in
# another terminal:
#   mise run run:runner-serve-multi   (serves on 8090/8091 by default)
#   or: mise run run:runner-serve     (serves on 8089 by default)
if [[ -z "${SOLSTICE_RUNNER_URLS:-}${SOLSTICE_RUNNER_URL:-}" ]]; then
  # Multi-OS defaults: one port per guest OS, each overridable.
  : "${SOL_RUNNER_PORT_LINUX:=8090}"
  : "${SOL_RUNNER_PORT_ILLUMOS:=8091}"
  LINUX_URL="http://${HOST_IP}:${SOL_RUNNER_PORT_LINUX}/solstice-runner-linux"
  ILLUMOS_URL="http://${HOST_IP}:${SOL_RUNNER_PORT_ILLUMOS}/solstice-runner-illumos"
  SOLSTICE_RUNNER_URLS="${LINUX_URL} ${ILLUMOS_URL}"
  export SOLSTICE_RUNNER_URLS

  # Single-runner fallback for setups that use run:runner-serve instead.
  : "${SOL_RUNNER_PORT:=8089}"
  export SOLSTICE_RUNNER_URL="${SOLSTICE_RUNNER_URL:-http://${HOST_IP}:${SOL_RUNNER_PORT}/solstice-runner}"

  # Report the effective defaults on stderr so stdout stays clean.
  {
    echo "Using default runner URLs:"
    echo " SOLSTICE_RUNNER_URLS=$SOLSTICE_RUNNER_URLS"
    echo " SOLSTICE_RUNNER_URL=$SOLSTICE_RUNNER_URL"
    echo "Override by exporting SOLSTICE_RUNNER_URLS or SOLSTICE_RUNNER_URL before running this task."
  } >&2
fi
# For Linux + libvirt users, customize via LIBVIRT_URI and LIBVIRT_NETWORK
exec cargo run -p orchestrator --features libvirt -- \
--config "$ORCH_CONFIG" \

6
TODO.txt Normal file
View file

@ -0,0 +1,6 @@
- Return status to codeberg: Extract repo from Webhook and keep in messages
- Make RabbitMQ Messages Print nicely
- move runner logs to debug level so they can be logged in the CI job but don't spam the deployed version
- Make Orchestrator serve the runner binaries so no external server is needed
- Make the orchestrator detect the address it will be reachable at by checking the libvirt config or, on illumos, by using its external IP
- Make the VM-reachable IP of the orchestrator configurable (via config file) in case the setup on illumos gets more complicated

View file

@ -14,7 +14,7 @@ pub struct HttpState {
pub fn build_router(persist: Arc<Persist>) -> Router {
let state = HttpState { persist };
Router::new()
.route("/jobs/:request_id/logs", get(get_logs))
.route("/jobs/{request_id}/logs", get(get_logs))
.with_state(state)
}

View file

@ -152,12 +152,33 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
}
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
// Heuristic: if the VM stopped very quickly and we have no runner logs, treat as failure
let ran_secs = start_time.elapsed().as_secs();
let logs_text = match persist.get_logs_text(item.ctx.request_id).await {
Ok(opt) => opt.unwrap_or_default(),
Err(e) => { warn!(error = %e, request_id = %item.ctx.request_id, "failed to fetch logs for completion check"); String::new() }
};
let logs_nonempty = logs_text.trim().chars().next().is_some();
let mut succeeded = true;
let mut reason: &str = "succeeded";
if ran_secs < 15 && !logs_nonempty {
// Likely cloud-init/runner never started; mark failed
succeeded = false;
reason = "failed";
warn!(request_id = %item.ctx.request_id, label = %label_key, ran_secs, "vm stopped quickly with no logs; marking job failed");
}
if let Err(e) = hv.destroy(h.clone()).await {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM");
}
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await;
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Succeeded).await;
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "succeeded");
let _ = persist.record_job_state(
item.ctx.request_id,
&item.ctx.repo_url,
&item.ctx.commit_sha,
Some(&item.spec.label),
if succeeded { JobState::Succeeded } else { JobState::Failed }
).await;
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, reason);
}
}
Err(e) => {

View file

@ -8,3 +8,4 @@ vault = "Development"
[secrets]
OP_SERVICE_ACCOUNT_TOKEN= { provider = "age", value = "YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBKZnh1bW5rRzV5N0FNcTgrOWVIS1VSSmtqUmZ6b0k1RHJXR0Qyek14cEdBClZzdlY3Q3VWSFZydnJqMWNLSTc5V3BqUFZaUkRMSnFib3JUZHhxU2JpQXcKLS0tIGJvNnRrTHEvYTIxaEFMVFNxVkZlVTlYUEV5TmFCK0ExdDBZNFRpSkNUeUkKPKHB+as1NIejSD81EtZYsj2csqJ3hd9PHjQU39PBr5PZZD0efWeJmU67/Esen5xFfBJ9V0OY9Ola0hKZqMux4MAe+7DHuR7FKhaT9Ttfv25HA4QN/F8BLckA6kX42m0jcF8+IQBasiVmaLd+LtZXd+fNZm/S36pHBFvyZwbuCiW7ZSzi2cl27IbIxbo3bUr07p0JntqF+LOKt8Qu67il5C4T3eslaIs9QkFzrjXuVHsmrKRv+/b7LvSK8aCRJIxtDkXgppcH5CHPktIWuTeixwf1znW7UC5gm8w1I5FWQP7jRenjBrR3iV3erbSQPJk3RDAAXKTIptKVoVgikv0EMjI9Bn1K9Z8HSalc6gjyvZihOOsqnvLHsI3nFheuVVwl+G2p/lHwTrb74z+TWKZBrsR3jDlR56jwh4Au6nnv3IPa3lvd3nQ7SL6MRQfTknqyT0hDaH/2+rFv8hHA4dwFhV4nLrbfse3U1jsyLqE8EL5nLAFKOwaJfPfGnadmsaAq9xtuOffKHVcX3mBH9cKv6yvLXJldUZc+v3AFAu0N/KKdyfWe6I0q37GC1/0gJWymH5uJ59cYmSR3xJ/6mfwKg2y67m9se1o2q6qWzUe7ouuN3PNKM6NDKuAg7TUIcajZlylTyMIPUaWJR+RiZnbzAQB1BnMXQ0eAYcElfpOFP5baVc7v8nOZXycvBFXvY+IXYtN1FcvlxSCFv/icD3q4mMtWhtTcoEYpi8bmf40SEcFHXT4mM+gp57Fx6TakpwA9+r/avQoQwyi6Z3HZc6BaCUW3NMrDV39igbuNcOOF4rSE3ppZetkniZFq8apdCbj7Sy4yHp8zkczv7eJGaWHwOTjdcA2m3dOGBfraH6sYrddtvoLF7NPQQYprLsDTbp4j1sHwz1ZtwdH76cz5JmzaluHxUy8XirsmHX+Hw+GUGe/uIy0IYnQrjJuiKEIid1eptoTqfCk9olXM5lxbR50YZlgbtNxcH9E0gLm3TbmQ7quxfTS3f90RBaWPzz65DC4iFo9OBxj6dCK5ZYOQZrwK1OBuwdNlYoE+haZg6Ct0/ZcAolQQtN1AEGDfXIwocfe8IPcyEhHCKLTj6GBt4ayxD7Ajo/ZOktyLKVcNytA1vF44WjVBP3StZE0I+QDpupDJR19KHO03t9Sapq9GdpcGWA9IbO8=" }
WEBHOOK_SECRET= { default = "KVR1fxg6vaw1dfb.wpa", value = "KVR1fxg6vaw1dfb.wpa" }