mirror of
https://codeberg.org/Toasterson/solstice-ci.git
synced 2026-04-10 13:20:41 +00:00
Add heuristic failure detection and improve runner URL configuration
This commit introduces:
- A heuristic that marks a job as failed when its VM stops quickly without producing any logs.
- Improved runner URL configuration, including auto-detection of the host IP and default multi-OS runner URLs.
- Updates to the orchestrator's HTTP routing for consistency.
- New task scripts for the Forge integration, and updated environment defaults for local development.
This commit is contained in:
parent
81a93ef1a7
commit
c00ce54112
7 changed files with 99 additions and 3 deletions
6
.idea/data_source_mapping.xml
generated
Normal file
6
.idea/data_source_mapping.xml
generated
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="DataSourcePerFileMappings">
|
||||||
|
<file url="file://$APPLICATION_CONFIG_DIR$/consoles/db/8be7b89e-6fc5-4e5d-a84a-57129d3a04d4/console.sql" value="8be7b89e-6fc5-4e5d-a84a-57129d3a04d4" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
31
.mise/tasks/run/forge-integration
Executable file
31
.mise/tasks/run/forge-integration
Executable file
|
|
@ -0,0 +1,31 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Run the Solstice Forge Integration (webhook listener) with local defaults.
# Intended for local development with Hookdeck CLI forwarding.
# Example (in another terminal):
#   hookdeck listen http://localhost:8080/webhooks/forgejo
#
# Every variable below is only a default: a value already present in the
# caller's environment always wins.

# Fail early, with the conventional "command not found" status, when cargo
# is not installed.
command -v cargo >/dev/null 2>&1 || { echo "cargo is required" >&2; exit 127; }

export RUST_LOG="${RUST_LOG:-info}"

# HTTP listener configuration
export HTTP_ADDR="${HTTP_ADDR:-0.0.0.0:8080}"
export WEBHOOK_PATH="${WEBHOOK_PATH:-/webhooks/forgejo}"
# Optional shared secret used to verify webhook signatures. Leave empty for dev.
export WEBHOOK_SECRET="${WEBHOOK_SECRET:-}"

# AMQP defaults for local dev (RabbitMQ)
export AMQP_URL="${AMQP_URL:-amqp://127.0.0.1:5672/%2f}"
export AMQP_EXCHANGE="${AMQP_EXCHANGE:-solstice.jobs}"
export AMQP_QUEUE="${AMQP_QUEUE:-solstice.jobs.v1}"
export AMQP_ROUTING_KEY="${AMQP_ROUTING_KEY:-jobrequest.v1}"
# Previously hard-coded; now overridable like every other setting here.
# NOTE(review): presumably the runs-on label used for jobs - confirm against
# the forge-integration binary.
export SOL_RUNS_ON="${SOL_RUNS_ON:-ubuntu-22.04}"

# Note: Other optional envs are respected by the binary if set:
#   FORGEJO_BASE_URL, FORGEJO_TOKEN, FORGE_CONTEXT
#   ORCH_HTTP_BASE, S3_ENDPOINT, S3_BUCKET, RUNS_ON_DEFAULT, RUNS_ON_MAP

# exec replaces this shell, so signals and the exit status belong to cargo.
exec cargo run -p forge-integration -- \
  --http-addr "$HTTP_ADDR" \
  --webhook-path "$WEBHOOK_PATH"
|
||||||
|
|
@ -9,6 +9,37 @@ export AMQP_QUEUE=${AMQP_QUEUE:-solstice.jobs.v1}
|
||||||
export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1}
|
export AMQP_ROUTING_KEY=${AMQP_ROUTING_KEY:-jobrequest.v1}
|
||||||
export AMQP_PREFETCH=${AMQP_PREFETCH:-2}
|
export AMQP_PREFETCH=${AMQP_PREFETCH:-2}
|
||||||
export GRPC_ADDR=${GRPC_ADDR:-0.0.0.0:50051}
|
export GRPC_ADDR=${GRPC_ADDR:-0.0.0.0:50051}
|
||||||
|
export DATABASE_URL="${DATABASE_URL:-postgres://solstice:solstice@127.0.0.1:5432/solstice}"

#######################################
# Print the host IP address that guest VMs should use to reach this machine.
# Resolution order:
#   1. HOST_IP_OVERRIDE, when set and non-empty (an explicit override wins).
#   2. The first IPv4 address configured on virbr0, if `ip` is available.
#   3. 127.0.0.1.
# Never prints an empty string, so callers cannot end up composing
# addresses such as ":50051" or "http://:8090/...".
# Globals:   HOST_IP_OVERRIDE (read)
# Arguments: none
# Outputs:   the chosen address on stdout
# Returns:   0 always
#######################################
detect_host_ip() {
  local detected=""

  if [[ -n "${HOST_IP_OVERRIDE:-}" ]]; then
    printf '%s\n' "$HOST_IP_OVERRIDE"
    return 0
  fi

  if command -v ip >/dev/null 2>&1; then
    # A missing virbr0 is an expected condition, so its error output is
    # silenced and a failing pipeline simply leaves `detected` empty.
    # Field 4 of `ip -o -4 addr show` is ADDRESS/PREFIX; strip the prefix.
    detected=$(ip -o -4 addr show virbr0 2>/dev/null \
      | awk 'NR == 1 { sub(/\/.*/, "", $4); print $4 }') || detected=""
  fi

  printf '%s\n' "${detected:-127.0.0.1}"
}

# Detect a host IP reachable from VMs (override, then virbr0, then 127.0.0.1)
HOST_IP=$(detect_host_ip)

# Contact address for gRPC log streaming from guests (used in cloud-init)
export ORCH_CONTACT_ADDR="${ORCH_CONTACT_ADDR:-$HOST_IP:50051}"

# Auto-compose runner URLs if not provided, to match ci:vm-build behavior
# You must run a runner server in another terminal first:
#   mise run run:runner-serve-multi (serves on 8090/8091 by default)
#   or: mise run run:runner-serve (serves on 8089 by default)
if [[ -z "${SOLSTICE_RUNNER_URLS:-}" && -z "${SOLSTICE_RUNNER_URL:-}" ]]; then
  # Multi-OS defaults
  SOL_RUNNER_PORT_LINUX="${SOL_RUNNER_PORT_LINUX:-8090}"
  SOL_RUNNER_PORT_ILLUMOS="${SOL_RUNNER_PORT_ILLUMOS:-8091}"
  LINUX_URL="http://$HOST_IP:$SOL_RUNNER_PORT_LINUX/solstice-runner-linux"
  ILLUMOS_URL="http://$HOST_IP:$SOL_RUNNER_PORT_ILLUMOS/solstice-runner-illumos"
  export SOLSTICE_RUNNER_URLS="$LINUX_URL $ILLUMOS_URL"
  # Also set single-runner URL fallback if someone uses run:runner-serve
  SOL_RUNNER_PORT="${SOL_RUNNER_PORT:-8089}"
  # SOLSTICE_RUNNER_URL is known to be empty in this branch, so assign directly.
  export SOLSTICE_RUNNER_URL="http://$HOST_IP:$SOL_RUNNER_PORT/solstice-runner"
  echo "Using default runner URLs:" >&2
  echo " SOLSTICE_RUNNER_URLS=$SOLSTICE_RUNNER_URLS" >&2
  echo " SOLSTICE_RUNNER_URL=$SOLSTICE_RUNNER_URL" >&2
  echo "Override by exporting SOLSTICE_RUNNER_URLS or SOLSTICE_RUNNER_URL before running this task." >&2
fi
|
||||||
|
|
||||||
# For Linux + libvirt users, customize via LIBVIRT_URI and LIBVIRT_NETWORK
|
# For Linux + libvirt users, customize via LIBVIRT_URI and LIBVIRT_NETWORK
|
||||||
exec cargo run -p orchestrator --features libvirt -- \
|
exec cargo run -p orchestrator --features libvirt -- \
|
||||||
--config "$ORCH_CONFIG" \
|
--config "$ORCH_CONFIG" \
|
||||||
|
|
|
||||||
6
TODO.txt
Normal file
6
TODO.txt
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
- Return status to codeberg: Extract repo from Webhook and keep in messages
|
||||||
|
- Make RabbitMQ Messages Print nicely
|
||||||
|
- move runner logs to debug level so they can be logged in the CI job but don't spam the deployed version
|
||||||
|
- Make Orchestrator serve the runner binaries so no external server is needed
|
||||||
|
- Make the orchestrator detect the address it will be reachable at by checking the libvirt config or, on illumos, by using its external IP
|
||||||
|
- Make VM reachable IP of the orchestrator configurable in case the setup on illumos gets more complicated (via config file)
|
||||||
|
|
@ -14,7 +14,7 @@ pub struct HttpState {
|
||||||
pub fn build_router(persist: Arc<Persist>) -> Router {
|
pub fn build_router(persist: Arc<Persist>) -> Router {
|
||||||
let state = HttpState { persist };
|
let state = HttpState { persist };
|
||||||
Router::new()
|
Router::new()
|
||||||
.route("/jobs/:request_id/logs", get(get_logs))
|
.route("/jobs/{request_id}/logs", get(get_logs))
|
||||||
.with_state(state)
|
.with_state(state)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -152,12 +152,33 @@ impl<H: Hypervisor + 'static> Scheduler<H> {
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
|
||||||
}
|
}
|
||||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Stopped).await;
|
||||||
|
// Heuristic: if the VM stopped very quickly and we have no runner logs, treat as failure
|
||||||
|
let ran_secs = start_time.elapsed().as_secs();
|
||||||
|
let logs_text = match persist.get_logs_text(item.ctx.request_id).await {
|
||||||
|
Ok(opt) => opt.unwrap_or_default(),
|
||||||
|
Err(e) => { warn!(error = %e, request_id = %item.ctx.request_id, "failed to fetch logs for completion check"); String::new() }
|
||||||
|
};
|
||||||
|
let logs_nonempty = logs_text.trim().chars().next().is_some();
|
||||||
|
let mut succeeded = true;
|
||||||
|
let mut reason: &str = "succeeded";
|
||||||
|
if ran_secs < 15 && !logs_nonempty {
|
||||||
|
// Likely cloud-init/runner never started; mark failed
|
||||||
|
succeeded = false;
|
||||||
|
reason = "failed";
|
||||||
|
warn!(request_id = %item.ctx.request_id, label = %label_key, ran_secs, "vm stopped quickly with no logs; marking job failed");
|
||||||
|
}
|
||||||
if let Err(e) = hv.destroy(h.clone()).await {
|
if let Err(e) = hv.destroy(h.clone()).await {
|
||||||
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM");
|
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM");
|
||||||
}
|
}
|
||||||
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await;
|
let _ = persist.record_vm_event(item.ctx.request_id, &h.id, overlay, seed, backend, VmPersistState::Destroyed).await;
|
||||||
let _ = persist.record_job_state(item.ctx.request_id, &item.ctx.repo_url, &item.ctx.commit_sha, Some(&item.spec.label), JobState::Succeeded).await;
|
let _ = persist.record_job_state(
|
||||||
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, "succeeded");
|
item.ctx.request_id,
|
||||||
|
&item.ctx.repo_url,
|
||||||
|
&item.ctx.commit_sha,
|
||||||
|
Some(&item.spec.label),
|
||||||
|
if succeeded { JobState::Succeeded } else { JobState::Failed }
|
||||||
|
).await;
|
||||||
|
info!(request_id = %item.ctx.request_id, label = %label_key, "job finished: {} {}", item.ctx.request_id, reason);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|
|
||||||
|
|
@ -8,3 +8,4 @@ vault = "Development"
|
||||||
|
|
||||||
[secrets]
|
[secrets]
|
||||||
OP_SERVICE_ACCOUNT_TOKEN= { provider = "age", value = "YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBKZnh1bW5rRzV5N0FNcTgrOWVIS1VSSmtqUmZ6b0k1RHJXR0Qyek14cEdBClZzdlY3Q3VWSFZydnJqMWNLSTc5V3BqUFZaUkRMSnFib3JUZHhxU2JpQXcKLS0tIGJvNnRrTHEvYTIxaEFMVFNxVkZlVTlYUEV5TmFCK0ExdDBZNFRpSkNUeUkKPKHB+as1NIejSD81EtZYsj2csqJ3hd9PHjQU39PBr5PZZD0efWeJmU67/Esen5xFfBJ9V0OY9Ola0hKZqMux4MAe+7DHuR7FKhaT9Ttfv25HA4QN/F8BLckA6kX42m0jcF8+IQBasiVmaLd+LtZXd+fNZm/S36pHBFvyZwbuCiW7ZSzi2cl27IbIxbo3bUr07p0JntqF+LOKt8Qu67il5C4T3eslaIs9QkFzrjXuVHsmrKRv+/b7LvSK8aCRJIxtDkXgppcH5CHPktIWuTeixwf1znW7UC5gm8w1I5FWQP7jRenjBrR3iV3erbSQPJk3RDAAXKTIptKVoVgikv0EMjI9Bn1K9Z8HSalc6gjyvZihOOsqnvLHsI3nFheuVVwl+G2p/lHwTrb74z+TWKZBrsR3jDlR56jwh4Au6nnv3IPa3lvd3nQ7SL6MRQfTknqyT0hDaH/2+rFv8hHA4dwFhV4nLrbfse3U1jsyLqE8EL5nLAFKOwaJfPfGnadmsaAq9xtuOffKHVcX3mBH9cKv6yvLXJldUZc+v3AFAu0N/KKdyfWe6I0q37GC1/0gJWymH5uJ59cYmSR3xJ/6mfwKg2y67m9se1o2q6qWzUe7ouuN3PNKM6NDKuAg7TUIcajZlylTyMIPUaWJR+RiZnbzAQB1BnMXQ0eAYcElfpOFP5baVc7v8nOZXycvBFXvY+IXYtN1FcvlxSCFv/icD3q4mMtWhtTcoEYpi8bmf40SEcFHXT4mM+gp57Fx6TakpwA9+r/avQoQwyi6Z3HZc6BaCUW3NMrDV39igbuNcOOF4rSE3ppZetkniZFq8apdCbj7Sy4yHp8zkczv7eJGaWHwOTjdcA2m3dOGBfraH6sYrddtvoLF7NPQQYprLsDTbp4j1sHwz1ZtwdH76cz5JmzaluHxUy8XirsmHX+Hw+GUGe/uIy0IYnQrjJuiKEIid1eptoTqfCk9olXM5lxbR50YZlgbtNxcH9E0gLm3TbmQ7quxfTS3f90RBaWPzz65DC4iFo9OBxj6dCK5ZYOQZrwK1OBuwdNlYoE+haZg6Ct0/ZcAolQQtN1AEGDfXIwocfe8IPcyEhHCKLTj6GBt4ayxD7Ajo/ZOktyLKVcNytA1vF44WjVBP3StZE0I+QDpupDJR19KHO03t9Sapq9GdpcGWA9IbO8=" }
|
OP_SERVICE_ACCOUNT_TOKEN= { provider = "age", value = "YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBKZnh1bW5rRzV5N0FNcTgrOWVIS1VSSmtqUmZ6b0k1RHJXR0Qyek14cEdBClZzdlY3Q3VWSFZydnJqMWNLSTc5V3BqUFZaUkRMSnFib3JUZHhxU2JpQXcKLS0tIGJvNnRrTHEvYTIxaEFMVFNxVkZlVTlYUEV5TmFCK0ExdDBZNFRpSkNUeUkKPKHB+as1NIejSD81EtZYsj2csqJ3hd9PHjQU39PBr5PZZD0efWeJmU67/Esen5xFfBJ9V0OY9Ola0hKZqMux4MAe+7DHuR7FKhaT9Ttfv25HA4QN/F8BLckA6kX42m0jcF8+IQBasiVmaLd+LtZXd+fNZm/S36pHBFvyZwbuCiW7ZSzi2cl27IbIxbo3bUr07p0JntqF+LOKt8Qu67il5C4T3eslaIs9QkFzrjXuVHsmrKRv+/b7LvSK8aCRJIxtDkXgppcH5CHPktIWuTeixwf1znW7UC5gm8w1I5FWQP7jRenjBrR3iV3erbSQPJk3RDAAXKTIptKVoVgikv0EMjI9Bn1K9Z8HSalc6gjyvZihOOsqnvLHsI3nFheuVVwl+G2p/lHwTrb74z+TWKZBrsR3jDlR56jwh4Au6nnv3IPa3lvd3nQ7SL6MRQfTknqyT0hDaH/2+rFv8hHA4dwFhV4nLrbfse3U1jsyLqE8EL5nLAFKOwaJfPfGnadmsaAq9xtuOffKHVcX3mBH9cKv6yvLXJldUZc+v3AFAu0N/KKdyfWe6I0q37GC1/0gJWymH5uJ59cYmSR3xJ/6mfwKg2y67m9se1o2q6qWzUe7ouuN3PNKM6NDKuAg7TUIcajZlylTyMIPUaWJR+RiZnbzAQB1BnMXQ0eAYcElfpOFP5baVc7v8nOZXycvBFXvY+IXYtN1FcvlxSCFv/icD3q4mMtWhtTcoEYpi8bmf40SEcFHXT4mM+gp57Fx6TakpwA9+r/avQoQwyi6Z3HZc6BaCUW3NMrDV39igbuNcOOF4rSE3ppZetkniZFq8apdCbj7Sy4yHp8zkczv7eJGaWHwOTjdcA2m3dOGBfraH6sYrddtvoLF7NPQQYprLsDTbp4j1sHwz1ZtwdH76cz5JmzaluHxUy8XirsmHX+Hw+GUGe/uIy0IYnQrjJuiKEIid1eptoTqfCk9olXM5lxbR50YZlgbtNxcH9E0gLm3TbmQ7quxfTS3f90RBaWPzz65DC4iFo9OBxj6dCK5ZYOQZrwK1OBuwdNlYoE+haZg6Ct0/ZcAolQQtN1AEGDfXIwocfe8IPcyEhHCKLTj6GBt4ayxD7Ajo/ZOktyLKVcNytA1vF44WjVBP3StZE0I+QDpupDJR19KHO03t9Sapq9GdpcGWA9IbO8=" }
|
||||||
|
WEBHOOK_SECRET= { default = "KVR1fxg6vaw1dfb.wpa", value = "KVR1fxg6vaw1dfb.wpa" }
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue