Initial Commit

Signed-off-by: Till Wegmueller <toasterson@gmail.com>
This commit is contained in:
Till Wegmueller 2025-10-25 20:00:32 +02:00
parent 38230f2787
commit a71f9cc7d1
No known key found for this signature in database
38 changed files with 3575 additions and 0 deletions

106
.gitignore vendored Normal file
View file

@ -0,0 +1,106 @@
### JetBrains+all template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Rust template
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
# RustRover
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### rust-analyzer template
# Can be generated by build systems other than cargo (ex: bazelbuild/rules_rust)
rust-project.json

8
.idea/.gitignore generated vendored Normal file
View file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

8
.idea/modules.xml generated Normal file
View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/solstice-ci.iml" filepath="$PROJECT_DIR$/.idea/solstice-ci.iml" />
</modules>
</component>
</project>

18
.idea/solstice-ci.iml generated Normal file
View file

@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="EMPTY_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/crates/agent/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/ciadm/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/cidev/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/common/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/forge-integration/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/github-integration/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/orchestrator/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/crates/workflow-runner/src" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml generated Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

515
.junie/guidelines.md Normal file
View file

@ -0,0 +1,515 @@
# Solstice CI — Engineering Guidelines
This document records project-specific build, test, and development conventions for the Solstice CI workspace. It is written for experienced Rust developers and focuses only on details that are unique to this repository.
- Workspace root: `solstice-ci/`
- Crates: `crates/agent`, `crates/orchestrator`, `crates/forge-integration`, `crates/github-integration`, `crates/common`
- Rust edition: `2024`
- Docs policy: Any AI-generated markdown summaries not explicitly requested in a prompt must live under `docs/ai/` with a timestamp prefix in the filename (e.g., `2025-10-25-some-topic.md`).
## 1. Build and Configuration
- Use the stable toolchain unless explicitly noted. The workspace uses the 2024 edition; keep `rustup` and `cargo` updated.
- Top-level build:
- Build everything: `cargo build --workspace`
- Run individual binaries during development using `cargo run -p <crate>`.
- Lints and formatting follow the default Rust style unless a crate specifies otherwise. Prefer `cargo fmt` and `cargo clippy --workspace --all-targets --all-features` before committing.
- Secrets and credentials are never committed. For local runs, use environment variables or a `.env` provider (do not add `.env` to VCS). In CI/deployments, use a secret store (e.g., Vault, KMS) — see the Integration layer notes.
Common configuration environment variables (pattern; per-service variables may diverge):
- Logging and tracing:
- `RUST_LOG` (or use `tracing-subscriber` filters)
- `OTEL_EXPORTER_OTLP_ENDPOINT` (e.g., `http://localhost:4317`)
- `OTEL_SERVICE_NAME` (e.g., `solstice-orchestrator`)
- Database (Postgres via SeaORM):
- `DATABASE_URL=postgres://user:pass@host:5432/solstice`
- Object storage (S3-compatible) and filesystem:
- `S3_ENDPOINT`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION`, `S3_BUCKET`
- `BLOB_FS_ROOT` (filesystem-backed blob store root)
## 2. Error Handling, Tracing, and Telemetry
We standardize on the `miette` + `thiserror` error pattern and `tracing` with OpenTelemetry.
- `thiserror` is used for domain error enums/structs; avoid stringly-typed errors.
- Wrap top-level errors with `miette::Report` for rich diagnostic output in CLIs and service logs. Prefer `eyre`-style ergonomics via `miette::Result<T>` where appropriate.
- Use `tracing` over `log`. Never mix ad-hoc `println!` in services; `println!` remains acceptable in intentionally minimal binaries or for very early bootstrap before subscribers are set.
- Emit spans/resources with OpenTelemetry. The shared initialization lives in the `common` crate so all binaries get consistent wiring.
Recommended initialization (placed in `crates/common` as a single public function and called from each service `main`):
```rust
// common/src/telemetry.rs (example; keep in `common`)
pub struct TelemetryGuard {
// Drop to flush and shutdown OTEL pipeline.
_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
}
pub fn init_tracing(service_name: &str) -> miette::Result<TelemetryGuard> {
use miette::IntoDiagnostic;
use opentelemetry::sdk::{propagation::TraceContextPropagator, Resource};
use opentelemetry::sdk::trace as sdktrace;
use opentelemetry::KeyValue;
use opentelemetry_otlp::WithExportConfig;
use tracing_subscriber::{EnvFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt};
// Resource describing this service.
let resource = Resource::new(vec![KeyValue::new("service.name", service_name.to_string())]);
// OTLP exporter (gRPC) if endpoint present; otherwise, only console output.
let tracer = opentelemetry_otlp::new_pipeline()
.tracing()
.with_trace_config(sdktrace::Config::default().with_resource(resource))
.with_exporter(opentelemetry_otlp::new_exporter().tonic())
.install_batch(opentelemetry::runtime::Tokio)
.into_diagnostic()?;
// Optional non-blocking file appender (example) — keep simple unless needed.
let (nb_writer, guard) = tracing_appender::non_blocking(std::io::stderr());
let fmt_layer = fmt::layer()
.with_target(false)
.with_writer(nb_writer)
.with_ansi(atty::is(atty::Stream::Stderr));
let otel_layer = tracing_opentelemetry::layer().with_tracer(tracer);
let filter = EnvFilter::try_from_default_env()
.or_else(|_| Ok(EnvFilter::new("info")))
.into_diagnostic()?;
opentelemetry::global::set_text_map_propagator(TraceContextPropagator::new());
tracing_subscriber::registry()
.with(filter)
.with(fmt_layer)
.with(otel_layer)
.init();
Ok(TelemetryGuard { _guard: Some(guard) })
}
```
In each service binary:
```rust
fn main() -> miette::Result<()> {
let _telemetry = common::telemetry::init_tracing("solstice-orchestrator")?;
// ...
Ok(())
}
```
Guidance:
- Favor spans around orchestration lifecycles (VM provisioning, job execution, webhook processing). Include high-cardinality attributes judiciously; prefer stable keys (e.g., job_id) and sample IDs for verbose content.
- When returning errors from service entrypoints, prefer `miette::Report` with context to produce actionable diagnostics.
## 3. Database Access — SeaORM + Postgres
- Use SeaORM for all relational access; keep raw SQL to migrations or perf-critical paths only.
- Model organization:
- Entities live in a dedicated `entities` module/crate generated by SeaORM tooling or handwritten as needed.
- Queries should be implemented in small, testable functions; avoid mixing business logic with query composition.
- Migrations:
- Create a migrations crate (e.g., `crates/migration`) using SeaORM CLI and point it at Postgres via `DATABASE_URL`.
- Typical workflow:
- `sea-orm-cli migrate init`
- `sea-orm-cli migrate generate <name>`
- Edit up/down migrations.
- `sea-orm-cli migrate up` (or `refresh` in dev)
- In services, run migrations on boot behind a feature flag or dedicated admin command.
- Connection management:
- Create a connection pool per service at startup (Tokio + async). Inject the pool into subsystems rather than using globals.
- Use timeouts and statement logging (via `tracing`) for observability.
### 3.1 Connection Pooling — deadpool (Postgres)
We standardize on `deadpool` for async connection pooling to Postgres, using `deadpool-postgres` (+ `tokio-postgres`) alongside SeaORM. Services construct a single pool at startup and pass it into subsystems.
Guidelines:
- Configuration via env:
- `DATABASE_URL` (Postgres connection string)
- `DB_POOL_MAX_SIZE` (e.g., 16–64 depending on workload)
- `DB_POOL_TIMEOUT_MS` (acquire timeout; default 5_000ms)
- Enable TLS when required by your Postgres deployment (use `rustls` variants where possible). Avoid embedding credentials in code; prefer env/secret stores.
- Instrument queries with `tracing`. Prefer statement logging at `debug` with sampling in high-traffic paths.
Minimal setup sketch:
```rust
use std::time::Duration;
use deadpool_postgres::{Config as PgConfig, ManagerConfig, RecyclingMethod};
use miette::{IntoDiagnostic, Result};
use tokio_postgres::NoTls; // or a TLS connector
pub struct DbPool(deadpool_postgres::Pool);
pub fn build_db_pool() -> Result<DbPool> {
let mut cfg = PgConfig::new();
cfg.dbname = None; // use url
cfg.user = None; // prefer url/env
cfg.password = None;
cfg.host = None;
cfg.port = None;
cfg.manager = Some(ManagerConfig { recycling_method: RecyclingMethod::Fast });
// Use URL from env; deadpool-postgres supports it via from_env as well.
let mut cfg = PgConfig::from_env("DATABASE").unwrap_or(cfg);
let pool = cfg
.create_pool(Some(deadpool_postgres::Runtime::Tokio1), NoTls)
.into_diagnostic()?;
// Optionally wrap with a newtype to pass around.
Ok(DbPool(pool))
}
pub async fn with_client<F, Fut, T>(pool: &DbPool, f: F) -> Result<T>
where
F: FnOnce(deadpool_postgres::Client) -> Fut,
Fut: std::future::Future<Output = Result<T>>,
{
use tokio::time::timeout;
let client = timeout(
Duration::from_millis(std::env::var("DB_POOL_TIMEOUT_MS").ok()
.and_then(|s| s.parse().ok()).unwrap_or(5_000)),
pool.0.get(),
)
.await
.into_diagnostic()??;
f(client).await
}
```
Notes:
- Use one pool per service. Do not create pools per request.
- If using SeaORM, prefer constructing `Database::connect` from a `tokio_postgres::Client` via `sqlx` feature compatibility where applicable, or keep SeaORM for ORM and use raw pooled clients for admin/utility tasks.
## 4. Blob Storage — S3 and Filesystem
We support both S3-compatible object storage and a local filesystem backend.
- Prefer a storage abstraction in `common` with a trait like `BlobStore` and two implementations:
- `S3BlobStore` using the official `aws-sdk-s3` client (or a vetted S3-compatible client).
- `FsBlobStore` rooted at `BLOB_FS_ROOT` for local/dev and tests.
- Selection is via configuration (env or CLI). Keep keys and endpoints out of the repo.
- For S3, follow best practices:
- Use instance/role credentials where available; otherwise use env creds.
- Set timeouts, retries, and backoff via the client config.
- Keep bucket and path layout stable; include job IDs in keys for traceability.
- For filesystem, ensure directories are created atomically and validate paths to avoid traversal issues.
## 5. Argument Parsing — Clap
- All binaries use `clap` for flags/subcommands. Derive-based APIs (`#[derive(Parser)]`) are strongly preferred for consistency and help text quality.
- Align flags across services where semantics match (e.g., `--log-level`, `--database-url`, `--s3-endpoint`).
- Emit `--help` that documents env var fallbacks where appropriate.
Example skeleton:
```rust
use clap::Parser;
#[derive(Parser, Debug)]
#[command(name = "solstice-orchestrator", version, about)]
struct Opts {
/// Postgres connection string
#[arg(long, env = "DATABASE_URL")]
database_url: String,
/// OTLP endpoint (e.g., http://localhost:4317)
#[arg(long, env = "OTEL_EXPORTER_OTLP_ENDPOINT")]
otlp: Option<String>,
}
fn main() -> miette::Result<()> {
let _t = common::telemetry::init_tracing("solstice-orchestrator")?;
let opts = Opts::parse();
// ...
Ok(())
}
```
## 6. Testing — How We Configure and Run Tests
- Unit tests: colocated in module files under `#[cfg(test)]`.
- Integration tests: per-crate `tests/` directory. Each `*.rs` compiles as a separate test binary.
- Doc tests: keep examples correct or mark them `no_run` if they require external services.
- Workspace commands we actually verified:
- Run all tests: `cargo test --workspace`
- Run a single crate's integration test (example we executed): `cargo test -p agent --test smoke`
Adding a new integration test:
1. Create `crates/<crate>/tests/<name>.rs`.
2. Write tests using the public API of the crate; avoid `#[cfg(test)]` internals.
3. Use `#[tokio::test(flavor = "multi_thread")]` when async runtime is needed.
4. Gate external dependencies (DB, S3) behind env flags or mark tests `ignore` by default.
Example minimal integration test (we created and validated this during documentation):
```rust
#[test]
fn smoke_passes() {
assert_eq!(2 + 2, 4);
}
```
Running subsets and filters:
- By crate: `cargo test -p orchestrator`
- By test target: `cargo test -p agent --test smoke`
- By name filter: `cargo test smoke`
## 7. Code Style and Conventions
- Error design:
- Domain errors via `thiserror`. Convert them to `miette::Report` at boundaries (CLI/service entry) with context.
- Prefer `Result<T, E>` where `E: Into<miette::Report>` for top-level flows.
- Telemetry:
- Every request/job gets a root span; child spans for significant phases. Include IDs in span fields, not only logs.
- When logging sensitive data, strip or hash before emitting.
- Async and runtime:
- Tokio everywhere for async services. Use a single runtime per process; don't nest.
- Use `tracing` instrument macros (`#[instrument]`) for important async fns.
- Crate versions:
- Always prefer the most recent compatible releases for `miette`, `thiserror`, `tracing`, `tracing-subscriber`, `opentelemetry`, `opentelemetry-otlp`, `tracing-opentelemetry`, `sea-orm`, `sea-orm-cli`, `aws-sdk-s3`, and `clap`.
- Avoid pinning minor/patch versions unless required for reproducibility or to work around regressions.
## 8. Local Development Tips
- Reproducible tasks: For interactions with external systems, prefer containerized dependencies (e.g., Postgres, MinIO) or `devcontainer`/`nix` flows if provided in the future.
- Migration safety: Never auto-apply migrations in production without a maintenance window and backup. In dev, `refresh` is acceptable.
- Storage backends: Provide a no-op or filesystem fallback so developers can run without cloud credentials.
- Observability: Keep logs structured and at `info` by default. For deep debugging, use `RUST_LOG=trace` with sampling to avoid log storms.
## 9. Troubleshooting Quick Notes
- No logs emitted: Ensure the tracing subscriber is initialized exactly once; double-init will panic. Also check `RUST_LOG` filters.
- OTLP export fails: Verify `OTEL_EXPORTER_OTLP_ENDPOINT` and that an OTLP collector (e.g., `otelcol`) is reachable. Fall back to console-only tracing if needed.
- DB connection errors: Validate `DATABASE_URL` and SSL/TLS requirements. Confirm the service can resolve the host and reach the port.
- S3 errors: Check credentials and bucket permissions; verify endpoint (for MinIO use path-style or specific region settings).
## 10. Documentation Routing
- Architecture and AI-generated summaries: place in `docs/ai/` with a timestamp prefix.
- This guidelines file intentionally lives under `.junie/guidelines.md` for developer tooling.
## 11. Messaging — RabbitMQ (lapin)
We use `lapin` (AMQP 0-9-1 client) for RabbitMQ access with Tokio. Keep producers and consumers simple and observable; centralize connection setup in the `common` crate where practical and inject channels into subsystems. This channel is used for asynchronous communication. For direct, synchronous service-to-service RPC, use gRPC via `tonic` (see §12).
Configuration (env; per-service may extend):
- `AMQP_URL` (e.g., `amqp://user:pass@host:5672/%2f` or `amqps://...` for TLS)
- `AMQP_PREFETCH` (QoS prefetch; default 32–256 depending on workload)
- `AMQP_EXCHANGE` (default exchange name; often empty-string for default direct exchange)
- `AMQP_QUEUE` (queue name)
- `AMQP_ROUTING_KEY` (routing key when publishing)
Guidelines:
- Establish one `Connection` per process with automatic heartbeats; create dedicated `Channel`s per producer/consumer task.
- Declare exchanges/queues idempotently at startup with `durable = true` and `auto_delete = false` unless explicitly ephemeral.
- Set channel QoS with `basic_qos(prefetch_count)` to control backpressure. Use ack/nack to preserve at-least-once delivery.
- Prefer publisher confirms (`confirm_select`) and handle `BasicReturn` for unroutable messages when `mandatory` is set.
- Instrument with `tracing`: tag spans with `exchange`, `queue`, `routing_key`, and message IDs; avoid logging bodies.
- Reconnection: on connection/channel error, back off with jitter and recreate connection/channels; ensure consumers re-declare topology.
Minimal producer example:
```rust
use lapin::{options::*, types::FieldTable, BasicProperties, Connection, ConnectionProperties};
use miette::{IntoDiagnostic as _, Result};
use tokio_amqp::*; // enables Tokio reactor for lapin
pub async fn publish_one(msg: &[u8]) -> Result<()> {
let url = std::env::var("AMQP_URL").unwrap_or_else(|_| "amqp://127.0.0.1:5672/%2f".into());
let exchange = std::env::var("AMQP_EXCHANGE").unwrap_or_default();
let routing_key = std::env::var("AMQP_ROUTING_KEY").unwrap_or_else(|_| "jobs".into());
let conn = Connection::connect(&url, ConnectionProperties::default().with_tokio())
.await
.into_diagnostic()?;
let channel = conn.create_channel().await.into_diagnostic()?;
// Optional: declare exchange if non-empty.
if !exchange.is_empty() {
channel
.exchange_declare(
&exchange,
lapin::ExchangeKind::Direct,
ExchangeDeclareOptions { durable: true, auto_delete: false, ..Default::default() },
FieldTable::default(),
)
.await
.into_diagnostic()?;
}
// Publisher confirms
channel.confirm_select(ConfirmSelectOptions::default()).await.into_diagnostic()?;
let confirm = channel
.basic_publish(
&exchange,
&routing_key,
BasicPublishOptions { mandatory: true, ..Default::default() },
msg,
BasicProperties::default().with_content_type("application/octet-stream".into()),
)
.await
.into_diagnostic()?;
confirm.await.into_diagnostic()?; // wait for confirm
Ok(())
}
```
Minimal consumer example:
```rust
use lapin::{options::*, types::FieldTable, Connection, ConnectionProperties};
use miette::{IntoDiagnostic as _, Result};
use tokio_amqp::*;
pub async fn consume() -> Result<()> {
let url = std::env::var("AMQP_URL").unwrap_or_else(|_| "amqp://127.0.0.1:5672/%2f".into());
let queue = std::env::var("AMQP_QUEUE").unwrap_or_else(|_| "jobs".into());
let prefetch: u16 = std::env::var("AMQP_PREFETCH").ok().and_then(|s| s.parse().ok()).unwrap_or(64);
let conn = Connection::connect(&url, ConnectionProperties::default().with_tokio())
.await
.into_diagnostic()?;
let channel = conn.create_channel().await.into_diagnostic()?;
channel
.queue_declare(
&queue,
QueueDeclareOptions { durable: true, auto_delete: false, ..Default::default() },
FieldTable::default(),
)
.await
.into_diagnostic()?;
channel.basic_qos(prefetch, BasicQosOptions { global: false }).await.into_diagnostic()?;
let mut consumer = channel
.basic_consume(
&queue,
"worker",
BasicConsumeOptions { no_ack: false, ..Default::default() },
FieldTable::default(),
)
.await
.into_diagnostic()?;
while let Some(delivery) = consumer.next().await {
let delivery = delivery.into_diagnostic()?;
// process delivery.data ...
channel.basic_ack(delivery.delivery_tag, BasicAckOptions::default()).await.into_diagnostic()?;
}
Ok(())
}
```
Notes:
- Use TLS (`amqps://`) where brokers require it; configure certificates via the underlying TLS connector if needed.
- Keep message payloads schematized (e.g., JSON/CBOR/Protobuf) and versioned; include an explicit `content_type` and version header where applicable.
## 12. RPC — gRPC with tonic
We use `gRPC` for direct, synchronous service-to-service communication and standardize on the Rust `tonic` stack with `prost` for code generation. Prefer RabbitMQ (see §11) for asynchronous workflows, fan-out, or buffering; prefer gRPC when a caller needs an immediate response, strong request/response semantics, deadlines, and backpressure at the transport layer.
Guidance:
- Crates: `tonic`, `prost`, `prost-types`, optionally `tonic-health` (liveness/readiness) and `tonic-reflection` (dev only).
- Service boundaries: Define protobuf packages per domain (`orchestrator.v1`, `agent.v1`). Version packages; keep backward compatible changes (field additions with new tags, do not reuse/rename tags).
- Errors: Map domain errors to `tonic::Status` with appropriate `Code` (e.g., `InvalidArgument`, `NotFound`, `FailedPrecondition`, `Unavailable`, `Internal`). Preserve rich diagnostics in logs via `miette` and `tracing`; avoid leaking internals to clients.
- Deadlines and cancellation: Require callers to set deadlines; servers must honor `request.deadline()` and `request.extensions()` cancellation. Set sensible server timeouts.
- Observability: Propagate W3C TraceContext over gRPC metadata and create spans per RPC. Emit attributes for `rpc.system = "grpc"`, `rpc.service`, `rpc.method`, stable IDs (job_id) as fields.
- Security: Prefer TLS with `rustls`. Use mTLS where feasible, or bearer tokens in metadata (e.g., `authorization: Bearer ...`). Rotate certs without restarts where possible.
- Operations: Configure keepalive, max message sizes, and concurrency. Use streaming RPCs for log streaming and large artifact transfers when applicable.
Common env configuration (typical; per-service may extend):
- `GRPC_ADDR` or `GRPC_HOST`/`GRPC_PORT` (listen/connect endpoint, e.g., `0.0.0.0:50051`)
- `GRPC_TLS_CERT`, `GRPC_TLS_KEY`, `GRPC_TLS_CA` (PEM paths for TLS/mTLS)
- `GRPC_KEEPALIVE_MS` (e.g., 20_000), `GRPC_MAX_MESSAGE_MB` (e.g., 32), `GRPC_TIMEOUT_MS` (client default)
Minimal server skeleton (centralize wiring in `common`):
```rust
use miette::{IntoDiagnostic as _, Result};
use tonic::{transport::Server, Request, Response, Status};
use tracing::{info, instrument};
// Assume prost-generated module: pub mod orchestrator { pub mod v1 { include!("orchestrator.v1.rs"); }}
use orchestrator::v1::{job_service_server::{JobService, JobServiceServer}, StartJobRequest, StartJobResponse};
pub struct JobSvc;
#[tonic::async_trait]
impl JobService for JobSvc {
#[instrument(name = "rpc_start_job", skip(self, req), fields(rpc.system = "grpc", rpc.service = "orchestrator.v1.JobService", rpc.method = "StartJob"))]
async fn start_job(&self, req: Request<StartJobRequest>) -> Result<Response<StartJobResponse>, Status> {
// Respect caller deadline/cancellation
if req.deadline().elapsed().is_ok() { return Err(Status::deadline_exceeded("deadline passed")); }
let r = req.into_inner();
info!(job_id = %r.job_id, "start_job request");
// ... do work, map domain errors to Status ...
Ok(Response::new(StartJobResponse { accepted: true }))
}
}
pub async fn serve(addr: std::net::SocketAddr) -> Result<()> {
let _t = common::telemetry::init_tracing("solstice-orchestrator");
Server::builder()
.add_service(JobServiceServer::new(JobSvc))
.serve(addr)
.await
.into_diagnostic()
}
```
Minimal client sketch (with timeout and TLS optional):
```rust
use miette::{IntoDiagnostic as _, Result};
use tonic::{transport::{Channel, ClientTlsConfig, Endpoint}, Request, Code};
use orchestrator::v1::{job_service_client::JobServiceClient, StartJobRequest};
pub async fn start_job(addr: &str, job_id: String) -> Result<bool> {
let mut ep = Endpoint::from_shared(format!("https://{addr}"))?.tcp_keepalive(Some(std::time::Duration::from_secs(20)));
// ep = ep.tls_config(ClientTlsConfig::new())?; // enable when TLS configured
let channel: Channel = ep.connect_timeout(std::time::Duration::from_secs(3)).connect().await.into_diagnostic()?;
let mut client = JobServiceClient::new(channel);
let mut req = Request::new(StartJobRequest { job_id });
req.set_timeout(std::time::Duration::from_secs(10));
match client.start_job(req).await {
Ok(resp) => Ok(resp.into_inner().accepted),
Err(status) if status.code() == Code::Unavailable => Ok(false), // map transient
Err(e) => Err(miette::miette!("gRPC error: {e}")),
}
}
```
Health and reflection:
- Expose `grpc.health.v1.Health` via `tonic-health` for k8s/consumers. Include a readiness check (DB pool, AMQP connection) before reporting `SERVING`.
- Enable `tonic-reflection` only in dev/test to assist tooling; disable in prod.
Testing notes:
- Use `#[tokio::test(flavor = "multi_thread")]` and bind the server to `127.0.0.1:0` (ephemeral port) for integration tests.
- Assert deadlines and cancellation by setting short timeouts on the client `Request` and verifying `Status::deadline_exceeded`.
Cross-reference:
- Asynchronous communication: RabbitMQ via `lapin` (§11).
- Direct synchronous RPC: gRPC via `tonic` (this section).

3
Cargo.toml Normal file
View file

@ -0,0 +1,3 @@
[workspace]
members = ["crates/*"]
resolver = "3"

11
crates/ciadm/Cargo.toml Normal file
View file

@ -0,0 +1,11 @@
[package]
name = "ciadm"
version = "0.1.0"
edition = "2024"
[dependencies]
clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] }
tracing = "0.1"
common = { path = "../common" }
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }

51
crates/ciadm/src/main.rs Normal file
View file

@ -0,0 +1,51 @@
use clap::{Parser, Subcommand};
use miette::{IntoDiagnostic, Result};
use tracing::{info, warn};
// Top-level CLI for the `ciadm` admin tool. Plain `//` comments are used here
// deliberately: clap-derive turns `///` doc comments into --help text, and the
// help output is pinned by the explicit `about` attribute below.
#[derive(Parser, Debug)]
#[command(name = "ciadm", version, about = "Solstice CI Admin CLI")]
struct Cli {
    // The chosen subcommand; clap dispatches into the `Commands` enum.
    #[command(subcommand)]
    command: Commands,
}
// Subcommands understood by `ciadm`. The `///` doc comments on variants and
// fields are surfaced verbatim by clap as --help text, so they stay unchanged.
#[derive(Subcommand, Debug)]
enum Commands {
    /// Trigger a workflow run for a repository ref
    Trigger {
        /// Repository URL (e.g., https://github.com/org/repo.git)
        #[arg(long)]
        repo: String,
        /// Git ref or SHA
        #[arg(long, default_value = "main")]
        r#ref: String, // raw identifier: `ref` is a Rust keyword
        /// Path to workflow KDL file in repo
        #[arg(long, default_value = ".solstice/workflow.kdl")]
        workflow: String,
    },
    /// Query status of a job by ID
    Status {
        /// Job ID
        #[arg(long)]
        job_id: String,
    },
}
#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> {
let _t = common::init_tracing("ciadm")?;
let cli = Cli::parse();
match cli.command {
Commands::Trigger { repo, r#ref, workflow } => {
info!(%repo, %r#ref, %workflow, "trigger requested");
// TODO: Call orchestrator API to enqueue job
println!("Triggered job for {repo}@{ref} using {workflow}", r#ref = r#ref);
}
Commands::Status { job_id } => {
info!(%job_id, "status requested");
// TODO: Query orchestrator for job status
println!("Job {job_id} status: PENDING (stub)");
}
}
Ok(())
}

11
crates/cidev/Cargo.toml Normal file
View file

@ -0,0 +1,11 @@
[package]
name = "cidev"
version = "0.1.0"
edition = "2024"
[dependencies]
common = { path = "../common" }
clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] }
tracing = "0.1"
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }

59
crates/cidev/src/main.rs Normal file
View file

@ -0,0 +1,59 @@
use clap::{Parser, Subcommand};
use miette::Result;
use tracing::info;
// Top-level CLI for the `cidev` developer tool. `//` comments only: clap-derive
// would fold `///` doc comments into --help, which is pinned by `about` below.
#[derive(Parser, Debug)]
#[command(name = "cidev", version, about = "Solstice CI Dev CLI — validate and inspect KDL workflows")]
struct Cli {
    // The chosen subcommand; clap dispatches into the `Command` enum.
    #[command(subcommand)]
    command: Command,
}
// Subcommands understood by `cidev`; each takes `--path` to a workflow KDL file.
// The `///` doc comments are surfaced by clap as --help text, so they stay as-is.
#[derive(Subcommand, Debug)]
enum Command {
    /// Validate a workflow KDL file
    Validate { #[arg(long)] path: String },
    /// List jobs in a workflow
    List { #[arg(long)] path: String },
    /// Show a job's steps (by job id)
    Show { #[arg(long)] path: String, #[arg(long)] job: String },
}
#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> {
    // Initialize tracing first so parse/validation failures are captured.
    let _telemetry = common::init_tracing("cidev")?;
    match Cli::parse().command {
        Command::Validate { path } => validate(&path),
        Command::List { path } => list(&path),
        Command::Show { path, job } => show(&path, &job),
    }
}

// `validate`: parse the file and report the (optional) workflow name and job count.
fn validate(path: &str) -> Result<()> {
    let wf = common::parse_workflow_file(path)?;
    let label = wf.name.as_ref().map(|n| format!(" '{n}'")).unwrap_or_default();
    println!("OK: parsed workflow{} with {} job(s)", label, wf.jobs.len());
    Ok(())
}

// `list`: print one line per job id, appending the runner selector when present.
fn list(path: &str) -> Result<()> {
    let wf = common::parse_workflow_file(path)?;
    for (id, job) in wf.jobs {
        let suffix = job.runs_on.map(|ro| format!(" (runs_on: {ro})")).unwrap_or_default();
        println!("{id}{suffix}");
    }
    Ok(())
}

// `show`: print one job's metadata and numbered steps; exits with status 1
// (after an error message on stderr) when the job id is unknown.
fn show(path: &str, job: &str) -> Result<()> {
    let wf = common::parse_workflow_file(path)?;
    let Some(j) = wf.jobs.get(job) else {
        eprintln!("No such job id: {job}");
        std::process::exit(1)
    };
    println!("Job: {}", j.id);
    if let Some(ro) = &j.runs_on {
        println!("runs_on: {ro}");
    }
    for (i, s) in j.steps.iter().enumerate() {
        let name = s.name.as_deref().unwrap_or("(unnamed)");
        println!("- Step {}/{}: {}", i + 1, j.steps.len(), name);
        println!(" run: {}", s.run);
    }
    Ok(())
}

26
crates/common/Cargo.toml Normal file
View file

@ -0,0 +1,26 @@
[package]
name = "common"
version = "0.1.0"
edition = "2024"
[dependencies]
miette = { version = "7", features = ["fancy"] }
thiserror = "1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
opentelemetry = { version = "0.26", features = ["trace"] }
opentelemetry_sdk = { version = "0.26", features = ["rt-tokio"] }
opentelemetry-otlp = { version = "0.26", features = ["grpc-tonic"] }
tracing-opentelemetry = "0.27"
tracing-appender = "0.2"
atty = "0.2"
kdl = "4"
# messaging + serialization
lapin = "2"
tokio-amqp = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
uuid = { version = "1", features = ["serde", "v4"] }
time = { version = "0.3", features = ["serde", "macros"] }
futures-util = "0.3"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "time"] }

94
crates/common/src/job.rs Normal file
View file

@ -0,0 +1,94 @@
use std::{collections::BTreeMap, fs, path::Path};
use kdl::{KdlDocument, KdlNode};
use miette::{IntoDiagnostic, Report, Result};
/// A parsed CI workflow: optional display name plus jobs keyed by id.
#[derive(Debug, Clone)]
pub struct Workflow {
/// Optional `name` property of the `workflow` root node.
pub name: Option<String>,
/// Keyed by `Job::id`; BTreeMap keeps iteration order deterministic.
pub jobs: BTreeMap<String, Job>,
}
/// A single job within a workflow.
#[derive(Debug, Clone)]
pub struct Job {
pub id: String,
/// Optional scheduling hint; free-form string at this layer.
pub runs_on: Option<String>,
pub steps: Vec<Step>,
}
/// One step of a job, in document order.
#[derive(Debug, Clone)]
pub struct Step {
pub name: Option<String>,
/// Command text from the step's `run` property; execution lives elsewhere.
pub run: String,
}
/// Read a KDL workflow file from disk and parse it into a [`Workflow`].
pub fn parse_workflow_file<P: AsRef<Path>>(path: P) -> Result<Workflow> {
    let contents = fs::read_to_string(path).into_diagnostic()?;
    parse_workflow_str(&contents)
}
/// Parse KDL workflow text into a [`Workflow`].
///
/// The document must contain a top-level `workflow` node; its optional
/// `name` property and any `job` children are collected. Jobs are keyed by
/// their `id`, so a duplicate id overwrites the earlier entry.
pub fn parse_workflow_str(s: &str) -> Result<Workflow> {
    let doc: KdlDocument = s.parse().into_diagnostic()?;
    // Locate the mandatory `workflow` root node.
    let root = match doc.nodes().iter().find(|n| n.name().value() == "workflow") {
        Some(n) => n,
        None => return Err(Report::msg("missing `workflow {}` root node")),
    };
    let name = root
        .get("name")
        .and_then(|e| e.value().as_string())
        .map(str::to_string);
    // Collect `job` children from the node body, keyed by id.
    let mut jobs = BTreeMap::new();
    for node in root.children().map(|c| c.nodes()).unwrap_or(&[]) {
        if node.name().value() != "job" {
            continue;
        }
        let job = parse_job(node)?;
        jobs.insert(job.id.clone(), job);
    }
    Ok(Workflow { name, jobs })
}
/// Parse a `job` node: requires a string `id` property, accepts an optional
/// `runs_on`, and collects `step` children in document order.
fn parse_job(node: &KdlNode) -> Result<Job> {
    let id = match node.get("id").and_then(|e| e.value().as_string()) {
        Some(v) => v.to_string(),
        None => return Err(Report::msg("job missing string `id` property")),
    };
    let runs_on = node
        .get("runs_on")
        .and_then(|e| e.value().as_string())
        .map(str::to_string);
    let mut steps = Vec::new();
    for child in node.children().map(|c| c.nodes()).unwrap_or(&[]) {
        if child.name().value() == "step" {
            steps.push(parse_step(child)?);
        }
    }
    Ok(Job { id, runs_on, steps })
}
/// Parse a `step` node: requires a string `run` property and accepts an
/// optional human-readable `name`.
fn parse_step(node: &KdlNode) -> Result<Step> {
    // Shared lookup for optional string properties on this node.
    let string_prop = |key: &str| {
        node.get(key)
            .and_then(|e| e.value().as_string())
            .map(str::to_string)
    };
    let run = string_prop("run")
        .ok_or_else(|| Report::msg("step missing string `run` property"))?;
    let name = string_prop("name");
    Ok(Step { name, run })
}

9
crates/common/src/lib.rs Normal file
View file

@ -0,0 +1,9 @@
//! Shared building blocks for Solstice CI: tracing setup, KDL workflow
//! parsing, the versioned job-request schema, and AMQP plumbing.
pub mod telemetry;
pub mod job;
pub mod messages;
pub mod mq;
// Re-export the public surface so downstream crates use `common::X` directly.
pub use telemetry::{init_tracing, TelemetryGuard};
pub use job::{Workflow, Job, Step, parse_workflow_str, parse_workflow_file};
pub use messages::{JobRequest, SourceSystem};
pub use mq::{MqConfig, publish_job, consume_jobs};

View file

@ -0,0 +1,54 @@
use serde::{Deserialize, Serialize};
use time::{OffsetDateTime};
use uuid::Uuid;
/// Versioned internal job request schema published to the message bus.
/// Keep additions backward compatible; never reuse or repurpose existing fields.
/// Versioned internal job request schema published to the message bus.
/// Keep additions backward compatible; never reuse or repurpose existing fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JobRequest {
/// Schema identifier for routing and evolution.
/// Defaults to "jobrequest.v1" when absent so older payloads still parse.
#[serde(default = "default_schema_version")]
pub schema_version: String, // e.g., "jobrequest.v1"
/// Unique request identifier for idempotency and tracing correlation.
pub request_id: Uuid,
/// Source system of this request (forge or manual trigger).
pub source: SourceSystem,
/// Repository clone URL (SSH or HTTPS).
pub repo_url: String,
/// Commit SHA to check out.
pub commit_sha: String,
/// Optional path to the workflow file within the repo (KDL).
pub workflow_path: Option<String>,
/// Optional specific job id from the workflow to run.
pub workflow_job_id: Option<String>,
/// Optional scheduling hint selecting a base image or host group.
pub runs_on: Option<String>,
/// Submission timestamp (UTC).
/// NOTE(review): serialized with `time`'s default serde representation —
/// confirm all consumers agree on the wire format before v1 freezes.
pub submitted_at: OffsetDateTime,
}
/// Default schema tag stamped on requests that omit `schema_version`.
fn default_schema_version() -> String {
    String::from("jobrequest.v1")
}
/// Originating system of a job request; serialized as lowercase snake_case
/// strings ("github", "forgejo", "manual").
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SourceSystem {
Github,
Forgejo,
Manual,
}
impl JobRequest {
pub fn new(source: SourceSystem, repo_url: impl Into<String>, commit_sha: impl Into<String>) -> Self {
Self {
schema_version: default_schema_version(),
request_id: Uuid::new_v4(),
source,
repo_url: repo_url.into(),
commit_sha: commit_sha.into(),
workflow_path: None,
workflow_job_id: None,
runs_on: None,
submitted_at: OffsetDateTime::now_utc(),
}
}
}

246
crates/common/src/mq.rs Normal file
View file

@ -0,0 +1,246 @@
use std::time::Duration;
use futures_util::StreamExt;
use lapin::{
options::{
BasicAckOptions, BasicConsumeOptions, BasicNackOptions, BasicPublishOptions, BasicQosOptions,
ConfirmSelectOptions, ExchangeDeclareOptions, QueueBindOptions, QueueDeclareOptions,
},
types::{AMQPValue, FieldTable, LongString, ShortString},
BasicProperties, Channel, Connection, ConnectionProperties, Consumer,
};
use miette::{IntoDiagnostic as _, Result};
use tracing::{debug, error, info, instrument, warn};
use tracing::Instrument;
use crate::messages::JobRequest;
/// AMQP wiring for the job-request pipeline. Every field can be overridden
/// via an `AMQP_*` environment variable (see the `Default` impl).
#[derive(Clone, Debug)]
pub struct MqConfig {
/// Broker URL, e.g. `amqp://127.0.0.1:5672/%2f`.
pub url: String,
/// Main (direct) exchange that job requests are published to.
pub exchange: String,
/// Routing key used both for publishing and the queue binding.
pub routing_key: String,
/// Work queue consumed by the orchestrator.
pub queue: String,
/// Dead-letter exchange (fanout) for rejected messages.
pub dlx: String,
/// Dead-letter queue bound to `dlx`.
pub dlq: String,
/// Per-consumer unacked message limit (basic.qos).
pub prefetch: u16,
}
impl Default for MqConfig {
    /// Defaults come from `AMQP_*` environment variables, falling back to
    /// local-development values when a variable is unset.
    fn default() -> Self {
        // Small helper: env var or fallback string.
        let env_or = |key: &str, fallback: &str| {
            std::env::var(key).unwrap_or_else(|_| fallback.to_string())
        };
        Self {
            url: env_or("AMQP_URL", "amqp://127.0.0.1:5672/%2f"),
            exchange: env_or("AMQP_EXCHANGE", "solstice.jobs"),
            routing_key: env_or("AMQP_ROUTING_KEY", "jobrequest.v1"),
            queue: env_or("AMQP_QUEUE", "solstice.jobs.v1"),
            dlx: env_or("AMQP_DLX", "solstice.dlx"),
            dlq: env_or("AMQP_DLQ", "solstice.jobs.v1.dlq"),
            prefetch: std::env::var("AMQP_PREFETCH")
                .ok()
                .and_then(|s| s.parse().ok())
                .unwrap_or(64),
        }
    }
}
/// Open a new AMQP connection to `cfg.url` with default client properties.
#[instrument(skip(cfg))]
pub async fn connect(cfg: &MqConfig) -> Result<Connection> {
Connection::connect(&cfg.url, ConnectionProperties::default())
.await
.into_diagnostic()
}
/// Idempotently declare the exchanges, queues, and bindings used by the job
/// pipeline. Safe to call from both publisher and consumer start-up; all
/// entities are durable and non-auto-delete.
#[instrument(skip(channel, cfg))]
pub async fn declare_topology(channel: &Channel, cfg: &MqConfig) -> Result<()> {
// Declare main exchange (durable direct)
channel
.exchange_declare(
&cfg.exchange,
lapin::ExchangeKind::Direct,
ExchangeDeclareOptions {
durable: true,
auto_delete: false,
internal: false,
nowait: false,
passive: false,
},
FieldTable::default(),
)
.await
.into_diagnostic()?;
// Declare DLX
channel
.exchange_declare(
&cfg.dlx,
lapin::ExchangeKind::Fanout,
ExchangeDeclareOptions { durable: true, auto_delete: false, internal: false, nowait: false, passive: false },
FieldTable::default(),
)
.await
.into_diagnostic()?;
// Declare DLQ with dead-lettering from main queue
// NOTE(review): the DLQ's x-dead-letter-exchange points back at the main
// exchange — presumably to allow re-driving dead letters; confirm this
// cannot create a reject loop for poison messages.
let mut dlq_args = FieldTable::default();
dlq_args.insert(
ShortString::from("x-dead-letter-exchange"),
AMQPValue::LongString(LongString::from(cfg.exchange.clone())),
);
channel
.queue_declare(
&cfg.dlq,
QueueDeclareOptions { durable: true, auto_delete: false, exclusive: false, nowait: false, passive: false },
dlq_args,
)
.await
.into_diagnostic()?;
// Bind DLQ to DLX (fanout)
channel
.queue_bind(
&cfg.dlq,
&cfg.dlx,
"",
QueueBindOptions { nowait: false },
FieldTable::default(),
)
.await
.into_diagnostic()?;
// Declare main queue with DLX: nacked/rejected messages route to cfg.dlx.
let mut q_args = FieldTable::default();
q_args.insert(
ShortString::from("x-dead-letter-exchange"),
AMQPValue::LongString(LongString::from(cfg.dlx.clone())),
);
channel
.queue_declare(
&cfg.queue,
QueueDeclareOptions { durable: true, auto_delete: false, exclusive: false, nowait: false, passive: false },
q_args,
)
.await
.into_diagnostic()?;
// Bind queue
channel
.queue_bind(
&cfg.queue,
&cfg.exchange,
&cfg.routing_key,
QueueBindOptions { nowait: false },
FieldTable::default(),
)
.await
.into_diagnostic()?;
Ok(())
}
/// Publish a `JobRequest` as persistent JSON with publisher confirms.
///
/// Opens a fresh connection/channel per call and re-declares topology —
/// simple and safe for low publish volume; callers with high throughput
/// should pool connections (future work).
#[instrument(skip(cfg, job))]
pub async fn publish_job(cfg: &MqConfig, job: &JobRequest) -> Result<()> {
let conn = connect(cfg).await?;
let channel = conn.create_channel().await.into_diagnostic()?;
declare_topology(&channel, cfg).await?;
// Enable publisher confirms
channel
.confirm_select(ConfirmSelectOptions::default())
.await
.into_diagnostic()?;
let payload = serde_json::to_vec(job).into_diagnostic()?;
// AMQP `type` property carries the schema version for consumer routing.
let props = BasicProperties::default()
.with_content_type("application/json".into())
.with_content_encoding("utf-8".into())
.with_type(ShortString::from(job.schema_version.clone()))
.with_delivery_mode(2u8.into()); // persistent
// mandatory=true: the broker returns unroutable messages instead of
// silently dropping them.
let confirm = channel
.basic_publish(
&cfg.exchange,
&cfg.routing_key,
BasicPublishOptions { mandatory: true, ..Default::default() },
&payload,
props,
)
.await
.into_diagnostic()?;
// Wait for broker confirm
confirm.await.into_diagnostic()?;
Ok(())
}
/// Minimal delivery metadata. NOTE(review): not referenced anywhere in this
/// file yet — presumably reserved for surfacing ack info to handlers.
pub struct DeliveryMeta {
pub delivery_tag: u64,
}
/// Consume `JobRequest`s from `cfg.queue`, invoking `handler` per message.
///
/// Semantics visible in this body:
/// - prefetch caps in-flight unacked deliveries (basic.qos, non-global);
/// - handler success => ack; handler error or JSON decode failure => nack
///   with requeue=false, which dead-letters the message (see topology);
/// - transport-level delivery errors are logged, then the loop continues
///   after a short sleep rather than aborting.
///
/// Runs until the consumer stream ends (e.g. connection loss), then returns Ok.
#[instrument(skip(cfg, handler))]
pub async fn consume_jobs<F, Fut>(cfg: &MqConfig, handler: F) -> Result<()>
where
F: Fn(JobRequest) -> Fut + Send + Sync + 'static,
Fut: std::future::Future<Output = Result<()>> + Send + 'static,
{
let conn = connect(cfg).await?;
let channel = conn.create_channel().await.into_diagnostic()?;
declare_topology(&channel, cfg).await?;
// Set QoS
channel
.basic_qos(cfg.prefetch, BasicQosOptions { global: false })
.await
.into_diagnostic()?;
let consumer: Consumer = channel
.basic_consume(
&cfg.queue,
"orchestrator",
BasicConsumeOptions { no_ack: false, ..Default::default() },
FieldTable::default(),
)
.await
.into_diagnostic()?;
info!(queue = %cfg.queue, prefetch = cfg.prefetch, "consumer started");
tokio::pin!(consumer);
while let Some(delivery) = consumer.next().await {
match delivery {
Ok(d) => {
let tag = d.delivery_tag;
match serde_json::from_slice::<JobRequest>(&d.data) {
Ok(job) => {
// Propagate job identity into the handler's tracing span.
let span = tracing::info_span!("handle_job", request_id = %job.request_id, repo = %job.repo_url, sha = %job.commit_sha);
if let Err(err) = handler(job).instrument(span).await {
error!(error = %err, "handler error; nacking to DLQ");
channel
.basic_nack(tag, BasicNackOptions { requeue: false, multiple: false })
.await
.into_diagnostic()?;
} else {
channel
.basic_ack(tag, BasicAckOptions { multiple: false })
.await
.into_diagnostic()?;
}
}
Err(e) => {
// Undecodable payloads are poison; dead-letter immediately.
warn!(error = %e, "failed to deserialize JobRequest; dead-lettering");
channel
.basic_nack(tag, BasicNackOptions { requeue: false, multiple: false })
.await
.into_diagnostic()?;
}
}
}
Err(e) => {
error!(error = %e, "consumer delivery error; continuing");
// Backoff briefly to avoid tight loop on errors
tokio::time::sleep(Duration::from_millis(200)).await;
}
}
}
Ok(())
}

View file

@ -0,0 +1,28 @@
use miette::IntoDiagnostic;
use tracing_subscriber::{EnvFilter, fmt, layer::SubscriberExt, util::SubscriberInitExt};
/// Guard that keeps non-blocking writer alive and allows flushing on drop.
/// Hold the value returned by `init_tracing` for the whole program lifetime;
/// dropping it flushes any lines still buffered by the writer thread.
pub struct TelemetryGuard {
// Drop to flush writer.
_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
}
/// Initialize tracing. Console-only for now; OTEL can be wired later.
///
/// Installs a global subscriber that writes formatted events to stderr via a
/// non-blocking writer. The filter comes from `RUST_LOG` (`EnvFilter`),
/// defaulting to `info`. The returned [`TelemetryGuard`] must be kept alive
/// for the program's lifetime; dropping it flushes buffered output.
pub fn init_tracing(_service_name: &str) -> miette::Result<TelemetryGuard> {
    // `std::io::IsTerminal` (stable since Rust 1.70) replaces the
    // unmaintained `atty` crate (RUSTSEC-2021-0145) for the ANSI decision.
    use std::io::IsTerminal as _;
    let (nb_writer, guard) = tracing_appender::non_blocking(std::io::stderr());
    let fmt_layer = fmt::layer()
        .with_target(false)
        .with_writer(nb_writer)
        .with_ansi(std::io::stderr().is_terminal());
    let filter = EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| EnvFilter::new("info"));
    tracing_subscriber::registry()
        .with(filter)
        .with(fmt_layer)
        .init();
    Ok(TelemetryGuard { _guard: Some(guard) })
}

View file

@ -0,0 +1,19 @@
[package]
name = "forge-integration"
version = "0.1.0"
edition = "2024"
[dependencies]
common = { path = "../common" }
clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] }
tracing = "0.1"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "signal"] }
# HTTP + Webhooks
axum = { version = "0.7", features = ["macros"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
# Signature verification
hmac = "0.12"
sha2 = "0.10"
hex = "0.4"

View file

@ -0,0 +1,296 @@
use std::net::SocketAddr;
use std::sync::Arc;
use axum::{
body::Bytes,
extract::State,
http::{HeaderMap, StatusCode},
response::IntoResponse,
routing::post,
Router,
};
use clap::{Parser, Subcommand};
use hmac::{Hmac, Mac};
use miette::Result;
use serde::Deserialize;
use sha2::Sha256;
use tracing::{error, info, warn};
// CLI subcommands. When absent, the binary runs the webhook HTTP server.
#[derive(Subcommand, Debug)]
enum Cmd {
/// Enqueue a sample JobRequest (dev/test helper)
Enqueue {
/// Repository URL
#[arg(long)]
repo_url: String,
/// Commit SHA
#[arg(long)]
commit_sha: String,
/// Optional runs_on hint
#[arg(long)]
runs_on: Option<String>,
},
}
// Process options; each flag can also come from the environment variable
// named in its attribute.
#[derive(Parser, Debug)]
#[command(name = "solstice-forge", version, about = "Solstice CI — Forge Integration Layer")]
struct Opts {
/// HTTP bind address for webhooks (e.g., 0.0.0.0:8080)
#[arg(long, env = "HTTP_ADDR", default_value = "0.0.0.0:8080")]
http_addr: String,
/// Webhook path (route) to listen on
#[arg(long, env = "WEBHOOK_PATH", default_value = "/webhooks/forgejo")]
webhook_path: String,
/// Shared secret for validating webhooks (per-forge configuration will evolve)
#[arg(long, env = "WEBHOOK_SECRET")]
webhook_secret: Option<String>,
/// RabbitMQ URL (AMQP)
#[arg(long, env = "AMQP_URL")]
amqp_url: Option<String>,
/// Exchange for job requests
#[arg(long, env = "AMQP_EXCHANGE")]
amqp_exchange: Option<String>,
/// Queue (declared by orchestrator too)
#[arg(long, env = "AMQP_QUEUE")]
amqp_queue: Option<String>,
/// Routing key for job requests
#[arg(long, env = "AMQP_ROUTING_KEY")]
amqp_routing_key: Option<String>,
/// OTLP endpoint (e.g., http://localhost:4317)
/// NOTE(review): parsed but not used in this file yet.
#[arg(long, env = "OTEL_EXPORTER_OTLP_ENDPOINT")]
otlp: Option<String>,
#[command(subcommand)]
cmd: Option<Cmd>,
}
// Shared state handed to every webhook request handler.
#[derive(Clone)]
struct AppState {
mq_cfg: common::MqConfig,
webhook_secret: Option<String>,
}
// HMAC-SHA256 as used by Gitea/Forgejo webhook signatures.
type HmacSha256 = Hmac<Sha256>;
#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> {
let _t = common::init_tracing("solstice-forge-integration")?;
let opts = Opts::parse();
info!(http_addr = %opts.http_addr, path = %opts.webhook_path, "forge integration starting");
// Apply AMQP overrides if provided
let mut mq_cfg = common::MqConfig::default();
if let Some(u) = opts.amqp_url { mq_cfg.url = u; }
if let Some(x) = opts.amqp_exchange { mq_cfg.exchange = x; }
if let Some(q) = opts.amqp_queue { mq_cfg.queue = q; }
if let Some(rk) = opts.amqp_routing_key { mq_cfg.routing_key = rk; }
if let Some(Cmd::Enqueue { repo_url, commit_sha, runs_on }) = opts.cmd {
let mut jr = common::JobRequest::new(common::SourceSystem::Manual, repo_url, commit_sha);
jr.runs_on = runs_on;
common::publish_job(&mq_cfg, &jr).await?;
info!(request_id = %jr.request_id, "enqueued job request");
return Ok(());
}
if opts.webhook_secret.is_none() {
warn!("WEBHOOK_SECRET is not set — accepting webhooks without signature validation (dev mode)");
}
let state = Arc::new(AppState { mq_cfg, webhook_secret: opts.webhook_secret });
// Leak the path string to satisfy 'static requirement for axum route API
let path: &'static str = Box::leak(opts.webhook_path.clone().into_boxed_str());
let router = Router::new()
.route(path, post(handle_webhook))
.with_state(state.clone());
let addr: SocketAddr = opts.http_addr.parse().expect("invalid HTTP_ADDR");
axum::serve(tokio::net::TcpListener::bind(addr).await.expect("bind"), router)
.await
.expect("server error");
Ok(())
}
/// Axum handler for Forgejo/Gitea webhooks.
///
/// Verifies the HMAC signature when a secret is configured, then dispatches
/// on the event header: `push` and `pull_request` are handled; anything else
/// is acknowledged with 204 so the forge does not retry.
async fn handle_webhook(
State(state): State<Arc<AppState>>,
headers: HeaderMap,
body: Bytes,
) -> impl IntoResponse {
// Signature validation (X-Gitea-Signature or X-Forgejo-Signature)
if let Some(secret) = state.webhook_secret.as_ref() {
if let Some(sig_hdr) = headers
.get("X-Gitea-Signature")
.or_else(|| headers.get("X-Forgejo-Signature"))
{
let sig_hex = match sig_hdr.to_str() {
Ok(s) => s.trim(),
Err(_) => return StatusCode::UNAUTHORIZED,
};
if !verify_hmac(sig_hex, secret.as_bytes(), &body) {
warn!("invalid webhook signature");
return StatusCode::UNAUTHORIZED;
}
} else {
// A secret is configured but the request is unsigned: reject.
warn!("missing signature header");
return StatusCode::UNAUTHORIZED;
}
} else {
// No secret configured; accept (dev)
}
// Event name header differs between Gitea and Forgejo; treat both alike.
let event = headers
.get("X-Gitea-Event")
.or_else(|| headers.get("X-Forgejo-Event"))
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_lowercase();
match event.as_str() {
"push" => handle_push(state, body).await,
"pull_request" => handle_pull_request(state, body).await,
_ => {
// Unknown or unhandled event
StatusCode::NO_CONTENT
}
}
}
/// Check a hex-encoded HMAC-SHA256 signature over `body` against `secret`.
/// Returns `false` for malformed hex or a mismatched tag; the comparison is
/// constant-time via `Mac::verify_slice`.
fn verify_hmac(sig_hex: &str, secret: &[u8], body: &[u8]) -> bool {
    let expected = match hex::decode(sig_hex) {
        Ok(bytes) => bytes,
        Err(_) => return false,
    };
    let mut mac = HmacSha256::new_from_slice(secret).expect("HMAC can take key of any size");
    mac.update(body);
    mac.verify_slice(&expected).is_ok()
}
// Subset of the webhook repository object; only the clone URLs are needed.
#[derive(Debug, Deserialize)]
struct RepoInfo {
#[serde(default)]
clone_url: Option<String>,
#[serde(default)]
ssh_url: Option<String>,
}
// Subset of the push payload: `after` is the post-push head SHA.
#[derive(Debug, Deserialize)]
struct PushPayload {
after: String,
repository: RepoInfo,
}
/// Handle a Forgejo/Gitea `push` event: parse the payload and enqueue a job
/// for the pushed head commit. Delete pushes (all-zero `after` SHA) are
/// acknowledged without enqueuing anything.
async fn handle_push(state: Arc<AppState>, body: Bytes) -> StatusCode {
    let payload = match serde_json::from_slice::<PushPayload>(&body) {
        Ok(p) => p,
        Err(e) => {
            warn!(error = %e, "failed to parse push payload");
            return StatusCode::BAD_REQUEST;
        }
    };
    // Ignore delete events (after = all zeros)
    if payload.after.chars().all(|c| c == '0') {
        return StatusCode::NO_CONTENT;
    }
    let repo_url = pick_repo_url(&payload.repository);
    match enqueue_job(&state, repo_url, payload.after).await {
        Ok(()) => StatusCode::ACCEPTED,
        Err(e) => {
            error!(error = %e, "failed to publish job");
            StatusCode::INTERNAL_SERVER_ERROR
        }
    }
}
// PR-head repository object; same shape as RepoInfo but nested differently
// in the payload.
#[derive(Debug, Deserialize)]
struct PrRepoInfo {
#[serde(default)]
clone_url: Option<String>,
#[serde(default)]
ssh_url: Option<String>,
}
// Head ref of the pull request: commit to build plus its source repo.
#[derive(Debug, Deserialize)]
struct PrHead {
sha: String,
repo: PrRepoInfo,
}
#[derive(Debug, Deserialize)]
struct PullRequest {
head: PrHead,
}
// Subset of the pull_request event payload.
#[derive(Debug, Deserialize)]
struct PrPayload {
action: String,
#[serde(rename = "pull_request")]
pull_request: PullRequest,
}
/// Handle a `pull_request` event: enqueue a job for the PR head commit, but
/// only for actions that change the code under test.
async fn handle_pull_request(state: Arc<AppState>, body: Bytes) -> StatusCode {
    let payload = match serde_json::from_slice::<PrPayload>(&body) {
        Ok(p) => p,
        Err(e) => {
            warn!(error = %e, "failed to parse pull_request payload");
            return StatusCode::BAD_REQUEST;
        }
    };
    // Only act on opened/synchronize/reopened
    if !matches!(payload.action.as_str(), "opened" | "synchronize" | "reopened") {
        return StatusCode::NO_CONTENT;
    }
    let head = payload.pull_request.head;
    let repo_url = pick_repo_url_pr(&head.repo);
    match enqueue_job(&state, repo_url, head.sha).await {
        Ok(()) => StatusCode::ACCEPTED,
        Err(e) => {
            error!(error = %e, "failed to publish job");
            StatusCode::INTERNAL_SERVER_ERROR
        }
    }
}
/// Prefer the HTTPS clone URL, falling back to the SSH URL; empty string
/// when the payload carried neither (callers treat empty as an error).
fn pick_repo_url(repo: &RepoInfo) -> String {
    match (&repo.clone_url, &repo.ssh_url) {
        (Some(url), _) => url.clone(),
        (None, Some(url)) => url.clone(),
        (None, None) => String::new(),
    }
}
/// Same preference order as [`pick_repo_url`], for the PR-head repo shape.
fn pick_repo_url_pr(repo: &PrRepoInfo) -> String {
    match (&repo.clone_url, &repo.ssh_url) {
        (Some(url), _) => url.clone(),
        (None, Some(url)) => url.clone(),
        (None, None) => String::new(),
    }
}
/// Build a `JobRequest` from webhook-derived fields and publish it to the bus.
///
/// Bails with a diagnostic when the payload carried no usable repository URL
/// (neither `clone_url` nor `ssh_url`).
async fn enqueue_job(state: &Arc<AppState>, repo_url: String, commit_sha: String) -> Result<()> {
    if repo_url.is_empty() {
        miette::bail!("missing repo_url in webhook payload");
    }
    // `jr` is never mutated (the original `let mut` triggered an unused-mut
    // warning). Re-introduce `mut` when runs_on inference lands.
    let jr = common::JobRequest::new(common::SourceSystem::Forgejo, repo_url, commit_sha);
    // TODO: infer runs_on from repo defaults or labels
    common::publish_job(&state.mq_cfg, &jr).await?;
    info!(request_id = %jr.request_id, repo = %jr.repo_url, sha = %jr.commit_sha, "enqueued job from webhook");
    Ok(())
}

View file

@ -0,0 +1,11 @@
[package]
name = "github-integration"
version = "0.1.0"
edition = "2024"
[dependencies]
common = { path = "../common" }
clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] }
tracing = "0.1"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "signal"] }

View file

@ -0,0 +1,36 @@
use clap::Parser;
use miette::Result;
use tracing::{info, warn};
// Process options; each flag can also come from the environment variable
// named in its attribute.
#[derive(Parser, Debug)]
#[command(name = "solstice-github", version, about = "Solstice CI — GitHub Integration (GitHub App)")]
struct Opts {
/// HTTP bind address for GitHub webhooks (e.g., 0.0.0.0:8081)
#[arg(long, env = "HTTP_ADDR", default_value = "0.0.0.0:8081")]
http_addr: String,
/// GitHub App ID
#[arg(long, env = "GITHUB_APP_ID")]
app_id: Option<u64>,
/// Path to GitHub App private key (PEM)
#[arg(long, env = "GITHUB_APP_KEY_PATH")]
app_key_path: Option<String>,
/// OTLP endpoint (e.g., http://localhost:4317)
/// NOTE(review): parsed but not yet used in this skeleton.
#[arg(long, env = "OTEL_EXPORTER_OTLP_ENDPOINT")]
otlp: Option<String>,
}
#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> {
let _t = common::init_tracing("solstice-github-integration")?;
let opts = Opts::parse();
info!(http_addr = %opts.http_addr, "github integration starting");
// TODO: Start HTTP server, validate signatures, implement GitHub App auth flow
warn!("github-integration skeleton running; HTTP/GitHub App not implemented yet");
tokio::signal::ctrl_c().await.expect("listen for ctrl-c");
Ok(())
}

View file

@ -0,0 +1,12 @@
[package]
name = "migration"
version = "0.1.0"
edition = "2024"
[lib]
name = "migration"
path = "src/lib.rs"
[dependencies]
sea-orm-migration = { version = "0.12", default-features = false, features = ["runtime-tokio-rustls", "sqlx-postgres"] }
sea-orm = { version = "0.12", default-features = false, features = ["sqlx-postgres", "runtime-tokio-rustls"] }

103
crates/migration/src/lib.rs Normal file
View file

@ -0,0 +1,103 @@
use sea_orm_migration::prelude::*;
/// Aggregates all schema migrations for the orchestrator database.
pub struct Migrator;
#[async_trait::async_trait]
impl MigratorTrait for Migrator {
// Migrations run in the order listed here; append new ones at the end.
fn migrations() -> Vec<Box<dyn MigrationTrait>> {
vec![
Box::new(m2025_10_25_000001_create_jobs::Migration),
Box::new(m2025_10_25_000002_create_vms::Migration),
]
}
}
// Creates the `jobs` table: one row per tracked job request, keyed by the
// request UUID.
mod m2025_10_25_000001_create_jobs {
use super::*;
#[derive(DeriveMigrationName)]
pub struct Migration;
#[async_trait::async_trait]
impl MigrationTrait for Migration {
async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
manager
.create_table(
Table::create()
.table(Jobs::Table)
.if_not_exists()
.col(ColumnDef::new(Jobs::RequestId).uuid().not_null().primary_key())
.col(ColumnDef::new(Jobs::RepoUrl).string().not_null())
.col(ColumnDef::new(Jobs::CommitSha).string().not_null())
.col(ColumnDef::new(Jobs::RunsOn).string().null())
.col(ColumnDef::new(Jobs::State).string().not_null())
.col(ColumnDef::new(Jobs::CreatedAt).timestamp_with_time_zone().not_null())
.col(ColumnDef::new(Jobs::UpdatedAt).timestamp_with_time_zone().not_null())
.to_owned(),
)
.await
}
async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> {
manager.drop_table(Table::drop().table(Jobs::Table).to_owned()).await
}
}
// Column identifiers for the `jobs` table.
#[derive(Iden)]
enum Jobs {
Table,
RequestId,
RepoUrl,
CommitSha,
RunsOn,
State,
CreatedAt,
UpdatedAt,
}
}
// Creates the `vms` table: VMs provisioned per job request, with the paths
// a backend needs for cleanup.
// NOTE(review): no primary key or unique constraint is declared here —
// duplicate (request_id, domain_name) rows are possible; confirm whether a
// composite key is intended.
mod m2025_10_25_000002_create_vms {
use super::*;
#[derive(DeriveMigrationName)]
pub struct Migration;
#[async_trait::async_trait]
impl MigrationTrait for Migration {
async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
manager
.create_table(
Table::create()
.table(Vms::Table)
.if_not_exists()
.col(ColumnDef::new(Vms::RequestId).uuid().not_null())
.col(ColumnDef::new(Vms::DomainName).string().not_null())
.col(ColumnDef::new(Vms::OverlayPath).string().null())
.col(ColumnDef::new(Vms::SeedPath).string().null())
.col(ColumnDef::new(Vms::Backend).string().not_null())
.col(ColumnDef::new(Vms::State).string().not_null())
.col(ColumnDef::new(Vms::CreatedAt).timestamp_with_time_zone().not_null())
.col(ColumnDef::new(Vms::UpdatedAt).timestamp_with_time_zone().not_null())
.to_owned(),
)
.await
}
async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> {
manager.drop_table(Table::drop().table(Vms::Table).to_owned()).await
}
}
// Column identifiers for the `vms` table.
#[derive(Iden)]
enum Vms {
Table,
RequestId,
DomainName,
OverlayPath,
SeedPath,
Backend,
State,
CreatedAt,
UpdatedAt,
}
}

View file

@ -0,0 +1,34 @@
[package]
name = "orchestrator"
version = "0.1.0"
edition = "2024"
[features]
# Enable libvirt backend on Linux hosts
libvirt = ["dep:libvirt"]
[dependencies]
common = { path = "../common" }
clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] }
thiserror = "1"
tracing = "0.1"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "signal", "fs", "io-util"] }
serde = { version = "1", features = ["derive"] }
serde_yaml = "0.9"
config = { version = "0.14", default-features = false, features = ["yaml"] }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2", "gzip", "brotli", "zstd"] }
bytes = "1"
path-absolutize = "3"
# Compression/decompression
zstd = "0.13"
# Linux-only optional libvirt bindings (feature-gated)
libvirt = { version = "0.1", optional = true }
# DB (optional basic persistence)
sea-orm = { version = "0.12", default-features = false, features = ["sqlx-postgres", "runtime-tokio-rustls" ] }
migration = { path = "../migration", optional = true }
# Utilities
once_cell = "1"
dashmap = "6"
async-trait = "0.1"
uuid = { version = "1", features = ["v4"] }

View file

@ -0,0 +1,147 @@
use std::{collections::BTreeMap, fs, path::{Path, PathBuf}};
use miette::{IntoDiagnostic as _, Result};
use serde::Deserialize;
use tokio::task;
/// Top-level orchestrator configuration, deserialized from YAML.
#[derive(Debug, Clone, Deserialize)]
pub struct OrchestratorConfig {
/// Label used when a job carries no `runs_on` hint.
pub default_label: String,
/// Alias -> canonical label mapping applied before image lookup.
#[serde(default)]
pub aliases: BTreeMap<String, String>,
/// Named CPU/RAM presets. NOTE(review): not referenced elsewhere in this
/// file — presumably consumed by the scheduler; confirm.
#[serde(default)]
pub sizes: BTreeMap<String, SizePreset>,
/// Canonical label -> base image definition.
#[serde(default)]
pub images: BTreeMap<String, ImageEntry>,
}
/// A named CPU/RAM size preset.
#[derive(Debug, Clone, Deserialize)]
pub struct SizePreset {
pub cpu: u16,
pub ram_mb: u32,
}
/// One downloadable base image keyed by canonical label.
#[derive(Debug, Clone, Deserialize)]
pub struct ImageEntry {
/// Remote source URL. If local_path does not exist, we will download it.
pub source: String,
/// Target local path for the prepared base image (raw .img or qcow2)
pub local_path: PathBuf,
/// Decompression method for downloaded artifact ("zstd" or "none"/missing)
#[serde(default)]
pub decompress: Option<Decompress>,
/// Images must support NoCloud for metadata injection
#[serde(default)]
pub nocloud: bool,
/// Default VM resource overrides for this label
#[serde(default)]
pub defaults: Option<ImageDefaults>,
}
/// Per-image default resource overrides; any field may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct ImageDefaults {
pub cpu: Option<u16>,
pub ram_mb: Option<u32>,
pub disk_gb: Option<u32>,
}
/// Decompression applied to a downloaded image artifact before it is moved
/// into place. Deserialized from lowercase strings in the YAML config.
/// The manual `impl Default` was replaced by the derive with `#[default]`
/// (stable since Rust 1.62); behavior is identical.
#[derive(Debug, Clone, Copy, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "lowercase")]
pub enum Decompress {
    /// Artifact is zstd-compressed; decompress after download.
    Zstd,
    /// Artifact is used as-is (the default).
    #[default]
    None,
}
impl OrchestratorConfig {
/// Load the YAML config from `path`, or from the bundled example path when
/// `path` is None. Parsing runs on the blocking pool because the `config`
/// crate does synchronous file I/O.
pub async fn load(path: Option<&Path>) -> Result<Self> {
let path = match path {
Some(p) => p.to_path_buf(),
None => default_example_path(),
};
// Use blocking read via spawn_blocking to avoid blocking Tokio
let cfg: OrchestratorConfig = task::spawn_blocking(move || {
let builder = config::Config::builder()
.add_source(config::File::from(path));
let cfg = builder.build().into_diagnostic()?;
cfg.try_deserialize().into_diagnostic()
})
.await
.into_diagnostic()??;
Ok(cfg)
}
/// Resolve an incoming label using aliases to the canonical key used in `images`.
/// Falls back to `default_label` when `label` is None.
/// NOTE(review): as written this never returns None — the Option return
/// presumably anticipates validation against `images`; confirm intent.
pub fn resolve_label<'a>(&'a self, label: Option<&'a str>) -> Option<&'a str> {
let l = label.unwrap_or(&self.default_label);
if let Some(canon) = self.aliases.get(l) {
Some(canon.as_str())
} else {
Some(l)
}
}
/// Get image entry for a resolved label (canonical key)
pub fn image_for(&self, resolved_label: &str) -> Option<&ImageEntry> {
self.images.get(resolved_label)
}
}
/// Fallback config location when no explicit path is supplied:
/// `examples/orchestrator-image-map.yaml`, resolved relative to the CWD.
fn default_example_path() -> PathBuf {
    PathBuf::from("examples").join("orchestrator-image-map.yaml")
}
/// Ensure images referenced in config exist at local_path. If missing, download
/// from `source` and optionally decompress according to `decompress`.
///
/// Downloads are streamed chunk-by-chunk into a sibling `*.part` file and
/// only renamed/decompressed into place on success, so a partial download is
/// never mistaken for a ready image on the next run.
pub async fn ensure_images(cfg: &OrchestratorConfig) -> Result<()> {
    use tokio::io::AsyncWriteExt as _;
    for (label, image) in cfg.images.iter() {
        if image.local_path.exists() {
            continue;
        }
        // Create parent dirs
        if let Some(parent) = image.local_path.parent() {
            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
        }
        // Download to temporary file
        let tmp_path = image.local_path.with_extension("part");
        tracing::info!(label = %label, url = %image.source, local = ?image.local_path, "downloading base image");
        let mut resp = reqwest::get(&image.source).await.into_diagnostic()?;
        let status = resp.status();
        if !status.is_success() {
            miette::bail!("failed to download {url}: {status}", url = image.source, status = status);
        }
        // Stream the body to disk instead of buffering it all in memory:
        // base VM images can be multiple GiB, so the previous
        // `resp.bytes().await` risked exhausting memory.
        {
            let mut out = tokio::fs::File::create(&tmp_path).await.into_diagnostic()?;
            while let Some(chunk) = resp.chunk().await.into_diagnostic()? {
                out.write_all(&chunk).await.into_diagnostic()?;
            }
            out.flush().await.into_diagnostic()?;
        }
        // Decompress or move into place
        match image.decompress.unwrap_or(Decompress::None) {
            Decompress::None => {
                tokio::fs::rename(&tmp_path, &image.local_path)
                    .await
                    .into_diagnostic()?;
            }
            Decompress::Zstd => {
                // zstd's decoder is synchronous; run it on the blocking pool.
                let src = tmp_path.clone();
                let dst = image.local_path.clone();
                task::spawn_blocking(move || -> miette::Result<()> {
                    let infile = fs::File::open(&src).into_diagnostic()?;
                    let mut decoder = zstd::stream::read::Decoder::new(infile).into_diagnostic()?;
                    let mut outfile = fs::File::create(&dst).into_diagnostic()?;
                    std::io::copy(&mut decoder, &mut outfile).into_diagnostic()?;
                    // remove compressed temp
                    std::fs::remove_file(&src).ok();
                    Ok(())
                })
                .await
                .into_diagnostic()??;
            }
        }
        tracing::info!(label = %label, local = ?image.local_path, "image ready");
    }
    Ok(())
}

View file

@ -0,0 +1,370 @@
use std::{path::PathBuf, time::Duration};
use async_trait::async_trait;
use miette::{Result, IntoDiagnostic as _};
use tracing::{info, warn};
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
// Backend tag is used internally to remember which backend handled this VM.
// Variants are cfg-gated to match the backends compiled into this build.
#[derive(Debug, Clone, Copy)]
pub enum BackendTag { Noop, #[cfg(all(target_os = "linux", feature = "libvirt"))] Libvirt, #[cfg(target_os = "illumos")] Zones }
/// Resource and image request for a single VM.
#[derive(Debug, Clone)]
pub struct VmSpec {
pub label: String,
pub image_path: PathBuf,
pub cpu: u16,
pub ram_mb: u32,
pub disk_gb: u32,
pub network: Option<String>,
pub nocloud: bool,
/// Optional user-data (cloud-init NoCloud). If provided, backend will attach seed.
pub user_data: Option<Vec<u8>>,
}
/// Identity of the CI job a VM is being prepared for.
#[derive(Debug, Clone)]
pub struct JobContext {
pub request_id: uuid::Uuid,
pub repo_url: String,
pub commit_sha: String,
pub workflow_job_id: Option<String>,
}
/// Handle returned by `Hypervisor::prepare`; carries the paths a backend
/// created so later `stop`/`destroy` calls can clean them up.
#[derive(Debug, Clone)]
pub struct VmHandle {
pub id: String,
pub backend: BackendTag,
pub work_dir: PathBuf,
pub overlay_path: Option<PathBuf>,
pub seed_iso_path: Option<PathBuf>,
}
/// Coarse VM lifecycle state reported by a backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VmState {
Prepared,
Running,
Stopped,
}
/// Backend-neutral VM lifecycle: prepare -> start -> stop -> destroy.
/// `state` has a default body returning `Prepared` for backends that cannot
/// query live state.
#[async_trait]
pub trait Hypervisor: Send + Sync {
async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle>;
async fn start(&self, vm: &VmHandle) -> Result<()>;
async fn stop(&self, vm: &VmHandle, graceful_timeout: Duration) -> Result<()>;
async fn destroy(&self, vm: VmHandle) -> Result<()>;
async fn state(&self, _vm: &VmHandle) -> Result<VmState> { Ok(VmState::Prepared) }
}
/// A router that delegates to the correct backend implementation per job.
pub struct RouterHypervisor {
pub noop: NoopHypervisor,
#[cfg(all(target_os = "linux", feature = "libvirt"))]
pub libvirt: Option<LibvirtHypervisor>,
#[cfg(target_os = "illumos")]
pub zones: Option<ZonesHypervisor>,
}
impl RouterHypervisor {
    /// Build a router with the best backend available for this build target:
    /// libvirt on Linux (feature-gated), zones on illumos, otherwise noop only.
    /// `libvirt_uri`/`libvirt_network` are unused on non-libvirt targets.
    #[allow(unused_variables)]
    pub fn build(libvirt_uri: String, libvirt_network: String) -> Self {
        #[cfg(all(target_os = "linux", feature = "libvirt"))]
        {
            return RouterHypervisor {
                noop: NoopHypervisor::default(),
                libvirt: Some(LibvirtHypervisor { uri: libvirt_uri, network: libvirt_network }),
            };
        }
        #[cfg(target_os = "illumos")]
        {
            return RouterHypervisor { noop: NoopHypervisor::default(), zones: Some(ZonesHypervisor) };
        }
        // Fallback when neither platform backend is compiled in.
        #[cfg(all(not(target_os = "illumos"), not(all(target_os = "linux", feature = "libvirt"))))]
        {
            return RouterHypervisor { noop: NoopHypervisor::default() };
        }
    }
}
#[async_trait]
impl Hypervisor for RouterHypervisor {
    /// Route `prepare` to the first available platform backend, else noop.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        #[cfg(all(target_os = "linux", feature = "libvirt"))]
        {
            if let Some(ref hv) = self.libvirt { return hv.prepare(spec, ctx).await; }
        }
        #[cfg(target_os = "illumos")]
        {
            if let Some(ref hv) = self.zones { return hv.prepare(spec, ctx).await; }
        }
        self.noop.prepare(spec, ctx).await
    }
    /// Dispatch `start` to the backend recorded in the handle's tag.
    async fn start(&self, vm: &VmHandle) -> Result<()> {
        match vm.backend {
            #[cfg(all(target_os = "linux", feature = "libvirt"))]
            BackendTag::Libvirt => {
                if let Some(ref hv) = self.libvirt { hv.start(vm).await } else { self.noop.start(vm).await }
            }
            #[cfg(target_os = "illumos")]
            BackendTag::Zones => {
                if let Some(ref hv) = self.zones { hv.start(vm).await } else { self.noop.start(vm).await }
            }
            // Noop (and any backend compiled out) falls through to the noop impl.
            _ => self.noop.start(vm).await,
        }
    }
    /// Dispatch `stop` to the backend recorded in the handle's tag.
    async fn stop(&self, vm: &VmHandle, t: Duration) -> Result<()> {
        match vm.backend {
            #[cfg(all(target_os = "linux", feature = "libvirt"))]
            BackendTag::Libvirt => {
                if let Some(ref hv) = self.libvirt { hv.stop(vm, t).await } else { self.noop.stop(vm, t).await }
            }
            #[cfg(target_os = "illumos")]
            BackendTag::Zones => {
                if let Some(ref hv) = self.zones { hv.stop(vm, t).await } else { self.noop.stop(vm, t).await }
            }
            _ => self.noop.stop(vm, t).await,
        }
    }
    /// Dispatch `destroy` to the backend recorded in the handle's tag.
    async fn destroy(&self, vm: VmHandle) -> Result<()> {
        match vm.backend {
            #[cfg(all(target_os = "linux", feature = "libvirt"))]
            BackendTag::Libvirt => {
                if let Some(ref hv) = self.libvirt { hv.destroy(vm).await } else { self.noop.destroy(vm).await }
            }
            #[cfg(target_os = "illumos")]
            BackendTag::Zones => {
                if let Some(ref hv) = self.zones { hv.destroy(vm).await } else { self.noop.destroy(vm).await }
            }
            _ => self.noop.destroy(vm).await,
        }
    }
}
/// No-op hypervisor for development on hosts without privileges.
/// Creates a scratch directory and logs each lifecycle call; no VM is booted.
#[derive(Debug, Clone, Default)]
pub struct NoopHypervisor;
#[async_trait]
impl Hypervisor for NoopHypervisor {
    /// "Prepare" by creating a scratch directory under the system temp dir.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        let vm_id = format!("noop-{}", ctx.request_id);
        let scratch = std::env::temp_dir().join("solstice-noop").join(&vm_id);
        tokio::fs::create_dir_all(&scratch).await.into_diagnostic()?;
        info!(id = %vm_id, label = %spec.label, image = ?spec.image_path, "noop prepare");
        Ok(VmHandle {
            id: vm_id,
            backend: BackendTag::Noop,
            work_dir: scratch,
            overlay_path: None,
            seed_iso_path: None,
        })
    }
    /// Log-only start.
    async fn start(&self, vm: &VmHandle) -> Result<()> {
        info!(id = %vm.id, "noop start");
        Ok(())
    }
    /// Log-only stop; the graceful timeout is irrelevant here.
    async fn stop(&self, vm: &VmHandle, _t: Duration) -> Result<()> {
        info!(id = %vm.id, "noop stop");
        Ok(())
    }
    /// Log-only destroy; nothing is cleaned up on disk.
    async fn destroy(&self, vm: VmHandle) -> Result<()> {
        info!(id = %vm.id, "noop destroy");
        Ok(())
    }
}
/// Linux/KVM backend driven through libvirt.
#[cfg(all(target_os = "linux", feature = "libvirt"))]
pub struct LibvirtHypervisor {
    // libvirt connection URI, e.g. "qemu:///system".
    pub uri: String,
    // Name of the libvirt network VMs attach to, e.g. "default".
    pub network: String,
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
impl LibvirtHypervisor {
    /// Create (and return) a private working directory for VM `id`.
    ///
    /// Prefers `/var/lib/solstice-ci/<id>`; falls back to a temp-dir location
    /// when the preferred base cannot be created (e.g. not running as root).
    /// The directory is restricted to the owner (0o700).
    fn mk_work_dir(&self, id: &str) -> std::path::PathBuf {
        let preferred = std::path::Path::new("/var/lib/solstice-ci").join(id);
        // Test writability by actually attempting to create the directory,
        // rather than only checking that the base path exists.
        let dir = if std::fs::create_dir_all(&preferred).is_ok() {
            preferred
        } else {
            let fallback = std::env::temp_dir().join("solstice-libvirt").join(id);
            if let Err(e) = std::fs::create_dir_all(&fallback) {
                // Surface the failure instead of silently ignoring it; later
                // steps will fail with a clearer error if the dir is missing.
                warn!(error = %e, dir = ?fallback, "failed to create VM work dir");
            }
            fallback
        };
        // Best-effort tightening; the work dir may hold cloud-init secrets.
        let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700));
        dir
    }
}
#[cfg(all(target_os = "linux", feature = "libvirt"))]
#[async_trait]
impl Hypervisor for LibvirtHypervisor {
    /// Prepare a VM: ensure the libvirt network is active, create a qcow2
    /// overlay on top of the base image, optionally build a cloud-init
    /// NoCloud seed ISO, and define (but do not start) the libvirt domain.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        use libvirt::{Connect, Network, Domain};
        use std::process::Command;
        let id = format!("job-{}", ctx.request_id);
        let work_dir = self.mk_work_dir(&id);
        // Connect and ensure network is active. Blocking libvirt calls run on
        // the blocking pool so they do not stall the async runtime.
        let uri = self.uri.clone();
        let net_name = self.network.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            let conn = Connect::open(&uri).map_err(|e| miette::miette!("libvirt connect error: {e}"))?;
            let net: Network = conn.network_lookup_by_name(&net_name)
                .map_err(|e| miette::miette!("libvirt network '{}' not found: {e}", net_name))?;
            if !net.is_active().unwrap_or(false) {
                net.create().map_err(|e| miette::miette!("failed to activate network '{}': {e}", net_name))?;
            }
            if !net.get_autostart().unwrap_or(true) {
                net.set_autostart(true).ok();
            }
            Ok(())
        }).await.into_diagnostic()??;
        // Create a qcow2 overlay backed by the (read-only) base image so the
        // base image is never mutated by a job.
        let overlay = work_dir.join("overlay.qcow2");
        let size_arg = format!("{}G", spec.disk_gb);
        tokio::task::spawn_blocking({
            let base = spec.image_path.clone();
            let overlay = overlay.clone();
            move || -> miette::Result<()> {
                let out = Command::new("qemu-img")
                    .args(["create","-f","qcow2","-F","qcow2","-b"])
                    .arg(&base)
                    .arg(&overlay)
                    .arg(&size_arg)
                    .output()
                    .map_err(|e| miette::miette!("qemu-img not found or failed: {e}"))?;
                if !out.status.success() { return Err(miette::miette!("qemu-img failed: {}", String::from_utf8_lossy(&out.stderr))); }
                Ok(())
            }
        }).await.into_diagnostic()??;
        // Build NoCloud seed ISO if user_data provided. The volume label
        // "cidata" is what cloud-init's NoCloud datasource looks for.
        let mut seed_iso: Option<PathBuf> = None;
        if let Some(ref user_data) = spec.user_data {
            let seed_dir = work_dir.join("seed");
            tokio::fs::create_dir_all(&seed_dir).await.into_diagnostic()?;
            let ud_path = seed_dir.join("user-data");
            let md_path = seed_dir.join("meta-data");
            tokio::fs::write(&ud_path, user_data).await.into_diagnostic()?;
            let meta = format!("instance-id: {}\nlocal-hostname: {}\n", id, id);
            tokio::fs::write(&md_path, meta.as_bytes()).await.into_diagnostic()?;
            // mkisofs or genisoimage, whichever is installed.
            let iso_path = work_dir.join("seed.iso");
            tokio::task::spawn_blocking({
                let iso_path = iso_path.clone();
                let seed_dir = seed_dir.clone();
                move || -> miette::Result<()> {
                    let try_mk = |bin: &str| -> std::io::Result<std::process::Output> {
                        Command::new(bin)
                            .args(["-V","cidata","-J","-R","-o"])
                            .arg(&iso_path)
                            .arg(&seed_dir)
                            .output()
                    };
                    let out = try_mk("mkisofs").or_else(|_| try_mk("genisoimage"))
                        .map_err(|e| miette::miette!("mkisofs/genisoimage not found: {e}"))?;
                    if !out.status.success() { return Err(miette::miette!("mkisofs failed: {}", String::from_utf8_lossy(&out.stderr))); }
                    Ok(())
                }
            }).await.into_diagnostic()??;
            seed_iso = Some(iso_path);
        }
        // Domain XML: virtio disk/net, optional NoCloud cdrom, local-only VNC.
        let xml = {
            let mem = spec.ram_mb;
            let vcpus = spec.cpu;
            let overlay_str = overlay.display().to_string();
            let seed_str = seed_iso.as_ref().map(|p| p.display().to_string());
            let net = self.network.clone();
            let cdrom = seed_str.map(|p| format!("<disk type='file' device='cdrom'>\n <driver name='qemu' type='raw'/>\n <source file='{}'/>\n <target dev='hdb' bus='ide'/>\n <readonly/>\n</disk>", p)).unwrap_or_default();
            format!("<domain type='kvm'>\n<name>{}</name>\n<memory unit='MiB'>{}</memory>\n<vcpu>{}</vcpu>\n<os>\n <type arch='x86_64' machine='pc'>hvm</type>\n <boot dev='hd'/>\n</os>\n<features><acpi/></features>\n<devices>\n <disk type='file' device='disk'>\n <driver name='qemu' type='qcow2' cache='none'/>\n <source file='{}'/>\n <target dev='vda' bus='virtio'/>\n </disk>\n {}\n <interface type='network'>\n <source network='{}'/>\n <model type='virtio'/>\n </interface>\n <graphics type='vnc' autoport='yes' listen='127.0.0.1'/>\n <console type='pty'/>\n</devices>\n<on_poweroff>destroy</on_poweroff>\n<on_crash>destroy</on_crash>\n</domain>",
                id, mem, vcpus, overlay_str, cdrom, net)
        };
        // Define the domain (persistent definition; started later by `start`).
        let uri2 = self.uri.clone();
        tokio::task::spawn_blocking({
            let xml = xml.clone();
            move || -> miette::Result<()> {
                let conn = Connect::open(&uri2).map_err(|e| miette::miette!("libvirt connect error: {e}"))?;
                let _dom: Domain = conn.domain_define_xml(&xml)
                    .map_err(|e| miette::miette!("domain define failed: {e}"))?;
                Ok(())
            }
        }).await.into_diagnostic()??;
        info!(domain = %id, image = ?spec.image_path, cpu = spec.cpu, ram_mb = spec.ram_mb, "libvirt prepared");
        Ok(VmHandle { id, backend: BackendTag::Libvirt, work_dir, overlay_path: Some(overlay), seed_iso_path: seed_iso })
    }
    /// Boot the previously defined domain.
    async fn start(&self, vm: &VmHandle) -> Result<()> {
        use libvirt::Connect;
        let id = vm.id.clone();
        let uri = self.uri.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            let conn = Connect::open(&uri).map_err(|e| miette::miette!("libvirt connect error: {e}"))?;
            let dom = conn.domain_lookup_by_name(&id).map_err(|e| miette::miette!("lookup domain {}: {e}", id))?;
            dom.create().map_err(|e| miette::miette!("start domain {} failed: {e}", id))?;
            Ok(())
        }).await.into_diagnostic()??;
        info!(domain = %vm.id, "libvirt started");
        Ok(())
    }
    /// Request ACPI shutdown, poll for up to `t`, then force power-off.
    async fn stop(&self, vm: &VmHandle, t: Duration) -> Result<()> {
        use libvirt::Connect;
        let id = vm.id.clone();
        let uri = self.uri.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            let conn = Connect::open(&uri).map_err(|e| miette::miette!("libvirt connect error: {e}"))?;
            let dom = conn.domain_lookup_by_name(&id).map_err(|e| miette::miette!("lookup domain {}: {e}", id))?;
            // Best-effort graceful shutdown; failures fall through to destroy.
            dom.shutdown().ok();
            // Poll for inactive up to timeout.
            let start = std::time::Instant::now();
            while start.elapsed() < t {
                if !dom.is_active().unwrap_or(true) { break; }
                std::thread::sleep(std::time::Duration::from_millis(500));
            }
            if dom.is_active().unwrap_or(false) {
                dom.destroy().ok();
            }
            Ok(())
        }).await.into_diagnostic()??;
        info!(domain = %vm.id, "libvirt stopped");
        Ok(())
    }
    /// Undefine the domain (if it still exists) and remove on-disk artifacts.
    async fn destroy(&self, vm: VmHandle) -> Result<()> {
        use libvirt::Connect;
        let id = vm.id.clone();
        let uri = self.uri.clone();
        tokio::task::spawn_blocking(move || -> miette::Result<()> {
            let conn = Connect::open(&uri).map_err(|e| miette::miette!("libvirt connect error: {e}"))?;
            if let Ok(dom) = conn.domain_lookup_by_name(&id) {
                let _ = dom.undefine();
            }
            Ok(())
        }).await.into_diagnostic()??;
        // Cleanup files; best-effort since the VM is already gone.
        if let Some(p) = vm.overlay_path.as_ref() { let _ = tokio::fs::remove_file(p).await; }
        if let Some(p) = vm.seed_iso_path.as_ref() { let _ = tokio::fs::remove_file(p).await; }
        let _ = tokio::fs::remove_dir_all(&vm.work_dir).await;
        info!(domain = %id, "libvirt destroyed");
        Ok(())
    }
}
/// illumos zones/bhyve backend (scaffold only; lifecycle calls are stubs).
#[cfg(target_os = "illumos")]
pub struct ZonesHypervisor;
#[cfg(target_os = "illumos")]
#[async_trait]
impl Hypervisor for ZonesHypervisor {
    /// Stub prepare: warns and hands back a placeholder handle; no zone is created.
    async fn prepare(&self, spec: &VmSpec, ctx: &JobContext) -> Result<VmHandle> {
        warn!(label = %spec.label, "zones hypervisor not yet implemented; returning noop-like handle");
        let handle_id = format!("zone-{}", ctx.request_id);
        Ok(VmHandle {
            id: handle_id,
            backend: BackendTag::Zones,
            work_dir: std::env::temp_dir().join("solstice-zones"),
            overlay_path: None,
            seed_iso_path: None,
        })
    }
    /// Stub: nothing to start yet.
    async fn start(&self, _vm: &VmHandle) -> Result<()> { Ok(()) }
    /// Stub: nothing to stop yet.
    async fn stop(&self, _vm: &VmHandle, _t: Duration) -> Result<()> { Ok(()) }
    /// Stub: nothing to destroy yet.
    async fn destroy(&self, _vm: VmHandle) -> Result<()> { Ok(()) }
}

View file

@ -0,0 +1,194 @@
mod config;
mod hypervisor;
mod scheduler;
use std::{collections::HashMap, path::PathBuf, time::Duration};
use clap::Parser;
use miette::{IntoDiagnostic as _, Result};
use tracing::{info, warn};
use config::OrchestratorConfig;
use hypervisor::{RouterHypervisor, NoopHypervisor, VmSpec, JobContext, Hypervisor};
use scheduler::{Scheduler, SchedItem};
/// Command-line / environment configuration for the orchestrator.
#[derive(Parser, Debug)]
#[command(name = "solstice-orchestrator", version, about = "Solstice CI Orchestrator")]
struct Opts {
    /// Path to orchestrator YAML config (image map)
    #[arg(long, env = "ORCH_CONFIG")]
    config: Option<PathBuf>,
    /// Global max concurrency for VM provisioning/execution
    #[arg(long, env = "MAX_CONCURRENCY", default_value_t = 2)]
    max_concurrency: usize,
    /// Per-label capacity map (e.g., illumos-latest=2,ubuntu-22.04=4)
    #[arg(long, env = "CAPACITY_MAP")]
    capacity_map: Option<String>,
    /// gRPC listen address (e.g., 0.0.0.0:50051)
    #[arg(long, env = "GRPC_ADDR", default_value = "0.0.0.0:50051")]
    grpc_addr: String,
    /// Postgres connection string
    #[arg(long, env = "DATABASE_URL", default_value = "postgres://user:pass@localhost:5432/solstice")]
    database_url: String,
    /// RabbitMQ URL (AMQP)
    #[arg(long, env = "AMQP_URL", default_value = "amqp://127.0.0.1:5672/%2f")]
    amqp_url: String,
    /// Exchange name
    #[arg(long, env = "AMQP_EXCHANGE", default_value = "solstice.jobs")]
    amqp_exchange: String,
    /// Queue name to consume
    #[arg(long, env = "AMQP_QUEUE", default_value = "solstice.jobs.v1")]
    amqp_queue: String,
    /// Routing key (bind)
    #[arg(long, env = "AMQP_ROUTING_KEY", default_value = "jobrequest.v1")]
    amqp_routing_key: String,
    /// Prefetch for consumer QoS (will be overridden to max_concurrency if absent)
    #[arg(long, env = "AMQP_PREFETCH")]
    amqp_prefetch: Option<u16>,
    /// Libvirt URI (Linux only)
    #[arg(long, env = "LIBVIRT_URI", default_value = "qemu:///system")]
    libvirt_uri: String,
    /// Libvirt network name (Linux only)
    #[arg(long, env = "LIBVIRT_NETWORK", default_value = "default")]
    libvirt_network: String,
    /// OTLP endpoint (e.g., http://localhost:4317)
    #[arg(long, env = "OTEL_EXPORTER_OTLP_ENDPOINT")]
    otlp: Option<String>,
}
/// Orchestrator entry point: load config, prefetch images, start the
/// scheduler, consume JobRequests from AMQP, and drain gracefully on Ctrl-C.
#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> {
    let _t = common::init_tracing("solstice-orchestrator")?;
    let opts = Opts::parse();
    info!(grpc_addr = %opts.grpc_addr, db = %opts.database_url, amqp = %opts.amqp_url, "orchestrator starting");
    // Load orchestrator config (image map) and ensure images are present locally
    let cfg = OrchestratorConfig::load(opts.config.as_deref()).await?;
    config::ensure_images(&cfg).await?;
    // Capacity settings
    let capacity_map = parse_capacity_map(opts.capacity_map.as_deref());
    // Build hypervisor router
    let router = RouterHypervisor::build(opts.libvirt_uri.clone(), opts.libvirt_network.clone());
    // Build MQ config and start consumer. Prefetch defaults to max_concurrency
    // so AMQP backpressure matches the scheduler's capacity.
    let mq_cfg = common::MqConfig {
        url: opts.amqp_url,
        exchange: opts.amqp_exchange,
        routing_key: opts.amqp_routing_key,
        queue: opts.amqp_queue,
        dlx: std::env::var("AMQP_DLX").unwrap_or_else(|_| "solstice.dlx".into()),
        dlq: std::env::var("AMQP_DLQ").unwrap_or_else(|_| "solstice.jobs.v1.dlq".into()),
        prefetch: opts.amqp_prefetch.unwrap_or(opts.max_concurrency as u16),
    };
    // Scheduler
    let sched = Scheduler::new(router, opts.max_concurrency, &capacity_map);
    let sched_tx = sched.sender();
    let scheduler_task = tokio::spawn(async move {
        if let Err(e) = sched.run().await {
            warn!(error = %e, "scheduler stopped with error");
        }
    });
    // Consumer: enqueue and ack-on-accept. Returning Ok from the closure acks
    // the AMQP message; returning Err nacks it to the DLQ.
    let cfg_clone = cfg.clone();
    let mq_cfg_clone = mq_cfg.clone();
    let tx_for_consumer = sched_tx.clone();
    let consumer_task = tokio::spawn(async move {
        common::consume_jobs(&mq_cfg_clone, move |job| {
            let sched_tx = tx_for_consumer.clone();
            let cfg = cfg_clone.clone();
            async move {
                // Resolve the runner label (aliases, default) to an image mapping.
                let label_resolved = cfg.resolve_label(job.runs_on.as_deref()).unwrap_or(&cfg.default_label).to_string();
                let image = match cfg.image_for(&label_resolved) {
                    Some(img) => img,
                    None => {
                        warn!(runs_on = ?job.runs_on, "no image mapping for label");
                        miette::bail!("no image mapping for label {}", label_resolved);
                    }
                };
                // Build spec from per-image defaults, falling back to 2 CPU / 2 GiB / 40 GiB.
                let (cpu, ram_mb, disk_gb) = (
                    image.defaults.as_ref().and_then(|d| d.cpu).unwrap_or(2),
                    image.defaults.as_ref().and_then(|d| d.ram_mb).unwrap_or(2048),
                    image.defaults.as_ref().and_then(|d| d.disk_gb).unwrap_or(40),
                );
                let spec = VmSpec {
                    label: label_resolved.clone(),
                    image_path: image.local_path.clone(),
                    cpu,
                    ram_mb,
                    disk_gb,
                    network: None, // libvirt network handled in backend
                    nocloud: image.nocloud,
                    user_data: Some(make_cloud_init_userdata(&job.repo_url, &job.commit_sha)),
                };
                if !spec.nocloud {
                    warn!(label = %label_resolved, "image is not marked nocloud=true; cloud-init may not work");
                }
                let ctx = JobContext {
                    request_id: job.request_id,
                    repo_url: job.repo_url,
                    commit_sha: job.commit_sha,
                    workflow_job_id: job.workflow_job_id,
                };
                sched_tx.send(SchedItem { spec, ctx }).await.into_diagnostic()?;
                Ok(()) // ack on accept
            }
        }).await
    });
    // Wait for ctrl-c
    tokio::signal::ctrl_c().await.into_diagnostic()?;
    // Cancel consumer to stop accepting new jobs
    consumer_task.abort();
    // Drop sender to let scheduler drain and exit
    drop(sched_tx);
    // Wait for scheduler to finish draining
    let _ = scheduler_task.await;
    Ok(())
}
/// Parse a comma-separated "label=N" capacity list into a map.
/// Entries that are empty, lack '=', or have a non-numeric count are skipped.
fn parse_capacity_map(s: Option<&str>) -> HashMap<String, usize> {
    s.unwrap_or_default()
        .split(',')
        .filter(|entry| !entry.trim().is_empty())
        .filter_map(|entry| entry.split_once('='))
        .filter_map(|(label, count)| {
            count.parse::<usize>().ok().map(|n| (label.trim().to_string(), n))
        })
        .collect()
}
/// Render a minimal cloud-config (NoCloud user-data) that writes the job's
/// repo URL and commit SHA to `/etc/solstice/job.yaml` inside the guest.
// NOTE(review): repo_url/commit_sha are interpolated into YAML unescaped —
// assumes they contain no newlines/quotes; verify upstream validation.
fn make_cloud_init_userdata(repo_url: &str, commit_sha: &str) -> Vec<u8> {
    let s = format!(r#"#cloud-config
write_files:
- path: /etc/solstice/job.yaml
permissions: '0644'
owner: root:root
content: |
repo_url: {repo}
commit_sha: {sha}
"#, repo = repo_url, sha = commit_sha);
    s.into_bytes()
}

View file

@ -0,0 +1,78 @@
use miette::{IntoDiagnostic as _, Result};
use sea_orm::{Database, DatabaseConnection, sea_query::{OnConflict, Expr}, Query, EntityTrait, DbBackend, Statement};
use sea_orm::sea_query::{InsertStatement, PostgresQueryBuilder, ColumnRef};
use time::OffsetDateTime;
use uuid::Uuid;
/// Connect to Postgres only when `ORCH_ENABLE_DB` is "1" or "true"
/// (case-insensitive); otherwise return `None` so local runs need no DB.
/// Optionally runs migrations after connecting.
pub async fn maybe_init_db(url: &str, run_migrations: bool) -> Result<Option<DatabaseConnection>> {
    let enabled = matches!(
        std::env::var("ORCH_ENABLE_DB").ok().as_deref(),
        Some(v) if v == "1" || v.eq_ignore_ascii_case("true")
    );
    if !enabled {
        return Ok(None);
    }
    let db = Database::connect(url).await.into_diagnostic()?;
    if run_migrations {
        migration::Migrator::up(&db, None).await.into_diagnostic()?;
    }
    Ok(Some(db))
}
/// Upsert one row into `jobs` keyed by `request_id`, setting/refreshing state.
///
/// Emits `INSERT ... ON CONFLICT (request_id) DO UPDATE SET state, updated_at`
/// so repeated calls advance a job's state in place.
// NOTE(review): sea_query's `InsertStatement::build` typically returns a
// `(String, Values)` pair, while `Statement::from_string` takes a plain SQL
// string — verify this against the pinned sea-orm/sea-query versions.
pub async fn record_job_state(
    db: &DatabaseConnection,
    request_id: Uuid,
    repo_url: &str,
    commit_sha: &str,
    runs_on: Option<&str>,
    state: &str,
) -> Result<()> {
    let now = OffsetDateTime::now_utc();
    // INSERT ... ON CONFLICT (request_id) DO UPDATE SET state=EXCLUDED.state, updated_at=EXCLUDED.updated_at
    let mut insert: InsertStatement = sea_orm::sea_query::Query::insert()
        .into_table(Jobs::Table)
        .columns([
            Jobs::RequestId,
            Jobs::RepoUrl,
            Jobs::CommitSha,
            Jobs::RunsOn,
            Jobs::State,
            Jobs::CreatedAt,
            Jobs::UpdatedAt,
        ])
        .values_panic([
            Expr::val(request_id),
            Expr::val(repo_url),
            Expr::val(commit_sha),
            Expr::val(runs_on.map(|s| s.to_string())),
            Expr::val(state),
            Expr::val(now),
            Expr::val(now),
        ])
        .on_conflict(
            OnConflict::column(Jobs::RequestId)
                .update_columns([Jobs::State, Jobs::UpdatedAt])
                .to_owned(),
        )
        .to_owned();
    let sql = insert.build(PostgresQueryBuilder);
    db.execute(Statement::from_string(DbBackend::Postgres, sql)).await.into_diagnostic()?;
    Ok(())
}
// sea_query identifiers for the `jobs` table written by `record_job_state`.
#[derive(sea_orm::sea_query::Iden)]
enum Jobs {
    #[iden = "jobs"]
    Table,
    #[iden = "request_id"]
    RequestId,
    #[iden = "repo_url"]
    RepoUrl,
    #[iden = "commit_sha"]
    CommitSha,
    #[iden = "runs_on"]
    RunsOn,
    #[iden = "state"]
    State,
    #[iden = "created_at"]
    CreatedAt,
    #[iden = "updated_at"]
    UpdatedAt,
}

View file

@ -0,0 +1,102 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use dashmap::DashMap;
use miette::Result;
use tokio::sync::{mpsc, Semaphore};
use tokio::task::JoinSet;
use tracing::{error, info};
use crate::hypervisor::{Hypervisor, VmSpec, JobContext};
use crate::persist;
use sea_orm::DatabaseConnection;
/// Label-aware VM scheduler: pulls `SchedItem`s off a channel and runs each
/// job's VM lifecycle under a global and a per-label concurrency limit.
pub struct Scheduler<H: Hypervisor + 'static> {
    // Shared hypervisor backend (router) used for all jobs.
    hv: Arc<H>,
    // Held so `sender()` can hand out clones before `run` consumes the receiver.
    tx: mpsc::Sender<SchedItem>,
    rx: mpsc::Receiver<SchedItem>,
    // Caps total in-flight VMs across all labels.
    global_sem: Arc<Semaphore>,
    // Per-label semaphores; labels missing from the capacity map default to 1.
    label_sems: Arc<DashmapType>,
}
// Map from runner label to its capacity semaphore.
type DashmapType = DashMap<String, Arc<Semaphore>>;
/// One unit of schedulable work: the VM spec plus job metadata.
pub struct SchedItem {
    pub spec: VmSpec,
    pub ctx: JobContext,
}
impl<H: Hypervisor + 'static> Scheduler<H> {
pub fn new(hv: H, max_concurrency: usize, capacity_map: &HashMap<String, usize>) -> Self {
let (tx, rx) = mpsc::channel::<SchedItem>(max_concurrency * 4);
let label_sems = DashMap::new();
for (label, cap) in capacity_map.iter() {
label_sems.insert(label.clone(), Arc::new(Semaphore::new(*cap)));
}
Self {
hv: Arc::new(hv),
tx,
rx,
global_sem: Arc::new(Semaphore::new(max_concurrency)),
label_sems: Arc::new(label_sems),
}
}
pub fn sender(&self) -> mpsc::Sender<SchedItem> { self.tx.clone() }
pub async fn run(self) -> Result<()> {
let Scheduler { hv, mut rx, global_sem, label_sems, .. } = self;
let mut handles = Vec::new();
while let Some(item) = rx.recv().await {
let hv = hv.clone();
let global = global_sem.clone();
let label_sems = label_sems.clone();
let handle = tokio::spawn(async move {
// Acquire global and label permits (owned permits so they live inside the task)
let _g = match global.acquire_owned().await {
Ok(p) => p,
Err(_) => return,
};
let label_key = item.spec.label.clone();
let sem_arc = if let Some(entry) = label_sems.get(&label_key) {
entry.clone()
} else {
let s = Arc::new(Semaphore::new(1));
label_sems.insert(label_key.clone(), s.clone());
s
};
let _l = match sem_arc.acquire_owned().await {
Ok(p) => p,
Err(_) => return,
};
// Provision and run
match hv.prepare(&item.spec, &item.ctx).await {
Ok(handle) => {
if let Err(e) = hv.start(&handle).await {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to start VM");
return;
}
info!(request_id = %item.ctx.request_id, label = %label_key, "vm started (workload execution placeholder)");
// Placeholder job runtime
tokio::time::sleep(Duration::from_secs(1)).await;
// Stop and destroy
if let Err(e) = hv.stop(&handle, Duration::from_secs(10)).await {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to stop VM");
}
if let Err(e) = hv.destroy(handle).await {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to destroy VM");
}
}
Err(e) => {
error!(error = %e, request_id = %item.ctx.request_id, label = %label_key, "failed to prepare VM");
return;
}
}
});
handles.push(handle);
}
// Wait for all in-flight tasks to finish
for h in handles { let _ = h.await; }
Ok(())
}
}

View file

@ -0,0 +1,11 @@
[package]
name = "workflow-runner"
version = "0.1.0"
edition = "2024"
[dependencies]
common = { path = "../common" }
clap = { version = "4", features = ["derive", "env"] }
miette = { version = "7", features = ["fancy"] }
tracing = "0.1"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "process"] }

View file

@ -0,0 +1,39 @@
use clap::Parser;
use miette::{IntoDiagnostic, Result};
use tracing::{info, error};
/// Command-line / environment configuration for the workflow runner.
#[derive(Parser, Debug)]
#[command(name = "solstice-runner", version, about = "Solstice CI Workflow Runner (VM agent)")]
struct Opts {
    /// Path to workflow KDL file
    #[arg(long, env = "SOL_WORKFLOW_PATH")]
    workflow: String,
}
/// Runner entry point: parse the workflow KDL file and print a human-readable
/// summary of its jobs and steps (execution is not implemented yet).
#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<()> {
    let _t = common::init_tracing("solstice-workflow-runner")?;
    let opts = Opts::parse();
    // Log a parse failure before propagating it, so it lands in the trace
    // output as well as the returned diagnostic.
    let wf = common::parse_workflow_file(&opts.workflow).map_err(|e| {
        error!(error = %e, "failed to parse workflow KDL");
        e
    })?;
    info!(name = ?wf.name, jobs = wf.jobs.len(), "loaded workflow");
    for (id, job) in &wf.jobs {
        println!("Job: {id}");
        if let Some(ro) = &job.runs_on {
            println!(" runs_on: {ro}");
        }
        let total = job.steps.len();
        for (idx, step) in job.steps.iter().enumerate() {
            let n = step.name.as_deref().unwrap_or("(unnamed)");
            println!(" Step {}/{}: {}", idx + 1, total, n);
            println!(" run: {}", step.run);
        }
    }
    Ok(())
}

23
docker-compose.yml Normal file
View file

@ -0,0 +1,23 @@
services:
rabbitmq:
image: rabbitmq:4-management-alpine
container_name: solstice-rabbitmq
restart: unless-stopped
environment:
RABBITMQ_DEFAULT_USER: guest
RABBITMQ_DEFAULT_PASS: guest
ports:
- "5672:5672" # AMQP
- "15672:15672" # Management UI
healthcheck:
test: ["CMD", "rabbitmq-diagnostics", "-q", "ping"]
interval: 10s
timeout: 5s
retries: 5
start_period: 5s
# volumes:
# - rabbitmq-data:/var/lib/rabbitmq
#volumes:
# rabbitmq-data:

View file

@ -0,0 +1,158 @@
### Forgejo Webhooks → JobRequest Mapping (Integration Layer)
This document explains how the Forge Integration service maps real Forgejo (Gitea-compatible) webhooks to our internal `JobRequest` messages and publishes them to RabbitMQ.
---
### Overview
- Service: `crates/forge-integration`
- Endpoint: `POST /webhooks/forgejo` (configurable via `WEBHOOK_PATH`)
- Auth: HMAC-SHA256 validation using `WEBHOOK_SECRET`
- Events handled: `push`, `pull_request` (`opened`, `synchronize`, `reopened`)
- Output: `JobRequest` (JSON) published to exchange `solstice.jobs` with routing key `jobrequest.v1`
---
### Headers and Security
- Event type header: `X-Gitea-Event` (or `X-Forgejo-Event`)
- Delivery ID header (optional, for logs): `X-Gitea-Delivery`
- Signature header: `X-Gitea-Signature` (or `X-Forgejo-Signature`)
- Value is lowercase hex of `HMAC_SHA256(secret, raw_request_body)`
- If `WEBHOOK_SECRET` is set, the service requires a valid signature and returns `401` on mismatch/missing header.
- If unset, the service accepts requests (dev mode) and logs a warning.
Signature example (shell):
```bash
SECRET=your_shared_secret
BODY='{"after":"deadbeef", "repository":{}}'
SIG=$(printf %s "$BODY" | openssl dgst -sha256 -hmac "$SECRET" -binary | xxd -p -c 256)
# Send:
curl -sS -X POST http://127.0.0.1:8080/webhooks/forgejo \
-H "Content-Type: application/json" \
-H "X-Gitea-Event: push" \
-H "X-Gitea-Signature: $SIG" \
--data "$BODY"
```
---
### Payload Mapping → JobRequest
We only deserialize the minimal fields required to construct a `JobRequest`. Unused fields are ignored.
- Push event (`X-Gitea-Event: push`):
- `repo_url` <- `repository.clone_url` (fallback `repository.ssh_url`)
- `commit_sha` <- `after`
- Ignore branch deletions where `after` is all zeros
- `source` = `forgejo`
Minimal push payload shape:
```json
{
"after": "0123456789abcdef0123456789abcdef01234567",
"repository": {
"clone_url": "https://forge.example.com/org/repo.git",
"ssh_url": "ssh://git@forge.example.com:2222/org/repo.git"
}
}
```
- Pull request event (`X-Gitea-Event: pull_request`):
- Only actions: `opened`, `synchronize`, `reopened` (others → 204 No Content)
- `repo_url` <- `pull_request.head.repo.clone_url` (fallback `ssh_url`)
- `commit_sha` <- `pull_request.head.sha`
- `source` = `forgejo`
Minimal PR payload shape:
```json
{
"action": "synchronize",
"pull_request": {
"head": {
"sha": "89abcdef0123456789abcdef0123456789abcdef",
"repo": {
"clone_url": "https://forge.example.com/org/repo.git",
"ssh_url": "ssh://git@forge.example.com:2222/org/repo.git"
}
}
}
}
```
`JobRequest` fields set now:
- `schema_version = "jobrequest.v1"`
- `request_id = Uuid::v4()`
- `source = forgejo`
- `repo_url` as above
- `commit_sha` as above
- `workflow_path = null` (may be inferred later)
- `workflow_job_id = null`
- `runs_on = null` (future enhancement to infer)
- `submitted_at = now(UTC)`
---
### AMQP Publication
- Exchange: `solstice.jobs` (durable, direct)
- Routing key: `jobrequest.v1`
- Queue: `solstice.jobs.v1` (declared by both publisher and consumer)
- DLX/DLQ: `solstice.dlx` / `solstice.jobs.v1.dlq`
- Publisher confirms enabled; messages are persistent (`delivery_mode = 2`).
Env/CLI (defaults):
- `AMQP_URL=amqp://127.0.0.1:5672/%2f`
- `AMQP_EXCHANGE=solstice.jobs`
- `AMQP_ROUTING_KEY=jobrequest.v1`
- `AMQP_QUEUE=solstice.jobs.v1`
- `AMQP_DLX=solstice.dlx`
- `AMQP_DLQ=solstice.jobs.v1.dlq`
---
### Configuration
- HTTP address: `HTTP_ADDR` (default `0.0.0.0:8080`)
- Webhook path: `WEBHOOK_PATH` (default `/webhooks/forgejo`)
- Shared secret: `WEBHOOK_SECRET` (required in prod)
- AMQP settings: see above
Example run:
```bash
export WEBHOOK_SECRET=devsecret
cargo run -p orchestrator &
cargo run -p forge-integration -- --http-addr 0.0.0.0:8080 --webhook-path /webhooks/forgejo
```
---
### Forgejo Setup
1. In the repository Settings → Webhooks, add a new webhook:
- Target URL: `http://<your-host>:8080/webhooks/forgejo`
- Content type: `application/json`
- Secret: your `WEBHOOK_SECRET`
- Events: check "Just the push event" and "Pull request events" (or their equivalents)
2. Save and use "Test Delivery" to verify a 202 response.
---
### Local Verification via curl
Create a minimal push body and compute signature:
```bash
SECRET=devsecret
BODY='{"after":"deadbeefdeadbeefdeadbeefdeadbeefdeadbeef","repository":{"clone_url":"https://example/repo.git"}}'
SIG=$(printf %s "$BODY" | openssl dgst -sha256 -hmac "$SECRET" -binary | xxd -p -c 256)
curl -i -X POST http://127.0.0.1:8080/webhooks/forgejo \
-H "Content-Type: application/json" \
-H "X-Gitea-Event: push" \
-H "X-Gitea-Signature: $SIG" \
--data "$BODY"
```
You should see `HTTP/1.1 202 Accepted`, and the Orchestrator should log a received `JobRequest`.
---
### Notes & Next Steps
- Add commit status updates back to Forgejo (`pending` on receipt; `success`/`failure` on completion).
- Consider repo allowlist/branch filters to reduce noise.
- Add idempotency keyed by `X-Gitea-Delivery` + `repo+sha` to avoid duplicate enqueues.
- Optional: prefer SSH URLs via config if your Orchestrator uses SSH keys for fetch.

View file

@ -0,0 +1,62 @@
### Solstice CI — Orchestrator Scheduling, Image Map Config, and Libvirt/Zones Backends (MVP)
This document summarizes the initial implementation of Orchestrator scheduling, a YAML-based image map configuration, cloud image preparation, and a hypervisor abstraction with Linux/KVM (libvirt) and illumos zones scaffolding.
#### What's included (MVP)
- Scheduler and capacity
- Global max concurrency (`MAX_CONCURRENCY`, default 2) with backpressure by aligning AMQP `prefetch` to concurrency.
- Optional per-label capacity via `CAPACITY_MAP` (e.g., `illumos-latest=2,ubuntu-22.04=4`).
- Ack-on-accept: AMQP message is acked after basic validation and enqueue to scheduler; errors during provisioning are handled internally.
- YAML image map configuration
- Loaded at startup from `--config` / `ORCH_CONFIG`; defaults to `examples/orchestrator-image-map.yaml`.
- Keys: `default_label`, `aliases`, optional `sizes` presets, and `images` map with backend (`zones` or `libvirt`), `source` URL, `local_path`, `decompress` (`zstd` or none), `nocloud` (bool), and per-image default resources.
- Default mapping provided:
- `default_label: illumos-latest`
- Alias: `illumos-latest → openindiana-hipster`
- `openindiana-hipster` image points to current OI cloud image: `https://dlc.openindiana.org/isos/hipster/20250402/OI-hipster-cloudimage.img.zstd`, marked `nocloud: true` and `backend: zones`.
- Size presets (not yet consumed directly by jobs): `small` (1 CPU, 1 GiB), `medium` (2 CPU, 2 GiB), `large` (4 CPU, 4 GiB).
- Image preparation (downloader)
- On startup, the orchestrator ensures each configured image exists at `local_path`.
- If missing, downloads from `source` and optionally decompresses with Zstd into the target path.
- Hypervisor abstraction
- `Hypervisor` trait and `RouterHypervisor` dispatcher.
- Backends:
- `libvirt` (Linux/KVM): skeleton that connects to libvirt in `prepare`; domain XML/overlay/NoCloud seed wiring to follow.
- `zones` (illumos/bhyve): stub scaffold (not yet functional); will integrate with `zone` crate + ZFS clones in a follow-up.
- `NoopHypervisor` for development on hosts without privileges.
- Orchestrator MQ wiring
- Consumes `JobRequest` messages and builds `VmSpec` from resolved label and image defaults.
- Injects minimal cloud-init user-data content (NoCloud) into the spec for future seeding.
#### Configuration (CLI/env)
- `--config`, `ORCH_CONFIG` — path to YAML image map (default `examples/orchestrator-image-map.yaml`).
- `--max-concurrency`, `MAX_CONCURRENCY` — global VM concurrency (default 2).
- `--capacity-map`, `CAPACITY_MAP` — per-label capacity (e.g., `illumos-latest=2,ubuntu-22.04=4`).
- AMQP: `AMQP_URL`, `AMQP_EXCHANGE`, `AMQP_QUEUE`, `AMQP_ROUTING_KEY`, `AMQP_PREFETCH` (defaulted to `MAX_CONCURRENCY`).
- Libvirt (Linux): `LIBVIRT_URI` (default `qemu:///system`), `LIBVIRT_NETWORK` (default `default`).
#### Local usage (dev)
1. Ensure RabbitMQ is running (docker-compose service `rabbitmq`).
2. Start the Orchestrator:
```bash
cargo run -p orchestrator -- \
--config examples/orchestrator-image-map.yaml \
--max-concurrency 2
```
On first run, the OI cloud image will be downloaded and decompressed to the configured `local_path`.
3. In another terminal, enqueue a job (Forge Integration webhook or CLI `enqueue`). The orchestrator will resolve `runs_on` (or default label) and schedule a VM using the configured backend.
Note: The current libvirt/zones backends are partial; actual VM boot is a follow-up. The scheduler and config wiring are complete and ready for backend integration.
#### What's next (planned)
- Libvirt backend:
- Create qcow2 overlays, generate domain XML (virtio devices), attach NoCloud ISO seed, define, start, shutdown, and destroy.
- Ensure libvirt default network is active at startup if necessary.
- Illumos zones backend:
- Integrate `oxidecomputer/zone` and ZFS clone workflow; set bhyve attributes (`vcpus`, `ram`, `bootdisk`), networking, and SMF.
- Lifecycle and runner coordination:
- gRPC Orchestrator↔Runner for logs/status, job completion handling, and cleanup.
- Persistence and recovery:
- Store job/VM state in Postgres; graceful recovery on restart.
- Tests and docs:
- Unit tests for config parsing and scheduler; feature-gated libvirt smoke test; expand docs.

View file

@ -0,0 +1,117 @@
### Solstice CI — RabbitMQ Integration and Internal JobRequest Schema (v1)
This document describes the first iteration of our Integration Layer wiring to a real RabbitMQ broker, the internal message schema used between Integration and Orchestrator, and the common utilities that ensure consistent messaging topology across services.
#### Goals
- Decouple Integration and Orchestrator via a durable message queue.
- Use a versioned, forward-compatible message schema (`jobrequest.v1`).
- Centralize messaging code in `crates/common` so all services share the same setup and patterns.
- Follow RabbitMQ best practices: durable exchanges/queues, DLX/DLQ, publisher confirms, consumer prefetch, explicit ack/nack.
---
### Topology (created automatically by services)
- Exchange (direct, durable): `solstice.jobs`
- Routing key: `jobrequest.v1` (aligns with schema version)
- Queue (durable): `solstice.jobs.v1`
- Dead-letter exchange (fanout, durable): `solstice.dlx`
- Dead-letter queue (durable): `solstice.jobs.v1.dlq`
- Bindings:
- `solstice.jobs.v1` is bound to `solstice.jobs` with `jobrequest.v1`.
- `solstice.jobs.v1.dlq` is bound to `solstice.dlx` (fanout, no routing key).
Characteristics:
- Publishing uses `mandatory = true`, `delivery_mode = 2` (persistent), and publisher confirms.
- Consumers use `basic_qos(prefetch)` for backpressure and explicitly `ack` success or `nack(requeue=false)` on error → DLQ.
---
### Shared Code in `crates/common`
- `common::messages::JobRequest` (serde-JSON; versioned schema):
- `schema_version: "jobrequest.v1"` (default)
- `request_id: uuid::Uuid`
- `source: enum { github, forgejo, manual }`
- `repo_url: String`
- `commit_sha: String`
- `workflow_path: Option<String>` — optional KDL workflow path
- `workflow_job_id: Option<String>` — optional specific job id to run
- `runs_on: Option<String>` — scheduling hint
- `submitted_at: time::OffsetDateTime` (UTC)
- `common::mq`:
- `MqConfig { url, exchange, routing_key, queue, dlx, dlq, prefetch }` — env/CLI configurable.
- `declare_topology(&Channel, &MqConfig)` — idempotent, called by both publishers and consumers.
- `publish_job(&MqConfig, &JobRequest)` — JSON publish with confirms.
- `consume_jobs(&MqConfig, handler)` — starts a consumer with QoS, deserializes `JobRequest`, and `ack`/`nack`s accordingly.
---
### Service Wiring
- Forge Integration (`crates/forge-integration`):
- New CLI subcommand `enqueue` for developer testing:
- Flags: `--repo-url`, `--commit-sha`, `--runs-on` (optional)
- Uses `common::publish_job` to send a `JobRequest` to the broker.
- The service will later accept real webhooks and publish `JobRequest`s in response.
- Orchestrator (`crates/orchestrator`):
- Starts a consumer via `common::consume_jobs` and logs each received `JobRequest`.
- This is where capacity checks, scheduling, and job provisioning will be added next.
---
### Configuration (env or CLI)
- `AMQP_URL` (default `amqp://127.0.0.1:5672/%2f`)
- `AMQP_EXCHANGE` (default `solstice.jobs`)
- `AMQP_QUEUE` (default `solstice.jobs.v1`)
- `AMQP_ROUTING_KEY` (default `jobrequest.v1`)
- `AMQP_DLX` (default `solstice.dlx`)
- `AMQP_DLQ` (default `solstice.jobs.v1.dlq`)
- `AMQP_PREFETCH` (default `64`)
Each service exposes overrides via Clap flags that mirror the env vars.
---
### Local Development
1. Start RabbitMQ with Docker Compose:
```bash
docker compose up -d rabbitmq
# AMQP: localhost:5672, Management UI: http://localhost:15672 (guest/guest)
```
2. Run the Orchestrator consumer:
```bash
cargo run -p orchestrator -- \
--amqp-url amqp://127.0.0.1:5672/%2f \
--amqp-exchange solstice.jobs \
--amqp-queue solstice.jobs.v1 \
--amqp-routing-key jobrequest.v1 \
--amqp-prefetch 64
```
3. Enqueue a sample `JobRequest` from Forge Integration:
```bash
cargo run -p forge-integration -- enqueue \
--repo-url https://github.com/example/repo.git \
--commit-sha deadbeefdeadbeefdeadbeefdeadbeefdeadbeef \
--runs-on illumos-stable
```
You should see the orchestrator log receipt of the job and ack it. Any deserialization or handler error results in the message being dead-lettered to `solstice.jobs.v1.dlq`.
---
### Rationale and Best Practices Used
- Durable exchange/queue and persistent messages to avoid data loss on broker restarts.
- Publisher confirms and `mandatory` flag to ensure broker acceptance; failures can be surfaced to the publisher.
- DLX/DLQ for poison messages and non-transient failures, preventing consumer lockups.
- QoS prefetch to match consumer concurrency and protect Orchestrator capacity.
- Versioned routing key (`jobrequest.v1`) to allow schema evolution without breaking existing consumers.
- Centralized declaration logic (`common::mq`) to keep all services consistent.
---
### Next Steps
- Map real webhooks in Integration to `JobRequest` creation.
- Implement Orchestrator scheduling and VM provisioning based on `runs_on` and workload capacity.
- Add observability fields (trace context, forge metadata) to `JobRequest` as needed (additive only, maintain compatibility).

View file

@ -0,0 +1,83 @@
### Solstice CI — Architecture Overview (KDL Jobs + Multi-Host Orchestrator)
This document updates the earlier blueprint to reflect the current direction of Solstice CI:
- The project name is Solstice CI (not Helios CI).
- Workflows are defined in KDL (KDL:0) instead of YAML.
- The Orchestrator is designed to run on multiple hosts behind a shared queue for horizontal scale.
- A small set of crates provides clean separation of concerns: `orchestrator`, `forge-integration`, `github-integration`, `workflow-runner`, `common`, `ciadm`, and `cidev`.
#### Core Components
- Forge Integration Layer (`crates/forge-integration` and `crates/github-integration`)
- Receives webhooks from Forgejo or GitHub.
- Normalizes events and publishes job requests to the Orchestrator (direct API or message queue; see multihost section).
- Reports status back to the forge (Checks API for GitHub; Commit Status API for Forgejo).
- Orchestrator (`crates/orchestrator`)
- Provisions ephemeral VMs via bhyve branded zones on illumos hosts and manages their lifecycle using ZFS clones.
- Streams logs and results between the VM resident runner and the Integration Layer.
- Multi-host aware: multiple Orchestrator instances can run on different illumos hosts and share work (see below).
- Workflow Runner (`crates/workflow-runner`)
- Minimal agent binary preinstalled in the base VM image.
- Fetches job definition from the Orchestrator, executes steps, streams logs, and returns final status.
- Common (`crates/common`)
- DRY utilities used by all binaries: tracing/log initialization, KDL job parsing, and future shared abstractions.
- Admin CLI (`crates/ciadm`)
- Operator utility to trigger jobs, check status, etc., against the Orchestrator.
- Dev CLI (`crates/cidev`)
- Developer utility to validate KDL files locally, inspect jobs and steps, and debug CI issues without needing the full system.
#### Multi-Host Orchestration
To support multiple hosts, Solstice CI uses a shared queue (e.g., RabbitMQ) between the Integration Layer and Orchestrators:
- The Integration Layer publishes job requests into a durable queue.
- Any healthy Orchestrator node can consume a job, subject to capacity constraints.
- Nodes coordinate through the queue and an internal state store (e.g., Postgres) for job status.
- Each node manages ZFS clones and bhyve zones locally; failure isolation is per-node.
- This model scales linearly by adding illumos hosts with Orchestrator instances.
#### KDL Workflow Definition
Solstice CI adopts a simple, explicit KDL schema for workflows. Example:
```
workflow name="Solstice CI" {
job id="build" runs_on="illumos-stable" {
step name="Format" run="cargo fmt --check"
step name="Clippy" run="cargo clippy -- -D warnings"
step name="Test" run="cargo test --workspace"
}
job id="lint" runs_on="ubuntu-22.04" {
step name="Lint" run="ruff check ."
}
}
```
Key points:
- `workflow` is the root node; `name` is optional.
- One or more `job` nodes define independent VMs. Each job can have a `runs_on` hint to select a base image.
- Each `job` contains one or more `step` nodes with a `run` command and optional `name`.
The current parser lives in `crates/common/src/job.rs` and performs strict, typed parsing using the `kdl` crate.
#### Execution Flow (High-Level)
1. A forge sends a webhook to the Integration Layer.
2. Integration validates/authenticates and publishes a job request to the queue (or calls the Orchestrator API in single-node setups).
3. An Orchestrator node accepts the job, creates a ZFS clone of a golden VM image, builds a bhyve zone config, and boots the VM.
4. The Runner starts in the VM, obtains the job definition (including parsed KDL steps), then executes each step, streaming logs back.
5. On completion or failure, the Orchestrator halts the zone and destroys the ZFS clone, then finalizes status via the Integration Layer.
#### Security & Observability Notes
- Secrets should be injected via a secrets backend (e.g., Vault) and masked in logs.
- Tracing/logs are initialized consistently via `crates/common` and can be wired to OTLP later.
- Network isolation defaults to an isolated VNIC and restricted egress.
#### Current Repository Skeleton
- Tracing/log initialization is provided by `common::init_tracing` (console only for now).
- KDL job parsing types: `Workflow`, `Job`, `Step` and helpers in `crates/common/src/job.rs`.
- Binaries provide Clap-based CLIs with environment variable support.
- `cidev` validates and inspects KDL locally; `ciadm` is oriented to operator interactions with the Orchestrator.
#### Next Steps
- Wire the Integration Layer to a real message queue and define the internal job request schema.
- Implement Orchestrator capacity management and host selection.
- Add gRPC service definitions for Orchestrator <-> Runner streaming logs and control.
- Add GitHub App authentication (octocrab) and Forgejo (Gitea) client for status updates.
- Implement secure secrets injection and masking.

View file

@ -0,0 +1,373 @@
# **Helios CI: An Architectural Blueprint for a Native Illumos Continuous Integration System**
## **I. Executive Summary & Architectural Blueprint**
### **A. Vision Statement**
This document outlines the architecture for "Helios CI," a next-generation, self-hosted Continuous Integration (CI) platform engineered for security, performance, and operational simplicity. Helios CI leverages the robust foundation of the illumos operating system, the efficiency of the Rust programming language, and the native virtualization capabilities of bHyve to provide ephemeral, fully-isolated build environments. Its primary design goal is to offer a developer experience on par with market-leading cloud-native solutions through deep, native-like integration with both GitHub and Forgejo, running entirely on-premises. The system is designed for organizations that require absolute control over their build infrastructure, uncompromising security isolation between jobs, and the performance benefits of a purpose-built, vertically-integrated solution.
### **B. Core Architectural Pillars**
The Helios CI architecture is composed of three distinct, decoupled services that communicate over internal APIs. This separation of concerns ensures modularity, scalability, and maintainability.
* **Forge Integration Layer:** A stateless, public-facing web service responsible for all communication with external forges (GitHub, Forgejo). It authenticates and processes incoming webhook events, translates them into a standardized internal format, and uses the forge's API to report back detailed status and results. This layer acts as the system's ambassador to the outside world.
* **Orchestration Engine:** The stateful heart of the system. It receives validated job requests from the Integration Layer, manages the complete lifecycle of bHyve virtual machines (provisioning, booting, monitoring, teardown), and acts as a conduit for streaming logs and results between the Job Agent and the Integration Layer. This engine is the master of the illumos host's virtualization capabilities.
* **Job Execution Agent:** A lightweight, ephemeral agent residing within the bHyve VM guest operating system. It is responsible for receiving a job definition from the Orchestration Engine, executing the user-defined workflow steps, capturing all output, and communicating status back to the Orchestration Engine.
### **C. System Flow Diagram**
The end-to-end process of a CI run is a choreographed sequence of events spanning all three architectural pillars, triggered by a single developer action. The flow is as follows:
1. A developer pushes a commit to a tracked branch in a GitHub or Forgejo repository.
2. The forge detects this event and sends a webhook (e.g., `check_suite` from GitHub, `push` from Forgejo) to the publicly exposed endpoint of the Forge Integration Layer.
3. The Integration Layer validates the webhook's signature to ensure authenticity. It then performs the necessary authentication with the forge's API. For GitHub, this involves a multi-step JWT and installation token exchange; for Forgejo, it uses a pre-configured API token.
4. Immediately upon successful authentication, the Integration Layer makes an API call back to the forge to create an initial "pending" or "queued" status on the commit. This provides immediate feedback to the developer in the pull request or commit history UI.
5. The Integration Layer translates the webhook payload into a standardized job request and sends it to the Orchestration Engine via an internal, private API.
6. The Orchestration Engine receives the request and begins provisioning a new, isolated build environment. It uses the illumos ZFS filesystem to create a near-instantaneous, copy-on-write clone of a pre-configured base VM image.
7. Using the zonecfg utility or a corresponding Rust library, the Orchestrator defines a new bhyve branded zone, attaching the cloned ZFS volume as the VM's primary disk and configuring its virtual network interface. It then boots the zone, which starts the bHyve VM.
8. Inside the newly booted VM, the Job Execution Agent starts automatically. It contacts the Orchestration Engine to fetch the full job definition, which includes the repository URL, commit SHA, and the parsed steps from the workflow YAML file.
9. The Agent sets up the workspace by cloning the specified commit from the repository. It then begins executing each step defined in the workflow file sequentially.
10. As each step runs, the Agent captures its stdout and stderr in real-time and streams this log data back to the Orchestration Engine. The Orchestrator, in turn, forwards these log chunks to the Forge Integration Layer.
11. The Integration Layer continuously updates the status check on the forge's platform, appending the new log data. This allows developers to watch the build log live from their browser, directly within the GitHub or Forgejo UI.
12. Upon completion of all steps, the Agent reports the final status (e.g., success, failure) and any structured results (like test failures or code annotations) to the Orchestrator and then terminates.
13. The Orchestration Engine receives the final status and immediately begins teardown. It halts the bhyve zone and issues a zfs destroy command on the cloned ZFS volume, completely and irrevocably wiping the entire build environment and all its artifacts.
14. The Orchestrator forwards the final job result to the Forge Integration Layer, which makes a final API call to the forge, updating the check with its terminal conclusion (success or failure) and any detailed summary or annotations. The developer now sees the final green checkmark or red 'X' next to their commit.
## **II. The Forge Integration Layer: The System's Public Face**
This layer serves as the critical bridge between the external developer platforms and the internal CI logic. Its design must prioritize security, robustness, and the creation of an abstract interface that can gracefully handle the significant differences in capabilities between target forges like GitHub and Forgejo.
### **A. Integrating with GitHub: The Gold Standard via GitHub Apps**
To achieve a truly "native-like experience" on GitHub, the use of a GitHub App is not merely a preference but a strict architectural requirement. The GitHub Checks API, which enables the rich, multi-stage, and annotated feedback that defines modern CI systems, is exclusively available to GitHub Apps.1 This stands in stark contrast to older methods like Personal Access Tokens (PATs) or standard OAuth Apps, which are restricted to the much simpler Commit Status API.
Furthermore, from a security standpoint, GitHub Apps represent a significant leap forward. They employ a model of fine-grained permissions, allowing the application to request only the specific access it needs (e.g., checks:write, contents:read) rather than broad, all-encompassing scopes like repo. Access can be granted on a per-repository basis by the installing user or organization, and the operational tokens are deliberately short-lived. This principle of least privilege is a cornerstone of modern security design and is strongly recommended by GitHub's own best practices.2
#### **Authentication Flow Deep Dive**
The authentication process for a GitHub App is a sophisticated, multi-step dance designed to maximize security by minimizing the exposure of long-lived credentials.
1. **Secure Credential Storage:** Upon registration, the GitHub App is assigned an App ID and allows for the generation of a private key in .pem format. These two pieces of information are the root credentials for the application. The private key is paramount and must be treated with the utmost care, stored in a secure, managed secrets store such as HashiCorp Vault or a cloud provider's equivalent (e.g., Azure Key Vault).7 It must never be stored in plaintext in configuration files or environment variables.
2. **Generating a JSON Web Token (JWT):** To initiate any API communication, the service must first authenticate *as the app itself*. It does this by creating and signing a JWT using its private key. This JWT is a short-lived credential (maximum 10-minute validity) that proves the service has access to the private key without ever transmitting the key over the network. The JWT payload must contain specific claims as mandated by GitHub: iat (issued at time), exp (expiration time), and iss (issuer, which is the App ID).8
3. **Requesting an Installation Access Token:** The generated JWT is then used to acquire a token that can act on behalf of a specific installation (i.e., a specific user or organization that has installed the app). The service makes a POST request to the GitHub API endpoint `/app/installations/{installation_id}/access_tokens`. The `installation_id` is conveniently provided in the payload of every webhook event the app receives. The request must include the JWT in the Authorization header, formatted as `Bearer <JWT>`.10
4. **Using the Installation Access Token:** The response from the API contains a temporary *installation access token*. This token is typically valid for one hour. All subsequent API calls made to perform actions on the repository (such as creating or updating a check run) will use this token in the Authorization header, again formatted as `Bearer <TOKEN>`.8 This two-step process ensures that the powerful private key is used only briefly and indirectly, while the operational token that interacts with repository data has a limited lifetime, drastically reducing the potential impact if it were ever compromised.
#### **Implementing a "Native" Experience with the Checks API**
The Checks API is the key to unlocking a rich user experience within the GitHub UI.
* **Check Suites and Check Runs:** When a developer pushes code, GitHub automatically creates a `check_suite` for the commit. It then sends a `check_suite` webhook to all installed GitHub Apps with the `checks:write` permission. Upon receiving this webhook, the Integration Layer should immediately use the API to create a corresponding `check_run`. This single action provides instant feedback in the pull request UI, showing that the CI process has been acknowledged and is underway.1 The initial `check_run` should be created with a status of `queued` or `in_progress`.
* **Rich, Granular Feedback:** A check\_run is a mutable object that can be updated throughout the job's lifecycle. The service can post a title, a detailed summary (which supports Markdown for formatting), and, most powerfully, annotations. Annotations are messages that can be tied to specific files and line numbers within the commit, complete with a severity level (notice, warning, or failure). This allows the CI system to report linting errors, test failures, or security vulnerabilities directly in the "Files changed" view of a pull request, providing context-rich feedback exactly where the developer is looking.1
* **Interactive UI Elements:** The Checks API also supports the definition of actions. These are rendered as buttons in the GitHub Checks UI and can be configured to send a new webhook event back to the app when clicked. This opens up possibilities for interactive features like "Re-run failed tests" or "Apply suggested fix" without requiring the developer to leave the GitHub interface.1
#### **Recommended Rust Crate: octocrab**
For implementing this integration, the octocrab crate is the standout choice. It is a modern, well-maintained, and extensible GitHub API client for Rust. Crucially, it has explicit, high-level support for the GitHub App authentication flow and provides a dedicated checks module as part of its strongly-typed semantic API.12 Its comprehensive set of data models for webhook payloads will also greatly simplify the process of deserializing and handling incoming events, reducing boilerplate and the risk of parsing errors.12 Using octocrab significantly de-risks the implementation by abstracting away the complexities of raw HTTP requests and manual JSON parsing.
### **B. Integrating with Forgejo & Gitea: A Pragmatic Approach**
Forgejo is a soft fork of Gitea, a popular self-hosted Git service.14 A critical piece of information for this project is that Forgejo maintains a high degree of API compatibility with Gitea, even providing a Gitea-compatible /api/v1 endpoint.14 This allows the project to leverage the more extensive documentation, community support, and broader ecosystem of SDKs available for Gitea, treating a Forgejo instance as a Gitea target for all practical purposes.16
#### **Authentication**
Authentication for the Forgejo/Gitea API is more straightforward than for GitHub Apps. It relies on a standard API access token, which can be generated by a user through the web interface. This token is then included in API requests within the Authorization header, using the format token \<TOKEN\>.19 These tokens are typically long-lived and must be securely stored and managed by the administrator of the Helios CI system.
#### **The Commit Status API**
A pivotal distinction between GitHub and Forgejo/Gitea is the absence of a direct equivalent to the rich Checks API in the latter. The mechanism available for reporting build status is the Commit Status API.16 This API allows an external service to attach a status to a specific commit SHA. The primary endpoint for this is POST /api/v1/repos/{owner}/{repo}/statuses/{sha}.25
#### **Capabilities and Limitations**
The Commit Status API is functional but limited. It accepts a payload containing:
* A state: pending, success, failure, error, or warning.
* A `target_url`: A URL that the status will link to, typically the CI job log view.
* A description: A short, one-line string summarizing the status.
* A context: A string used to differentiate this status from others (e.g., ci/helios/build, ci/helios/test).
This API provides a monolithic, per-job status. It lacks the granularity of the GitHub Checks API; there is no built-in support for reporting per-step feedback, streaming logs directly into the UI, adding line-level code annotations, or creating interactive UI elements. This represents a fundamental capability gap that will result in a less integrated and less rich user experience on Forgejo compared to GitHub.
#### **Recommended Rust Crates**
Given the API compatibility, a Gitea-focused Rust crate is the most pragmatic choice. The gitea-sdk crate appears to be a modern and well-structured option, offering a fluent builder pattern for API requests that is conceptually similar to octocrab.26 While other crates like gritea 28 and gitea 29 exist, they appear less actively maintained or documented. The native forgejo-api crate is still nascent and has sparse documentation, making the mature Gitea SDKs a lower-risk choice.30
### **C. Designing a Unified Forge Abstraction in Rust**
To prevent the core logic of the Orchestration Engine from being polluted with if github {... } else if forgejo {... } conditional blocks, a strong abstraction layer is essential. This will be achieved by defining a Forge trait in Rust, which will present a unified, idealized interface for all interactions required by the CI system.
The core challenge in designing this trait is the feature disparity between the two platforms. A "native-like experience" means different things on GitHub versus Forgejo. The GitHub experience is defined by the rich, interactive feedback of the Checks API, while the Forgejo experience is limited to the simpler, monolithic updates of the Commit Status API. The abstraction must be designed to accommodate the richer feature set of GitHub, while allowing for a "best-effort" or graceful degradation on platforms that lack those features.
```rust
use async_trait::async_trait;

// Represents a line-level annotation.
pub struct Annotation {
    pub path: String,
    pub line: u32,
    pub message: String,
    pub level: AnnotationLevel, // e.g., Notice, Warning, Failure
}

// Represents the state of a running job.
pub trait CheckRun {
    // ... methods to manage internal state ...
}

// The core abstraction for interacting with a forge.
#[async_trait]
pub trait Forge {
    /// Posts an initial "pending" status to the forge, returning a handle
    /// to the check run that can be used for subsequent updates.
    async fn report_pending(&self, job_context: &JobContext) -> Result<Box<dyn CheckRun>>;

    /// Updates the status of an in-progress check run, typically by
    /// appending new log output.
    async fn update_progress(&self, check_run: &mut Box<dyn CheckRun>, new_log_chunk: &str);

    /// Adds a specific code annotation to the check run.
    /// This will be a no-op for forges that do not support annotations.
    async fn add_annotation(&self, check_run: &mut Box<dyn CheckRun>, annotation: Annotation);

    /// Posts the final result of the job to the forge.
    async fn report_final_status(&self, check_run: Box<dyn CheckRun>, result: JobResult);
}
```
This design allows the rest of the system to operate against the idealized Forge trait. The system will attempt to add\_annotation regardless of the target forge. The GitHubForge implementation will translate this into a Checks API call, while the ForgejoForge implementation will simply do nothing. This cleanly isolates the platform-specific logic and makes the system extensible to other forges in the future.
The following table provides a clear, at-a-glance comparison for stakeholders and developers, highlighting the feature gap that informs the design of this unified abstraction.
| Feature | GitHub Checks API | Forgejo Commit Status API | Implication for Helios CI |
| :---- | :---- | :---- | :---- |
| **Overall Status** | Supported (queued, in\_progress, completed with conclusion) | Supported (pending, success, failure, etc.) | Core functionality is available on both platforms. |
| **Per-Step Status Updates** | Supported via check\_run updates. Can show "Step 2/5: Running tests..." | Not supported. A single description for the entire job. | The UI experience on Forgejo will be less granular, showing only the overall job status. |
| **Code Annotations** | Supported. Line-specific feedback with severity levels. | Not supported. | A major feature gap. Linting/test failures cannot be shown inline on Forgejo PRs. |
| **Detailed Log Streaming** | Supported. The output.text field can be updated in real-time. | Not supported. Status links to an external target\_url for logs. | Live log viewing must happen on the Helios CI web UI for Forgejo, not within the Forgejo UI itself. |
| **Custom UI Actions** | Supported. Can add buttons to the Checks UI to trigger new webhooks. | Not supported. | Interactive features like "re-run" must be initiated from outside the Forgejo UI. |
| **Authentication Model** | GitHub App (short-lived, scoped, per-installation tokens) | User API Token (long-lived, user-scoped) | The security model for GitHub is inherently stronger and more flexible. |
## **III. The Orchestration Engine: Managing Execution Environments on illumos**
The Orchestration Engine is the core of the Helios CI system, where the unique and powerful features of the illumos operating system are leveraged to create a highly efficient and secure job execution environment. The design of this component is critical to delivering on the promises of performance and isolation.
### **A. The bhyve Zone Brand: A Superior Model for VM Management**
Illumos zones are a mature and robust form of OS-level virtualization, providing strong process, filesystem, and network isolation for applications running in a shared kernel environment.31 A key feature of the zones framework is the concept of "brands," which allows a zone to run an environment other than the native illumos one. While brands like ipkg run a full, independent copy of illumos and lx runs Linux binaries, the bhyve brand is particularly relevant for our use case. It allows a full hardware-virtualized bhyve virtual machine to be managed as a standard illumos zone.31
Managing a bhyve VM through the zone framework is vastly superior to scripting raw bhyve command-line invocations. This approach integrates the VM's entire lifecycle into the operating system's core management facilities:
* **Service Management:** The VM becomes a standard Service Management Facility (SMF) service, manageable with svcadm and observable with svcs.
* **Resource Controls:** Standard zone resource controls (e.g., for CPU shares, memory caps) can be applied to the VM.
* **Unified Tooling:** The same set of commands (zonecfg for configuration, zoneadm for administration, zlogin for console access) is used for the bhyve VM as for a simple container-like zone, providing a consistent and powerful management paradigm.32
This elevates the VM from a mere process to a first-class citizen of the operating system, simplifying automation and enhancing reliability.
### **B. Programmatic VM Provisioning with zonecfg and ZFS**
The Orchestration Engine will not rely on statically pre-configured zones. To achieve true on-demand, ephemeral environments, it will programmatically generate a unique zone configuration for every CI job it processes.
#### **Dynamic Configuration with zonecfg**
The zonecfg utility is the standard tool for defining a zone's configuration.33 The Orchestrator will either generate a command file to be passed to zonecfg \-f or, preferably, use a native Rust library that performs the equivalent operations. The configuration for a bhyve branded zone will include several key properties:
* create \-b: This command initiates the creation of a new zone configuration.
* set brand=bhyve: This specifies that the zone will host a bhyve VM.32
* set zonepath=/path/to/zone/root: This defines the directory where the zone's configuration and runtime state will be stored. This path will point to a temporary, job-specific ZFS dataset.
* set ip-type=exclusive: This grants the zone its own dedicated virtual network interface and IP stack, ensuring complete network isolation from the host and other zones.32
* add net: This resource block configures the virtual network interface, specifying its physical link (typically a virtual switch or vnic) and allowed IP addresses.32
* add device: This resource is used to pass a block device from the global zone into the zone. This is how the VM's virtual disk will be provided.32
* add attr: The bhyve brand is configured almost entirely through these generic attribute resources. Key attributes include ram (e.g., 4G), vcpus (e.g., 2), and bootdisk (which points to the device added previously).32
#### **The Central Role of ZFS**
ZFS is not merely a filesystem in this architecture; it is the foundational technology that enables the system's efficiency and security.
* **Base VM Image:** A "golden" VM image, containing the guest OS and the pre-installed Job Execution Agent, will be maintained on a ZFS volume (a zvol), for example, at rpool/bhyve-images/ubuntu-agent-v1. This image is read-only.
* **Instantaneous Clones:** When a new CI job arrives, the Orchestrator's first action is to execute a zfs clone command. For example: zfs clone rpool/bhyve-images/ubuntu-agent-v1@latest rpool/ci-vms/job-123-disk. This operation creates a new, writable ZFS volume that initially shares all its data blocks with the parent image. It is a copy-on-write clone, meaning it is created almost instantaneously and consumes negligible disk space initially. This completely sidesteps the slow process of copying a multi-gigabyte disk image, which is a major bottleneck in traditional VM-based CI systems.
* **Guaranteed Isolation and Atomic Cleanup:** The cloned ZFS volume is dedicated to a single job. The zonecfg configuration will pass the path to this clone (e.g., /dev/zvol/rdsk/rpool/ci-vms/job-123-disk) into the zone as its boot disk. When the job is complete, the Orchestrator halts the zone and executes a single zfs destroy command on the clone. This atomically and irrevocably removes all traces of the build environment, including any modifications, downloaded dependencies, or generated artifacts. This provides a forensically-sound guarantee of a clean slate for every job.
This combination of bhyve branded zones and ZFS clones offers a unique and powerful architectural advantage, combining the strong security isolation of full hardware virtualization with a speed and efficiency approaching that of container-based systems.
### **C. Rust-based Orchestration using the oxidecomputer/zone Crate**
While it is possible to orchestrate this process by shelling out to zonecfg, zoneadm, and zfs commands, this approach is brittle, difficult to maintain, and presents potential security risks (e.g., command injection). A native Rust library that provides a safe, typed API for these operations is the professionally sound choice.
The oxidecomputer/zone crate is purpose-built for creating and managing illumos zones from within a Rust application.35 Although its public documentation and usage examples are sparse, its origin within the Oxide Computer Company—a company built on illumos—suggests it is a production-quality library designed for exactly this type of systems management task.36
The reliance on this sparsely documented crate represents the most significant technical risk in the project's implementation. Public information is limited, and the crate is not widely discussed in community forums.38 Oxide's own development philosophy indicates that their open-source contributions are primarily for their own use and customer support, not necessarily for building a broad user community.38
Therefore, a critical first step in the implementation phase must be a dedicated "discovery and de-risking" task. An engineer must be allocated time to thoroughly analyze the zone crate's source code, understand its API surface, and build small proof-of-concept applications to validate its capabilities for creating, configuring, booting, and destroying bhyve branded zones. This upfront investment is essential to mitigate the risk and ensure the project's success.
Based on the crate's stated purpose, the anticipated usage pattern within the Orchestrator will be:
1. Instantiate the zone crate's data structures to build a complete zone configuration in memory, programmatically setting the brand, zonepath, and adding network, device, and attribute resources.
2. Invoke a function within the crate that serializes this configuration and applies it to the system, equivalent to running zonecfg.
3. Utilize other functions in the crate that serve as safe wrappers around zoneadm commands to boot, halt, and ultimately destroy the zone after the job is complete.
The following table serves as a quick-reference guide for the key zonecfg properties that will need to be set programmatically for each bhyve zone.
| Resource Type | Property/Attribute Name | Type | Example Value | Description |
| :---- | :---- | :---- | :---- | :---- |
| (global) | brand | string | bhyve | Sets the zone brand to bhyve for hardware virtualization.32 |
| (global) | zonepath | string | /zones/job-123 | The root directory for the zone's configuration on a ZFS dataset.33 |
| (global) | ip-type | enum | exclusive | Gives the zone its own dedicated IP stack for network isolation.33 |
| net | physical | string | vnic0 | The name of the virtual NIC in the global zone to connect to.32 |
| device | match | string | /dev/zvol/rdsk/rpool/ci-vms/job-123-disk | Passes the job-specific ZFS volume into the zone as a block device.32 |
| attr | name=ram | string | 4G | Sets the amount of RAM allocated to the virtual machine.32 |
| attr | name=vcpus | string | 2 | Sets the number of virtual CPUs for the VM.32 |
| attr | name=bootdisk | string | rpool/ci-vms/job-123-disk | Specifies which device (matched above) is the primary boot disk.32 |
| attr | name=vnc | string | on | Enables VNC access for debugging or graphical installers.32 |
## **IV. CI Job Definition and Execution**
This section addresses the user-facing aspect of the CI system: how developers define their build and test pipelines, and how those definitions are translated into actions executed within the ephemeral virtual machines.
### **A. Defining Workflows: Adopting a Familiar YAML Syntax**
Rather than inventing a proprietary workflow syntax, which would create a significant barrier to adoption, Helios CI will adopt a schema that is largely compatible with the common features of GitHub Actions and Forgejo Actions.39 Both platforms utilize a similar YAML structure based on concepts like jobs, steps, and triggers. This approach allows developers to leverage their existing knowledge and, in many cases, use basic workflow files with minimal modification.
#### **Core Schema Elements**
The workflow file, typically located at .github/workflows/ci.yml or a similar path, will be structured around the following core keys:
* name: An optional string that provides a human-readable name for the workflow.
* on: A required key that specifies the events that trigger the workflow, such as push or pull\_request, potentially filtered by branch or path.
* jobs: A map where each key is a unique job ID. Each job runs in its own, separate VM environment.
* Within each job:
* runs-on: A string that specifies the type of build environment required. The Orchestrator will map this string (e.g., ubuntu-22.04, illumos-stable) to a specific "golden" VM image to be cloned.
* steps: A list of sequential steps to be executed. Each step is an individual task.
* Within each step:
* name: An optional descriptive name for the step, which will be displayed in the UI.
* uses: Specifies a reusable action to be run.
* run: A string or multi-line string containing a shell command to be executed.
Adopting this syntax provides immediate familiarity but also requires careful management of user expectations. While the *syntax* is compatible, the *execution environment* is unique to Helios CI. This means that complex, marketplace-style actions from GitHub or Forgejo (which are often JavaScript or Docker-based and rely on a specific runner environment) will not be compatible out of the box.46 The initial implementation of uses should be clearly documented to support only "local actions" (uses: ./path/to/action), where the action's code is checked into the user's own repository.47 This focuses the system on its core strength—executing arbitrary run commands in a secure environment—while avoiding the immense complexity of replicating the full GitHub Actions runner environment.
#### **Parsing with serde\_yaml**
The Rust ecosystem provides first-class tools for parsing and handling structured data. The serde framework, in combination with a YAML parsing crate, will be used to deserialize the workflow file into strongly-typed Rust structs. While serde\_yaml has been a popular choice, it is now largely unmaintained.48 A modern, maintained fork such as serde\_yaml\_ng is the recommended choice to ensure ongoing support and security.50 This approach provides compile-time safety, automatic validation of the workflow file's structure, and a clean, safe way to pass job definitions between the system's components.
### **B. The Ephemeral Job Agent**
The Job Agent is a small, self-contained, and statically-linked Rust binary that is pre-installed on every base VM image. Its design prioritizes simplicity and robustness, as it runs in an untrusted environment executing user-provided code.
#### **Agent Lifecycle**
1. **Startup:** When the bhyve VM is booted by the Orchestrator, a startup service (e.g., an SMF service on an illumos guest, or a systemd service on a Linux guest) immediately launches the Job Agent binary.
2. **Configuration and Job Fetching:** The agent needs to receive its specific job context (repository URL, commit SHA, parsed workflow steps, etc.). This information can be passed from the Orchestrator to the guest environment through several mechanisms, such as a small, mounted configuration drive, environment variables injected by the bhyve brand, or an initial API call from the agent back to a private metadata endpoint on the Orchestration Engine.
3. **Workspace Setup:** The agent's first task is to prepare the build environment. It will use an embedded Git library or shell out to git to clone the specified repository and check out the exact commit SHA into a local working directory.
4. **Step Execution:** The agent iterates through the list of steps provided in its job context. For each step containing a run command, it will spawn a shell process (/bin/sh or /bin/bash), execute the command, and meticulously capture every byte of its stdout and stderr streams in real-time.
5. **Live Log Streaming:** As the output is captured, it is immediately streamed back to the Orchestration Engine over a simple, persistent gRPC or TCP connection established at startup. This is the mechanism that enables live log viewing for the developer.
6. **Status Reporting:** After each step completes, the agent inspects its exit code. If the code is non-zero, the step is marked as failed. By default, a failed step will halt the execution of the entire job. The agent will immediately report the step's success or failure back to the Orchestrator.
7. **Shutdown:** Once all steps have been executed, or if a step fails and the job is aborted, the agent sends a final job status report (including the overall success or failure conclusion) to the Orchestration Engine. It then cleanly terminates its own process. This termination signals to the Orchestrator that the job is complete and the VM is ready for destruction.
## **V. Tying It All Together: Anatomy of a CI Run**
To synthesize the interactions between these components, this section provides a narrative, step-by-step walkthrough of a complete CI process. It follows a single git push from the developer's machine to the final, detailed result appearing in the GitHub pull request interface.
1. **The Trigger:** A developer on their local machine finalizes a feature and executes git push origin feature-branch. The commit is pushed to the corresponding repository on GitHub.
2. **The Webhook:** GitHub receives the push, identifies that it corresponds to an open pull request, and creates a check\_suite for the new commit SHA. It then dispatches a check\_suite webhook event to the pre-configured public endpoint of the Helios CI Forge Integration Layer. The JSON payload of this webhook contains the repository details, the commit SHA, and the unique installation\_id for the GitHub App.
3. **Authentication and Initial Feedback (Forge Integration Layer):**
* The web service receives the HTTP POST request. It first validates the X-Hub-Signature-256 header to confirm the request is authentic and originated from GitHub.
* Using its securely stored App ID and private key, the service generates a short-lived JWT. It immediately uses this JWT to request a one-hour installation access token from GitHub's API, specific to the installation\_id from the webhook.10
* With the newly acquired installation access token, it makes its first API call back to GitHub: a POST request to create a new check\_run. It sets the status to queued and the name to something descriptive, like 'Helios CI / build'. Within seconds of the push, a new pending check appears in the developer's pull request UI.
* The service then sanitizes the relevant information from the webhook payload (repo URL, SHA, etc.) and dispatches a validated job request to the Orchestration Engine's internal gRPC API.
4. **Provisioning the Environment (Orchestration Engine):**
* The Orchestration Engine receives the job request from its internal queue.
* It executes zfs clone rpool/bhyve-images/ubuntu-agent-v1@latest rpool/ci-vms/job-451-disk, creating an instantaneous, writable disk for the new VM.
* Leveraging the oxidecomputer/zone Rust crate, it programmatically constructs a new zone configuration in memory for a zone named job-451. It sets the brand to bhyve, points the zone's boot disk device to the newly created ZFS volume, and configures its networking to connect to an isolated virtual switch.32
* It commits this configuration to the system and then calls the equivalent of zoneadm \-z job-451 boot to start the VM.
5. **Job Execution (Job Agent):**
* The bhyve VM boots the Ubuntu guest OS. A systemd service automatically starts the pre-installed Rust-based Job Agent.
* The agent establishes a gRPC connection back to the Orchestrator and receives the parsed YAML steps for its assigned job.
* It clones the repository (https://github.com/org/repo.git) and checks out the specific commit SHA.
* It begins executing the defined steps: run: cargo fmt \--check, then run: cargo clippy \-- \-D warnings, and finally run: cargo test.
6. **Real-time Reporting:**
* As cargo test executes, it prints test status lines to stdout. The Job Agent captures this output line-by-line and immediately sends each line over its gRPC stream to the Orchestrator.
* The Orchestrator forwards these log chunks to the Forge Integration Layer.
* The Integration Layer makes a series of PATCH requests to the GitHub Checks API, updating the output.text field of the check\_run. The developer, watching the pull request in their browser, can see the test output appearing in real-time within the GitHub UI.
7. **A Test Failure:** One of the integration tests fails, causing cargo test to exit with a non-zero status code.
* The Job Agent detects the failure. It can optionally be configured to parse the cargo test output to identify the exact file and line number of the failing test assertion.
* It constructs a final report for the Orchestrator, indicating an overall job failure and including a structured Annotation object with the file path, line number, and error message of the failed test.
8. **Final Status and Teardown:**
* The Orchestrator receives the failure report. It immediately commands the bhyve zone to shut down via zoneadm \-z job-451 halt. Once the zone is halted, it executes zfs destroy \-r rpool/ci-vms/job-451-disk. The entire build environment, including the failed test's artifacts and logs, is instantly and completely destroyed.
* The Orchestrator forwards the final result, including the structured annotation, to the Forge Integration Layer.
* The Integration Layer makes one last PATCH request to the check\_run on GitHub. It sets the conclusion to failure and includes the annotation in the output object. Instantly, the pending check in the PR turns into a red 'X'. When the developer expands the details, they see the full log, and an annotation is placed directly on the line of code containing the failed assertion in the "Files changed" tab.
## **VI. Advanced Topics and Strategic Recommendations**
A robust CI system requires more than just job execution. This section outlines critical considerations for security, scalability, and performance that must be addressed to create a production-ready platform.
### **A. Security Hardening**
Security must be a foundational principle of the CI system's design, especially as it will be executing untrusted code from pull requests.
* **Private Key Management:** The GitHub App's private key is the ultimate credential for the system's identity on GitHub. If compromised, an attacker could impersonate the CI system across all installed repositories. This key must never be stored in a configuration file or environment variable in plain text. It should be stored in a dedicated secrets management system like HashiCorp Vault or a Hardware Security Module (HSM). The Forge Integration Layer should be configured to fetch this key at startup, and access to the secrets manager should be tightly controlled.
* **Network Isolation:** By default, the bhyve VMs should be provisioned on a completely isolated virtual network. On illumos, this can be achieved using an etherstub and a dedicated vnic for each zone. This network should have no route to the public internet or to sensitive internal production networks. For jobs that legitimately need to download dependencies from the internet (e.g., from crates.io), egress should be explicitly enabled and routed through a dedicated, filtering proxy that can enforce policies and log all outbound traffic.
* **Secrets Management for Jobs:** CI jobs often require access to secrets like deployment credentials or API keys. The Helios CI system must provide a secure mechanism for injecting these into the build environment. A robust solution would involve integrating the Orchestration Engine with a secrets backend (like Vault). The workflow YAML could specify which secrets are needed, and the Orchestrator would fetch them from Vault and inject them into the VM at boot time, making them available to the Job Agent as environment variables or temporary files in a tmpfs. These secrets must be masked from the build logs to prevent accidental exposure.
### **B. Scalability and Concurrency**
While the initial design can operate on a single powerful illumos host, a true production system must be able to scale horizontally.
* **Decoupling with a Job Queue:** The direct API call from the Integration Layer to the Orchestrator can become a bottleneck. A more scalable architecture would introduce a message queue (e.g., RabbitMQ, NATS) between the two services. The Integration Layer would simply publish a job request message to the queue.
* **Pool of Orchestrator Nodes:** The system can be scaled by creating a cluster of multiple physical illumos servers, each running an instance of the Orchestration Engine. These engines would act as consumers, pulling job requests from the central message queue. Each node would manage its own pool of local resources (CPU, RAM, ZFS storage) and run a certain number of concurrent bhyve VMs. This distributed model allows the system's total capacity to be scaled horizontally simply by adding more illumos hosts to the cluster.
### **C. Artifact and Cache Management with ZFS**
The capabilities of ZFS extend beyond just provisioning, offering powerful solutions for managing build artifacts and caching.
* **Artifacts:** For successful builds that produce artifacts (e.g., compiled binaries, documentation, container images), the Job Agent can package them into an archive. This archive can be streamed back to the Orchestrator before the VM is destroyed. The Orchestrator can then store this artifact on a dedicated ZFS dataset. The Integration Layer can either expose a secure download link for this artifact or use the forge's API to upload it to a "release" or as a job artifact.
* **Intelligent Caching:** The long pole in many CI jobs is downloading dependencies and recompiling code that hasn't changed. ZFS provides a uniquely elegant solution to this problem.
1. After a successful build for a job on feature-branch, before destroying its ZFS clone (rpool/ci-vms/job-451-disk), the Orchestrator can take a snapshot of it: zfs snapshot rpool/ci-vms/job-451-disk@cache.
2. It can then identify directories that are good candidates for caching, such as /root/.cargo/registry or the project's target directory.
3. When the next job for feature-branch (job 452\) comes in, the Orchestrator creates a new clone as usual (rpool/ci-vms/job-452-disk).
4. Instead of starting with a clean slate, it can use zfs send/recv to efficiently stream the data from the cached directories in the previous snapshot into the new volume. This process is extremely fast as it operates at the block level.
5. When the VM for job 452 boots, its Cargo registry and target directory are already populated, potentially saving many minutes of download and compilation time. This provides a highly efficient, storage-level caching mechanism with minimal overhead.
#### **Works cited**
1. Using the REST API to interact with checks \- GitHub Docs, accessed on October 25, 2025, [https://docs.github.com/en/rest/guides/using-the-rest-api-to-interact-with-checks](https://docs.github.com/en/rest/guides/using-the-rest-api-to-interact-with-checks)
2. PAT vs oAuth vs GitHub App · community · Discussion \#109668, accessed on October 25, 2025, [https://github.com/orgs/community/discussions/109668](https://github.com/orgs/community/discussions/109668)
3. Differences between GitHub Apps and OAuth apps \- GitHub Docs, accessed on October 25, 2025, [https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/differences-between-github-apps-and-oauth-apps](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/differences-between-github-apps-and-oauth-apps)
4. Deciding when to build a GitHub App, accessed on October 25, 2025, [https://docs.github.com/en/apps/creating-github-apps/about-creating-github-apps/deciding-when-to-build-a-github-app](https://docs.github.com/en/apps/creating-github-apps/about-creating-github-apps/deciding-when-to-build-a-github-app)
5. Best practices for creating a GitHub App, accessed on October 25, 2025, [https://docs.github.com/en/apps/creating-github-apps/about-creating-github-apps/best-practices-for-creating-a-github-app](https://docs.github.com/en/apps/creating-github-apps/about-creating-github-apps/best-practices-for-creating-a-github-app)
6. Replacing a GitHub Personal Access Token with a GitHub Application \- Aembit, accessed on October 25, 2025, [https://aembit.io/blog/replacing-a-github-personal-access-token-with-a-github-application/](https://aembit.io/blog/replacing-a-github-personal-access-token-with-a-github-application/)
7. Making GitHub API Requests with a JWT \- Thomas Stringer, accessed on October 25, 2025, [https://trstringer.com/github-api-requests-with-jwt/](https://trstringer.com/github-api-requests-with-jwt/)
8. GitHub App Token Authorization: A Complete Guide | by Abhishek Tiwari | Medium, accessed on October 25, 2025, [https://medium.com/@tiwari09abhi/github-app-token-authorization-a-complete-guide-169461f2953f](https://medium.com/@tiwari09abhi/github-app-token-authorization-a-complete-guide-169461f2953f)
9. Generating a JSON Web Token (JWT) for a GitHub App, accessed on October 25, 2025, [https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/generating-a-json-web-token-jwt-for-a-github-app](https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/generating-a-json-web-token-jwt-for-a-github-app)
10. Authenticating as a GitHub App installation, accessed on October 25, 2025, [https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/authenticating-as-a-github-app-installation](https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/authenticating-as-a-github-app-installation)
11. About authentication with a GitHub App, accessed on October 25, 2025, [https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/about-authentication-with-a-github-app](https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/about-authentication-with-a-github-app)
12. XAMPPRocky/octocrab: A modern, extensible GitHub API ... \- GitHub, accessed on October 25, 2025, [https://github.com/XAMPPRocky/octocrab](https://github.com/XAMPPRocky/octocrab)
13. octocrab \- crates.io: Rust Package Registry, accessed on October 25, 2025, [https://crates.io/crates/octocrab](https://crates.io/crates/octocrab)
14. Forgejo numbering scheme | Forgejo Beyond coding. We forge., accessed on October 25, 2025, [https://forgejo.org/docs/latest/user/versions/](https://forgejo.org/docs/latest/user/versions/)
15. Gitea Documentation, accessed on October 25, 2025, [https://docs.gitea.cn/en-us/1.19/](https://docs.gitea.cn/en-us/1.19/)
16. Gitea API | Gitea Documentation, accessed on October 25, 2025, [https://docs.gitea.com/api/1.24/](https://docs.gitea.com/api/1.24/)
17. API Reference — gitea v1.1.11 \- HexDocs, accessed on October 25, 2025, [https://hexdocs.pm/gitea/](https://hexdocs.pm/gitea/)
18. Gitea API. | Documentation | Postman API Network, accessed on October 25, 2025, [https://www.postman.com/api-evangelist/gitea/documentation/1jqejxn/gitea-api](https://www.postman.com/api-evangelist/gitea/documentation/1jqejxn/gitea-api)
19. API Usage \- Gitea Documentation, accessed on October 25, 2025, [https://docs.gitea.com/development/api-usage](https://docs.gitea.com/development/api-usage)
20. Gitea Official Website, accessed on October 25, 2025, [https://about.gitea.com/](https://about.gitea.com/)
21. API Usage | Forgejo Beyond coding. We forge., accessed on October 25, 2025, [https://forgejo.org/docs/latest/user/api-usage/](https://forgejo.org/docs/latest/user/api-usage/)
22. API Usage | Forgejo Beyond coding. We forge., accessed on October 25, 2025, [https://forgejo.org/docs/v1.20/user/api-usage/](https://forgejo.org/docs/v1.20/user/api-usage/)
23. set gitea status \- Tekton task \- Artifact Hub, accessed on October 25, 2025, [https://artifacthub.io/packages/tekton-task/tekton-tasks/gitea-set-status](https://artifacthub.io/packages/tekton-task/tekton-tasks/gitea-set-status)
24. Gitea Checks | Jenkins plugin, accessed on October 25, 2025, [https://plugins.jenkins.io/gitea-checks/](https://plugins.jenkins.io/gitea-checks/)
25. Create a commit status | Gitea API. \- Postman, accessed on October 25, 2025, [https://www.postman.com/api-evangelist/gitea/request/t0hjvmx/create-a-commit-status](https://www.postman.com/api-evangelist/gitea/request/t0hjvmx/create-a-commit-status)
26. gitea-sdk \- crates.io: Rust Package Registry, accessed on October 25, 2025, [https://crates.io/crates/gitea-sdk](https://crates.io/crates/gitea-sdk)
27. gitea\_sdk \- Rust \- Docs.rs, accessed on October 25, 2025, [https://docs.rs/gitea-sdk](https://docs.rs/gitea-sdk)
28. Gritea — async Rust library // Lib.rs, accessed on October 25, 2025, [https://lib.rs/crates/gritea](https://lib.rs/crates/gritea)
29. gitea \- Rust \- Docs.rs, accessed on October 25, 2025, [https://docs.rs/gitea](https://docs.rs/gitea)
30. forgejo\_api \- Rust \- Docs.rs, accessed on October 25, 2025, [https://docs.rs/forgejo-api](https://docs.rs/forgejo-api)
31. OmniOS zones, accessed on October 25, 2025, [https://omnios.org/setup/zones](https://omnios.org/setup/zones)
32. bhyve and KVM branded zones \- OmniOS, accessed on October 25, 2025, [https://omnios.org/info/bhyve\_kvm\_brand](https://omnios.org/info/bhyve_kvm_brand)
33. illumos: manual page: zonecfg.8 \- SmartOS, accessed on October 25, 2025, [https://smartos.org/man/8/zonecfg](https://smartos.org/man/8/zonecfg)
34. Using the zonecfg Command to Modify a Zone Configuration \- Oracle Solaris 11.1 Administration, accessed on October 25, 2025, [https://docs.oracle.com/cd/E26502\_01/html/E29024/z.conf.start-115.html](https://docs.oracle.com/cd/E26502_01/html/E29024/z.conf.start-115.html)
35. zone \- crates.io: Rust Package Registry, accessed on October 25, 2025, [https://crates.io/crates/zone](https://crates.io/crates/zone)
36. oxidecomputer/zone \- GitHub, accessed on October 25, 2025, [https://github.com/oxidecomputer/zone](https://github.com/oxidecomputer/zone)
37. Oxide Computer Company \- GitHub, accessed on October 25, 2025, [https://github.com/oxidecomputer](https://github.com/oxidecomputer)
38. GitHub \- oxidecomputer/dropshot: expose REST APIs from a Rust program \- Reddit, accessed on October 25, 2025, [https://www.reddit.com/r/rust/comments/1ixqzlx/github\_oxidecomputerdropshot\_expose\_rest\_apis/](https://www.reddit.com/r/rust/comments/1ixqzlx/github_oxidecomputerdropshot_expose_rest_apis/)
39. Getting Started with GitHub Actions \- Waylon Walker, accessed on October 25, 2025, [https://waylonwalker.com/github-actions-syntax/](https://waylonwalker.com/github-actions-syntax/)
40. Understanding GitHub Actions, accessed on October 25, 2025, [https://docs.github.com/articles/getting-started-with-github-actions](https://docs.github.com/articles/getting-started-with-github-actions)
41. GitHub Actions documentation, accessed on October 25, 2025, [https://docs.github.com/actions](https://docs.github.com/actions)
42. Forgejo Actions user guide, accessed on October 25, 2025, [https://forgejo.org/docs/v1.21/user/actions/](https://forgejo.org/docs/v1.21/user/actions/)
43. Forgejo Actions user guide, accessed on October 25, 2025, [https://forgejo.org/docs/v1.20/user/actions/](https://forgejo.org/docs/v1.20/user/actions/)
44. Forgejo Actions | Basic concepts, accessed on October 25, 2025, [https://forgejo.org/docs/latest/user/actions/basic-concepts/](https://forgejo.org/docs/latest/user/actions/basic-concepts/)
45. Forgejo Actions | Reference | Forgejo – Beyond coding. We forge., accessed on October 25, 2025, [https://forgejo.org/docs/latest/user/actions/](https://forgejo.org/docs/latest/user/actions/)
46. About custom actions \- GitHub Docs, accessed on October 25, 2025, [https://docs.github.com/actions/creating-actions/about-custom-actions](https://docs.github.com/actions/creating-actions/about-custom-actions)
47. Using Actions | Forgejo – Beyond coding. We forge., accessed on October 25, 2025, [https://forgejo.org/docs/latest/user/actions/actions/](https://forgejo.org/docs/latest/user/actions/actions/)
48. serde\_yaml \- Rust \- Docs.rs, accessed on October 25, 2025, [https://docs.rs/serde-yaml](https://docs.rs/serde-yaml)
49. serde\_yaml \- crates.io: Rust Package Registry, accessed on October 25, 2025, [https://crates.io/crates/serde\_yaml](https://crates.io/crates/serde_yaml)
50. Serde and YAML-support status? \- community \- The Rust Programming Language Forum, accessed on October 25, 2025, [https://users.rust-lang.org/t/serde-and-yaml-support-status/125684](https://users.rust-lang.org/t/serde-and-yaml-support-status/125684)

View file

@ -0,0 +1,54 @@
# Solstice Orchestrator image and scheduling config (example)
# This file is loaded by the orchestrator at startup. Provide your own path via --config or ORCH_CONFIG.
# Keys:
# - default_label: the label used when a job does not specify runs_on.
# - aliases: optional map of label -> canonical label.
# - sizes: named size presets you can reference in future (not yet consumed by jobs), kept for operators.
# - images: map of canonical labels to image entries. Each entry can specify how to fetch/prepare an image
# and which backend it targets. All images should support NoCloud metadata.
# Default label to use when a job doesn't specify runs_on
default_label: illumos-latest
# Optional label aliases
aliases:
illumos-latest: openindiana-hipster
# Size presets (matrix): CPUs and RAM in MiB
sizes:
small:
cpu: 1
ram_mb: 1024
medium:
cpu: 2
ram_mb: 2048
large:
cpu: 4
ram_mb: 4096
# Images by canonical label
images:
# OpenIndiana Hipster cloud image (illumos). Intended for bhyve brand zones on illumos hosts.
openindiana-hipster:
# All images are backend-agnostic and must support NoCloud. Backends are chosen by host.
source: https://dlc.openindiana.org/isos/hipster/20250402/OI-hipster-cloudimage.img.zstd
# Local path (raw .img) target after download/decompression. Adjust per host.
local_path: /var/lib/solstice/images/openindiana-hipster.img
decompress: zstd # if omitted, assumed already uncompressed raw or qcow2
nocloud: true
# Default resources if job doesn't specify (vCPUs, RAM MiB, disk GiB for overlay/clone)
defaults:
cpu: 2
ram_mb: 2048
disk_gb: 40
# Example Ubuntu image for libvirt/KVM on Linux hosts (active by default; remove or comment out if unused)
ubuntu-22.04:
source: https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img
local_path: /var/lib/libvirt/images/ubuntu-22.04-base.qcow2
decompress: none
nocloud: true
defaults:
cpu: 2
ram_mb: 2048
disk_gb: 40