2026-03-19 20:28:40 +00:00
|
|
|
use crate::brand::bhyve::bhyve_install_args;
|
2026-02-08 21:29:17 +01:00
|
|
|
use crate::brand::lx::lx_install_args;
|
Add health probes (liveness/readiness/startup) with exec, HTTP, and TCP checks
Implement Kubernetes-style health probes that run during the reconcile loop
to detect unhealthy applications inside running zones. Previously the pod
controller only checked zone liveness via get_zone_state(), missing cases
where the zone is running but the application inside has crashed.
- Add exec_in_zone() to ZoneRuntime trait, implemented via zlogin on illumos
and with configurable mock results for testing
- Add probe type system (ProbeKind, ProbeAction, ContainerProbeConfig) that
decouples from k8s_openapi and extracts probes from pod container specs
with proper k8s defaults (period=10s, timeout=1s, failure=3, success=1)
- Add ProbeExecutor for exec/HTTP/TCP checks with tokio timeout support
(HTTPS falls back to TCP-only with warning)
- Add ProbeTracker state machine that tracks per-pod/container/probe-kind
state, respects initial delays and periods, gates liveness on startup
probes, and aggregates results into PodProbeStatus
- Integrate into PodController reconcile loop: on liveness failure set
phase=Failed with reason LivenessProbeFailure; on readiness failure set
Ready=False; on all-pass restore Ready=True
- Add ProbeFailed error variant with miette diagnostic
Known v1 limitation: probes execute at reconcile cadence (~30s), not at
their configured periodSeconds.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 22:41:30 +01:00
|
|
|
use crate::command::{exec, CommandOutput};
|
2026-02-09 00:47:28 +01:00
|
|
|
use crate::error::Result;
|
|
|
|
|
use crate::storage::StorageEngine;
|
2026-02-08 21:29:17 +01:00
|
|
|
use crate::traits::ZoneRuntime;
|
|
|
|
|
use crate::types::*;
|
|
|
|
|
use crate::zone::config::generate_zonecfg;
|
|
|
|
|
use crate::zone::state::parse_zoneadm_line;
|
|
|
|
|
use async_trait::async_trait;
|
2026-02-09 00:47:28 +01:00
|
|
|
use std::sync::Arc;
|
2026-02-08 21:29:17 +01:00
|
|
|
use tracing::info;
|
|
|
|
|
|
|
|
|
|
/// illumos zone runtime implementation
|
|
|
|
|
///
|
2026-02-09 00:47:28 +01:00
|
|
|
/// Manages real zones via zonecfg/zoneadm, dladm for networking.
|
|
|
|
|
/// Storage (ZFS datasets) is delegated to the injected `StorageEngine`.
|
|
|
|
|
pub struct IllumosRuntime {
|
|
|
|
|
storage: Arc<dyn StorageEngine>,
|
2026-02-08 21:29:17 +01:00
|
|
|
}
|
|
|
|
|
|
2026-02-09 00:47:28 +01:00
|
|
|
impl IllumosRuntime {
|
|
|
|
|
pub fn new(storage: Arc<dyn StorageEngine>) -> Self {
|
|
|
|
|
Self { storage }
|
2026-02-08 21:29:17 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[async_trait]
|
|
|
|
|
impl ZoneRuntime for IllumosRuntime {
|
|
|
|
|
async fn create_zone(&self, config: &ZoneConfig) -> Result<()> {
|
|
|
|
|
info!("Creating zone: {}", config.zone_name);
|
|
|
|
|
|
|
|
|
|
let zonecfg_content = generate_zonecfg(config)?;
|
|
|
|
|
|
|
|
|
|
// Write config to a temp file, then apply via zonecfg
|
|
|
|
|
let tmp_path = format!("/tmp/zonecfg-{}.cmd", config.zone_name);
|
|
|
|
|
tokio::fs::write(&tmp_path, &zonecfg_content)
|
|
|
|
|
.await
|
2026-02-09 00:47:28 +01:00
|
|
|
.map_err(|e| {
|
|
|
|
|
crate::error::RuntimeError::zone_operation_failed(&config.zone_name, e.to_string())
|
|
|
|
|
})?;
|
2026-02-08 21:29:17 +01:00
|
|
|
|
|
|
|
|
let result = exec("zonecfg", &["-z", &config.zone_name, "-f", &tmp_path]).await;
|
|
|
|
|
|
|
|
|
|
// Clean up temp file (best-effort)
|
|
|
|
|
let _ = tokio::fs::remove_file(&tmp_path).await;
|
|
|
|
|
|
|
|
|
|
result?;
|
|
|
|
|
info!("Zone configured: {}", config.zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn install_zone(&self, zone_name: &str) -> Result<()> {
|
|
|
|
|
info!("Installing zone: {}", zone_name);
|
|
|
|
|
exec("zoneadm", &["-z", zone_name, "install"]).await?;
|
|
|
|
|
info!("Zone installed: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn boot_zone(&self, zone_name: &str) -> Result<()> {
|
|
|
|
|
info!("Booting zone: {}", zone_name);
|
|
|
|
|
exec("zoneadm", &["-z", zone_name, "boot"]).await?;
|
|
|
|
|
info!("Zone booted: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn shutdown_zone(&self, zone_name: &str) -> Result<()> {
|
|
|
|
|
info!("Shutting down zone: {}", zone_name);
|
|
|
|
|
exec("zoneadm", &["-z", zone_name, "shutdown"]).await?;
|
|
|
|
|
info!("Zone shutdown: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn halt_zone(&self, zone_name: &str) -> Result<()> {
|
|
|
|
|
info!("Halting zone: {}", zone_name);
|
|
|
|
|
exec("zoneadm", &["-z", zone_name, "halt"]).await?;
|
|
|
|
|
info!("Zone halted: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn uninstall_zone(&self, zone_name: &str) -> Result<()> {
|
|
|
|
|
info!("Uninstalling zone: {}", zone_name);
|
|
|
|
|
exec("zoneadm", &["-z", zone_name, "uninstall", "-F"]).await?;
|
|
|
|
|
info!("Zone uninstalled: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn delete_zone(&self, zone_name: &str) -> Result<()> {
|
|
|
|
|
info!("Deleting zone: {}", zone_name);
|
|
|
|
|
exec("zonecfg", &["-z", zone_name, "delete", "-F"]).await?;
|
|
|
|
|
info!("Zone deleted: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
Add health probes (liveness/readiness/startup) with exec, HTTP, and TCP checks
Implement Kubernetes-style health probes that run during the reconcile loop
to detect unhealthy applications inside running zones. Previously the pod
controller only checked zone liveness via get_zone_state(), missing cases
where the zone is running but the application inside has crashed.
- Add exec_in_zone() to ZoneRuntime trait, implemented via zlogin on illumos
and with configurable mock results for testing
- Add probe type system (ProbeKind, ProbeAction, ContainerProbeConfig) that
decouples from k8s_openapi and extracts probes from pod container specs
with proper k8s defaults (period=10s, timeout=1s, failure=3, success=1)
- Add ProbeExecutor for exec/HTTP/TCP checks with tokio timeout support
(HTTPS falls back to TCP-only with warning)
- Add ProbeTracker state machine that tracks per-pod/container/probe-kind
state, respects initial delays and periods, gates liveness on startup
probes, and aggregates results into PodProbeStatus
- Integrate into PodController reconcile loop: on liveness failure set
phase=Failed with reason LivenessProbeFailure; on readiness failure set
Ready=False; on all-pass restore Ready=True
- Add ProbeFailed error variant with miette diagnostic
Known v1 limitation: probes execute at reconcile cadence (~30s), not at
their configured periodSeconds.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 22:41:30 +01:00
|
|
|
async fn exec_in_zone(&self, zone_name: &str, command: &[String]) -> Result<CommandOutput> {
|
|
|
|
|
let mut args: Vec<&str> = vec![zone_name];
|
|
|
|
|
let str_refs: Vec<&str> = command.iter().map(|s| s.as_str()).collect();
|
|
|
|
|
args.extend(str_refs);
|
|
|
|
|
crate::command::exec_unchecked("zlogin", &args).await
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-08 21:29:17 +01:00
|
|
|
async fn get_zone_state(&self, zone_name: &str) -> Result<ZoneState> {
|
|
|
|
|
let output = exec("zoneadm", &["-z", zone_name, "list", "-p"]).await?;
|
|
|
|
|
let line = output.stdout.trim();
|
|
|
|
|
let info = parse_zoneadm_line(line)?;
|
|
|
|
|
Ok(info.state)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn get_zone_info(&self, zone_name: &str) -> Result<ZoneInfo> {
|
|
|
|
|
let output = exec("zoneadm", &["-z", zone_name, "list", "-cp"]).await?;
|
|
|
|
|
let line = output.stdout.trim();
|
|
|
|
|
parse_zoneadm_line(line)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn list_zones(&self) -> Result<Vec<ZoneInfo>> {
|
|
|
|
|
let output = exec("zoneadm", &["list", "-cp"]).await?;
|
|
|
|
|
let mut zones = Vec::new();
|
|
|
|
|
|
|
|
|
|
for line in output.stdout.lines() {
|
|
|
|
|
let line = line.trim();
|
|
|
|
|
if line.is_empty() {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
let info = parse_zoneadm_line(line)?;
|
|
|
|
|
// Filter out the global zone
|
|
|
|
|
if info.zone_name == "global" {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
zones.push(info);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(zones)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn setup_network(&self, zone_name: &str, network: &NetworkMode) -> Result<()> {
|
|
|
|
|
info!("Setting up network for zone: {}", zone_name);
|
|
|
|
|
|
|
|
|
|
match network {
|
|
|
|
|
NetworkMode::Etherstub(cfg) => {
|
|
|
|
|
// Create etherstub (ignore if already exists)
|
|
|
|
|
let _ = exec("dladm", &["create-etherstub", &cfg.etherstub_name]).await;
|
|
|
|
|
// Create VNIC on etherstub
|
|
|
|
|
exec(
|
|
|
|
|
"dladm",
|
|
|
|
|
&["create-vnic", "-l", &cfg.etherstub_name, &cfg.vnic_name],
|
|
|
|
|
)
|
|
|
|
|
.await?;
|
|
|
|
|
}
|
|
|
|
|
NetworkMode::Direct(cfg) => {
|
|
|
|
|
// Create VNIC directly on physical NIC
|
|
|
|
|
exec(
|
|
|
|
|
"dladm",
|
|
|
|
|
&["create-vnic", "-l", &cfg.physical_nic, &cfg.vnic_name],
|
|
|
|
|
)
|
|
|
|
|
.await?;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!("Network setup complete for zone: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-19 20:28:40 +00:00
|
|
|
async fn configure_zone_ip(&self, zone_name: &str, network: &NetworkMode) -> Result<()> {
|
|
|
|
|
let (vnic_name, ip_address, prefix_len, gateway) = match network {
|
|
|
|
|
NetworkMode::Etherstub(cfg) => (
|
|
|
|
|
&cfg.vnic_name,
|
|
|
|
|
&cfg.ip_address,
|
|
|
|
|
cfg.prefix_len,
|
|
|
|
|
&cfg.gateway,
|
|
|
|
|
),
|
|
|
|
|
NetworkMode::Direct(cfg) => (
|
|
|
|
|
&cfg.vnic_name,
|
|
|
|
|
&cfg.ip_address,
|
|
|
|
|
cfg.prefix_len,
|
|
|
|
|
&cfg.gateway,
|
|
|
|
|
),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
info!(
|
|
|
|
|
"Configuring IP {} on {} in zone {}",
|
|
|
|
|
ip_address, vnic_name, zone_name
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Create the IP interface
|
|
|
|
|
self.exec_in_zone(
|
|
|
|
|
zone_name,
|
|
|
|
|
&[
|
|
|
|
|
"ipadm".to_string(),
|
|
|
|
|
"create-if".to_string(),
|
|
|
|
|
"-t".to_string(),
|
|
|
|
|
vnic_name.clone(),
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| RuntimeError::network_error(format!("ipadm create-if failed: {}", e)))?;
|
|
|
|
|
|
|
|
|
|
// Assign a static IP address
|
|
|
|
|
self.exec_in_zone(
|
|
|
|
|
zone_name,
|
|
|
|
|
&[
|
|
|
|
|
"ipadm".to_string(),
|
|
|
|
|
"create-addr".to_string(),
|
|
|
|
|
"-T".to_string(),
|
|
|
|
|
"static".to_string(),
|
|
|
|
|
"-a".to_string(),
|
|
|
|
|
format!("{}/{}", ip_address, prefix_len),
|
|
|
|
|
format!("{}/v4", vnic_name),
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| RuntimeError::network_error(format!("ipadm create-addr failed: {}", e)))?;
|
|
|
|
|
|
|
|
|
|
// Add default route
|
|
|
|
|
self.exec_in_zone(
|
|
|
|
|
zone_name,
|
|
|
|
|
&[
|
|
|
|
|
"route".to_string(),
|
|
|
|
|
"-p".to_string(),
|
|
|
|
|
"add".to_string(),
|
|
|
|
|
"default".to_string(),
|
|
|
|
|
gateway.clone(),
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| RuntimeError::network_error(format!("route add default failed: {}", e)))?;
|
|
|
|
|
|
|
|
|
|
info!("IP configuration complete for zone: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-08 21:29:17 +01:00
|
|
|
async fn teardown_network(&self, zone_name: &str, network: &NetworkMode) -> Result<()> {
|
|
|
|
|
info!("Tearing down network for zone: {}", zone_name);
|
|
|
|
|
|
|
|
|
|
let vnic_name = match network {
|
|
|
|
|
NetworkMode::Etherstub(cfg) => &cfg.vnic_name,
|
|
|
|
|
NetworkMode::Direct(cfg) => &cfg.vnic_name,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
exec("dladm", &["delete-vnic", vnic_name]).await?;
|
|
|
|
|
|
|
|
|
|
info!("Network teardown complete for zone: {}", zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn provision(&self, config: &ZoneConfig) -> Result<()> {
|
|
|
|
|
info!("Provisioning zone: {}", config.zone_name);
|
|
|
|
|
|
2026-02-09 00:47:28 +01:00
|
|
|
self.storage
|
|
|
|
|
.create_zone_dataset(&config.zone_name, &config.storage)
|
|
|
|
|
.await?;
|
2026-02-08 21:29:17 +01:00
|
|
|
self.setup_network(&config.zone_name, &config.network)
|
|
|
|
|
.await?;
|
|
|
|
|
self.create_zone(config).await?;
|
|
|
|
|
|
2026-03-19 20:28:40 +00:00
|
|
|
// Brand-specific install
|
|
|
|
|
match config.brand {
|
|
|
|
|
ZoneBrand::Lx => {
|
|
|
|
|
let args = lx_install_args(config)?;
|
|
|
|
|
let mut cmd_args = vec!["-z", &config.zone_name, "install"];
|
|
|
|
|
let str_args: Vec<&str> = args.iter().map(|s| s.as_str()).collect();
|
|
|
|
|
cmd_args.extend(str_args);
|
|
|
|
|
exec("zoneadm", &cmd_args).await?;
|
|
|
|
|
}
|
|
|
|
|
ZoneBrand::Bhyve => {
|
|
|
|
|
let args = bhyve_install_args(config)?;
|
|
|
|
|
let mut cmd_args = vec!["-z", &config.zone_name, "install"];
|
|
|
|
|
let str_args: Vec<&str> = args.iter().map(|s| s.as_str()).collect();
|
|
|
|
|
cmd_args.extend(str_args);
|
|
|
|
|
exec("zoneadm", &cmd_args).await?;
|
|
|
|
|
}
|
|
|
|
|
ZoneBrand::Reddwarf => {
|
|
|
|
|
self.install_zone(&config.zone_name).await?;
|
|
|
|
|
}
|
2026-02-08 21:29:17 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
self.boot_zone(&config.zone_name).await?;
|
|
|
|
|
|
2026-03-19 20:28:40 +00:00
|
|
|
// Brief pause to let the zone finish booting before configuring IP
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
|
|
|
|
|
|
|
|
|
self.configure_zone_ip(&config.zone_name, &config.network)
|
|
|
|
|
.await?;
|
|
|
|
|
|
2026-02-08 21:29:17 +01:00
|
|
|
info!("Zone provisioned: {}", config.zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn deprovision(&self, config: &ZoneConfig) -> Result<()> {
|
|
|
|
|
info!("Deprovisioning zone: {}", config.zone_name);
|
|
|
|
|
|
|
|
|
|
// Best-effort halt (may fail if already not running)
|
|
|
|
|
let _ = self.halt_zone(&config.zone_name).await;
|
|
|
|
|
self.uninstall_zone(&config.zone_name).await?;
|
|
|
|
|
self.delete_zone(&config.zone_name).await?;
|
|
|
|
|
self.teardown_network(&config.zone_name, &config.network)
|
|
|
|
|
.await?;
|
2026-02-09 00:47:28 +01:00
|
|
|
self.storage.destroy_zone_dataset(&config.zone_name).await?;
|
2026-02-08 21:29:17 +01:00
|
|
|
|
|
|
|
|
info!("Zone deprovisioned: {}", config.zone_name);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
}
|