From d3eb0b2511f063015705e7a965a6fe43f1bed8b9 Mon Sep 17 00:00:00 2001 From: Till Wegmueller Date: Sat, 14 Feb 2026 21:17:43 +0100 Subject: [PATCH] Add dynamic node resource detection with configurable system reservations Replace hardcoded memory (8Gi) and pod limits (110) in the node agent with actual system detection via the sys-info crate. CPU and memory are detected once at NodeAgent construction and reused on every heartbeat. Capacity reports raw hardware values while allocatable subtracts configurable reservations (--system-reserved-cpu, --system-reserved-memory, --max-pods), giving the scheduler accurate data for filtering and scoring. Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 11 ++ Cargo.toml | 3 + crates/reddwarf-runtime/Cargo.toml | 1 + crates/reddwarf-runtime/src/error.rs | 17 ++ crates/reddwarf-runtime/src/lib.rs | 1 + crates/reddwarf-runtime/src/node_agent.rs | 198 +++++++++++++++++++--- crates/reddwarf-runtime/src/sysinfo.rs | 173 +++++++++++++++++++ crates/reddwarf/src/main.rs | 44 ++++- 8 files changed, 423 insertions(+), 25 deletions(-) create mode 100644 crates/reddwarf-runtime/src/sysinfo.rs diff --git a/Cargo.lock b/Cargo.lock index c24f206..95c8c3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1543,6 +1543,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sys-info", "tempfile", "thiserror 2.0.18", "tokio", @@ -2020,6 +2021,16 @@ dependencies = [ "syn", ] +[[package]] +name = "sys-info" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "system-configuration" version = "0.6.1" diff --git a/Cargo.toml b/Cargo.toml index 0376bba..fbe4e61 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,9 @@ rustls = "0.23" rustls-pemfile = "2.0" axum-server = { version = "0.7", features = ["tls-rustls"] } +# System info +sys-info = "0.9" + # Testing tempfile = "3.0" diff 
--git a/crates/reddwarf-runtime/Cargo.toml b/crates/reddwarf-runtime/Cargo.toml index 31d886e..f14f9f5 100644 --- a/crates/reddwarf-runtime/Cargo.toml +++ b/crates/reddwarf-runtime/Cargo.toml @@ -24,6 +24,7 @@ async-trait = { workspace = true } reqwest = { workspace = true } chrono = { workspace = true } futures-util = { workspace = true } +sys-info = { workspace = true } [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/reddwarf-runtime/src/error.rs b/crates/reddwarf-runtime/src/error.rs index 57b19d6..7c5dd2e 100644 --- a/crates/reddwarf-runtime/src/error.rs +++ b/crates/reddwarf-runtime/src/error.rs @@ -144,6 +144,17 @@ pub enum RuntimeError { message: String, }, + /// Resource detection failed + #[error("Failed to detect system resources: {message}")] + #[diagnostic( + code(reddwarf::runtime::resource_detection_failed), + help("System resource detection is non-fatal. The node agent will fall back to default values (CPU from available_parallelism, 8Gi memory, 110 pods). 
To investigate, check that /proc/meminfo is readable (Linux) or that the system supports sysconf (illumos/macOS)") )] + ResourceDetectionFailed { + #[allow(unused)] + message: String, + }, + /// Internal error #[error("Internal runtime error: {message}")] #[diagnostic( @@ -224,6 +235,12 @@ impl RuntimeError { } } + pub fn resource_detection_failed(message: impl Into<String>) -> Self { + Self::ResourceDetectionFailed { + message: message.into(), + } + } + pub fn internal_error(message: impl Into<String>) -> Self { Self::InternalError { message: message.into(), diff --git a/crates/reddwarf-runtime/src/lib.rs b/crates/reddwarf-runtime/src/lib.rs index dfb691a..3e9e6f8 100644 --- a/crates/reddwarf-runtime/src/lib.rs +++ b/crates/reddwarf-runtime/src/lib.rs @@ -13,6 +13,7 @@ pub mod network; pub mod node_agent; pub mod node_health; pub mod storage; +pub mod sysinfo; pub mod traits; pub mod types; pub mod zone; diff --git a/crates/reddwarf-runtime/src/node_agent.rs b/crates/reddwarf-runtime/src/node_agent.rs index c2d49b6..30860c9 100644 --- a/crates/reddwarf-runtime/src/node_agent.rs +++ b/crates/reddwarf-runtime/src/node_agent.rs @@ -1,5 +1,8 @@ use crate::api_client::ApiClient; use crate::error::{Result, RuntimeError}; +use crate::sysinfo::{ + compute_node_resources, format_memory_quantity, NodeResources, ResourceReservation, +}; use k8s_openapi::api::core::v1::{Node, NodeAddress, NodeCondition, NodeStatus}; use k8s_openapi::apimachinery::pkg::api::resource::Quantity; use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; @@ -18,6 +21,12 @@ pub struct NodeAgentConfig { pub api_url: String, /// Interval between heartbeats pub heartbeat_interval: Duration, + /// CPU to reserve for system daemons, in millicores (default: 100 = 100m) + pub system_reserved_cpu_millicores: i64, + /// Memory to reserve for system daemons, in bytes (default: 256Mi) + pub system_reserved_memory_bytes: i64, + /// Maximum number of pods this node will accept (default: 110) + pub max_pods: u32, } impl 
NodeAgentConfig { @@ -26,6 +35,9 @@ impl NodeAgentConfig { node_name, api_url, heartbeat_interval: Duration::from_secs(10), + system_reserved_cpu_millicores: 100, + system_reserved_memory_bytes: 256 * 1024 * 1024, + max_pods: 110, } } } @@ -34,11 +46,56 @@ pub struct NodeAgent { api_client: Arc<ApiClient>, config: NodeAgentConfig, + /// Detected system resources (None if detection failed at startup). + detected: Option<NodeResources>, } impl NodeAgent { pub fn new(api_client: Arc<ApiClient>, config: NodeAgentConfig) -> Self { - Self { api_client, config } + let reservation = ResourceReservation { + cpu_millicores: config.system_reserved_cpu_millicores, + memory_bytes: config.system_reserved_memory_bytes, + }; + + let detected = match compute_node_resources(&reservation, config.max_pods) { + Ok(nr) => { + info!( + cpu_count = nr.capacity.cpu_count, + total_memory = %format_memory_quantity(nr.capacity.total_memory_bytes), + allocatable_cpu_m = nr.allocatable_cpu_millicores, + allocatable_memory = %format_memory_quantity(nr.allocatable_memory_bytes), + max_pods = nr.max_pods, + "Detected system resources" + ); + Some(nr) + } + Err(e) => { + warn!( + error = %e, + "Failed to detect system resources, falling back to defaults" + ); + None + } + }; + + Self { + api_client, + config, + detected, + } + } + + #[cfg(test)] + fn new_with_detected( + api_client: Arc<ApiClient>, + config: NodeAgentConfig, + detected: Option<NodeResources>, + ) -> Self { + Self { + api_client, + config, + detected, + } } /// Register this host as a Node resource @@ -108,21 +165,47 @@ impl NodeAgent { fn build_node(&self) -> Node { let hostname = self.config.node_name.clone(); - let cpu_count = std::thread::available_parallelism() - .map(|n| n.get().to_string()) - .unwrap_or_else(|_| "1".to_string()); + let (capacity, allocatable) = if let Some(ref nr) = self.detected { + let cap_cpu = nr.capacity.cpu_count.to_string(); + let cap_mem = format_memory_quantity(nr.capacity.total_memory_bytes); + let pods = nr.max_pods.to_string(); - let 
allocatable = BTreeMap::from([ - ("cpu".to_string(), Quantity(cpu_count.clone())), - ("memory".to_string(), Quantity("8Gi".to_string())), - ("pods".to_string(), Quantity("110".to_string())), - ]); + let alloc_cpu = format!("{}m", nr.allocatable_cpu_millicores); + let alloc_mem = format_memory_quantity(nr.allocatable_memory_bytes); - let capacity = BTreeMap::from([ - ("cpu".to_string(), Quantity(cpu_count)), - ("memory".to_string(), Quantity("8Gi".to_string())), - ("pods".to_string(), Quantity("110".to_string())), - ]); + let capacity = BTreeMap::from([ + ("cpu".to_string(), Quantity(cap_cpu)), + ("memory".to_string(), Quantity(cap_mem)), + ("pods".to_string(), Quantity(pods.clone())), + ]); + + let allocatable = BTreeMap::from([ + ("cpu".to_string(), Quantity(alloc_cpu)), + ("memory".to_string(), Quantity(alloc_mem)), + ("pods".to_string(), Quantity(pods)), + ]); + + (capacity, allocatable) + } else { + // Fallback: use available_parallelism for CPU, hardcoded memory + let cpu_count = std::thread::available_parallelism() + .map(|n| n.get().to_string()) + .unwrap_or_else(|_| "1".to_string()); + + let capacity = BTreeMap::from([ + ("cpu".to_string(), Quantity(cpu_count.clone())), + ("memory".to_string(), Quantity("8Gi".to_string())), + ("pods".to_string(), Quantity("110".to_string())), + ]); + + let allocatable = BTreeMap::from([ + ("cpu".to_string(), Quantity(cpu_count)), + ("memory".to_string(), Quantity("8Gi".to_string())), + ("pods".to_string(), Quantity("110".to_string())), + ]); + + (capacity, allocatable) + }; Node { metadata: ObjectMeta { @@ -169,6 +252,7 @@ impl NodeAgent { #[cfg(test)] mod tests { use super::*; + use crate::sysinfo::detect_system_resources; #[test] fn test_node_agent_config_defaults() { @@ -176,6 +260,9 @@ mod tests { NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string()); assert_eq!(config.node_name, "test-node"); assert_eq!(config.heartbeat_interval, Duration::from_secs(10)); + 
assert_eq!(config.system_reserved_cpu_millicores, 100); + assert_eq!(config.system_reserved_memory_bytes, 256 * 1024 * 1024); + assert_eq!(config.max_pods, 110); } #[test] @@ -206,25 +293,90 @@ mod tests { let node = agent.build_node(); let status = node.status.unwrap(); - // Check allocatable + // Check allocatable has all keys let alloc = status.allocatable.unwrap(); assert!(alloc.contains_key("cpu")); assert!(alloc.contains_key("memory")); assert!(alloc.contains_key("pods")); - assert_eq!(alloc["memory"].0, "8Gi"); assert_eq!(alloc["pods"].0, "110"); - // CPU should match available parallelism + // Memory should come from detected system (not hardcoded 8Gi) + let sys = detect_system_resources().expect("detection works in test"); + let expected_cap_mem = format_memory_quantity(sys.total_memory_bytes); + let cap = status.capacity.unwrap(); + assert_eq!(cap["memory"].0, expected_cap_mem); + + // Capacity CPU should match detected cpu_count + assert_eq!(cap["cpu"].0, sys.cpu_count.to_string()); + } + + #[test] + fn test_build_node_allocatable_less_than_capacity() { + let api_client = Arc::new(ApiClient::new("http://127.0.0.1:6443")); + let config = + NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string()); + let agent = NodeAgent::new(api_client, config); + + // Agent should have detected resources (we're on a real host) + assert!(agent.detected.is_some(), "detection should succeed in tests"); + + let node = agent.build_node(); + let status = node.status.unwrap(); + let cap = status.capacity.unwrap(); + let alloc = status.allocatable.unwrap(); + + // Allocatable CPU (millicores) should be less than capacity CPU (whole cores) + let cap_cpu_m = reddwarf_core::resources::ResourceQuantities::parse_cpu(&cap["cpu"].0) + .expect("valid cpu"); + let alloc_cpu_m = + reddwarf_core::resources::ResourceQuantities::parse_cpu(&alloc["cpu"].0) + .expect("valid cpu"); + assert!( + alloc_cpu_m < cap_cpu_m, + "allocatable CPU {}m should be less than 
capacity {}m", + alloc_cpu_m, + cap_cpu_m, + ); + + // Allocatable memory should be less than capacity memory + let cap_mem = + reddwarf_core::resources::ResourceQuantities::parse_memory(&cap["memory"].0) + .expect("valid mem"); + let alloc_mem = + reddwarf_core::resources::ResourceQuantities::parse_memory(&alloc["memory"].0) + .expect("valid mem"); + assert!( + alloc_mem < cap_mem, + "allocatable memory {} should be less than capacity {}", + alloc_mem, + cap_mem, + ); + } + + #[test] + fn test_build_node_fallback_on_detection_failure() { + let api_client = Arc::new(ApiClient::new("http://127.0.0.1:6443")); + let config = + NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string()); + // Simulate detection failure + let agent = NodeAgent::new_with_detected(api_client, config, None); + + let node = agent.build_node(); + let status = node.status.unwrap(); + let alloc = status.allocatable.unwrap(); + let cap = status.capacity.unwrap(); + + // Should fall back to hardcoded defaults + assert_eq!(alloc["memory"].0, "8Gi"); + assert_eq!(alloc["pods"].0, "110"); + assert_eq!(cap["memory"].0, "8Gi"); + assert_eq!(cap["pods"].0, "110"); + + // CPU falls back to available_parallelism let expected_cpu = std::thread::available_parallelism() .map(|n| n.get().to_string()) .unwrap_or_else(|_| "1".to_string()); assert_eq!(alloc["cpu"].0, expected_cpu); - - // Check capacity - let cap = status.capacity.unwrap(); - assert!(cap.contains_key("cpu")); - assert!(cap.contains_key("memory")); - assert!(cap.contains_key("pods")); assert_eq!(cap["cpu"].0, expected_cpu); } } diff --git a/crates/reddwarf-runtime/src/sysinfo.rs b/crates/reddwarf-runtime/src/sysinfo.rs new file mode 100644 index 0000000..b7db695 --- /dev/null +++ b/crates/reddwarf-runtime/src/sysinfo.rs @@ -0,0 +1,173 @@ +use crate::error::RuntimeError; + +/// Detected physical resources of the host. +#[derive(Debug, Clone)] +pub struct SystemResources { + /// Number of logical CPUs. 
+ pub cpu_count: u32, + /// Total physical memory in bytes. + pub total_memory_bytes: u64, +} + +/// How much CPU / memory to reserve for system daemons. +#[derive(Debug, Clone)] +pub struct ResourceReservation { + /// CPU to reserve in millicores (e.g. 100 = 100m). + pub cpu_millicores: i64, + /// Memory to reserve in bytes. + pub memory_bytes: i64, +} + +/// Computed node resource budget (capacity minus reservation). +#[derive(Debug, Clone)] +pub struct NodeResources { + /// Raw hardware capacity. + pub capacity: SystemResources, + /// Allocatable CPU after subtracting reservation, in millicores. + pub allocatable_cpu_millicores: i64, + /// Allocatable memory after subtracting reservation, in bytes. + pub allocatable_memory_bytes: u64, + /// Maximum number of pods this node will accept. + pub max_pods: u32, +} + +/// Detect the host's CPU count and total memory. +/// +/// Uses the `sys_info` crate which supports illumos, Linux, and macOS. +pub fn detect_system_resources() -> Result<SystemResources, RuntimeError> { + let cpu_count = sys_info::cpu_num().map_err(|e| { + RuntimeError::resource_detection_failed(format!("failed to detect CPU count: {e}")) + })?; + + let mem = sys_info::mem_info().map_err(|e| { + RuntimeError::resource_detection_failed(format!("failed to detect memory: {e}")) + })?; + + // sys_info::mem_info().total is in KiB + let total_memory_bytes = mem.total * 1024; + + Ok(SystemResources { + cpu_count, + total_memory_bytes, + }) +} + +/// Detect system resources and compute allocatable values after subtracting +/// the given reservation. Allocatable values are clamped so they never go +/// negative. 
+pub fn compute_node_resources( + reservation: &ResourceReservation, + max_pods: u32, +) -> Result<NodeResources, RuntimeError> { + let capacity = detect_system_resources()?; + + let capacity_cpu_millicores = capacity.cpu_count as i64 * 1000; + let allocatable_cpu_millicores = + (capacity_cpu_millicores - reservation.cpu_millicores).max(0); + + let allocatable_memory_bytes = + (capacity.total_memory_bytes as i64 - reservation.memory_bytes).max(0) as u64; + + Ok(NodeResources { + capacity, + allocatable_cpu_millicores, + allocatable_memory_bytes, + max_pods, + }) +} + +/// Convert a byte count to the most human-friendly Kubernetes Quantity string. +/// +/// Picks the largest clean binary unit: `"16Gi"`, `"7680Mi"`, `"512Ki"`, or +/// raw bytes if nothing divides evenly. +pub fn format_memory_quantity(bytes: u64) -> String { + const GIB: u64 = 1024 * 1024 * 1024; + const MIB: u64 = 1024 * 1024; + const KIB: u64 = 1024; + + if bytes > 0 && bytes % GIB == 0 { + format!("{}Gi", bytes / GIB) + } else if bytes > 0 && bytes % MIB == 0 { + format!("{}Mi", bytes / MIB) + } else if bytes > 0 && bytes % KIB == 0 { + format!("{}Ki", bytes / KIB) + } else { + format!("{}", bytes) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_system_resources() { + let res = detect_system_resources().expect("detection should succeed in test env"); + assert!(res.cpu_count > 0, "should detect at least 1 CPU"); + assert!( + res.total_memory_bytes > 0, + "should detect nonzero memory" + ); + } + + #[test] + fn test_format_memory_quantity_gi() { + assert_eq!(format_memory_quantity(16 * 1024 * 1024 * 1024), "16Gi"); + assert_eq!(format_memory_quantity(1024 * 1024 * 1024), "1Gi"); + } + + #[test] + fn test_format_memory_quantity_mi() { + assert_eq!(format_memory_quantity(7680 * 1024 * 1024), "7680Mi"); + assert_eq!(format_memory_quantity(512 * 1024 * 1024), "512Mi"); + } + + #[test] + fn test_format_memory_quantity_ki() { + assert_eq!(format_memory_quantity(512 * 1024), "512Ki"); + } + + #[test] + 
fn test_format_memory_quantity_raw_bytes() { + assert_eq!(format_memory_quantity(1023), "1023"); + assert_eq!(format_memory_quantity(0), "0"); + } + + #[test] + fn test_compute_reserves_subtracted() { + let reservation = ResourceReservation { + cpu_millicores: 100, + memory_bytes: 256 * 1024 * 1024, + }; + let nr = compute_node_resources(&reservation, 110) + .expect("should succeed in test env"); + + let capacity_cpu_millis = nr.capacity.cpu_count as i64 * 1000; + assert!( + nr.allocatable_cpu_millicores < capacity_cpu_millis, + "allocatable CPU ({}) should be less than capacity ({})", + nr.allocatable_cpu_millicores, + capacity_cpu_millis, + ); + assert!( + nr.allocatable_memory_bytes < nr.capacity.total_memory_bytes, + "allocatable memory ({}) should be less than capacity ({})", + nr.allocatable_memory_bytes, + nr.capacity.total_memory_bytes, + ); + assert_eq!(nr.max_pods, 110); + } + + #[test] + fn test_reservation_clamp_to_zero() { + let reservation = ResourceReservation { + cpu_millicores: i64::MAX, + memory_bytes: i64::MAX, + }; + let nr = compute_node_resources(&reservation, 110) + .expect("should succeed in test env"); + + assert_eq!(nr.allocatable_cpu_millicores, 0); + assert_eq!(nr.allocatable_memory_bytes, 0); + } +} diff --git a/crates/reddwarf/src/main.rs b/crates/reddwarf/src/main.rs index e2248b0..dda8247 100644 --- a/crates/reddwarf/src/main.rs +++ b/crates/reddwarf/src/main.rs @@ -1,6 +1,6 @@ use clap::{Parser, Subcommand}; use reddwarf_apiserver::{ApiError, ApiServer, AppState, Config as ApiConfig, TlsMode}; -use reddwarf_core::Namespace; +use reddwarf_core::{Namespace, ResourceQuantities}; use reddwarf_runtime::{ ApiClient, Ipam, MockRuntime, MockStorageEngine, NodeAgent, NodeAgentConfig, NodeHealthChecker, NodeHealthCheckerConfig, PodController, PodControllerConfig, StorageEngine, @@ -84,6 +84,15 @@ enum Commands { /// Etherstub name for pod networking #[arg(long, default_value = "reddwarf0")] etherstub_name: String, + /// CPU to reserve for 
system daemons (e.g. "100m", "0.1") + #[arg(long, default_value = "100m")] + system_reserved_cpu: String, + /// Memory to reserve for system daemons (e.g. "256Mi", "1Gi") + #[arg(long, default_value = "256Mi")] + system_reserved_memory: String, + /// Maximum number of pods this node will accept + #[arg(long, default_value_t = 110)] + max_pods: u32, #[command(flatten)] tls_args: TlsArgs, }, @@ -118,8 +127,30 @@ async fn main() -> miette::Result<()> { zonepath_prefix, pod_cidr, etherstub_name, + system_reserved_cpu, + system_reserved_memory, + max_pods, tls_args, } => { + let reserved_cpu_millicores = + ResourceQuantities::parse_cpu(&system_reserved_cpu).map_err(|e| { + miette::miette!( + help = "Use a value like '100m' or '0.1' for --system-reserved-cpu", + "Invalid --system-reserved-cpu '{}': {}", + system_reserved_cpu, + e + ) + })?; + let reserved_memory_bytes = + ResourceQuantities::parse_memory(&system_reserved_memory).map_err(|e| { + miette::miette!( + help = "Use a value like '256Mi' or '1Gi' for --system-reserved-memory", + "Invalid --system-reserved-memory '{}': {}", + system_reserved_memory, + e + ) + })?; + run_agent( &node_name, &bind, @@ -131,6 +162,9 @@ async fn main() -> miette::Result<()> { zonepath_prefix.as_deref(), &pod_cidr, ðerstub_name, + reserved_cpu_millicores, + reserved_memory_bytes, + max_pods, &tls_args, ) .await @@ -228,6 +262,9 @@ async fn run_agent( zonepath_prefix: Option<&str>, pod_cidr: &str, etherstub_name: &str, + system_reserved_cpu_millicores: i64, + system_reserved_memory_bytes: i64, + max_pods: u32, tls_args: &TlsArgs, ) -> miette::Result<()> { info!("Starting reddwarf agent for node '{}'", node_name); @@ -337,7 +374,10 @@ async fn run_agent( }); // 6. 
Spawn node agent - let node_agent_config = NodeAgentConfig::new(node_name.to_string(), api_url); + let mut node_agent_config = NodeAgentConfig::new(node_name.to_string(), api_url); + node_agent_config.system_reserved_cpu_millicores = system_reserved_cpu_millicores; + node_agent_config.system_reserved_memory_bytes = system_reserved_memory_bytes; + node_agent_config.max_pods = max_pods; let node_agent = NodeAgent::new(api_client.clone(), node_agent_config); let agent_token = token.clone(); let node_agent_handle = tokio::spawn(async move {