mirror of
https://github.com/CloudNebulaProject/reddwarf.git
synced 2026-04-10 13:20:40 +00:00
Add dynamic node resource detection with configurable system reservations
Replace hardcoded memory (8Gi) and pod limits (110) in the node agent with actual system detection via the sys-info crate. CPU and memory are detected once at NodeAgent construction and reused on every heartbeat. Capacity reports raw hardware values while allocatable subtracts configurable reservations (--system-reserved-cpu, --system-reserved-memory, --max-pods), giving the scheduler accurate data for filtering and scoring. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
58171c7555
commit
d3eb0b2511
8 changed files with 423 additions and 25 deletions
11
Cargo.lock
generated
11
Cargo.lock
generated
|
|
@ -1543,6 +1543,7 @@ dependencies = [
|
|||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sys-info",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
|
|
@ -2020,6 +2021,16 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sys-info"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration"
|
||||
version = "0.6.1"
|
||||
|
|
|
|||
|
|
@ -80,6 +80,9 @@ rustls = "0.23"
|
|||
rustls-pemfile = "2.0"
|
||||
axum-server = { version = "0.7", features = ["tls-rustls"] }
|
||||
|
||||
# System info
|
||||
sys-info = "0.9"
|
||||
|
||||
# Testing
|
||||
tempfile = "3.0"
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ async-trait = { workspace = true }
|
|||
reqwest = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
futures-util = { workspace = true }
|
||||
sys-info = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -144,6 +144,17 @@ pub enum RuntimeError {
|
|||
message: String,
|
||||
},
|
||||
|
||||
/// Resource detection failed
|
||||
#[error("Failed to detect system resources: {message}")]
|
||||
#[diagnostic(
|
||||
code(reddwarf::runtime::resource_detection_failed),
|
||||
help("System resource detection is non-fatal. The node agent will fall back to default values (CPU from available_parallelism, 8Gi memory, 110 pods). To investigate, check that /proc/meminfo is readable (Linux) or that the system supports sysconf (illumos/macOS)")
|
||||
)]
|
||||
ResourceDetectionFailed {
|
||||
#[allow(unused)]
|
||||
message: String,
|
||||
},
|
||||
|
||||
/// Internal error
|
||||
#[error("Internal runtime error: {message}")]
|
||||
#[diagnostic(
|
||||
|
|
@ -224,6 +235,12 @@ impl RuntimeError {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn resource_detection_failed(message: impl Into<String>) -> Self {
|
||||
Self::ResourceDetectionFailed {
|
||||
message: message.into(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn internal_error(message: impl Into<String>) -> Self {
|
||||
Self::InternalError {
|
||||
message: message.into(),
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ pub mod network;
|
|||
pub mod node_agent;
|
||||
pub mod node_health;
|
||||
pub mod storage;
|
||||
pub mod sysinfo;
|
||||
pub mod traits;
|
||||
pub mod types;
|
||||
pub mod zone;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
use crate::api_client::ApiClient;
|
||||
use crate::error::{Result, RuntimeError};
|
||||
use crate::sysinfo::{
|
||||
compute_node_resources, format_memory_quantity, NodeResources, ResourceReservation,
|
||||
};
|
||||
use k8s_openapi::api::core::v1::{Node, NodeAddress, NodeCondition, NodeStatus};
|
||||
use k8s_openapi::apimachinery::pkg::api::resource::Quantity;
|
||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||
|
|
@ -18,6 +21,12 @@ pub struct NodeAgentConfig {
|
|||
pub api_url: String,
|
||||
/// Interval between heartbeats
|
||||
pub heartbeat_interval: Duration,
|
||||
/// CPU to reserve for system daemons, in millicores (default: 100 = 100m)
|
||||
pub system_reserved_cpu_millicores: i64,
|
||||
/// Memory to reserve for system daemons, in bytes (default: 256Mi)
|
||||
pub system_reserved_memory_bytes: i64,
|
||||
/// Maximum number of pods this node will accept (default: 110)
|
||||
pub max_pods: u32,
|
||||
}
|
||||
|
||||
impl NodeAgentConfig {
|
||||
|
|
@ -26,6 +35,9 @@ impl NodeAgentConfig {
|
|||
node_name,
|
||||
api_url,
|
||||
heartbeat_interval: Duration::from_secs(10),
|
||||
system_reserved_cpu_millicores: 100,
|
||||
system_reserved_memory_bytes: 256 * 1024 * 1024,
|
||||
max_pods: 110,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -34,11 +46,56 @@ impl NodeAgentConfig {
|
|||
pub struct NodeAgent {
|
||||
api_client: Arc<ApiClient>,
|
||||
config: NodeAgentConfig,
|
||||
/// Detected system resources (None if detection failed at startup).
|
||||
detected: Option<NodeResources>,
|
||||
}
|
||||
|
||||
impl NodeAgent {
|
||||
pub fn new(api_client: Arc<ApiClient>, config: NodeAgentConfig) -> Self {
|
||||
Self { api_client, config }
|
||||
let reservation = ResourceReservation {
|
||||
cpu_millicores: config.system_reserved_cpu_millicores,
|
||||
memory_bytes: config.system_reserved_memory_bytes,
|
||||
};
|
||||
|
||||
let detected = match compute_node_resources(&reservation, config.max_pods) {
|
||||
Ok(nr) => {
|
||||
info!(
|
||||
cpu_count = nr.capacity.cpu_count,
|
||||
total_memory = %format_memory_quantity(nr.capacity.total_memory_bytes),
|
||||
allocatable_cpu_m = nr.allocatable_cpu_millicores,
|
||||
allocatable_memory = %format_memory_quantity(nr.allocatable_memory_bytes),
|
||||
max_pods = nr.max_pods,
|
||||
"Detected system resources"
|
||||
);
|
||||
Some(nr)
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
error = %e,
|
||||
"Failed to detect system resources, falling back to defaults"
|
||||
);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
api_client,
|
||||
config,
|
||||
detected,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn new_with_detected(
|
||||
api_client: Arc<ApiClient>,
|
||||
config: NodeAgentConfig,
|
||||
detected: Option<NodeResources>,
|
||||
) -> Self {
|
||||
Self {
|
||||
api_client,
|
||||
config,
|
||||
detected,
|
||||
}
|
||||
}
|
||||
|
||||
/// Register this host as a Node resource
|
||||
|
|
@ -108,22 +165,48 @@ impl NodeAgent {
|
|||
fn build_node(&self) -> Node {
|
||||
let hostname = self.config.node_name.clone();
|
||||
|
||||
let (capacity, allocatable) = if let Some(ref nr) = self.detected {
|
||||
let cap_cpu = nr.capacity.cpu_count.to_string();
|
||||
let cap_mem = format_memory_quantity(nr.capacity.total_memory_bytes);
|
||||
let pods = nr.max_pods.to_string();
|
||||
|
||||
let alloc_cpu = format!("{}m", nr.allocatable_cpu_millicores);
|
||||
let alloc_mem = format_memory_quantity(nr.allocatable_memory_bytes);
|
||||
|
||||
let capacity = BTreeMap::from([
|
||||
("cpu".to_string(), Quantity(cap_cpu)),
|
||||
("memory".to_string(), Quantity(cap_mem)),
|
||||
("pods".to_string(), Quantity(pods.clone())),
|
||||
]);
|
||||
|
||||
let allocatable = BTreeMap::from([
|
||||
("cpu".to_string(), Quantity(alloc_cpu)),
|
||||
("memory".to_string(), Quantity(alloc_mem)),
|
||||
("pods".to_string(), Quantity(pods)),
|
||||
]);
|
||||
|
||||
(capacity, allocatable)
|
||||
} else {
|
||||
// Fallback: use available_parallelism for CPU, hardcoded memory
|
||||
let cpu_count = std::thread::available_parallelism()
|
||||
.map(|n| n.get().to_string())
|
||||
.unwrap_or_else(|_| "1".to_string());
|
||||
|
||||
let allocatable = BTreeMap::from([
|
||||
let capacity = BTreeMap::from([
|
||||
("cpu".to_string(), Quantity(cpu_count.clone())),
|
||||
("memory".to_string(), Quantity("8Gi".to_string())),
|
||||
("pods".to_string(), Quantity("110".to_string())),
|
||||
]);
|
||||
|
||||
let capacity = BTreeMap::from([
|
||||
let allocatable = BTreeMap::from([
|
||||
("cpu".to_string(), Quantity(cpu_count)),
|
||||
("memory".to_string(), Quantity("8Gi".to_string())),
|
||||
("pods".to_string(), Quantity("110".to_string())),
|
||||
]);
|
||||
|
||||
(capacity, allocatable)
|
||||
};
|
||||
|
||||
Node {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.config.node_name.clone()),
|
||||
|
|
@ -169,6 +252,7 @@ impl NodeAgent {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::sysinfo::detect_system_resources;
|
||||
|
||||
#[test]
|
||||
fn test_node_agent_config_defaults() {
|
||||
|
|
@ -176,6 +260,9 @@ mod tests {
|
|||
NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string());
|
||||
assert_eq!(config.node_name, "test-node");
|
||||
assert_eq!(config.heartbeat_interval, Duration::from_secs(10));
|
||||
assert_eq!(config.system_reserved_cpu_millicores, 100);
|
||||
assert_eq!(config.system_reserved_memory_bytes, 256 * 1024 * 1024);
|
||||
assert_eq!(config.max_pods, 110);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -206,25 +293,90 @@ mod tests {
|
|||
let node = agent.build_node();
|
||||
let status = node.status.unwrap();
|
||||
|
||||
// Check allocatable
|
||||
// Check allocatable has all keys
|
||||
let alloc = status.allocatable.unwrap();
|
||||
assert!(alloc.contains_key("cpu"));
|
||||
assert!(alloc.contains_key("memory"));
|
||||
assert!(alloc.contains_key("pods"));
|
||||
assert_eq!(alloc["memory"].0, "8Gi");
|
||||
assert_eq!(alloc["pods"].0, "110");
|
||||
|
||||
// CPU should match available parallelism
|
||||
// Memory should come from detected system (not hardcoded 8Gi)
|
||||
let sys = detect_system_resources().expect("detection works in test");
|
||||
let expected_cap_mem = format_memory_quantity(sys.total_memory_bytes);
|
||||
let cap = status.capacity.unwrap();
|
||||
assert_eq!(cap["memory"].0, expected_cap_mem);
|
||||
|
||||
// Capacity CPU should match detected cpu_count
|
||||
assert_eq!(cap["cpu"].0, sys.cpu_count.to_string());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_build_node_allocatable_less_than_capacity() {
|
||||
let api_client = Arc::new(ApiClient::new("http://127.0.0.1:6443"));
|
||||
let config =
|
||||
NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string());
|
||||
let agent = NodeAgent::new(api_client, config);
|
||||
|
||||
// Agent should have detected resources (we're on a real host)
|
||||
assert!(agent.detected.is_some(), "detection should succeed in tests");
|
||||
|
||||
let node = agent.build_node();
|
||||
let status = node.status.unwrap();
|
||||
let cap = status.capacity.unwrap();
|
||||
let alloc = status.allocatable.unwrap();
|
||||
|
||||
// Allocatable CPU (millicores) should be less than capacity CPU (whole cores)
|
||||
let cap_cpu_m = reddwarf_core::resources::ResourceQuantities::parse_cpu(&cap["cpu"].0)
|
||||
.expect("valid cpu");
|
||||
let alloc_cpu_m =
|
||||
reddwarf_core::resources::ResourceQuantities::parse_cpu(&alloc["cpu"].0)
|
||||
.expect("valid cpu");
|
||||
assert!(
|
||||
alloc_cpu_m < cap_cpu_m,
|
||||
"allocatable CPU {}m should be less than capacity {}m",
|
||||
alloc_cpu_m,
|
||||
cap_cpu_m,
|
||||
);
|
||||
|
||||
// Allocatable memory should be less than capacity memory
|
||||
let cap_mem =
|
||||
reddwarf_core::resources::ResourceQuantities::parse_memory(&cap["memory"].0)
|
||||
.expect("valid mem");
|
||||
let alloc_mem =
|
||||
reddwarf_core::resources::ResourceQuantities::parse_memory(&alloc["memory"].0)
|
||||
.expect("valid mem");
|
||||
assert!(
|
||||
alloc_mem < cap_mem,
|
||||
"allocatable memory {} should be less than capacity {}",
|
||||
alloc_mem,
|
||||
cap_mem,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_build_node_fallback_on_detection_failure() {
|
||||
let api_client = Arc::new(ApiClient::new("http://127.0.0.1:6443"));
|
||||
let config =
|
||||
NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string());
|
||||
// Simulate detection failure
|
||||
let agent = NodeAgent::new_with_detected(api_client, config, None);
|
||||
|
||||
let node = agent.build_node();
|
||||
let status = node.status.unwrap();
|
||||
let alloc = status.allocatable.unwrap();
|
||||
let cap = status.capacity.unwrap();
|
||||
|
||||
// Should fall back to hardcoded defaults
|
||||
assert_eq!(alloc["memory"].0, "8Gi");
|
||||
assert_eq!(alloc["pods"].0, "110");
|
||||
assert_eq!(cap["memory"].0, "8Gi");
|
||||
assert_eq!(cap["pods"].0, "110");
|
||||
|
||||
// CPU falls back to available_parallelism
|
||||
let expected_cpu = std::thread::available_parallelism()
|
||||
.map(|n| n.get().to_string())
|
||||
.unwrap_or_else(|_| "1".to_string());
|
||||
assert_eq!(alloc["cpu"].0, expected_cpu);
|
||||
|
||||
// Check capacity
|
||||
let cap = status.capacity.unwrap();
|
||||
assert!(cap.contains_key("cpu"));
|
||||
assert!(cap.contains_key("memory"));
|
||||
assert!(cap.contains_key("pods"));
|
||||
assert_eq!(cap["cpu"].0, expected_cpu);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
173
crates/reddwarf-runtime/src/sysinfo.rs
Normal file
173
crates/reddwarf-runtime/src/sysinfo.rs
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
use crate::error::RuntimeError;
|
||||
|
||||
/// Physical resources detected on the host machine.
#[derive(Debug, Clone)]
pub struct SystemResources {
    /// Count of logical CPUs available on the host.
    pub cpu_count: u32,
    /// Total physical memory installed, in bytes.
    pub total_memory_bytes: u64,
}
|
||||
|
||||
/// CPU and memory set aside for system daemons and therefore excluded
/// from the node's allocatable budget.
#[derive(Debug, Clone)]
pub struct ResourceReservation {
    /// Reserved CPU in millicores (100 == "100m").
    pub cpu_millicores: i64,
    /// Reserved memory in bytes.
    pub memory_bytes: i64,
}
|
||||
|
||||
/// Computed node resource budget (capacity minus reservation).
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NodeResources {
|
||||
/// Raw hardware capacity.
|
||||
pub capacity: SystemResources,
|
||||
/// Allocatable CPU after subtracting reservation, in millicores.
|
||||
pub allocatable_cpu_millicores: i64,
|
||||
/// Allocatable memory after subtracting reservation, in bytes.
|
||||
pub allocatable_memory_bytes: u64,
|
||||
/// Maximum number of pods this node will accept.
|
||||
pub max_pods: u32,
|
||||
}
|
||||
|
||||
/// Detect the host's CPU count and total memory.
|
||||
///
|
||||
/// Uses the `sys_info` crate which supports illumos, Linux, and macOS.
|
||||
pub fn detect_system_resources() -> Result<SystemResources, RuntimeError> {
|
||||
let cpu_count = sys_info::cpu_num().map_err(|e| {
|
||||
RuntimeError::resource_detection_failed(format!("failed to detect CPU count: {e}"))
|
||||
})?;
|
||||
|
||||
let mem = sys_info::mem_info().map_err(|e| {
|
||||
RuntimeError::resource_detection_failed(format!("failed to detect memory: {e}"))
|
||||
})?;
|
||||
|
||||
// sys_info::mem_info().total is in KiB
|
||||
let total_memory_bytes = mem.total * 1024;
|
||||
|
||||
Ok(SystemResources {
|
||||
cpu_count,
|
||||
total_memory_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect system resources and compute allocatable values after subtracting
|
||||
/// the given reservation. Allocatable values are clamped so they never go
|
||||
/// negative.
|
||||
pub fn compute_node_resources(
|
||||
reservation: &ResourceReservation,
|
||||
max_pods: u32,
|
||||
) -> Result<NodeResources, RuntimeError> {
|
||||
let capacity = detect_system_resources()?;
|
||||
|
||||
let capacity_cpu_millicores = capacity.cpu_count as i64 * 1000;
|
||||
let allocatable_cpu_millicores =
|
||||
(capacity_cpu_millicores - reservation.cpu_millicores).max(0);
|
||||
|
||||
let allocatable_memory_bytes =
|
||||
(capacity.total_memory_bytes as i64 - reservation.memory_bytes).max(0) as u64;
|
||||
|
||||
Ok(NodeResources {
|
||||
capacity,
|
||||
allocatable_cpu_millicores,
|
||||
allocatable_memory_bytes,
|
||||
max_pods,
|
||||
})
|
||||
}
|
||||
|
||||
/// Convert a byte count to the most human-friendly Kubernetes Quantity string.
///
/// Picks the largest binary unit that divides the value evenly — `"16Gi"`,
/// `"7680Mi"`, `"512Ki"` — or falls back to raw bytes (including for 0) when
/// nothing divides evenly.
pub fn format_memory_quantity(bytes: u64) -> String {
    // Largest unit first so the most compact representation wins.
    const UNITS: [(u64, &str); 3] = [(1 << 30, "Gi"), (1 << 20, "Mi"), (1 << 10, "Ki")];

    if bytes > 0 {
        for &(divisor, suffix) in UNITS.iter() {
            if bytes % divisor == 0 {
                return format!("{}{}", bytes / divisor, suffix);
            }
        }
    }
    bytes.to_string()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Binary-unit helpers keep the byte arithmetic in the tests readable.
    const KIB: u64 = 1024;
    const MIB: u64 = 1024 * KIB;
    const GIB: u64 = 1024 * MIB;

    /// Detection must succeed on any real host and report nonzero values.
    #[test]
    fn test_detect_system_resources() {
        let detected = detect_system_resources().expect("detection should succeed in test env");
        assert!(detected.cpu_count > 0, "should detect at least 1 CPU");
        assert!(
            detected.total_memory_bytes > 0,
            "should detect nonzero memory"
        );
    }

    #[test]
    fn test_format_memory_quantity_gi() {
        assert_eq!(format_memory_quantity(16 * GIB), "16Gi");
        assert_eq!(format_memory_quantity(GIB), "1Gi");
    }

    #[test]
    fn test_format_memory_quantity_mi() {
        assert_eq!(format_memory_quantity(7680 * MIB), "7680Mi");
        assert_eq!(format_memory_quantity(512 * MIB), "512Mi");
    }

    #[test]
    fn test_format_memory_quantity_ki() {
        assert_eq!(format_memory_quantity(512 * KIB), "512Ki");
    }

    /// Values that divide no binary unit evenly come out as raw bytes.
    #[test]
    fn test_format_memory_quantity_raw_bytes() {
        assert_eq!(format_memory_quantity(1023), "1023");
        assert_eq!(format_memory_quantity(0), "0");
    }

    /// A normal-sized reservation must strictly reduce both CPU and memory.
    #[test]
    fn test_compute_reserves_subtracted() {
        let reserved = ResourceReservation {
            cpu_millicores: 100,
            memory_bytes: (256 * MIB) as i64,
        };
        let budget = compute_node_resources(&reserved, 110).expect("should succeed in test env");

        let capacity_cpu_millis = budget.capacity.cpu_count as i64 * 1000;
        assert!(
            budget.allocatable_cpu_millicores < capacity_cpu_millis,
            "allocatable CPU ({}) should be less than capacity ({})",
            budget.allocatable_cpu_millicores,
            capacity_cpu_millis,
        );
        assert!(
            budget.allocatable_memory_bytes < budget.capacity.total_memory_bytes,
            "allocatable memory ({}) should be less than capacity ({})",
            budget.allocatable_memory_bytes,
            budget.capacity.total_memory_bytes,
        );
        assert_eq!(budget.max_pods, 110);
    }

    /// A reservation larger than any real machine must clamp to zero
    /// rather than underflow.
    #[test]
    fn test_reservation_clamp_to_zero() {
        let reserved = ResourceReservation {
            cpu_millicores: i64::MAX,
            memory_bytes: i64::MAX,
        };
        let budget = compute_node_resources(&reserved, 110).expect("should succeed in test env");

        assert_eq!(budget.allocatable_cpu_millicores, 0);
        assert_eq!(budget.allocatable_memory_bytes, 0);
    }
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
use clap::{Parser, Subcommand};
|
||||
use reddwarf_apiserver::{ApiError, ApiServer, AppState, Config as ApiConfig, TlsMode};
|
||||
use reddwarf_core::Namespace;
|
||||
use reddwarf_core::{Namespace, ResourceQuantities};
|
||||
use reddwarf_runtime::{
|
||||
ApiClient, Ipam, MockRuntime, MockStorageEngine, NodeAgent, NodeAgentConfig,
|
||||
NodeHealthChecker, NodeHealthCheckerConfig, PodController, PodControllerConfig, StorageEngine,
|
||||
|
|
@ -84,6 +84,15 @@ enum Commands {
|
|||
/// Etherstub name for pod networking
|
||||
#[arg(long, default_value = "reddwarf0")]
|
||||
etherstub_name: String,
|
||||
/// CPU to reserve for system daemons (e.g. "100m", "0.1")
|
||||
#[arg(long, default_value = "100m")]
|
||||
system_reserved_cpu: String,
|
||||
/// Memory to reserve for system daemons (e.g. "256Mi", "1Gi")
|
||||
#[arg(long, default_value = "256Mi")]
|
||||
system_reserved_memory: String,
|
||||
/// Maximum number of pods this node will accept
|
||||
#[arg(long, default_value_t = 110)]
|
||||
max_pods: u32,
|
||||
#[command(flatten)]
|
||||
tls_args: TlsArgs,
|
||||
},
|
||||
|
|
@ -118,8 +127,30 @@ async fn main() -> miette::Result<()> {
|
|||
zonepath_prefix,
|
||||
pod_cidr,
|
||||
etherstub_name,
|
||||
system_reserved_cpu,
|
||||
system_reserved_memory,
|
||||
max_pods,
|
||||
tls_args,
|
||||
} => {
|
||||
let reserved_cpu_millicores =
|
||||
ResourceQuantities::parse_cpu(&system_reserved_cpu).map_err(|e| {
|
||||
miette::miette!(
|
||||
help = "Use a value like '100m' or '0.1' for --system-reserved-cpu",
|
||||
"Invalid --system-reserved-cpu '{}': {}",
|
||||
system_reserved_cpu,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
let reserved_memory_bytes =
|
||||
ResourceQuantities::parse_memory(&system_reserved_memory).map_err(|e| {
|
||||
miette::miette!(
|
||||
help = "Use a value like '256Mi' or '1Gi' for --system-reserved-memory",
|
||||
"Invalid --system-reserved-memory '{}': {}",
|
||||
system_reserved_memory,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
run_agent(
|
||||
&node_name,
|
||||
&bind,
|
||||
|
|
@ -131,6 +162,9 @@ async fn main() -> miette::Result<()> {
|
|||
zonepath_prefix.as_deref(),
|
||||
&pod_cidr,
|
||||
ðerstub_name,
|
||||
reserved_cpu_millicores,
|
||||
reserved_memory_bytes,
|
||||
max_pods,
|
||||
&tls_args,
|
||||
)
|
||||
.await
|
||||
|
|
@ -228,6 +262,9 @@ async fn run_agent(
|
|||
zonepath_prefix: Option<&str>,
|
||||
pod_cidr: &str,
|
||||
etherstub_name: &str,
|
||||
system_reserved_cpu_millicores: i64,
|
||||
system_reserved_memory_bytes: i64,
|
||||
max_pods: u32,
|
||||
tls_args: &TlsArgs,
|
||||
) -> miette::Result<()> {
|
||||
info!("Starting reddwarf agent for node '{}'", node_name);
|
||||
|
|
@ -337,7 +374,10 @@ async fn run_agent(
|
|||
});
|
||||
|
||||
// 6. Spawn node agent
|
||||
let node_agent_config = NodeAgentConfig::new(node_name.to_string(), api_url);
|
||||
let mut node_agent_config = NodeAgentConfig::new(node_name.to_string(), api_url);
|
||||
node_agent_config.system_reserved_cpu_millicores = system_reserved_cpu_millicores;
|
||||
node_agent_config.system_reserved_memory_bytes = system_reserved_memory_bytes;
|
||||
node_agent_config.max_pods = max_pods;
|
||||
let node_agent = NodeAgent::new(api_client.clone(), node_agent_config);
|
||||
let agent_token = token.clone();
|
||||
let node_agent_handle = tokio::spawn(async move {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue