mirror of
https://github.com/CloudNebulaProject/reddwarf.git
synced 2026-04-10 13:20:40 +00:00
Close the control loop: versioned bind, event-driven controller, graceful shutdown
- Move WatchEventType and ResourceEvent to reddwarf-core so scheduler and runtime can use them without depending on the apiserver crate - Fix scheduler bind_pod to create versioned commits and publish MODIFIED events so the pod controller learns about scheduled pods - Replace polling loop in pod controller with event bus subscription, wire handle_delete for DELETED events, keep reconcile_all for startup sync and lag recovery - Add allocatable/capacity resources (cpu, memory, pods) to node agent build_node so the scheduler's resource filter accepts nodes - Bootstrap "default" namespace on startup to prevent pod creation failures in the default namespace - Replace .abort() shutdown with CancellationToken-based graceful shutdown across scheduler, controller, and node agent Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8d9ae6ac62
commit
c50ecb2664
13 changed files with 452 additions and 138 deletions
5
Cargo.lock
generated
5
Cargo.lock
generated
|
|
@ -1337,6 +1337,7 @@ dependencies = [
|
||||||
"reddwarf-storage",
|
"reddwarf-storage",
|
||||||
"reddwarf-versioning",
|
"reddwarf-versioning",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
]
|
]
|
||||||
|
|
@ -1399,6 +1400,7 @@ dependencies = [
|
||||||
"thiserror 2.0.18",
|
"thiserror 2.0.18",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
@ -1411,11 +1413,13 @@ dependencies = [
|
||||||
"miette",
|
"miette",
|
||||||
"reddwarf-core",
|
"reddwarf-core",
|
||||||
"reddwarf-storage",
|
"reddwarf-storage",
|
||||||
|
"reddwarf-versioning",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror 2.0.18",
|
"thiserror 2.0.18",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -2051,6 +2055,7 @@ dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures-core",
|
"futures-core",
|
||||||
"futures-sink",
|
"futures-sink",
|
||||||
|
"futures-util",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,7 @@ anyhow = "1.0"
|
||||||
# Async runtime
|
# Async runtime
|
||||||
tokio = { version = "1.40", features = ["full"] }
|
tokio = { version = "1.40", features = ["full"] }
|
||||||
tokio-stream = { version = "0.1", features = ["sync"] }
|
tokio-stream = { version = "0.1", features = ["sync"] }
|
||||||
|
tokio-util = { version = "0.7", features = ["rt"] }
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,4 @@
|
||||||
use reddwarf_core::{GroupVersionKind, ResourceKey};
|
pub use reddwarf_core::{ResourceEvent, WatchEventType};
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
use crate::watch::WatchEventType;
|
|
||||||
|
|
||||||
/// Configuration for the event bus
|
/// Configuration for the event bus
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
|
|
@ -16,74 +13,12 @@ impl Default for EventBusConfig {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A resource event emitted by the API server on mutations
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct ResourceEvent {
|
|
||||||
/// Type of watch event (ADDED, MODIFIED, DELETED)
|
|
||||||
pub event_type: WatchEventType,
|
|
||||||
/// GroupVersionKind of the resource
|
|
||||||
pub gvk: GroupVersionKind,
|
|
||||||
/// Full resource key (gvk + namespace + name)
|
|
||||||
pub resource_key: ResourceKey,
|
|
||||||
/// The serialized resource object
|
|
||||||
pub object: serde_json::Value,
|
|
||||||
/// Resource version at the time of the event
|
|
||||||
pub resource_version: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ResourceEvent {
|
|
||||||
/// Create an ADDED event
|
|
||||||
pub fn added(
|
|
||||||
resource_key: ResourceKey,
|
|
||||||
object: serde_json::Value,
|
|
||||||
resource_version: String,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
event_type: WatchEventType::Added,
|
|
||||||
gvk: resource_key.gvk.clone(),
|
|
||||||
resource_key,
|
|
||||||
object,
|
|
||||||
resource_version,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a MODIFIED event
|
|
||||||
pub fn modified(
|
|
||||||
resource_key: ResourceKey,
|
|
||||||
object: serde_json::Value,
|
|
||||||
resource_version: String,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
event_type: WatchEventType::Modified,
|
|
||||||
gvk: resource_key.gvk.clone(),
|
|
||||||
resource_key,
|
|
||||||
object,
|
|
||||||
resource_version,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a DELETED event
|
|
||||||
pub fn deleted(
|
|
||||||
resource_key: ResourceKey,
|
|
||||||
object: serde_json::Value,
|
|
||||||
resource_version: String,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
event_type: WatchEventType::Deleted,
|
|
||||||
gvk: resource_key.gvk.clone(),
|
|
||||||
resource_key,
|
|
||||||
object,
|
|
||||||
resource_version,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::handlers::common::{create_resource, delete_resource, update_resource};
|
use crate::handlers::common::{create_resource, delete_resource, update_resource};
|
||||||
use crate::AppState;
|
use crate::AppState;
|
||||||
use reddwarf_core::{GroupVersionKind, Pod, Resource};
|
use reddwarf_core::{GroupVersionKind, Pod, Resource, ResourceKey, WatchEventType};
|
||||||
use reddwarf_storage::RedbBackend;
|
use reddwarf_storage::RedbBackend;
|
||||||
use reddwarf_versioning::VersionStore;
|
use reddwarf_versioning::VersionStore;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
|
||||||
|
|
@ -3,21 +3,12 @@ use crate::AppState;
|
||||||
use axum::response::sse::{Event, KeepAlive, Sse};
|
use axum::response::sse::{Event, KeepAlive, Sse};
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
use reddwarf_core::GroupVersionKind;
|
use reddwarf_core::GroupVersionKind;
|
||||||
|
pub use reddwarf_core::WatchEventType;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::convert::Infallible;
|
use std::convert::Infallible;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio_stream::wrappers::{errors::BroadcastStreamRecvError, BroadcastStream};
|
use tokio_stream::wrappers::{errors::BroadcastStreamRecvError, BroadcastStream};
|
||||||
|
|
||||||
/// Watch event type
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
#[serde(rename_all = "UPPERCASE")]
|
|
||||||
pub enum WatchEventType {
|
|
||||||
Added,
|
|
||||||
Modified,
|
|
||||||
Deleted,
|
|
||||||
Error,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Watch event
|
/// Watch event
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct WatchEvent<T> {
|
pub struct WatchEvent<T> {
|
||||||
|
|
|
||||||
74
crates/reddwarf-core/src/events.rs
Normal file
74
crates/reddwarf-core/src/events.rs
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
use crate::types::{GroupVersionKind, ResourceKey};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Watch event type
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "UPPERCASE")]
|
||||||
|
pub enum WatchEventType {
|
||||||
|
Added,
|
||||||
|
Modified,
|
||||||
|
Deleted,
|
||||||
|
Error,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A resource event emitted by the API server on mutations
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ResourceEvent {
|
||||||
|
/// Type of watch event (ADDED, MODIFIED, DELETED)
|
||||||
|
pub event_type: WatchEventType,
|
||||||
|
/// GroupVersionKind of the resource
|
||||||
|
pub gvk: GroupVersionKind,
|
||||||
|
/// Full resource key (gvk + namespace + name)
|
||||||
|
pub resource_key: ResourceKey,
|
||||||
|
/// The serialized resource object
|
||||||
|
pub object: serde_json::Value,
|
||||||
|
/// Resource version at the time of the event
|
||||||
|
pub resource_version: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ResourceEvent {
|
||||||
|
/// Create an ADDED event
|
||||||
|
pub fn added(
|
||||||
|
resource_key: ResourceKey,
|
||||||
|
object: serde_json::Value,
|
||||||
|
resource_version: String,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
event_type: WatchEventType::Added,
|
||||||
|
gvk: resource_key.gvk.clone(),
|
||||||
|
resource_key,
|
||||||
|
object,
|
||||||
|
resource_version,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a MODIFIED event
|
||||||
|
pub fn modified(
|
||||||
|
resource_key: ResourceKey,
|
||||||
|
object: serde_json::Value,
|
||||||
|
resource_version: String,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
event_type: WatchEventType::Modified,
|
||||||
|
gvk: resource_key.gvk.clone(),
|
||||||
|
resource_key,
|
||||||
|
object,
|
||||||
|
resource_version,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a DELETED event
|
||||||
|
pub fn deleted(
|
||||||
|
resource_key: ResourceKey,
|
||||||
|
object: serde_json::Value,
|
||||||
|
resource_version: String,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
event_type: WatchEventType::Deleted,
|
||||||
|
gvk: resource_key.gvk.clone(),
|
||||||
|
resource_key,
|
||||||
|
object,
|
||||||
|
resource_version,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -7,11 +7,13 @@
|
||||||
//! - Serialization helpers
|
//! - Serialization helpers
|
||||||
|
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
pub mod events;
|
||||||
pub mod resources;
|
pub mod resources;
|
||||||
pub mod types;
|
pub mod types;
|
||||||
|
|
||||||
// Re-export commonly used types
|
// Re-export commonly used types
|
||||||
pub use error::{ReddwarfError, Result};
|
pub use error::{ReddwarfError, Result};
|
||||||
|
pub use events::{ResourceEvent, WatchEventType};
|
||||||
pub use resources::{is_valid_name, Resource, ResourceError};
|
pub use resources::{is_valid_name, Resource, ResourceError};
|
||||||
pub use types::{GroupVersionKind, ResourceKey, ResourceVersion};
|
pub use types::{GroupVersionKind, ResourceKey, ResourceVersion};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ reddwarf-core = { workspace = true }
|
||||||
k8s-openapi = { workspace = true }
|
k8s-openapi = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
tokio-stream = { workspace = true }
|
tokio-stream = { workspace = true }
|
||||||
|
tokio-util = { workspace = true }
|
||||||
serde = { workspace = true }
|
serde = { workspace = true }
|
||||||
serde_json = { workspace = true }
|
serde_json = { workspace = true }
|
||||||
miette = { workspace = true }
|
miette = { workspace = true }
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,10 @@ use crate::error::{Result, RuntimeError};
|
||||||
use crate::traits::ZoneRuntime;
|
use crate::traits::ZoneRuntime;
|
||||||
use crate::types::*;
|
use crate::types::*;
|
||||||
use k8s_openapi::api::core::v1::{Pod, PodCondition, PodStatus};
|
use k8s_openapi::api::core::v1::{Pod, PodCondition, PodStatus};
|
||||||
|
use reddwarf_core::{ResourceEvent, WatchEventType};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::broadcast;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
|
|
||||||
/// Configuration for the pod controller
|
/// Configuration for the pod controller
|
||||||
|
|
@ -27,6 +30,7 @@ pub struct PodControllerConfig {
|
||||||
pub struct PodController {
|
pub struct PodController {
|
||||||
runtime: Arc<dyn ZoneRuntime>,
|
runtime: Arc<dyn ZoneRuntime>,
|
||||||
api_client: Arc<ApiClient>,
|
api_client: Arc<ApiClient>,
|
||||||
|
event_tx: broadcast::Sender<ResourceEvent>,
|
||||||
config: PodControllerConfig,
|
config: PodControllerConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -34,33 +38,89 @@ impl PodController {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
runtime: Arc<dyn ZoneRuntime>,
|
runtime: Arc<dyn ZoneRuntime>,
|
||||||
api_client: Arc<ApiClient>,
|
api_client: Arc<ApiClient>,
|
||||||
|
event_tx: broadcast::Sender<ResourceEvent>,
|
||||||
config: PodControllerConfig,
|
config: PodControllerConfig,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
runtime,
|
runtime,
|
||||||
api_client,
|
api_client,
|
||||||
|
event_tx,
|
||||||
config,
|
config,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the controller — polls for unscheduled-to-this-node pods in a loop.
|
/// Run the controller — reacts to pod events from the in-process event bus.
|
||||||
///
|
///
|
||||||
/// In a real implementation, this would use SSE watch. For now, we receive
|
/// On startup, performs a full reconcile to catch up on any pods that were
|
||||||
/// events via the in-process event bus by subscribing to the broadcast channel.
|
/// scheduled while the controller was down. Then switches to event-driven mode.
|
||||||
/// Since the controller runs in the same process as the API server, we use
|
pub async fn run(&self, token: CancellationToken) -> Result<()> {
|
||||||
/// a simpler polling approach over the HTTP API.
|
|
||||||
pub async fn run(&self) -> Result<()> {
|
|
||||||
info!(
|
info!(
|
||||||
"Starting pod controller for node '{}'",
|
"Starting pod controller for node '{}'",
|
||||||
self.config.node_name
|
self.config.node_name
|
||||||
);
|
);
|
||||||
|
|
||||||
// Poll loop — watches for pods via HTTP list
|
// Initial full sync
|
||||||
|
if let Err(e) = self.reconcile_all().await {
|
||||||
|
error!("Initial reconcile failed: {}", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut rx = self.event_tx.subscribe();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if let Err(e) = self.reconcile_all().await {
|
tokio::select! {
|
||||||
error!("Pod controller reconcile cycle failed: {}", e);
|
_ = token.cancelled() => {
|
||||||
|
info!("Pod controller shutting down");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
result = rx.recv() => {
|
||||||
|
match result {
|
||||||
|
Ok(event) => {
|
||||||
|
if event.gvk.kind != "Pod" {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
match event.event_type {
|
||||||
|
WatchEventType::Added | WatchEventType::Modified => {
|
||||||
|
match serde_json::from_value::<Pod>(event.object) {
|
||||||
|
Ok(pod) => {
|
||||||
|
if let Err(e) = self.reconcile(&pod).await {
|
||||||
|
let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
|
||||||
|
error!("Failed to reconcile pod {}: {}", name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Failed to parse pod from event: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WatchEventType::Deleted => {
|
||||||
|
match serde_json::from_value::<Pod>(event.object) {
|
||||||
|
Ok(pod) => {
|
||||||
|
if let Err(e) = self.handle_delete(&pod).await {
|
||||||
|
let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
|
||||||
|
error!("Failed to handle pod deletion {}: {}", name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Failed to parse pod from delete event: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(broadcast::error::RecvError::Lagged(n)) => {
|
||||||
|
warn!("Missed {} events, doing full resync", n);
|
||||||
|
if let Err(e) = self.reconcile_all().await {
|
||||||
|
error!("Resync after lag failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(broadcast::error::RecvError::Closed) => {
|
||||||
|
info!("Event bus closed, stopping pod controller");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,12 @@
|
||||||
use crate::api_client::ApiClient;
|
use crate::api_client::ApiClient;
|
||||||
use crate::error::{Result, RuntimeError};
|
use crate::error::{Result, RuntimeError};
|
||||||
use k8s_openapi::api::core::v1::{Node, NodeAddress, NodeCondition, NodeStatus};
|
use k8s_openapi::api::core::v1::{Node, NodeAddress, NodeCondition, NodeStatus};
|
||||||
|
use k8s_openapi::apimachinery::pkg::api::resource::Quantity;
|
||||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{info, warn};
|
use tracing::{info, warn};
|
||||||
|
|
||||||
/// Configuration for the node agent
|
/// Configuration for the node agent
|
||||||
|
|
@ -65,7 +68,7 @@ impl NodeAgent {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the heartbeat loop
|
/// Run the heartbeat loop
|
||||||
pub async fn run(&self) -> Result<()> {
|
pub async fn run(&self, token: CancellationToken) -> Result<()> {
|
||||||
// Register first
|
// Register first
|
||||||
self.register().await?;
|
self.register().await?;
|
||||||
|
|
||||||
|
|
@ -75,10 +78,16 @@ impl NodeAgent {
|
||||||
);
|
);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::time::sleep(self.config.heartbeat_interval).await;
|
tokio::select! {
|
||||||
|
_ = token.cancelled() => {
|
||||||
if let Err(e) = self.heartbeat().await {
|
info!("Node agent shutting down");
|
||||||
warn!("Heartbeat failed: {} — will retry", e);
|
return Ok(());
|
||||||
|
}
|
||||||
|
_ = tokio::time::sleep(self.config.heartbeat_interval) => {
|
||||||
|
if let Err(e) = self.heartbeat().await {
|
||||||
|
warn!("Heartbeat failed: {} — will retry", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -99,6 +108,22 @@ impl NodeAgent {
|
||||||
fn build_node(&self) -> Node {
|
fn build_node(&self) -> Node {
|
||||||
let hostname = self.config.node_name.clone();
|
let hostname = self.config.node_name.clone();
|
||||||
|
|
||||||
|
let cpu_count = std::thread::available_parallelism()
|
||||||
|
.map(|n| n.get().to_string())
|
||||||
|
.unwrap_or_else(|_| "1".to_string());
|
||||||
|
|
||||||
|
let allocatable = BTreeMap::from([
|
||||||
|
("cpu".to_string(), Quantity(cpu_count.clone())),
|
||||||
|
("memory".to_string(), Quantity("8Gi".to_string())),
|
||||||
|
("pods".to_string(), Quantity("110".to_string())),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let capacity = BTreeMap::from([
|
||||||
|
("cpu".to_string(), Quantity(cpu_count)),
|
||||||
|
("memory".to_string(), Quantity("8Gi".to_string())),
|
||||||
|
("pods".to_string(), Quantity("110".to_string())),
|
||||||
|
]);
|
||||||
|
|
||||||
Node {
|
Node {
|
||||||
metadata: ObjectMeta {
|
metadata: ObjectMeta {
|
||||||
name: Some(self.config.node_name.clone()),
|
name: Some(self.config.node_name.clone()),
|
||||||
|
|
@ -132,6 +157,8 @@ impl NodeAgent {
|
||||||
type_: "Hostname".to_string(),
|
type_: "Hostname".to_string(),
|
||||||
address: hostname,
|
address: hostname,
|
||||||
}]),
|
}]),
|
||||||
|
allocatable: Some(allocatable),
|
||||||
|
capacity: Some(capacity),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
}),
|
}),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
|
|
@ -168,4 +195,36 @@ mod tests {
|
||||||
assert_eq!(conditions[0].status, "True");
|
assert_eq!(conditions[0].status, "True");
|
||||||
assert!(conditions[0].last_heartbeat_time.is_some());
|
assert!(conditions[0].last_heartbeat_time.is_some());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_build_node_has_allocatable_resources() {
|
||||||
|
let api_client = Arc::new(ApiClient::new("http://127.0.0.1:6443"));
|
||||||
|
let config =
|
||||||
|
NodeAgentConfig::new("test-node".to_string(), "http://127.0.0.1:6443".to_string());
|
||||||
|
let agent = NodeAgent::new(api_client, config);
|
||||||
|
|
||||||
|
let node = agent.build_node();
|
||||||
|
let status = node.status.unwrap();
|
||||||
|
|
||||||
|
// Check allocatable
|
||||||
|
let alloc = status.allocatable.unwrap();
|
||||||
|
assert!(alloc.contains_key("cpu"));
|
||||||
|
assert!(alloc.contains_key("memory"));
|
||||||
|
assert!(alloc.contains_key("pods"));
|
||||||
|
assert_eq!(alloc["memory"].0, "8Gi");
|
||||||
|
assert_eq!(alloc["pods"].0, "110");
|
||||||
|
|
||||||
|
// CPU should match available parallelism
|
||||||
|
let expected_cpu = std::thread::available_parallelism()
|
||||||
|
.map(|n| n.get().to_string())
|
||||||
|
.unwrap_or_else(|_| "1".to_string());
|
||||||
|
assert_eq!(alloc["cpu"].0, expected_cpu);
|
||||||
|
|
||||||
|
// Check capacity
|
||||||
|
let cap = status.capacity.unwrap();
|
||||||
|
assert!(cap.contains_key("cpu"));
|
||||||
|
assert!(cap.contains_key("memory"));
|
||||||
|
assert!(cap.contains_key("pods"));
|
||||||
|
assert_eq!(cap["cpu"].0, expected_cpu);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,8 +10,10 @@ rust-version.workspace = true
|
||||||
[dependencies]
|
[dependencies]
|
||||||
reddwarf-core = { workspace = true }
|
reddwarf-core = { workspace = true }
|
||||||
reddwarf-storage = { workspace = true }
|
reddwarf-storage = { workspace = true }
|
||||||
|
reddwarf-versioning = { workspace = true }
|
||||||
k8s-openapi = { workspace = true }
|
k8s-openapi = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
tokio-util = { workspace = true }
|
||||||
miette = { workspace = true }
|
miette = { workspace = true }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
|
|
|
||||||
|
|
@ -2,11 +2,14 @@ use crate::filter::{default_filters, FilterPredicate};
|
||||||
use crate::score::{calculate_weighted_score, default_scores, ScoreFunction};
|
use crate::score::{calculate_weighted_score, default_scores, ScoreFunction};
|
||||||
use crate::types::SchedulingContext;
|
use crate::types::SchedulingContext;
|
||||||
use crate::{Result, SchedulerError};
|
use crate::{Result, SchedulerError};
|
||||||
use reddwarf_core::{Node, Pod};
|
use reddwarf_core::{Node, Pod, ResourceEvent};
|
||||||
use reddwarf_storage::{KVStore, KeyEncoder, RedbBackend};
|
use reddwarf_storage::{KVStore, KeyEncoder, RedbBackend};
|
||||||
|
use reddwarf_versioning::{Change, CommitBuilder, VersionStore};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use tokio::sync::broadcast;
|
||||||
use tokio::time::sleep;
|
use tokio::time::sleep;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
|
|
||||||
/// Configuration for the scheduler
|
/// Configuration for the scheduler
|
||||||
|
|
@ -27,6 +30,8 @@ impl Default for SchedulerConfig {
|
||||||
/// Pod scheduler
|
/// Pod scheduler
|
||||||
pub struct Scheduler {
|
pub struct Scheduler {
|
||||||
storage: Arc<RedbBackend>,
|
storage: Arc<RedbBackend>,
|
||||||
|
version_store: Arc<VersionStore>,
|
||||||
|
event_tx: broadcast::Sender<ResourceEvent>,
|
||||||
config: SchedulerConfig,
|
config: SchedulerConfig,
|
||||||
filters: Vec<Box<dyn FilterPredicate>>,
|
filters: Vec<Box<dyn FilterPredicate>>,
|
||||||
scorers: Vec<Box<dyn ScoreFunction>>,
|
scorers: Vec<Box<dyn ScoreFunction>>,
|
||||||
|
|
@ -34,9 +39,16 @@ pub struct Scheduler {
|
||||||
|
|
||||||
impl Scheduler {
|
impl Scheduler {
|
||||||
/// Create a new scheduler
|
/// Create a new scheduler
|
||||||
pub fn new(storage: Arc<RedbBackend>, config: SchedulerConfig) -> Self {
|
pub fn new(
|
||||||
|
storage: Arc<RedbBackend>,
|
||||||
|
version_store: Arc<VersionStore>,
|
||||||
|
event_tx: broadcast::Sender<ResourceEvent>,
|
||||||
|
config: SchedulerConfig,
|
||||||
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
storage,
|
storage,
|
||||||
|
version_store,
|
||||||
|
event_tx,
|
||||||
config,
|
config,
|
||||||
filters: default_filters(),
|
filters: default_filters(),
|
||||||
scorers: default_scores(),
|
scorers: default_scores(),
|
||||||
|
|
@ -44,15 +56,21 @@ impl Scheduler {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the scheduler loop
|
/// Run the scheduler loop
|
||||||
pub async fn run(&self) -> Result<()> {
|
pub async fn run(&self, token: CancellationToken) -> Result<()> {
|
||||||
info!("Starting scheduler");
|
info!("Starting scheduler");
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if let Err(e) = self.schedule_cycle().await {
|
tokio::select! {
|
||||||
error!("Scheduling cycle failed: {}", e);
|
_ = token.cancelled() => {
|
||||||
|
info!("Scheduler shutting down");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
_ = sleep(self.config.schedule_interval) => {
|
||||||
|
if let Err(e) = self.schedule_cycle().await {
|
||||||
|
error!("Scheduling cycle failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sleep(self.config.schedule_interval).await;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -240,21 +258,39 @@ impl Scheduler {
|
||||||
Ok(best_node)
|
Ok(best_node)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Bind a pod to a node (update spec.nodeName)
|
/// Bind a pod to a node (update spec.nodeName) with versioning and event publishing
|
||||||
async fn bind_pod(&self, pod: &mut Pod, node_name: &str) -> Result<()> {
|
async fn bind_pod(&self, pod: &mut Pod, node_name: &str) -> Result<()> {
|
||||||
let pod_name = pod
|
let pod_name = pod
|
||||||
.metadata
|
.metadata
|
||||||
.name
|
.name
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.ok_or_else(|| SchedulerError::internal_error("Pod has no name"))?;
|
.ok_or_else(|| SchedulerError::internal_error("Pod has no name"))?
|
||||||
|
.clone();
|
||||||
let namespace = pod
|
let namespace = pod
|
||||||
.metadata
|
.metadata
|
||||||
.namespace
|
.namespace
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.ok_or_else(|| SchedulerError::internal_error("Pod has no namespace"))?;
|
.ok_or_else(|| SchedulerError::internal_error("Pod has no namespace"))?
|
||||||
|
.clone();
|
||||||
|
|
||||||
info!("Binding pod {} to node {}", pod_name, node_name);
|
info!("Binding pod {} to node {}", pod_name, node_name);
|
||||||
|
|
||||||
|
let key = reddwarf_core::ResourceKey::new(
|
||||||
|
reddwarf_core::GroupVersionKind::from_api_version_kind("v1", "Pod"),
|
||||||
|
&namespace,
|
||||||
|
&pod_name,
|
||||||
|
);
|
||||||
|
let storage_key = KeyEncoder::encode_resource_key(&key);
|
||||||
|
|
||||||
|
// Read the current pod bytes for version diff
|
||||||
|
let prev_data = self
|
||||||
|
.storage
|
||||||
|
.as_ref()
|
||||||
|
.get(storage_key.as_bytes())?
|
||||||
|
.ok_or_else(|| {
|
||||||
|
SchedulerError::internal_error(format!("Pod not found in storage: {}", pod_name))
|
||||||
|
})?;
|
||||||
|
|
||||||
// Update pod spec
|
// Update pod spec
|
||||||
if let Some(spec) = &mut pod.spec {
|
if let Some(spec) = &mut pod.spec {
|
||||||
spec.node_name = Some(node_name.to_string());
|
spec.node_name = Some(node_name.to_string());
|
||||||
|
|
@ -262,21 +298,54 @@ impl Scheduler {
|
||||||
return Err(SchedulerError::internal_error("Pod has no spec"));
|
return Err(SchedulerError::internal_error("Pod has no spec"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save updated pod
|
// Serialize new pod
|
||||||
let key = reddwarf_core::ResourceKey::new(
|
let new_data = serde_json::to_vec(&pod).map_err(|e| {
|
||||||
reddwarf_core::GroupVersionKind::from_api_version_kind("v1", "Pod"),
|
|
||||||
namespace,
|
|
||||||
pod_name,
|
|
||||||
);
|
|
||||||
|
|
||||||
let storage_key = KeyEncoder::encode_resource_key(&key);
|
|
||||||
let data = serde_json::to_vec(&pod).map_err(|e| {
|
|
||||||
SchedulerError::internal_error(format!("Failed to serialize pod: {}", e))
|
SchedulerError::internal_error(format!("Failed to serialize pod: {}", e))
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
self.storage.as_ref().put(storage_key.as_bytes(), &data)?;
|
// Create a versioned commit
|
||||||
|
let change = Change::update(
|
||||||
|
storage_key.clone(),
|
||||||
|
String::from_utf8_lossy(&new_data).to_string(),
|
||||||
|
String::from_utf8_lossy(&prev_data).to_string(),
|
||||||
|
);
|
||||||
|
|
||||||
info!("Successfully bound pod {} to node {}", pod_name, node_name);
|
let commit = self
|
||||||
|
.version_store
|
||||||
|
.create_commit(
|
||||||
|
CommitBuilder::new()
|
||||||
|
.change(change)
|
||||||
|
.message(format!("Bind pod {} to node {}", pod_name, node_name)),
|
||||||
|
)
|
||||||
|
.map_err(|e| {
|
||||||
|
SchedulerError::internal_error(format!("Failed to create commit: {}", e))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Set resource version to commit ID
|
||||||
|
pod.metadata.resource_version = Some(commit.id().to_string());
|
||||||
|
|
||||||
|
// Re-serialize with updated resource version
|
||||||
|
let final_data = serde_json::to_vec(&pod).map_err(|e| {
|
||||||
|
SchedulerError::internal_error(format!("Failed to serialize pod: {}", e))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Write to storage
|
||||||
|
self.storage
|
||||||
|
.as_ref()
|
||||||
|
.put(storage_key.as_bytes(), &final_data)?;
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Successfully bound pod {} to node {} at version {}",
|
||||||
|
pod_name,
|
||||||
|
node_name,
|
||||||
|
commit.id()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Publish MODIFIED event (best-effort)
|
||||||
|
if let Ok(object) = serde_json::to_value(&*pod) {
|
||||||
|
let event = ResourceEvent::modified(key, object, commit.id().to_string());
|
||||||
|
let _ = self.event_tx.send(event);
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -285,10 +354,23 @@ impl Scheduler {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use reddwarf_core::WatchEventType;
|
||||||
use reddwarf_storage::RedbBackend;
|
use reddwarf_storage::RedbBackend;
|
||||||
|
use reddwarf_versioning::VersionStore;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
fn create_test_scheduler() -> (Scheduler, broadcast::Receiver<ResourceEvent>) {
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let db_path = dir.path().join("test.redb");
|
||||||
|
let storage = Arc::new(RedbBackend::new(&db_path).unwrap());
|
||||||
|
let version_store = Arc::new(VersionStore::new(storage.clone()).unwrap());
|
||||||
|
let (event_tx, event_rx) = broadcast::channel(64);
|
||||||
|
let scheduler =
|
||||||
|
Scheduler::new(storage, version_store, event_tx, SchedulerConfig::default());
|
||||||
|
(scheduler, event_rx)
|
||||||
|
}
|
||||||
|
|
||||||
fn create_test_node(name: &str, cpu: &str, memory: &str) -> Node {
|
fn create_test_node(name: &str, cpu: &str, memory: &str) -> Node {
|
||||||
let mut node = Node::default();
|
let mut node = Node::default();
|
||||||
node.metadata.name = Some(name.to_string());
|
node.metadata.name = Some(name.to_string());
|
||||||
|
|
@ -355,13 +437,25 @@ mod tests {
|
||||||
pod
|
pod
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Helper: store a pod in storage so bind_pod can read prev version
|
||||||
|
fn store_pod(scheduler: &Scheduler, pod: &Pod) {
|
||||||
|
let key = reddwarf_core::ResourceKey::new(
|
||||||
|
reddwarf_core::GroupVersionKind::from_api_version_kind("v1", "Pod"),
|
||||||
|
pod.metadata.namespace.as_deref().unwrap(),
|
||||||
|
pod.metadata.name.as_deref().unwrap(),
|
||||||
|
);
|
||||||
|
let storage_key = KeyEncoder::encode_resource_key(&key);
|
||||||
|
let data = serde_json::to_vec(pod).unwrap();
|
||||||
|
scheduler
|
||||||
|
.storage
|
||||||
|
.as_ref()
|
||||||
|
.put(storage_key.as_bytes(), &data)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_schedule_pod_success() {
|
async fn test_schedule_pod_success() {
|
||||||
let dir = tempdir().unwrap();
|
let (scheduler, _rx) = create_test_scheduler();
|
||||||
let db_path = dir.path().join("test.redb");
|
|
||||||
let storage = Arc::new(RedbBackend::new(&db_path).unwrap());
|
|
||||||
|
|
||||||
let scheduler = Scheduler::new(storage.clone(), SchedulerConfig::default());
|
|
||||||
|
|
||||||
// Create nodes
|
// Create nodes
|
||||||
let nodes = vec![
|
let nodes = vec![
|
||||||
|
|
@ -369,8 +463,9 @@ mod tests {
|
||||||
create_test_node("node2", "2", "4Gi"),
|
create_test_node("node2", "2", "4Gi"),
|
||||||
];
|
];
|
||||||
|
|
||||||
// Create pod
|
// Create pod and store it so bind_pod can read the previous version
|
||||||
let pod = create_test_pod("test-pod", "default", "1", "1Gi");
|
let pod = create_test_pod("test-pod", "default", "1", "1Gi");
|
||||||
|
store_pod(&scheduler, &pod);
|
||||||
|
|
||||||
// Schedule pod
|
// Schedule pod
|
||||||
let result = scheduler.schedule_pod(pod, &nodes).await;
|
let result = scheduler.schedule_pod(pod, &nodes).await;
|
||||||
|
|
@ -382,11 +477,7 @@ mod tests {
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_schedule_pod_no_suitable_nodes() {
|
async fn test_schedule_pod_no_suitable_nodes() {
|
||||||
let dir = tempdir().unwrap();
|
let (scheduler, _rx) = create_test_scheduler();
|
||||||
let db_path = dir.path().join("test.redb");
|
|
||||||
let storage = Arc::new(RedbBackend::new(&db_path).unwrap());
|
|
||||||
|
|
||||||
let scheduler = Scheduler::new(storage, SchedulerConfig::default());
|
|
||||||
|
|
||||||
// Create node with insufficient resources
|
// Create node with insufficient resources
|
||||||
let nodes = vec![create_test_node("node1", "1", "1Gi")];
|
let nodes = vec![create_test_node("node1", "1", "1Gi")];
|
||||||
|
|
@ -399,4 +490,36 @@ mod tests {
|
||||||
|
|
||||||
assert!(result.is_err());
|
assert!(result.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_bind_pod_publishes_modified_event() {
|
||||||
|
let (scheduler, mut rx) = create_test_scheduler();
|
||||||
|
|
||||||
|
let mut pod = create_test_pod("event-pod", "default", "1", "1Gi");
|
||||||
|
store_pod(&scheduler, &pod);
|
||||||
|
|
||||||
|
scheduler.bind_pod(&mut pod, "node1").await.unwrap();
|
||||||
|
|
||||||
|
let event = rx.try_recv().unwrap();
|
||||||
|
assert!(matches!(event.event_type, WatchEventType::Modified));
|
||||||
|
assert_eq!(event.resource_key.name, "event-pod");
|
||||||
|
assert_eq!(event.gvk.kind, "Pod");
|
||||||
|
|
||||||
|
// Verify the event object has the updated node name
|
||||||
|
let bound_pod: Pod = serde_json::from_value(event.object).unwrap();
|
||||||
|
assert_eq!(bound_pod.spec.unwrap().node_name, Some("node1".to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_bind_pod_sets_resource_version() {
|
||||||
|
let (scheduler, _rx) = create_test_scheduler();
|
||||||
|
|
||||||
|
let mut pod = create_test_pod("version-pod", "default", "1", "1Gi");
|
||||||
|
store_pod(&scheduler, &pod);
|
||||||
|
|
||||||
|
scheduler.bind_pod(&mut pod, "node1").await.unwrap();
|
||||||
|
|
||||||
|
assert!(pod.metadata.resource_version.is_some());
|
||||||
|
assert!(!pod.metadata.resource_version.as_ref().unwrap().is_empty());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,7 @@ reddwarf-apiserver = { workspace = true }
|
||||||
reddwarf-scheduler = { workspace = true }
|
reddwarf-scheduler = { workspace = true }
|
||||||
reddwarf-runtime = { workspace = true }
|
reddwarf-runtime = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
tokio-util = { workspace = true }
|
||||||
clap = { workspace = true }
|
clap = { workspace = true }
|
||||||
miette = { workspace = true }
|
miette = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use reddwarf_apiserver::{ApiServer, AppState, Config as ApiConfig};
|
use reddwarf_apiserver::{ApiError, ApiServer, AppState, Config as ApiConfig};
|
||||||
|
use reddwarf_core::Namespace;
|
||||||
use reddwarf_runtime::{
|
use reddwarf_runtime::{
|
||||||
ApiClient, EtherstubConfig, MockRuntime, NetworkMode, NodeAgent, NodeAgentConfig,
|
ApiClient, EtherstubConfig, MockRuntime, NetworkMode, NodeAgent, NodeAgentConfig,
|
||||||
PodController, PodControllerConfig, ZoneBrand,
|
PodController, PodControllerConfig, ZoneBrand,
|
||||||
|
|
@ -9,6 +10,7 @@ use reddwarf_scheduler::Scheduler;
|
||||||
use reddwarf_storage::RedbBackend;
|
use reddwarf_storage::RedbBackend;
|
||||||
use reddwarf_versioning::VersionStore;
|
use reddwarf_versioning::VersionStore;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
|
|
@ -79,6 +81,8 @@ async fn run_serve(bind: &str, data_dir: &str) -> miette::Result<()> {
|
||||||
|
|
||||||
let state = create_app_state(data_dir)?;
|
let state = create_app_state(data_dir)?;
|
||||||
|
|
||||||
|
bootstrap_default_namespace(&state).await?;
|
||||||
|
|
||||||
let config = ApiConfig {
|
let config = ApiConfig {
|
||||||
listen_addr: bind
|
listen_addr: bind
|
||||||
.parse()
|
.parse()
|
||||||
|
|
@ -106,6 +110,8 @@ async fn run_agent(
|
||||||
|
|
||||||
let state = create_app_state(data_dir)?;
|
let state = create_app_state(data_dir)?;
|
||||||
|
|
||||||
|
bootstrap_default_namespace(&state).await?;
|
||||||
|
|
||||||
let listen_addr: std::net::SocketAddr = bind
|
let listen_addr: std::net::SocketAddr = bind
|
||||||
.parse()
|
.parse()
|
||||||
.map_err(|e| miette::miette!("Invalid bind address '{}': {}", bind, e))?;
|
.map_err(|e| miette::miette!("Invalid bind address '{}': {}", bind, e))?;
|
||||||
|
|
@ -113,12 +119,22 @@ async fn run_agent(
|
||||||
// Determine the API URL for internal components to connect to
|
// Determine the API URL for internal components to connect to
|
||||||
let api_url = format!("http://127.0.0.1:{}", listen_addr.port());
|
let api_url = format!("http://127.0.0.1:{}", listen_addr.port());
|
||||||
|
|
||||||
|
let token = CancellationToken::new();
|
||||||
|
|
||||||
// 1. Spawn API server
|
// 1. Spawn API server
|
||||||
let api_config = ApiConfig { listen_addr };
|
let api_config = ApiConfig { listen_addr };
|
||||||
let api_server = ApiServer::new(api_config, state.clone());
|
let api_server = ApiServer::new(api_config, state.clone());
|
||||||
|
let api_token = token.clone();
|
||||||
let api_handle = tokio::spawn(async move {
|
let api_handle = tokio::spawn(async move {
|
||||||
if let Err(e) = api_server.run().await {
|
tokio::select! {
|
||||||
error!("API server error: {}", e);
|
result = api_server.run() => {
|
||||||
|
if let Err(e) = result {
|
||||||
|
error!("API server error: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = api_token.cancelled() => {
|
||||||
|
info!("API server shutting down");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -126,9 +142,15 @@ async fn run_agent(
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
||||||
|
|
||||||
// 2. Spawn scheduler
|
// 2. Spawn scheduler
|
||||||
let scheduler = Scheduler::new(state.storage.clone(), SchedulerConfig::default());
|
let scheduler = Scheduler::new(
|
||||||
|
state.storage.clone(),
|
||||||
|
state.version_store.clone(),
|
||||||
|
state.event_tx.clone(),
|
||||||
|
SchedulerConfig::default(),
|
||||||
|
);
|
||||||
|
let scheduler_token = token.clone();
|
||||||
let scheduler_handle = tokio::spawn(async move {
|
let scheduler_handle = tokio::spawn(async move {
|
||||||
if let Err(e) = scheduler.run().await {
|
if let Err(e) = scheduler.run(scheduler_token).await {
|
||||||
error!("Scheduler error: {}", e);
|
error!("Scheduler error: {}", e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
@ -152,9 +174,15 @@ async fn run_agent(
|
||||||
}),
|
}),
|
||||||
};
|
};
|
||||||
|
|
||||||
let controller = PodController::new(runtime, api_client.clone(), controller_config);
|
let controller = PodController::new(
|
||||||
|
runtime,
|
||||||
|
api_client.clone(),
|
||||||
|
state.event_tx.clone(),
|
||||||
|
controller_config,
|
||||||
|
);
|
||||||
|
let controller_token = token.clone();
|
||||||
let controller_handle = tokio::spawn(async move {
|
let controller_handle = tokio::spawn(async move {
|
||||||
if let Err(e) = controller.run().await {
|
if let Err(e) = controller.run(controller_token).await {
|
||||||
error!("Pod controller error: {}", e);
|
error!("Pod controller error: {}", e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
@ -162,8 +190,9 @@ async fn run_agent(
|
||||||
// 5. Spawn node agent
|
// 5. Spawn node agent
|
||||||
let node_agent_config = NodeAgentConfig::new(node_name.to_string(), api_url);
|
let node_agent_config = NodeAgentConfig::new(node_name.to_string(), api_url);
|
||||||
let node_agent = NodeAgent::new(api_client, node_agent_config);
|
let node_agent = NodeAgent::new(api_client, node_agent_config);
|
||||||
|
let agent_token = token.clone();
|
||||||
let node_agent_handle = tokio::spawn(async move {
|
let node_agent_handle = tokio::spawn(async move {
|
||||||
if let Err(e) = node_agent.run().await {
|
if let Err(e) = node_agent.run(agent_token).await {
|
||||||
error!("Node agent error: {}", e);
|
error!("Node agent error: {}", e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
@ -178,14 +207,45 @@ async fn run_agent(
|
||||||
.await
|
.await
|
||||||
.map_err(|e| miette::miette!("Failed to listen for ctrl-c: {}", e))?;
|
.map_err(|e| miette::miette!("Failed to listen for ctrl-c: {}", e))?;
|
||||||
|
|
||||||
info!("Shutting down...");
|
info!("Shutting down gracefully...");
|
||||||
|
token.cancel();
|
||||||
|
|
||||||
// Abort all tasks
|
// Wait for all tasks to finish with a timeout
|
||||||
api_handle.abort();
|
let shutdown_timeout = std::time::Duration::from_secs(5);
|
||||||
scheduler_handle.abort();
|
let _ = tokio::time::timeout(shutdown_timeout, async {
|
||||||
controller_handle.abort();
|
let _ = tokio::join!(
|
||||||
node_agent_handle.abort();
|
api_handle,
|
||||||
|
scheduler_handle,
|
||||||
|
controller_handle,
|
||||||
|
node_agent_handle,
|
||||||
|
);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
info!("Shutdown complete");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bootstrap the "default" namespace if it doesn't already exist
|
||||||
|
async fn bootstrap_default_namespace(state: &AppState) -> miette::Result<()> {
|
||||||
|
use reddwarf_apiserver::handlers::common::create_resource;
|
||||||
|
|
||||||
|
let mut ns = Namespace::default();
|
||||||
|
ns.metadata.name = Some("default".to_string());
|
||||||
|
|
||||||
|
match create_resource(state, ns).await {
|
||||||
|
Ok(_) => info!("Created default namespace"),
|
||||||
|
Err(ApiError::AlreadyExists(_)) => {
|
||||||
|
// Already exists — fine
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
return Err(miette::miette!(
|
||||||
|
"Failed to bootstrap default namespace: {:?}",
|
||||||
|
e
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue