diff --git a/.gitignore b/.gitignore index ea8c4bf..fbe4996 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +/output diff --git a/crates/forge-builder/src/config.rs b/crates/forge-builder/src/config.rs index f14d450..8189627 100644 --- a/crates/forge-builder/src/config.rs +++ b/crates/forge-builder/src/config.rs @@ -9,6 +9,8 @@ pub struct BuilderConfig { pub vcpus: u16, /// Memory in MB for the builder VM. pub memory_mb: u64, + /// Disk size in GB for the builder VM overlay. + pub disk_gb: u32, } impl BuilderConfig { @@ -22,11 +24,13 @@ impl BuilderConfig { image: node.image.clone().unwrap_or(default_image), vcpus: node.vcpus.unwrap_or(2), memory_mb: node.memory.unwrap_or(2048), + disk_gb: node.disk.unwrap_or(20), }, None => Self { image: default_image, vcpus: 2, memory_mb: 2048, + disk_gb: 20, }, } } @@ -53,12 +57,14 @@ mod tests { assert!(config.image.contains("ubuntu-builder")); assert_eq!(config.vcpus, 2); assert_eq!(config.memory_mb, 2048); + assert_eq!(config.disk_gb, 20); } #[test] fn defaults_for_omnios() { let config = BuilderConfig::resolve(None, &DistroFamily::OmniOS); assert!(config.image.contains("omnios-builder")); + assert_eq!(config.disk_gb, 20); } #[test] @@ -67,11 +73,13 @@ mod tests { image: Some("oci://custom/image:v1".to_string()), vcpus: Some(4), memory: Some(4096), + disk: Some(50), }; let config = BuilderConfig::resolve(Some(&node), &DistroFamily::Ubuntu); assert_eq!(config.image, "oci://custom/image:v1"); assert_eq!(config.vcpus, 4); assert_eq!(config.memory_mb, 4096); + assert_eq!(config.disk_gb, 50); } #[test] @@ -80,10 +88,12 @@ mod tests { image: None, vcpus: Some(8), memory: None, + disk: None, }; let config = BuilderConfig::resolve(Some(&node), &DistroFamily::Ubuntu); assert!(config.image.contains("ubuntu-builder")); assert_eq!(config.vcpus, 8); assert_eq!(config.memory_mb, 2048); + assert_eq!(config.disk_gb, 20); } } diff --git a/crates/forge-builder/src/error.rs b/crates/forge-builder/src/error.rs index 1832b76..3f06e3b 100644 --- a/crates/forge-builder/src/error.rs +++ b/crates/forge-builder/src/error.rs @@ -47,9 +47,9 @@ pub enum BuilderError { #[error("remote build inside builder VM failed with exit code {exit_code}")] #[diagnostic( code(forge_builder::remote_build_failed), - help("check the build output above for errors — the forger build ran inside the builder VM") + help("check the build output above for errors — the forger build ran inside the builder VM\n{detail}") )] - RemoteBuildFailed { exit_code: i32 }, + RemoteBuildFailed { exit_code: i32, detail: String }, #[error("failed to download build artifacts from builder VM: {detail}")] #[diagnostic( diff --git a/crates/forge-builder/src/lib.rs b/crates/forge-builder/src/lib.rs index 0c57341..04a7f5e 100644 --- a/crates/forge-builder/src/lib.rs +++ b/crates/forge-builder/src/lib.rs @@ -39,7 +39,13 @@ pub async fn run_in_builder( info!("Starting builder VM for remote build"); let session = lifecycle::BuilderSession::start(&config).await?; - let result = run_build_in_session(&session, &binary.path, spec_path, files_dir, output_dir, target, profiles).await; + let result = run_build_in_session(&session, spec, &binary.path, spec_path, files_dir, output_dir, target, profiles).await; + + // On failure, try to collect diagnostic info before teardown + if let Err(ref e) = result { + tracing::error!(error = %e, "Remote build failed — collecting diagnostics"); + collect_diagnostics(&session); + } // Always teardown, even on error info!("Tearing down builder VM"); @@ -50,8 +56,124 @@ pub async fn run_in_builder( result } +/// Verify the builder VM has working network connectivity (DNS + HTTP). +fn verify_network(session: &lifecycle::BuilderSession) -> Result<(), BuilderError> { + info!("Verifying network connectivity in builder VM"); + + // Check DNS resolution and HTTP connectivity + let check_cmd = "cat /etc/resolv.conf && echo '---' && \ + nslookup archive.ubuntu.com 2>&1 || host archive.ubuntu.com 2>&1 || \ + getent hosts archive.ubuntu.com 2>&1 || echo 'DNS_FAILED'"; + + let (net_stdout, _, _) = + vm_manager::ssh::exec_streaming(&session.ssh_session, check_cmd, stdout(), stderr()) + .map_err(|e| BuilderError::TransferFailed { + detail: format!("network check: {e}"), + })?; + + if net_stdout.contains("DNS_FAILED") { + tracing::warn!("DNS resolution failed in builder VM — attempting to fix resolv.conf"); + + // Try to fix DNS by writing resolv.conf with SLIRP DNS server + let fix_cmd = "echo 'nameserver 10.0.2.3' | sudo tee /etc/resolv.conf"; + let (_, _, exit_code) = + vm_manager::ssh::exec_streaming(&session.ssh_session, fix_cmd, stdout(), stderr()) + .map_err(|e| BuilderError::TransferFailed { + detail: format!("fix resolv.conf: {e}"), + })?; + + if exit_code != 0 { + return Err(BuilderError::TransferFailed { + detail: "failed to configure DNS in builder VM".to_string(), + }); + } + + // Verify DNS now works + let (verify_out, _, _) = vm_manager::ssh::exec_streaming( + &session.ssh_session, + "getent hosts archive.ubuntu.com 2>&1 || echo 'STILL_FAILED'", + stdout(), + stderr(), + ) + .map_err(|e| BuilderError::TransferFailed { + detail: format!("DNS verify: {e}"), + })?; + + if verify_out.contains("STILL_FAILED") { + return Err(BuilderError::TransferFailed { + detail: "DNS resolution still failing after fix — check VM networking".to_string(), + }); + } + } + + info!("Network connectivity verified"); + Ok(()) +} + +/// Install required build tools inside the builder VM based on distro. +fn install_build_deps( + session: &lifecycle::BuilderSession, + spec: &ImageSpec, +) -> Result<(), BuilderError> { + let distro = DistroFamily::from_distro_str(spec.distro.as_deref()); + + let install_cmd = match distro { + DistroFamily::Ubuntu => { + "sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq && \ + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + debootstrap qemu-utils parted dosfstools e2fsprogs grub-efi-amd64-bin mount" + } + DistroFamily::OmniOS => { + // OmniOS builder images should already have pkg tools; install qemu-img if missing + "sudo pkg install -q system/qemu/img || true" + } + }; + + info!("Installing build dependencies in builder VM"); + + let (_, _, exit_code) = + vm_manager::ssh::exec_streaming(&session.ssh_session, install_cmd, stdout(), stderr()) + .map_err(|e| BuilderError::TransferFailed { + detail: format!("install build deps: {e}"), + })?; + + if exit_code != 0 { + return Err(BuilderError::TransferFailed { + detail: format!("build dependency installation failed with exit code {exit_code}"), + }); + } + + Ok(()) +} + +/// Collect diagnostic information from the builder VM after a failed build. +fn collect_diagnostics(session: &lifecycle::BuilderSession) { + tracing::warn!("--- Builder VM Diagnostics ---"); + + // Try to find and print debootstrap log (debootstrap writes errors here, not stderr) + let diag_cmd = "echo '=== debootstrap log ===' && \ + sudo find /tmp -name 'debootstrap.log' -exec cat {} \\; 2>/dev/null && \ + echo '=== resolv.conf ===' && \ + cat /etc/resolv.conf 2>/dev/null && \ + echo '=== disk space ===' && \ + df -h / /tmp 2>/dev/null"; + + match vm_manager::ssh::exec_streaming( + &session.ssh_session, + diag_cmd, + stdout(), + stderr(), + ) { + Ok(_) => {} + Err(e) => tracing::warn!(error = %e, "Failed to collect diagnostics"), + } + + tracing::warn!("--- End Diagnostics ---"); +} + async fn run_build_in_session( session: &lifecycle::BuilderSession, + spec: &ImageSpec, binary_path: &Path, spec_path: &Path, files_dir: &Path, @@ -59,6 +181,12 @@ async fn run_build_in_session( target: Option<&str>, profiles: &[String], ) -> Result<(), BuilderError> { + // Verify network connectivity (DNS) before doing anything + verify_network(session)?; + + // Install build dependencies in the builder VM + install_build_deps(session, spec)?; + // Upload inputs transfer::upload_build_inputs(session, binary_path, spec_path, files_dir)?; @@ -78,14 +206,24 @@ async fn run_build_in_session( info!(cmd = %cmd, "Running build in builder VM"); // Stream output to the user's terminal - let (_, _, exit_code) = + let (build_stdout, build_stderr, exit_code) = vm_manager::ssh::exec_streaming(&session.ssh_session, &cmd, stdout(), stderr()) .map_err(|e| BuilderError::TransferFailed { detail: format!("remote exec: {e}"), })?; if exit_code != 0 { - return Err(BuilderError::RemoteBuildFailed { exit_code }); + // Include captured output in the error for better diagnostics + let detail = if !build_stderr.is_empty() { + build_stderr + } else if !build_stdout.is_empty() { + // debootstrap and some tools write to stdout, not stderr + let lines: Vec<&str> = build_stdout.lines().rev().take(20).collect(); + lines.into_iter().rev().collect::>().join("\n") + } else { + String::new() + }; + return Err(BuilderError::RemoteBuildFailed { exit_code, detail }); } // Download artifacts diff --git a/crates/forge-builder/src/lifecycle.rs b/crates/forge-builder/src/lifecycle.rs index 059b965..d687e8a 100644 --- a/crates/forge-builder/src/lifecycle.rs +++ b/crates/forge-builder/src/lifecycle.rs @@ -23,7 +23,7 @@ pub struct BuilderSession { impl BuilderSession { /// Start a builder VM: resolve image, generate SSH keys, create + boot VM, connect SSH. pub async fn start(config: &BuilderConfig) -> Result { - info!(image = %config.image, vcpus = config.vcpus, memory_mb = config.memory_mb, "Starting builder VM"); + info!(image = %config.image, vcpus = config.vcpus, memory_mb = config.memory_mb, disk_gb = config.disk_gb, "Starting builder VM"); // 1. Resolve builder image let image_path = resolve_builder_image(&config.image).await?; @@ -31,7 +31,9 @@ impl BuilderSession { // 2. Generate ephemeral SSH keypair let (pub_key, priv_pem) = generate_ssh_keypair()?; - // 3. Build cloud-config with builder user + injected pubkey + // 3. Build cloud-config with builder user + injected pubkey + disk growth + // growpart + resize_rootfs ensure the root partition expands to fill the + // resized overlay disk (cloud images ship with tiny 2GB roots). let cloud_config = format!( r#"#cloud-config users: @@ -40,6 +42,13 @@ users: shell: /bin/bash ssh_authorized_keys: - {pub_key} + +growpart: + mode: auto + devices: + - / + +resize_rootfs: true "# ); @@ -57,7 +66,7 @@ users: image_path: image_path.clone(), vcpus: config.vcpus, memory_mb: config.memory_mb, - disk_gb: None, + disk_gb: Some(config.disk_gb), network: NetworkConfig::User, cloud_init: Some(CloudInitConfig { user_data: cloud_config.into_bytes(), diff --git a/crates/forge-engine/src/phase2/qcow2_ext4.rs b/crates/forge-engine/src/phase2/qcow2_ext4.rs index eec3602..89cf8ca 100644 --- a/crates/forge-engine/src/phase2/qcow2_ext4.rs +++ b/crates/forge-engine/src/phase2/qcow2_ext4.rs @@ -65,7 +65,7 @@ pub async fn build_qcow2_ext4( crate::tools::partition::mount(runner, &root_part, mount_str).await?; // Copy staging rootfs into mounted root - copy_rootfs(staging_root, mount_dir.path())?; + copy_rootfs(staging_root, mount_dir.path(), runner).await?; info!("Step 6: Mounting EFI partition"); let efi_mount = mount_dir.path().join("boot/efi"); @@ -136,25 +136,25 @@ pub async fn build_qcow2_ext4( } /// Copy the staging rootfs into the mounted root partition. -fn copy_rootfs(src: &Path, dest: &Path) -> Result<(), ForgeError> { - for entry in walkdir::WalkDir::new(src).follow_links(false) { - let entry = entry.map_err(|e| ForgeError::Qcow2Build { +/// +/// Uses `cp -a` (archive mode) to properly preserve symlinks, permissions, +/// ownership, timestamps, and special files. This is critical for modern +/// distros with merged /usr where /lib, /bin, /sbin are symlinks. +async fn copy_rootfs( + src: &Path, + dest: &Path, + runner: &dyn ToolRunner, +) -> Result<(), ForgeError> { + let src_str = format!("{}/.", src.display()); + let dest_str = dest.to_str().unwrap(); + + runner + .run("cp", &["-a", &src_str, dest_str]) + .await + .map_err(|_| ForgeError::Qcow2Build { step: "copy_rootfs".to_string(), - detail: e.to_string(), + detail: format!("cp -a {}/. -> {}", src.display(), dest.display()), })?; - let rel = entry.path().strip_prefix(src).unwrap_or(entry.path()); - let target = dest.join(rel); - - if entry.path().is_dir() { - std::fs::create_dir_all(&target)?; - } else if entry.path().is_file() { - if let Some(parent) = target.parent() { - std::fs::create_dir_all(parent)?; - } - std::fs::copy(entry.path(), &target)?; - } - } - Ok(()) } diff --git a/crates/spec-parser/src/lib.rs b/crates/spec-parser/src/lib.rs index 6917a52..fbb055c 100644 --- a/crates/spec-parser/src/lib.rs +++ b/crates/spec-parser/src/lib.rs @@ -249,6 +249,7 @@ mod tests { image "oci://ghcr.io/custom/builder:v1" vcpus 4 memory 4096 + disk 50 } "#; @@ -257,6 +258,7 @@ mod tests { assert_eq!(builder.image.as_deref(), Some("oci://ghcr.io/custom/builder:v1")); assert_eq!(builder.vcpus, Some(4)); assert_eq!(builder.memory, Some(4096)); + assert_eq!(builder.disk, Some(50)); } #[test] @@ -290,6 +292,7 @@ mod tests { assert_eq!(builder.image, None); assert_eq!(builder.vcpus, None); assert_eq!(builder.memory, None); + assert_eq!(builder.disk, None); } #[test] diff --git a/crates/spec-parser/src/schema.rs b/crates/spec-parser/src/schema.rs index e0c36d2..222651c 100644 --- a/crates/spec-parser/src/schema.rs +++ b/crates/spec-parser/src/schema.rs @@ -74,6 +74,9 @@ pub struct BuilderNode { #[knuffel(child, unwrap(argument))] pub memory: Option, + + #[knuffel(child, unwrap(argument))] + pub disk: Option, } #[derive(Debug, Decode)] diff --git a/images/ubuntu-rust-ci.kdl b/images/ubuntu-rust-ci.kdl index 24363e6..5143435 100644 --- a/images/ubuntu-rust-ci.kdl +++ b/images/ubuntu-rust-ci.kdl @@ -30,6 +30,11 @@ overlays { ensure-dir "/home/ci" owner="ci" group="ci" mode="755" } +builder { + vcpus 4 + memory 4096 +} + target "qcow2" kind="qcow2" { disk-size "8G" bootloader "grub"