Fix builder VM disk sizing, rootfs copy, and diagnostics

- Add disk_gb field to BuilderNode/BuilderConfig with 20GB default,
  fixing debootstrap failure caused by 2GB cloud image running out of
  space. Cloud-init growpart/resize_rootfs expand the partition.

- Replace walkdir-based copy_rootfs with cp -a to preserve symlinks,
  fixing grub-install failure caused by broken merged-/usr symlinks
  (/lib, /bin, /sbin -> /usr/*) in modern Ubuntu.

- Add network verification step that checks DNS before building and
  auto-fixes resolv.conf with SLIRP DNS (10.0.2.3) if needed.

- Add diagnostic collection on failure (debootstrap log, resolv.conf,
  disk space) before VM teardown.

- Include build stderr/stdout in RemoteBuildFailed error for better
  error reporting.

- Install build dependencies (debootstrap, qemu-utils, etc.) inside
  the builder VM before running the build.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Till Wegmueller 2026-02-15 18:01:49 +01:00
parent 19c8379fc6
commit d24dcc0363
No known key found for this signature in database
9 changed files with 195 additions and 26 deletions

1
.gitignore vendored
View file

@ -1 +1,2 @@
/target /target
/output

View file

@ -9,6 +9,8 @@ pub struct BuilderConfig {
pub vcpus: u16, pub vcpus: u16,
/// Memory in MB for the builder VM. /// Memory in MB for the builder VM.
pub memory_mb: u64, pub memory_mb: u64,
/// Disk size in GB for the builder VM overlay.
pub disk_gb: u32,
} }
impl BuilderConfig { impl BuilderConfig {
@ -22,11 +24,13 @@ impl BuilderConfig {
image: node.image.clone().unwrap_or(default_image), image: node.image.clone().unwrap_or(default_image),
vcpus: node.vcpus.unwrap_or(2), vcpus: node.vcpus.unwrap_or(2),
memory_mb: node.memory.unwrap_or(2048), memory_mb: node.memory.unwrap_or(2048),
disk_gb: node.disk.unwrap_or(20),
}, },
None => Self { None => Self {
image: default_image, image: default_image,
vcpus: 2, vcpus: 2,
memory_mb: 2048, memory_mb: 2048,
disk_gb: 20,
}, },
} }
} }
@ -53,12 +57,14 @@ mod tests {
assert!(config.image.contains("ubuntu-builder")); assert!(config.image.contains("ubuntu-builder"));
assert_eq!(config.vcpus, 2); assert_eq!(config.vcpus, 2);
assert_eq!(config.memory_mb, 2048); assert_eq!(config.memory_mb, 2048);
assert_eq!(config.disk_gb, 20);
} }
#[test] #[test]
fn defaults_for_omnios() { fn defaults_for_omnios() {
let config = BuilderConfig::resolve(None, &DistroFamily::OmniOS); let config = BuilderConfig::resolve(None, &DistroFamily::OmniOS);
assert!(config.image.contains("omnios-builder")); assert!(config.image.contains("omnios-builder"));
assert_eq!(config.disk_gb, 20);
} }
#[test] #[test]
@ -67,11 +73,13 @@ mod tests {
image: Some("oci://custom/image:v1".to_string()), image: Some("oci://custom/image:v1".to_string()),
vcpus: Some(4), vcpus: Some(4),
memory: Some(4096), memory: Some(4096),
disk: Some(50),
}; };
let config = BuilderConfig::resolve(Some(&node), &DistroFamily::Ubuntu); let config = BuilderConfig::resolve(Some(&node), &DistroFamily::Ubuntu);
assert_eq!(config.image, "oci://custom/image:v1"); assert_eq!(config.image, "oci://custom/image:v1");
assert_eq!(config.vcpus, 4); assert_eq!(config.vcpus, 4);
assert_eq!(config.memory_mb, 4096); assert_eq!(config.memory_mb, 4096);
assert_eq!(config.disk_gb, 50);
} }
#[test] #[test]
@ -80,10 +88,12 @@ mod tests {
image: None, image: None,
vcpus: Some(8), vcpus: Some(8),
memory: None, memory: None,
disk: None,
}; };
let config = BuilderConfig::resolve(Some(&node), &DistroFamily::Ubuntu); let config = BuilderConfig::resolve(Some(&node), &DistroFamily::Ubuntu);
assert!(config.image.contains("ubuntu-builder")); assert!(config.image.contains("ubuntu-builder"));
assert_eq!(config.vcpus, 8); assert_eq!(config.vcpus, 8);
assert_eq!(config.memory_mb, 2048); assert_eq!(config.memory_mb, 2048);
assert_eq!(config.disk_gb, 20);
} }
} }

View file

@ -47,9 +47,9 @@ pub enum BuilderError {
#[error("remote build inside builder VM failed with exit code {exit_code}")] #[error("remote build inside builder VM failed with exit code {exit_code}")]
#[diagnostic( #[diagnostic(
code(forge_builder::remote_build_failed), code(forge_builder::remote_build_failed),
help("check the build output above for errors — the forger build ran inside the builder VM") help("check the build output above for errors — the forger build ran inside the builder VM\n{detail}")
)] )]
RemoteBuildFailed { exit_code: i32 }, RemoteBuildFailed { exit_code: i32, detail: String },
#[error("failed to download build artifacts from builder VM: {detail}")] #[error("failed to download build artifacts from builder VM: {detail}")]
#[diagnostic( #[diagnostic(

View file

@ -39,7 +39,13 @@ pub async fn run_in_builder(
info!("Starting builder VM for remote build"); info!("Starting builder VM for remote build");
let session = lifecycle::BuilderSession::start(&config).await?; let session = lifecycle::BuilderSession::start(&config).await?;
let result = run_build_in_session(&session, &binary.path, spec_path, files_dir, output_dir, target, profiles).await; let result = run_build_in_session(&session, spec, &binary.path, spec_path, files_dir, output_dir, target, profiles).await;
// On failure, try to collect diagnostic info before teardown
if let Err(ref e) = result {
tracing::error!(error = %e, "Remote build failed — collecting diagnostics");
collect_diagnostics(&session);
}
// Always teardown, even on error // Always teardown, even on error
info!("Tearing down builder VM"); info!("Tearing down builder VM");
@ -50,8 +56,124 @@ pub async fn run_in_builder(
result result
} }
/// Confirm that DNS resolution works inside the builder VM, repairing it once if needed.
///
/// The probe prints `/etc/resolv.conf` and then tries `nslookup`, `host` and
/// `getent hosts` against `archive.ubuntu.com`; the `DNS_FAILED` marker is only
/// printed when every lookup in the chain fails. When the marker shows up,
/// `/etc/resolv.conf` is overwritten with `nameserver 10.0.2.3` (the SLIRP DNS
/// server) and a single `getent hosts` lookup is retried.
///
/// Only name resolution is probed; no HTTP request is made.
///
/// # Errors
///
/// Returns [`BuilderError::TransferFailed`] when an SSH command cannot be
/// executed, when rewriting `resolv.conf` exits non-zero, or when resolution
/// still fails after the rewrite.
fn verify_network(session: &lifecycle::BuilderSession) -> Result<(), BuilderError> {
    // Run one command over the session's SSH connection; `what` prefixes the
    // error detail if the command could not be executed at all.
    let run = |cmd: &str, what: &str| {
        vm_manager::ssh::exec_streaming(&session.ssh_session, cmd, stdout(), stderr()).map_err(
            |e| BuilderError::TransferFailed {
                detail: format!("{what}: {e}"),
            },
        )
    };

    info!("Verifying network connectivity in builder VM");

    // The exit status is irrelevant here: the chain ends in `|| echo`, so the
    // outcome is read from the presence of the marker in the captured stdout.
    let (probe_out, _, _) = run(
        "cat /etc/resolv.conf && echo '---' && \
         nslookup archive.ubuntu.com 2>&1 || host archive.ubuntu.com 2>&1 || \
         getent hosts archive.ubuntu.com 2>&1 || echo 'DNS_FAILED'",
        "network check",
    )?;

    if !probe_out.contains("DNS_FAILED") {
        info!("Network connectivity verified");
        return Ok(());
    }

    tracing::warn!("DNS resolution failed in builder VM — attempting to fix resolv.conf");

    // Point the resolver at the SLIRP DNS server.
    let (_, _, fix_status) = run(
        "echo 'nameserver 10.0.2.3' | sudo tee /etc/resolv.conf",
        "fix resolv.conf",
    )?;
    if fix_status != 0 {
        return Err(BuilderError::TransferFailed {
            detail: "failed to configure DNS in builder VM".to_string(),
        });
    }

    // Re-run a single lookup to confirm the rewrite actually helped.
    let (recheck_out, _, _) = run(
        "getent hosts archive.ubuntu.com 2>&1 || echo 'STILL_FAILED'",
        "DNS verify",
    )?;
    if recheck_out.contains("STILL_FAILED") {
        return Err(BuilderError::TransferFailed {
            detail: "DNS resolution still failing after fix — check VM networking".to_string(),
        });
    }

    info!("Network connectivity verified");
    Ok(())
}
/// Provision the builder VM with the tools the image build needs.
///
/// The install command is chosen from the distro family derived from
/// `spec.distro`:
///
/// * Ubuntu — refreshes the apt index and installs `debootstrap`, `qemu-utils`,
///   `parted`, `dosfstools`, `e2fsprogs`, `grub-efi-amd64-bin` and `mount`
///   non-interactively.
/// * OmniOS — attempts `pkg install` of `system/qemu/img`; the trailing
///   `|| true` means a failing install does not fail this step.
///
/// # Errors
///
/// Returns [`BuilderError::TransferFailed`] when the SSH command cannot be
/// executed or when it exits with a non-zero status.
fn install_build_deps(
    session: &lifecycle::BuilderSession,
    spec: &ImageSpec,
) -> Result<(), BuilderError> {
    let install_cmd = match DistroFamily::from_distro_str(spec.distro.as_deref()) {
        DistroFamily::Ubuntu => concat!(
            "sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq && ",
            "sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq ",
            "debootstrap qemu-utils parted dosfstools e2fsprogs grub-efi-amd64-bin mount",
        ),
        // OmniOS builder images should already have pkg tools; install qemu-img if missing
        DistroFamily::OmniOS => "sudo pkg install -q system/qemu/img || true",
    };

    info!("Installing build dependencies in builder VM");

    let (_, _, status) =
        vm_manager::ssh::exec_streaming(&session.ssh_session, install_cmd, stdout(), stderr())
            .map_err(|e| BuilderError::TransferFailed {
                detail: format!("install build deps: {e}"),
            })?;

    match status {
        0 => Ok(()),
        code => Err(BuilderError::TransferFailed {
            detail: format!("build dependency installation failed with exit code {code}"),
        }),
    }
}
/// Collect diagnostic information from the builder VM after a failed build.
fn collect_diagnostics(session: &lifecycle::BuilderSession) {
tracing::warn!("--- Builder VM Diagnostics ---");
// Try to find and print debootstrap log (debootstrap writes errors here, not stderr)
let diag_cmd = "echo '=== debootstrap log ===' && \
sudo find /tmp -name 'debootstrap.log' -exec cat {} \\; 2>/dev/null && \
echo '=== resolv.conf ===' && \
cat /etc/resolv.conf 2>/dev/null && \
echo '=== disk space ===' && \
df -h / /tmp 2>/dev/null";
match vm_manager::ssh::exec_streaming(
&session.ssh_session,
diag_cmd,
stdout(),
stderr(),
) {
Ok(_) => {}
Err(e) => tracing::warn!(error = %e, "Failed to collect diagnostics"),
}
tracing::warn!("--- End Diagnostics ---");
}
async fn run_build_in_session( async fn run_build_in_session(
session: &lifecycle::BuilderSession, session: &lifecycle::BuilderSession,
spec: &ImageSpec,
binary_path: &Path, binary_path: &Path,
spec_path: &Path, spec_path: &Path,
files_dir: &Path, files_dir: &Path,
@ -59,6 +181,12 @@ async fn run_build_in_session(
target: Option<&str>, target: Option<&str>,
profiles: &[String], profiles: &[String],
) -> Result<(), BuilderError> { ) -> Result<(), BuilderError> {
// Verify network connectivity (DNS) before doing anything
verify_network(session)?;
// Install build dependencies in the builder VM
install_build_deps(session, spec)?;
// Upload inputs // Upload inputs
transfer::upload_build_inputs(session, binary_path, spec_path, files_dir)?; transfer::upload_build_inputs(session, binary_path, spec_path, files_dir)?;
@ -78,14 +206,24 @@ async fn run_build_in_session(
info!(cmd = %cmd, "Running build in builder VM"); info!(cmd = %cmd, "Running build in builder VM");
// Stream output to the user's terminal // Stream output to the user's terminal
let (_, _, exit_code) = let (build_stdout, build_stderr, exit_code) =
vm_manager::ssh::exec_streaming(&session.ssh_session, &cmd, stdout(), stderr()) vm_manager::ssh::exec_streaming(&session.ssh_session, &cmd, stdout(), stderr())
.map_err(|e| BuilderError::TransferFailed { .map_err(|e| BuilderError::TransferFailed {
detail: format!("remote exec: {e}"), detail: format!("remote exec: {e}"),
})?; })?;
if exit_code != 0 { if exit_code != 0 {
return Err(BuilderError::RemoteBuildFailed { exit_code }); // Include captured output in the error for better diagnostics
let detail = if !build_stderr.is_empty() {
build_stderr
} else if !build_stdout.is_empty() {
// debootstrap and some tools write to stdout, not stderr
let lines: Vec<&str> = build_stdout.lines().rev().take(20).collect();
lines.into_iter().rev().collect::<Vec<_>>().join("\n")
} else {
String::new()
};
return Err(BuilderError::RemoteBuildFailed { exit_code, detail });
} }
// Download artifacts // Download artifacts

View file

@ -23,7 +23,7 @@ pub struct BuilderSession {
impl BuilderSession { impl BuilderSession {
/// Start a builder VM: resolve image, generate SSH keys, create + boot VM, connect SSH. /// Start a builder VM: resolve image, generate SSH keys, create + boot VM, connect SSH.
pub async fn start(config: &BuilderConfig) -> Result<Self, BuilderError> { pub async fn start(config: &BuilderConfig) -> Result<Self, BuilderError> {
info!(image = %config.image, vcpus = config.vcpus, memory_mb = config.memory_mb, "Starting builder VM"); info!(image = %config.image, vcpus = config.vcpus, memory_mb = config.memory_mb, disk_gb = config.disk_gb, "Starting builder VM");
// 1. Resolve builder image // 1. Resolve builder image
let image_path = resolve_builder_image(&config.image).await?; let image_path = resolve_builder_image(&config.image).await?;
@ -31,7 +31,9 @@ impl BuilderSession {
// 2. Generate ephemeral SSH keypair // 2. Generate ephemeral SSH keypair
let (pub_key, priv_pem) = generate_ssh_keypair()?; let (pub_key, priv_pem) = generate_ssh_keypair()?;
// 3. Build cloud-config with builder user + injected pubkey // 3. Build cloud-config with builder user + injected pubkey + disk growth
// growpart + resize_rootfs ensure the root partition expands to fill the
// resized overlay disk (cloud images ship with tiny 2GB roots).
let cloud_config = format!( let cloud_config = format!(
r#"#cloud-config r#"#cloud-config
users: users:
@ -40,6 +42,13 @@ users:
shell: /bin/bash shell: /bin/bash
ssh_authorized_keys: ssh_authorized_keys:
- {pub_key} - {pub_key}
growpart:
mode: auto
devices:
- /
resize_rootfs: true
"# "#
); );
@ -57,7 +66,7 @@ users:
image_path: image_path.clone(), image_path: image_path.clone(),
vcpus: config.vcpus, vcpus: config.vcpus,
memory_mb: config.memory_mb, memory_mb: config.memory_mb,
disk_gb: None, disk_gb: Some(config.disk_gb),
network: NetworkConfig::User, network: NetworkConfig::User,
cloud_init: Some(CloudInitConfig { cloud_init: Some(CloudInitConfig {
user_data: cloud_config.into_bytes(), user_data: cloud_config.into_bytes(),

View file

@ -65,7 +65,7 @@ pub async fn build_qcow2_ext4(
crate::tools::partition::mount(runner, &root_part, mount_str).await?; crate::tools::partition::mount(runner, &root_part, mount_str).await?;
// Copy staging rootfs into mounted root // Copy staging rootfs into mounted root
copy_rootfs(staging_root, mount_dir.path())?; copy_rootfs(staging_root, mount_dir.path(), runner).await?;
info!("Step 6: Mounting EFI partition"); info!("Step 6: Mounting EFI partition");
let efi_mount = mount_dir.path().join("boot/efi"); let efi_mount = mount_dir.path().join("boot/efi");
@ -136,25 +136,25 @@ pub async fn build_qcow2_ext4(
} }
/// Copy the staging rootfs into the mounted root partition. /// Copy the staging rootfs into the mounted root partition.
fn copy_rootfs(src: &Path, dest: &Path) -> Result<(), ForgeError> { ///
for entry in walkdir::WalkDir::new(src).follow_links(false) { /// Uses `cp -a` (archive mode) to properly preserve symlinks, permissions,
let entry = entry.map_err(|e| ForgeError::Qcow2Build { /// ownership, timestamps, and special files. This is critical for modern
/// distros with merged /usr where /lib, /bin, /sbin are symlinks.
async fn copy_rootfs(
src: &Path,
dest: &Path,
runner: &dyn ToolRunner,
) -> Result<(), ForgeError> {
let src_str = format!("{}/.", src.display());
let dest_str = dest.to_str().unwrap();
runner
.run("cp", &["-a", &src_str, dest_str])
.await
.map_err(|_| ForgeError::Qcow2Build {
step: "copy_rootfs".to_string(), step: "copy_rootfs".to_string(),
detail: e.to_string(), detail: format!("cp -a {}/. -> {}", src.display(), dest.display()),
})?; })?;
let rel = entry.path().strip_prefix(src).unwrap_or(entry.path());
let target = dest.join(rel);
if entry.path().is_dir() {
std::fs::create_dir_all(&target)?;
} else if entry.path().is_file() {
if let Some(parent) = target.parent() {
std::fs::create_dir_all(parent)?;
}
std::fs::copy(entry.path(), &target)?;
}
}
Ok(()) Ok(())
} }

View file

@ -249,6 +249,7 @@ mod tests {
image "oci://ghcr.io/custom/builder:v1" image "oci://ghcr.io/custom/builder:v1"
vcpus 4 vcpus 4
memory 4096 memory 4096
disk 50
} }
"#; "#;
@ -257,6 +258,7 @@ mod tests {
assert_eq!(builder.image.as_deref(), Some("oci://ghcr.io/custom/builder:v1")); assert_eq!(builder.image.as_deref(), Some("oci://ghcr.io/custom/builder:v1"));
assert_eq!(builder.vcpus, Some(4)); assert_eq!(builder.vcpus, Some(4));
assert_eq!(builder.memory, Some(4096)); assert_eq!(builder.memory, Some(4096));
assert_eq!(builder.disk, Some(50));
} }
#[test] #[test]
@ -290,6 +292,7 @@ mod tests {
assert_eq!(builder.image, None); assert_eq!(builder.image, None);
assert_eq!(builder.vcpus, None); assert_eq!(builder.vcpus, None);
assert_eq!(builder.memory, None); assert_eq!(builder.memory, None);
assert_eq!(builder.disk, None);
} }
#[test] #[test]

View file

@ -74,6 +74,9 @@ pub struct BuilderNode {
#[knuffel(child, unwrap(argument))] #[knuffel(child, unwrap(argument))]
pub memory: Option<u64>, pub memory: Option<u64>,
#[knuffel(child, unwrap(argument))]
pub disk: Option<u32>,
} }
#[derive(Debug, Decode)] #[derive(Debug, Decode)]

View file

@ -30,6 +30,11 @@ overlays {
ensure-dir "/home/ci" owner="ci" group="ci" mode="755" ensure-dir "/home/ci" owner="ci" group="ci" mode="755"
} }
builder {
vcpus 4
memory 4096
}
target "qcow2" kind="qcow2" { target "qcow2" kind="qcow2" {
disk-size "8G" disk-size "8G"
bootloader "grub" bootloader "grub"