From 6ead33740dbe98e8fd96a51ba8c4b58063c14a7d Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:30:25 -0500 Subject: [PATCH 1/5] fix(docker): reap orphan processes via tini + init, cap pids The node runs as PID 1 in the container and never reaps reparented orphans, so git grandchildren (e.g. pack-objects left behind when a served clone disconnects) accumulate as zombies until fork() fails with EAGAIN. Run under tini and set compose init: true so reparented orphans are reaped, and add pids_limit as a blast-radius cap. Refs #53 --- Dockerfile | 9 ++++++++- docker-compose.yml | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 962b48d..e9357c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,6 +44,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ ca-certificates \ curl \ + tini \ && rm -rf /var/lib/apt/lists/* # Non-root user for runtime @@ -72,4 +73,10 @@ VOLUME ["/data"] HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ CMD curl -fsS http://127.0.0.1:${GITLAWB_PORT}/health || exit 1 -ENTRYPOINT ["gitlawb-node"] +# Run under tini so the node is never PID 1. As PID 1 a process must reap +# reparented orphans itself; the node does not, so git's reparented grandchildren +# (e.g. pack-objects orphaned when a served upload-pack dies on a client +# disconnect) would accumulate as zombies until fork() fails with EAGAIN (#53). +# tini reaps them. `-g` forwards signals to the child's process group so graceful +# shutdown still reaches the node. +ENTRYPOINT ["tini", "-g", "--", "gitlawb-node"] diff --git a/docker-compose.yml b/docker-compose.yml index f56aab0..58f1e0c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,6 +19,14 @@ services: node: build: . + # Reap reparented orphan processes so git's grandchildren (e.g. pack-objects + # left behind when a served clone disconnects) can't accumulate as zombies + # and exhaust the PID table (#53). The image also runs under tini, so this is + # belt-and-suspenders for the quickstart. + init: true + # Cap the process table so a regression fails loudly instead of taking the + # host down. Sits well above normal concurrent git fan-out. + pids_limit: 1024 depends_on: postgres: condition: service_healthy From b8868ccb69e21ab03c25af72bc63b15e9145e89c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:41:47 -0500 Subject: [PATCH 2/5] fix(node): reap leaked git child processes (#53) Serving git over smart-HTTP could leak zombie processes that, over weeks, exhaust the PID table until fork() fails with EAGAIN and all git operations break until restart. The serve path (run_git_service) now runs git in its own process group and SIGKILLs that group when the request future is dropped (client disconnect) or returns early, so git and its pack-objects child are torn down together instead of orphaning the grandchild to PID 1 as a zombie. A drop guard, disarmed once the request completes cleanly, owns this teardown. create_issue and build_filtered_pack spawned a std Child and dropped it unwaited if the stdin write failed (and build_filtered_pack could panic on a missing stdin handle). Both now always wait the child before surfacing any error, so the failure path cannot leak. Adds libc for the group-kill syscall and three regression tests covering direct-child kill, grandchild reaping via the group, and the disarmed no-kill path. Refs #53 --- Cargo.lock | 1 + crates/gitlawb-node/Cargo.toml | 1 + crates/gitlawb-node/src/git/issues.rs | 17 +- crates/gitlawb-node/src/git/smart_http.rs | 193 +++++++++++++++++++++- 4 files changed, 198 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3cde378..c40f31d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3295,6 +3295,7 @@ dependencies = [ "hex", "hmac", "http-body-util", + "libc", "libp2p-core", "libp2p-dns", "libp2p-gossipsub", diff --git a/crates/gitlawb-node/Cargo.toml b/crates/gitlawb-node/Cargo.toml index 5f10ec9..23fa399 100644 --- a/crates/gitlawb-node/Cargo.toml +++ b/crates/gitlawb-node/Cargo.toml @@ -29,6 +29,7 @@ tower-http = { version = "0.6", features = ["cors", "trace", "request-id", "limi sqlx = { version = "0.8", features = ["postgres", "runtime-tokio-rustls", "chrono", "uuid"] } clap = { version = "4", features = ["derive", "env"] } bytes = "1" +libc = "0.2" cid = { workspace = true } hex = { workspace = true } sha2 = { workspace = true } diff --git a/crates/gitlawb-node/src/git/issues.rs b/crates/gitlawb-node/src/git/issues.rs index 58984a0..7309830 100644 --- a/crates/gitlawb-node/src/git/issues.rs +++ b/crates/gitlawb-node/src/git/issues.rs @@ -23,13 +23,18 @@ pub fn create_issue(repo_path: &Path, issue_id: &str, json: &str) -> Result<()> use std::io::Write; let mut child = hash_output; - if let Some(stdin) = child.stdin.take() { - let mut stdin = stdin; - stdin - .write_all(json.as_bytes()) - .context("failed to write to git hash-object stdin")?; - } + // Write the blob to stdin, but always reap the child afterward even if the + // write fails, so a stdin-write error can't drop the Child unwaited and leak + // a zombie (#53). + let write_result = match child.stdin.take() { + Some(mut stdin) => stdin.write_all(json.as_bytes()), + None => Err(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "git hash-object stdin unavailable", + )), + }; let output = child.wait_with_output().context("git hash-object failed")?; + write_result.context("failed to write to git hash-object stdin")?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index 80374fb..d183555 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -75,15 +75,64 @@ pub async fn receive_pack(repo_path: &Path, request_body: Bytes) -> Result, +} + +#[cfg(unix)] +impl KillGroupOnDrop { + fn disarm(&mut self) { + self.pgid = None; + } +} + +#[cfg(unix)] +impl Drop for KillGroupOnDrop { + fn drop(&mut self) { + if let Some(pgid) = self.pgid { + // SAFETY: kill(2) is async-signal-safe and takes no borrowed memory. + // Signalling a stale group just returns ESRCH, which we ignore. + unsafe { + libc::kill(-pgid, libc::SIGKILL); + } + } + } +} + async fn run_git_service(service: &str, repo_path: &Path, input: Bytes) -> Result> { - let mut child = Command::new("git") + let mut command = Command::new("git"); + command .arg(service_to_command(service)) .arg("--stateless-rpc") .arg(repo_path) .stdin(Stdio::piped()) .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn()?; + .stderr(Stdio::piped()); + + // Run git in its own process group so the whole tree (git + pack-objects) + // can be killed together on disconnect rather than orphaning a grandchild. + #[cfg(unix)] + command.process_group(0); + + let mut child = command.spawn()?; + + // Arm the group-kill guard for the lifetime of the request. With + // process_group(0) the child is its own group leader, so pgid == its pid. + #[cfg(unix)] + let mut group_guard = KillGroupOnDrop { + pgid: child.id().map(|id| id as i32), + }; // Write request body to git's stdin if let Some(mut stdin) = child.stdin.take() { @@ -92,6 +141,10 @@ async fn run_git_service(service: &str, repo_path: &Path, input: Bytes) -> Resul let output = child.wait_with_output().await?; + // Clean completion: git and its group have exited, nothing to kill. + #[cfg(unix)] + group_guard.disarm(); + if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); bail!("{service} failed: {stderr}"); @@ -152,13 +205,25 @@ pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Resu .stdout(Stdio::piped()) .stderr(Stdio::piped()) .spawn()?; - { + // Feed the object ids on stdin, but always reap the child afterward even if + // the write fails or stdin is missing, so an error can't drop the Child + // unwaited and leak a zombie (#53). + let write_result: std::io::Result<()> = { use std::io::Write as _; - let mut stdin = child.stdin.take().expect("stdin"); - stdin.write_all(keep.join("\n").as_bytes())?; - stdin.write_all(b"\n")?; - } + match child.stdin.take() { + Some(mut stdin) => { + let mut data = keep.join("\n").into_bytes(); + data.push(b'\n'); + stdin.write_all(&data) + } + None => Err(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "git pack-objects stdin unavailable", + )), + } + }; let out = child.wait_with_output()?; + write_result.context("failed to write object ids to git pack-objects stdin")?; if !out.status.success() { bail!( "git pack-objects failed: {}", @@ -688,4 +753,116 @@ mod tests { server.abort(); } + + // ── #53 regression: served-git process-group teardown ────────────────── + // + // run_git_service runs git in its own process group and SIGKILLs that group + // when the request future is dropped (client disconnect) or returns early, so + // git AND its pack-objects child die together instead of orphaning a zombie. + // These exercise the KillGroupOnDrop guard that wires that up. + + #[cfg(unix)] + fn alive(pid: i32) -> bool { + // kill(pid, 0) probes existence: 0/EPERM => exists, ESRCH => gone. + unsafe { libc::kill(pid, 0) == 0 } + } + + #[cfg(unix)] + #[tokio::test] + async fn kill_group_guard_terminates_child_on_drop() { + let mut child = tokio::process::Command::new("sleep") + .arg("300") + .process_group(0) + .spawn() + .unwrap(); + let pgid = child.id().map(|id| id as i32); + + { + let _guard = KillGroupOnDrop { pgid }; + } // guard drops here -> SIGKILL to the group + + use std::os::unix::process::ExitStatusExt; + let status = child.wait().await.unwrap(); + assert_eq!( + status.signal(), + Some(libc::SIGKILL), + "child must be killed by SIGKILL via its process group" + ); + } + + #[cfg(unix)] + #[tokio::test] + async fn kill_group_guard_reaps_grandchild_on_drop() { + // The #53 scenario: git forks pack-objects. A group kill must reach the + // grandchild, not just the direct child. sh (the group leader) backgrounds + // a sleep (standing in for pack-objects) and prints its pid. + use tokio::io::AsyncReadExt; + let mut child = tokio::process::Command::new("sh") + .arg("-c") + .arg("sleep 300 & echo \"$!\"; wait") + .stdout(Stdio::piped()) + .process_group(0) + .spawn() + .unwrap(); + let pgid = child.id().map(|id| id as i32); + + // Read the backgrounded grandchild's pid from the first stdout line. + let mut stdout = child.stdout.take().unwrap(); + let mut buf = Vec::new(); + loop { + let mut byte = [0u8; 1]; + let n = stdout.read(&mut byte).await.unwrap(); + if n == 0 || byte[0] == b'\n' { + break; + } + buf.push(byte[0]); + } + let grandchild: i32 = String::from_utf8(buf).unwrap().trim().parse().unwrap(); + assert!(alive(grandchild), "grandchild should be running"); + + { + let _guard = KillGroupOnDrop { pgid }; + } // group SIGKILL kills sh AND the sleep grandchild + + let _ = child.wait().await; // reap sh + + // The grandchild reparents to init and is reaped; poll until it's gone. + let mut gone = false; + for _ in 0..200 { + if !alive(grandchild) { + gone = true; + break; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + assert!(gone, "grandchild must be killed by the group SIGKILL (#53)"); + } + + #[cfg(unix)] + #[tokio::test] + async fn kill_group_guard_disarmed_does_not_kill() { + // A request that completed cleanly disarms the guard; dropping it must not + // signal anything. + let mut child = tokio::process::Command::new("sleep") + .arg("300") + .process_group(0) + .spawn() + .unwrap(); + + { + let mut guard = KillGroupOnDrop { + pgid: child.id().map(|id| id as i32), + }; + guard.disarm(); + } // disarmed -> no kill + + assert!( + child.try_wait().unwrap().is_none(), + "disarmed guard must not kill the child" + ); + + // Clean up the still-running child. + let _ = child.kill().await; + let _ = child.wait().await; + } } From 7e0a98728418c94c9fec1ebda8796300595effeb Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 19 Jun 2026 10:00:37 -0500 Subject: [PATCH 3/5] fix(review): reap-then-disarm on serve-path errors; tini/comment nits Address code-review findings on the #53 fix: - run_git_service no longer early-returns on a stdin-write error. It captures the write result, always reaps the child, then disarms the group guard before surfacing the error. Previously a write/wait error path could drop the still-armed guard after the child was already reaped, so the group SIGKILL could in theory land on a reused pgid. (correctness, reliability, adversarial all flagged this region.) - Drop tini's -g flag: the node manages its own children and graceful shutdown, so group-wide signalling would only disturb in-flight git helpers on shutdown. - Correct the SAFETY comment (kill(2) is not relevant to async-signal-safety here) and the alive() test-helper comment (EPERM does not read as alive). --- Dockerfile | 7 ++++--- crates/gitlawb-node/src/git/smart_http.rs | 22 ++++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index e9357c9..55ef0c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,6 +77,7 @@ HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ # reparented orphans itself; the node does not, so git's reparented grandchildren # (e.g. pack-objects orphaned when a served upload-pack dies on a client # disconnect) would accumulate as zombies until fork() fails with EAGAIN (#53). -# tini reaps them. `-g` forwards signals to the child's process group so graceful -# shutdown still reaches the node. -ENTRYPOINT ["tini", "-g", "--", "gitlawb-node"] +# tini reaps them and forwards SIGTERM/SIGINT to the node, which runs its own +# graceful shutdown. (No `-g`: the node manages its own children, so group-wide +# signalling would only disturb in-flight git helpers during shutdown.) +ENTRYPOINT ["tini", "--", "gitlawb-node"] diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index d183555..3d17d64 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -101,8 +101,8 @@ impl KillGroupOnDrop { impl Drop for KillGroupOnDrop { fn drop(&mut self) { if let Some(pgid) = self.pgid { - // SAFETY: kill(2) is async-signal-safe and takes no borrowed memory. - // Signalling a stale group just returns ESRCH, which we ignore. + // SAFETY: kill(2) takes only integer arguments and borrows no Rust + // memory. Signalling a stale group just returns ESRCH, which we ignore. unsafe { libc::kill(-pgid, libc::SIGKILL); } @@ -134,17 +134,22 @@ async fn run_git_service(service: &str, repo_path: &Path, input: Bytes) -> Resul pgid: child.id().map(|id| id as i32), }; - // Write request body to git's stdin - if let Some(mut stdin) = child.stdin.take() { - stdin.write_all(&input).await?; - } + // Write the request body to git's stdin, but don't early-return on a write + // error: always reap the child first (below), so the guard only ever fires on + // an actual future-drop (client disconnect), never on a pid we just reaped. + let write_result: std::io::Result<()> = match child.stdin.take() { + Some(mut stdin) => stdin.write_all(&input).await, + None => Ok(()), + }; let output = child.wait_with_output().await?; - // Clean completion: git and its group have exited, nothing to kill. + // Child reaped, so its group is gone: disarm before surfacing any error. #[cfg(unix)] group_guard.disarm(); + write_result.context("failed to write to git stdin")?; + if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); bail!("{service} failed: {stderr}"); @@ -763,7 +768,8 @@ mod tests { #[cfg(unix)] fn alive(pid: i32) -> bool { - // kill(pid, 0) probes existence: 0/EPERM => exists, ESRCH => gone. + // kill(pid, 0) probes existence: returns 0 while the pid exists, -1 once + // it's gone (ESRCH). Same-uid here, so EPERM never applies. unsafe { libc::kill(pid, 0) == 0 } } From 610f93e45562265438c2ac910e0ca779f7a094b2 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 19 Jun 2026 10:07:44 -0500 Subject: [PATCH 4/5] fix(node): SIGTERM (not SIGKILL) the served-git group on disconnect SIGKILL gave git no chance to run its signal cleanup, so an aborted receive-pack could leave a stale .git/*.lock that blocks the next push. SIGTERM lets git remove its lock files before exiting. Signalling the whole process group still reaches pack-objects directly, so the grandchild is terminated rather than orphaned, and a single immediate signal avoids any delayed-kill pid-reuse window. Refs #53 --- crates/gitlawb-node/src/git/smart_http.rs | 29 +++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index 3d17d64..e4b296c 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -75,16 +75,18 @@ pub async fn receive_pack(repo_path: &Path, request_body: Bytes) -> Result, @@ -104,7 +106,7 @@ impl Drop for KillGroupOnDrop { // SAFETY: kill(2) takes only integer arguments and borrows no Rust // memory. Signalling a stale group just returns ESRCH, which we ignore. unsafe { - libc::kill(-pgid, libc::SIGKILL); + libc::kill(-pgid, libc::SIGTERM); } } } @@ -121,7 +123,7 @@ async fn run_git_service(service: &str, repo_path: &Path, input: Bytes) -> Resul .stderr(Stdio::piped()); // Run git in its own process group so the whole tree (git + pack-objects) - // can be killed together on disconnect rather than orphaning a grandchild. + // can be signalled together on disconnect rather than orphaning a grandchild. #[cfg(unix)] command.process_group(0); @@ -791,8 +793,8 @@ mod tests { let status = child.wait().await.unwrap(); assert_eq!( status.signal(), - Some(libc::SIGKILL), - "child must be killed by SIGKILL via its process group" + Some(libc::SIGTERM), + "child must be terminated by SIGTERM via its process group" ); } @@ -828,7 +830,7 @@ mod tests { { let _guard = KillGroupOnDrop { pgid }; - } // group SIGKILL kills sh AND the sleep grandchild + } // group SIGTERM reaches sh AND the sleep grandchild let _ = child.wait().await; // reap sh @@ -841,7 +843,10 @@ mod tests { } tokio::time::sleep(std::time::Duration::from_millis(10)).await; } - assert!(gone, "grandchild must be killed by the group SIGKILL (#53)"); + assert!( + gone, + "grandchild must be terminated by the group signal (#53)" + ); } #[cfg(unix)] From b3a0599ff91d811c363bf94001b6bd2d4b99cbe3 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 19 Jun 2026 16:18:50 -0500 Subject: [PATCH 5/5] docs(node): correct stale SIGKILL comments to SIGTERM in served-git teardown --- crates/gitlawb-node/src/git/smart_http.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index e4b296c..1f0779a 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -763,7 +763,7 @@ mod tests { // ── #53 regression: served-git process-group teardown ────────────────── // - // run_git_service runs git in its own process group and SIGKILLs that group + // run_git_service runs git in its own process group and SIGTERMs that group // when the request future is dropped (client disconnect) or returns early, so // git AND its pack-objects child die together instead of orphaning a zombie. // These exercise the KillGroupOnDrop guard that wires that up. @@ -787,7 +787,7 @@ mod tests { { let _guard = KillGroupOnDrop { pgid }; - } // guard drops here -> SIGKILL to the group + } // guard drops here -> SIGTERM to the group use std::os::unix::process::ExitStatusExt; let status = child.wait().await.unwrap();