From 00d2649cc292173f831a2dcfb1ef3c550e090302 Mon Sep 17 00:00:00 2001
From: Riccardo Pirruccio
Date: Sat, 28 Mar 2026 19:03:37 -0500
Subject: [PATCH] fix(bootstrap): detect low inotify limits before gateway startup (#552)

The embedded k3s cluster and its components (containerd, kubelet,
flannel, CoreDNS) create many inotify instances. On hosts that already
run Kubernetes or other container workloads, the default
fs.inotify.max_user_instances limit (128) can be exhausted, causing
containerd's CRI plugin to fail with "too many open files" when
creating its fsnotify watcher. This prevents the RuntimeService from
registering, which surfaces as the opaque "K8s namespace not ready"
timeout.

Add inotify limit checks in two places:

1. cluster-entrypoint.sh: warn before starting k3s if the limit is
   below 256. Prints the current value and the exact fix command. Does
   not auto-modify kernel parameters -- enterprise environments may
   audit sysctl changes.

2. doctor_check() in the CLI: adds an inotify instances check after the
   existing Docker check, so `openshell doctor check` catches the issue
   diagnostically.

In both places an unreadable or non-numeric value is treated as "not
available" and the check is skipped, rather than being reported as a
limit of 0.

Closes #552

Signed-off-by: Riccardo Pirruccio
---
 crates/openshell-cli/src/run.rs     | 30 ++++++++++++++++++++++++
 deploy/docker/cluster-entrypoint.sh | 36 +++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index e32eec2a..ff0bb621 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1772,6 +1772,36 @@ pub async fn doctor_check() -> Result<()> {
         Err(_) => writeln!(stdout, "(not set, using default socket)").into_diagnostic()?,
     };
 
+    // --- inotify limits (Linux only) ---
+    #[cfg(target_os = "linux")]
+    {
+        write!(stdout, " inotify instances .. ").into_diagnostic()?;
+        stdout.flush().into_diagnostic()?;
+        // Treat an unreadable or non-numeric value as "not available" instead of
+        // coercing it to 0, which would print a misleading "warning (0, ...)".
+        let limit = std::fs::read_to_string("/proc/sys/fs/inotify/max_user_instances")
+            .ok()
+            .and_then(|raw| raw.trim().parse::<u64>().ok());
+        match limit {
+            Some(current) if current < 256 => {
+                writeln!(stdout, "warning ({current}, recommend 512+)").into_diagnostic()?;
+                for line in [
+                    "",
+                    " Hosts running existing Kubernetes clusters or container workloads",
+                    " may not have enough inotify instances for the embedded k3s.",
+                    "",
+                    " Fix: sudo sysctl -w fs.inotify.max_user_instances=512",
+                    " Persist: echo 'fs.inotify.max_user_instances=512' | sudo tee /etc/sysctl.d/99-openshell.conf",
+                    "",
+                ] {
+                    writeln!(stdout, "{line}").into_diagnostic()?;
+                }
+            }
+            Some(current) => writeln!(stdout, "ok ({current})").into_diagnostic()?,
+            None => writeln!(stdout, "skip (not available)").into_diagnostic()?,
+        }
+    }
+
     writeln!(stdout, "\nAll checks passed.").into_diagnostic()?;
     Ok(())
 }
diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh
index 2fea6fa6..7e0413da 100644
--- a/deploy/docker/cluster-entrypoint.sh
+++ b/deploy/docker/cluster-entrypoint.sh
@@ -557,6 +557,42 @@ if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
     EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy"
 fi
 
+# ---------------------------------------------------------------------------
+# Check inotify limits
+# ---------------------------------------------------------------------------
+# The embedded k3s cluster and its components (containerd, kubelet, flannel,
+# CoreDNS) create many inotify instances. On hosts that already run Kubernetes
+# or other container workloads, the default limit (128) can be exhausted,
+# causing containerd's CRI plugin to fail with "too many open files" during
+# fsnotify watcher creation. This surfaces as the opaque "K8s namespace not
+# ready" timeout because the RuntimeService never registers.
+#
+# Check the current limit and warn if it is too low. We do not auto-modify
+# kernel parameters — enterprise environments may audit sysctl changes and
+# require them to go through change management.
+#
+# If the limit cannot be read or is not a plain integer, skip the check rather
+# than defaulting to 0, which would print a misleading "is 0" warning.
+INOTIFY_MIN=256
+INOTIFY_RECOMMENDED=512
+INOTIFY_CURRENT=$(cat /proc/sys/fs/inotify/max_user_instances 2>/dev/null || true)
+case "$INOTIFY_CURRENT" in
+    '' | *[!0-9]*)
+        ;;
+    *)
+        if [ "$INOTIFY_CURRENT" -lt "$INOTIFY_MIN" ]; then
+            echo ""
+            echo "Warning: fs.inotify.max_user_instances is $INOTIFY_CURRENT (need $INOTIFY_MIN+, recommend $INOTIFY_RECOMMENDED)"
+            echo " Hosts running existing Kubernetes clusters or container workloads may not"
+            echo " have enough inotify instances for the gateway's embedded k3s."
+            echo ""
+            echo " Fix: sudo sysctl -w fs.inotify.max_user_instances=$INOTIFY_RECOMMENDED"
+            echo " Persist: echo 'fs.inotify.max_user_instances=$INOTIFY_RECOMMENDED' | sudo tee /etc/sysctl.d/99-openshell.conf"
+            echo ""
+        fi
+        ;;
+esac
+
 # Docker Desktop can briefly start the container before its bridge default route
 # is fully installed. k3s exits immediately in that state, so wait briefly for
 # routing to settle first.