diff --git a/sigma-agent/README.md b/sigma-agent/README.md index e048ca5..83ad409 100644 --- a/sigma-agent/README.md +++ b/sigma-agent/README.md @@ -194,6 +194,49 @@ Unreachable backends still appear with `reachable=0` and a `latency_ms` reflecti `query_backend_health` returns the latest snapshot. Pass `unreachable_only: true` to filter to failing probes only — useful for incident triage queries from sigma-api's LLM-assisted workflows. +## Self-Update Watchdog + +This watchdog **DETECTS** available updates. It **never** downloads, restarts, or applies anything. Operators are notified via Prometheus + MCP and act consciously. + +The rationale is operator safety: a misconfigured auto-updater on a VPN fleet can take every node offline simultaneously. Detection-only keeps the human in the loop — the agent merely signals that a newer version exists; the operator chooses when and how to roll out. + +### Configuration + +| Env var | CLI flag | Default | Description | +|---------|----------|---------|-------------| +| `AGENT_UPDATE_CHECK` | `--update-check` | `false` | Enable the update-detection watchdog | +| `AGENT_UPDATE_MANIFEST_URL` | `--update-manifest-url` | `https://lai3d.github.io/sigma/agent-version.json` | URL of the version manifest JSON | +| `AGENT_UPDATE_CHECK_INTERVAL` | `--update-check-interval` | `3600` | Poll interval in seconds (default 1 hour) | + +### Manifest JSON schema + +```json +{ + "version": "3.2.0", + "binary_url": "https://example.com/sigma-agent-3.2.0", + "sha256": "abc123def456..." +} +``` + +The agent compares `version` against its compiled-in `CARGO_PKG_VERSION` using component-wise integer parse on `.` (e.g. `3.2.0` vs `3.2.1`). Anything unparseable, or a component-count mismatch (`3.2` vs `3.2.0`), is treated as "no update" rather than guessing. + +### Prometheus metrics + +Exposed on the existing `/metrics` endpoint when enabled: + +- `sigma_agent_update_available{hostname,current_version,latest_version}` — `1` if an update is detected, `0` otherwise. `0` also means "unknown" after a network or parse failure (graceful degradation). +- `sigma_agent_update_last_check_timestamp{hostname}` — Unix timestamp of the last poll attempt (success or failure). + +A simple Grafana alert: `max by (hostname) (sigma_agent_update_available) == 1` for the fleet's "agents-behind-latest" panel; `time() - sigma_agent_update_last_check_timestamp > 86400` to alert on stale watchdogs. + +### MCP tool + +When MCP is also enabled, the tool `agent_check_update` returns the cached snapshot, or — with `{"force": true}` — triggers an immediate manifest poll. The response includes `detection_only: true` to make the contract explicit to LLM callers. + +### Graceful degradation + +Network errors, non-2xx responses, and malformed JSON all set `last_error`, force `update_available = false` in the gauge, and log a `warn!`. The watchdog never panics and never blocks the heartbeat loop. HTTP timeout is 5 seconds per check. + ## MCP Tool Surface (LLM-callable control plane) When `--mcp-enabled` is set, the agent runs a [Model Context Protocol](https://modelcontextprotocol.io) server at `POST /mcp` (JSON-RPC 2.0). This exposes agent capabilities as **tools that an external LLM can call** — e.g., an SRE assistant in `sigma-api` invoking `query_ebpf_traffic` during incident triage, or an automation calling `allocate_ports` when provisioning new Envoy routes. @@ -213,6 +256,7 @@ When `--mcp-enabled` is set, the agent runs a [Model Context Protocol](https://m | `query_dns_leaks` | `min_queries?` (default 1) | Processes sending UDP to port 53 | | `query_gpu_metrics` | none | Per-GPU utilization/memory/temp/power (nvidia-smi) | | `query_backend_health` | `unreachable_only?` (default false) | Latest TCP-probe snapshot of Envoy upstreams | +| `agent_check_update` | `force?` (bool) | Detection-only update check — cached snapshot, or forced poll | Tools that depend on capabilities (port scan, eBPF, registration) return a structured `isError` or `enabled=false` payload when their dependency is not configured — they do not break the MCP session. diff --git a/sigma-agent/src/config.rs b/sigma-agent/src/config.rs index bc6e0cf..f76e5bc 100644 --- a/sigma-agent/src/config.rs +++ b/sigma-agent/src/config.rs @@ -103,6 +103,18 @@ pub struct Config { #[arg(long, env = "AGENT_HEALTH_PROBE_INTERVAL", default_value = "30")] pub health_probe_interval: u64, + /// Enable update-check watchdog (DETECTION ONLY — never auto-applies) + #[arg(long, env = "AGENT_UPDATE_CHECK", default_value = "false")] + pub update_check: bool, + + /// URL of the version manifest JSON + #[arg(long, env = "AGENT_UPDATE_MANIFEST_URL", default_value = "https://lai3d.github.io/sigma/agent-version.json")] + pub update_manifest_url: String, + + /// Update check interval in seconds (default 1 hour) + #[arg(long, env = "AGENT_UPDATE_CHECK_INTERVAL", default_value = "3600")] + pub update_check_interval: u64, + /// Enable MCP (Model Context Protocol) server — exposes agent capabilities as LLM-callable tools #[arg(long, env = "AGENT_MCP_ENABLED", default_value = "false")] pub mcp_enabled: bool, diff --git a/sigma-agent/src/main.rs b/sigma-agent/src/main.rs index 518fa30..4d6f6c9 100644 --- a/sigma-agent/src/main.rs +++ b/sigma-agent/src/main.rs @@ -10,6 +10,7 @@ mod metrics; mod models; mod port_scan; mod system; +mod watchdog; mod xds; mod xds_resources; @@ -116,8 +117,30 @@ async fn main() -> Result<()> { None }; + // Conditionally start self-update detection watchdog (DETECTION ONLY — never auto-applies) + let update_info: Option = if config.update_check { + let shared = Arc::new(RwLock::new(watchdog::UpdateInfo::new( + env!("CARGO_PKG_VERSION").to_string(), + ))); + let manifest_url = config.update_manifest_url.clone(); + let interval = config.update_check_interval; + let current = env!("CARGO_PKG_VERSION").to_string(); + let task_shared = shared.clone(); + info!( + manifest_url = %manifest_url, + interval_secs = interval, + "Update-detection watchdog enabled (DETECTION ONLY — never auto-applies)" + ); + tokio::spawn(async move { + watchdog::watchdog_loop(task_shared, current, manifest_url, interval).await; + }); + Some(shared) + } else { + None + }; + // Metrics server is spawned after registration + health-probe setup so it - // can include both gpu_metrics and probe_results in its rendering state. + // can include gpu_metrics, probe_results, and update_info in its rendering state. // Fetch public IP once at startup (the IP the world sees via default route) let public_ip = match system::fetch_public_ip().await { @@ -197,8 +220,8 @@ async fn main() -> Result<()> { None }; - // Conditionally start metrics server (now that both gpu_metrics and - // probe_results are bound). + // Conditionally start metrics server (now that gpu_metrics, probe_results, + // and update_info are all bound). if config.metrics_port > 0 { let shared = scan_result.clone(); let port = config.metrics_port; @@ -206,8 +229,9 @@ async fn main() -> Result<()> { let ts = traffic_stats.clone(); let gm = gpu_metrics.clone(); let pr = probe_results.clone(); + let ui = update_info.clone(); tokio::spawn(async move { - metrics::serve_metrics(port, shared, hn, port_range, ts, gm, pr).await; + metrics::serve_metrics(port, shared, hn, port_range, ts, gm, pr, ui).await; }); } @@ -262,6 +286,12 @@ async fn main() -> Result<()> { traffic_stats: traffic_stats.clone(), gpu_metrics: gpu_metrics.clone(), probe_results: probe_results.clone(), + update_info: update_info.clone(), + update_manifest_url: if config.update_check { + Some(config.update_manifest_url.clone()) + } else { + None + }, }); let bind = config.mcp_bind.clone(); info!(bind = %bind, "MCP server enabled"); diff --git a/sigma-agent/src/mcp.rs b/sigma-agent/src/mcp.rs index 0b920ba..5a58a67 100644 --- a/sigma-agent/src/mcp.rs +++ b/sigma-agent/src/mcp.rs @@ -47,6 +47,7 @@ use crate::gpu::SharedGpuMetrics; use crate::health_probe::SharedProbeResults; use crate::port_scan::{self, SharedScanResult}; use crate::system; +use crate::watchdog::{self, SharedUpdateInfo}; #[cfg(feature = "ebpf-traffic")] use crate::ebpf_traffic::SharedTrafficStats; @@ -125,6 +126,9 @@ pub struct McpState { pub traffic_stats: Option, pub gpu_metrics: Option, pub probe_results: Option, + pub update_info: Option, + /// Manifest URL — needed so `agent_check_update` can run a forced poll. + pub update_manifest_url: Option, } // ---------- Tool schemas (returned by tools/list) ---------- @@ -187,6 +191,20 @@ fn tools_list_response() -> Value { "additionalProperties": false } }, + { + "name": "agent_check_update", + "description": "Check the version-manifest URL for an available sigma-agent update. DETECTION ONLY — does not download or apply. Returns current version, latest version, and update_available flag.", + "inputSchema": { + "type": "object", + "properties": { + "force": { + "type": "boolean", + "description": "If true, trigger an immediate poll of the manifest URL instead of returning the cached snapshot." + } + }, + "additionalProperties": false + } + }, { "name": "query_dns_leaks", "description": "Detect potential DNS leaks: processes that emitted UDP packets to port 53 in the last eBPF harvest window. On a VPN node, any process bypassing the tunnel for DNS resolution is a security concern. Returns enabled=false if eBPF traffic monitoring is not configured.", @@ -263,6 +281,7 @@ async fn handle_tool_call(state: Arc, params: Value) -> Value { "query_dns_leaks" => tool_query_dns_leaks(&state, args).await, "query_gpu_metrics" => tool_query_gpu_metrics(&state).await, "query_backend_health" => tool_query_backend_health(&state, args).await, + "agent_check_update" => tool_agent_check_update(&state, args).await, other => return tool_err(format!("unknown tool: {}", other)), }; @@ -558,6 +577,48 @@ async fn tool_query_backend_health(state: &McpState, args: Value) -> Result Result { + let Some(ref shared) = state.update_info else { + return Ok(json!({"enabled": false}).to_string()); + }; + + let force = args + .get("force") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + let info = if force { + let Some(ref url) = state.update_manifest_url else { + return Ok(json!({ + "enabled": false, + "note": "manifest URL not configured" + }) + .to_string()); + }; + watchdog::check_once( + shared.clone(), + state.agent_version.to_string(), + url.clone(), + ) + .await + } else { + shared.read().await.clone() + }; + + serde_json::to_string_pretty(&json!({ + "enabled": true, + "detection_only": true, + "current_version": info.current_version, + "latest_version": info.latest_version, + "update_available": info.update_available, + "binary_url": info.binary_url, + "sha256": info.sha256, + "last_checked": info.last_checked.to_rfc3339(), + "last_error": info.last_error, + })) + .map_err(|e| e.to_string()) +} + // ---------- HTTP handler ---------- async fn mcp_handler( diff --git a/sigma-agent/src/metrics.rs b/sigma-agent/src/metrics.rs index 4cf3696..4f92617 100644 --- a/sigma-agent/src/metrics.rs +++ b/sigma-agent/src/metrics.rs @@ -14,6 +14,7 @@ use tracing::{error, info}; use crate::gpu::{GpuMetrics, SharedGpuMetrics}; use crate::health_probe::{BackendProbeResult, SharedProbeResults}; use crate::port_scan::{self, PortScanResult, SharedScanResult}; +use crate::watchdog::{SharedUpdateInfo, UpdateInfo}; #[cfg(feature = "ebpf-traffic")] use crate::ebpf_traffic::SharedTrafficStats; @@ -37,6 +38,7 @@ struct MetricsState { traffic_stats: Option, gpu_metrics: Option, probe_results: Option, + update_info: Option, } #[derive(Deserialize)] @@ -562,6 +564,56 @@ pub async fn render_probe_metrics(results: &[BackendProbeResult], hostname: &str out } +/// Render self-update watchdog metrics in Prometheus text format. +/// +/// Emits two gauges: +/// - `sigma_agent_update_available{hostname,current_version,latest_version}` +/// — 1 if an update is detected, 0 otherwise (including the "unknown" +/// state after a network/parse failure). +/// - `sigma_agent_update_last_check_timestamp{hostname}` — unix timestamp +/// of the last check attempt (success or failure). +pub async fn render_update_metrics(info: &UpdateInfo, hostname: &str) -> String { + let mut out = String::with_capacity(512); + + let latest = info.latest_version.as_deref().unwrap_or("unknown"); + let available = if info.update_available { 1 } else { 0 }; + + writeln!( + out, + "# HELP sigma_agent_update_available Whether a sigma-agent update is detected (DETECTION ONLY — agent never auto-applies; 0 also means unknown after a check failure)" + ) + .unwrap(); + writeln!(out, "# TYPE sigma_agent_update_available gauge").unwrap(); + writeln!( + out, + "sigma_agent_update_available{{hostname=\"{}\",current_version=\"{}\",latest_version=\"{}\"}} {}", + hostname, info.current_version, latest, available + ) + .unwrap(); + + writeln!(out).unwrap(); + + writeln!( + out, + "# HELP sigma_agent_update_last_check_timestamp Unix timestamp of the last update check (success or failure)" + ) + .unwrap(); + writeln!( + out, + "# TYPE sigma_agent_update_last_check_timestamp gauge" + ) + .unwrap(); + writeln!( + out, + "sigma_agent_update_last_check_timestamp{{hostname=\"{}\"}} {}", + hostname, + info.last_checked.timestamp() + ) + .unwrap(); + + out +} + async fn metrics_handler(State(state): State>) -> impl IntoResponse { let result = state.scan_result.read().await; @@ -593,6 +645,12 @@ async fn metrics_handler(State(state): State>) -> impl IntoRes } } + if let Some(ref ui) = state.update_info { + let snapshot = ui.read().await.clone(); + body.push('\n'); + body.push_str(&render_update_metrics(&snapshot, &state.hostname).await); + } + ( [( header::CONTENT_TYPE, @@ -654,6 +712,7 @@ pub async fn serve_metrics( #[cfg(not(feature = "ebpf-traffic"))] _traffic_stats: Option<()>, gpu_metrics: Option, probe_results: Option, + update_info: Option, ) { let state = Arc::new(MetricsState { scan_result, @@ -663,6 +722,7 @@ pub async fn serve_metrics( traffic_stats, gpu_metrics, probe_results, + update_info, }); let app = Router::new() diff --git a/sigma-agent/src/watchdog.rs b/sigma-agent/src/watchdog.rs new file mode 100644 index 0000000..1cd1d16 --- /dev/null +++ b/sigma-agent/src/watchdog.rs @@ -0,0 +1,277 @@ +//! Self-update detection watchdog. +//! +//! **DETECTION ONLY.** This module periodically polls a version-manifest URL +//! and exposes the result via: +//! - Prometheus gauges (`sigma_agent_update_available`, +//! `sigma_agent_update_last_check_timestamp`) +//! - MCP tool (`agent_check_update`) +//! +//! It NEVER downloads, restarts, or applies anything. The operator is +//! notified via the surfaces above and must consciously act on the signal. +//! +//! ## Resource budget +//! +//! Default check interval is 1 hour with a 5s HTTP timeout. Steady-state +//! CPU is effectively zero; memory is bounded by the small `UpdateInfo` +//! struct held in an `Arc>`. This keeps the watchdog well +//! within the agent's <1% CPU / <50 MiB RSS budget on 1 vCPU VPS. +//! +//! ## Manifest schema +//! +//! ```json +//! { +//! "version": "3.2.0", +//! "binary_url": "https://example.com/sigma-agent-3.2.0", +//! "sha256": "abc123..." +//! } +//! ``` +//! +//! ## Graceful degradation +//! +//! Network errors, non-2xx responses, and malformed JSON all set +//! `last_error` and force `update_available = false` (unknown). + +use std::sync::Arc; +use std::time::Duration; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; +use tracing::{debug, info, warn}; + +/// Snapshot of the agent's self-update state. Detection-only — the agent +/// never acts on this; it is purely a signal for the operator. +#[derive(Serialize, Clone, Debug)] +pub struct UpdateInfo { + pub current_version: String, + pub latest_version: Option, + pub update_available: bool, + pub binary_url: Option, + pub sha256: Option, + pub last_checked: DateTime, + pub last_error: Option, +} + +impl UpdateInfo { + /// Initial state — current version known, nothing checked yet. + pub fn new(current_version: String) -> Self { + Self { + current_version, + latest_version: None, + update_available: false, + binary_url: None, + sha256: None, + last_checked: Utc::now(), + last_error: None, + } + } +} + +pub type SharedUpdateInfo = Arc>; + +#[derive(Deserialize, Debug)] +struct VersionManifest { + version: String, + #[serde(default)] + binary_url: Option, + #[serde(default)] + sha256: Option, +} + +/// Returns true if `latest > current` parsed as component-wise u32 on '.'. +/// Any parse failure or component-count mismatch returns false (no update). +/// +/// This deliberately avoids the `semver` crate — manifests use simple +/// `x.y.z` integer triples, and "missing component != 0" is the safer +/// default (refuse to claim an update when versions are ambiguous). +pub fn compare_versions(current: &str, latest: &str) -> bool { + let parse = |s: &str| -> Option> { + s.split('.') + .map(|p| p.parse::().ok()) + .collect::>>() + }; + + let Some(cur) = parse(current) else { return false; }; + let Some(lat) = parse(latest) else { return false; }; + + // Treat missing components as a mismatch (no update) rather than + // implicitly padding with zeros — matches the spec test + // compare_versions("3.2", "3.2.0") → false. + if cur.len() != lat.len() { + return false; + } + + for (c, l) in cur.iter().zip(lat.iter()) { + if l > c { + return true; + } + if l < c { + return false; + } + } + false +} + +/// Execute a single update-manifest poll and return the resulting UpdateInfo. +/// Also updates the shared snapshot in place. +pub async fn check_once( + shared: SharedUpdateInfo, + current_version: String, + manifest_url: String, +) -> UpdateInfo { + let now = Utc::now(); + + let client = match reqwest::Client::builder() + .timeout(Duration::from_secs(5)) + .user_agent(concat!("sigma-agent/", env!("CARGO_PKG_VERSION"))) + .build() + { + Ok(c) => c, + Err(e) => { + let info = UpdateInfo { + current_version: current_version.clone(), + latest_version: None, + update_available: false, + binary_url: None, + sha256: None, + last_checked: now, + last_error: Some(format!("client build failed: {}", e)), + }; + *shared.write().await = info.clone(); + return info; + } + }; + + let result: Result = async { + let resp = client + .get(&manifest_url) + .send() + .await + .map_err(|e| format!("request failed: {}", e))?; + if !resp.status().is_success() { + return Err(format!("non-success status: {}", resp.status())); + } + let body = resp + .text() + .await + .map_err(|e| format!("body read failed: {}", e))?; + serde_json::from_str::(&body) + .map_err(|e| format!("parse failed: {}", e)) + } + .await; + + let info = match result { + Ok(manifest) => { + let update_available = compare_versions(¤t_version, &manifest.version); + if update_available { + info!( + current = %current_version, + latest = %manifest.version, + "sigma-agent update available (detection-only — no auto-apply)" + ); + } else { + debug!( + current = %current_version, + latest = %manifest.version, + "Update check complete: agent is up to date" + ); + } + UpdateInfo { + current_version: current_version.clone(), + latest_version: Some(manifest.version), + update_available, + binary_url: manifest.binary_url, + sha256: manifest.sha256, + last_checked: now, + last_error: None, + } + } + Err(e) => { + warn!(url = %manifest_url, error = %e, "Update check failed; gauge will be 0 (unknown)"); + UpdateInfo { + current_version: current_version.clone(), + latest_version: None, + update_available: false, + binary_url: None, + sha256: None, + last_checked: now, + last_error: Some(e), + } + } + }; + + *shared.write().await = info.clone(); + info +} + +/// Background loop: every `interval_secs`, runs `check_once`. Logs errors +/// but never panics or exits. +pub async fn watchdog_loop( + shared: SharedUpdateInfo, + current_version: String, + manifest_url: String, + interval_secs: u64, +) { + info!( + manifest_url = %manifest_url, + interval_secs, + current_version = %current_version, + "Update-detection watchdog started (DETECTION ONLY — never auto-applies)" + ); + + loop { + let _ = check_once( + shared.clone(), + current_version.clone(), + manifest_url.clone(), + ) + .await; + tokio::time::sleep(Duration::from_secs(interval_secs)).await; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_compare_versions_patch_bump() { + assert!(compare_versions("3.2.0", "3.2.1")); + } + + #[test] + fn test_compare_versions_equal() { + assert!(!compare_versions("3.2.0", "3.2.0")); + } + + #[test] + fn test_compare_versions_downgrade() { + assert!(!compare_versions("3.2.0", "3.1.9")); + } + + #[test] + fn test_compare_versions_garbage() { + assert!(!compare_versions("3.2.0", "garbage")); + } + + #[test] + fn test_compare_versions_component_count_mismatch() { + // Per spec: "3.2" vs "3.2.0" → false (treat missing component as a mismatch) + assert!(!compare_versions("3.2", "3.2.0")); + } + + #[test] + fn test_compare_versions_minor_bump() { + assert!(compare_versions("3.2.0", "3.3.0")); + } + + #[test] + fn test_compare_versions_major_bump() { + assert!(compare_versions("3.99.99", "4.0.0")); + } + + #[test] + fn test_compare_versions_garbage_current() { + assert!(!compare_versions("not-a-version", "3.2.1")); + } +}