From 48c1c026de46b4374ee0d0d2dff7cb7537818962 Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Mon, 9 Mar 2026 11:50:46 +0530 Subject: [PATCH 1/4] feat(observability): tcp round trip details --- infrastructure/main.tf | 2 +- modules/observability/network_observability.tf | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/infrastructure/main.tf b/infrastructure/main.tf index c09b011..90ba7e3 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -22,7 +22,7 @@ module "cluster-issuer" { # Complete Observability Stack Deployment module "observability" { - source = "git::https://github.com/necro-cloud/modules//modules/observability?ref=main" + source = "git::https://github.com/necro-cloud/modules//modules/observability?ref=task/105/network-dashboard" // Certificates Details cluster_issuer_name = module.cluster-issuer.cluster-issuer-name diff --git a/modules/observability/network_observability.tf b/modules/observability/network_observability.tf index e12ec36..ff84c2e 100644 --- a/modules/observability/network_observability.tf +++ b/modules/observability/network_observability.tf @@ -18,8 +18,8 @@ resource "kubernetes_manifest" "network_observability" { // PRIVILEGED: Required for "PacketDrop" to read kernel drop reasons privileged = true - // FEATURES: Enable drop detection - features = ["PacketDrop"] + // Enable drop detection and TCP round trips metrics + features = ["PacketDrop", "FlowRTT"] // SAMPLING: 25 means 1 in 25 packets sampling = 25 @@ -81,9 +81,8 @@ resource "kubernetes_manifest" "network_observability" { expiryTime = "2m" } - // Disable direct metric export (optional) metrics = { - enable = false + enable = true } tls = { enable = false From bc2ae5034c6aa658bdea9e48d56034bc47013088 Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Mon, 9 Mar 2026 12:37:02 +0530 Subject: [PATCH 2/4] feat(observability): dashboard for network observability --- modules/observability/dashboards/network.json | 1090 +++++++++++++++++ modules/observability/grafana.tf | 16 + 2 files changed, 1106 insertions(+) create mode 100644 modules/observability/dashboards/network.json diff --git a/modules/observability/dashboards/network.json b/modules/observability/dashboards/network.json new file mode 100644 index 0000000..a425d62 --- /dev/null +++ b/modules/observability/dashboards/network.json @@ -0,0 +1,1090 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 11, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 3, + "panels": [], + "title": "Quick Statistics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\"}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Cluster Egress Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(netobserv_workload_ingress_bytes_total{DstK8S_Namespace=~\"$DstNamespace\"}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Cluster Ingress Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 6, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(netobserv_workload_rtt_seconds_bucket{SrcK8S_Namespace=~\"$SrcNamespace\"}[5m])) by (le))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Global TCP Latency (P95)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(netobserv_workload_drop_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\"}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active Network Drops", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 2, + "panels": [], + "title": "Visual Network Topology", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 23, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 1, + "options": { + "edges": {}, + "layoutAlgorithm": "force", + "nodes": {}, + "zoomMode": "greedy" + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "label_join(\n label_replace(\n label_replace(\n sum by (SrcK8S_Namespace, DstK8S_Namespace) (rate(netobserv_workload_flows_total{SrcK8S_Namespace!=\"\", DstK8S_Namespace!=\"\"}[5m])) > 0,\n \"target\", \"$1\", \"DstK8S_Namespace\", \"(.*)\"\n ),\n \"source\", \"$1\", \"SrcK8S_Namespace\", \"(.*)\"\n ),\n \"id\", \"-\", \"source\", \"target\"\n)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Dynamic Namespace Topology Map", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "DstK8S_Namespace": true, + "SrcK8S_Namespace": true, + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "mainStat" + } + } + } + ], + "type": "nodeGraph" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 8, + "panels": [], + "title": "Cross-Namespace Top Talkers", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (SrcK8S_Namespace, DstK8S_Namespace) (rate(netobserv_workload_flows_total{SrcK8S_Namespace=~\"$SrcNamespace\", DstK8S_Namespace=~\"$DstNamespace\"}[5m])))", + "legendFormat": "{{SrcK8S_Namespace}} -> {{DstK8S_Namespace}}", + "range": true, + "refId": "A" + } + ], + "title": "Top 10 Cross-Namespace Flows", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "shades" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (SrcK8S_OwnerName) (rate(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\", DstK8S_Namespace=\"\"}[5m])))", + "legendFormat": "{{SrcK8S_OwnerName}} -> External", + "range": true, + "refId": "A" + } + ], + "title": "Top External Talkers", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 11, + "panels": [], + "title": "Workload Bandwidth", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (SrcK8S_OwnerName) (rate(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\", SrcK8S_OwnerName=~\"$Pod\"}[5m])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Top 10 Pods by Egress", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "topk(10, sum by (DstK8S_OwnerName) (rate(netobserv_workload_ingress_bytes_total{DstK8S_Namespace=~\"$DstNamespace\", DstK8S_OwnerName=~\"$Pod\"}[5m])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Top 10 Pods by Ingress", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 14, + "panels": [], + "title": "TCP Health & Network Drops", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, SrcK8S_OwnerName) (rate(netobserv_workload_rtt_seconds_bucket{SrcK8S_Namespace=~\"$SrcNamespace\", SrcK8S_OwnerName=~\"$Pod\"}[5m])))", + "legendFormat": "{{ SrcK8S_OwnerName }}", + "range": true, + "refId": "A" + } + ], + "title": "TCP Latency (RTT) by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P4169E866C3094E38" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "shades" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 52 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "editorMode": "code", + "expr": "sum by (SrcK8S_OwnerName) (rate(netobserv_workload_drop_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\", SrcK8S_OwnerName=~\"$Pod\"}[5m]))", + "legendFormat": "Dropped: {{SrcK8S_OwnerName}}", + "range": true, + "refId": "A" + } + ], + "title": "Dropped Traffic by Pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 17, + "panels": [], + "title": "Raw Flow Logs", + "type": "row" + }, + { + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 24, + "w": 24, + "x": 0, + "y": 61 + }, + "id": 18, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "prettifyLogMessage": true, + "showControls": true, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "pluginVersion": "12.3.3", + "targets": [ + { + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" + }, + "direction": "desc", + "editorMode": "code", + "expr": "source.k8s.namespace.name: \"$SrcNamespace\"", + "queryType": "instant", + "refId": "A" + } + ], + "title": "Real-Time eBPF Network Logs", + "type": "logs" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": [ + "postgres" + ], + "value": [ + "postgres" + ] + }, + "definition": "label_values(netobserv_workload_egress_bytes_total,SrcK8S_Namespace)", + "includeAll": true, + "label": "Source Namespace", + "multi": true, + "name": "SrcNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(netobserv_workload_egress_bytes_total,SrcK8S_Namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": [ + "garage" + ], + "value": [ + "garage" + ] + }, + "definition": "label_values(netobserv_workload_ingress_bytes_total,DstK8S_Namespace)", + "includeAll": true, + "label": "Destination Namespace", + "multi": true, + "name": "DstNamespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(netobserv_workload_ingress_bytes_total,DstK8S_Namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": [ + "postgresql-cluster" + ], + "value": [ + "postgresql-cluster" + ] + }, + "definition": "label_values(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=\"$SrcNamespace\"},SrcK8S_OwnerName)", + "includeAll": true, + "label": "Pod Name", + "multi": true, + "name": "Pod", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=\"$SrcNamespace\"},SrcK8S_OwnerName)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Kubernetes Network Observability Dashboard", + "uid": "obbw2v6", + "version": 1 +} \ No newline at end of file diff --git a/modules/observability/grafana.tf b/modules/observability/grafana.tf index ea0f757..7640522 100644 --- a/modules/observability/grafana.tf +++ b/modules/observability/grafana.tf @@ -170,6 +170,17 @@ resource "helm_release" "grafana" { options = { path = "/var/lib/grafana/dashboards/pod" }, + }, + { + name = "Network Level Monitoring Dashboard" + orgId = 1 + folder = "Kubernetes Monitoring" + type = "file" + disableDeletion = false + editable = true + options = { + path = "/var/lib/grafana/dashboards/network" + }, } ] } @@ -207,6 +218,11 @@ resource "helm_release" "grafana" { json = file("${path.module}/dashboards/pod.json") } } + network = { + network-dashboard = { + json = file("${path.module}/dashboards/network.json") + } + } } affinity = { From 3bfc134fb7d2e78a7d8fc0786342116c4de06c8c Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Mon, 9 Mar 2026 12:55:24 +0530 Subject: [PATCH 3/4] feat(observability): some comments --- .../observability/network_observability.tf | 21 +++++++++---------- modules/observability/otel-collector.tf | 16 +++++++------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/modules/observability/network_observability.tf b/modules/observability/network_observability.tf index ff84c2e..de6c665 100644 --- a/modules/observability/network_observability.tf +++ b/modules/observability/network_observability.tf @@ -9,23 +9,25 @@ resource "kubernetes_manifest" "network_observability" { spec = { namespace = kubernetes_namespace.namespace.metadata[0].name - // "Direct" mode sends logs straight to OTel (bypassing Kafka/IPFIX) + // "Direct" mode sends logs straight to OTel deploymentModel = "Direct" agent = { type = "eBPF" ebpf = { - // PRIVILEGED: Required for "PacketDrop" to read kernel drop reasons + // Required for "PacketDrop" to read kernel drop reasons privileged = true // Enable drop detection and TCP round trips metrics features = ["PacketDrop", "FlowRTT"] - // SAMPLING: 25 means 1 in 25 packets + // 25 means 1 in 25 packets sampling = 25 cacheActiveTimeout = "15s" cacheMaxFlows = 100000 - excludeInterfaces = ["lo"] // Ignore loopback traffic + + // Ignore loopback traffic + excludeInterfaces = ["lo"] // Resource Constraints resources = { @@ -34,8 +36,8 @@ resource "kubernetes_manifest" "network_observability" { memory = "100Mi" } limits = { - cpu = "500m" // Hard cap to prevent eBPF compiler spikes - memory = "512Mi" // Hard cap to prevent leaks + cpu = "500m" + memory = "512Mi" } } } @@ -54,23 +56,20 @@ resource "kubernetes_manifest" "network_observability" { enable = false } - // PROCESSOR: Enrichment settings + // Enrichment settings processor = { logTypes = "Flows" metrics = { // Disable agent-side metrics generation to save CPU - // We will derive metrics from logs in Victoria if needed disableAlerts = ["NetObservLokiError", "NetObservNoFlows"] } } - // EXPORT: Pushing to the OTel Collector + // Pushing metrics to the OTel Collector exporters = [ { type = "OpenTelemetry" openTelemetry = { - // 1. POINT THIS TO YOUR OTEL COLLECTOR SERVICE - // Format: ..svc.cluster.local targetHost = "otel-collector.${kubernetes_namespace.namespace.metadata[0].name}.svc.cluster.local" targetPort = 4317 protocol = "grpc" diff --git a/modules/observability/otel-collector.tf b/modules/observability/otel-collector.tf index 313f69d..5e9880a 100644 --- a/modules/observability/otel-collector.tf +++ b/modules/observability/otel-collector.tf @@ -43,7 +43,6 @@ resource "helm_release" "otel_collector" { } } - // Contrib image supports all required features image = { repository = "otel/opentelemetry-collector-contrib" } @@ -107,7 +106,8 @@ resource "helm_release" "otel_collector" { tls_config = { insecure_skip_verify = true } - + + // Service Discovery Configuration kubernetes_sd_configs = [ { role = "pod" @@ -121,8 +121,6 @@ resource "helm_release" "otel_collector" { regex = "true" }, // Only scrape pods on the SAME NODE as this collector - // This uses the Env Var we injected above. - // Note: The double $$ is for Terraform escaping. Result in YAML: ${env:K8S_NODE_NAME} { source_labels = ["__meta_kubernetes_pod_node_name"] action = "keep" @@ -180,13 +178,13 @@ resource "helm_release" "otel_collector" { ] relabel_configs = [ - // 1. Only scrape the local node this DaemonSet pod is running on + // Only scrape the local node this DaemonSet pod is running on { source_labels = ["__meta_kubernetes_node_name"] action = "keep" regex = "$${env:K8S_NODE_NAME}" }, - // 2. Point directly to the internal cAdvisor endpoint + // Point directly to the internal cAdvisor endpoint { action = "replace" target_label = "__metrics_path__" @@ -205,7 +203,7 @@ resource "helm_release" "otel_collector" { // Strict memory limits for the 512Mi constraint memory_limiter = { check_interval = "5s" - limit_mib = 400 // Hard cap for the process (leaving 112Mi buffer for OS) + limit_mib = 400 spike_limit_mib = 100 } // Tag Netobserv logs appropriately @@ -220,6 +218,10 @@ resource "helm_release" "otel_collector" { } transform = { + // If a metric comes in missing its namespace or pod label, + // look at the underlying server/container it came from. + // If that server/container has a namespace or pod name + // attached to it, copy it over to the metric. metric_statements = [ { context = "datapoint" From f1821b496e54ee639ef0b17a1f48e3be2402b67e Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Mon, 9 Mar 2026 13:01:24 +0530 Subject: [PATCH 4/4] [INF] All modules switch to main branch --- infrastructure/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/main.tf b/infrastructure/main.tf index 90ba7e3..c09b011 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -22,7 +22,7 @@ module "cluster-issuer" { # Complete Observability Stack Deployment module "observability" { - source = "git::https://github.com/necro-cloud/modules//modules/observability?ref=task/105/network-dashboard" + source = "git::https://github.com/necro-cloud/modules//modules/observability?ref=main" // Certificates Details cluster_issuer_name = module.cluster-issuer.cluster-issuer-name