diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index fd6d41c1..638bb423 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0 +version: 0.12.0-rc.1 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index 146d61b1..e00b12f3 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -136,6 +136,38 @@ helm --namespace mlrun upgrade my-mlrun \ > **Note:** The above assumes a single-namespace installation. For multi-namespace (admin/non-admin) deployments, refer to the MLRun documentation. +#### Producer-side telemetry for mlrun-api + +The top-level `telemetry` block exposes OpenTelemetry producer-side config that mlrun-api consumes as `MLRUN_TELEMETRY__*` env vars. **Out of the box, telemetry is OFF**; enabling the in-cluster collector (`opentelemetry.collector.enabled=true`) is enough to turn mlrun-api telemetry on with in-cluster defaults — no other flags required. + +All four knobs default to `""`, which means "use the default shown in the table below" (derived by the chart for `enabled` and `otlpEndpoint`, MLRun's own default for the rest). Override only the values you want to change. + +| Value | Chart default | Effective default at mlrun-api | +|---|---|---| +| `telemetry.enabled` | `""` (inherits collector state) | `false` when collector is off, `true` when on | +| `telemetry.otlpEndpoint` | `""` (derives in-cluster) | `otel-collector.<namespace>.svc.cluster.local:<grpcPort>` | +| `telemetry.insecure` | `""` | `true` (MLRun default — plaintext gRPC, correct for in-cluster) | +| `telemetry.headersSecretName` | `""` | `""` (no auth headers) | + +Resolution rules: +- `telemetry.otlpEndpoint` blank + collector on → in-cluster endpoint above. +- `telemetry.enabled=true` with no in-cluster collector AND no `otlpEndpoint` → forced to `false` (safety: no listener means spans would silently drop).
+- A user-supplied `otlpEndpoint` always wins over the in-cluster derivation. + +Example — point mlrun-api at an external OTLP endpoint without enabling the in-cluster collector: + +```bash +helm --namespace mlrun upgrade my-mlrun \ + --set telemetry.enabled=true \ + --set telemetry.otlpEndpoint=otlp.example.com:4317 \ + --set telemetry.insecure=false \ + mlrun/mlrun-ce +``` + +> 💡 **Using a SaaS or TLS endpoint?** Most cloud observability providers (Grafana Cloud, Honeycomb, Datadog, etc.) require TLS. Add `--set telemetry.insecure=false` so mlrun-api negotiates TLS instead of plaintext gRPC — without it, the connection fails silently in the background and your dashboard stays empty (mlrun-api itself keeps working normally). +> +> SaaS providers usually also require auth headers (Bearer token, `X-Scope-OrgID`, etc.). Create a Kubernetes Secret with one key per header, then point the chart at it with `--set telemetry.headersSecretName=my-otlp-headers`. + ### Working with ECR To work with ECR, you must create a secret with your AWS credentials and a secret with ECR Token while providing both secret names to the helm install command. diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index e72d7ce7..6a6e126b 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -28,4 +28,30 @@ data: MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__LIMITS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.limits.memory | default "" }}" MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__REQUESTS__CPU: "{{ .Values.mlrun.defaultFunctionPodResources.requests.cpu | default "" }}" MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__REQUESTS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.requests.memory | default "" }}" + # OpenTelemetry producer-side env vars (CEML-708). See top-level `telemetry` + # block in values.yaml.
ENABLED is always emitted; others only when ENABLED=true. + {{- $tel := .Values.telemetry }} + {{- $collectorEnabled := .Values.opentelemetry.collector.enabled }} + {{- $userEndpoint := $tel.otlpEndpoint }} + {{- $enabled := $tel.enabled | toString }} + {{- if eq $enabled "" }} + {{- $enabled = $collectorEnabled | toString }} + {{- end }} + {{- if and (eq $enabled "true") (not $collectorEnabled) (not $userEndpoint) }} + {{- $enabled = "false" }} + {{- end }} + MLRUN_TELEMETRY__ENABLED: {{ $enabled | quote }} + {{- if eq $enabled "true" }} + {{- if $userEndpoint }} + MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ $userEndpoint | quote }} + {{- else }} + MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ printf "%s-collector.%s.svc.cluster.local:%v" (include "mlrun-ce.otel.collector.fullname" .) .Release.Namespace .Values.opentelemetry.collector.otlp.grpcPort | quote }} + {{- end }} + {{- if ne ($tel.insecure | toString) "" }} + MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }} + {{- end }} + {{- if $tel.headersSecretName }} + MLRUN_TELEMETRY__HEADERS_SECRET_NAME: {{ $tel.headersSecretName | quote }} + {{- end }} + {{- end }} {{- end}} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index a7609aff..021342ac 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -723,6 +723,31 @@ kafka: spark: enabled: true +# ============================================================================= +# Shared OpenTelemetry producer-side configuration (CEML-708) +# Top-level so any service in the bundle (mlrun-api today; Nuclio and future +# components later) reads the same OTel knobs. Wired into mlrun-api via +# MLRUN_TELEMETRY__* env vars in templates/config/mlrun-env-configmap.yaml. +# +# Resolution at render time: +# - enabled: "" inherits from .Values.opentelemetry.collector.enabled; +# explicit "true"/"false" (boolean or lowercase string; compared after toString) overrides.
Safety: enabled="true" with no +# in-cluster collector AND no user-supplied otlpEndpoint is forced to +# "false" (no listener — exported spans would be silently dropped). +# - otlpEndpoint: "" derives otel-collector.<namespace>.svc.cluster.local:<grpcPort> +# from the in-cluster collector; user-supplied value wins (e.g. external SaaS). +# - insecure: "" falls back to MLRun's own default (true, plaintext gRPC — +# correct for the in-cluster collector). Set "false" when pointing +# otlpEndpoint at a TLS-terminated endpoint (most SaaS providers). +# - headersSecretName: "" falls back to MLRun's own default (no auth headers). +# Only emitted as an MLRUN_TELEMETRY__* env var when a value is supplied and telemetry resolves to enabled. +# ============================================================================= +telemetry: + enabled: "" + otlpEndpoint: "" + insecure: "" + headersSecretName: "" + # ============================================================================= # OpenTelemetry Operator configuration # Installs the OpenTelemetry Operator for managing collectors and instrumentation diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index 2bff55dd..8f173655 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -463,6 +463,102 @@ test_otel_cr_installer_restart_guard() { } +# ============================================================================ +# Telemetry Env Var Tests (CEML-708) +# ============================================================================ + +# Defaults: telemetry.enabled is "" and collector is off → ENABLED=false, +# no other MLRUN_TELEMETRY__* keys emitted.
+test_telemetry_default_inherits_collector_disabled() { + log_test "Telemetry - defaults inherit collector=disabled" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml") + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "false"' "Telemetry disabled by default" + assert_not_contains "$output" "MLRUN_TELEMETRY__OTLP_ENDPOINT" "No endpoint emitted when disabled" + assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "No insecure key when disabled" + assert_not_contains "$output" "MLRUN_TELEMETRY__HEADERS_SECRET_NAME" "No headers secret key when disabled" +} + +# Empty telemetry.enabled inherits from opentelemetry.collector.enabled; with +# collector on, ENABLED resolves to true and endpoint derives from the collector +# fullname helper, release namespace and configured gRPC port. INSECURE is NOT emitted by default — +# mlrun-api falls back to its own default (true, plaintext gRPC, correct for +# the in-cluster collector). +test_telemetry_inherits_collector_enabled() { + log_test "Telemetry - inherits collector=enabled" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set opentelemetry.collector.enabled=true) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "Telemetry inherits enabled=true" + assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "test-otel-collector.default.svc.cluster.local:4317"' "Endpoint derived from in-cluster collector (uses fullname helper)" + assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "Insecure not emitted by default (mlrun-api default = true)" +} + +# User-supplied otlpEndpoint always wins, even with the in-cluster collector +# off — supports pointing mlrun-api at an external SaaS endpoint. INSECURE is +# not auto-flipped here; mlrun-api falls back to its own default (true), and +# users targeting a TLS endpoint must explicitly set telemetry.insecure=false.
+test_telemetry_external_endpoint() { + log_test "Telemetry - user external endpoint honored" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.enabled=true \ + --set telemetry.otlpEndpoint=external.com:4317 \ + --set opentelemetry.collector.enabled=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "User opt-in honored despite collector off" + assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "external.com:4317"' "User endpoint passed through verbatim" + assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "Insecure not auto-emitted for user endpoint (mlrun-api default applies)" +} + +# When the user explicitly sets telemetry.insecure (e.g. =false for a TLS +# endpoint), the chart MUST emit it — otherwise the mlrun-api default of +# true (plaintext gRPC) would apply and the connection to the TLS endpoint would fail. +test_telemetry_insecure_emitted_when_set() { + log_test "Telemetry - insecure emitted when user overrides" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.enabled=true \ + --set telemetry.otlpEndpoint=external.com:4317 \ + --set telemetry.insecure=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "false"' "User-supplied insecure=false passed through" +} + +# Safety override: enabled=true with no in-cluster collector AND no user +# otlpEndpoint must force ENABLED=false to avoid silently dropping spans.
+test_telemetry_safety_force_disable() { + log_test "Telemetry - safety forces disable when no listener" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.enabled=true \ + --set opentelemetry.collector.enabled=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "false"' "Safety override forces false" + assert_not_contains "$output" "MLRUN_TELEMETRY__OTLP_ENDPOINT" "No endpoint emitted when force-disabled" +} + +# headersSecretName must not be rendered as an env var when telemetry is off — +# downstream consumers shouldn't see a stale auth-headers reference. NOTE(review): only the disabled path is asserted here; emission of the key when telemetry is enabled is not covered by this test. +test_telemetry_headers_secret_emitted_only_when_enabled() { + log_test "Telemetry - headers secret skipped when disabled" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.headersSecretName=my-secret \ + --set telemetry.enabled=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "false"' "Explicit disable honored" + assert_not_contains "$output" "MLRUN_TELEMETRY__HEADERS_SECRET_NAME" "Headers secret skipped when disabled" +} + # ============================================================================ # Full Chart Render Test # ============================================================================ @@ -563,6 +659,17 @@ main() { test_otel_cr_installer_retry_counter test_otel_cr_installer_restart_guard + echo "" + echo "========================================" + echo "Telemetry Env Var Tests" + echo "========================================" + test_telemetry_default_inherits_collector_disabled + test_telemetry_inherits_collector_enabled + test_telemetry_external_endpoint + test_telemetry_insecure_emitted_when_set + test_telemetry_safety_force_disable + test_telemetry_headers_secret_emitted_only_when_enabled + echo "" echo "========================================" echo "Full Chart Tests"