From 74f8db7cace3afd52e1521a990a463052c89330c Mon Sep 17 00:00:00 2001 From: royischoss Date: Sun, 24 May 2026 12:56:03 +0300 Subject: [PATCH 1/6] =?UTF-8?q?Files=20changed=20(4,=20+68/-1):=20=20=20-?= =?UTF-8?q?=20values.yaml=20=E2=80=94=20top-level=20telemetry:=20block=20w?= =?UTF-8?q?ith=20""=20default=20for=20enabled=20(inherits=20collector=20st?= =?UTF-8?q?ate)=20=20=20-=20templates/config/mlrun-env-configmap.yaml=20?= =?UTF-8?q?=E2=80=94=20resolution=20logic=20+=20safety=20override=20+=20en?= =?UTF-8?q?dpoint=20derivation=20=20=20-=20README.md=20=E2=80=94=20opt-in?= =?UTF-8?q?=20docs=20with=20truth=20table=20and=20external-endpoint=20exam?= =?UTF-8?q?ple=20=20=20-=20Chart.yaml=20=E2=80=94=20bumped=20to=200.11.0-r?= =?UTF-8?q?c.38?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CLAUDE.md | 2 +- charts/mlrun-ce/Chart.yaml | 2 +- charts/mlrun-ce/README.md | 23 ++++++++++++++++++ .../templates/config/mlrun-env-configmap.yaml | 24 +++++++++++++++++++ charts/mlrun-ce/values.yaml | 20 ++++++++++++++++ 5 files changed, 69 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 6a998595..59acdaa4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ ## Preferred Response Patterns -- Values changes: show `--set` flags or a patch values file overlay, not edits to `values.yaml` directly, unless there is a change with the default value that should be reflected in `values.yaml` (e.g. a new component's `enabled` flag) +-it Values changes: show `--set` flags or a patch values file overlay, not edits to `values.yaml` directly, unless there is a change with the default value that should be reflected in `values.yaml` (e.g. a new component's `enabled` flag) - New templates: show the complete file including the `{{- if .Values..enabled }}` guard and `include "mlrun-ce.common.labels"` call - Service references within templates: use `{{ .Release.Namespace }}`, never hardcode namespace strings - After any `requirements.yaml` change: remind the user to run `make helm-update-dependencies` and commit `requirements.lock` diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index 166ad46f..448f4c52 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.37 +version: 0.11.0-rc.38 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index f77e16ac..9227b385 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -136,6 +136,29 @@ helm --namespace mlrun upgrade my-mlrun \ > **Note:** The above assumes a single-namespace installation. For multi-namespace (admin/non-admin) deployments, refer to the MLRun documentation. +#### Producer-side telemetry for mlrun-api + +The top-level `telemetry` block exposes OpenTelemetry producer-side config that mlrun-api consumes as `MLRUN_TELEMETRY__*` env vars. By default `telemetry.enabled: ""` inherits from `opentelemetry.collector.enabled` — so enabling the in-cluster collector also turns mlrun-api telemetry on without any extra flag. + +| Value | Default | Purpose | +|---|---|---| +| `telemetry.enabled` | `""` (inherits collector state) | `"true"`/`"false"` to override explicitly | +| `telemetry.otlpEndpoint` | `""` (derives in-cluster) | Override with an external endpoint (e.g. SaaS) | +| `telemetry.insecure` | `"true"` | Set `"false"` for TLS-terminated endpoints | +| `telemetry.headersSecretName` | `""` | K8s Secret with OTLP auth headers (file-mount wiring is future work) | + +When `telemetry.otlpEndpoint` is blank and the in-cluster collector is on, the endpoint resolves to `otel-collector..svc.cluster.local:`. As a safety check, `telemetry.enabled=true` with no in-cluster collector AND no `otlpEndpoint` is forced to `false` to avoid silently dropping spans. + +Example — point mlrun-api at an external OTLP endpoint without enabling the in-cluster collector: + +```bash +helm --namespace mlrun upgrade my-mlrun \ + --set telemetry.enabled=true \ + --set telemetry.otlpEndpoint=otlp.example.com:4317 \ + --set telemetry.insecure=false \ + mlrun/mlrun-ce +``` + ### Working with ECR To work with ECR, you must create a secret with your AWS credentials and a secret with ECR Token while providing both secret names to the helm install command. diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index e72d7ce7..65fdba8d 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -28,4 +28,28 @@ data: MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__LIMITS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.limits.memory | default "" }}" MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__REQUESTS__CPU: "{{ .Values.mlrun.defaultFunctionPodResources.requests.cpu | default "" }}" MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__REQUESTS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.requests.memory | default "" }}" + # OpenTelemetry producer-side env vars (CEML-708). See top-level `telemetry` + # block in values.yaml. ENABLED is always emitted; others only when ENABLED=true. + {{- $tel := .Values.telemetry }} + {{- $collectorEnabled := .Values.opentelemetry.collector.enabled }} + {{- $userEndpoint := $tel.otlpEndpoint }} + {{- $enabled := $tel.enabled | toString }} + {{- if eq $enabled "" }} + {{- $enabled = $collectorEnabled | toString }} + {{- end }} + {{- if and (eq $enabled "true") (not $collectorEnabled) (not $userEndpoint) }} + {{- $enabled = "false" }} + {{- end }} + MLRUN_TELEMETRY__ENABLED: {{ $enabled | quote }} + {{- if eq $enabled "true" }} + MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }} + {{- if $userEndpoint }} + MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ $userEndpoint | quote }} + {{- else }} + MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ printf "otel-collector.%s.svc.cluster.local:%v" .Release.Namespace .Values.opentelemetry.collector.otlp.grpcPort | quote }} + {{- end }} + {{- if $tel.headersSecretName }} + MLRUN_TELEMETRY__HEADERS_SECRET_NAME: {{ $tel.headersSecretName | quote }} + {{- end }} + {{- end }} {{- end}} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 008b2154..30507b87 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -729,6 +729,26 @@ kafka: spark: enabled: true +# ============================================================================= +# Shared OpenTelemetry producer-side configuration (CEML-708) +# Top-level so any service in the bundle (mlrun-api today; Nuclio and future +# components later) reads the same OTel knobs. Wired into mlrun-api via +# MLRUN_TELEMETRY__* env vars in templates/config/mlrun-env-configmap.yaml. +# +# Resolution at render time: +# - enabled: "" inherits from .Values.opentelemetry.collector.enabled; +# explicit "true"/"false" overrides. Safety: enabled="true" with no +# in-cluster collector AND no user-supplied otlpEndpoint is forced to +# "false" (no listener — exporting would silently drop). +# - otlpEndpoint: "" derives otel-collector..svc.cluster.local: +# from the in-cluster collector; user-supplied value wins (e.g. external SaaS). +# ============================================================================= +telemetry: + enabled: "" + otlpEndpoint: "" + insecure: "true" + headersSecretName: "" + # ============================================================================= # OpenTelemetry Operator configuration # Installs the OpenTelemetry Operator for managing collectors and instrumentation From cc98fcf75632453b620c775e29e5e4ec541f3a44 Mon Sep 17 00:00:00 2001 From: royischoss Date: Tue, 26 May 2026 17:40:45 +0300 Subject: [PATCH 2/6] Add telemetry unit tests + CLAUDE.md typo fix Review feedback from davesh0812 on PR #297: - Add 5 helm-template-test.sh functions covering the telemetry resolution matrix (14 asserts): * test_telemetry_default_inherits_collector_disabled * test_telemetry_inherits_collector_enabled * test_telemetry_external_endpoint * test_telemetry_safety_force_disable * test_telemetry_headers_secret_emitted_only_when_enabled - Apply suggested fix to CLAUDE.md line 7 (drop stray "-it "). - Bump chart version to 0.11.0-rc.39. --- CLAUDE.md | 2 +- charts/mlrun-ce/Chart.yaml | 2 +- tests/helm-template-test.sh | 86 +++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 59acdaa4..6a998595 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ ## Preferred Response Patterns --it Values changes: show `--set` flags or a patch values file overlay, not edits to `values.yaml` directly, unless there is a change with the default value that should be reflected in `values.yaml` (e.g. a new component's `enabled` flag) +- Values changes: show `--set` flags or a patch values file overlay, not edits to `values.yaml` directly, unless there is a change with the default value that should be reflected in `values.yaml` (e.g. a new component's `enabled` flag) - New templates: show the complete file including the `{{- if .Values..enabled }}` guard and `include "mlrun-ce.common.labels"` call - Service references within templates: use `{{ .Release.Namespace }}`, never hardcode namespace strings - After any `requirements.yaml` change: remind the user to run `make helm-update-dependencies` and commit `requirements.lock` diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index 448f4c52..94fdc2a1 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.38 +version: 0.11.0-rc.39 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index 2bff55dd..ac0bb6b3 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -463,6 +463,82 @@ test_otel_cr_installer_restart_guard() { } +# ============================================================================ +# Telemetry Env Var Tests (CEML-708) +# ============================================================================ + +# Defaults: telemetry.enabled is "" and collector is off → ENABLED=false, +# no other MLRUN_TELEMETRY__* keys emitted. +test_telemetry_default_inherits_collector_disabled() { + log_test "Telemetry - defaults inherit collector=disabled" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml") + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "false"' "Telemetry disabled by default" + assert_not_contains "$output" "MLRUN_TELEMETRY__OTLP_ENDPOINT" "No endpoint emitted when disabled" + assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "No insecure key when disabled" + assert_not_contains "$output" "MLRUN_TELEMETRY__HEADERS_SECRET_NAME" "No headers secret key when disabled" +} + +# Empty telemetry.enabled inherits from opentelemetry.collector.enabled; with +# collector on, ENABLED resolves to true and endpoint derives from the release +# namespace + configured grpc port. +test_telemetry_inherits_collector_enabled() { + log_test "Telemetry - inherits collector=enabled" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set opentelemetry.collector.enabled=true) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "Telemetry inherits enabled=true" + assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "otel-collector.default.svc.cluster.local:4317"' "Endpoint derived from in-cluster collector" + assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "true"' "Insecure default emitted" +} + +# User-supplied otlpEndpoint always wins, even with the in-cluster collector +# off — supports pointing mlrun-api at an external SaaS endpoint. +test_telemetry_external_endpoint() { + log_test "Telemetry - user external endpoint honored" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.enabled=true \ + --set telemetry.otlpEndpoint=external.com:4317 \ + --set opentelemetry.collector.enabled=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "User opt-in honored despite collector off" + assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "external.com:4317"' "User endpoint passed through verbatim" +} + +# Safety override: enabled=true with no in-cluster collector AND no user +# otlpEndpoint must force ENABLED=false to avoid silently dropping spans. +test_telemetry_safety_force_disable() { + log_test "Telemetry - safety forces disable when no listener" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.enabled=true \ + --set opentelemetry.collector.enabled=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "false"' "Safety override forces false" + assert_not_contains "$output" "MLRUN_TELEMETRY__OTLP_ENDPOINT" "No endpoint emitted when force-disabled" +} + +# headersSecretName must not be rendered as an env var when telemetry is off — +# downstream consumers shouldn't see a stale auth-headers reference. +test_telemetry_headers_secret_emitted_only_when_enabled() { + log_test "Telemetry - headers secret skipped when disabled" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.headersSecretName=my-secret \ + --set telemetry.enabled=false) + + assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "false"' "Explicit disable honored" + assert_not_contains "$output" "MLRUN_TELEMETRY__HEADERS_SECRET_NAME" "Headers secret skipped when disabled" +} + # ============================================================================ # Full Chart Render Test # ============================================================================ @@ -563,6 +639,16 @@ main() { test_otel_cr_installer_retry_counter test_otel_cr_installer_restart_guard + echo "" + echo "========================================" + echo "Telemetry Env Var Tests" + echo "========================================" + test_telemetry_default_inherits_collector_disabled + test_telemetry_inherits_collector_enabled + test_telemetry_external_endpoint + test_telemetry_safety_force_disable + test_telemetry_headers_secret_emitted_only_when_enabled + echo "" echo "========================================" echo "Full Chart Tests" From 42bda37b0f80ba16f1c03625edc6630a16d96d06 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 3 Jun 2026 10:23:28 +0300 Subject: [PATCH 3/6] version in Chart.yaml --- charts/mlrun-ce/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index d28b26cd..638bb423 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.40 +version: 0.12.0-rc.1 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png From 7c30c5d0cbc4a07ec6c949fb8d9a11b7f9b4b863 Mon Sep 17 00:00:00 2001 From: royischoss Date: Tue, 9 Jun 2026 11:28:50 +0300 Subject: [PATCH 4/6] fix cr defaults. --- charts/mlrun-ce/README.md | 20 +++++++++------ .../templates/config/mlrun-env-configmap.yaml | 6 ++++- charts/mlrun-ce/values.yaml | 8 +++++- tests/helm-template-test.sh | 25 ++++++++++++++++--- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index c9f6e320..128d9da3 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -138,16 +138,21 @@ helm --namespace mlrun upgrade my-mlrun \ #### Producer-side telemetry for mlrun-api -The top-level `telemetry` block exposes OpenTelemetry producer-side config that mlrun-api consumes as `MLRUN_TELEMETRY__*` env vars. By default `telemetry.enabled: ""` inherits from `opentelemetry.collector.enabled` — so enabling the in-cluster collector also turns mlrun-api telemetry on without any extra flag. +The top-level `telemetry` block exposes OpenTelemetry producer-side config that mlrun-api consumes as `MLRUN_TELEMETRY__*` env vars. **Out of the box, telemetry is OFF**; enabling the in-cluster collector (`opentelemetry.collector.enabled=true`) is enough to turn mlrun-api telemetry on with in-cluster defaults — no other flags required. -| Value | Default | Purpose | +All four knobs default to `""`, which means "fall back to MLRun's own default". Override only the values you want to change. + +| Value | Chart default | Effective default at mlrun-api | |---|---|---| -| `telemetry.enabled` | `""` (inherits collector state) | `"true"`/`"false"` to override explicitly | -| `telemetry.otlpEndpoint` | `""` (derives in-cluster) | Override with an external endpoint (e.g. SaaS) | -| `telemetry.insecure` | `"true"` | Set `"false"` for TLS-terminated endpoints | -| `telemetry.headersSecretName` | `""` | K8s Secret with OTLP auth headers (file-mount wiring is future work) | +| `telemetry.enabled` | `""` (inherits collector state) | `false` when collector is off, `true` when on | +| `telemetry.otlpEndpoint` | `""` (derives in-cluster) | `otel-collector..svc.cluster.local:` | +| `telemetry.insecure` | `""` (auto by endpoint) | `true` for in-cluster (plaintext); `false` when `otlpEndpoint` is user-supplied (TLS) | +| `telemetry.headersSecretName` | `""` | `""` (no auth headers) | -When `telemetry.otlpEndpoint` is blank and the in-cluster collector is on, the endpoint resolves to `otel-collector..svc.cluster.local:`. As a safety check, `telemetry.enabled=true` with no in-cluster collector AND no `otlpEndpoint` is forced to `false` to avoid silently dropping spans. +Resolution rules: +- `telemetry.otlpEndpoint` blank + collector on → in-cluster endpoint above. +- `telemetry.enabled=true` with no in-cluster collector AND no `otlpEndpoint` → forced to `false` (safety: no listener means spans would silently drop). +- A user-supplied `otlpEndpoint` always wins over the in-cluster derivation, and flips `insecure` to `false` by default (override with `--set telemetry.insecure=true` for a plaintext external listener). Example — point mlrun-api at an external OTLP endpoint without enabling the in-cluster collector: @@ -155,7 +160,6 @@ Example — point mlrun-api at an external OTLP endpoint without enabling the in helm --namespace mlrun upgrade my-mlrun \ --set telemetry.enabled=true \ --set telemetry.otlpEndpoint=otlp.example.com:4317 \ - --set telemetry.insecure=false \ mlrun/mlrun-ce ``` diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index 65fdba8d..1096ec26 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -42,12 +42,16 @@ data: {{- end }} MLRUN_TELEMETRY__ENABLED: {{ $enabled | quote }} {{- if eq $enabled "true" }} - MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }} {{- if $userEndpoint }} MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ $userEndpoint | quote }} {{- else }} MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ printf "otel-collector.%s.svc.cluster.local:%v" .Release.Namespace .Values.opentelemetry.collector.otlp.grpcPort | quote }} {{- end }} + {{- if ne ($tel.insecure | toString) "" }} + MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }} + {{- else if $userEndpoint }} + MLRUN_TELEMETRY__INSECURE: "false" + {{- end }} {{- if $tel.headersSecretName }} MLRUN_TELEMETRY__HEADERS_SECRET_NAME: {{ $tel.headersSecretName | quote }} {{- end }} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 13715369..62362dce 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -736,11 +736,17 @@ spark: # "false" (no listener — exporting would silently drop). # - otlpEndpoint: "" derives otel-collector..svc.cluster.local: # from the in-cluster collector; user-supplied value wins (e.g. external SaaS). +# - insecure: "" auto-resolves based on the endpoint — plaintext for the +# in-cluster collector (falls back to MLRun's true default), "false" when +# a user-supplied otlpEndpoint is in play (external endpoints are +# TLS-terminated by default). Explicit "true"/"false" always overrides. +# - headersSecretName: "" falls back to MLRun's own default (no auth headers). +# Only emitted as MLRUN_TELEMETRY__* env var when a value is supplied. # ============================================================================= telemetry: enabled: "" otlpEndpoint: "" - insecure: "true" + insecure: "" headersSecretName: "" # ============================================================================= diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index ac0bb6b3..41f3ed86 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -483,7 +483,9 @@ test_telemetry_default_inherits_collector_disabled() { # Empty telemetry.enabled inherits from opentelemetry.collector.enabled; with # collector on, ENABLED resolves to true and endpoint derives from the release -# namespace + configured grpc port. +# namespace + configured grpc port. INSECURE is NOT emitted by default — +# mlrun-api falls back to its own default (true, plaintext gRPC, correct for +# the in-cluster collector). test_telemetry_inherits_collector_enabled() { log_test "Telemetry - inherits collector=enabled" @@ -493,11 +495,12 @@ test_telemetry_inherits_collector_enabled() { assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "Telemetry inherits enabled=true" assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "otel-collector.default.svc.cluster.local:4317"' "Endpoint derived from in-cluster collector" - assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "true"' "Insecure default emitted" + assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "Insecure not emitted by default (mlrun-api default = true)" } # User-supplied otlpEndpoint always wins, even with the in-cluster collector -# off — supports pointing mlrun-api at an external SaaS endpoint. +# off — supports pointing mlrun-api at an external SaaS endpoint. The chart +# auto-defaults insecure=false in this path so users don't silently break TLS. test_telemetry_external_endpoint() { log_test "Telemetry - user external endpoint honored" @@ -509,6 +512,21 @@ test_telemetry_external_endpoint() { assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "User opt-in honored despite collector off" assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "external.com:4317"' "User endpoint passed through verbatim" + assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "false"' "Insecure auto-defaults to false for user endpoint (TLS)" +} + +# Explicit user override always wins over the auto-default — covers the edge +# case of a plaintext external listener (insecure=true with otlpEndpoint set). +test_telemetry_insecure_explicit_override() { + log_test "Telemetry - explicit insecure overrides auto-default" + + local output + output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ + --set telemetry.enabled=true \ + --set telemetry.otlpEndpoint=external.com:4317 \ + --set telemetry.insecure=true) + + assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "true"' "User-supplied insecure=true wins over auto-default" } # Safety override: enabled=true with no in-cluster collector AND no user @@ -646,6 +664,7 @@ main() { test_telemetry_default_inherits_collector_disabled test_telemetry_inherits_collector_enabled test_telemetry_external_endpoint + test_telemetry_insecure_explicit_override test_telemetry_safety_force_disable test_telemetry_headers_secret_emitted_only_when_enabled From f56349b92e75d246f8b42697aa1e741199c8c7c8 Mon Sep 17 00:00:00 2001 From: royischoss Date: Tue, 9 Jun 2026 11:43:37 +0300 Subject: [PATCH 5/6] fix cr defaults. --- charts/mlrun-ce/README.md | 9 ++++++-- .../templates/config/mlrun-env-configmap.yaml | 2 -- charts/mlrun-ce/values.yaml | 7 +++--- tests/helm-template-test.sh | 22 ++++++++++--------- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index 128d9da3..e00b12f3 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -146,13 +146,13 @@ All four knobs default to `""`, which means "fall back to MLRun's own default". |---|---|---| | `telemetry.enabled` | `""` (inherits collector state) | `false` when collector is off, `true` when on | | `telemetry.otlpEndpoint` | `""` (derives in-cluster) | `otel-collector..svc.cluster.local:` | -| `telemetry.insecure` | `""` (auto by endpoint) | `true` for in-cluster (plaintext); `false` when `otlpEndpoint` is user-supplied (TLS) | +| `telemetry.insecure` | `""` | `true` (MLRun default — plaintext gRPC, correct for in-cluster) | | `telemetry.headersSecretName` | `""` | `""` (no auth headers) | Resolution rules: - `telemetry.otlpEndpoint` blank + collector on → in-cluster endpoint above. - `telemetry.enabled=true` with no in-cluster collector AND no `otlpEndpoint` → forced to `false` (safety: no listener means spans would silently drop). -- A user-supplied `otlpEndpoint` always wins over the in-cluster derivation, and flips `insecure` to `false` by default (override with `--set telemetry.insecure=true` for a plaintext external listener). +- A user-supplied `otlpEndpoint` always wins over the in-cluster derivation. Example — point mlrun-api at an external OTLP endpoint without enabling the in-cluster collector: @@ -160,9 +160,14 @@ Example — point mlrun-api at an external OTLP endpoint without enabling the in helm --namespace mlrun upgrade my-mlrun \ --set telemetry.enabled=true \ --set telemetry.otlpEndpoint=otlp.example.com:4317 \ + --set telemetry.insecure=false \ mlrun/mlrun-ce ``` +> 💡 **Using a SaaS or HTTPS endpoint?** Most cloud observability providers (Grafana Cloud, Honeycomb, Datadog, etc.) require TLS. Add `--set telemetry.insecure=false` so mlrun-api negotiates HTTPS instead of plaintext — without it, the connection fails silently in the background and your dashboard stays empty (mlrun-api itself keeps working normally). +> +> SaaS providers usually also require auth headers (Bearer token, `X-Scope-OrgID`, etc.). Create a K8s Secret with one key per header, then point the chart at it with `--set telemetry.headersSecretName=my-otlp-headers`. + ### Working with ECR To work with ECR, you must create a secret with your AWS credentials and a secret with ECR Token while providing both secret names to the helm install command. diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index 1096ec26..1abb780e 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -49,8 +49,6 @@ data: {{- end }} {{- if ne ($tel.insecure | toString) "" }} MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }} - {{- else if $userEndpoint }} - MLRUN_TELEMETRY__INSECURE: "false" {{- end }} {{- if $tel.headersSecretName }} MLRUN_TELEMETRY__HEADERS_SECRET_NAME: {{ $tel.headersSecretName | quote }} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 62362dce..021342ac 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -736,10 +736,9 @@ spark: # "false" (no listener — exporting would silently drop). # - otlpEndpoint: "" derives otel-collector..svc.cluster.local: # from the in-cluster collector; user-supplied value wins (e.g. external SaaS). -# - insecure: "" auto-resolves based on the endpoint — plaintext for the -# in-cluster collector (falls back to MLRun's true default), "false" when -# a user-supplied otlpEndpoint is in play (external endpoints are -# TLS-terminated by default). Explicit "true"/"false" always overrides. +# - insecure: "" falls back to MLRun's own default (true, plaintext gRPC — +# correct for the in-cluster collector). Set "false" when pointing +# otlpEndpoint at a TLS-terminated endpoint (most SaaS providers). # - headersSecretName: "" falls back to MLRun's own default (no auth headers). # Only emitted as MLRUN_TELEMETRY__* env var when a value is supplied. # ============================================================================= diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index 41f3ed86..551f30b0 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -499,8 +499,9 @@ test_telemetry_inherits_collector_enabled() { } # User-supplied otlpEndpoint always wins, even with the in-cluster collector -# off — supports pointing mlrun-api at an external SaaS endpoint. The chart -# auto-defaults insecure=false in this path so users don't silently break TLS. +# off — supports pointing mlrun-api at an external SaaS endpoint. INSECURE is +# not auto-flipped here; mlrun-api falls back to its own default (true), and +# users targeting a TLS endpoint must explicitly set telemetry.insecure=false. test_telemetry_external_endpoint() { log_test "Telemetry - user external endpoint honored" @@ -512,21 +513,22 @@ test_telemetry_external_endpoint() { assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "User opt-in honored despite collector off" assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "external.com:4317"' "User endpoint passed through verbatim" - assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "false"' "Insecure auto-defaults to false for user endpoint (TLS)" + assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "Insecure not auto-emitted for user endpoint (mlrun-api default applies)" } -# Explicit user override always wins over the auto-default — covers the edge -# case of a plaintext external listener (insecure=true with otlpEndpoint set). -test_telemetry_insecure_explicit_override() { - log_test "Telemetry - explicit insecure overrides auto-default" +# When the user explicitly sets telemetry.insecure (e.g. =false for a TLS +# endpoint), the chart MUST emit it — otherwise the mlrun-api default of +# true would silently break TLS. +test_telemetry_insecure_emitted_when_set() { + log_test "Telemetry - insecure emitted when user overrides" local output output=$(render_template "templates/config/mlrun-env-configmap.yaml" \ --set telemetry.enabled=true \ --set telemetry.otlpEndpoint=external.com:4317 \ - --set telemetry.insecure=true) + --set telemetry.insecure=false) - assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "true"' "User-supplied insecure=true wins over auto-default" + assert_contains "$output" 'MLRUN_TELEMETRY__INSECURE: "false"' "User-supplied insecure=false passed through" } # Safety override: enabled=true with no in-cluster collector AND no user @@ -664,7 +666,7 @@ main() { test_telemetry_default_inherits_collector_disabled test_telemetry_inherits_collector_enabled test_telemetry_external_endpoint - test_telemetry_insecure_explicit_override + test_telemetry_insecure_emitted_when_set test_telemetry_safety_force_disable test_telemetry_headers_secret_emitted_only_when_enabled From f7c089fe6ebbc1fbf30ce7e6b113ac47319a08e9 Mon Sep 17 00:00:00 2001 From: royischoss Date: Tue, 9 Jun 2026 12:06:49 +0300 Subject: [PATCH 6/6] fix collector name as otlp endpoint --- charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml | 2 +- tests/helm-template-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index 1abb780e..6a6e126b 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -45,7 +45,7 @@ data: {{- if $userEndpoint }} MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ $userEndpoint | quote }} {{- else }} - MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ printf "otel-collector.%s.svc.cluster.local:%v" .Release.Namespace .Values.opentelemetry.collector.otlp.grpcPort | quote }} + MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ printf "%s-collector.%s.svc.cluster.local:%v" (include "mlrun-ce.otel.collector.fullname" .) .Release.Namespace .Values.opentelemetry.collector.otlp.grpcPort | quote }} {{- end }} {{- if ne ($tel.insecure | toString) "" }} MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }} diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index 551f30b0..8f173655 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -494,7 +494,7 @@ test_telemetry_inherits_collector_enabled() { --set opentelemetry.collector.enabled=true) assert_contains "$output" 'MLRUN_TELEMETRY__ENABLED: "true"' "Telemetry inherits enabled=true" - assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "otel-collector.default.svc.cluster.local:4317"' "Endpoint derived from in-cluster collector" + assert_contains "$output" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "test-otel-collector.default.svc.cluster.local:4317"' "Endpoint derived from in-cluster collector (uses fullname helper)" assert_not_contains "$output" "MLRUN_TELEMETRY__INSECURE" "Insecure not emitted by default (mlrun-api default = true)" }