Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/mlrun-ce/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v1
name: mlrun-ce
version: 0.11.0
version: 0.12.0-rc.1
description: MLRun Open Source Stack
home: https://iguazio.com
icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png
Expand Down
32 changes: 32 additions & 0 deletions charts/mlrun-ce/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,38 @@ helm --namespace mlrun upgrade my-mlrun \

> **Note:** The above assumes a single-namespace installation. For multi-namespace (admin/non-admin) deployments, refer to the MLRun documentation.

#### Producer-side telemetry for mlrun-api

The top-level `telemetry` block exposes OpenTelemetry producer-side config that mlrun-api consumes as `MLRUN_TELEMETRY__*` env vars. **Out of the box, telemetry is OFF**; enabling the in-cluster collector (`opentelemetry.collector.enabled=true`) is enough to turn mlrun-api telemetry on with in-cluster defaults — no other flags required.

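All four knobs default to `""`. For `telemetry.enabled` and `telemetry.otlpEndpoint` that means "derive the value from the in-cluster collector settings"; for `telemetry.insecure` and `telemetry.headersSecretName` it means "fall back to MLRun's own default". Override only the values you want to change.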

| Value | Chart default | Effective default at mlrun-api |
|---|---|---|
| `telemetry.enabled` | `""` (inherits collector state) | `false` when collector is off, `true` when on |
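| `telemetry.otlpEndpoint` | `""` (derives in-cluster) | `<collector-fullname>-collector.<release-ns>.svc.cluster.local:<grpcPort>`, where `<collector-fullname>` is the output of the chart's `mlrun-ce.otel.collector.fullname` helper |
| `telemetry.insecure` | `""` | `true` (MLRun default — plaintext gRPC, correct for in-cluster) |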
| `telemetry.headersSecretName` | `""` | `""` (no auth headers) |

Resolution rules:
- `telemetry.otlpEndpoint` blank + collector on → in-cluster endpoint above.
- `telemetry.enabled=true` with no in-cluster collector AND no `otlpEndpoint` → forced to `false` (safety: no listener means spans would silently drop).
- A user-supplied `otlpEndpoint` always wins over the in-cluster derivation.

Example — point mlrun-api at an external OTLP endpoint without enabling the in-cluster collector:

```bash
helm --namespace mlrun upgrade my-mlrun \
--set telemetry.enabled=true \
--set telemetry.otlpEndpoint=otlp.example.com:4317 \
--set telemetry.insecure=false \
mlrun/mlrun-ce
```

> 💡 **Using a SaaS or HTTPS endpoint?** Most cloud observability providers (Grafana Cloud, Honeycomb, Datadog, etc.) require TLS. Add `--set telemetry.insecure=false` so mlrun-api negotiates HTTPS instead of plaintext — without it, the connection fails silently in the background and your dashboard stays empty (mlrun-api itself keeps working normally).
>
> SaaS providers usually also require auth headers (Bearer token, `X-Scope-OrgID`, etc.). Create a K8s Secret with one key per header, then point the chart at it with `--set telemetry.headersSecretName=my-otlp-headers`.

### Working with ECR

To work with ECR, you must create a secret with your AWS credentials and a secret with ECR Token while providing both secret names to the helm install command.
Expand Down
26 changes: 26 additions & 0 deletions charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,30 @@ data:
MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__LIMITS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.limits.memory | default "" }}"
MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__REQUESTS__CPU: "{{ .Values.mlrun.defaultFunctionPodResources.requests.cpu | default "" }}"
MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__REQUESTS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.requests.memory | default "" }}"
# OpenTelemetry producer-side env vars (CEML-708). See top-level `telemetry`
# block in values.yaml. ENABLED is always emitted; others only when ENABLED=true.
{{- /*
  Inputs for the resolution below:
    $tel              - the top-level `telemetry` values block
    $collectorEnabled - the in-cluster collector toggle
    $userEndpoint     - user-supplied OTLP endpoint ("" when not set)
    $enabled          - string form of telemetry.enabled; "" means "not set by the user"
*/}}
{{- $tel := .Values.telemetry }}
{{- $collectorEnabled := .Values.opentelemetry.collector.enabled }}
{{- $userEndpoint := $tel.otlpEndpoint }}
{{- $enabled := $tel.enabled | toString }}
{{- /* Blank telemetry.enabled inherits the collector toggle. */}}
{{- if eq $enabled "" }}
{{- $enabled = $collectorEnabled | toString }}
{{- end }}
{{- /* Safety override: enabled with neither an in-cluster collector nor a
       user-supplied endpoint is forced back to "false". */}}
{{- if and (eq $enabled "true") (not $collectorEnabled) (not $userEndpoint) }}
{{- $enabled = "false" }}
{{- end }}
{{- /* The resolved flag is always rendered; every other key is gated on it. */}}
MLRUN_TELEMETRY__ENABLED: {{ $enabled | quote }}
{{- if eq $enabled "true" }}
{{- /* A user-supplied endpoint wins; otherwise build the in-cluster address
       from the collector fullname helper, the release namespace and the
       configured OTLP gRPC port. */}}
{{- if $userEndpoint }}
MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ $userEndpoint | quote }}
{{- else }}
MLRUN_TELEMETRY__OTLP_ENDPOINT: {{ printf "%s-collector.%s.svc.cluster.local:%v" (include "mlrun-ce.otel.collector.fullname" .) .Release.Namespace .Values.opentelemetry.collector.otlp.grpcPort | quote }}
{{- end }}
{{- /* Compare the string form so an explicit boolean false still counts as
       "set"; only a blank value is skipped. */}}
{{- if ne ($tel.insecure | toString) "" }}
MLRUN_TELEMETRY__INSECURE: {{ $tel.insecure | quote }}
{{- end }}
{{- /* Headers secret name is rendered only when one was supplied. */}}
{{- if $tel.headersSecretName }}
MLRUN_TELEMETRY__HEADERS_SECRET_NAME: {{ $tel.headersSecretName | quote }}
{{- end }}
{{- end }}
{{- end}}
25 changes: 25 additions & 0 deletions charts/mlrun-ce/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,31 @@ kafka:
spark:
enabled: true

# =============================================================================
# Shared OpenTelemetry producer-side configuration (CEML-708)
# Top-level so any service in the bundle (mlrun-api today; Nuclio and future
# components later) reads the same OTel knobs. Wired into mlrun-api via
# MLRUN_TELEMETRY__* env vars in templates/config/mlrun-env-configmap.yaml.
#
# Resolution at render time:
# - enabled: "" inherits from .Values.opentelemetry.collector.enabled;
# explicit "true"/"false" overrides. Safety: enabled="true" with no
# in-cluster collector AND no user-supplied otlpEndpoint is forced to
# "false" (no listener — exporting would silently drop).
# - otlpEndpoint: "" derives <collector-fullname>-collector.<release-ns>.svc.cluster.local:<grpcPort>
# from the in-cluster collector; user-supplied value wins (e.g. external SaaS).
# - insecure: "" falls back to MLRun's own default (true, plaintext gRPC —
# correct for the in-cluster collector). Set "false" when pointing
# otlpEndpoint at a TLS-terminated endpoint (most SaaS providers).
# - headersSecretName: "" falls back to MLRun's own default (no auth headers).
# Only emitted as MLRUN_TELEMETRY__* env var when a value is supplied.
# =============================================================================
telemetry:
Comment thread
royischoss marked this conversation as resolved.
enabled: ""
otlpEndpoint: ""
insecure: ""
headersSecretName: ""

# =============================================================================
# OpenTelemetry Operator configuration
# Installs the OpenTelemetry Operator for managing collectors and instrumentation
Expand Down
107 changes: 107 additions & 0 deletions tests/helm-template-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,102 @@ test_otel_cr_installer_restart_guard() {
}


# ============================================================================
# Telemetry Env Var Tests (CEML-708)
# ============================================================================

# Chart defaults: telemetry.enabled is "" and the collector is off, so the
# rendered ConfigMap must carry ENABLED="false" and none of the other
# MLRUN_TELEMETRY__* keys.
test_telemetry_default_inherits_collector_disabled() {
    log_test "Telemetry - defaults inherit collector=disabled"

    local configmap_tpl="templates/config/mlrun-env-configmap.yaml"
    local rendered
    rendered=$(render_template "$configmap_tpl")

    # The flag itself is always present ...
    assert_contains "$rendered" 'MLRUN_TELEMETRY__ENABLED: "false"' "Telemetry disabled by default"

    # ... while every dependent key must be absent.
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__OTLP_ENDPOINT" "No endpoint emitted when disabled"
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__INSECURE" "No insecure key when disabled"
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__HEADERS_SECRET_NAME" "No headers secret key when disabled"
}

# A blank telemetry.enabled follows opentelemetry.collector.enabled. With the
# collector switched on, ENABLED becomes "true" and the endpoint is derived
# from the release namespace and the configured gRPC port. INSECURE is NOT
# rendered by default — mlrun-api then applies its own default (true,
# plaintext gRPC, which is correct for the in-cluster collector).
test_telemetry_inherits_collector_enabled() {
    log_test "Telemetry - inherits collector=enabled"

    local configmap_tpl="templates/config/mlrun-env-configmap.yaml"
    local -a set_flags=(
        --set opentelemetry.collector.enabled=true
    )
    local rendered
    rendered=$(render_template "$configmap_tpl" "${set_flags[@]}")

    assert_contains "$rendered" 'MLRUN_TELEMETRY__ENABLED: "true"' "Telemetry inherits enabled=true"
    assert_contains "$rendered" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "test-otel-collector.default.svc.cluster.local:4317"' "Endpoint derived from in-cluster collector (uses fullname helper)"
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__INSECURE" "Insecure not emitted by default (mlrun-api default = true)"
}

# An explicit otlpEndpoint takes precedence even when the in-cluster collector
# is off — this is how mlrun-api is pointed at an external SaaS endpoint.
# INSECURE is not flipped automatically: mlrun-api keeps its own default
# (true), so users targeting a TLS endpoint must set telemetry.insecure=false
# themselves.
test_telemetry_external_endpoint() {
    log_test "Telemetry - user external endpoint honored"

    local configmap_tpl="templates/config/mlrun-env-configmap.yaml"
    local -a set_flags=(
        --set telemetry.enabled=true
        --set telemetry.otlpEndpoint=external.com:4317
        --set opentelemetry.collector.enabled=false
    )
    local rendered
    rendered=$(render_template "$configmap_tpl" "${set_flags[@]}")

    assert_contains "$rendered" 'MLRUN_TELEMETRY__ENABLED: "true"' "User opt-in honored despite collector off"
    assert_contains "$rendered" 'MLRUN_TELEMETRY__OTLP_ENDPOINT: "external.com:4317"' "User endpoint passed through verbatim"
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__INSECURE" "Insecure not auto-emitted for user endpoint (mlrun-api default applies)"
}

# An explicit telemetry.insecure (e.g. =false for a TLS endpoint) MUST reach
# the ConfigMap — if it were dropped, the mlrun-api default of true would
# silently break TLS.
test_telemetry_insecure_emitted_when_set() {
    log_test "Telemetry - insecure emitted when user overrides"

    local configmap_tpl="templates/config/mlrun-env-configmap.yaml"
    local -a set_flags=(
        --set telemetry.enabled=true
        --set telemetry.otlpEndpoint=external.com:4317
        --set telemetry.insecure=false
    )
    local rendered
    rendered=$(render_template "$configmap_tpl" "${set_flags[@]}")

    assert_contains "$rendered" 'MLRUN_TELEMETRY__INSECURE: "false"' "User-supplied insecure=false passed through"
}

# Safety net: asking for enabled=true while there is neither an in-cluster
# collector nor a user otlpEndpoint must render ENABLED="false", so spans are
# never exported to a non-existent listener.
test_telemetry_safety_force_disable() {
    log_test "Telemetry - safety forces disable when no listener"

    local configmap_tpl="templates/config/mlrun-env-configmap.yaml"
    local -a set_flags=(
        --set telemetry.enabled=true
        --set opentelemetry.collector.enabled=false
    )
    local rendered
    rendered=$(render_template "$configmap_tpl" "${set_flags[@]}")

    assert_contains "$rendered" 'MLRUN_TELEMETRY__ENABLED: "false"' "Safety override forces false"
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__OTLP_ENDPOINT" "No endpoint emitted when force-disabled"
}

# With telemetry off, headersSecretName must not surface as an env var —
# downstream consumers should never see a stale auth-headers reference.
test_telemetry_headers_secret_emitted_only_when_enabled() {
    log_test "Telemetry - headers secret skipped when disabled"

    local configmap_tpl="templates/config/mlrun-env-configmap.yaml"
    local -a set_flags=(
        --set telemetry.headersSecretName=my-secret
        --set telemetry.enabled=false
    )
    local rendered
    rendered=$(render_template "$configmap_tpl" "${set_flags[@]}")

    assert_contains "$rendered" 'MLRUN_TELEMETRY__ENABLED: "false"' "Explicit disable honored"
    assert_not_contains "$rendered" "MLRUN_TELEMETRY__HEADERS_SECRET_NAME" "Headers secret skipped when disabled"
}

# ============================================================================
# Full Chart Render Test
# ============================================================================
Expand Down Expand Up @@ -563,6 +659,17 @@ main() {
test_otel_cr_installer_retry_counter
test_otel_cr_installer_restart_guard

echo ""
echo "========================================"
echo "Telemetry Env Var Tests"
echo "========================================"
test_telemetry_default_inherits_collector_disabled
test_telemetry_inherits_collector_enabled
test_telemetry_external_endpoint
test_telemetry_insecure_emitted_when_set
test_telemetry_safety_force_disable
test_telemetry_headers_secret_emitted_only_when_enabled

echo ""
echo "========================================"
echo "Full Chart Tests"
Expand Down
Loading