From 80138593a69f09ad0551fbc42fc11b9affad2c53 Mon Sep 17 00:00:00 2001 From: Gnani Rahul <89947795+gnanirahulnutakki@users.noreply.github.com> Date: Tue, 2 Jun 2026 15:41:30 -0500 Subject: [PATCH 1/2] feat(common-services): upgrade observability/search stack to current targets Bump the 7 observability/search subcharts to researched + live-tested targets and add the Elasticsearch/Kibana 8.x security-off configuration. Chart.yaml (chart 2.0.2 -> 2.1.0): - fluent-bit 0.48.0 -> 0.49.1 (Fluent Bit 3.2.1 -> 4.0.3) - grafana 8.10.0 -> 11.6.1 (app 11.5.1 -> 12.4.3); repo moved grafana.github.io -> grafana-community - prometheus 20.2.1 -> 25.30.1 (v2.43.0 -> v2.55.1, final 2.x LTS) - elasticsearch 7.17.3 -> 8.5.1 (frozen chart; app via imageTag 8.19.16) - kibana 7.17.3 -> 8.5.1 (frozen chart; app via imageTag 8.19.16) - opensearch 2.16.1 -> 2.37.0 (app 2.11.0 -> 2.19.5) - opensearch-dashboards 2.14.0 -> 2.33.0 (app 2.11.0 -> 2.19.5, matches engine) values.yaml: - elasticsearch: imageTag 8.19.16; security OFF (createCert/protocol/secret.enabled /xpack) + ELASTIC_PASSWORD readiness-probe shim required by the frozen 8.5.1 chart - kibana: imageTag 8.19.16; elasticsearchHosts http; documents the two secrets the frozen 8.5.1 chart hard-requires (elasticsearch-master-certs, kibana-es-token) - opensearch: prometheus-exporter plugin 2.11.0.0 -> 2.19.5.0 (MUST match the engine version exactly or it CrashLoops) -- found in live testing - grafana Elasticsearch datasource esVersion 7.17.3 -> 8.19.16 All seven validated by a live deploy+upgrade test on qa-self-managed. 
Upgrade gotchas (documented per-dependency in Chart.yaml): - prometheus: helm upgrade fails on the immutable StatefulSet selector -> delete the prometheus-server STS (PVC retained) then re-upgrade - opensearch-dashboards: Deployment selector is immutable -> delete the Deployment (stateless) then re-upgrade - opensearch engine upgrades in place (data retained); the exporter plugin must be bumped in lockstep with the app version --- charts/common-services/Chart.yaml | 38 ++++++++++++++++++++---------- charts/common-services/values.yaml | 38 ++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/charts/common-services/Chart.yaml b/charts/common-services/Chart.yaml index 1f8c558..e01adec 100644 --- a/charts/common-services/Chart.yaml +++ b/charts/common-services/Chart.yaml @@ -10,7 +10,7 @@ type: application # Chart Version # ------------- # This version should be incremented following Semantic Versioning (https://semver.org/) whenever the chart's structure or templates change. -version: 2.0.2 +version: 2.1.0 # Application Version (Informational) # ----------------------------------- @@ -45,21 +45,27 @@ dependencies: repository: https://argoproj.github.io/argo-helm condition: argo-cd.enabled - name: prometheus - version: 20.2.1 - # version: 15.13.0 + # app v2.43.0 -> v2.55.1 (final 2.x LTS). NOTE: helm upgrade fails on the immutable + # StatefulSet selector — delete the prometheus-server STS (PVC retained) then re-upgrade. + version: 25.30.1 repository: https://prometheus-community.github.io/helm-charts condition: prometheus.enabled - name: grafana - version: 8.10.0 - # version: 6.40.0 - repository: https://grafana.github.io/helm-charts + # chart repo moved grafana.github.io -> grafana-community (grafana.github.io stopped + # publishing the grafana chart after 2026-01-30). app 11.5.1 -> 12.4.3. 
+ version: 11.6.1 + repository: https://grafana-community.github.io/helm-charts condition: grafana.enabled - name: elasticsearch - version: 7.17.3 + # chart frozen at 8.5.1 (elastic/helm-charts archived); app driven via imageTag 8.19.16. + # 8.x defaults security ON — values.yaml sets it OFF to match the 7.17 posture. + version: 8.5.1 repository: https://helm.elastic.co condition: elasticsearch.enabled - name: kibana - version: 7.17.3 + # locked to the elasticsearch version; chart frozen at 8.5.1, app via imageTag 8.19.16. + # requires the elasticsearch-master-certs + kibana-es-token secrets (see values.yaml). + version: 8.5.1 repository: https://helm.elastic.co condition: kibana.enabled - name: haproxy @@ -80,16 +86,24 @@ dependencies: repository: https://helm.runix.net condition: pgadmin4.enabled - name: opensearch - version: 2.16.1 + # app 2.11.0 -> 2.19.5 (stay on 2.x / Lucene 9 — engine StatefulSet upgrades IN PLACE, + # data retained, no reindex). NOTE: the pinned prometheus-exporter plugin in values.yaml + # MUST be bumped to the matching app version (2.19.5.0) or the engine CrashLoops on start. + # 2.19.x is also the required stepping stone for any future 3.x jump. + version: 2.37.0 repository: https://opensearch-project.github.io/helm-charts condition: opensearch.enabled - name: opensearch-dashboards - version: 2.14.0 + # chart 2.33.0 -> app 2.19.5, matching the engine exactly (chart number != app version here). + # Dashboards must equal the engine major.minor (and be <= it). NOTE: the Deployment selector + # changed vs 2.14.0, so helm upgrade fails 'field is immutable' — delete the + # opensearch-dashboards Deployment (stateless) then re-upgrade. + version: 2.33.0 repository: https://opensearch-project.github.io/helm-charts condition: opensearch-dashboards.enabled - name: fluent-bit - version: 0.48.0 - # version: 0.39.0 + # app 3.2.1 -> 4.0.3 (single 3.x->4.x boundary; classic .conf still accepted). 
+ version: 0.49.1 repository: https://fluent.github.io/helm-charts condition: fluent-bit.enabled - name: velero diff --git a/charts/common-services/values.yaml b/charts/common-services/values.yaml index 9bbe9f1..cbdc5b9 100644 --- a/charts/common-services/values.yaml +++ b/charts/common-services/values.yaml @@ -313,7 +313,7 @@ grafana: access: proxy isDefault: false jsonData: - esVersion: '7.17.3' + esVersion: '8.19.16' logLevelField: fields.level logMessageField: message maxConcurrentShardRequests: 5 @@ -376,11 +376,26 @@ grafana: elasticsearch: enabled: true - imageTag: "7.17.25" + # ES 8.x driven via imageTag on the frozen elastic/elasticsearch 8.5.1 chart. + imageTag: "8.19.16" replicas: 1 nodeSelector: {} service: type: ClusterIP + # --- ES 8.x with security OFF (preserves the previous 7.17 posture) --- + # 8.x turns security/TLS ON by default; these turn it back off (plain http, no auth). + createCert: false + protocol: http + secret: + enabled: false + # The frozen 8.5.1 chart's readiness probe requires ELASTIC_PASSWORD to be SET even + # when security is disabled (ES ignores the value). Without it the pod never goes Ready. + extraEnvs: + - name: ELASTIC_PASSWORD + value: "security-disabled" + esConfig: + elasticsearch.yml: | + xpack.security.enabled: false # Persistence enabled by default and size to 100Gi volumeClaimTemplate: resources: @@ -400,7 +415,17 @@ elasticsearch: kibana: enabled: true - imageTag: "7.17.25" + # Kibana 8.x via imageTag on the frozen kibana 8.5.1 chart; MUST match the ES version. + imageTag: "8.19.16" + # ES security is OFF, so reach ES over http with no credentials (chart default is https). + elasticsearchHosts: "http://elasticsearch-master:9200" + # NOTE: the frozen kibana 8.5.1 chart UNCONDITIONALLY mounts an elasticsearch-master-certs + # secret and requires a kibana-es-token service-account-token secret — both are normally + # created by the security-ON helm hooks. 
With security off, pre-create them once per namespace: + # kubectl -n YOUR_NAMESPACE create secret generic elasticsearch-master-certs \ + # --from-file=ca.crt=dummy.crt --from-file=tls.crt=dummy.crt --from-file=tls.key=dummy.key + # kubectl -n YOUR_NAMESPACE create secret generic kibana-es-token --from-literal=token=security-disabled + # (ES ignores the bogus token with security off; the cert is mounted but unused over http.) fullnameOverride: kibana nodeSelector: {} service: @@ -964,11 +989,14 @@ opensearch: value: "true" - name: "DISABLE_INSTALL_DEMO_CONFIG" value: "true" - # Install Prometheus exporter plugin for metrics + # Install Prometheus exporter plugin for metrics. + # IMPORTANT: this Aiven plugin version MUST match the OpenSearch app version EXACTLY, + # or the engine refuses to start ("plugin built for X but version Y is running"). + # Bump it in lockstep on every OpenSearch upgrade (app 2.19.5 -> plugin 2.19.5.0). plugins: enabled: true installList: - - https://github.com/Aiven-Open/prometheus-exporter-plugin-for-opensearch/releases/download/2.11.0.0/prometheus-exporter-2.11.0.0.zip + - https://github.com/Aiven-Open/prometheus-exporter-plugin-for-opensearch/releases/download/2.19.5.0/prometheus-exporter-2.19.5.0.zip # Add pod annotations for Prometheus scraping podAnnotations: prometheus.io/scrape: "true" From c95f2d9595013c9ef1ede081f268a600e95df3b5 Mon Sep 17 00:00:00 2001 From: Gnani Rahul <89947795+gnanirahulnutakki@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:28:55 -0500 Subject: [PATCH 2/2] feat(common-services): hands-free upgrade migration Job (helm + ArgoCD) Automate the manual steps the stack upgrade otherwise requires, so a plain `helm upgrade` or an ArgoCD sync from an older common-services release to this one completes without operator intervention. 
New templates/upgrade-migration/ (Job + script ConfigMap + namespaced RBAC), modeled on the existing crds-installer: runs as BOTH a Helm pre-install/ pre-upgrade hook AND an ArgoCD PreSync hook (weight -5 RBAC/CM, 0 Job). It: 1. Creates the elasticsearch-master-certs + kibana-es-token placeholder secrets the frozen kibana 8.5.1 chart hard-requires when ES security is OFF (only if absent). 2. Deletes the prometheus-server StatefulSet and opensearch-dashboards Deployment ONLY while they still carry the legacy selector labels (selector is immutable and changed in the new subcharts). Detects "already migrated" by the presence of the app.kubernetes.io/name selector label. StatefulSet PVCs are retained, so TSDB/index data survive. Fully IDEMPOTENT: on a cluster already on the new versions every check is a no-op and the Job exits 0 silently, so it is safe on every consecutive sync. values.yaml: new `upgradeMigration` block (enabled: true by default; elasticsearchSecurityOffSecrets toggle; image/resources). Validated on qa-self-managed: (a) fabricated legacy-selector StatefulSet is deleted while a new-selector one is skipped; (b) the real Job run against the already-migrated cluster skips all four items and completes in ~16s. 
--- .../upgrade-migration/configmap.yaml | 64 +++++++++++++++++++ .../templates/upgrade-migration/job.yaml | 50 +++++++++++++++ .../templates/upgrade-migration/rbac.yaml | 54 ++++++++++++++++ charts/common-services/values.yaml | 36 +++++++++++ 4 files changed, 204 insertions(+) create mode 100644 charts/common-services/templates/upgrade-migration/configmap.yaml create mode 100644 charts/common-services/templates/upgrade-migration/job.yaml create mode 100644 charts/common-services/templates/upgrade-migration/rbac.yaml diff --git a/charts/common-services/templates/upgrade-migration/configmap.yaml b/charts/common-services/templates/upgrade-migration/configmap.yaml new file mode 100644 index 0000000..2153209 --- /dev/null +++ b/charts/common-services/templates/upgrade-migration/configmap.yaml @@ -0,0 +1,64 @@ +{{- if .Values.upgradeMigration.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "common-services.name" . }}-upgrade-migration-script + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + "argocd.argoproj.io/hook": PreSync + "argocd.argoproj.io/hook-delete-policy": HookSucceeded,BeforeHookCreation +data: + migrate.sh: | + #!/bin/sh + # Idempotent upgrade migration. On a cluster already on the new versions every + # check below is a no-op and the script exits 0 silently. 
+ set -eu + NS="{{ .Release.Namespace }}" + echo "==> common-services upgrade-migration (namespace: ${NS})" + + # ---- 1) ES/Kibana 8.x security-off placeholder secrets (create only if absent) ---- + DO_ES_SHIM="{{ if and .Values.upgradeMigration.elasticsearchSecurityOffSecrets .Values.elasticsearch.enabled .Values.kibana.enabled }}true{{ else }}false{{ end }}" + if [ "${DO_ES_SHIM}" = "true" ]; then + if kubectl -n "${NS}" get secret elasticsearch-master-certs >/dev/null 2>&1; then + echo " ok secret/elasticsearch-master-certs present - skip" + else + echo " + creating placeholder secret/elasticsearch-master-certs (ES security-off shim)" + openssl req -x509 -newkey rsa:2048 -nodes -keyout /tmp/tls.key -out /tmp/tls.crt -days 3650 -subj "/CN=security-disabled" >/dev/null 2>&1 + kubectl -n "${NS}" create secret generic elasticsearch-master-certs \ + --from-file=ca.crt=/tmp/tls.crt --from-file=tls.crt=/tmp/tls.crt --from-file=tls.key=/tmp/tls.key + fi + if kubectl -n "${NS}" get secret kibana-es-token >/dev/null 2>&1; then + echo " ok secret/kibana-es-token present - skip" + else + echo " + creating placeholder secret/kibana-es-token (ES security-off shim)" + kubectl -n "${NS}" create secret generic kibana-es-token --from-literal=token=security-disabled + fi + fi + + # ---- 2) Delete workloads whose immutable selector changed, ONLY while still on the + # legacy selector. Detect "already migrated" by the presence of the new + # app.kubernetes.io/name selector label -> then it is a silent no-op. ---- + migrate_workload() { + kind="$1"; name="$2" + if ! 
kubectl -n "${NS}" get "${kind}" "${name}" >/dev/null 2>&1; then + echo " ok ${kind}/${name} absent - nothing to migrate" + return 0 + fi + if kubectl -n "${NS}" get "${kind}" "${name}" -o "jsonpath={.spec.selector.matchLabels.app\.kubernetes\.io/name}" 2>/dev/null | grep -q .; then + echo " ok ${kind}/${name} already on the new selector - skip" + return 0 + fi + echo " ~ ${kind}/${name} on the LEGACY selector -> deleting so the new selector applies" + echo " (StatefulSet PVCs are retained from volumeClaimTemplates - data survives)" + kubectl -n "${NS}" delete "${kind}" "${name}" --wait=true --timeout=180s + echo " + ${kind}/${name} deleted; the sync will recreate it with the new selector" + } + + migrate_workload statefulset "{{ dig "server" "fullnameOverride" (printf "%s-prometheus-server" .Release.Name) .Values.prometheus }}" + migrate_workload deployment "{{ dig "fullnameOverride" (printf "%s-opensearch-dashboards" .Release.Name) (index .Values "opensearch-dashboards") }}" + + echo "==> common-services upgrade-migration complete" +{{- end }} diff --git a/charts/common-services/templates/upgrade-migration/job.yaml b/charts/common-services/templates/upgrade-migration/job.yaml new file mode 100644 index 0000000..e2137f5 --- /dev/null +++ b/charts/common-services/templates/upgrade-migration/job.yaml @@ -0,0 +1,50 @@ +{{- if .Values.upgradeMigration.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "common-services.name" . }}-upgrade-migration + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + "argocd.argoproj.io/hook": PreSync + "argocd.argoproj.io/hook-delete-policy": HookSucceeded,BeforeHookCreation +spec: + backoffLimit: 2 + activeDeadlineSeconds: 300 + ttlSecondsAfterFinished: 120 + template: + metadata: + name: {{ include "common-services.name" . 
}}-upgrade-migration + spec: + serviceAccountName: {{ include "common-services.name" . }}-upgrade-migration + restartPolicy: Never + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: scripts + configMap: + name: {{ include "common-services.name" . }}-upgrade-migration-script + defaultMode: 0755 + containers: + - name: upgrade-migration + image: "{{ .Values.upgradeMigration.image.repository }}:{{ .Values.upgradeMigration.image.tag }}" + imagePullPolicy: {{ .Values.upgradeMigration.image.pullPolicy }} + resources: + {{- toYaml .Values.upgradeMigration.resources | nindent 12 }} + volumeMounts: + - name: scripts + mountPath: /etc/scripts + command: + - /bin/sh + - -c + - | + set -eu + export LANG=C.UTF-8 LC_ALL=C.UTF-8 + echo "Installing kubectl + openssl..." + apk add --no-cache kubectl openssl >/dev/null + /etc/scripts/migrate.sh +{{- end }} diff --git a/charts/common-services/templates/upgrade-migration/rbac.yaml b/charts/common-services/templates/upgrade-migration/rbac.yaml new file mode 100644 index 0000000..6675262 --- /dev/null +++ b/charts/common-services/templates/upgrade-migration/rbac.yaml @@ -0,0 +1,54 @@ +{{- if .Values.upgradeMigration.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "common-services.name" . }}-upgrade-migration + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + "argocd.argoproj.io/hook": PreSync + "argocd.argoproj.io/hook-delete-policy": HookSucceeded,BeforeHookCreation +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "common-services.name" . 
}}-upgrade-migration + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + "argocd.argoproj.io/hook": PreSync + "argocd.argoproj.io/hook-delete-policy": HookSucceeded,BeforeHookCreation +rules: + # delete the immutable-selector workloads so the new selector can be applied (watch: used by `kubectl delete --wait` if the object lingers) + - apiGroups: ["apps"] + resources: ["statefulsets", "deployments"] + verbs: ["get", "list", "watch", "delete"] + # create the ES/Kibana security-off placeholder secrets if they are absent + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "common-services.name" . }}-upgrade-migration + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation + "argocd.argoproj.io/hook": PreSync + "argocd.argoproj.io/hook-delete-policy": HookSucceeded,BeforeHookCreation +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "common-services.name" . }}-upgrade-migration +subjects: + - kind: ServiceAccount + name: {{ include "common-services.name" . 
}}-upgrade-migration + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/charts/common-services/values.yaml b/charts/common-services/values.yaml index cbdc5b9..943a591 100644 --- a/charts/common-services/values.yaml +++ b/charts/common-services/values.yaml @@ -2106,3 +2106,39 @@ prometheus-elasticsearch-exporter: # enabled: false # fluent-bit: # enabled: false + +# ================================================================= +# Upgrade Migration (hands-free helm upgrade / ArgoCD sync) +# ----------------------------------------------------------------- +# A pre-sync Job that performs the idempotent steps a plain `helm upgrade` +# or an ArgoCD sync cannot do on their own when moving an OLDER +# common-services release up to this one: +# 1. Creates the placeholder secrets the frozen Kibana 8.5.1 chart +# hard-requires when Elasticsearch security is disabled +# (elasticsearch-master-certs, kibana-es-token). +# 2. Deletes the prometheus-server StatefulSet and the +# opensearch-dashboards Deployment ONLY while they still carry the +# LEGACY selector labels (the selector is immutable and changed in the +# new subcharts). StatefulSet PVCs are retained, so TSDB/index data +# survive the recreate. +# Runs as BOTH a Helm pre-install/pre-upgrade hook AND an ArgoCD PreSync +# hook. It is fully IDEMPOTENT: on a cluster already on the new versions it +# inspects the live selectors/secrets, finds nothing to do, and exits 0 +# silently — safe to leave enabled across every sync/upgrade. +upgradeMigration: + enabled: true + image: + repository: alpine + tag: "3.20" + pullPolicy: IfNotPresent + # Create the elasticsearch-master-certs + kibana-es-token placeholder secrets + # required by the kibana 8.5.1 chart when ES security is OFF. Set to false if you + # enable ES security (the elasticsearch chart's own hooks then create them). + elasticsearchSecurityOffSecrets: true + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi