diff --git a/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-metrics-configmap.yaml b/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-metrics-configmap.yaml
new file mode 100644
index 0000000000..94a2f79674
--- /dev/null
+++ b/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-metrics-configmap.yaml
@@ -0,0 +1,64 @@
+#
+# Copyright Kroxylicious Authors.
+#
+# Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
+#
+
+{{- if .Values.kafka.metrics.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kafka-jmx-metrics-config
+  labels:
+    app: kafka
+data:
+  kafka-metrics-config.yml: |
+    lowercaseOutputName: true
+    rules:
+      # Request handler pool idle % - near 0 means Kafka I/O thread pool is saturated. NOTE(review): 'avagidle' in the metric name below looks like a typo for 'avgidle' - confirm before anything depends on the name.
+      - pattern: "kafka.server<>RequestHandlerAvgIdlePercent"
+        name: kafka_server_requesthandlerpool_requesthandleravagidle
+        type: GAUGE
+
+      # Network processor idle % - near 0 means network threads saturated. NOTE(review): same 'avagidle' spelling as the rule above - confirm.
+      - pattern: "kafka.network<>NetworkProcessorAvgIdlePercent"
+        name: kafka_network_socketserver_networkprocessoravagidle
+        type: GAUGE
+
+      # Broker-wide byte rates - aggregate inbound/outbound throughput. NOTE(review): as shown here, this rule and the rules that follow share the literal pattern "kafka.server<>OneMinuteRate"; a selector appears to be missing between '<' and '>' - confirm the patterns against the committed file.
+      - pattern: "kafka.server<>OneMinuteRate"
+        name: kafka_server_brokertopicmetrics_bytesinpersec_rate
+        type: GAUGE
+      - pattern: "kafka.server<>OneMinuteRate"
+        name: kafka_server_brokertopicmetrics_bytesoutpersec_rate
+        type: GAUGE
+
+      # Messages in per second (broker total)
+      - pattern: "kafka.server<>OneMinuteRate"
+        name: kafka_server_brokertopicmetrics_messagesinpersec_rate
+        type: GAUGE
+
+      # Per-topic byte rates - identify which topic is hitting the ceiling. NOTE(review): the "topic" label uses "$1" but no capture group is visible in these two patterns - confirm.
+      - pattern: "kafka.server<>OneMinuteRate"
+        name: kafka_server_brokertopicmetrics_bytesinpersec_rate
+        labels:
+          topic: "$1"
+        type: GAUGE
+      - pattern: "kafka.server<>OneMinuteRate"
+        name: kafka_server_brokertopicmetrics_bytesoutpersec_rate
+        labels:
+          topic: "$1"
+        type: GAUGE
+
+      # ISR shrinks - indicates replica lag / replication bottleneck
+      - pattern: "kafka.server<>OneMinuteRate"
+        name: kafka_server_replicamanager_isrshrinks_rate
+        type: GAUGE
+
+      # Produce request latency - how long Kafka spends processing produce requests. The text captured by (\\w+) is exposed as the "quantile" label.
+      - pattern: "kafka.network<>(\\w+)"
+        name: kafka_network_requestmetrics_totaltimems_produce
+        labels:
+          quantile: "$1"
+        type: GAUGE
+{{- end }}
diff --git a/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-strimzi.yaml b/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-strimzi.yaml
index eacce86065..9bf9e8780c 100644
--- a/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-strimzi.yaml
+++ b/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/templates/kafka-strimzi.yaml
@@ -13,6 +13,14 @@ metadata:
 spec:
   kafka:
     version: {{ .Values.kafka.version }}
+    {{- if .Values.kafka.metrics.enabled }}
+    metricsConfig:  # exporter rules come from the kafka-jmx-metrics-config ConfigMap, key kafka-metrics-config.yml
+      type: jmxPrometheusExporter
+      valueFrom:
+        configMapKeyRef:
+          name: kafka-jmx-metrics-config
+          key: kafka-metrics-config.yml
+    {{- end }}
     listeners:
       - name: plain
         port: 9092
diff --git a/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/values.yaml b/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/values.yaml
index 08b29582d8..18117dc406 100644
--- a/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/values.yaml
+++ b/kroxylicious-openmessaging-benchmarks/helm/kroxylicious-benchmark/values.yaml
@@ -24,6 +24,8 @@ kafka:
   storage:
     size: "100Gi"
     storageClass: ""  # Use default storage class
+  metrics:
+    enabled: false  # Enable JMX Prometheus exporter on Kafka brokers (port 9404)
 
 # Kroxylicious proxy (enabled for proxy scenarios)
 # Requires the Kroxylicious operator to be installed in the cluster.
diff --git a/kroxylicious-openmessaging-benchmarks/scripts/poll-kafka-metrics.sh b/kroxylicious-openmessaging-benchmarks/scripts/poll-kafka-metrics.sh
new file mode 100755
index 0000000000..137ce7d0c2
--- /dev/null
+++ b/kroxylicious-openmessaging-benchmarks/scripts/poll-kafka-metrics.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+#
+# Copyright Kroxylicious Authors.
+#
+# Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
+#
+
+set -euo pipefail
+
+# Polls the Kafka JMX Prometheus exporter during a benchmark run.
+# Intended to be started as a background process by run-benchmark.sh.
+#
+# The exporter is only available when kafka.metrics.enabled=true in the Helm chart.
+# If the endpoint does not respond within the initial timeout, this script exits
+# cleanly (exit 0) so run-benchmark.sh is not disrupted when metrics are disabled.
+#
+# Each poll appends a snapshot header followed by the raw Prometheus text format output.
+# The header line format is:
+#   # SNAPSHOT datetime=<ISO-8601 UTC timestamp>
+# The Unix epoch of the same sample is written right after it as the
+# benchmark_sample_timestamp_seconds gauge.
+#
+# Usage: poll-kafka-metrics.sh <broker-pod> <namespace> <output-dir> [interval-seconds]
+
+# Prints the usage text to stderr and exits with status 1.
+usage() {
+    cat >&2 <<EOF
+Usage: poll-kafka-metrics.sh <broker-pod> <namespace> <output-dir> [interval-seconds]
+
+Polls the Kafka JMX Prometheus exporter endpoint (/metrics on port 9404) via
+kubectl port-forward and appends timestamped Prometheus snapshots to
+<output-dir>/kafka-metrics.txt.
+
+Exits cleanly (exit 0) if the endpoint does not respond within 15 seconds —
+this happens when kafka.metrics.enabled=false in the Helm chart.
+
+Arguments:
+  broker-pod        Kubernetes pod name for a Kafka broker
+  namespace         Kubernetes namespace containing the pod
+  output-dir        Directory to write kafka-metrics.txt into
+  interval-seconds  Polling interval in seconds (default: 30)
+EOF
+    exit 1
+}
+
+if [[ $# -lt 3 ]]; then
+    usage
+fi
+
+BROKER_POD="$1"
+NAMESPACE="$2"
+OUTPUT_DIR="$3"
+INTERVAL="${4:-30}"
+
+METRICS_FILE="${OUTPUT_DIR}/kafka-metrics.txt"
+LOCAL_PORT=19404
+# Upper bound (seconds) for a single /metrics request. Without it curl can block on a
+# stalled port-forward, overrunning the readiness deadline or stalling the polling loop.
+FETCH_TIMEOUT=10
+
+# Kills the kubectl port-forward, if one was started, when the script exits.
+cleanup() {
+    if [[ -n "${PF_PID:-}" ]]; then
+        kill "${PF_PID}" 2>/dev/null || true
+    fi
+}
+trap cleanup EXIT
+
+mkdir -p "${OUTPUT_DIR}"
+
+echo "Starting port-forward to ${BROKER_POD}:9404 on localhost:${LOCAL_PORT}..."
+kubectl port-forward "pod/${BROKER_POD}" "${LOCAL_PORT}:9404" \
+    -n "${NAMESPACE}" &>/dev/null &
+PF_PID=$!
+
+# Wait for endpoint to respond. Exit cleanly if it doesn't — JMX exporter is not deployed.
+echo "Waiting for Kafka JMX metrics endpoint to be ready..."
+PF_DEADLINE=$((SECONDS + 15))
+until curl -sf --max-time "${FETCH_TIMEOUT}" "http://localhost:${LOCAL_PORT}/metrics" >/dev/null 2>&1; do
+    if [[ $SECONDS -ge $PF_DEADLINE ]]; then
+        echo "Kafka JMX metrics endpoint not available on ${BROKER_POD}:9404 — skipping Kafka metrics collection" \
+            "(enable with kafka.metrics.enabled=true in cluster-overrides.yaml)" >&2
+        exit 0
+    fi
+    if ! kill -0 "${PF_PID}" 2>/dev/null; then
+        echo "Kafka metrics port-forward exited — JMX exporter likely not deployed" >&2
+        exit 0
+    fi
+    sleep 1
+done
+echo "Kafka JMX metrics endpoint ready."
+
+# Start a fresh output file (truncating any previous one) with a session header.
+{
+    echo "# kafka-metrics polling started"
+    echo "# broker=${BROKER_POD} namespace=${NAMESPACE} interval=${INTERVAL}s"
+    echo "# started=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+} > "${METRICS_FILE}"
+
+# Poll until killed; a failed fetch is recorded as a WARNING comment and polling continues.
+while true; do
+    NOW=$(date +%s)
+    {
+        echo ""
+        echo "# SNAPSHOT datetime=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+        echo "# HELP benchmark_sample_timestamp_seconds Unix timestamp of this metrics snapshot"
+        echo "# TYPE benchmark_sample_timestamp_seconds gauge"
+        echo "benchmark_sample_timestamp_seconds ${NOW}"
+        if ! curl -sf --max-time "${FETCH_TIMEOUT}" "http://localhost:${LOCAL_PORT}/metrics"; then
+            echo "# WARNING: kafka metrics fetch failed at ${NOW}"
+        fi
+    } >> "${METRICS_FILE}"
+    sleep "${INTERVAL}"
+done
diff --git a/kroxylicious-openmessaging-benchmarks/scripts/run-benchmark.sh b/kroxylicious-openmessaging-benchmarks/scripts/run-benchmark.sh
index 4db44f897e..4857ae69db 100755
--- a/kroxylicious-openmessaging-benchmarks/scripts/run-benchmark.sh
+++ b/kroxylicious-openmessaging-benchmarks/scripts/run-benchmark.sh
@@ -176,6 +176,7 @@ if ! kubectl auth can-i get pods -n "${NAMESPACE}" &>/dev/null; then
 fi
 
 METRICS_PID=""
+KAFKA_METRICS_PID=""
 LOGS_PID=""
 
 teardown() {
@@ -183,6 +184,7 @@ teardown() {
     echo "--- Tearing down benchmark infrastructure ---"
     stop_logs_tailer
     stop_metrics_poller
+    stop_kafka_metrics_poller
     if helm status "${HELM_RELEASE}" -n "${NAMESPACE}" &>/dev/null; then
         helm uninstall "${HELM_RELEASE}" -n "${NAMESPACE}" --wait --timeout 120s
     fi
@@ -291,6 +293,36 @@ stop_metrics_poller() {
     fi
 }
 
+# Starts poll-kafka-metrics.sh in the background for the first pod returned by
+# the Strimzi node-pool selector and stores its PID in KAFKA_METRICS_PID.
+# Returns without starting anything when no pod name can be resolved.
+start_kafka_metrics_poller() {
+    local kafka_pod
+    kafka_pod=$(kubectl get pod -n "${NAMESPACE}" \
+        -l "strimzi.io/cluster=kafka,strimzi.io/pool-name=kafka-pool" \
+        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || true
+    if [[ -z "${kafka_pod}" ]]; then
+        return
+    fi
+    echo "Starting Kafka JMX metrics polling (every ${METRICS_INTERVAL}s) for pod ${kafka_pod}..."
+    mkdir -p "${OUTPUT_DIR}"
+    "${SCRIPT_DIR}/poll-kafka-metrics.sh" \
+        "${kafka_pod}" "${NAMESPACE}" "${OUTPUT_DIR}" "${METRICS_INTERVAL}" &
+    KAFKA_METRICS_PID=$!
+    echo "Kafka metrics poller running (PID ${KAFKA_METRICS_PID})"
+}
+
+# Kills and reaps the background Kafka metrics poller, if one was recorded,
+# then clears KAFKA_METRICS_PID so a repeated call is a no-op.
+stop_kafka_metrics_poller() {
+    if [[ -n "${KAFKA_METRICS_PID}" ]]; then
+        echo "Stopping Kafka metrics poller (PID ${KAFKA_METRICS_PID})..."
+        kill "${KAFKA_METRICS_PID}" 2>/dev/null || true
+        wait "${KAFKA_METRICS_PID}" 2>/dev/null || true
+        KAFKA_METRICS_PID=""
+    fi
+}
+
 # Creates the results PVC if it does not already exist.
 # The PVC is not managed by Helm — it persists across probes and Helm installs.
 ensure_results_pvc() {
@@ -590,6 +622,7 @@ fi
 
 create_benchmark_job
 start_metrics_poller
+start_kafka_metrics_poller
 
 echo ""
 echo "--- Running benchmark (${SCENARIO} / ${WORKLOAD}) ---"
@@ -726,6 +759,7 @@ if [[ -n "${PROXY_POD}" ]]; then
 fi
 
 stop_metrics_poller
+stop_kafka_metrics_poller
 
 # --- Collect results ---
 