From 5501e2efe4768c7fd48393a765940354553c1cb6 Mon Sep 17 00:00:00 2001 From: Vladislav Antonov Date: Sat, 14 Mar 2026 20:04:56 +0200 Subject: [PATCH 1/6] Fix db preparation script execution --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 85ba20d..efa5072 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,6 +44,7 @@ ENV PATH="/opt/venv/bin:$PATH" # Copy application source and packaging metadata COPY app/ ./app/ +COPY scripts/ ./scripts/ COPY pyproject.toml README.md ./ # Create an editable install so all sub-packages (app.api, app.core, …) are From cc2b0228555f37733942333a108241eac98d0589 Mon Sep 17 00:00:00 2001 From: Vladislav Antonov Date: Sat, 14 Mar 2026 20:23:39 +0200 Subject: [PATCH 2/6] Add histograms to p95 values --- .../dashboards/commerce-observability-p2.json | 14 +++++ .../dashboards/commerce-observability-p3.json | 56 +++++++++++++++++++ .../dashboards/commerce-observability.json | 28 ++++++++++ 3 files changed, 98 insertions(+) diff --git a/observability/grafana/dashboards/commerce-observability-p2.json b/observability/grafana/dashboards/commerce-observability-p2.json index 0d34856..a296c3c 100644 --- a/observability/grafana/dashboards/commerce-observability-p2.json +++ b/observability/grafana/dashboards/commerce-observability-p2.json @@ -105,6 +105,20 @@ "legendFormat": "{{reason}}" } ] + }, + { + "id": 8, + "type": "heatmap", + "title": "Histogram - Search Result Count Buckets", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_search_result_count_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] } ] } diff --git a/observability/grafana/dashboards/commerce-observability-p3.json b/observability/grafana/dashboards/commerce-observability-p3.json index 63c6c18..0ec3ea6 100644 --- a/observability/grafana/dashboards/commerce-observability-p3.json +++ 
b/observability/grafana/dashboards/commerce-observability-p3.json @@ -138,6 +138,62 @@ "queryType": "range" } ] + }, + { + "id": 9, + "type": "heatmap", + "title": "Histogram - Response Time Buckets (s)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_response_time_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 10, + "type": "heatmap", + "title": "Histogram - Processing Time Buckets (s)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_processing_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 11, + "type": "heatmap", + "title": "Histogram - Queue Wait Time Buckets (s)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_queue_wait_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 12, + "type": "heatmap", + "title": "Histogram - Response Payload Size Buckets (bytes)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_response_payload_size_bytes_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] } ] } diff --git a/observability/grafana/dashboards/commerce-observability.json b/observability/grafana/dashboards/commerce-observability.json index 7a4de44..7f967ee 100644 --- a/observability/grafana/dashboards/commerce-observability.json +++ b/observability/grafana/dashboards/commerce-observability.json @@ -122,6 +122,34 @@ "filters": [] } ] + }, + { + "id": 9, + "type": "heatmap", + "title": "Histogram - Request Latency Buckets (s)", 
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_request_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 10, + "type": "heatmap", + "title": "Histogram - DB Query Duration Buckets (s)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_db_query_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] } ] } From a820e8afc17dfd06e9090847f99258cfc88b6aca Mon Sep 17 00:00:00 2001 From: Vladislav Antonov Date: Sat, 14 Mar 2026 20:53:03 +0200 Subject: [PATCH 3/6] Fix Grafana alert setup --- observability/grafana/provisioning/datasources/datasources.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/observability/grafana/provisioning/datasources/datasources.yml b/observability/grafana/provisioning/datasources/datasources.yml index b867d09..503d4e7 100644 --- a/observability/grafana/provisioning/datasources/datasources.yml +++ b/observability/grafana/provisioning/datasources/datasources.yml @@ -32,4 +32,5 @@ datasources: url: http://loki:3100 editable: true jsonData: + manageAlerts: false maxLines: 1000 From 77ab895d8296817c2d8907a03657b4883303db76 Mon Sep 17 00:00:00 2001 From: Vladislav Antonov Date: Sat, 14 Mar 2026 21:01:02 +0200 Subject: [PATCH 4/6] Set the p95 values to be ms --- .../dashboards/commerce-observability-p3.json | 16 ++++++++++++++++ .../dashboards/commerce-observability.json | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/observability/grafana/dashboards/commerce-observability-p3.json b/observability/grafana/dashboards/commerce-observability-p3.json index 0ec3ea6..46f7379 100644 --- a/observability/grafana/dashboards/commerce-observability-p3.json +++ b/observability/grafana/dashboards/commerce-observability-p3.json @@ -14,6 
+14,10 @@ "title": "P95 Response Time (ms)", "gridPos": {"h": 8, "w": 8, "x": 0, "y": 0}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -28,6 +32,10 @@ "title": "P95 Processing Time (ms)", "gridPos": {"h": 8, "w": 8, "x": 8, "y": 0}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -42,6 +50,10 @@ "title": "P95 Queue Wait Time (ms)", "gridPos": {"h": 8, "w": 8, "x": 16, "y": 0}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -70,6 +82,10 @@ "title": "DB Query Duration by Operation (ms)", "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", diff --git a/observability/grafana/dashboards/commerce-observability.json b/observability/grafana/dashboards/commerce-observability.json index 7f967ee..2e1cde3 100644 --- a/observability/grafana/dashboards/commerce-observability.json +++ b/observability/grafana/dashboards/commerce-observability.json @@ -70,6 +70,10 @@ "title": "P95 Request Latency (ms)", "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -84,6 +88,10 @@ "title": "P95 DB Query Duration (ms)", "gridPos": {"h": 8, "w": 8, "x": 16, "y": 8}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", From 34f1be81ba0c30f1b318293b6efca19652c42a2e Mon Sep 17 00:00:00 2001 From: Vladislav Antonov Date: Sat, 14 Mar 2026 21:16:14 
+0200 Subject: [PATCH 5/6] Fix histogram buckets --- app/observability/setup.py | 59 +++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/app/observability/setup.py b/app/observability/setup.py index ff5c97a..7bd7660 100644 --- a/app/observability/setup.py +++ b/app/observability/setup.py @@ -8,6 +8,7 @@ from opentelemetry.exporter.prometheus import PrometheusMetricReader from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.view import ExplicitBucketHistogramAggregation, View from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor @@ -22,6 +23,58 @@ _METRICS_PROVIDER_CONFIGURED = False _TRACE_PROVIDER_CONFIGURED = False +# Focus histogram precision on sub-second latencies while keeping larger buckets +# for slow-path visibility. +_DURATION_BUCKET_BOUNDARIES_SECONDS: tuple[float, ...] 
= ( + 0.005, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.5, + 0.75, + 1.0, + 1.5, + 2.0, + 3.0, + 5.0, + 7.5, + 10.0, +) + + +def _build_metric_views() -> list[View]: + """Return metric views with explicit latency buckets for key histograms.""" + explicit_duration_aggregation = ExplicitBucketHistogramAggregation( + boundaries=_DURATION_BUCKET_BOUNDARIES_SECONDS + ) + return [ + View( + instrument_name="commerce_http_request_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_http_response_time_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_http_processing_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_http_queue_wait_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_db_query_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + ] + def _build_resource(settings: Settings) -> Resource: """Build OpenTelemetry resource attributes from runtime settings.""" @@ -49,7 +102,11 @@ def _configure_metrics_provider(settings: Settings) -> None: resource = _build_resource(settings) metric_reader = PrometheusMetricReader() - meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + meter_provider = MeterProvider( + resource=resource, + metric_readers=[metric_reader], + views=_build_metric_views(), + ) metrics.set_meter_provider(meter_provider) _METRICS_PROVIDER_CONFIGURED = True From 0dbfc5e8343aef95cdb737dce548a298f790da60 Mon Sep 17 00:00:00 2001 From: Vladislav Antonov Date: Sat, 14 Mar 2026 21:38:05 +0200 Subject: [PATCH 6/6] Fix the buckets of histogram diagrams --- .../dashboards/commerce-observability-p3.json | 18 +++++++++++++++--- .../dashboards/commerce-observability.json | 12 ++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git 
a/observability/grafana/dashboards/commerce-observability-p3.json b/observability/grafana/dashboards/commerce-observability-p3.json index 46f7379..b932427 100644 --- a/observability/grafana/dashboards/commerce-observability-p3.json +++ b/observability/grafana/dashboards/commerce-observability-p3.json @@ -158,9 +158,13 @@ { "id": 9, "type": "heatmap", - "title": "Histogram - Response Time Buckets (s)", + "title": "Histogram - Response Time Buckets (ms)", "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -172,9 +176,13 @@ { "id": 10, "type": "heatmap", - "title": "Histogram - Processing Time Buckets (s)", + "title": "Histogram - Processing Time Buckets (ms)", "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -186,9 +194,13 @@ { "id": 11, "type": "heatmap", - "title": "Histogram - Queue Wait Time Buckets (s)", + "title": "Histogram - Queue Wait Time Buckets (ms)", "gridPos": {"h": 8, "w": 12, "x": 0, "y": 32}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", diff --git a/observability/grafana/dashboards/commerce-observability.json b/observability/grafana/dashboards/commerce-observability.json index 2e1cde3..c32ced8 100644 --- a/observability/grafana/dashboards/commerce-observability.json +++ b/observability/grafana/dashboards/commerce-observability.json @@ -134,9 +134,13 @@ { "id": 9, "type": "heatmap", - "title": "Histogram - Request Latency Buckets (s)", + "title": "Histogram - Request Latency Buckets (ms)", "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + 
"defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -148,9 +152,13 @@ { "id": 10, "type": "heatmap", - "title": "Histogram - DB Query Duration Buckets (s)", + "title": "Histogram - DB Query Duration Buckets (ms)", "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A",