diff --git a/Dockerfile b/Dockerfile index 85ba20d..efa5072 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,6 +44,7 @@ ENV PATH="/opt/venv/bin:$PATH" # Copy application source and packaging metadata COPY app/ ./app/ +COPY scripts/ ./scripts/ COPY pyproject.toml README.md ./ # Create an editable install so all sub-packages (app.api, app.core, …) are diff --git a/app/observability/setup.py b/app/observability/setup.py index ff5c97a..7bd7660 100644 --- a/app/observability/setup.py +++ b/app/observability/setup.py @@ -8,6 +8,7 @@ from opentelemetry.exporter.prometheus import PrometheusMetricReader from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.view import ExplicitBucketHistogramAggregation, View from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor @@ -22,6 +23,58 @@ _METRICS_PROVIDER_CONFIGURED = False _TRACE_PROVIDER_CONFIGURED = False +# Focus histogram precision on sub-second latencies while keeping larger buckets +# for slow-path visibility. +_DURATION_BUCKET_BOUNDARIES_SECONDS: tuple[float, ...] = ( + 0.005, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.5, + 0.75, + 1.0, + 1.5, + 2.0, + 3.0, + 5.0, + 7.5, + 10.0, +) + + +def _build_metric_views() -> list[View]: + """Return metric views with explicit latency buckets for key histograms.""" + explicit_duration_aggregation = ExplicitBucketHistogramAggregation( + boundaries=_DURATION_BUCKET_BOUNDARIES_SECONDS + ) + return [ + View( + instrument_name="commerce_http_request_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_http_response_time_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_http_processing_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_http_queue_wait_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + View( + instrument_name="commerce_db_query_duration_seconds", + aggregation=explicit_duration_aggregation, + ), + ] + def _build_resource(settings: Settings) -> Resource: """Build OpenTelemetry resource attributes from runtime settings.""" @@ -49,7 +102,11 @@ def _configure_metrics_provider(settings: Settings) -> None: resource = _build_resource(settings) metric_reader = PrometheusMetricReader() - meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + meter_provider = MeterProvider( + resource=resource, + metric_readers=[metric_reader], + views=_build_metric_views(), + ) metrics.set_meter_provider(meter_provider) _METRICS_PROVIDER_CONFIGURED = True diff --git a/observability/grafana/dashboards/commerce-observability-p2.json b/observability/grafana/dashboards/commerce-observability-p2.json index 0d34856..a296c3c 100644 --- a/observability/grafana/dashboards/commerce-observability-p2.json +++ b/observability/grafana/dashboards/commerce-observability-p2.json @@ -105,6 +105,20 @@ "legendFormat": "{{reason}}" } ] + }, + { + "id": 8, + "type": "heatmap", + "title": "Histogram - Search Result Count Buckets", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_search_result_count_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] } ] } diff --git a/observability/grafana/dashboards/commerce-observability-p3.json b/observability/grafana/dashboards/commerce-observability-p3.json index 63c6c18..b932427 100644 --- a/observability/grafana/dashboards/commerce-observability-p3.json +++ b/observability/grafana/dashboards/commerce-observability-p3.json @@ -14,6 +14,10 @@ "title": "P95 Response Time (ms)", "gridPos": {"h": 8, "w": 8, "x": 0, "y": 0}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -28,6 +32,10 @@ "title": "P95 Processing Time (ms)", "gridPos": {"h": 8, "w": 8, "x": 8, "y": 0}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -42,6 +50,10 @@ "title": "P95 Queue Wait Time (ms)", "gridPos": {"h": 8, "w": 8, "x": 16, "y": 0}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -70,6 +82,10 @@ "title": "DB Query Duration by Operation (ms)", "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -138,6 +154,74 @@ "queryType": "range" } ] + }, + { + "id": 9, + "type": "heatmap", + "title": "Histogram - Response Time Buckets (ms)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_response_time_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 10, + "type": "heatmap", + "title": "Histogram - Processing Time Buckets (ms)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_processing_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 11, + "type": "heatmap", + "title": "Histogram - Queue Wait Time Buckets (ms)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_queue_wait_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 12, + "type": "heatmap", + "title": "Histogram - Response Payload Size Buckets (bytes)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_response_payload_size_bytes_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] } ] } diff --git a/observability/grafana/dashboards/commerce-observability.json b/observability/grafana/dashboards/commerce-observability.json index 7a4de44..c32ced8 100644 --- a/observability/grafana/dashboards/commerce-observability.json +++ b/observability/grafana/dashboards/commerce-observability.json @@ -70,6 +70,10 @@ "title": "P95 Request Latency (ms)", "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -84,6 +88,10 @@ "title": "P95 DB Query Duration (ms)", "gridPos": {"h": 8, "w": 8, "x": 16, "y": 8}, "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, "targets": [ { "refId": "A", @@ -122,6 +130,42 @@ "filters": [] } ] + }, + { + "id": 9, + "type": "heatmap", + "title": "Histogram - Request Latency Buckets (ms)", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_http_request_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] + }, + { + "id": 10, + "type": "heatmap", + "title": "Histogram - DB Query Duration Buckets (ms)", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_db_query_duration_seconds_bucket[5m])) by (le)", + "legendFormat": "{{le}}" + } + ] } ] } diff --git a/observability/grafana/provisioning/datasources/datasources.yml b/observability/grafana/provisioning/datasources/datasources.yml index b867d09..503d4e7 100644 --- a/observability/grafana/provisioning/datasources/datasources.yml +++ b/observability/grafana/provisioning/datasources/datasources.yml @@ -32,4 +32,5 @@ datasources: url: http://loki:3100 editable: true jsonData: + manageAlerts: false maxLines: 1000