Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ ENV PATH="/opt/venv/bin:$PATH"

# Copy application source, scripts, and packaging metadata
COPY app/ ./app/
COPY scripts/ ./scripts/
COPY pyproject.toml README.md ./

# Create an editable install so all sub-packages (app.api, app.core, …) are
Expand Down
59 changes: 58 additions & 1 deletion app/observability/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.view import ExplicitBucketHistogramAggregation, View
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
Expand All @@ -22,6 +23,58 @@
_METRICS_PROVIDER_CONFIGURED = False
_TRACE_PROVIDER_CONFIGURED = False

# Focus histogram precision on sub-second latencies while keeping larger buckets
# for slow-path visibility.
_DURATION_BUCKET_BOUNDARIES_SECONDS: tuple[float, ...] = (
0.005,
0.01,
0.025,
0.05,
0.075,
0.1,
0.15,
0.2,
0.3,
0.5,
0.75,
1.0,
1.5,
2.0,
3.0,
5.0,
7.5,
10.0,
)


def _build_metric_views() -> list[View]:
    """Build the views that pin explicit bucket boundaries on duration histograms.

    Every instrument named below receives the same explicit-bucket
    aggregation, configured from ``_DURATION_BUCKET_BOUNDARIES_SECONDS``.

    Returns:
        One ``View`` per duration instrument, in the order listed.
    """
    duration_instruments = (
        "commerce_http_request_duration_seconds",
        "commerce_http_response_time_seconds",
        "commerce_http_processing_duration_seconds",
        "commerce_http_queue_wait_duration_seconds",
        "commerce_db_query_duration_seconds",
    )
    # A single aggregation object is shared by all views.
    shared_aggregation = ExplicitBucketHistogramAggregation(
        boundaries=_DURATION_BUCKET_BOUNDARIES_SECONDS
    )
    return [
        View(instrument_name=name, aggregation=shared_aggregation)
        for name in duration_instruments
    ]


def _build_resource(settings: Settings) -> Resource:
"""Build OpenTelemetry resource attributes from runtime settings."""
Expand Down Expand Up @@ -49,7 +102,11 @@ def _configure_metrics_provider(settings: Settings) -> None:

resource = _build_resource(settings)
metric_reader = PrometheusMetricReader()
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
meter_provider = MeterProvider(
resource=resource,
metric_readers=[metric_reader],
views=_build_metric_views(),
)
metrics.set_meter_provider(meter_provider)
_METRICS_PROVIDER_CONFIGURED = True

Expand Down
14 changes: 14 additions & 0 deletions observability/grafana/dashboards/commerce-observability-p2.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,20 @@
"legendFormat": "{{reason}}"
}
]
},
{
"id": 8,
"type": "heatmap",
"title": "Histogram - Search Result Count Buckets",
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_search_result_count_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
}
]
}
84 changes: 84 additions & 0 deletions observability/grafana/dashboards/commerce-observability-p3.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
"title": "P95 Response Time (ms)",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
Expand All @@ -28,6 +32,10 @@
"title": "P95 Processing Time (ms)",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
Expand All @@ -42,6 +50,10 @@
"title": "P95 Queue Wait Time (ms)",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
Expand Down Expand Up @@ -70,6 +82,10 @@
"title": "DB Query Duration by Operation (ms)",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 8},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
Expand Down Expand Up @@ -138,6 +154,74 @@
"queryType": "range"
}
]
},
{
"id": 9,
"type": "heatmap",
"title": "Histogram - Response Time Buckets (ms)",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_http_response_time_seconds_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
},
{
"id": 10,
"type": "heatmap",
"title": "Histogram - Processing Time Buckets (ms)",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_http_processing_duration_seconds_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
},
{
"id": 11,
"type": "heatmap",
"title": "Histogram - Queue Wait Time Buckets (ms)",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 32},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_http_queue_wait_duration_seconds_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
},
{
"id": 12,
"type": "heatmap",
"title": "Histogram - Response Payload Size Buckets (bytes)",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 32},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_http_response_payload_size_bytes_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
}
]
}
44 changes: 44 additions & 0 deletions observability/grafana/dashboards/commerce-observability.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@
"title": "P95 Request Latency (ms)",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 8},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
Expand All @@ -84,6 +88,10 @@
"title": "P95 DB Query Duration (ms)",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 8},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
Expand Down Expand Up @@ -122,6 +130,42 @@
"filters": []
}
]
},
{
"id": 9,
"type": "heatmap",
"title": "Histogram - Request Latency Buckets (ms)",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_http_request_duration_seconds_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
},
{
"id": 10,
"type": "heatmap",
"title": "Histogram - DB Query Duration Buckets (ms)",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_db_query_duration_seconds_bucket[5m])) by (le)",
"legendFormat": "{{le}}"
}
]
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ datasources:
url: http://loki:3100
editable: true
jsonData:
manageAlerts: false
maxLines: 1000
Loading