Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Guide for AI coding agents working on the Commerce System Demo project.

Commerce System Demo is a FastAPI-based commerce service that provides RESTful APIs for managing products, categories, and implementing search functionality. The project includes built-in observability with OpenTelemetry metrics, logging, and distributed tracing.

**Current Version**: 0.1.2 (following [Semantic Versioning](https://semver.org/))
**Current Version**: 0.1.3 (following [Semantic Versioning](https://semver.org/))

## Setup Commands

Expand Down Expand Up @@ -101,7 +101,7 @@ app/

### Docker

- **Build image**: `docker build -t commerce-system-demo:0.1.2 .`
- **Build image**: `docker build -t commerce-system-demo:0.1.3 .`
- **View Dockerfile**: Includes Python dependencies, migration scripts, and app code
- **Build context**: Includes `scripts/`, `app/`, and `observability/` directories

Expand Down Expand Up @@ -184,7 +184,7 @@ This project follows [Semantic Versioning 2.0.0](https://semver.org/):
- **MINOR**: Backward-compatible new features
- **PATCH**: Backward-compatible bug fixes

Current version is **0.1.2** (initial development). Version is defined in:
Current version is **0.1.3** (initial development). Version is defined in:
- `pyproject.toml` (project metadata)
- `app/main.py` (FastAPI version)
- `app/observability/metrics.py` (meter version)
Expand Down
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Advanced filtering options for product search
- Product images storage optimization

## [0.1.3] - 2026-03-20

### Added

- Health check endpoint now verifies database connectivity via `SELECT 1` probe
- Retry policy for the health check database probe, with a configurable number of attempts and a per-attempt timeout
- New settings `health_check_db_retries` (default 3) and `health_check_db_timeout` (default 2.0s)
- OpenTelemetry metrics for health checks: `commerce_health_check_total` counter and `commerce_health_check_duration_seconds` histogram
- Integration tests for health check success, DB failure with retries, recovery on retry, and metrics recording

## [0.1.2] - 2026-03-20

### Added
Expand Down
2 changes: 2 additions & 0 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class Settings(BaseSettings):
otel_metrics_path: str = "/metrics"
otel_trace_excluded_urls: str = ""
log_level: str = "INFO"
health_check_db_retries: int = 3
health_check_db_timeout: float = 2.0


@lru_cache
Expand Down
49 changes: 46 additions & 3 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from jinja2 import TemplateNotFound

Expand Down Expand Up @@ -77,7 +77,7 @@ def create_app() -> FastAPI:

app = FastAPI(
title="Commerce System Demo",
version="0.1.2",
version="0.1.3",
lifespan=lifespan,
)
app.router.route_class = ObservabilityRoute
Expand Down Expand Up @@ -116,7 +116,50 @@ async def home(request: Request):

@app.get("/health", tags=["health"])
async def health() -> dict[str, str]:
    """Report service health after probing the database with ``SELECT 1``.

    The probe is attempted up to ``health_check_db_retries`` times (always at
    least once). Each attempt is bounded by ``health_check_db_timeout``
    seconds, and a short, linearly growing pause separates attempts. Every
    request increments ``health_check_total`` and records
    ``health_check_duration_seconds``, both labelled with the final status.

    Returns:
        ``{"status": "ok", "database": "available"}`` as soon as one probe
        succeeds. When every attempt fails, a ``JSONResponse`` with status
        code 503 and body ``{"status": "error", "database": "unavailable"}``
        is returned directly instead of a plain dict.
    """
    import asyncio
    import time

    from sqlalchemy import text

    from app.db.session import get_engine
    from app.observability.metrics import health_check_duration_seconds, health_check_total

    settings = get_settings()
    # A configured value of 0 or less must not skip the probe altogether.
    retries = max(1, settings.health_check_db_retries)
    timeout = settings.health_check_db_timeout
    engine = get_engine()
    start = time.monotonic()
    last_error: Exception | None = None

    for attempt in range(1, retries + 1):
        try:
            async with asyncio.timeout(timeout):
                async with engine.connect() as conn:
                    await conn.execute(text("SELECT 1"))
        except Exception as exc:
            last_error = exc
            logger.warning(
                "health_check_db_attempt_failed",
                extra={
                    "attempt": attempt,
                    "max_retries": retries,
                    # str() of a bare TimeoutError is empty; keep the log useful.
                    "error": str(exc) or type(exc).__name__,
                },
            )
            if attempt < retries:
                # Linear back-off: 0.1s, 0.2s, ... between attempts.
                await asyncio.sleep(0.1 * attempt)
        else:
            # Success path lives outside the try so that a failure while
            # recording metrics is never mistaken for a database failure.
            duration = time.monotonic() - start
            health_check_total.add(1, {"status": "ok"})
            health_check_duration_seconds.record(duration, {"status": "ok"})
            return {"status": "ok", "database": "available"}

    # Every attempt failed; the duration covers all attempts and pauses.
    duration = time.monotonic() - start
    health_check_total.add(1, {"status": "error"})
    health_check_duration_seconds.record(duration, {"status": "error"})
    logger.error(
        "health_check_database_failure",
        extra={
            "retries_exhausted": retries,
            "error": str(last_error) or type(last_error).__name__,
        },
    )
    return JSONResponse(
        status_code=503,
        content={"status": "error", "database": "unavailable"},
    )

return app

Expand Down
14 changes: 13 additions & 1 deletion app/observability/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from opentelemetry import metrics
from opentelemetry.metrics import Counter, Histogram, UpDownCounter

_meter = metrics.get_meter("commerce-system-demo-observability", version="0.1.2")
_meter = metrics.get_meter("commerce-system-demo-observability", version="0.1.3")

http_request_duration_seconds: Histogram = _meter.create_histogram(
name="commerce_http_request_duration_seconds",
Expand Down Expand Up @@ -100,3 +100,15 @@
unit="1",
description="Total number of category validation failures",
)

# Counter incremented once per /health request; the handler attaches a
# "status" attribute of "ok" or "error".
health_check_total: Counter = _meter.create_counter(
name="commerce_health_check_total",
unit="1",
description="Total number of health check requests",
)

# Histogram of the seconds spent handling a /health request, recorded with the
# same "status" attribute. The handler measures from before the first database
# probe attempt until the final outcome, so retries and the pauses between
# them are included.
health_check_duration_seconds: Histogram = _meter.create_histogram(
name="commerce_health_check_duration_seconds",
unit="s",
description="Duration of health check requests including database probe",
)
2 changes: 1 addition & 1 deletion app/observability/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _build_resource(settings: Settings) -> Resource:
"""Build OpenTelemetry resource attributes from runtime settings."""
attributes = {
"service.name": settings.otel_service_name,
"service.version": "0.1.2",
"service.version": "0.1.3",
"deployment.environment": settings.otel_environment,
}

Expand Down
9 changes: 8 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ services:
OTEL_TRACE_EXCLUDED_URLS: /metrics,/health
ports:
- "8000:8000"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
depends_on:
db:
condition: service_healthy
Expand Down Expand Up @@ -112,7 +118,8 @@ services:
- ./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./observability/prometheus-alerts:/etc/prometheus/alerts:ro
depends_on:
- app
app:
condition: service_healthy
ports:
- "9090:9090"

Expand Down
50 changes: 50 additions & 0 deletions observability/grafana/dashboards/commerce-observability.json
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,56 @@
"legendFormat": "{{le}}"
}
]
},
{
"id": 11,
"type": "timeseries",
"title": "Health Check Rate by Status (req/s)",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 32},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_health_check_total[5m])) by (status)",
"legendFormat": "{{status}}"
}
]
},
{
"id": 12,
"type": "timeseries",
"title": "Health Check Failure Ratio (%)",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 32},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "percentunit", "min": 0, "max": 1},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(commerce_health_check_total{status=\"error\"}[5m])) / sum(rate(commerce_health_check_total[5m]))",
"legendFormat": "failure ratio"
}
]
},
{
"id": 13,
"type": "timeseries",
"title": "P95 Health Check Duration (ms)",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 32},
"datasource": {"type": "prometheus", "uid": "prometheus"},
"fieldConfig": {
"defaults": {"unit": "ms"},
"overrides": []
},
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.95, sum(rate(commerce_health_check_duration_seconds_bucket[5m])) by (le)) * 1000",
"legendFormat": "p95"
}
]
}
]
}
15 changes: 15 additions & 0 deletions observability/prometheus-alerts/commerce-alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,18 @@ groups:
annotations:
summary: "Database pool pressure"
description: "DB pool in-use connections are above 12 for at least 10 minutes."

- alert: CommerceHealthCheckFailures
expr: |
(
sum(rate(commerce_health_check_total{status="error"}[5m]))
/
clamp_min(sum(rate(commerce_health_check_total[5m])), 0.001)
) > 0.5
for: 5m
labels:
severity: critical
priority: p1
annotations:
summary: "Health check database failures"
description: "More than 50% of health checks are failing for at least 5 minutes."
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "commerce-system-demo"
version = "0.1.2"
version = "0.1.3"
description = "FastAPI commerce service demo"
readme = "README.md"
requires-python = ">=3.11"
Expand Down
Loading
Loading