diff --git a/AGENTS.md b/AGENTS.md index 215ad3b..e793726 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,7 +6,7 @@ Guide for AI coding agents working on the Commerce System Demo project. Commerce System Demo is a FastAPI-based commerce service that provides RESTful APIs for managing products, categories, and implementing search functionality. The project includes built-in observability with OpenTelemetry metrics, logging, and distributed tracing. -**Current Version**: 0.1.2 (following [Semantic Versioning](https://semver.org/)) +**Current Version**: 0.1.3 (following [Semantic Versioning](https://semver.org/)) ## Setup Commands @@ -101,7 +101,7 @@ app/ ### Docker -- **Build image**: `docker build -t commerce-system-demo:0.1.2 .` +- **Build image**: `docker build -t commerce-system-demo:0.1.3 .` - **View Dockerfile**: Includes Python dependencies, migration scripts, and app code - **Build context**: Includes `scripts/`, `app/`, and `observability/` directories @@ -184,7 +184,7 @@ This project follows [Semantic Versioning 2.0.0](https://semver.org/): - **MINOR**: Backward-compatible new features - **PATCH**: Backward-compatible bug fixes -Current version is **0.1.2** (initial development). Version is defined in: +Current version is **0.1.3** (initial development). Version is defined in: - `pyproject.toml` (project metadata) - `app/main.py` (FastAPI version) - `app/observability/metrics.py` (meter version) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e916ac..3e52447 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Advanced filtering options for product search - Product images storage optimization +## [0.1.3] - 2026-03-20 + +### Added + +- Health check endpoint now verifies database connectivity via `SELECT 1` probe +- Retry fallback policy for health check database connection with configurable retries and timeout +- New settings `health_check_db_retries` (default 3) and `health_check_db_timeout` (default 2.0s) +- OpenTelemetry metrics for health checks: `commerce_health_check_total` counter and `commerce_health_check_duration_seconds` histogram +- Integration tests for health check success, DB failure with retries, recovery on retry, and metrics recording + ## [0.1.2] - 2026-03-20 ### Added diff --git a/app/core/config.py b/app/core/config.py index 5fbcc82..020ac9a 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -39,6 +39,8 @@ class Settings(BaseSettings): otel_metrics_path: str = "/metrics" otel_trace_excluded_urls: str = "" log_level: str = "INFO" + health_check_db_retries: int = 3 + health_check_db_timeout: float = 2.0 @lru_cache diff --git a/app/main.py b/app/main.py index 05d3dd5..79b3fab 100644 --- a/app/main.py +++ b/app/main.py @@ -6,7 +6,7 @@ from pathlib import Path from fastapi import FastAPI, Request -from fastapi.responses import HTMLResponse +from fastapi.responses import HTMLResponse, JSONResponse from fastapi.templating import Jinja2Templates from jinja2 import TemplateNotFound @@ -77,7 +77,7 @@ def create_app() -> FastAPI: app = FastAPI( title="Commerce System Demo", - version="0.1.2", + version="0.1.3", lifespan=lifespan, ) app.router.route_class = ObservabilityRoute @@ -116,7 +116,50 @@ async def home(request: Request): @app.get("/health", tags=["health"]) async def health() -> dict[str, str]: - return {"status": "ok"} + import asyncio + import time + + from sqlalchemy import text + + from app.db.session import get_engine + from app.observability.metrics import health_check_duration_seconds, health_check_total + + settings = get_settings() + retries = settings.health_check_db_retries + timeout = settings.health_check_db_timeout + engine = get_engine() + start = time.monotonic() + last_error: Exception | None = None + + for attempt in range(1, retries + 1): + try: + async with asyncio.timeout(timeout): + async with engine.connect() as conn: + await conn.execute(text("SELECT 1")) + duration = time.monotonic() - start + health_check_total.add(1, {"status": "ok"}) + health_check_duration_seconds.record(duration, {"status": "ok"}) + return {"status": "ok", "database": "available"} + except Exception as exc: + last_error = exc + logger.warning( + "health_check_db_attempt_failed", + extra={"attempt": attempt, "max_retries": retries, "error": str(exc)}, + ) + if attempt < retries: + await asyncio.sleep(0.1 * attempt) + + duration = time.monotonic() - start + health_check_total.add(1, {"status": "error"}) + health_check_duration_seconds.record(duration, {"status": "error"}) + logger.error( + "health_check_database_failure", + extra={"retries_exhausted": retries, "error": str(last_error)}, + ) + return JSONResponse( + status_code=503, + content={"status": "error", "database": "unavailable"}, + ) return app diff --git a/app/observability/metrics.py b/app/observability/metrics.py index 1b54f8f..815bed3 100644 --- a/app/observability/metrics.py +++ b/app/observability/metrics.py @@ -3,7 +3,7 @@ from opentelemetry import metrics from opentelemetry.metrics import Counter, Histogram, UpDownCounter -_meter = metrics.get_meter("commerce-system-demo-observability", version="0.1.2") +_meter = metrics.get_meter("commerce-system-demo-observability", version="0.1.3") http_request_duration_seconds: Histogram = _meter.create_histogram( name="commerce_http_request_duration_seconds", @@ -100,3 +100,15 @@ unit="1", description="Total number of category validation failures", ) + +health_check_total: Counter = _meter.create_counter( + name="commerce_health_check_total", + unit="1", + description="Total number of health check requests", +) + +health_check_duration_seconds: Histogram = _meter.create_histogram( + name="commerce_health_check_duration_seconds", + unit="s", + description="Duration of health check requests including database probe", +) diff --git a/app/observability/setup.py b/app/observability/setup.py index 47311e2..3b35f6c 100644 --- a/app/observability/setup.py +++ b/app/observability/setup.py @@ -76,7 +76,7 @@ def _build_resource(settings: Settings) -> Resource: """Build OpenTelemetry resource attributes from runtime settings.""" attributes = { "service.name": settings.otel_service_name, - "service.version": "0.1.2", + "service.version": "0.1.3", "deployment.environment": settings.otel_environment, } diff --git a/docker-compose.yml b/docker-compose.yml index 0323d5f..b5ab359 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -52,6 +52,12 @@ services: OTEL_TRACE_EXCLUDED_URLS: /metrics,/health ports: - "8000:8000" + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s depends_on: db: condition: service_healthy @@ -112,7 +118,8 @@ services: - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro - ./observability/prometheus-alerts:/etc/prometheus/alerts:ro depends_on: - - app + app: + condition: service_healthy ports: - "9090:9090" diff --git a/observability/grafana/dashboards/commerce-observability.json b/observability/grafana/dashboards/commerce-observability.json index c32ced8..cb11177 100644 --- a/observability/grafana/dashboards/commerce-observability.json +++ b/observability/grafana/dashboards/commerce-observability.json @@ -166,6 +166,56 @@ "legendFormat": "{{le}}" } ] + }, + { + "id": 11, + "type": "timeseries", + "title": "Health Check Rate by Status (req/s)", + "gridPos": {"h": 8, "w": 8, "x": 0, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_health_check_total[5m])) by (status)", + "legendFormat": "{{status}}" + } + ] + }, + { + "id": 12, + "type": "timeseries", + "title": "Health Check Failure Ratio (%)", + "gridPos": {"h": 8, "w": 8, "x": 8, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "percentunit", "min": 0, "max": 1}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(commerce_health_check_total{status=\"error\"}[5m])) / sum(rate(commerce_health_check_total[5m]))", + "legendFormat": "failure ratio" + } + ] + }, + { + "id": 13, + "type": "timeseries", + "title": "P95 Health Check Duration (ms)", + "gridPos": {"h": 8, "w": 8, "x": 16, "y": 32}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": {"unit": "ms"}, + "overrides": [] + }, + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.95, sum(rate(commerce_health_check_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "p95" + } + ] } ] } diff --git a/observability/prometheus-alerts/commerce-alerts.yml b/observability/prometheus-alerts/commerce-alerts.yml index fb3859d..2dc7dae 100644 --- a/observability/prometheus-alerts/commerce-alerts.yml +++ b/observability/prometheus-alerts/commerce-alerts.yml @@ -63,3 +63,18 @@ groups: annotations: summary: "Database pool pressure" description: "DB pool in-use connections are above 12 for at least 10 minutes." + + - alert: CommerceHealthCheckFailures + expr: | + ( + sum(rate(commerce_health_check_total{status="error"}[5m])) + / + clamp_min(sum(rate(commerce_health_check_total[5m])), 0.001) + ) > 0.5 + for: 5m + labels: + severity: critical + priority: p1 + annotations: + summary: "Health check database failures" + description: "More than 50% of health checks are failing for at least 5 minutes." diff --git a/pyproject.toml b/pyproject.toml index 09d2c4d..3ad2108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "commerce-system-demo" -version = "0.1.2" +version = "0.1.3" description = "FastAPI commerce service demo" readme = "README.md" requires-python = ">=3.11" diff --git a/tests/test_api.py b/tests/test_api.py index 743edf0..dd73b45 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -35,10 +35,148 @@ async def override_get_session(): @pytest.mark.asyncio async def test_health_endpoint(client: AsyncClient): - """Test the health check endpoint.""" + """Test the health check endpoint returns ok with database available.""" response = await client.get("/health") assert response.status_code == 200 - assert response.json() == {"status": "ok"} + data = response.json() + assert data["status"] == "ok" + assert data["database"] == "available" + + +@pytest.mark.asyncio +async def test_health_endpoint_database_unavailable(db_session: AsyncSession): + """Test the health check reports error after all retries are exhausted.""" + from unittest.mock import AsyncMock, patch + + app = create_app() + + async def override_get_session(): + yield db_session + + app.dependency_overrides[get_session] = override_get_session + + mock_engine = AsyncMock() + mock_engine.connect = AsyncMock(side_effect=Exception("connection refused")) + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as ac: + with patch("app.db.session.get_engine", return_value=mock_engine): + response = await ac.get("/health") + + app.dependency_overrides.clear() + + assert response.status_code == 503 + data = response.json() + assert data["status"] == "error" + assert data["database"] == "unavailable" + # Default retries is 3 — engine.connect should be called 3 times + assert mock_engine.connect.call_count == 3 + + +@pytest.mark.asyncio +async def test_health_endpoint_database_recovers_on_retry(db_session: AsyncSession): + """Test that health check succeeds when DB fails first then recovers.""" + from unittest.mock import AsyncMock, MagicMock, patch + + app = create_app() + + async def override_get_session(): + yield db_session + + app.dependency_overrides[get_session] = override_get_session + + # First call fails, second call succeeds + mock_conn = AsyncMock() + mock_conn.execute = AsyncMock(return_value=None) + mock_conn.__aenter__ = AsyncMock(return_value=mock_conn) + mock_conn.__aexit__ = AsyncMock(return_value=False) + + mock_engine = AsyncMock() + mock_engine.connect = MagicMock( + side_effect=[Exception("transient error"), MagicMock( + __aenter__=AsyncMock(return_value=mock_conn), + __aexit__=AsyncMock(return_value=False), + )] + ) + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as ac: + with patch("app.db.session.get_engine", return_value=mock_engine): + response = await ac.get("/health") + + app.dependency_overrides.clear() + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "ok" + assert data["database"] == "available" + assert mock_engine.connect.call_count == 2 + + +@pytest.mark.asyncio +async def test_health_endpoint_metrics_recorded_on_success(db_session: AsyncSession): + """Test that health check metrics are recorded on successful check.""" + from unittest.mock import MagicMock, patch + + app = create_app() + + async def override_get_session(): + yield db_session + + app.dependency_overrides[get_session] = override_get_session + + mock_counter = MagicMock() + mock_histogram = MagicMock() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as ac: + with patch("app.observability.metrics.health_check_total", mock_counter), \ + patch("app.observability.metrics.health_check_duration_seconds", mock_histogram): + response = await ac.get("/health") + + app.dependency_overrides.clear() + + assert response.status_code == 200 + assert response.json()["status"] == "ok" + mock_counter.add.assert_called_once_with(1, {"status": "ok"}) + mock_histogram.record.assert_called_once() + record_args = mock_histogram.record.call_args + assert record_args[0][1] == {"status": "ok"} + + +@pytest.mark.asyncio +async def test_health_endpoint_metrics_recorded_on_failure(db_session: AsyncSession): + """Test that health check metrics are recorded on DB failure.""" + from unittest.mock import AsyncMock, MagicMock, call, patch + + app = create_app() + + async def override_get_session(): + yield db_session + + app.dependency_overrides[get_session] = override_get_session + + mock_engine = AsyncMock() + mock_engine.connect = AsyncMock(side_effect=Exception("connection refused")) + + mock_counter = MagicMock() + mock_histogram = MagicMock() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as ac: + with patch("app.db.session.get_engine", return_value=mock_engine), \ + patch("app.observability.metrics.health_check_total", mock_counter), \ + patch("app.observability.metrics.health_check_duration_seconds", mock_histogram): + response = await ac.get("/health") + + app.dependency_overrides.clear() + + assert response.status_code == 503 + assert response.json()["status"] == "error" + mock_counter.add.assert_called_once_with(1, {"status": "error"}) + mock_histogram.record.assert_called_once() + record_args = mock_histogram.record.call_args + assert record_args[1] == {"status": "error"} or record_args[0][1] == {"status": "error"} @pytest.mark.asyncio