import socket
from datetime import datetime, timezone
import logging
import json
import time
import uuid
from contextlib import asynccontextmanager

HOST = os.getenv('HOST', '0.0.0.0')
PORT = int(os.getenv('PORT', 5000))
DEBUG = os.getenv('DEBUG', 'False').lower() == 'true'


class JsonFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    The object always carries ``timestamp``, ``level``, ``message``, ``app``
    and ``logger``. If the record has an ``extra_info`` dict (attached via
    ``logger.info(..., extra={"extra_info": {...}})``), its keys are merged
    into the top level of the object and may override the base keys.
    """

    def format(self, record):
        """Return the JSON string for *record*."""
        log_record = {
            # Use the time the event was created, not the time it happens to
            # be formatted, so buffered handlers still report correct times.
            "timestamp": datetime.fromtimestamp(
                record.created, tz=timezone.utc
            ).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "app": "devops-python",
            "logger": record.name,
        }

        extra_info = getattr(record, "extra_info", None)
        if isinstance(extra_info, dict):
            log_record.update(extra_info)

        if record.exc_info:
            log_record["exception"] = self.formatException(record.exc_info)

        # default=str is deliberate: a log call must never raise just because
        # a caller attached a value json cannot encode natively.
        return json.dumps(log_record, default=str)


logger = logging.getLogger("app")
# Honour the DEBUG environment flag so logger.debug() calls are not dead code.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logger.addHandler(handler)
# Keep records out of the root logger so lines are not emitted twice.
logger.propagate = False


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Log one structured record at application startup and one at shutdown.

    Code before ``yield`` runs before the app starts serving requests; code
    after it runs when the app is shutting down.
    """
    startup_config = {
        "version": "1.0.0",
        "mode": "development" if DEBUG else "production",
        "log_level": logging.getLevelName(logger.level),
    }
    logger.info("Application starting up", extra={
        "extra_info": {"config": startup_config}})

    yield

    logger.info("Application shutting down")


# The lifespan handler must be passed to the constructor; merely defining it
# has no effect.
# NOTE(review): the `lifespan` parameter needs a FastAPI version that supports
# it, and any @app.on_event("startup"/"shutdown") handlers elsewhere in this
# file would stop firing once it is set -- confirm none exist.
app = FastAPI(lifespan=lifespan)
start_time = datetime.now()


def _elapsed_ms(started):
    """Return whole milliseconds elapsed since *started* (a perf_counter value)."""
    return int((time.perf_counter() - started) * 1000)


@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Emit one structured log line per HTTP request.

    Each line carries a fresh ``request_id``, the client IP, method, path,
    status code and duration in milliseconds. If the downstream handler
    raises, the failure is logged with status 500 and the error text, and the
    original exception is re-raised unchanged.
    """
    request_id = str(uuid.uuid4())
    # Monotonic clock: immune to wall-clock adjustments. Named differently
    # from the module-level `start_time` to avoid shadowing it.
    started = time.perf_counter()

    log_context = {
        "request_id": request_id,
        "client_ip": request.client.host if request.client else "unknown",
        "method": request.method,
        "path": request.url.path,
    }

    try:
        response = await call_next(request)
    except Exception as exc:
        log_context.update({
            "status_code": 500,
            "duration_ms": _elapsed_ms(started),
            "error": str(exc),
        })
        logger.error("Request failed: %s", exc,
                     extra={"extra_info": log_context}, exc_info=True)
        # Bare raise preserves the original traceback.
        raise

    log_context.update({
        "status_code": response.status_code,
        "duration_ms": _elapsed_ms(started),
    })
    logger.info("Request handled: %s %s", request.method, request.url.path,
                extra={"extra_info": log_context})
    return response
grafana/promtail:3.0.0 + container_name: promtail + ports: + - 9080:9080 + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + command: -config.file=/etc/promtail/config.yml + networks: + - logging + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + + grafana: + image: grafana/grafana:12.3.1 + container_name: grafana + ports: + - "3000:3000" + environment: + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD} + - GF_SECURITY_ALLOW_EMBEDDING=true + volumes: + - grafana-data:/var/lib/grafana + networks: + - logging + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"] + interval: 30s + timeout: 10s + retries: 3 + +volumes: + loki-data: + grafana-data: + +networks: + logging: \ No newline at end of file diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..56d3cfd298 --- /dev/null +++ b/monitoring/docs/LAB07.md @@ -0,0 +1,131 @@ +# Lab 07 + +## 1. Architecture + +The monitoring stack follows a "Push" architecture where logs are collected at the source and shipped to a central aggregator. + +* **FastAPI App**: Generates structured JSON logs. +* **Promtail**: Scrapes logs from the Docker socket, filters by labels, and pushes to Loki. +* **Loki 3.0**: Stores log chunks and indexes metadata using the TSDB engine. +* **Grafana**: Visualizes logs and converts log streams into metrics via LogQL. + +## 2. Setup Guide + +1. **Environment**: Create a `.env` file with `GRAFANA_ADMIN_PASSWORD`. +2. **Deployment**: + +```bash +cd monitoring +docker compose up -d + +``` + +**Verification**: Access Grafana at `http://localhost:3000` and login with the credentials from your `.env`. + +## 3. 
Configuration + +### Loki 3.0 (TSDB & Retention) + +I utilized the new `common` block and `tsdb` shipping to optimize storage for Loki 3.0. + +```yaml +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + +``` + +* **Why**: TSDB is significantly faster than the older Boltdb-shipper and is the recommended engine for version 3.0. + +### Promtail (Filtering) + +Promtail is configured to only collect logs from containers with specific Docker labels. + +```yaml +- source_labels: ['__meta_docker_container_label_logging'] + regex: 'promtail' + action: keep + +``` + +* **Why**: This prevents "log spam" from system containers and ensures I only monitor what I explicitly label. + +## 4. Application Logging + +I implemented structured logging using a custom `JsonFormatter` and FastAPI **Lifespan** events. + +**Implementation Snippet:** + +```python +class JsonFormatter(logging.Formatter): + def format(self, record): + log_record = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "level": record.levelname, + "message": record.getMessage(), + "app": "devops-python", + "logger": record.name + } + if hasattr(record, "extra_info"): + log_record.update(record.extra_info) + return json.dumps(log_record) + +handler = logging.StreamHandler() +handler.setFormatter(JsonFormatter()) +logger.addHandler(handler) + +``` + +By outputting JSON directly to `stdout`, Promtail captures the entire object, allowing us to use the `| json` parser in LogQL. + +## 5. Dashboard & LogQL + +| Panel | Query | Explanation | +| --- | --- | --- | +| Logs Table | `{app=~"devops-.*"}` | Shows the raw log stream for all related apps. | +| **Request Rate** | `sum by (app) (rate({app=~"devops-.*"} [1m]))` | Converts log lines into a "Requests Per Second" metric. | +| **Error Logs** | `{app=~"devops-.*"} \| json \| level="ERROR"` | Filters JSON objects where the level key is specifically ERROR. 
| +| **Status Codes** | `sum by (status_code) (count_over_time({app=~"devops-.*"} \| json [5m]))` | A pie chart showing the distribution of HTTP response codes. | + +![alt text]() + +## 6. Production Configuration + +* **Security**: Anonymous access is disabled (`GF_AUTH_ANONYMOUS_ENABLED=false`). +* **Resources**: +* Loki: 1.0 CPU, 512MB RAM. +* Grafana: 0.5 CPU, 512MB RAM. +* App/Promtail: 0.5 CPU, 256MB RAM each. + +* **Health Checks**: Defined in `docker-compose.yml` using `/health` (App), `/ready` (Loki) and `/api/health` (Grafana). Promtail has no health check. + +## 7. Testing + +| Component | Command | Expected Result | +| --- | --- | --- | +| **Stack Status** | `docker compose ps` | App, Loki and Grafana report `(healthy)`; Promtail (no health check) is simply running | +| **Loki API** | `curl http://localhost:3100/ready` | `ready` | +| **Promtail UI** | `curl http://localhost:9080/targets` | List of active containers | + +## 8. Evidence + +### Task 1 + +![alt text](image.png) + +### Task 2 + +![alt text]() + +![alt text]() + +### Task 3 + +![alt text]() + +### Task 4 + +![alt text](image-1.png) +![alt text](image-2.png) diff --git a/monitoring/docs/image copy 2.png b/monitoring/docs/image copy 2.png new file mode 100644 index 0000000000..9eba95f270 Binary files /dev/null and b/monitoring/docs/image copy 2.png differ diff --git a/monitoring/docs/image copy 3.png b/monitoring/docs/image copy 3.png new file mode 100644 index 0000000000..79cc4230a4 Binary files /dev/null and b/monitoring/docs/image copy 3.png differ diff --git a/monitoring/docs/image copy.png b/monitoring/docs/image copy.png new file mode 100644 index 0000000000..acd608f879 Binary files /dev/null and b/monitoring/docs/image copy.png differ diff --git a/monitoring/docs/image-1.png b/monitoring/docs/image-1.png new file mode 100644 index 0000000000..acf6dbf5b7 Binary files /dev/null and b/monitoring/docs/image-1.png differ diff --git a/monitoring/docs/image-2.png b/monitoring/docs/image-2.png new file mode 100644 index 0000000000..c7dc5115ed Binary files /dev/null and b/monitoring/docs/image-2.png differ diff --git 
# ---------------------------------------------------------------------------
# monitoring/loki/config.yml
#
# Single-node Loki using TSDB index + filesystem chunk storage.
# All on-disk state lives under /loki because docker-compose.yml mounts the
# named volume `loki-data` at /loki; writing under /tmp/loki would put chunks
# and index in the container's writable layer and lose them whenever the
# container is recreated.
# ---------------------------------------------------------------------------
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  # 168h = 7 days; enforced by the compactor below.
  retention_period: 168h

compactor:
  working_directory: /loki/retention
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
  delete_request_store: filesystem
---
# ---------------------------------------------------------------------------
# monitoring/promtail/config.yml
#
# Discovers containers through the Docker socket and pushes their logs to
# Loki. Only containers labelled `logging: "promtail"` are kept.
# ---------------------------------------------------------------------------
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  # NOTE(review): /tmp is not backed by a volume in docker-compose.yml, so
  # read offsets are lost when the promtail container is recreated.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Drop every container that does not carry the opt-in label
      # `logging: "promtail"`; without this rule all containers on the host
      # are scraped.
      - source_labels: ['__meta_docker_container_label_logging']
        regex: 'promtail'
        action: keep
      # Docker reports names as "/name"; strip the leading slash.
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      - source_labels: ['__meta_docker_container_image']
        target_label: 'image'
      - source_labels: ['__meta_docker_container_label_logging']
        target_label: 'logging'
      - source_labels: ['__meta_docker_container_label_app']
        target_label: 'app'
      - replacement: 'docker'
        target_label: 'job'