diff --git a/backend/app/core/container.py b/backend/app/core/container.py index 20f6b101..44c281d4 100644 --- a/backend/app/core/container.py +++ b/backend/app/core/container.py @@ -1,5 +1,6 @@ from dishka import AsyncContainer, make_async_container from dishka.integrations.fastapi import FastapiProvider +from faststream.kafka import KafkaBroker from app.core.providers import ( AdminServicesProvider, @@ -8,8 +9,11 @@ CoordinatorProvider, CoreServicesProvider, DatabaseProvider, + DLQProvider, + DLQWorkerProvider, EventProvider, EventReplayProvider, + EventReplayWorkerProvider, K8sWorkerProvider, KafkaServicesProvider, KubernetesProvider, @@ -22,6 +26,7 @@ ResourceCleanerProvider, ResultProcessorProvider, SagaOrchestratorProvider, + SagaWorkerProvider, SettingsProvider, SSEProvider, UserServicesProvider, @@ -29,12 +34,13 @@ from app.settings import Settings -def create_app_container(settings: Settings) -> AsyncContainer: +def create_app_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: """ Create the application DI container. Args: settings: Application settings (injected via from_context). + broker: KafkaBroker instance (injected via from_context for MessagingProvider). """ return make_async_container( SettingsProvider(), @@ -45,6 +51,7 @@ def create_app_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQProvider(), EventProvider(), SagaOrchestratorProvider(), KafkaServicesProvider(), @@ -58,17 +65,12 @@ def create_app_container(settings: Settings) -> AsyncContainer: KubernetesProvider(), ResourceCleanerProvider(), FastapiProvider(), - context={Settings: settings}, + context={Settings: settings, KafkaBroker: broker}, ) -def create_result_processor_container(settings: Settings) -> AsyncContainer: - """ - Create a minimal DI container for the ResultProcessor worker. - - Args: - settings: Application settings (injected via from_context). 
- """ +def create_result_processor_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: + """Create a minimal DI container for the ResultProcessor worker.""" return make_async_container( SettingsProvider(), LoggingProvider(), @@ -79,12 +81,13 @@ def create_result_processor_container(settings: Settings) -> AsyncContainer: RepositoryProvider(), EventProvider(), MessagingProvider(), + DLQProvider(), ResultProcessorProvider(), - context={Settings: settings}, + context={Settings: settings, KafkaBroker: broker}, ) -def create_coordinator_container(settings: Settings) -> AsyncContainer: +def create_coordinator_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: """Create DI container for the ExecutionCoordinator worker.""" return make_async_container( SettingsProvider(), @@ -95,13 +98,14 @@ def create_coordinator_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQProvider(), EventProvider(), CoordinatorProvider(), - context={Settings: settings}, + context={Settings: settings, KafkaBroker: broker}, ) -def create_k8s_worker_container(settings: Settings) -> AsyncContainer: +def create_k8s_worker_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: """Create DI container for the KubernetesWorker.""" return make_async_container( SettingsProvider(), @@ -112,14 +116,15 @@ def create_k8s_worker_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQProvider(), EventProvider(), KubernetesProvider(), K8sWorkerProvider(), - context={Settings: settings}, + context={Settings: settings, KafkaBroker: broker}, ) -def create_pod_monitor_container(settings: Settings) -> AsyncContainer: +def create_pod_monitor_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: """Create DI container for the PodMonitor worker.""" return make_async_container( SettingsProvider(), @@ -130,16 +135,20 @@ def create_pod_monitor_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQProvider(), EventProvider(), KafkaServicesProvider(), KubernetesProvider(), PodMonitorProvider(), - context={Settings: settings}, + context={Settings: settings, KafkaBroker: broker}, ) -def create_saga_orchestrator_container(settings: Settings) -> AsyncContainer: - """Create DI container for the SagaOrchestrator worker.""" +def create_saga_orchestrator_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: + """Create DI container for the SagaOrchestrator worker. + + Uses SagaWorkerProvider which adds APScheduler-managed timeout checking. + """ return make_async_container( SettingsProvider(), LoggingProvider(), @@ -149,14 +158,18 @@ def create_saga_orchestrator_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQProvider(), EventProvider(), - SagaOrchestratorProvider(), - context={Settings: settings}, + SagaWorkerProvider(), + context={Settings: settings, KafkaBroker: broker}, ) -def create_event_replay_container(settings: Settings) -> AsyncContainer: - """Create DI container for the EventReplay worker.""" +def create_event_replay_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: + """Create DI container for the EventReplay worker. + + Uses EventReplayWorkerProvider which adds APScheduler-managed session cleanup. 
+ """ return make_async_container( SettingsProvider(), LoggingProvider(), @@ -166,14 +179,19 @@ def create_event_replay_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQProvider(), EventProvider(), - EventReplayProvider(), - context={Settings: settings}, + EventReplayWorkerProvider(), + context={Settings: settings, KafkaBroker: broker}, ) -def create_dlq_processor_container(settings: Settings) -> AsyncContainer: - """Create DI container for the DLQ processor worker.""" +def create_dlq_processor_container(settings: Settings, broker: KafkaBroker) -> AsyncContainer: + """Create DI container for the DLQ processor worker. + + Uses DLQWorkerProvider which adds APScheduler-managed retry monitoring + and configures retry policies and filters. + """ return make_async_container( SettingsProvider(), LoggingProvider(), @@ -183,8 +201,7 @@ def create_dlq_processor_container(settings: Settings) -> AsyncContainer: MetricsProvider(), RepositoryProvider(), MessagingProvider(), + DLQWorkerProvider(), EventProvider(), - context={Settings: settings}, + context={Settings: settings, KafkaBroker: broker}, ) - - diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py index eb8ac0ef..f44bfbed 100644 --- a/backend/app/core/dishka_lifespan.py +++ b/backend/app/core/dishka_lifespan.py @@ -1,24 +1,15 @@ from __future__ import annotations -import asyncio import logging from contextlib import asynccontextmanager from typing import AsyncGenerator -import redis.asyncio as redis -from beanie import init_beanie from dishka import AsyncContainer from fastapi import FastAPI +from faststream.kafka import KafkaBroker -from app.core.database_context import Database -from app.core.metrics import RateLimitMetrics -from app.core.startup import initialize_rate_limits from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS -from app.events.event_store import EventStore -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas from app.services.notification_scheduler import NotificationScheduler -from app.services.notification_service import NotificationService from app.settings import Settings @@ -27,13 +18,14 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: """ Application lifespan with dishka dependency injection. - This is much cleaner than the old lifespan.py: - - No dependency_overrides - - No manual service management - - Dishka handles all lifecycle automatically + Infrastructure init (Beanie, schemas, rate limits) is handled inside + DI providers. Resolving NotificationScheduler cascades through the + dependency graph and triggers all required initialisation. + + Kafka broker lifecycle is managed here (start/stop). 
""" - # Get settings and logger from DI container (uses test settings in tests) container: AsyncContainer = app.state.dishka_container + broker: KafkaBroker = app.state.kafka_broker settings = await container.get(Settings) logger = await container.get(logging.Logger) @@ -45,9 +37,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: }, ) - # Metrics setup moved to app creation to allow middleware registration - logger.info("Lifespan start: tracing and services initialization") - # Initialize tracing only when enabled (avoid exporter retries in tests) if settings.ENABLE_TRACING and not settings.TESTING: instrumentation_report = init_tracing( @@ -76,32 +65,17 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: extra={"testing": settings.TESTING, "enable_tracing": settings.ENABLE_TRACING}, ) - # Phase 1: Resolve all DI dependencies in parallel - # Consumers and the notification scheduler (APScheduler) start automatically via their DI providers - ( - schema_registry, - database, - redis_client, - rate_limit_metrics, - _event_store, - _notification_service, - _notification_scheduler, - ) = await asyncio.gather( - container.get(SchemaRegistryManager), - container.get(Database), - container.get(redis.Redis), - container.get(RateLimitMetrics), - container.get(EventStore), - container.get(NotificationService), - container.get(NotificationScheduler), - ) + # Resolve NotificationScheduler — cascades init_beanie, schema registration, + # and starts APScheduler via the DI provider graph. + await container.get(NotificationScheduler) + logger.info("Infrastructure initialized via DI providers") - # Phase 2: Initialize infrastructure in parallel (independent subsystems) - await asyncio.gather( - initialize_event_schemas(schema_registry), - init_beanie(database=database, document_models=ALL_DOCUMENTS), - initialize_rate_limits(redis_client, settings, logger, rate_limit_metrics), - ) - logger.info("Infrastructure initialized (schemas, beanie, rate limits)") + # Start Kafka broker (subscribers begin consuming) + await broker.start() + logger.info("Kafka broker started — consumers active") - yield + try: + yield + finally: + await broker.stop() + logger.info("Kafka broker stopped") diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 62fb0c34..a1824dda 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -1,14 +1,16 @@ from __future__ import annotations -import asyncio import logging from typing import AsyncIterator import redis.asyncio as redis -from aiokafka import AIOKafkaProducer +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from beanie import init_beanie from dishka import Provider, Scope, from_context, provide +from faststream.kafka import KafkaBroker from kubernetes_asyncio import client as k8s_client from kubernetes_asyncio import config as k8s_config +from kubernetes_asyncio.client.rest import ApiException from pymongo.asynchronous.mongo_client import AsyncMongoClient from app.core.database_context import Database @@ -29,6 +31,7 @@ ) from app.core.security import SecurityService from app.core.tracing import TracerManager +from app.db.docs import ALL_DOCUMENTS from app.db.repositories import ( EventRepository, ExecutionRepository, @@ -46,21 +49,11 @@ from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository from app.db.repositories.user_settings_repository import UserSettingsRepository from app.dlq.manager import DLQManager -from app.domain.enums.events import 
EventType -from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId -from app.domain.idempotency import KeyStrategy +from app.domain.rate_limit import RateLimitConfig from app.domain.saga.models import SagaConfig -from app.events.core import ( - ConsumerConfig, - EventDispatcher, - ProducerMetrics, - UnifiedConsumer, - UnifiedProducer, - create_dlq_error_handler, -) +from app.events.core import UnifiedProducer from app.events.event_store import EventStore, create_event_store from app.events.schema.schema_registry import SchemaRegistryManager -from app.infrastructure.kafka.topics import get_all_topics from app.services.admin import AdminEventsService, AdminSettingsService, AdminUserService from app.services.auth_service import AuthService from app.services.coordinator.coordinator import ExecutionCoordinator @@ -69,7 +62,6 @@ from app.services.execution_service import ExecutionService from app.services.grafana_alert_processor import GrafanaAlertProcessor from app.services.idempotency import IdempotencyConfig, IdempotencyManager -from app.services.idempotency.middleware import IdempotentEventDispatcher from app.services.idempotency.redis_repository import RedisIdempotencyRepository from app.services.k8s_worker import KubernetesWorker from app.services.kafka_event_service import KafkaEventService @@ -77,7 +69,7 @@ from app.services.notification_service import NotificationService from app.services.pod_monitor.config import PodMonitorConfig from app.services.pod_monitor.event_mapper import PodEventMapper -from app.services.pod_monitor.monitor import PodMonitor +from app.services.pod_monitor.monitor import ErrorType, PodMonitor from app.services.rate_limit_service import RateLimitService from app.services.result_processor.processor import ResultProcessor from app.services.result_processor.resource_cleaner import ResourceCleaner @@ -127,13 +119,28 @@ async def get_redis_client(self, settings: Settings, logger: logging.Logger) -> try: yield client finally: - await client.close() + await client.aclose() @provide - def get_rate_limit_service( - self, redis_client: redis.Redis, settings: Settings, rate_limit_metrics: RateLimitMetrics + async def get_rate_limit_service( + self, + redis_client: redis.Redis, + settings: Settings, + rate_limit_metrics: RateLimitMetrics, + logger: logging.Logger, ) -> RateLimitService: - return RateLimitService(redis_client, settings, rate_limit_metrics) + service = RateLimitService(redis_client, settings, rate_limit_metrics) + try: + config_key = f"{settings.RATE_LIMIT_REDIS_PREFIX}config" + existing_config = await redis_client.get(config_key) + if not existing_config: + logger.info("Initializing default rate limit configuration in Redis") + default_config = RateLimitConfig.get_default_config() + await service.update_config(default_config) + logger.info(f"Initialized {len(default_config.default_rules)} default rate limit rules") + except Exception as e: + logger.error(f"Failed to initialize rate limits: {e}") + return service class DatabaseProvider(Provider): @@ -145,7 +152,8 @@ async def get_database(self, settings: Settings, logger: logging.Logger) -> Asyn settings.MONGODB_URL, tz_aware=True, serverSelectionTimeoutMS=5000 ) database = client[settings.DATABASE_NAME] - logger.info(f"MongoDB connected: {settings.DATABASE_NAME}") + await init_beanie(database=database, document_models=ALL_DOCUMENTS) + logger.info(f"MongoDB connected and Beanie initialized: {settings.DATABASE_NAME}") try: yield database finally: @@ -167,60 +175,18 @@ def 
get_tracer_manager(self, settings: Settings) -> TracerManager: class MessagingProvider(Provider): scope = Scope.APP - @provide - async def get_kafka_producer( - self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger, - event_metrics: EventMetrics - ) -> AsyncIterator[UnifiedProducer]: - aiokafka_producer = AIOKafkaProducer( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - client_id=f"{settings.SERVICE_NAME}-producer", - acks="all", - compression_type="gzip", - max_batch_size=16384, - linger_ms=10, - enable_idempotence=True, - ) - await aiokafka_producer.start() - logger.info(f"Kafka producer started: {settings.KAFKA_BOOTSTRAP_SERVERS}") - try: - yield UnifiedProducer( - aiokafka_producer, schema_registry, logger, settings, event_metrics, ProducerMetrics(), - ) - finally: - await aiokafka_producer.stop() - logger.info("Kafka producer stopped") + broker = from_context(provides=KafkaBroker, scope=Scope.APP) @provide - async def get_dlq_manager( + def get_unified_producer( self, - settings: Settings, + broker: KafkaBroker, schema_registry: SchemaRegistryManager, logger: logging.Logger, - dlq_metrics: DLQMetrics, - repository: DLQRepository, - ) -> AsyncIterator[DLQManager]: - producer = AIOKafkaProducer( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - client_id="dlq-manager-producer", - acks="all", - compression_type="gzip", - max_batch_size=16384, - linger_ms=10, - enable_idempotence=True, - ) - await producer.start() - try: - yield DLQManager( - settings=settings, - producer=producer, - schema_registry=schema_registry, - logger=logger, - dlq_metrics=dlq_metrics, - repository=repository, - ) - finally: - await producer.stop() + settings: Settings, + event_metrics: EventMetrics, + ) -> UnifiedProducer: + return UnifiedProducer(broker, schema_registry, logger, settings, event_metrics) @provide def get_idempotency_repository(self, redis_client: redis.Redis) -> RedisIdempotencyRepository: @@ -233,64 +199,98 @@ def get_idempotency_manager( return IdempotencyManager(IdempotencyConfig(), repo, logger, database_metrics) -class EventProvider(Provider): - scope = Scope.APP +class DLQProvider(Provider): + """Provides DLQManager without scheduling. 
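
# NOTE (illustrative sketch, not part of this patch): MessagingProvider above no
# longer owns a Kafka client; it declares the broker as a context object via
# from_context. A minimal, self-contained illustration of that dishka pattern
# (the Service class and bootstrap servers are toy values, not project code):
import asyncio

from dishka import Provider, Scope, from_context, make_async_container, provide
from faststream.kafka import KafkaBroker


class Service:
    def __init__(self, broker: KafkaBroker) -> None:
        self.broker = broker


class MessagingLikeProvider(Provider):
    scope = Scope.APP
    # Declares that the KafkaBroker is supplied externally at container creation.
    broker = from_context(provides=KafkaBroker, scope=Scope.APP)

    @provide
    def get_service(self, broker: KafkaBroker) -> Service:
        return Service(broker)


async def demo() -> None:
    broker = KafkaBroker("localhost:9092")
    container = make_async_container(MessagingLikeProvider(), context={KafkaBroker: broker})
    service = await container.get(Service)  # receives the externally created broker
    assert service.broker is broker
    await container.close()


asyncio.run(demo())
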
Used by all containers except the DLQ worker.""" - @provide - def get_schema_registry(self, settings: Settings, logger: logging.Logger) -> SchemaRegistryManager: - return SchemaRegistryManager(settings, logger) + scope = Scope.APP @provide - async def get_event_store( + def get_dlq_manager( self, - schema_registry: SchemaRegistryManager, + broker: KafkaBroker, settings: Settings, - kafka_producer: UnifiedProducer, + schema_registry: SchemaRegistryManager, logger: logging.Logger, - event_metrics: EventMetrics, - ) -> AsyncIterator[EventStore]: - event_store = create_event_store( - schema_registry=schema_registry, logger=logger, event_metrics=event_metrics, ttl_days=90 + dlq_metrics: DLQMetrics, + repository: DLQRepository, + ) -> DLQManager: + return DLQManager( + settings=settings, + broker=broker, + schema_registry=schema_registry, + logger=logger, + dlq_metrics=dlq_metrics, + repository=repository, ) - dispatcher = EventDispatcher(logger=logger) - for event_type in EventType: - dispatcher.register_handler(event_type, event_store.store_event) - - config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.EVENT_STORE_CONSUMER, - enable_auto_commit=False, - max_poll_records=100, - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - kafka_consumer = UnifiedConsumer( - config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, + +class DLQWorkerProvider(Provider): + """Provides DLQManager with APScheduler-managed retry monitoring. + + Used by the DLQ worker container only. DLQManager configures its own + retry policies and filters; the provider only handles scheduling. 
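
# NOTE (illustrative sketch, not part of this patch): the DLQ worker's actual
# subscriber is not in this diff; DLQManager only gains parse_dlq_body() and
# handle_message() (see dlq/manager.py further down). A rough sketch of how a
# FastStream subscriber could feed it — the topic name, group id, the per-
# subscriber JSON decoder override, and the Context("message") injection are
# all assumptions, not code from this PR.
import json
from typing import Any

from faststream import Context, StreamMessage
from faststream.kafka import KafkaBroker

from app.dlq.manager import DLQManager


async def json_decoder(msg: StreamMessage[Any]) -> dict[str, Any]:
    # DLQ entries are JSON-encoded by DLQManager, so bypass the broker's Avro decoder.
    return json.loads(msg.body)


def register_dlq_subscriber(broker: KafkaBroker, manager: DLQManager) -> None:
    @broker.subscriber("dead-letter-queue", group_id="dlq-processor", decoder=json_decoder)
    async def consume_dlq(body: dict[str, Any], msg: Any = Context("message")) -> None:
        record = msg.raw_message  # aiokafka ConsumerRecord
        headers = {k: v.decode() for k, v in (record.headers or [])}
        dlq_message = manager.parse_dlq_body(body, record.offset, record.partition, headers)
        await manager.handle_message(dlq_message)
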
+ """ + + scope = Scope.APP + + @provide + async def get_dlq_manager( + self, + broker: KafkaBroker, + settings: Settings, + schema_registry: SchemaRegistryManager, + logger: logging.Logger, + dlq_metrics: DLQMetrics, + repository: DLQRepository, + database: Database, + ) -> AsyncIterator[DLQManager]: + manager = DLQManager( settings=settings, + broker=broker, + schema_registry=schema_registry, logger=logger, - event_metrics=event_metrics, + dlq_metrics=dlq_metrics, + repository=repository, ) - dlq_handler = create_dlq_error_handler( - producer=kafka_producer, logger=logger, max_retries=3, + scheduler = AsyncIOScheduler() + scheduler.add_job( + manager.process_monitoring_cycle, + trigger="interval", + seconds=10, + id="dlq_monitor_retries", + max_instances=1, + misfire_grace_time=60, ) - kafka_consumer.register_error_callback(dlq_handler) - - topics = get_all_topics() - await kafka_consumer.start(list(topics)) - logger.info(f"Event store consumer started for topics: {list(topics)}") + scheduler.start() + logger.info("DLQManager retry monitor started (APScheduler interval=10s)") try: - yield event_store + yield manager finally: - await kafka_consumer.stop() - logger.info("Event store consumer stopped") + scheduler.shutdown(wait=False) + logger.info("DLQManager retry monitor stopped") + + +class EventProvider(Provider): + scope = Scope.APP + + @provide + async def get_schema_registry(self, settings: Settings, logger: logging.Logger) -> SchemaRegistryManager: + registry = SchemaRegistryManager(settings, logger) + await registry.initialize_schemas() + return registry + @provide + def get_event_store( + self, + schema_registry: SchemaRegistryManager, + logger: logging.Logger, + event_metrics: EventMetrics, + ) -> EventStore: + return create_event_store( + schema_registry=schema_registry, logger=logger, event_metrics=event_metrics, ttl_days=90 + ) class KubernetesProvider(Provider): @@ -432,65 +432,14 @@ def get_user_repository(self) -> UserRepository: return UserRepository() -def _build_sse_consumers( - bus: SSERedisBus, - schema_registry: SchemaRegistryManager, - settings: Settings, - event_metrics: EventMetrics, - logger: logging.Logger, -) -> list[UnifiedConsumer]: - """Build SSE Kafka consumer pool (without starting them).""" - consumers: list[UnifiedConsumer] = [] - for i in range(settings.SSE_CONSUMER_POOL_SIZE): - config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id="sse-bridge-pool", - client_id=f"sse-consumer-{i}", - enable_auto_commit=True, - auto_offset_reset="latest", - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - dispatcher = EventDispatcher(logger=logger) - for et in SSERedisBus.SSE_ROUTED_EVENTS: - dispatcher.register_handler(et, bus.route_domain_event) - consumers.append(UnifiedConsumer( - config=config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, - settings=settings, - logger=logger, - event_metrics=event_metrics, - )) - return consumers - - class SSEProvider(Provider): """Provides SSE (Server-Sent Events) related services.""" scope = Scope.APP @provide - async def get_sse_redis_bus( - self, - redis_client: redis.Redis, - schema_registry: SchemaRegistryManager, - settings: Settings, - event_metrics: EventMetrics, - logger: logging.Logger, - ) -> AsyncIterator[SSERedisBus]: - bus = SSERedisBus(redis_client, logger) - consumers = 
_build_sse_consumers(bus, schema_registry, settings, event_metrics, logger) - topics = list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.WEBSOCKET_GATEWAY]) - await asyncio.gather(*[c.start(topics) for c in consumers]) - logger.info(f"SSE bus started with {len(consumers)} consumers") - try: - yield bus - finally: - await asyncio.gather(*[c.stop() for c in consumers], return_exceptions=True) - logger.info("SSE consumers stopped") + def get_sse_redis_bus(self, redis_client: redis.Redis, logger: logging.Logger) -> SSERedisBus: + return SSERedisBus(redis_client, logger) @provide(scope=Scope.REQUEST) def get_sse_service( @@ -582,18 +531,16 @@ def get_admin_settings_service( return AdminSettingsService(admin_settings_repository, logger) @provide - async def get_notification_service( + def get_notification_service( self, notification_repository: NotificationRepository, kafka_event_service: KafkaEventService, - schema_registry: SchemaRegistryManager, sse_redis_bus: SSERedisBus, settings: Settings, logger: logging.Logger, notification_metrics: NotificationMetrics, - event_metrics: EventMetrics, - ) -> AsyncIterator[NotificationService]: - service = NotificationService( + ) -> NotificationService: + return NotificationService( notification_repository=notification_repository, event_service=kafka_event_service, sse_bus=sse_redis_bus, @@ -602,48 +549,14 @@ async def get_notification_service( notification_metrics=notification_metrics, ) - dispatcher = EventDispatcher(logger=logger) - dispatcher.register_handler(EventType.EXECUTION_COMPLETED, service._handle_execution_event) - dispatcher.register_handler(EventType.EXECUTION_FAILED, service._handle_execution_event) - dispatcher.register_handler(EventType.EXECUTION_TIMEOUT, service._handle_execution_event) - - consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.NOTIFICATION_SERVICE, - max_poll_records=10, - enable_auto_commit=False, - auto_offset_reset="latest", - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, - settings=settings, - logger=logger, - event_metrics=event_metrics, - ) - await consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.NOTIFICATION_SERVICE])) - - logger.info("NotificationService started") - - try: - yield service - finally: - await consumer.stop() - logger.info("NotificationService stopped") - @provide async def get_notification_scheduler( self, notification_repository: NotificationRepository, notification_service: NotificationService, logger: logging.Logger, + database: Database, # ensures init_beanie completes before scheduler starts ) -> AsyncIterator[NotificationScheduler]: - from apscheduler.schedulers.asyncio import AsyncIOScheduler scheduler_service = NotificationScheduler( notification_repository=notification_repository, @@ -762,151 +675,42 @@ def get_admin_user_service( class CoordinatorProvider(Provider): scope = Scope.APP - @provide - def get_coordinator_dispatcher( - self, logger: logging.Logger, idempotency_manager: IdempotencyManager - ) -> EventDispatcher: - """Create idempotent EventDispatcher for coordinator.""" - return IdempotentEventDispatcher( - logger=logger, - idempotency_manager=idempotency_manager, - key_strategy=KeyStrategy.EVENT_BASED, - ttl_seconds=7200, - ) 
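
# NOTE (illustrative sketch, not part of this patch): the blocks removed below
# wired an IdempotentEventDispatcher plus a dedicated UnifiedConsumer for each
# worker. That consumption logic is not re-created in providers.py, so it
# presumably now lives in FastStream subscribers that rely on the broker's Avro
# decoder. A rough sketch for the coordinator — the topic, the group id, and the
# handle_event method name are hypothetical, not code from this PR.
from faststream.kafka import KafkaBroker

from app.domain.events.typed import DomainEvent
from app.services.coordinator.coordinator import ExecutionCoordinator


def register_coordinator_subscriber(broker: KafkaBroker, coordinator: ExecutionCoordinator) -> None:
    @broker.subscriber("execution-events", group_id="execution-coordinator")
    async def on_execution_event(event: DomainEvent) -> None:
        # The broker-level decoder (see events/broker.py below) has already
        # turned the Avro payload into a typed DomainEvent instance.
        await coordinator.handle_event(event)  # hypothetical handler method
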
- @provide def get_execution_coordinator( self, producer: UnifiedProducer, - dispatcher: EventDispatcher, execution_repository: ExecutionRepository, logger: logging.Logger, coordinator_metrics: CoordinatorMetrics, ) -> ExecutionCoordinator: - """Create ExecutionCoordinator - registers handlers on dispatcher in constructor.""" return ExecutionCoordinator( producer=producer, - dispatcher=dispatcher, execution_repository=execution_repository, logger=logger, coordinator_metrics=coordinator_metrics, ) - @provide - async def get_coordinator_consumer( - self, - coordinator: ExecutionCoordinator, # Ensures coordinator created first (handlers registered) - dispatcher: EventDispatcher, - schema_registry: SchemaRegistryManager, - settings: Settings, - logger: logging.Logger, - event_metrics: EventMetrics, - ) -> AsyncIterator[UnifiedConsumer]: - """Create and start consumer for coordinator.""" - consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.EXECUTION_COORDINATOR, - enable_auto_commit=False, - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - - consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, - settings=settings, - logger=logger, - event_metrics=event_metrics, - ) - - await consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.EXECUTION_COORDINATOR])) - logger.info("Coordinator consumer started") - - try: - yield consumer - finally: - await consumer.stop() - logger.info("Coordinator consumer stopped") - class K8sWorkerProvider(Provider): scope = Scope.APP - @provide - def get_k8s_worker_dispatcher( - self, logger: logging.Logger, idempotency_manager: IdempotencyManager - ) -> EventDispatcher: - """Create idempotent EventDispatcher for K8s worker.""" - return IdempotentEventDispatcher( - logger=logger, - idempotency_manager=idempotency_manager, - key_strategy=KeyStrategy.CONTENT_HASH, - ttl_seconds=3600, - ) - @provide def get_kubernetes_worker( self, api_client: k8s_client.ApiClient, kafka_producer: UnifiedProducer, - dispatcher: EventDispatcher, settings: Settings, logger: logging.Logger, event_metrics: EventMetrics, ) -> KubernetesWorker: - """Create KubernetesWorker - registers handlers on dispatcher in constructor.""" return KubernetesWorker( api_client=api_client, producer=kafka_producer, - dispatcher=dispatcher, - settings=settings, - logger=logger, - event_metrics=event_metrics, - ) - - @provide - async def get_k8s_worker_consumer( - self, - worker: KubernetesWorker, # Ensures worker created first (handlers registered) - dispatcher: EventDispatcher, - schema_registry: SchemaRegistryManager, - settings: Settings, - logger: logging.Logger, - event_metrics: EventMetrics, - ) -> AsyncIterator[UnifiedConsumer]: - """Create and start consumer for K8s worker.""" - consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.K8S_WORKER, - enable_auto_commit=False, - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - - consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, settings=settings, logger=logger, 
event_metrics=event_metrics, ) - await consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.K8S_WORKER])) - logger.info("K8s worker consumer started") - - try: - yield consumer - finally: - await worker.wait_for_active_creations() - await consumer.stop() - logger.info("K8s worker consumer stopped") - class PodMonitorProvider(Provider): scope = Scope.APP @@ -927,7 +731,9 @@ async def get_pod_monitor( logger: logging.Logger, event_mapper: PodEventMapper, kubernetes_metrics: KubernetesMetrics, + database: Database, ) -> AsyncIterator[PodMonitor]: + config = PodMonitorConfig() monitor = PodMonitor( config=config, @@ -937,28 +743,54 @@ async def get_pod_monitor( event_mapper=event_mapper, kubernetes_metrics=kubernetes_metrics, ) - await monitor.start() + + async def _watch_cycle() -> None: + try: + await monitor.watch_pod_events() + except ApiException as e: + if e.status == 410: + logger.warning("Resource version expired, resetting watch cursor") + monitor._last_resource_version = None + kubernetes_metrics.record_pod_monitor_watch_error(ErrorType.RESOURCE_VERSION_EXPIRED) + else: + logger.error(f"API error in watch: {e}") + kubernetes_metrics.record_pod_monitor_watch_error(ErrorType.API_ERROR) + kubernetes_metrics.increment_pod_monitor_watch_reconnects() + except Exception as e: + logger.error(f"Unexpected error in watch: {e}", exc_info=True) + kubernetes_metrics.record_pod_monitor_watch_error(ErrorType.UNEXPECTED) + kubernetes_metrics.increment_pod_monitor_watch_reconnects() + + scheduler = AsyncIOScheduler() + scheduler.add_job( + _watch_cycle, + trigger="interval", + seconds=5, + id="pod_monitor_watch", + max_instances=1, + misfire_grace_time=60, + ) + scheduler.start() + logger.info("PodMonitor scheduler started (list-then-watch)") + try: yield monitor finally: - await monitor.stop() + scheduler.shutdown(wait=False) class SagaOrchestratorProvider(Provider): scope = Scope.APP @provide - async def get_saga_orchestrator( + def get_saga_orchestrator( self, saga_repository: SagaRepository, kafka_producer: UnifiedProducer, - schema_registry: SchemaRegistryManager, - settings: Settings, resource_allocation_repository: ResourceAllocationRepository, logger: logging.Logger, - event_metrics: EventMetrics, - ) -> AsyncIterator[SagaOrchestrator]: - orchestrator = SagaOrchestrator( + ) -> SagaOrchestrator: + return SagaOrchestrator( config=_create_default_saga_config(), saga_repository=saga_repository, producer=kafka_producer, @@ -966,68 +798,66 @@ async def get_saga_orchestrator( logger=logger, ) - dispatcher = EventDispatcher(logger=logger) - dispatcher.register_handler(EventType.EXECUTION_REQUESTED, orchestrator.handle_execution_requested) - dispatcher.register_handler(EventType.EXECUTION_COMPLETED, orchestrator.handle_execution_completed) - dispatcher.register_handler(EventType.EXECUTION_FAILED, orchestrator.handle_execution_failed) - dispatcher.register_handler(EventType.EXECUTION_TIMEOUT, orchestrator.handle_execution_timeout) - - consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.SAGA_ORCHESTRATOR, - enable_auto_commit=False, - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, - settings=settings, - logger=logger, - 
event_metrics=event_metrics, - ) +class SagaWorkerProvider(Provider): + """Provides SagaOrchestrator with APScheduler-managed timeout checking. + + Used by the saga worker container only. The main app container uses + SagaOrchestratorProvider (no scheduler needed). + """ - await consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.SAGA_ORCHESTRATOR])) + scope = Scope.APP + + @provide + async def get_saga_orchestrator( + self, + saga_repository: SagaRepository, + kafka_producer: UnifiedProducer, + resource_allocation_repository: ResourceAllocationRepository, + logger: logging.Logger, + database: Database, + ) -> AsyncIterator[SagaOrchestrator]: - async def timeout_loop() -> None: - while True: - await asyncio.sleep(30) - try: - await orchestrator.check_timeouts() - except Exception as exc: - logger.error(f"Error checking saga timeouts: {exc}") + orchestrator = SagaOrchestrator( + config=_create_default_saga_config(), + saga_repository=saga_repository, + producer=kafka_producer, + resource_allocation_repository=resource_allocation_repository, + logger=logger, + ) - timeout_task = asyncio.create_task(timeout_loop()) - logger.info("Saga orchestrator consumer and timeout checker started") + scheduler = AsyncIOScheduler() + scheduler.add_job( + orchestrator.check_timeouts, + trigger="interval", + seconds=30, + id="saga_check_timeouts", + max_instances=1, + misfire_grace_time=60, + ) + scheduler.start() + logger.info("SagaOrchestrator timeout scheduler started (APScheduler interval=30s)") try: yield orchestrator finally: - timeout_task.cancel() - await consumer.stop() - logger.info("Saga orchestrator stopped") + scheduler.shutdown(wait=False) + logger.info("SagaOrchestrator timeout scheduler stopped") class ResultProcessorProvider(Provider): scope = Scope.APP @provide - async def get_result_processor( + def get_result_processor( self, execution_repo: ExecutionRepository, kafka_producer: UnifiedProducer, - schema_registry: SchemaRegistryManager, settings: Settings, logger: logging.Logger, execution_metrics: ExecutionMetrics, - event_metrics: EventMetrics, - idempotency_manager: IdempotencyManager, - ) -> AsyncIterator[ResultProcessor]: - processor = ResultProcessor( + ) -> ResultProcessor: + return ResultProcessor( execution_repo=execution_repo, producer=kafka_producer, settings=settings, @@ -1035,60 +865,49 @@ async def get_result_processor( execution_metrics=execution_metrics, ) - dispatcher = IdempotentEventDispatcher( - logger=logger, - idempotency_manager=idempotency_manager, - key_strategy=KeyStrategy.CONTENT_HASH, - ttl_seconds=7200, - ) - dispatcher.register_handler(EventType.EXECUTION_COMPLETED, processor.handle_execution_completed) - dispatcher.register_handler(EventType.EXECUTION_FAILED, processor.handle_execution_failed) - dispatcher.register_handler(EventType.EXECUTION_TIMEOUT, processor.handle_execution_timeout) - - consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.RESULT_PROCESSOR, - max_poll_records=1, - enable_auto_commit=False, - auto_offset_reset="earliest", - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - consumer = UnifiedConsumer( - consumer_config, - event_dispatcher=dispatcher, - schema_registry=schema_registry, - settings=settings, +class EventReplayProvider(Provider): + scope = Scope.APP + + @provide + def 
get_event_replay_service( + self, + replay_repository: ReplayRepository, + kafka_producer: UnifiedProducer, + event_store: EventStore, + replay_metrics: ReplayMetrics, + logger: logging.Logger, + ) -> EventReplayService: + return EventReplayService( + repository=replay_repository, + producer=kafka_producer, + event_store=event_store, + replay_metrics=replay_metrics, logger=logger, - event_metrics=event_metrics, ) - await consumer.start(list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.RESULT_PROCESSOR])) - logger.info("ResultProcessor consumer started") - try: - yield processor - finally: - await consumer.stop() - logger.info("ResultProcessor stopped") +class EventReplayWorkerProvider(Provider): + """Provides EventReplayService with APScheduler-managed session cleanup. + Used by the event replay worker container only. The main app container + uses EventReplayProvider (no scheduled cleanup needed). + """ -class EventReplayProvider(Provider): scope = Scope.APP @provide - def get_event_replay_service( + async def get_event_replay_service( self, replay_repository: ReplayRepository, kafka_producer: UnifiedProducer, event_store: EventStore, replay_metrics: ReplayMetrics, logger: logging.Logger, - ) -> EventReplayService: - return EventReplayService( + database: Database, + ) -> AsyncIterator[EventReplayService]: + + service = EventReplayService( repository=replay_repository, producer=kafka_producer, event_store=event_store, @@ -1096,4 +915,21 @@ def get_event_replay_service( logger=logger, ) + scheduler = AsyncIOScheduler() + scheduler.add_job( + service.cleanup_old_sessions, + trigger="interval", + hours=6, + kwargs={"older_than_hours": 48}, + id="replay_cleanup_old_sessions", + max_instances=1, + misfire_grace_time=300, + ) + scheduler.start() + logger.info("EventReplayService cleanup scheduler started (APScheduler interval=6h)") + try: + yield service + finally: + scheduler.shutdown(wait=False) + logger.info("EventReplayService cleanup scheduler stopped") diff --git a/backend/app/core/startup.py b/backend/app/core/startup.py deleted file mode 100644 index ca605ec2..00000000 --- a/backend/app/core/startup.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import annotations - -import logging - -import redis.asyncio as redis - -from app.core.metrics import RateLimitMetrics -from app.domain.rate_limit import RateLimitConfig -from app.services.rate_limit_service import RateLimitService -from app.settings import Settings - - -async def initialize_rate_limits( - redis_client: redis.Redis, - settings: Settings, - logger: logging.Logger, - rate_limit_metrics: RateLimitMetrics, -) -> None: - """ - Initialize default rate limits in Redis on application startup. - This ensures default limits are always available. 
- """ - try: - service = RateLimitService(redis_client, settings, rate_limit_metrics) - - # Check if config already exists - config_key = f"{settings.RATE_LIMIT_REDIS_PREFIX}config" - existing_config = await redis_client.get(config_key) - - if not existing_config: - logger.info("Initializing default rate limit configuration in Redis") - - # Get default config and save it - default_config = RateLimitConfig.get_default_config() - await service.update_config(default_config) - - logger.info(f"Initialized {len(default_config.default_rules)} default rate limit rules") - else: - logger.info("Rate limit configuration already exists in Redis") - - except Exception as e: - logger.error(f"Failed to initialize rate limits: {e}") - # Don't fail startup if rate limit init fails - # The service will use defaults if Redis is unavailable diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index d912adad..17232b8c 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -3,7 +3,7 @@ from datetime import datetime, timezone from typing import Any, Callable -from aiokafka import AIOKafkaProducer +from faststream.kafka import KafkaBroker from app.core.metrics import DLQMetrics from app.core.tracing.utils import inject_trace_context @@ -39,7 +39,7 @@ class DLQManager: def __init__( self, settings: Settings, - producer: AIOKafkaProducer, + broker: KafkaBroker, schema_registry: SchemaRegistryManager, logger: logging.Logger, dlq_metrics: DLQMetrics, @@ -51,29 +51,89 @@ def __init__( filters: list[Callable[[DLQMessage], bool]] | None = None, ): self.settings = settings - self.producer = producer + self._broker = broker self.schema_registry = schema_registry self.logger = logger self.metrics = dlq_metrics self.repository = repository self.dlq_topic = dlq_topic self.retry_topic_suffix = retry_topic_suffix + self.default_retry_policy = default_retry_policy or RetryPolicy( - topic="default", strategy=RetryStrategy.EXPONENTIAL_BACKOFF + topic="default", + strategy=RetryStrategy.EXPONENTIAL_BACKOFF, + max_retries=4, + base_delay_seconds=60, + max_delay_seconds=1800, + retry_multiplier=2.5, ) - self._retry_policies: dict[str, RetryPolicy] = dict(retry_policies or {}) - self._filters: list[Callable[[DLQMessage], bool]] = list(filters or []) + self._retry_policies: dict[str, RetryPolicy] = retry_policies if retry_policies is not None else { + "execution-requests": RetryPolicy( + topic="execution-requests", + strategy=RetryStrategy.EXPONENTIAL_BACKOFF, + max_retries=5, + base_delay_seconds=30, + max_delay_seconds=300, + retry_multiplier=2.0, + ), + "pod-events": RetryPolicy( + topic="pod-events", + strategy=RetryStrategy.EXPONENTIAL_BACKOFF, + max_retries=3, + base_delay_seconds=60, + max_delay_seconds=600, + retry_multiplier=3.0, + ), + "resource-allocation": RetryPolicy( + topic="resource-allocation", + strategy=RetryStrategy.IMMEDIATE, + max_retries=3, + ), + "websocket-events": RetryPolicy( + topic="websocket-events", + strategy=RetryStrategy.FIXED_INTERVAL, + max_retries=10, + base_delay_seconds=10, + ), + } + + self._filters: list[Callable[[DLQMessage], bool]] = filters if filters is not None else [ + f for f in [ + None if settings.TESTING else self._filter_test_events, + self._filter_old_messages, + ] if f is not None + ] self._dlq_events_topic = f"{settings.KAFKA_TOPIC_PREFIX}{KafkaTopic.DLQ_EVENTS}" self._event_metadata = EventMetadata(service_name="dlq-manager", service_version="1.0.0") + def _filter_test_events(self, message: DLQMessage) -> bool: + event_id = 
message.event.event_id or "" + return not event_id.startswith("test-") + + def _filter_old_messages(self, message: DLQMessage) -> bool: + max_age_days = 7 + age_seconds = (datetime.now(timezone.utc) - message.failed_at).total_seconds() + return age_seconds < (max_age_days * 24 * 3600) + + async def process_monitoring_cycle(self) -> None: + """Process due retries and update queue metrics. Called by APScheduler.""" + await self.process_due_retries() + await self.update_queue_metrics() + def parse_kafka_message(self, msg: Any) -> DLQMessage: """Parse a raw Kafka ConsumerRecord into a DLQMessage.""" data = json.loads(msg.value) headers = {k: v.decode() for k, v in (msg.headers or [])} return DLQMessage(**data, dlq_offset=msg.offset, dlq_partition=msg.partition, headers=headers) + def parse_dlq_body( + self, data: dict[str, Any], offset: int, partition: int, headers: dict[str, str] + ) -> DLQMessage: + """Parse a deserialized DLQ message body into a DLQMessage.""" + return DLQMessage(**data, dlq_offset=offset, dlq_partition=partition, headers=headers) + async def handle_message(self, message: DLQMessage) -> None: """Process a single DLQ message: filter → store → decide retry/discard.""" for filter_func in self._filters: @@ -111,21 +171,21 @@ async def retry_message(self, message: DLQMessage) -> None: "dlq_retry_timestamp": datetime.now(timezone.utc).isoformat(), } hdrs = inject_trace_context(hdrs) - kafka_headers: list[tuple[str, bytes]] = [(k, v.encode()) for k, v in hdrs.items()] event = message.event + serialized = json.dumps(event.model_dump(mode="json")).encode() - await self.producer.send_and_wait( + await self._broker.publish( + message=serialized, topic=retry_topic, - value=json.dumps(event.model_dump(mode="json")).encode(), key=message.event.event_id.encode(), - headers=kafka_headers, + headers=hdrs, ) - await self.producer.send_and_wait( + await self._broker.publish( + message=serialized, topic=message.original_topic, - value=json.dumps(event.model_dump(mode="json")).encode(), key=message.event.event_id.encode(), - headers=kafka_headers, + headers=hdrs, ) self.metrics.record_dlq_message_retried(message.original_topic, message.event.event_type, "success") @@ -285,9 +345,9 @@ async def _produce_dlq_event( ) -> None: try: serialized = await self.schema_registry.serialize_event(event) - await self.producer.send_and_wait( + await self._broker.publish( + message=serialized, topic=self._dlq_events_topic, - value=serialized, key=event.event_id.encode(), ) except Exception as e: diff --git a/backend/app/events/broker.py b/backend/app/events/broker.py new file mode 100644 index 00000000..3a6e3336 --- /dev/null +++ b/backend/app/events/broker.py @@ -0,0 +1,39 @@ +import logging +from typing import Any + +from faststream import StreamMessage +from faststream.kafka import KafkaBroker + +from app.domain.events.typed import DomainEvent +from app.events.schema.schema_registry import SchemaRegistryManager +from app.settings import Settings + + +def create_avro_decoder( + schema_registry: SchemaRegistryManager, +) -> Any: + """Create a custom Avro decoder closure for FastStream subscribers. + + The decoder receives a StreamMessage whose body is Confluent wire-format + Avro bytes (magic byte + 4-byte schema ID + Avro payload). We delegate + deserialization to SchemaRegistryManager which resolves the schema from + the registry and decodes into the concrete DomainEvent subclass. 
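
# NOTE (illustrative sketch, not part of this patch): for reference, the
# Confluent wire format mentioned in the docstring above is byte 0 = magic 0x00,
# bytes 1-4 = big-endian schema ID, and the Avro body after that.
# SchemaRegistryManager handles this framing internally; the helper below only
# illustrates the layout and is not project code.
import struct


def split_confluent_frame(payload: bytes) -> tuple[int, bytes]:
    """Return (schema_id, avro_body) from a Confluent wire-format message."""
    if len(payload) < 5 or payload[0] != 0:
        raise ValueError("not a Confluent wire-format payload")
    (schema_id,) = struct.unpack(">I", payload[1:5])
    return schema_id, payload[5:]


# e.g. split_confluent_frame(b"\x00\x00\x00\x00\x2a" + avro_body) -> (42, avro_body)
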
+ """ + + async def avro_decoder(msg: StreamMessage[Any]) -> DomainEvent: + return await schema_registry.deserialize_event(msg.body, msg.raw_message.topic) + + return avro_decoder + + +def create_broker( + settings: Settings, + schema_registry: SchemaRegistryManager, + logger: logging.Logger, +) -> KafkaBroker: + """Create a KafkaBroker with Avro decoder for standalone workers.""" + return KafkaBroker( + settings.KAFKA_BOOTSTRAP_SERVERS, + decoder=create_avro_decoder(schema_registry), + logger=logger, + ) diff --git a/backend/app/events/core/__init__.py b/backend/app/events/core/__init__.py index f18b41e3..555a3e77 100644 --- a/backend/app/events/core/__init__.py +++ b/backend/app/events/core/__init__.py @@ -1,28 +1,5 @@ -from .consumer import UnifiedConsumer -from .dispatcher import EventDispatcher -from .dlq_handler import ( - create_dlq_error_handler, - create_immediate_dlq_handler, -) from .producer import UnifiedProducer -from .types import ( - ConsumerConfig, - ConsumerMetrics, - ConsumerState, - ProducerMetrics, -) __all__ = [ - # Types - "ConsumerState", - "ConsumerConfig", - "ProducerMetrics", - "ConsumerMetrics", - # Core components "UnifiedProducer", - "UnifiedConsumer", - "EventDispatcher", - # Helpers - "create_dlq_error_handler", - "create_immediate_dlq_handler", ] diff --git a/backend/app/events/core/consumer.py b/backend/app/events/core/consumer.py deleted file mode 100644 index 2c4e2854..00000000 --- a/backend/app/events/core/consumer.py +++ /dev/null @@ -1,261 +0,0 @@ -import asyncio -import logging -from collections.abc import Awaitable, Callable -from datetime import datetime, timezone -from typing import Any - -from aiokafka import AIOKafkaConsumer, TopicPartition -from aiokafka.errors import KafkaError -from opentelemetry.trace import SpanKind - -from app.core.metrics import EventMetrics -from app.core.tracing import EventAttributes -from app.core.tracing.utils import extract_trace_context, get_tracer -from app.domain.enums.kafka import KafkaTopic -from app.domain.events.typed import DomainEvent -from app.events.schema.schema_registry import SchemaRegistryManager -from app.settings import Settings - -from .dispatcher import EventDispatcher -from .types import ConsumerConfig, ConsumerMetrics, ConsumerMetricsSnapshot, ConsumerState, ConsumerStatus - - -class UnifiedConsumer: - def __init__( - self, - config: ConsumerConfig, - event_dispatcher: EventDispatcher, - schema_registry: SchemaRegistryManager, - settings: Settings, - logger: logging.Logger, - event_metrics: EventMetrics, - ): - self._config = config - self.logger = logger - self._schema_registry = schema_registry - self._dispatcher = event_dispatcher - self._consumer: AIOKafkaConsumer | None = None - self._state = ConsumerState.STOPPED - self._running = False - self._metrics = ConsumerMetrics() - self._event_metrics = event_metrics - self._error_callback: "Callable[[Exception, DomainEvent, str], Awaitable[None]] | None" = None - self._consume_task: asyncio.Task[None] | None = None - self._topic_prefix = settings.KAFKA_TOPIC_PREFIX - - async def start(self, topics: list[KafkaTopic]) -> None: - self._state = self._state if self._state != ConsumerState.STOPPED else ConsumerState.STARTING - - topic_strings = [f"{self._topic_prefix}{str(topic)}" for topic in topics] - - self._consumer = AIOKafkaConsumer( - *topic_strings, - bootstrap_servers=self._config.bootstrap_servers, - group_id=self._config.group_id, - client_id=self._config.client_id, - auto_offset_reset=self._config.auto_offset_reset, - 
enable_auto_commit=self._config.enable_auto_commit, - session_timeout_ms=self._config.session_timeout_ms, - heartbeat_interval_ms=self._config.heartbeat_interval_ms, - max_poll_interval_ms=self._config.max_poll_interval_ms, - request_timeout_ms=self._config.request_timeout_ms, - fetch_min_bytes=self._config.fetch_min_bytes, - fetch_max_wait_ms=self._config.fetch_max_wait_ms, - ) - - await self._consumer.start() - self._running = True - self._consume_task = asyncio.create_task(self._consume_loop()) - - self._state = ConsumerState.RUNNING - - self.logger.info(f"Consumer started for topics: {topic_strings}") - - async def stop(self) -> None: - self._state = ( - ConsumerState.STOPPING - if self._state not in (ConsumerState.STOPPED, ConsumerState.STOPPING) - else self._state - ) - - self._running = False - - if self._consume_task: - self._consume_task.cancel() - await asyncio.gather(self._consume_task, return_exceptions=True) - self._consume_task = None - - await self._cleanup() - self._state = ConsumerState.STOPPED - - async def _cleanup(self) -> None: - if self._consumer: - await self._consumer.stop() - self._consumer = None - - async def _consume_loop(self) -> None: - self.logger.info(f"Consumer loop started for group {self._config.group_id}") - poll_count = 0 - message_count = 0 - - while self._running and self._consumer: - poll_count += 1 - if poll_count % 100 == 0: # Log every 100 polls - self.logger.debug(f"Consumer loop active: polls={poll_count}, messages={message_count}") - - try: - # Use getone() with timeout for single message consumption - msg = await asyncio.wait_for( - self._consumer.getone(), - timeout=0.1 - ) - - message_count += 1 - self.logger.debug( - f"Message received from topic {msg.topic}, partition {msg.partition}, offset {msg.offset}" - ) - await self._process_message(msg) - if not self._config.enable_auto_commit: - await self._consumer.commit() - - except asyncio.TimeoutError: - # No message available within timeout, continue polling - await asyncio.sleep(0.01) - except KafkaError as e: - self.logger.error(f"Consumer error: {e}") - self._metrics.processing_errors += 1 - except Exception as e: - self.logger.error(f"Unexpected error in consume loop: {e}", exc_info=True) - - self.logger.warning( - f"Consumer loop ended for group {self._config.group_id}: " - f"running={self._running}, consumer={self._consumer is not None}" - ) - - async def _process_message(self, message: Any) -> None: - """Process a ConsumerRecord from aiokafka.""" - topic = message.topic - if not topic: - self.logger.warning("Message with no topic received") - return - - raw_value = message.value - if not raw_value: - self.logger.warning(f"Empty message from topic {topic}") - return - - self.logger.debug(f"Deserializing message from topic {topic}, size={len(raw_value)} bytes") - event = await self._schema_registry.deserialize_event(raw_value, topic) - self.logger.info(f"Deserialized event: type={event.event_type}, id={event.event_id}") - - # Extract trace context from Kafka headers and start a consumer span - # aiokafka headers are list of tuples: [(key, value), ...] 
- header_list = message.headers or [] - headers: dict[str, str] = {} - for k, v in header_list: - headers[str(k)] = v.decode("utf-8") if isinstance(v, (bytes, bytearray)) else (v or "") - ctx = extract_trace_context(headers) - tracer = get_tracer() - - # Dispatch event through EventDispatcher - try: - self.logger.debug(f"Dispatching {event.event_type} to handlers") - partition_val = message.partition - offset_val = message.offset - part_attr = partition_val if partition_val is not None else -1 - off_attr = offset_val if offset_val is not None else -1 - with tracer.start_as_current_span( - name="kafka.consume", - context=ctx, - kind=SpanKind.CONSUMER, - attributes={ - EventAttributes.KAFKA_TOPIC: topic, - EventAttributes.KAFKA_PARTITION: part_attr, - EventAttributes.KAFKA_OFFSET: off_attr, - EventAttributes.EVENT_TYPE: event.event_type, - EventAttributes.EVENT_ID: event.event_id, - }, - ): - await self._dispatcher.dispatch(event) - self.logger.debug(f"Successfully dispatched {event.event_type}") - # Update metrics on successful dispatch - self._metrics.messages_consumed += 1 - self._metrics.bytes_consumed += len(raw_value) - self._metrics.last_message_time = datetime.now(timezone.utc) - # Record Kafka consumption metrics - self._event_metrics.record_kafka_message_consumed(topic=topic, consumer_group=self._config.group_id) - except Exception as e: - self.logger.error(f"Dispatcher error for event {event.event_type}: {e}") - self._metrics.processing_errors += 1 - # Record Kafka consumption error - self._event_metrics.record_kafka_consumption_error( - topic=topic, consumer_group=self._config.group_id, error_type=type(e).__name__ - ) - if self._error_callback: - await self._error_callback(e, event, topic) - raise - - def register_error_callback(self, callback: Callable[[Exception, DomainEvent, str], Awaitable[None]]) -> None: - self._error_callback = callback - - @property - def state(self) -> ConsumerState: - return self._state - - @property - def metrics(self) -> ConsumerMetrics: - return self._metrics - - @property - def is_running(self) -> bool: - return self._state == ConsumerState.RUNNING - - @property - def consumer(self) -> AIOKafkaConsumer | None: - return self._consumer - - def get_status(self) -> ConsumerStatus: - return ConsumerStatus( - state=self._state, - is_running=self.is_running, - group_id=self._config.group_id, - client_id=self._config.client_id, - metrics=ConsumerMetricsSnapshot( - messages_consumed=self._metrics.messages_consumed, - bytes_consumed=self._metrics.bytes_consumed, - consumer_lag=self._metrics.consumer_lag, - commit_failures=self._metrics.commit_failures, - processing_errors=self._metrics.processing_errors, - last_message_time=self._metrics.last_message_time, - last_updated=self._metrics.last_updated, - ), - ) - - async def seek_to_beginning(self) -> None: - """Seek all assigned partitions to the beginning.""" - if not self._consumer: - self.logger.warning("Cannot seek: consumer not initialized") - return - - assignment = self._consumer.assignment() - if assignment: - await self._consumer.seek_to_beginning(*assignment) - - async def seek_to_end(self) -> None: - """Seek all assigned partitions to the end.""" - if not self._consumer: - self.logger.warning("Cannot seek: consumer not initialized") - return - - assignment = self._consumer.assignment() - if assignment: - await self._consumer.seek_to_end(*assignment) - - async def seek_to_offset(self, topic: str, partition: int, offset: int) -> None: - """Seek a specific partition to a specific offset.""" - if not 
self._consumer: - self.logger.warning("Cannot seek to offset: consumer not initialized") - return - - tp = TopicPartition(topic, partition) - self._consumer.seek(tp, offset) diff --git a/backend/app/events/core/dispatcher.py b/backend/app/events/core/dispatcher.py deleted file mode 100644 index c07afe1b..00000000 --- a/backend/app/events/core/dispatcher.py +++ /dev/null @@ -1,110 +0,0 @@ -import asyncio -import logging -from collections import defaultdict -from collections.abc import Awaitable, Callable -from typing import Any, TypeAlias, TypeVar - -from app.domain.enums.events import EventType -from app.domain.events.typed import DomainEvent - -T = TypeVar("T", bound=DomainEvent) -EventHandler: TypeAlias = Callable[[DomainEvent], Awaitable[Any]] - - -class EventDispatcher: - """ - Type-safe event dispatcher with automatic routing. - - This dispatcher eliminates the need for manual if/elif routing by maintaining - a direct mapping from event types to their handlers. - - Subclasses may override ``_wrap_handler`` to intercept handler registration - (e.g. ``IdempotentEventDispatcher`` adds idempotency protection). - """ - - def __init__(self, logger: logging.Logger) -> None: - self.logger = logger - - # Map event types to their handlers - self._handlers: dict[EventType, list[EventHandler]] = defaultdict(list) - - def _wrap_handler(self, handler: EventHandler) -> EventHandler: - """Hook for subclasses to wrap handlers at registration time.""" - return handler - - def register( - self, event_type: EventType - ) -> Callable[[Callable[[T], Awaitable[None]]], Callable[[T], Awaitable[None]]]: - """ - Decorator for registering type-safe event handlers. - - Generic over T (any DomainEvent subtype) - accepts handlers with specific - event types while preserving their type signature for callers. - - Usage: - @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def handle_execution(event: ExecutionRequestedEvent) -> None: - # Handler logic here - event is properly typed - """ - - def decorator(handler: Callable[[T], Awaitable[None]]) -> Callable[[T], Awaitable[None]]: - self.logger.info(f"Registering handler '{handler.__name__}' for event type '{event_type}'") - # Safe: dispatch() routes by event_type, guaranteeing correct types at runtime - self._handlers[event_type].append(self._wrap_handler(handler)) # type: ignore[arg-type] - return handler - - return decorator - - def register_handler(self, event_type: EventType, handler: EventHandler) -> None: - """ - Direct registration method for handlers. - - Args: - event_type: The event type this handler processes - handler: The async handler function - """ - self.logger.info(f"Registering handler '{handler.__name__}' for event type '{event_type}'") - self._handlers[event_type].append(self._wrap_handler(handler)) - - async def dispatch(self, event: DomainEvent) -> None: - """ - Dispatch an event to all registered handlers for its type. 
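The dispatch() method being deleted here is superseded by FastStream's filter-based routing in the new app/events/handlers.py added further down in this diff: one broker.subscriber(...) per consumer group, with per-event-type handlers selected by an "event_type" header filter instead of an EventType-to-handler map. A compressed sketch of that registration pattern, with placeholder topic, group, and handler names (the real handlers receive typed DomainEvent bodies and Dishka-injected services):

from faststream import AckPolicy
from faststream.kafka import KafkaBroker

broker = KafkaBroker("localhost:9092")  # placeholder bootstrap servers

# One subscriber object per consumer group; it is reused as a decorator
# for every handler belonging to that group.
sub = broker.subscriber(
    "execution-events",        # placeholder topic name
    group_id="example-group",  # placeholder consumer group
    ack_policy=AckPolicy.ACK,
)


@sub(filter=lambda msg: msg.headers.get("event_type") == "execution_requested")
async def on_execution_requested(body: dict) -> None:
    ...  # handler body elided in this sketch


@sub  # catch-all so messages with unmatched event types are still consumed
async def on_unhandled(body: dict) -> None:
    pass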
- - Args: - event: The event to dispatch - """ - event_type = event.event_type - handlers = self._handlers.get(event_type, []) - self.logger.debug(f"Dispatcher has {len(self._handlers)} event types registered") - self.logger.debug( - f"For event type {event_type}, found {len(handlers)} handlers: {[h.__class__.__name__ for h in handlers]}" - ) - - if not handlers: - self.logger.debug(f"No handlers registered for event type {event_type}") - return - - self.logger.debug(f"Dispatching {event_type} to {len(handlers)} handler(s)") - - # Run handlers concurrently for better performance - tasks = [self._execute_handler(handler, event) for handler in handlers] - await asyncio.gather(*tasks) - - async def _execute_handler(self, handler: EventHandler, event: DomainEvent) -> None: - """ - Execute a single handler with error handling. - - Args: - handler: The handler function - event: The event to process - """ - try: - self.logger.debug(f"Executing handler {handler.__class__.__name__} for event {event.event_id}") - await handler(event) - self.logger.debug(f"Handler {handler.__class__.__name__} completed") - except Exception as e: - self.logger.error( - f"Handler '{handler.__class__.__name__}' failed for event {event.event_id}: {e}", exc_info=True - ) - raise - diff --git a/backend/app/events/core/dlq_handler.py b/backend/app/events/core/dlq_handler.py deleted file mode 100644 index 3482ceb4..00000000 --- a/backend/app/events/core/dlq_handler.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -from typing import Awaitable, Callable - -from app.domain.events.typed import DomainEvent - -from .producer import UnifiedProducer - - -def create_dlq_error_handler( - producer: UnifiedProducer, logger: logging.Logger, max_retries: int = 3 -) -> Callable[[Exception, DomainEvent, str], Awaitable[None]]: - """Create an error handler that sends failed events to DLQ after max retries.""" - retry_counts: dict[str, int] = {} - - async def handle_error_with_dlq(error: Exception, event: DomainEvent, topic: str) -> None: - event_id = event.event_id or "unknown" - retry_count = retry_counts.get(event_id, 0) - retry_counts[event_id] = retry_count + 1 - logger.error(f"Error processing {event_id}: {error}. Retry {retry_count + 1}/{max_retries}", exc_info=True) - if retry_count >= max_retries: - logger.warning(f"Event {event_id} exceeded max retries. Sending to DLQ.") - await producer.send_to_dlq(event, topic, error, retry_count) - retry_counts.pop(event_id, None) - - return handle_error_with_dlq - - -def create_immediate_dlq_handler( - producer: UnifiedProducer, logger: logging.Logger -) -> Callable[[Exception, DomainEvent, str], Awaitable[None]]: - """Create an error handler that immediately sends failed events to DLQ.""" - - async def handle_error_immediate_dlq(error: Exception, event: DomainEvent, topic: str) -> None: - logger.error(f"Critical error processing {event.event_id}: {error}. 
Sending to DLQ.", exc_info=True) - await producer.send_to_dlq(event, topic, error, 0) - - return handle_error_immediate_dlq diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py index 6745abc8..b7b4037c 100644 --- a/backend/app/events/core/producer.py +++ b/backend/app/events/core/producer.py @@ -4,8 +4,7 @@ import socket from datetime import datetime, timezone -from aiokafka import AIOKafkaProducer -from aiokafka.errors import KafkaError +from faststream.kafka import KafkaBroker from app.core.metrics import EventMetrics from app.core.tracing.utils import inject_trace_context @@ -16,28 +15,25 @@ from app.infrastructure.kafka.mappings import EVENT_TYPE_TO_TOPIC from app.settings import Settings -from .types import ProducerMetrics - class UnifiedProducer: - """Fully async Kafka producer using aiokafka. + """Fully async Kafka producer backed by FastStream KafkaBroker. - Lifecycle (start/stop of AIOKafkaProducer) is managed by the DI provider. + The broker's lifecycle (start/stop) is managed externally — either by + the FastStream app (worker entry points) or by the FastAPI lifespan. """ def __init__( self, - producer: AIOKafkaProducer, + broker: KafkaBroker, schema_registry_manager: SchemaRegistryManager, logger: logging.Logger, settings: Settings, event_metrics: EventMetrics, - producer_metrics: ProducerMetrics, ): - self._producer = producer + self._broker = broker self._schema_registry = schema_registry_manager self.logger = logger - self.metrics = producer_metrics self._event_metrics = event_metrics self._topic_prefix = settings.KAFKA_TOPIC_PREFIX @@ -52,28 +48,18 @@ async def produce(self, event_to_produce: DomainEvent, key: str) -> None: "correlation_id": event_to_produce.metadata.correlation_id or "", "service": event_to_produce.metadata.service_name, }) - header_list = [(k, v.encode()) for k, v in headers.items()] - await self._producer.send_and_wait( + await self._broker.publish( + message=serialized_value, topic=topic, - value=serialized_value, key=key.encode(), - headers=header_list, + headers=headers, ) - # Update metrics on success - self.metrics.messages_sent += 1 - self.metrics.bytes_sent += len(serialized_value) - - # Record Kafka metrics self._event_metrics.record_kafka_message_produced(topic) - self.logger.debug(f"Message [{event_to_produce}] sent to topic: {topic}") - except KafkaError as e: - self.metrics.messages_failed += 1 - self.metrics.last_error = str(e) - self.metrics.last_error_time = datetime.now(timezone.utc) + except Exception as e: self._event_metrics.record_kafka_production_error(topic=topic, error_type=type(e).__name__) self.logger.error(f"Failed to produce message: {e}") raise @@ -83,12 +69,10 @@ async def send_to_dlq( ) -> None: """Send a failed event to the Dead Letter Queue.""" try: - # Get producer ID (hostname + task name) current_task = asyncio.current_task() task_name = current_task.get_name() if current_task else "main" producer_id = f"{socket.gethostname()}-{task_name}" - # Create DLQ message directly dlq_message = DLQMessage( event=original_event, original_topic=original_topic, @@ -99,7 +83,6 @@ async def send_to_dlq( producer_id=producer_id, ) - # Create DLQ event wrapper dlq_event_data = { "event": dlq_message.event.model_dump(mode="json"), "original_topic": dlq_message.original_topic, @@ -110,27 +93,21 @@ async def send_to_dlq( "status": str(dlq_message.status), } - # Serialize as JSON (DLQ uses JSON format for flexibility) serialized_value = json.dumps(dlq_event_data).encode("utf-8") - dlq_topic = 
f"{self._topic_prefix}{str(KafkaTopic.DEAD_LETTER_QUEUE)}" - # Send to DLQ topic - await self._producer.send_and_wait( + await self._broker.publish( + message=serialized_value, topic=dlq_topic, - value=serialized_value, key=original_event.event_id.encode() if original_event.event_id else None, - headers=[ - ("original_topic", original_topic.encode()), - ("error_type", type(error).__name__.encode()), - ("retry_count", str(retry_count).encode()), - ], + headers={ + "original_topic": original_topic, + "error_type": type(error).__name__, + "retry_count": str(retry_count), + }, ) - # Record metrics self._event_metrics.record_kafka_message_produced(dlq_topic) - self.metrics.messages_sent += 1 - self.logger.warning( f"Event {original_event.event_id} sent to DLQ. " f"Original topic: {original_topic}, Error: {error}, " @@ -138,8 +115,6 @@ async def send_to_dlq( ) except Exception as e: - # If we can't send to DLQ, log critically but don't crash self.logger.critical( f"Failed to send event {original_event.event_id} to DLQ: {e}. Original error: {error}", exc_info=True ) - self.metrics.messages_failed += 1 diff --git a/backend/app/events/core/types.py b/backend/app/events/core/types.py deleted file mode 100644 index baf30239..00000000 --- a/backend/app/events/core/types.py +++ /dev/null @@ -1,108 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime, timezone - -from pydantic import BaseModel, ConfigDict - -from app.core.utils import StringEnum - - -class ConsumerState(StringEnum): - """Kafka consumer state enumeration.""" - - STOPPED = "stopped" - STARTING = "starting" - RUNNING = "running" - STOPPING = "stopping" - ERROR = "error" - - -@dataclass(slots=True) -class ConsumerConfig: - """Kafka consumer configuration.""" - - bootstrap_servers: str - group_id: str - client_id: str = "integr8scode-consumer" - - # Offset management - auto_offset_reset: str = "earliest" - enable_auto_commit: bool = False - - # Session configuration - session_timeout_ms: int = 45000 - heartbeat_interval_ms: int = 10000 - max_poll_interval_ms: int = 300000 - request_timeout_ms: int = 40000 - - # Fetch configuration - max_poll_records: int = 500 - fetch_min_bytes: int = 1 - fetch_max_wait_ms: int = 500 - - -@dataclass(slots=True) -class ProducerMetrics: - """Metrics tracking for Kafka producer.""" - - # Message counters - messages_sent: int = 0 - messages_failed: int = 0 - bytes_sent: int = 0 - - # Performance metrics - queue_size: int = 0 - avg_latency_ms: float = 0.0 - - # Error tracking - last_error: str | None = None - last_error_time: datetime | None = None - - -@dataclass(slots=True) -class ConsumerMetrics: - """Metrics tracking for Kafka consumer.""" - - # Message counters - messages_consumed: int = 0 - bytes_consumed: int = 0 - - # Performance metrics - consumer_lag: int = 0 - - # Error tracking - commit_failures: int = 0 - processing_errors: int = 0 - - # Timestamps - last_message_time: datetime | None = None - last_updated: datetime | None = None - - def __post_init__(self) -> None: - """Initialize timestamps if not provided.""" - self.last_updated = self.last_updated or datetime.now(timezone.utc) - - -class ConsumerMetricsSnapshot(BaseModel): - """Snapshot of consumer metrics for status reporting.""" - - model_config = ConfigDict(from_attributes=True) - - messages_consumed: int - bytes_consumed: int - consumer_lag: int - commit_failures: int - processing_errors: int - last_message_time: datetime | None - last_updated: datetime | None - - -class ConsumerStatus(BaseModel): - """Consumer status 
information.""" - - model_config = ConfigDict(from_attributes=True) - - state: str - is_running: bool - group_id: str - client_id: str - metrics: ConsumerMetricsSnapshot diff --git a/backend/app/events/handlers.py b/backend/app/events/handlers.py new file mode 100644 index 00000000..c3d8166c --- /dev/null +++ b/backend/app/events/handlers.py @@ -0,0 +1,364 @@ +import asyncio +import json +import logging +from collections.abc import Awaitable, Callable +from datetime import datetime, timezone +from typing import Any + +from dishka.integrations.faststream import FromDishka +from faststream import AckPolicy, StreamMessage +from faststream.kafka import KafkaBroker +from opentelemetry.trace import SpanKind + +from app.core.tracing import EventAttributes +from app.core.tracing.utils import extract_trace_context, get_tracer +from app.dlq.manager import DLQManager +from app.domain.enums.events import EventType +from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId, KafkaTopic +from app.domain.events.typed import ( + CreatePodCommandEvent, + DeletePodCommandEvent, + DomainEvent, + ExecutionCancelledEvent, + ExecutionCompletedEvent, + ExecutionFailedEvent, + ExecutionRequestedEvent, + ExecutionTimeoutEvent, +) +from app.domain.idempotency import KeyStrategy +from app.events.core import UnifiedProducer +from app.events.event_store import EventStore +from app.infrastructure.kafka.mappings import EVENT_TYPE_TO_TOPIC +from app.infrastructure.kafka.topics import get_all_topics +from app.services.coordinator.coordinator import ExecutionCoordinator +from app.services.idempotency import IdempotencyManager +from app.services.k8s_worker import KubernetesWorker +from app.services.notification_service import NotificationService +from app.services.result_processor.processor import ResultProcessor +from app.services.saga import SagaOrchestrator +from app.services.sse.redis_bus import SSERedisBus +from app.settings import Settings + + +async def with_idempotency( + event: DomainEvent, + handler: Callable[..., Awaitable[None]], + idem: IdempotencyManager, + key_strategy: KeyStrategy, + ttl_seconds: int, + logger: logging.Logger, +) -> None: + """Run *handler* inside an idempotency guard (check → execute → mark).""" + result = await idem.check_and_reserve( + event=event, key_strategy=key_strategy, ttl_seconds=ttl_seconds, + ) + if result.is_duplicate: + logger.info(f"Duplicate event: {event.event_type} ({event.event_id})") + return + try: + await handler(event) + await idem.mark_completed(event=event, key_strategy=key_strategy) + except Exception as e: + await idem.mark_failed( + event=event, error=str(e), key_strategy=key_strategy, + ) + raise + + +def _topics(settings: Settings, group_id: GroupId) -> list[str]: + return [ + f"{settings.KAFKA_TOPIC_PREFIX}{t}" + for t in CONSUMER_GROUP_SUBSCRIPTIONS[group_id] + ] + + +def register_coordinator_subscriber(broker: KafkaBroker, settings: Settings) -> None: + sub = broker.subscriber( + *_topics(settings, GroupId.EXECUTION_COORDINATOR), + group_id=GroupId.EXECUTION_COORDINATOR, + ack_policy=AckPolicy.ACK, + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_REQUESTED) + async def on_execution_requested( + body: ExecutionRequestedEvent, + coordinator: FromDishka[ExecutionCoordinator], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency( + body, coordinator.handle_execution_requested, idem, KeyStrategy.EVENT_BASED, 7200, logger, + ) + + @sub(filter=lambda msg: 
msg.headers["event_type"] == EventType.EXECUTION_COMPLETED) + async def on_execution_completed( + body: ExecutionCompletedEvent, + coordinator: FromDishka[ExecutionCoordinator], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency( + body, coordinator.handle_execution_completed, idem, KeyStrategy.EVENT_BASED, 7200, logger, + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_FAILED) + async def on_execution_failed( + body: ExecutionFailedEvent, + coordinator: FromDishka[ExecutionCoordinator], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency( + body, coordinator.handle_execution_failed, idem, KeyStrategy.EVENT_BASED, 7200, logger, + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_CANCELLED) + async def on_execution_cancelled( + body: ExecutionCancelledEvent, + coordinator: FromDishka[ExecutionCoordinator], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency( + body, coordinator.handle_execution_cancelled, idem, KeyStrategy.EVENT_BASED, 7200, logger, + ) + + @sub + async def on_unhandled(body: DomainEvent) -> None: + pass + + +def register_k8s_worker_subscriber(broker: KafkaBroker, settings: Settings) -> None: + sub = broker.subscriber( + *_topics(settings, GroupId.K8S_WORKER), + group_id=GroupId.K8S_WORKER, + ack_policy=AckPolicy.ACK, + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.CREATE_POD_COMMAND) + async def on_create_pod( + body: CreatePodCommandEvent, + worker: FromDishka[KubernetesWorker], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency(body, worker.handle_create_pod_command, idem, KeyStrategy.CONTENT_HASH, 3600, logger) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.DELETE_POD_COMMAND) + async def on_delete_pod( + body: DeletePodCommandEvent, + worker: FromDishka[KubernetesWorker], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency(body, worker.handle_delete_pod_command, idem, KeyStrategy.CONTENT_HASH, 3600, logger) + + @sub + async def on_unhandled(body: DomainEvent) -> None: + pass + + +def register_result_processor_subscriber(broker: KafkaBroker, settings: Settings) -> None: + sub = broker.subscriber( + *_topics(settings, GroupId.RESULT_PROCESSOR), + group_id=GroupId.RESULT_PROCESSOR, + ack_policy=AckPolicy.ACK, + max_poll_records=1, + auto_offset_reset="earliest", + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_COMPLETED) + async def on_execution_completed( + body: ExecutionCompletedEvent, + processor: FromDishka[ResultProcessor], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency(body, processor.handle_execution_completed, idem, KeyStrategy.CONTENT_HASH, 7200, logger) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_FAILED) + async def on_execution_failed( + body: ExecutionFailedEvent, + processor: FromDishka[ResultProcessor], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency(body, processor.handle_execution_failed, idem, KeyStrategy.CONTENT_HASH, 7200, logger) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_TIMEOUT) + async def 
on_execution_timeout( + body: ExecutionTimeoutEvent, + processor: FromDishka[ResultProcessor], + idem: FromDishka[IdempotencyManager], + logger: FromDishka[logging.Logger], + ) -> None: + await with_idempotency(body, processor.handle_execution_timeout, idem, KeyStrategy.CONTENT_HASH, 7200, logger) + + @sub + async def on_unhandled(body: DomainEvent) -> None: + pass + + +def register_saga_subscriber(broker: KafkaBroker, settings: Settings) -> None: + sub = broker.subscriber( + *_topics(settings, GroupId.SAGA_ORCHESTRATOR), + group_id=GroupId.SAGA_ORCHESTRATOR, + ack_policy=AckPolicy.ACK, + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_REQUESTED) + async def on_execution_requested( + body: ExecutionRequestedEvent, + orchestrator: FromDishka[SagaOrchestrator], + ) -> None: + await orchestrator.handle_execution_requested(body) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_COMPLETED) + async def on_execution_completed( + body: ExecutionCompletedEvent, + orchestrator: FromDishka[SagaOrchestrator], + ) -> None: + await orchestrator.handle_execution_completed(body) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_FAILED) + async def on_execution_failed( + body: ExecutionFailedEvent, + orchestrator: FromDishka[SagaOrchestrator], + ) -> None: + await orchestrator.handle_execution_failed(body) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_TIMEOUT) + async def on_execution_timeout( + body: ExecutionTimeoutEvent, + orchestrator: FromDishka[SagaOrchestrator], + ) -> None: + await orchestrator.handle_execution_timeout(body) + + @sub + async def on_unhandled(body: DomainEvent) -> None: + pass + + +def register_event_store_subscriber(broker: KafkaBroker, settings: Settings) -> None: + topics = [f"{settings.KAFKA_TOPIC_PREFIX}{t}" for t in get_all_topics()] + + @broker.subscriber( + *topics, + group_id="event-store-consumer", + ack_policy=AckPolicy.ACK, + max_poll_records=100, + ) + async def on_any_event( + body: DomainEvent, + event_store: FromDishka[EventStore], + producer: FromDishka[UnifiedProducer], + logger: FromDishka[logging.Logger], + ) -> None: + try: + await event_store.store_event(body) + except Exception as err: + logger.error(f"Error storing event {body.event_id}: {err}", exc_info=True) + topic = str(EVENT_TYPE_TO_TOPIC.get(body.event_type, "unknown")) + await producer.send_to_dlq(body, topic, err, 0) + + +def register_sse_subscriber(broker: KafkaBroker, settings: Settings) -> None: + @broker.subscriber( + *_topics(settings, GroupId.WEBSOCKET_GATEWAY), + group_id="sse-bridge-pool", + ack_policy=AckPolicy.ACK_FIRST, + auto_offset_reset="latest", + max_workers=settings.SSE_CONSUMER_POOL_SIZE, + ) + async def on_sse_event( + body: DomainEvent, + sse_bus: FromDishka[SSERedisBus], + ) -> None: + if body.event_type in SSERedisBus.SSE_ROUTED_EVENTS: + await sse_bus.route_domain_event(body) + + +def register_notification_subscriber(broker: KafkaBroker, settings: Settings) -> None: + sub = broker.subscriber( + *_topics(settings, GroupId.NOTIFICATION_SERVICE), + group_id=GroupId.NOTIFICATION_SERVICE, + ack_policy=AckPolicy.ACK, + max_poll_records=10, + auto_offset_reset="latest", + ) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_COMPLETED) + async def on_execution_completed( + body: ExecutionCompletedEvent, + service: FromDishka[NotificationService], + ) -> None: + await service.handle_execution_completed(body) + + @sub(filter=lambda msg: 
msg.headers["event_type"] == EventType.EXECUTION_FAILED) + async def on_execution_failed( + body: ExecutionFailedEvent, + service: FromDishka[NotificationService], + ) -> None: + await service.handle_execution_failed(body) + + @sub(filter=lambda msg: msg.headers["event_type"] == EventType.EXECUTION_TIMEOUT) + async def on_execution_timeout( + body: ExecutionTimeoutEvent, + service: FromDishka[NotificationService], + ) -> None: + await service.handle_execution_timeout(body) + + @sub + async def on_unhandled(body: DomainEvent) -> None: + pass + + +def register_dlq_subscriber(broker: KafkaBroker, settings: Settings) -> None: + """Register a DLQ subscriber that consumes dead-letter messages. + + DLQ messages are plain JSON (not Avro), so a custom decoder is used + to bypass the broker-level Avro decoder. + """ + topic_name = f"{settings.KAFKA_TOPIC_PREFIX}{KafkaTopic.DEAD_LETTER_QUEUE}" + + async def dlq_json_decoder(msg: StreamMessage[Any]) -> dict[str, Any]: + return json.loads(msg.body) # type: ignore[no-any-return] + + @broker.subscriber( + topic_name, + group_id=GroupId.DLQ_MANAGER, + ack_policy=AckPolicy.ACK, + auto_offset_reset="earliest", + decoder=dlq_json_decoder, + ) + async def on_dlq_message( + body: dict[str, Any], + msg: StreamMessage[Any], + manager: FromDishka[DLQManager], + logger: FromDishka[logging.Logger], + ) -> None: + start = asyncio.get_running_loop().time() + raw = msg.raw_message + headers = {k: v.decode() for k, v in (raw.headers or [])} + dlq_msg = manager.parse_dlq_body(body, raw.offset, raw.partition, headers) + + ctx = extract_trace_context(dlq_msg.headers) + with get_tracer().start_as_current_span( + name="dlq.consume", + context=ctx, + kind=SpanKind.CONSUMER, + attributes={ + EventAttributes.KAFKA_TOPIC: str(manager.dlq_topic), + EventAttributes.EVENT_TYPE: dlq_msg.event.event_type, + EventAttributes.EVENT_ID: dlq_msg.event.event_id, + }, + ): + await manager.handle_message(dlq_msg) + + manager.metrics.record_dlq_message_received(dlq_msg.original_topic, dlq_msg.event.event_type) + manager.metrics.record_dlq_message_age( + (datetime.now(timezone.utc) - dlq_msg.failed_at).total_seconds() + ) + manager.metrics.record_dlq_processing_duration( + asyncio.get_running_loop().time() - start, "process" + ) diff --git a/backend/app/events/schema/schema_registry.py b/backend/app/events/schema/schema_registry.py index 6e4337a4..ece3a679 100644 --- a/backend/app/events/schema/schema_registry.py +++ b/backend/app/events/schema/schema_registry.py @@ -117,7 +117,3 @@ async def initialize_schemas(self) -> None: await self.set_compatibility(subject, "FORWARD") await self.register_schema(subject, event_class) self.logger.info(f"Initialized {len(_get_all_event_classes())} event schemas") - - -async def initialize_event_schemas(registry: SchemaRegistryManager) -> None: - await registry.initialize_schemas() diff --git a/backend/app/main.py b/backend/app/main.py index 914d31b0..f308fece 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,5 +1,6 @@ import uvicorn -from dishka.integrations.fastapi import setup_dishka +from dishka.integrations.fastapi import setup_dishka as setup_dishka_fastapi +from dishka.integrations.faststream import setup_dishka as setup_dishka_faststream from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware @@ -39,6 +40,13 @@ RequestSizeLimitMiddleware, setup_metrics, ) +from app.events.broker import create_broker +from app.events.handlers import ( + register_event_store_subscriber, + register_notification_subscriber, + 
register_sse_subscriber, +) +from app.events.schema.schema_registry import SchemaRegistryManager from app.settings import Settings @@ -53,8 +61,12 @@ def create_app(settings: Settings | None = None) -> FastAPI: settings = settings or Settings() logger = setup_logger(settings.LOG_LEVEL) - # Note: Metrics are now provided via DI (MetricsProvider) and injected into services. - # No manual MetricsContext initialization is needed. + # Create Kafka broker and register in-app subscribers + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + register_event_store_subscriber(broker, settings) + register_sse_subscriber(broker, settings) + register_notification_subscriber(broker, settings) # Disable OpenAPI/Docs in production for security; health endpoints provide readiness app = FastAPI( @@ -65,8 +77,12 @@ def create_app(settings: Settings | None = None) -> FastAPI: redoc_url=None, ) - container = create_app_container(settings) - setup_dishka(container, app) + # Store broker on app state for lifespan access + app.state.kafka_broker = broker + + container = create_app_container(settings, broker) + setup_dishka_fastapi(container, app) + setup_dishka_faststream(container, broker=broker, auto_inject=True) setup_metrics(settings, logger) app.add_middleware(MetricsMiddleware) @@ -121,8 +137,6 @@ def create_app(settings: Settings | None = None) -> FastAPI: app.include_router(saga.router, prefix=settings.API_V1_STR) app.include_router(grafana_alerts.router, prefix=settings.API_V1_STR) - # No additional testing-only routes here - logger.info("All routers configured") configure_exception_handlers(app) diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py index bd7bf4fb..d0bcffbc 100644 --- a/backend/app/services/coordinator/coordinator.py +++ b/backend/app/services/coordinator/coordinator.py @@ -7,12 +7,10 @@ from app.core.metrics import CoordinatorMetrics from app.db.repositories.execution_repository import ExecutionRepository -from app.domain.enums.events import EventType from app.domain.enums.execution import QueuePriority from app.domain.enums.storage import ExecutionErrorType from app.domain.events.typed import ( CreatePodCommandEvent, - DomainEvent, EventMetadata, ExecutionAcceptedEvent, ExecutionCancelledEvent, @@ -20,7 +18,7 @@ ExecutionFailedEvent, ExecutionRequestedEvent, ) -from app.events.core import EventDispatcher, UnifiedProducer +from app.events.core import UnifiedProducer class QueueRejectError(Exception): @@ -44,7 +42,6 @@ class ExecutionCoordinator: def __init__( self, producer: UnifiedProducer, - dispatcher: EventDispatcher, execution_repository: ExecutionRepository, logger: logging.Logger, coordinator_metrics: CoordinatorMetrics, @@ -72,33 +69,9 @@ def __init__( # Scheduling state self._active_executions: set[str] = set() - self._register_handlers(dispatcher) - - def _register_handlers(self, dispatcher: EventDispatcher) -> None: - dispatcher.register_handler(EventType.EXECUTION_REQUESTED, self._handle_requested_wrapper) - dispatcher.register_handler(EventType.EXECUTION_COMPLETED, self._handle_completed_wrapper) - dispatcher.register_handler(EventType.EXECUTION_FAILED, self._handle_failed_wrapper) - dispatcher.register_handler(EventType.EXECUTION_CANCELLED, self._handle_cancelled_wrapper) - - async def _handle_requested_wrapper(self, event: DomainEvent) -> None: - assert isinstance(event, ExecutionRequestedEvent) - await self._handle_execution_requested(event) - - 
async def _handle_completed_wrapper(self, event: DomainEvent) -> None: - assert isinstance(event, ExecutionCompletedEvent) - await self._handle_execution_completed(event) - - async def _handle_failed_wrapper(self, event: DomainEvent) -> None: - assert isinstance(event, ExecutionFailedEvent) - await self._handle_execution_failed(event) - - async def _handle_cancelled_wrapper(self, event: DomainEvent) -> None: - assert isinstance(event, ExecutionCancelledEvent) - await self._handle_execution_cancelled(event) - - async def _handle_execution_requested(self, event: ExecutionRequestedEvent) -> None: + async def handle_execution_requested(self, event: ExecutionRequestedEvent) -> None: """Handle execution requested event - add to queue for processing.""" - self.logger.info(f"HANDLER CALLED: _handle_execution_requested for event {event.event_id}") + self.logger.info(f"HANDLER CALLED: handle_execution_requested for event {event.event_id}") start_time = time.time() try: @@ -123,7 +96,7 @@ async def _handle_execution_requested(self, event: ExecutionRequestedEvent) -> N if position == 0: await self._try_schedule_next() - async def _handle_execution_cancelled(self, event: ExecutionCancelledEvent) -> None: + async def handle_execution_cancelled(self, event: ExecutionCancelledEvent) -> None: """Handle execution cancelled event.""" execution_id = event.execution_id @@ -137,7 +110,7 @@ async def _handle_execution_cancelled(self, event: ExecutionCancelledEvent) -> N await self._try_schedule_next() - async def _handle_execution_completed(self, event: ExecutionCompletedEvent) -> None: + async def handle_execution_completed(self, event: ExecutionCompletedEvent) -> None: """Handle execution completed event.""" execution_id = event.execution_id @@ -148,7 +121,7 @@ async def _handle_execution_completed(self, event: ExecutionCompletedEvent) -> N self.logger.info(f"Execution {execution_id} completed") await self._try_schedule_next() - async def _handle_execution_failed(self, event: ExecutionFailedEvent) -> None: + async def handle_execution_failed(self, event: ExecutionFailedEvent) -> None: """Handle execution failed event.""" execution_id = event.execution_id diff --git a/backend/app/services/idempotency/__init__.py b/backend/app/services/idempotency/__init__.py index 484465db..f429827d 100644 --- a/backend/app/services/idempotency/__init__.py +++ b/backend/app/services/idempotency/__init__.py @@ -4,13 +4,10 @@ IdempotencyManager, IdempotencyResult, ) -from app.services.idempotency.middleware import IdempotentEventDispatcher, IdempotentEventHandler __all__ = [ "IdempotencyConfig", "IdempotencyManager", "IdempotencyResult", "IdempotencyStatus", - "IdempotentEventDispatcher", - "IdempotentEventHandler", ] diff --git a/backend/app/services/idempotency/middleware.py b/backend/app/services/idempotency/middleware.py deleted file mode 100644 index 13b8b59a..00000000 --- a/backend/app/services/idempotency/middleware.py +++ /dev/null @@ -1,88 +0,0 @@ -import logging -from collections.abc import Awaitable, Callable - -from app.domain.events.typed import DomainEvent -from app.domain.idempotency import KeyStrategy -from app.events.core.dispatcher import EventDispatcher, EventHandler -from app.services.idempotency.idempotency_manager import IdempotencyManager - - -class IdempotentEventHandler: - """Wraps a single event handler with idempotency check-and-reserve logic.""" - - def __init__( - self, - handler: Callable[[DomainEvent], Awaitable[None]], - idempotency_manager: IdempotencyManager, - logger: logging.Logger, - 
key_strategy: KeyStrategy = KeyStrategy.EVENT_BASED, - fields: set[str] | None = None, - ttl_seconds: int | None = None, - ): - self.handler = handler - self.idempotency_manager = idempotency_manager - self.logger = logger - self.key_strategy = key_strategy - self.fields = fields - self.ttl_seconds = ttl_seconds - self.__name__ = handler.__name__ - - async def __call__(self, event: DomainEvent) -> None: - """Process event with idempotency check.""" - self.logger.info( - f"IdempotentEventHandler called for event {event.event_type}, " - f"id={event.event_id}, handler={self.__name__}" - ) - - idempotency_result = await self.idempotency_manager.check_and_reserve( - event=event, - key_strategy=self.key_strategy, - ttl_seconds=self.ttl_seconds, - fields=self.fields, - ) - - if idempotency_result.is_duplicate: - self.logger.info( - f"Duplicate event detected: {event.event_type} ({event.event_id}), status: {idempotency_result.status}" - ) - return - - try: - await self.handler(event) - await self.idempotency_manager.mark_completed( - event=event, key_strategy=self.key_strategy, fields=self.fields - ) - except Exception as e: - await self.idempotency_manager.mark_failed( - event=event, error=str(e), key_strategy=self.key_strategy, fields=self.fields - ) - raise - - -class IdempotentEventDispatcher(EventDispatcher): - """EventDispatcher that automatically wraps every handler with idempotency. - - Drop-in replacement for ``EventDispatcher`` — DI providers create this - subclass for services that need idempotent event handling. - """ - - def __init__( - self, - logger: logging.Logger, - idempotency_manager: IdempotencyManager, - key_strategy: KeyStrategy = KeyStrategy.EVENT_BASED, - ttl_seconds: int = 3600, - ) -> None: - super().__init__(logger=logger) - self._idempotency_manager = idempotency_manager - self._key_strategy = key_strategy - self._ttl_seconds = ttl_seconds - - def _wrap_handler(self, handler: EventHandler) -> EventHandler: - return IdempotentEventHandler( - handler=handler, - idempotency_manager=self._idempotency_manager, - logger=self.logger, - key_strategy=self._key_strategy, - ttl_seconds=self._ttl_seconds, - ) diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index f90c0eec..7cf199d2 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -8,17 +8,15 @@ from kubernetes_asyncio.client.rest import ApiException from app.core.metrics import EventMetrics, ExecutionMetrics, KubernetesMetrics -from app.domain.enums.events import EventType from app.domain.enums.storage import ExecutionErrorType from app.domain.events.typed import ( CreatePodCommandEvent, DeletePodCommandEvent, - DomainEvent, ExecutionFailedEvent, ExecutionStartedEvent, PodCreatedEvent, ) -from app.events.core import EventDispatcher, UnifiedProducer +from app.events.core import UnifiedProducer from app.runtime_registry import RUNTIME_REGISTRY from app.settings import Settings @@ -42,7 +40,6 @@ def __init__( self, api_client: k8s_client.ApiClient, producer: UnifiedProducer, - dispatcher: EventDispatcher, settings: Settings, logger: logging.Logger, event_metrics: EventMetrics, @@ -67,11 +64,6 @@ def __init__( # Components self.pod_builder = PodBuilder(settings=settings) self.producer = producer - self._dispatcher = dispatcher - - # Register handlers on dispatcher - dispatcher.register_handler(EventType.CREATE_POD_COMMAND, self._handle_create_pod_command_wrapper) - dispatcher.register_handler(EventType.DELETE_POD_COMMAND, 
self._handle_delete_pod_command_wrapper) # State tracking self._active_creations: set[str] = set() @@ -79,19 +71,7 @@ def __init__( self.logger.info(f"KubernetesWorker initialized for namespace {self._settings.K8S_NAMESPACE}") - async def _handle_create_pod_command_wrapper(self, event: DomainEvent) -> None: - """Wrapper for handling CreatePodCommandEvent with type safety.""" - assert isinstance(event, CreatePodCommandEvent) - self.logger.info(f"Processing create_pod_command for execution {event.execution_id} from saga {event.saga_id}") - await self._handle_create_pod_command(event) - - async def _handle_delete_pod_command_wrapper(self, event: DomainEvent) -> None: - """Wrapper for handling DeletePodCommandEvent.""" - assert isinstance(event, DeletePodCommandEvent) - self.logger.info(f"Processing delete_pod_command for execution {event.execution_id} from saga {event.saga_id}") - await self._handle_delete_pod_command(event) - - async def _handle_create_pod_command(self, command: CreatePodCommandEvent) -> None: + async def handle_create_pod_command(self, command: CreatePodCommandEvent) -> None: """Handle create pod command from saga orchestrator""" execution_id = command.execution_id @@ -100,10 +80,9 @@ async def _handle_create_pod_command(self, command: CreatePodCommandEvent) -> No self.logger.warning(f"Already creating pod for execution {execution_id}") return - # Create pod asynchronously - asyncio.create_task(self._create_pod_for_execution(command)) + await self._create_pod_for_execution(command) - async def _handle_delete_pod_command(self, command: DeletePodCommandEvent) -> None: + async def handle_delete_pod_command(self, command: DeletePodCommandEvent) -> None: """Handle delete pod command from saga orchestrator (compensation)""" execution_id = command.execution_id self.logger.info(f"Deleting pod for execution {execution_id} due to: {command.reason}") @@ -257,16 +236,6 @@ async def _publish_pod_creation_failed(self, command: CreatePodCommandEvent, err ) await self.producer.produce(event_to_produce=event, key=command.execution_id) - async def get_status(self) -> dict[str, Any]: - """Get worker status""" - return { - "active_creations": len(self._active_creations), - "config": { - "namespace": self._settings.K8S_NAMESPACE, - "max_concurrent_pods": self._settings.K8S_MAX_CONCURRENT_PODS, - }, - } - async def wait_for_active_creations(self, timeout: float = 30.0) -> None: """Wait for active pod creations to complete (for graceful shutdown).""" if not self._active_creations: diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 1ace24bb..e0aa7e6a 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -16,7 +16,6 @@ ) from app.domain.enums.user import UserRole from app.domain.events.typed import ( - DomainEvent, ExecutionCompletedEvent, ExecutionFailedEvent, ExecutionTimeoutEvent, @@ -153,7 +152,7 @@ async def create_notification( ) # Check throttling - if await self._throttle_cache.check_throttle( + if not self.settings.TESTING and await self._throttle_cache.check_throttle( user_id, severity, window_hours=self.settings.NOTIF_THROTTLE_WINDOW_HOURS, @@ -410,8 +409,8 @@ def _get_slack_color(self, priority: NotificationSeverity) -> str: NotificationSeverity.URGENT: "#990000", # Dark Red }.get(priority, "#808080") # Default gray - async def _handle_execution_timeout_typed(self, event: ExecutionTimeoutEvent) -> None: - """Handle typed execution timeout event.""" + async def 
handle_execution_timeout(self, event: ExecutionTimeoutEvent) -> None: + """Handle execution timeout event.""" user_id = event.metadata.user_id if not user_id: self.logger.error("No user_id in event metadata") @@ -430,8 +429,8 @@ async def _handle_execution_timeout_typed(self, event: ExecutionTimeoutEvent) -> ), ) - async def _handle_execution_completed_typed(self, event: ExecutionCompletedEvent) -> None: - """Handle typed execution completed event.""" + async def handle_execution_completed(self, event: ExecutionCompletedEvent) -> None: + """Handle execution completed event.""" user_id = event.metadata.user_id if not user_id: self.logger.error("No user_id in event metadata") @@ -451,22 +450,8 @@ async def _handle_execution_completed_typed(self, event: ExecutionCompletedEvent ), ) - async def _handle_execution_event(self, event: DomainEvent) -> None: - """Unified handler for execution result events.""" - try: - if isinstance(event, ExecutionCompletedEvent): - await self._handle_execution_completed_typed(event) - elif isinstance(event, ExecutionFailedEvent): - await self._handle_execution_failed_typed(event) - elif isinstance(event, ExecutionTimeoutEvent): - await self._handle_execution_timeout_typed(event) - else: - self.logger.warning(f"Unhandled execution event type: {event.event_type}") - except Exception as e: - self.logger.error(f"Error handling execution event: {e}", exc_info=True) - - async def _handle_execution_failed_typed(self, event: ExecutionFailedEvent) -> None: - """Handle typed execution failed event.""" + async def handle_execution_failed(self, event: ExecutionFailedEvent) -> None: + """Handle execution failed event.""" user_id = event.metadata.user_id if not user_id: self.logger.error("No user_id in event metadata") diff --git a/backend/app/services/pod_monitor/config.py b/backend/app/services/pod_monitor/config.py index 44159037..f862f016 100644 --- a/backend/app/services/pod_monitor/config.py +++ b/backend/app/services/pod_monitor/config.py @@ -25,16 +25,10 @@ class PodMonitorConfig: label_selector: str = "app=integr8s,component=executor" field_selector: str | None = None watch_timeout_seconds: int = 300 # 5 minutes - watch_reconnect_delay: int = 5 - max_reconnect_attempts: int = 10 # Monitoring settings enable_metrics: bool = True metrics_port: int = 9091 - # State reconciliation - reconcile_interval_seconds: int = 300 # 5 minutes - enable_state_reconciliation: bool = True - # Event filtering ignored_pod_phases: list[PodPhase] = field(default_factory=list) diff --git a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py index b1017914..80ead1e8 100644 --- a/backend/app/services/pod_monitor/monitor.py +++ b/backend/app/services/pod_monitor/monitor.py @@ -1,4 +1,3 @@ -import asyncio import logging import time from dataclasses import dataclass @@ -7,7 +6,6 @@ from kubernetes_asyncio import client as k8s_client from kubernetes_asyncio import watch as k8s_watch -from kubernetes_asyncio.client.rest import ApiException from app.core.metrics import KubernetesMetrics from app.core.utils import StringEnum @@ -17,15 +15,8 @@ from app.services.pod_monitor.event_mapper import PodEventMapper # Type aliases -type PodName = str type ResourceVersion = str -type EventType = str type KubeEvent = dict[str, Any] -type StatusDict = dict[str, Any] - -# Constants -MAX_BACKOFF_SECONDS: int = 300 # 5 minutes -RECONCILIATION_LOG_INTERVAL: int = 60 # 1 minute class WatchEventType(StringEnum): @@ -36,15 +27,6 @@ class WatchEventType(StringEnum): DELETED = 
"DELETED" -class MonitorState(StringEnum): - """Pod monitor states.""" - - IDLE = auto() - RUNNING = auto() - STOPPING = auto() - STOPPED = auto() - - class ErrorType(StringEnum): """Error types for metrics.""" @@ -54,17 +36,6 @@ class ErrorType(StringEnum): PROCESSING_ERROR = auto() -@dataclass(frozen=True, slots=True) -class WatchContext: - """Immutable context for watch operations.""" - - namespace: str - label_selector: str - field_selector: str | None - timeout_seconds: int - resource_version: ResourceVersion | None - - @dataclass(frozen=True, slots=True) class PodEvent: """Immutable pod event data.""" @@ -74,26 +45,17 @@ class PodEvent: resource_version: ResourceVersion | None -@dataclass(frozen=True, slots=True) -class ReconciliationResult: - """Result of state reconciliation.""" - - missing_pods: set[PodName] - extra_pods: set[PodName] - duration_seconds: float - success: bool - error: str | None = None - - class PodMonitor: """ Monitors Kubernetes pods and publishes lifecycle events. - This service watches pods with specific labels using the K8s watch API, - maps Kubernetes events to application events, and publishes them to Kafka. - Events are stored in the events collection AND published to Kafka via KafkaEventService. + Uses the list-then-watch pattern: + - On first call (or after 410 Gone), LISTs all pods to get current state + and a resource_version cursor. + - Subsequent calls WATCH from that cursor, receiving only new changes. - Lifecycle is managed by DI provider - call start() to begin monitoring, stop() to end. + Stateless service — all lifecycle (watch scheduling, error handling, + shutdown) is managed by APScheduler in the DI provider. """ def __init__( @@ -105,7 +67,6 @@ def __init__( event_mapper: PodEventMapper, kubernetes_metrics: KubernetesMetrics, ) -> None: - """Initialize the pod monitor with all required dependencies.""" self.logger = logger self.config = config @@ -113,135 +74,82 @@ def __init__( self._v1 = k8s_client.CoreV1Api(api_client) self._watch = k8s_watch.Watch() - # Components (required, no nullability) + # Components self._event_mapper = event_mapper self._kafka_event_service = kafka_event_service - # State - self._state = MonitorState.IDLE - self._tracked_pods: set[PodName] = set() - self._reconnect_attempts: int = 0 + # Watch cursor — set from LIST on first run or after 410 Gone self._last_resource_version: ResourceVersion | None = None - # Tasks - self._watch_task: asyncio.Task[None] | None = None - self._reconcile_task: asyncio.Task[None] | None = None - # Metrics self._metrics = kubernetes_metrics self.logger.info(f"PodMonitor initialized for namespace {config.namespace}") - @property - def state(self) -> MonitorState: - """Get current monitor state.""" - return self._state - - async def start(self) -> None: - """Start the pod monitor.""" - self.logger.info("Starting PodMonitor service...") - - # Start monitoring - self._state = MonitorState.RUNNING - self._watch_task = asyncio.create_task(self._watch_pods()) - - # Start reconciliation if enabled - if self.config.enable_state_reconciliation: - self._reconcile_task = asyncio.create_task(self._reconciliation_loop()) - - self.logger.info("PodMonitor service started successfully") - - async def stop(self) -> None: - """Stop the pod monitor.""" - self.logger.info("Stopping PodMonitor service...") - self._state = MonitorState.STOPPING - - # Cancel tasks - tasks = [t for t in [self._watch_task, self._reconcile_task] if t] - for task in tasks: - task.cancel() - - # Wait for cancellation - if tasks: - 
await asyncio.gather(*tasks, return_exceptions=True) - - # Close watch - if self._watch: - self._watch.stop() - await self._watch.close() - - # Clear state - self._tracked_pods.clear() - self._event_mapper.clear_cache() - - self._state = MonitorState.STOPPED - self.logger.info("PodMonitor service stopped") - - async def _watch_pods(self) -> None: - """Main watch loop for pods.""" - while self._state == MonitorState.RUNNING: - try: - self._reconnect_attempts = 0 - await self._watch_pod_events() - - except ApiException as e: - match e.status: - case 410: # Gone - resource version too old - self.logger.warning("Resource version expired, resetting watch") - self._last_resource_version = None - self._metrics.record_pod_monitor_watch_error(ErrorType.RESOURCE_VERSION_EXPIRED) - case _: - self.logger.error(f"API error in watch: {e}") - self._metrics.record_pod_monitor_watch_error(ErrorType.API_ERROR) - - await self._handle_watch_error() - - except Exception as e: - self.logger.error(f"Unexpected error in watch: {e}", exc_info=True) - self._metrics.record_pod_monitor_watch_error(ErrorType.UNEXPECTED) - await self._handle_watch_error() - - async def _watch_pod_events(self) -> None: - """Watch for pod events.""" - context = WatchContext( - namespace=self.config.namespace, - label_selector=self.config.label_selector, - field_selector=self.config.field_selector, - timeout_seconds=self.config.watch_timeout_seconds, - resource_version=self._last_resource_version, - ) + async def watch_pod_events(self) -> None: + """Run a single bounded K8s watch stream using list-then-watch. - self.logger.info(f"Starting pod watch with selector: {context.label_selector}, namespace: {context.namespace}") + On first call (or when _last_resource_version is None), performs a LIST + to get current state and a resource_version cursor. Then watches from + that cursor. + + Returns normally when the server-side timeout expires. + Raises on K8s API errors — the caller (provider) handles retries. 
+ """ + if not self._last_resource_version: + await self._list_and_process_existing_pods() + + self.logger.info( + f"Starting pod watch from rv={self._last_resource_version}, " + f"selector: {self.config.label_selector}" + ) - # Create watch stream kwargs kwargs: dict[str, Any] = { - "namespace": context.namespace, - "label_selector": context.label_selector, - "timeout_seconds": context.timeout_seconds, + "namespace": self.config.namespace, + "label_selector": self.config.label_selector, + "timeout_seconds": self.config.watch_timeout_seconds, + "resource_version": self._last_resource_version, } - if context.field_selector: - kwargs["field_selector"] = context.field_selector + if self.config.field_selector: + kwargs["field_selector"] = self.config.field_selector - if context.resource_version: - kwargs["resource_version"] = context.resource_version - - # Watch stream - kubernetes_asyncio Watch is an async iterator async for event in self._watch.stream(self._v1.list_namespaced_pod, **kwargs): - if self._state != MonitorState.RUNNING: - self._watch.stop() - break - await self._process_raw_event(event) # Store resource version from watch for next iteration if self._watch.resource_version: self._last_resource_version = self._watch.resource_version + async def _list_and_process_existing_pods(self) -> None: + """LIST all matching pods to bootstrap state and get a resource_version cursor.""" + kwargs: dict[str, Any] = { + "namespace": self.config.namespace, + "label_selector": self.config.label_selector, + } + if self.config.field_selector: + kwargs["field_selector"] = self.config.field_selector + + pod_list = await self._v1.list_namespaced_pod(**kwargs) + + for pod in pod_list.items: + event = PodEvent( + event_type=WatchEventType.ADDED, + pod=pod, + resource_version=pod.metadata.resource_version if pod.metadata else None, + ) + await self._process_pod_event(event) + + # Use the list's resource_version as the authoritative cursor + self._last_resource_version = pod_list.metadata.resource_version + + self.logger.info( + f"Listed {len(pod_list.items)} existing pods, rv={self._last_resource_version}" + ) + async def _process_raw_event(self, raw_event: KubeEvent) -> None: """Process a raw Kubernetes watch event.""" try: - # Parse event event = PodEvent( event_type=WatchEventType(raw_event["type"].upper()), pod=raw_event["object"], @@ -261,7 +169,7 @@ async def _process_pod_event(self, event: PodEvent) -> None: start_time = time.time() try: - # Update resource version + # Update resource version for crash recovery if event.resource_version: self._last_resource_version = event.resource_version @@ -270,16 +178,7 @@ async def _process_pod_event(self, event: PodEvent) -> None: if pod_phase in self.config.ignored_pod_phases: return - # Update tracked pods pod_name = event.pod.metadata.name - match event.event_type: - case WatchEventType.ADDED | WatchEventType.MODIFIED: - self._tracked_pods.add(pod_name) - case WatchEventType.DELETED: - self._tracked_pods.discard(pod_name) - - # Update metrics - self._metrics.update_pod_monitor_pods_watched(len(self._tracked_pods)) # Map to application events app_events = await self._event_mapper.map_pod_event(event.pod, event.event_type) @@ -288,7 +187,6 @@ async def _process_pod_event(self, event: PodEvent) -> None: for app_event in app_events: await self._publish_event(app_event, event.pod) - # Log event if app_events: self.logger.info( f"Processed {event.event_type} event for pod {pod_name} " @@ -296,7 +194,6 @@ async def _process_pod_event(self, event: PodEvent) -> 
None: f"published {len(app_events)} events" ) - # Update metrics duration = time.time() - start_time self._metrics.record_pod_monitor_event_processing_duration(duration, event.event_type) @@ -307,7 +204,6 @@ async def _process_pod_event(self, event: PodEvent) -> None: async def _publish_event(self, event: DomainEvent, pod: k8s_client.V1Pod) -> None: """Publish event to Kafka and store in events collection.""" try: - # Add correlation ID from pod labels if pod.metadata and pod.metadata.labels: event.metadata.correlation_id = pod.metadata.labels.get("execution-id") or "" @@ -321,118 +217,3 @@ async def _publish_event(self, event: DomainEvent, pod: k8s_client.V1Pod) -> Non except Exception as e: self.logger.error(f"Error publishing event: {e}", exc_info=True) - - async def _handle_watch_error(self) -> None: - """Handle watch errors with exponential backoff.""" - self._reconnect_attempts += 1 - - if self._reconnect_attempts > self.config.max_reconnect_attempts: - self.logger.error( - f"Max reconnect attempts ({self.config.max_reconnect_attempts}) exceeded, stopping pod monitor" - ) - self._state = MonitorState.STOPPING - return - - # Calculate exponential backoff - backoff = min(self.config.watch_reconnect_delay * (2 ** (self._reconnect_attempts - 1)), MAX_BACKOFF_SECONDS) - - self.logger.info( - f"Reconnecting watch in {backoff}s " - f"(attempt {self._reconnect_attempts}/{self.config.max_reconnect_attempts})" - ) - - self._metrics.increment_pod_monitor_watch_reconnects() - await asyncio.sleep(backoff) - - async def _reconciliation_loop(self) -> None: - """Periodically reconcile state with Kubernetes.""" - while self._state == MonitorState.RUNNING: - try: - await asyncio.sleep(self.config.reconcile_interval_seconds) - - if self._state == MonitorState.RUNNING: - result = await self._reconcile_state() - self._log_reconciliation_result(result) - - except Exception as e: - self.logger.error(f"Error in reconciliation loop: {e}", exc_info=True) - - async def _reconcile_state(self) -> ReconciliationResult: - """Reconcile tracked pods with actual state.""" - start_time = time.time() - - try: - self.logger.info("Starting pod state reconciliation") - - # List all pods matching selector - pods = await self._v1.list_namespaced_pod( - namespace=self.config.namespace, label_selector=self.config.label_selector - ) - - # Get current pod names - current_pods = {pod.metadata.name for pod in pods.items} - - # Find differences - missing_pods = current_pods - self._tracked_pods - extra_pods = self._tracked_pods - current_pods - - # Process missing pods - for pod in pods.items: - if pod.metadata.name in missing_pods: - self.logger.info(f"Reconciling missing pod: {pod.metadata.name}") - event = PodEvent( - event_type=WatchEventType.ADDED, pod=pod, resource_version=pod.metadata.resource_version - ) - await self._process_pod_event(event) - - # Remove extra pods - for pod_name in extra_pods: - self.logger.info(f"Removing stale pod from tracking: {pod_name}") - self._tracked_pods.discard(pod_name) - - # Update metrics - self._metrics.update_pod_monitor_pods_watched(len(self._tracked_pods)) - self._metrics.record_pod_monitor_reconciliation_run("success") - - duration = time.time() - start_time - - return ReconciliationResult( - missing_pods=missing_pods, extra_pods=extra_pods, duration_seconds=duration, success=True - ) - - except Exception as e: - self.logger.error(f"Failed to reconcile state: {e}", exc_info=True) - self._metrics.record_pod_monitor_reconciliation_run("failed") - - return ReconciliationResult( - 
missing_pods=set(), - extra_pods=set(), - duration_seconds=time.time() - start_time, - success=False, - error=str(e), - ) - - def _log_reconciliation_result(self, result: ReconciliationResult) -> None: - """Log reconciliation result.""" - if result.success: - self.logger.info( - f"Reconciliation completed in {result.duration_seconds:.2f}s. " - f"Found {len(result.missing_pods)} missing, " - f"{len(result.extra_pods)} extra pods" - ) - else: - self.logger.error(f"Reconciliation failed after {result.duration_seconds:.2f}s: {result.error}") - - async def get_status(self) -> StatusDict: - """Get monitor status.""" - return { - "state": self._state, - "tracked_pods": len(self._tracked_pods), - "reconnect_attempts": self._reconnect_attempts, - "last_resource_version": self._last_resource_version, - "config": { - "namespace": self.config.namespace, - "label_selector": self.config.label_selector, - "enable_reconciliation": self.config.enable_state_reconciliation, - }, - } diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index 04849d96..702c5d1d 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -1,4 +1,3 @@ -import asyncio import logging from datetime import UTC, datetime, timedelta from uuid import uuid4 @@ -114,7 +113,7 @@ async def _start_saga(self, trigger_event: ExecutionRequestedEvent) -> str: saga = self._create_saga_instance() context = SagaContext(instance.saga_id, execution_id) - asyncio.create_task(self._execute_saga(saga, instance, context, trigger_event)) + await self._execute_saga(saga, instance, context, trigger_event) return instance.saga_id diff --git a/backend/pyproject.toml b/backend/pyproject.toml index f0396c26..ebc70c1b 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -23,12 +23,12 @@ dependencies = [ "charset-normalizer==3.4.0", "click==8.1.7", "ConfigArgParse==1.7.1", - "aiokafka==0.13.0", + "aiokafka==0.12.0", "python-schema-registry-client==2.6.1", "contourpy==1.3.3", "cycler==0.12.1", "Deprecated==1.2.14", - "dishka==1.6.0", + "dishka==1.7.2", "dnspython==2.7.0", "durationpy==0.9", "email-validator==2.3.0", @@ -125,6 +125,7 @@ dependencies = [ "monggregate==0.22.1", "aiofiles==25.1.0", "APScheduler==3.10.4", + "faststream[kafka]==0.6.6", ] [build-system] diff --git a/backend/tests/e2e/dlq/test_dlq_manager.py b/backend/tests/e2e/dlq/test_dlq_manager.py index d7c9b81c..a37d81f0 100644 --- a/backend/tests/e2e/dlq/test_dlq_manager.py +++ b/backend/tests/e2e/dlq/test_dlq_manager.py @@ -4,7 +4,8 @@ from datetime import datetime, timezone import pytest -from aiokafka import AIOKafkaConsumer, AIOKafkaProducer +from aiokafka import AIOKafkaConsumer +from faststream.kafka import KafkaBroker from app.core.metrics import DLQMetrics from app.db.repositories.dlq_repository import DLQRepository from app.dlq.manager import DLQManager @@ -63,21 +64,18 @@ async def consume_dlq_events() -> None: except Exception as e: _test_logger.debug(f"Error deserializing DLQ event: {e}") - # Create and start a producer for the manager (lifecycle managed by test) - producer = AIOKafkaProducer( - bootstrap_servers=test_settings.KAFKA_BOOTSTRAP_SERVERS, - client_id="test-dlq-producer", - acks="all", - enable_idempotence=True, + # Create and start a broker for the manager (lifecycle managed by test) + broker = KafkaBroker( + test_settings.KAFKA_BOOTSTRAP_SERVERS, ) - await asyncio.gather(producer.start(), events_consumer.start()) + await 
asyncio.gather(broker.start(), events_consumer.start()) consume_task = asyncio.create_task(consume_dlq_events()) try: repository = DLQRepository(_test_logger) manager = DLQManager( settings=test_settings, - producer=producer, + broker=broker, schema_registry=schema_registry, logger=_test_logger, dlq_metrics=dlq_metrics, @@ -108,4 +106,4 @@ async def consume_dlq_events() -> None: await consume_task except asyncio.CancelledError: pass - await asyncio.gather(events_consumer.stop(), producer.stop()) + await asyncio.gather(events_consumer.stop(), broker.stop()) diff --git a/backend/tests/e2e/events/test_consume_roundtrip.py b/backend/tests/e2e/events/test_consume_roundtrip.py deleted file mode 100644 index 3b7d969b..00000000 --- a/backend/tests/e2e/events/test_consume_roundtrip.py +++ /dev/null @@ -1,72 +0,0 @@ -import asyncio -import logging -import uuid - -import pytest -from app.core.metrics import EventMetrics -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.domain.events.typed import DomainEvent -from app.events.core import UnifiedConsumer, UnifiedProducer -from app.events.core.dispatcher import EventDispatcher -from app.events.core.types import ConsumerConfig -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.settings import Settings -from dishka import AsyncContainer - -from tests.conftest import make_execution_requested_event - -# xdist_group: Kafka consumer creation can crash librdkafka when multiple workers -# instantiate Consumer() objects simultaneously. Serial execution prevents this. -pytestmark = [pytest.mark.e2e, pytest.mark.kafka, pytest.mark.xdist_group("kafka_consumers")] - -_test_logger = logging.getLogger("test.events.consume_roundtrip") - - -@pytest.mark.asyncio -async def test_produce_consume_roundtrip(scope: AsyncContainer) -> None: - # Ensure schemas are registered - registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) - settings: Settings = await scope.get(Settings) - event_metrics: EventMetrics = await scope.get(EventMetrics) - await initialize_event_schemas(registry) - - # Real producer from DI - producer: UnifiedProducer = await scope.get(UnifiedProducer) - - # Build a consumer that handles EXECUTION_REQUESTED - dispatcher = EventDispatcher(logger=_test_logger) - received = asyncio.Event() - - @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def _handle(_event: DomainEvent) -> None: - received.set() - - group_id = f"test-consumer.{uuid.uuid4().hex[:6]}" - config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=group_id, - enable_auto_commit=True, - auto_offset_reset="earliest", - ) - - consumer = UnifiedConsumer( - config, - dispatcher, - schema_registry=registry, - settings=settings, - logger=_test_logger, - event_metrics=event_metrics, - ) - await consumer.start([KafkaTopic.EXECUTION_EVENTS]) - - try: - # Produce a request event - execution_id = f"exec-{uuid.uuid4().hex[:8]}" - evt = make_execution_requested_event(execution_id=execution_id) - await producer.produce(evt, key=execution_id) - - # Wait for the handler to be called - await asyncio.wait_for(received.wait(), timeout=10.0) - finally: - await consumer.stop() diff --git a/backend/tests/e2e/events/test_consumer_lifecycle.py b/backend/tests/e2e/events/test_consumer_lifecycle.py deleted file mode 100644 index 98c53a08..00000000 --- a/backend/tests/e2e/events/test_consumer_lifecycle.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -from 
uuid import uuid4 - -import pytest -from app.core.metrics import EventMetrics -from app.domain.enums.kafka import KafkaTopic -from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer -from app.events.schema.schema_registry import SchemaRegistryManager -from app.settings import Settings -from dishka import AsyncContainer - -# xdist_group: Kafka consumer creation can crash librdkafka when multiple workers -# instantiate Consumer() objects simultaneously. Serial execution prevents this. -pytestmark = [pytest.mark.e2e, pytest.mark.kafka, pytest.mark.xdist_group("kafka_consumers")] - -_test_logger = logging.getLogger("test.events.consumer_lifecycle") - - -@pytest.mark.asyncio -async def test_consumer_start_status_seek_and_stop(scope: AsyncContainer) -> None: - registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) - settings: Settings = await scope.get(Settings) - event_metrics: EventMetrics = await scope.get(EventMetrics) - cfg = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"test-consumer-{uuid4().hex[:6]}", - ) - disp = EventDispatcher(logger=_test_logger) - c = UnifiedConsumer( - cfg, - event_dispatcher=disp, - schema_registry=registry, - settings=settings, - logger=_test_logger, - event_metrics=event_metrics, - ) - await c.start([KafkaTopic.EXECUTION_EVENTS]) - try: - st = c.get_status() - assert st.state == "running" and st.is_running is True - # Exercise seek functions; don't force specific partition offsets - await c.seek_to_beginning() - await c.seek_to_end() - # No need to sleep; just ensure we can call seek APIs while running - finally: - await c.stop() diff --git a/backend/tests/e2e/events/test_dlq_handler.py b/backend/tests/e2e/events/test_dlq_handler.py deleted file mode 100644 index c8dbf089..00000000 --- a/backend/tests/e2e/events/test_dlq_handler.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging - -import pytest -from app.domain.events.typed import DomainEvent, EventMetadata, SagaStartedEvent -from app.events.core import UnifiedProducer, create_dlq_error_handler, create_immediate_dlq_handler -from dishka import AsyncContainer - -pytestmark = [pytest.mark.e2e, pytest.mark.kafka] - -_test_logger = logging.getLogger("test.events.dlq_handler") - - -@pytest.mark.asyncio -async def test_dlq_handler_with_retries(scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch) -> None: - p: UnifiedProducer = await scope.get(UnifiedProducer) - calls: list[tuple[str | None, str, str, int]] = [] - - async def _record_send_to_dlq( - original_event: DomainEvent, original_topic: str, error: Exception, retry_count: int - ) -> None: - calls.append((original_event.event_id, original_topic, str(error), retry_count)) - - monkeypatch.setattr(p, "send_to_dlq", _record_send_to_dlq) - h = create_dlq_error_handler(p, max_retries=2, logger=_test_logger) - e = SagaStartedEvent( - saga_id="s", - saga_name="n", - execution_id="x", - initial_event_id="i", - metadata=EventMetadata(service_name="a", service_version="1"), - ) - # Call 1 and 2 should not send to DLQ - await h(RuntimeError("boom"), e, "test-topic") - await h(RuntimeError("boom"), e, "test-topic") - assert len(calls) == 0 - # 3rd call triggers DLQ - await h(RuntimeError("boom"), e, "test-topic") - assert len(calls) == 1 - assert calls[0][1] == "test-topic" - - -@pytest.mark.asyncio -async def test_immediate_dlq_handler(scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch) -> None: - p: UnifiedProducer = await scope.get(UnifiedProducer) - calls: list[tuple[str | None, str, str, 
int]] = [] - - async def _record_send_to_dlq( - original_event: DomainEvent, original_topic: str, error: Exception, retry_count: int - ) -> None: - calls.append((original_event.event_id, original_topic, str(error), retry_count)) - - monkeypatch.setattr(p, "send_to_dlq", _record_send_to_dlq) - h = create_immediate_dlq_handler(p, logger=_test_logger) - e = SagaStartedEvent( - saga_id="s2", - saga_name="n", - execution_id="x", - initial_event_id="i", - metadata=EventMetadata(service_name="a", service_version="1"), - ) - await h(RuntimeError("x"), e, "test-topic") - assert calls and calls[0][1] == "test-topic" - assert calls[0][3] == 0 diff --git a/backend/tests/e2e/events/test_event_dispatcher.py b/backend/tests/e2e/events/test_event_dispatcher.py deleted file mode 100644 index 2ead3aa3..00000000 --- a/backend/tests/e2e/events/test_event_dispatcher.py +++ /dev/null @@ -1,72 +0,0 @@ -import asyncio -import logging -import uuid - -import pytest -from app.core.metrics import EventMetrics -from app.domain.enums.events import EventType -from app.domain.enums.kafka import KafkaTopic -from app.domain.events.typed import DomainEvent -from app.events.core import UnifiedConsumer, UnifiedProducer -from app.events.core.dispatcher import EventDispatcher -from app.events.core.types import ConsumerConfig -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.settings import Settings -from dishka import AsyncContainer - -from tests.conftest import make_execution_requested_event - -# xdist_group: Kafka consumer creation can crash librdkafka when multiple workers -# instantiate Consumer() objects simultaneously. Serial execution prevents this. -pytestmark = [pytest.mark.e2e, pytest.mark.kafka, pytest.mark.xdist_group("kafka_consumers")] - -_test_logger = logging.getLogger("test.events.event_dispatcher") - - -@pytest.mark.asyncio -async def test_dispatcher_with_multiple_handlers(scope: AsyncContainer) -> None: - # Ensure schema registry is ready - registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) - settings: Settings = await scope.get(Settings) - event_metrics: EventMetrics = await scope.get(EventMetrics) - await initialize_event_schemas(registry) - - # Build dispatcher with two handlers for the same event - dispatcher = EventDispatcher(logger=_test_logger) - h1_called = asyncio.Event() - h2_called = asyncio.Event() - - @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def h1(_e: DomainEvent) -> None: - h1_called.set() - - @dispatcher.register(EventType.EXECUTION_REQUESTED) - async def h2(_e: DomainEvent) -> None: - h2_called.set() - - # Real consumer against execution-events - cfg = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"dispatcher-it.{uuid.uuid4().hex[:6]}", - enable_auto_commit=True, - auto_offset_reset="earliest", - ) - consumer = UnifiedConsumer( - cfg, - dispatcher, - schema_registry=registry, - settings=settings, - logger=_test_logger, - event_metrics=event_metrics, - ) - await consumer.start([KafkaTopic.EXECUTION_EVENTS]) - - # Produce a request event via DI - producer: UnifiedProducer = await scope.get(UnifiedProducer) - evt = make_execution_requested_event(execution_id=f"exec-{uuid.uuid4().hex[:8]}") - await producer.produce(evt, key="k") - - try: - await asyncio.wait_for(asyncio.gather(h1_called.wait(), h2_called.wait()), timeout=10.0) - finally: - await consumer.stop() diff --git a/backend/tests/e2e/events/test_producer_roundtrip.py 
b/backend/tests/e2e/events/test_producer_roundtrip.py index a5fca17c..773c7f3a 100644 --- a/backend/tests/e2e/events/test_producer_roundtrip.py +++ b/backend/tests/e2e/events/test_producer_roundtrip.py @@ -22,8 +22,6 @@ async def test_unified_producer_produce_and_send_to_dlq( ev = make_execution_requested_event(execution_id=f"exec-{uuid4().hex[:8]}") await prod.produce(ev, key=ev.execution_id) - # Exercise send_to_dlq path + # Exercise send_to_dlq path — should not raise topic = str(get_topic_for_event(ev.event_type)) await prod.send_to_dlq(ev, original_topic=topic, error=RuntimeError("forced"), retry_count=1) - - assert prod.metrics.messages_sent >= 2 diff --git a/backend/tests/e2e/idempotency/test_idempotency.py b/backend/tests/e2e/idempotency/test_idempotency.py index 6019f9ce..0dad952d 100644 --- a/backend/tests/e2e/idempotency/test_idempotency.py +++ b/backend/tests/e2e/idempotency/test_idempotency.py @@ -7,10 +7,8 @@ import pytest import redis.asyncio as redis from app.core.metrics import DatabaseMetrics -from app.domain.events.typed import DomainEvent from app.domain.idempotency import IdempotencyRecord, IdempotencyStatus, KeyStrategy from app.services.idempotency.idempotency_manager import IdempotencyConfig, IdempotencyManager -from app.services.idempotency.middleware import IdempotentEventHandler from app.services.idempotency.redis_repository import RedisIdempotencyRepository from app.settings import Settings @@ -192,93 +190,6 @@ async def test_result_caching(self, manager: IdempotencyManager) -> None: assert duplicate_result.is_duplicate is True assert duplicate_result.has_cached_result is True - -class TestIdempotentEventHandlerIntegration: - """Test IdempotentEventHandler with real components""" - - @pytest.fixture - def manager(self, redis_client: redis.Redis, test_settings: Settings) -> IdempotencyManager: - prefix = f"handler_test:{uuid.uuid4().hex[:6]}" - config = IdempotencyConfig(key_prefix=prefix) - repo = RedisIdempotencyRepository(redis_client, key_prefix=prefix) - database_metrics = DatabaseMetrics(test_settings) - return IdempotencyManager(config, repo, _test_logger, database_metrics=database_metrics) - - @pytest.mark.asyncio - async def test_handler_processes_new_event(self, manager: IdempotencyManager) -> None: - """Test that handler processes new events""" - processed_events: list[DomainEvent] = [] - - async def actual_handler(event: DomainEvent) -> None: - processed_events.append(event) - - # Create idempotent handler - handler = IdempotentEventHandler( - handler=actual_handler, - idempotency_manager=manager, - key_strategy=KeyStrategy.EVENT_BASED, - logger=_test_logger, - ) - - # Process event - real_event = make_execution_requested_event(execution_id="handler-test-123") - await handler(real_event) - - # Verify event was processed - assert len(processed_events) == 1 - assert processed_events[0] == real_event - - @pytest.mark.asyncio - async def test_handler_blocks_duplicate(self, manager: IdempotencyManager) -> None: - """Test that handler blocks duplicate events""" - processed_events: list[DomainEvent] = [] - - async def actual_handler(event: DomainEvent) -> None: - processed_events.append(event) - - # Create idempotent handler - handler = IdempotentEventHandler( - handler=actual_handler, - idempotency_manager=manager, - key_strategy=KeyStrategy.EVENT_BASED, - logger=_test_logger, - ) - - # Process event twice - real_event = make_execution_requested_event(execution_id="handler-dup-123") - await handler(real_event) - await handler(real_event) - - # Verify event 
was processed only once - assert len(processed_events) == 1 - - @pytest.mark.asyncio - async def test_handler_with_failure(self, manager: IdempotencyManager) -> None: - """Test handler marks failure on exception""" - - async def failing_handler(event: DomainEvent) -> None: # noqa: ARG001 - raise ValueError("Processing failed") - - handler = IdempotentEventHandler( - handler=failing_handler, - idempotency_manager=manager, - key_strategy=KeyStrategy.EVENT_BASED, - logger=_test_logger, - ) - - # Process event (should raise) - real_event = make_execution_requested_event(execution_id="handler-fail-1") - with pytest.raises(ValueError, match="Processing failed"): - await handler(real_event) - - # Verify marked as failed - key = f"{manager.config.key_prefix}:{real_event.event_type}:{real_event.event_id}" - record = await manager._repo.find_by_key(key) - assert record is not None - assert record.status == IdempotencyStatus.FAILED - assert record.error is not None - assert "Processing failed" in record.error - @pytest.mark.asyncio async def test_invalid_key_strategy(self, manager: IdempotencyManager) -> None: """Test that invalid key strategy raises error""" @@ -296,13 +207,11 @@ async def test_custom_key_without_custom_key_param(self, manager: IdempotencyMan @pytest.mark.asyncio async def test_get_cached_json_existing(self, manager: IdempotencyManager) -> None: """Test retrieving cached JSON result""" - # First complete with cached result real_event = make_execution_requested_event(execution_id="cache-exist-1") await manager.check_and_reserve(real_event, key_strategy=KeyStrategy.EVENT_BASED) cached_data = json.dumps({"output": "test", "code": 0}) await manager.mark_completed_with_json(real_event, cached_data, KeyStrategy.EVENT_BASED) - # Retrieve cached result retrieved = await manager.get_cached_json(real_event, KeyStrategy.EVENT_BASED, None) assert retrieved == cached_data @@ -310,14 +219,12 @@ async def test_get_cached_json_existing(self, manager: IdempotencyManager) -> No async def test_get_cached_json_non_existing(self, manager: IdempotencyManager) -> None: """Test retrieving non-existing cached result raises assertion""" real_event = make_execution_requested_event(execution_id="cache-miss-1") - # Trying to get cached result for non-existent key should raise with pytest.raises(AssertionError, match="cached result must exist"): await manager.get_cached_json(real_event, KeyStrategy.EVENT_BASED, None) @pytest.mark.asyncio async def test_cleanup_expired_keys(self, manager: IdempotencyManager) -> None: """Test cleanup of expired keys""" - # Create expired record expired_key = f"{manager.config.key_prefix}:expired" expired_record = IdempotencyRecord( key=expired_key, @@ -325,15 +232,13 @@ async def test_cleanup_expired_keys(self, manager: IdempotencyManager) -> None: event_type="test", event_id="expired-1", created_at=datetime.now(timezone.utc) - timedelta(hours=2), - ttl_seconds=3600, # 1 hour TTL + ttl_seconds=3600, completed_at=datetime.now(timezone.utc) - timedelta(hours=2) ) await manager._repo.insert_processing(expired_record) - # Cleanup should detect it as expired - # Note: actual cleanup implementation depends on repository record = await manager._repo.find_by_key(expired_key) - assert record is not None # Still exists until explicit cleanup + assert record is not None @pytest.mark.asyncio async def test_content_hash_with_fields(self, manager: IdempotencyManager) -> None: @@ -343,7 +248,6 @@ async def test_content_hash_with_fields(self, manager: IdempotencyManager) -> No 
service_name="test-service", ) - # Use content hash with only script field fields = {"script", "language"} result1 = await manager.check_and_reserve( event1, @@ -353,7 +257,6 @@ async def test_content_hash_with_fields(self, manager: IdempotencyManager) -> No assert result1.is_duplicate is False await manager.mark_completed(event1, key_strategy=KeyStrategy.CONTENT_HASH, fields=fields) - # Event with same script and language but different other fields event2 = make_execution_requested_event( execution_id="exec-2", timeout_seconds=60, @@ -369,4 +272,4 @@ async def test_content_hash_with_fields(self, manager: IdempotencyManager) -> No key_strategy=KeyStrategy.CONTENT_HASH, fields=fields ) - assert result2.is_duplicate is True # Same script and language + assert result2.is_duplicate is True diff --git a/backend/tests/e2e/idempotency/test_idempotent_handler.py b/backend/tests/e2e/idempotency/test_idempotent_handler.py deleted file mode 100644 index 63f3fca9..00000000 --- a/backend/tests/e2e/idempotency/test_idempotent_handler.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging - -import pytest -from app.domain.events.typed import DomainEvent -from app.domain.idempotency import KeyStrategy -from app.services.idempotency.idempotency_manager import IdempotencyManager -from app.services.idempotency.middleware import IdempotentEventHandler -from dishka import AsyncContainer - -from tests.conftest import make_execution_requested_event - -pytestmark = [pytest.mark.e2e] - -_test_logger = logging.getLogger("test.idempotency.idempotent_handler") - - -@pytest.mark.asyncio -async def test_idempotent_handler_blocks_duplicates(scope: AsyncContainer) -> None: - manager: IdempotencyManager = await scope.get(IdempotencyManager) - - processed: list[str | None] = [] - - async def _handler(ev: DomainEvent) -> None: - processed.append(ev.event_id) - - handler = IdempotentEventHandler( - handler=_handler, - idempotency_manager=manager, - key_strategy=KeyStrategy.EVENT_BASED, - logger=_test_logger, - ) - - ev = make_execution_requested_event(execution_id="exec-dup-1") - - await handler(ev) - await handler(ev) # duplicate - - assert processed == [ev.event_id] - - -@pytest.mark.asyncio -async def test_idempotent_handler_content_hash_blocks_same_content(scope: AsyncContainer) -> None: - manager: IdempotencyManager = await scope.get(IdempotencyManager) - - processed: list[str] = [] - - async def _handler(ev: DomainEvent) -> None: - processed.append(getattr(ev, "execution_id", "")) - - handler = IdempotentEventHandler( - handler=_handler, - idempotency_manager=manager, - key_strategy=KeyStrategy.CONTENT_HASH, - logger=_test_logger, - ) - - e1 = make_execution_requested_event(execution_id="exec-dup-2") - e2 = make_execution_requested_event(execution_id="exec-dup-2") - - await handler(e1) - await handler(e2) - - assert processed == [e1.execution_id] diff --git a/backend/tests/e2e/result_processor/test_result_processor.py b/backend/tests/e2e/result_processor/test_result_processor.py index f907d56e..7976d1b0 100644 --- a/backend/tests/e2e/result_processor/test_result_processor.py +++ b/backend/tests/e2e/result_processor/test_result_processor.py @@ -1,34 +1,23 @@ -import asyncio import logging import uuid import pytest from app.core.database_context import Database -from app.core.metrics import EventMetrics, ExecutionMetrics +from app.core.metrics import ExecutionMetrics from app.db.repositories.execution_repository import ExecutionRepository -from app.domain.enums.events import EventType from app.domain.enums.execution import 
ExecutionStatus -from app.domain.enums.kafka import KafkaTopic from app.domain.events.typed import ( EventMetadata, ExecutionCompletedEvent, ResourceUsageDomain, - ResultStoredEvent, ) from app.domain.execution import DomainExecutionCreate -from app.domain.idempotency import KeyStrategy -from app.events.core import UnifiedConsumer, UnifiedProducer -from app.events.core.dispatcher import EventDispatcher -from app.events.core.types import ConsumerConfig -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.services.idempotency import IdempotencyManager -from app.services.idempotency.middleware import IdempotentEventDispatcher +from app.events.core import UnifiedProducer +from app.events.schema.schema_registry import SchemaRegistryManager from app.services.result_processor.processor import ResultProcessor from app.settings import Settings from dishka import AsyncContainer -# xdist_group: Kafka consumer creation can crash librdkafka when multiple workers -# instantiate Consumer() objects simultaneously. Serial execution prevents this. pytestmark = [ pytest.mark.e2e, pytest.mark.kafka, @@ -41,18 +30,15 @@ @pytest.mark.asyncio async def test_result_processor_persists_and_emits(scope: AsyncContainer) -> None: - # Ensure schemas + # Schemas are initialized inside the SchemaRegistryManager DI provider registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager) settings: Settings = await scope.get(Settings) - event_metrics: EventMetrics = await scope.get(EventMetrics) execution_metrics: ExecutionMetrics = await scope.get(ExecutionMetrics) - await initialize_event_schemas(registry) # Dependencies db: Database = await scope.get(Database) repo: ExecutionRepository = await scope.get(ExecutionRepository) producer: UnifiedProducer = await scope.get(UnifiedProducer) - idem: IdempotencyManager = await scope.get(IdempotencyManager) # Create a base execution to satisfy ResultProcessor lookup created = await repo.create_execution(DomainExecutionCreate( @@ -64,7 +50,7 @@ async def test_result_processor_persists_and_emits(scope: AsyncContainer) -> Non )) execution_id = created.execution_id - # Build the processor and wire up dispatcher + consumer + # Build the processor processor = ResultProcessor( execution_repo=repo, producer=producer, @@ -72,58 +58,8 @@ async def test_result_processor_persists_and_emits(scope: AsyncContainer) -> Non logger=_test_logger, execution_metrics=execution_metrics, ) - proc_dispatcher = IdempotentEventDispatcher( - logger=_test_logger, - idempotency_manager=idem, - key_strategy=KeyStrategy.CONTENT_HASH, - ttl_seconds=7200, - ) - proc_dispatcher.register_handler(EventType.EXECUTION_COMPLETED, processor.handle_execution_completed) - proc_dispatcher.register_handler(EventType.EXECUTION_FAILED, processor.handle_execution_failed) - proc_dispatcher.register_handler(EventType.EXECUTION_TIMEOUT, processor.handle_execution_timeout) - - proc_consumer_config = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=f"rp-proc.{uuid.uuid4().hex[:6]}", - max_poll_records=1, - enable_auto_commit=True, - auto_offset_reset="earliest", - ) - proc_consumer = UnifiedConsumer( - proc_consumer_config, - event_dispatcher=proc_dispatcher, - schema_registry=registry, - settings=settings, - logger=_test_logger, - event_metrics=event_metrics, - ) - - # Setup a small consumer to capture ResultStoredEvent - dispatcher = EventDispatcher(logger=_test_logger) - stored_received = asyncio.Event() - 
@dispatcher.register(EventType.RESULT_STORED) - async def _stored(event: ResultStoredEvent) -> None: - if event.execution_id == execution_id: - stored_received.set() - - group_id = f"rp-test.{uuid.uuid4().hex[:6]}" - cconf = ConsumerConfig( - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=group_id, - enable_auto_commit=True, - auto_offset_reset="earliest", - ) - stored_consumer = UnifiedConsumer( - cconf, - dispatcher, - schema_registry=registry, - settings=settings, - logger=_test_logger, - event_metrics=event_metrics, - ) - - # Produce the event BEFORE starting consumers (auto_offset_reset="earliest" will read it) + # Build the event usage = ResourceUsageDomain( execution_time_wall_seconds=0.5, cpu_time_jiffies=100, @@ -138,22 +74,13 @@ async def _stored(event: ResultStoredEvent) -> None: resource_usage=usage, metadata=EventMetadata(service_name="tests", service_version="1.0.0"), ) - await producer.produce(evt, key=execution_id) - # Start consumers after producing - await stored_consumer.start([KafkaTopic.EXECUTION_RESULTS]) - await proc_consumer.start([KafkaTopic.EXECUTION_EVENTS]) + # Directly call the handler (subscriber routing tested separately) + await processor.handle_execution_completed(evt) - try: - # Await the ResultStoredEvent - signals that processing is complete - await asyncio.wait_for(stored_received.wait(), timeout=12.0) - - # Now verify DB persistence - should be done since event was emitted - doc = await db.get_collection("executions").find_one({"execution_id": execution_id}) - assert doc is not None, f"Execution {execution_id} not found in DB after ResultStoredEvent" - assert doc.get("status") == ExecutionStatus.COMPLETED, ( - f"Expected COMPLETED status, got {doc.get('status')}" - ) - finally: - await proc_consumer.stop() - await stored_consumer.stop() + # Verify DB persistence + doc = await db.get_collection("executions").find_one({"execution_id": execution_id}) + assert doc is not None, f"Execution {execution_id} not found in DB after processing" + assert doc.get("status") == ExecutionStatus.COMPLETED, ( + f"Expected COMPLETED status, got {doc.get('status')}" + ) diff --git a/backend/tests/e2e/services/coordinator/test_execution_coordinator.py b/backend/tests/e2e/services/coordinator/test_execution_coordinator.py index 59ed202a..8335c4b0 100644 --- a/backend/tests/e2e/services/coordinator/test_execution_coordinator.py +++ b/backend/tests/e2e/services/coordinator/test_execution_coordinator.py @@ -18,7 +18,7 @@ async def test_handle_requested_schedules_execution( coord: ExecutionCoordinator = await scope.get(ExecutionCoordinator) ev = make_execution_requested_event(execution_id="e-sched-1") - await coord._handle_execution_requested(ev) # noqa: SLF001 + await coord.handle_execution_requested(ev) assert "e-sched-1" in coord._active_executions # noqa: SLF001 @@ -33,7 +33,7 @@ async def test_handle_requested_with_priority( priority=QueuePriority.BACKGROUND, ) - await coord._handle_execution_requested(ev) # noqa: SLF001 + await coord.handle_execution_requested(ev) assert "e-priority-1" in coord._active_executions # noqa: SLF001 @@ -47,8 +47,8 @@ async def test_handle_requested_unique_executions( ev1 = make_execution_requested_event(execution_id="e-unique-1") ev2 = make_execution_requested_event(execution_id="e-unique-2") - await coord._handle_execution_requested(ev1) # noqa: SLF001 - await coord._handle_execution_requested(ev2) # noqa: SLF001 + await coord.handle_execution_requested(ev1) + await coord.handle_execution_requested(ev2) assert "e-unique-1" in 
coord._active_executions # noqa: SLF001 assert "e-unique-2" in coord._active_executions # noqa: SLF001 diff --git a/backend/tests/e2e/services/sse/test_partitioned_event_router.py b/backend/tests/e2e/services/sse/test_partitioned_event_router.py index b7c20feb..76548d80 100644 --- a/backend/tests/e2e/services/sse/test_partitioned_event_router.py +++ b/backend/tests/e2e/services/sse/test_partitioned_event_router.py @@ -4,10 +4,6 @@ import pytest import redis.asyncio as redis -from app.core.metrics import EventMetrics -from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId -from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer -from app.events.schema.schema_registry import SchemaRegistryManager from app.schemas_pydantic.sse import RedisSSEMessage from app.services.sse.redis_bus import SSERedisBus from app.settings import Settings @@ -29,58 +25,12 @@ async def test_bus_routes_event_to_redis(redis_client: redis.Redis, test_setting logger=_test_logger, ) - disp = EventDispatcher(logger=_test_logger) - for et in SSERedisBus.SSE_ROUTED_EVENTS: - disp.register_handler(et, bus.route_domain_event) - execution_id = f"e-{uuid4().hex[:8]}" subscription = await bus.open_subscription(execution_id) ev = make_execution_requested_event(execution_id=execution_id) - handler = disp._handlers[ev.event_type][0] - await handler(ev) + await bus.route_domain_event(ev) msg = await asyncio.wait_for(subscription.get(RedisSSEMessage), timeout=2.0) assert msg is not None assert str(msg.event_type) == str(ev.event_type) - - -@pytest.mark.asyncio -@pytest.mark.kafka -async def test_sse_consumer_start_and_stop( - redis_client: redis.Redis, test_settings: Settings, -) -> None: - suffix = uuid4().hex[:6] - bus = SSERedisBus( - redis_client, - exec_prefix=f"sse:exec:{suffix}:", - notif_prefix=f"sse:notif:{suffix}:", - logger=_test_logger, - ) - - config = ConsumerConfig( - bootstrap_servers=test_settings.KAFKA_BOOTSTRAP_SERVERS, - group_id="sse-bridge-pool", - client_id="sse-consumer-test-0", - enable_auto_commit=True, - auto_offset_reset="latest", - max_poll_interval_ms=test_settings.KAFKA_MAX_POLL_INTERVAL_MS, - session_timeout_ms=test_settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=test_settings.KAFKA_HEARTBEAT_INTERVAL_MS, - request_timeout_ms=test_settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - dispatcher = EventDispatcher(logger=_test_logger) - for et in SSERedisBus.SSE_ROUTED_EVENTS: - dispatcher.register_handler(et, bus.route_domain_event) - - consumer = UnifiedConsumer( - config=config, - event_dispatcher=dispatcher, - schema_registry=SchemaRegistryManager(settings=test_settings, logger=_test_logger), - settings=test_settings, - logger=_test_logger, - event_metrics=EventMetrics(test_settings), - ) - topics = list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.WEBSOCKET_GATEWAY]) - await consumer.start(topics) - await consumer.stop() diff --git a/backend/tests/e2e/test_execution_routes.py b/backend/tests/e2e/test_execution_routes.py index 10bfbf18..f40630d6 100644 --- a/backend/tests/e2e/test_execution_routes.py +++ b/backend/tests/e2e/test_execution_routes.py @@ -288,7 +288,7 @@ async def test_cancel_completed_execution_fails(self, test_user: AsyncClient) -> ) assert cancel_response.status_code == 400 - assert "completed" in cancel_response.json()["detail"].lower() + assert "cannot cancel" in cancel_response.json()["detail"].lower() class TestExecutionRetry: diff --git a/backend/tests/e2e/test_k8s_worker_create_pod.py b/backend/tests/e2e/test_k8s_worker_create_pod.py index 
74b7b105..91d9c0dd 100644
--- a/backend/tests/e2e/test_k8s_worker_create_pod.py
+++ b/backend/tests/e2e/test_k8s_worker_create_pod.py
@@ -5,7 +5,7 @@
 from app.core.metrics import EventMetrics
 from app.domain.enums.execution import QueuePriority
 from app.domain.events.typed import CreatePodCommandEvent, EventMetadata
-from app.events.core import EventDispatcher, UnifiedProducer
+from app.events.core import UnifiedProducer
 from app.services.k8s_worker import KubernetesWorker
 from app.settings import Settings
 from dishka import AsyncContainer
@@ -25,12 +25,9 @@ async def test_worker_creates_configmap_and_pod(
     producer: UnifiedProducer = await scope.get(UnifiedProducer)
     event_metrics: EventMetrics = await scope.get(EventMetrics)

-    dispatcher = EventDispatcher(logger=_test_logger)
-
     worker = KubernetesWorker(
         api_client=api_client,
         producer=producer,
-        dispatcher=dispatcher,
         settings=test_settings,
         logger=_test_logger,
         event_metrics=event_metrics,
diff --git a/backend/tests/e2e/test_sse_routes.py b/backend/tests/e2e/test_sse_routes.py
index 2b362e98..12fe8136 100644
--- a/backend/tests/e2e/test_sse_routes.py
+++ b/backend/tests/e2e/test_sse_routes.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 import pytest
 import pytest_asyncio
 from app.schemas_pydantic.execution import ExecutionResponse
@@ -8,6 +10,28 @@
 pytestmark = [pytest.mark.e2e]


+class _NoLifespan:
+    """ASGI wrapper that completes lifespan events immediately.
+
+    async-asgi-testclient's context manager triggers ASGI lifespan
+    startup/shutdown. Without this wrapper, the shutdown closes the
+    Kafka broker that the session-scoped ``app`` fixture owns, breaking
+    every subsequent test that publishes events.
+    """
+
+    def __init__(self, app: Any) -> None:
+        self.app = app
+
+    async def __call__(self, scope: Any, receive: Any, send: Any) -> None:
+        if scope["type"] == "lifespan":
+            await receive()  # lifespan.startup
+            await send({"type": "lifespan.startup.complete"})
+            await receive()  # lifespan.shutdown
+            await send({"type": "lifespan.shutdown.complete"})
+            return
+        await self.app(scope, receive, send)
+
+
 @pytest_asyncio.fixture
 async def sse_client(app: FastAPI, test_user: AsyncClient) -> SSETestClient:
     """SSE-capable test client with auth cookies from test_user.
@@ -15,8 +39,11 @@ async def sse_client(app: FastAPI, test_user: AsyncClient) -> SSETestClient:
     Uses async-asgi-testclient which properly streams SSE responses,
     unlike httpx's ASGITransport which buffers entire responses.
     See: https://github.com/encode/httpx/issues/2186
+
+    The app is wrapped with _NoLifespan to prevent the SSE client's
+    context manager from closing the session-scoped Kafka broker.
     """
-    client = SSETestClient(app)
+    client = SSETestClient(_NoLifespan(app))
     # Copy auth cookies from httpx client (SimpleCookie uses dict-style assignment)
     for name, value in test_user.cookies.items():
         client.cookie_jar[name] = value
@@ -29,7 +56,7 @@ async def sse_client_another(app: FastAPI, another_user: AsyncClient) -> SSETestClient:
     """SSE-capable test client with auth from another_user."""
-    client = SSETestClient(app)
+    client = SSETestClient(_NoLifespan(app))
     for name, value in another_user.cookies.items():
         client.cookie_jar[name] = value
     if csrf := another_user.headers.get("X-CSRF-Token"):
diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py
index 3e28e839..a02357cd 100644
--- a/backend/tests/unit/conftest.py
+++ b/backend/tests/unit/conftest.py
@@ -4,18 +4,6 @@
 from unittest.mock import AsyncMock, MagicMock

 import pytest
-from kubernetes_asyncio.client import (
-    V1ContainerState,
-    V1ContainerStateTerminated,
-    V1ContainerStateWaiting,
-    V1ContainerStatus,
-    V1ObjectMeta,
-    V1Pod,
-    V1PodCondition,
-    V1PodSpec,
-    V1PodStatus,
-)
-
 from app.core.metrics import (
     ConnectionMetrics,
     CoordinatorMetrics,
@@ -31,7 +19,17 @@
     SecurityMetrics,
 )
 from app.settings import Settings
-
+from kubernetes_asyncio.client import (
+    V1ContainerState,
+    V1ContainerStateTerminated,
+    V1ContainerStateWaiting,
+    V1ContainerStatus,
+    V1ObjectMeta,
+    V1Pod,
+    V1PodCondition,
+    V1PodSpec,
+    V1PodStatus,
+)

 # ===== Kubernetes test factories =====
@@ -150,17 +148,20 @@ def make_mock_watch(
     return mock


-def make_mock_v1_api(logs: str = "{}", pods: list[V1Pod] | None = None) -> MagicMock:
+def make_mock_v1_api(
+    logs: str = "{}", pods: list[V1Pod] | None = None, list_resource_version: str = "list-rv1",
+) -> MagicMock:
     """Create a mock CoreV1Api with configurable responses."""

     class PodList:
-        def __init__(self, items: list[V1Pod]) -> None:
+        def __init__(self, items: list[V1Pod], resource_version: str) -> None:
             self.items = items
+            self.metadata = V1ObjectMeta(resource_version=resource_version)

     mock = MagicMock()
     mock.read_namespaced_pod_log = AsyncMock(return_value=logs)
     mock.get_api_resources = AsyncMock(return_value=None)
-    mock.list_namespaced_pod = AsyncMock(return_value=PodList(list(pods or [])))
+    mock.list_namespaced_pod = AsyncMock(return_value=PodList(list(pods or []), list_resource_version))
     return mock
diff --git a/backend/tests/unit/events/test_event_dispatcher.py b/backend/tests/unit/events/test_event_dispatcher.py
deleted file mode 100644
index 38f34f93..00000000
--- a/backend/tests/unit/events/test_event_dispatcher.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import logging
-
-from app.domain.enums.events import EventType
-from app.domain.events.typed import DomainEvent
-from app.events.core import EventDispatcher
-
-from tests.conftest import make_execution_requested_event
-
-_test_logger = logging.getLogger("test.events.event_dispatcher")
-
-
-def make_event() -> DomainEvent:
-    return make_execution_requested_event(execution_id="e1")
-
-
-def test_decorator_registration() -> None:
-    disp = EventDispatcher(logger=_test_logger)
-
-    @disp.register(EventType.EXECUTION_REQUESTED)
-    async def handler(ev: DomainEvent) -> None:  # noqa: ARG001
-        return None
-
-    assert len(disp._handlers[EventType.EXECUTION_REQUESTED]) == 1
-
-
-async def test_dispatch_calls_matching_handler() -> None:
-    disp = EventDispatcher(logger=_test_logger)
-    called = {"n": 0}
-
@disp.register(EventType.EXECUTION_REQUESTED) - async def handler(_: DomainEvent) -> None: - called["n"] += 1 - - await disp.dispatch(make_event()) - - # Dispatch event with no handlers (different type) — should be a no-op - e = make_event() - e.event_type = EventType.EXECUTION_FAILED - await disp.dispatch(e) - - assert called["n"] == 1 diff --git a/backend/tests/unit/services/coordinator/test_coordinator_queue.py b/backend/tests/unit/services/coordinator/test_coordinator_queue.py index ecb9ac6f..82e47be4 100644 --- a/backend/tests/unit/services/coordinator/test_coordinator_queue.py +++ b/backend/tests/unit/services/coordinator/test_coordinator_queue.py @@ -1,5 +1,5 @@ import logging -from unittest.mock import AsyncMock, MagicMock +from unittest.mock import AsyncMock import pytest from app.core.metrics import CoordinatorMetrics @@ -22,7 +22,6 @@ def _make_coordinator( ) -> ExecutionCoordinator: return ExecutionCoordinator( producer=AsyncMock(), - dispatcher=MagicMock(), execution_repository=AsyncMock(), logger=_test_logger, coordinator_metrics=coordinator_metrics, diff --git a/backend/tests/unit/services/idempotency/test_middleware.py b/backend/tests/unit/services/idempotency/test_middleware.py deleted file mode 100644 index eb43f6e0..00000000 --- a/backend/tests/unit/services/idempotency/test_middleware.py +++ /dev/null @@ -1,110 +0,0 @@ -import logging -from unittest.mock import AsyncMock, MagicMock - -import pytest -from app.domain.events.typed import DomainEvent -from app.domain.idempotency import IdempotencyStatus, KeyStrategy -from app.services.idempotency.idempotency_manager import IdempotencyManager, IdempotencyResult -from app.services.idempotency.middleware import ( - IdempotentEventHandler, -) - -_test_logger = logging.getLogger("test.services.idempotency.middleware") - - -pytestmark = pytest.mark.unit - - -class TestIdempotentEventHandler: - @pytest.fixture - def mock_idempotency_manager(self) -> AsyncMock: - return AsyncMock(spec=IdempotencyManager) - - @pytest.fixture - def mock_handler(self) -> AsyncMock: - handler = AsyncMock() - handler.__name__ = "test_handler" - return handler - - @pytest.fixture - def event(self) -> MagicMock: - event = MagicMock(spec=DomainEvent) - event.event_type = "test.event" - event.event_id = "event-123" - return event - - @pytest.fixture - def idempotent_event_handler( - self, mock_handler: AsyncMock, mock_idempotency_manager: AsyncMock - ) -> IdempotentEventHandler: - return IdempotentEventHandler( - handler=mock_handler, - idempotency_manager=mock_idempotency_manager, - key_strategy=KeyStrategy.EVENT_BASED, - ttl_seconds=3600, - logger=_test_logger - ) - - @pytest.mark.asyncio - async def test_call_with_fields( - self, mock_handler: AsyncMock, mock_idempotency_manager: AsyncMock, event: MagicMock - ) -> None: - # Setup with specific fields - fields = {"field1", "field2"} - - handler = IdempotentEventHandler( - handler=mock_handler, - idempotency_manager=mock_idempotency_manager, - key_strategy=KeyStrategy.CONTENT_HASH, - fields=fields, - logger=_test_logger - ) - - idempotency_result = IdempotencyResult( - is_duplicate=False, - status=IdempotencyStatus.PROCESSING, - created_at=MagicMock(), - key="test-key" - ) - mock_idempotency_manager.check_and_reserve.return_value = idempotency_result - - # Execute - await handler(event) - - # Verify - mock_idempotency_manager.check_and_reserve.assert_called_once_with( - event=event, - key_strategy=KeyStrategy.CONTENT_HASH, - ttl_seconds=None, - fields=fields - ) - - @pytest.mark.asyncio - async def 
test_call_handler_exception( - self, - idempotent_event_handler: IdempotentEventHandler, - mock_idempotency_manager: AsyncMock, - mock_handler: AsyncMock, - event: MagicMock, - ) -> None: - # Setup: Handler raises exception - idempotency_result = IdempotencyResult( - is_duplicate=False, - status=IdempotencyStatus.PROCESSING, - created_at=MagicMock(), - key="test-key" - ) - mock_idempotency_manager.check_and_reserve.return_value = idempotency_result - mock_handler.side_effect = Exception("Handler error") - - # Execute and verify exception is raised - with pytest.raises(Exception, match="Handler error"): - await idempotent_event_handler(event) - - # Verify failure is marked - mock_idempotency_manager.mark_failed.assert_called_once_with( - event=event, - error="Handler error", - key_strategy=KeyStrategy.EVENT_BASED, - fields=None - ) diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py index a3b89227..1c0bdec8 100644 --- a/backend/tests/unit/services/pod_monitor/test_monitor.py +++ b/backend/tests/unit/services/pod_monitor/test_monitor.py @@ -1,12 +1,9 @@ -import asyncio import logging import types from typing import Any from unittest.mock import AsyncMock, MagicMock import pytest -from kubernetes_asyncio import client as k8s_client - from app.core.metrics import EventMetrics, KubernetesMetrics from app.db.repositories.event_repository import EventRepository from app.domain.events.typed import ( @@ -21,16 +18,14 @@ from app.services.pod_monitor.config import PodMonitorConfig from app.services.pod_monitor.event_mapper import PodEventMapper from app.services.pod_monitor.monitor import ( - MonitorState, PodEvent, PodMonitor, - ReconciliationResult, WatchEventType, ) from app.settings import Settings -from kubernetes_asyncio.client.rest import ApiException - +from kubernetes_asyncio import client as k8s_client from kubernetes_asyncio.client import V1Pod +from kubernetes_asyncio.client.rest import ApiException from tests.unit.conftest import ( MockWatchStream, @@ -95,19 +90,6 @@ def create_test_kafka_event_service(event_metrics: EventMetrics) -> tuple[KafkaE # ===== Helpers to create test instances with pure DI ===== -class SpyMapper: - """Spy event mapper that tracks clear_cache calls.""" - - def __init__(self) -> None: - self.cleared = False - - def clear_cache(self) -> None: - self.cleared = True - - async def map_pod_event(self, pod: Any, event_type: WatchEventType) -> list[Any]: # noqa: ARG002 - return [] - - def make_mock_api_client() -> MagicMock: """Create a mock ApiClient.""" mock = MagicMock(spec=k8s_client.ApiClient) @@ -127,12 +109,9 @@ def make_pod_monitor( pods: list[V1Pod] | None = None, events: list[dict[str, Any]] | None = None, resource_version: str = "rv1", + list_resource_version: str = "list-rv1", ) -> PodMonitor: - """Create PodMonitor with sensible test defaults. - - Since PodMonitor creates its own v1/watch from api_client, - we create the monitor and then replace _v1 and _watch with mocks. 
- """ + """Create PodMonitor with sensible test defaults.""" cfg = config or PodMonitorConfig() client = api_client or make_mock_api_client() mapper = event_mapper or PodEventMapper(logger=_test_logger, k8s_api=make_mock_v1_api("{}")) @@ -148,7 +127,7 @@ def make_pod_monitor( ) # Replace internal clients with mocks for testing - monitor._v1 = mock_v1 or make_mock_v1_api(pods=pods) + monitor._v1 = mock_v1 or make_mock_v1_api(pods=pods, list_resource_version=list_resource_version) monitor._watch = mock_watch or make_mock_watch(events or [], resource_version) return monitor @@ -158,160 +137,160 @@ def make_pod_monitor( @pytest.mark.asyncio -async def test_start_and_stop_lifecycle(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_watch_pod_events_list_then_watch( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: + """First call does LIST + WATCH; second call skips LIST.""" cfg = PodMonitorConfig() - cfg.enable_state_reconciliation = False - - spy = SpyMapper() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, event_mapper=spy) # type: ignore[arg-type] - # Replace _watch_pods to avoid real watch loop - async def _quick_watch() -> None: - return None + pod = make_pod(name="existing", phase="Running", resource_version="rv1") - pm._watch_pods = _quick_watch # type: ignore[method-assign] + pm = make_pod_monitor( + event_metrics, kubernetes_metrics, config=cfg, + pods=[pod], list_resource_version="list-rv5", + events=[{"type": "MODIFIED", "object": make_pod(name="existing", phase="Succeeded", resource_version="rv6")}], + resource_version="rv7", + ) - await pm.start() - assert pm.state == MonitorState.RUNNING + # First call: LIST (gets list-rv5) then WATCH (ends at rv7) + await pm.watch_pod_events() + assert pm._last_resource_version == "rv7" + assert pm._v1.list_namespaced_pod.await_count == 1 # type: ignore[attr-defined] # LIST was called - await pm.stop() - final_state: MonitorState = pm.state - assert final_state == MonitorState.STOPPED - assert spy.cleared is True + # Second call: no LIST needed, just WATCH + pm._watch = make_mock_watch([], "rv8") + await pm.watch_pod_events() + assert pm._last_resource_version == "rv8" + assert pm._v1.list_namespaced_pod.await_count == 1 # type: ignore[attr-defined] # LIST not called again @pytest.mark.asyncio -async def test_watch_pod_events_flow_and_publish(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_watch_pod_events_with_field_selector( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: cfg = PodMonitorConfig() - cfg.enable_state_reconciliation = False + cfg.field_selector = "status.phase=Running" + + watch_kwargs: list[dict[str, Any]] = [] + + tracking_v1 = MagicMock() + + # LIST returns a pod list with metadata + class FakePodList: + items: list[V1Pod] = [] + metadata = types.SimpleNamespace(resource_version="rv1") + + tracking_v1.list_namespaced_pod = AsyncMock(return_value=FakePodList()) + + tracking_watch = MagicMock() + + def track_stream(func: Any, **kwargs: Any) -> MockWatchStream: # noqa: ARG001 + watch_kwargs.append(kwargs) + return MockWatchStream([], "rv1") - pod = make_pod(name="p", phase="Succeeded", labels={"execution-id": "e1"}, term_exit=0, resource_version="rv1") + tracking_watch.stream.side_effect = track_stream + tracking_watch.stop.return_value = None + tracking_watch.resource_version = "rv1" pm = make_pod_monitor( event_metrics, kubernetes_metrics, config=cfg, - 
events=[{"type": "MODIFIED", "object": pod}], resource_version="rv2" + mock_v1=tracking_v1, mock_watch=tracking_watch, ) - pm._state = MonitorState.RUNNING - await pm._watch_pod_events() - assert pm._last_resource_version == "rv2" + await pm.watch_pod_events() + + assert any("field_selector" in kw for kw in watch_kwargs) @pytest.mark.asyncio -async def test_process_raw_event_invalid_and_handle_watch_error(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_watch_pod_events_raises_api_exception( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: + """watch_pod_events propagates ApiException to the caller.""" cfg = PodMonitorConfig() pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - await pm._process_raw_event({}) - - pm.config.watch_reconnect_delay = 0 - pm._reconnect_attempts = 0 - await pm._handle_watch_error() - await pm._handle_watch_error() - assert pm._reconnect_attempts >= 2 + # Pre-set resource version so LIST is skipped + pm._last_resource_version = "rv1" + mock_watch_obj = MagicMock() + mock_watch_obj.stream.side_effect = ApiException(status=410) + pm._watch = mock_watch_obj -@pytest.mark.asyncio -async def test_get_status(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - cfg.namespace = "test-ns" - cfg.label_selector = "app=test" - cfg.enable_state_reconciliation = True - - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._tracked_pods = {"pod1", "pod2"} - pm._reconnect_attempts = 3 - pm._last_resource_version = "v123" - - status = await pm.get_status() - assert "idle" in status["state"].lower() - assert status["tracked_pods"] == 2 - assert status["reconnect_attempts"] == 3 - assert status["last_resource_version"] == "v123" - assert status["config"]["namespace"] == "test-ns" - assert status["config"]["label_selector"] == "app=test" - assert status["config"]["enable_reconciliation"] is True + with pytest.raises(ApiException): + await pm.watch_pod_events() @pytest.mark.asyncio -async def test_reconciliation_loop_and_state(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_watch_resets_after_410( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: + """After 410 Gone resets _last_resource_version, next call re-LISTs.""" cfg = PodMonitorConfig() - cfg.enable_state_reconciliation = True - cfg.reconcile_interval_seconds = 0 # sleep(0) yields control immediately - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING + pod = make_pod(name="p1", phase="Running", resource_version="rv10") - reconcile_called: list[bool] = [] + pm = make_pod_monitor( + event_metrics, kubernetes_metrics, config=cfg, + pods=[pod], list_resource_version="list-rv10", + events=[], resource_version="rv11", + ) - async def mock_reconcile() -> ReconciliationResult: - reconcile_called.append(True) - return ReconciliationResult(missing_pods={"p1"}, extra_pods={"p2"}, duration_seconds=0.1, success=True) + # Simulate 410 recovery: provider sets _last_resource_version = None + pm._last_resource_version = None - evt = asyncio.Event() + await pm.watch_pod_events() - async def wrapped_reconcile() -> ReconciliationResult: - res = await mock_reconcile() - evt.set() - return res + # LIST was called, resource version set from list + assert pm._v1.list_namespaced_pod.await_count == 1 # type: ignore[attr-defined] + assert 
pm._last_resource_version == "rv11" - pm._reconcile_state = wrapped_reconcile # type: ignore[method-assign] - task = asyncio.create_task(pm._reconciliation_loop()) - await asyncio.wait_for(evt.wait(), timeout=1.0) - pm._state = MonitorState.STOPPED - task.cancel() - with pytest.raises(asyncio.CancelledError): - await task +@pytest.mark.asyncio +async def test_process_raw_event_invalid( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: + cfg = PodMonitorConfig() + pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - assert len(reconcile_called) > 0 + # Should not raise - invalid events are caught and logged + await pm._process_raw_event({}) @pytest.mark.asyncio -async def test_reconcile_state_success(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_process_raw_event_with_metadata( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: cfg = PodMonitorConfig() - cfg.namespace = "test" - cfg.label_selector = "app=test" - - pod1 = make_pod(name="pod1", phase="Running", resource_version="v1") - pod2 = make_pod(name="pod2", phase="Running", resource_version="v1") - - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, pods=[pod1, pod2]) - pm._tracked_pods = {"pod2", "pod3"} + pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - processed: list[str] = [] + processed: list[PodEvent] = [] async def mock_process(event: PodEvent) -> None: - processed.append(event.pod.metadata.name) + processed.append(event) pm._process_pod_event = mock_process # type: ignore[method-assign] - result = await pm._reconcile_state() - - assert result.success is True - assert result.missing_pods == {"pod1"} - assert result.extra_pods == {"pod3"} - assert "pod1" in processed - assert "pod3" not in pm._tracked_pods - - -@pytest.mark.asyncio -async def test_reconcile_state_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() + raw_event = { + "type": "ADDED", + "object": types.SimpleNamespace(metadata=types.SimpleNamespace(resource_version="v1")), + } - fail_v1 = MagicMock() - fail_v1.list_namespaced_pod = AsyncMock(side_effect=RuntimeError("API error")) + await pm._process_raw_event(raw_event) + assert len(processed) == 1 + assert processed[0].resource_version == "v1" - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, mock_v1=fail_v1) + raw_event_no_meta = {"type": "MODIFIED", "object": types.SimpleNamespace(metadata=None)} - result = await pm._reconcile_state() - assert result.success is False - assert result.error is not None - assert "API error" in result.error + await pm._process_raw_event(raw_event_no_meta) + assert len(processed) == 2 + assert processed[1].resource_version is None @pytest.mark.asyncio -async def test_process_pod_event_full_flow(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_process_pod_event_full_flow( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: cfg = PodMonitorConfig() cfg.ignored_pod_phases = ["Unknown"] @@ -343,7 +322,6 @@ async def mock_publish(event: Any, pod: Any) -> None: # noqa: ARG001 ) await pm._process_pod_event(event) - assert "test-pod" in pm._tracked_pods assert pm._last_resource_version == "v1" assert len(published) == 1 @@ -354,7 +332,6 @@ async def mock_publish(event: Any, pod: Any) -> None: # noqa: ARG001 ) await pm._process_pod_event(event_del) - assert "test-pod" not 
in pm._tracked_pods assert pm._last_resource_version == "v2" event_ignored = PodEvent( @@ -369,7 +346,9 @@ async def mock_publish(event: Any, pod: Any) -> None: # noqa: ARG001 @pytest.mark.asyncio -async def test_process_pod_event_exception_handling(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_process_pod_event_exception_handling( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: cfg = PodMonitorConfig() class FailMapper: @@ -392,7 +371,9 @@ def clear_cache(self) -> None: @pytest.mark.asyncio -async def test_publish_event_full_flow(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_publish_event_full_flow( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: cfg = PodMonitorConfig() service, fake_producer = create_test_kafka_event_service(event_metrics) pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, kafka_service=service) @@ -413,7 +394,9 @@ async def test_publish_event_full_flow(event_metrics: EventMetrics, kubernetes_m @pytest.mark.asyncio -async def test_publish_event_exception_handling(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: +async def test_publish_event_exception_handling( + event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, +) -> None: cfg = PodMonitorConfig() class FailingProducer(FakeUnifiedProducer): @@ -422,7 +405,6 @@ async def produce( ) -> None: raise RuntimeError("Publish failed") - # Create service with failing producer failing_producer = FailingProducer() fake_repo = FakeEventRepository() failing_service = KafkaEventService( @@ -441,265 +423,8 @@ async def produce( metadata=EventMetadata(service_name="test", service_version="1.0"), ) - # Use pod with no metadata to exercise edge case pod = make_pod(name="no-meta-pod", phase="Pending") pod.metadata = None # type: ignore[assignment] # Should not raise - errors are caught and logged await pm._publish_event(event, pod) - - -@pytest.mark.asyncio -async def test_handle_watch_error_max_attempts(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - cfg.max_reconnect_attempts = 2 - - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - pm._reconnect_attempts = 2 - - await pm._handle_watch_error() - - assert pm._state == MonitorState.STOPPING - - -@pytest.mark.asyncio -async def test_watch_pods_main_loop(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - - watch_count: list[int] = [] - - async def mock_watch() -> None: - watch_count.append(1) - if len(watch_count) > 2: - pm._state = MonitorState.STOPPED - - async def mock_handle_error() -> None: - pass - - pm._watch_pod_events = mock_watch # type: ignore[method-assign] - pm._handle_watch_error = mock_handle_error # type: ignore[method-assign] - - await pm._watch_pods() - assert len(watch_count) > 2 - - -@pytest.mark.asyncio -async def test_watch_pods_api_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - - async def mock_watch() -> None: - raise ApiException(status=410) - - error_handled: list[bool] = [] - - async def mock_handle() 
-> None: - error_handled.append(True) - pm._state = MonitorState.STOPPED - - pm._watch_pod_events = mock_watch # type: ignore[method-assign] - pm._handle_watch_error = mock_handle # type: ignore[method-assign] - - await pm._watch_pods() - - assert pm._last_resource_version is None - assert len(error_handled) > 0 - - -@pytest.mark.asyncio -async def test_watch_pods_generic_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - - async def mock_watch() -> None: - raise RuntimeError("Unexpected error") - - error_handled: list[bool] = [] - - async def mock_handle() -> None: - error_handled.append(True) - pm._state = MonitorState.STOPPED - - pm._watch_pod_events = mock_watch # type: ignore[method-assign] - pm._handle_watch_error = mock_handle # type: ignore[method-assign] - - await pm._watch_pods() - assert len(error_handled) > 0 - - -@pytest.mark.asyncio -async def test_start_and_stop(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - """Test explicit start() and stop() methods.""" - cfg = PodMonitorConfig() - cfg.enable_state_reconciliation = False - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - - assert pm.state == MonitorState.IDLE - - await pm.start() - state_after_start: MonitorState = pm.state - assert state_after_start == MonitorState.RUNNING - - await pm.stop() - state_after_stop: MonitorState = pm.state - assert state_after_stop == MonitorState.STOPPED - - -@pytest.mark.asyncio -async def test_stop_with_tasks(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - """Test cleanup of tasks on stop().""" - cfg = PodMonitorConfig() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - - async def dummy_task() -> None: - await asyncio.Event().wait() - - pm._watch_task = asyncio.create_task(dummy_task()) - pm._reconcile_task = asyncio.create_task(dummy_task()) - pm._tracked_pods = {"pod1"} - - await pm.stop() - - assert pm._state == MonitorState.STOPPED - assert len(pm._tracked_pods) == 0 - - -@pytest.mark.asyncio -async def test_process_raw_event_with_metadata(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - - processed: list[PodEvent] = [] - - async def mock_process(event: PodEvent) -> None: - processed.append(event) - - pm._process_pod_event = mock_process # type: ignore[method-assign] - - raw_event = { - "type": "ADDED", - "object": types.SimpleNamespace(metadata=types.SimpleNamespace(resource_version="v1")), - } - - await pm._process_raw_event(raw_event) - assert len(processed) == 1 - assert processed[0].resource_version == "v1" - - raw_event_no_meta = {"type": "MODIFIED", "object": types.SimpleNamespace(metadata=None)} - - await pm._process_raw_event(raw_event_no_meta) - assert len(processed) == 2 - assert processed[1].resource_version is None - - -@pytest.mark.asyncio -async def test_watch_pods_api_exception_other_status(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - - async def mock_watch() -> None: - raise ApiException(status=500) - - error_handled: list[bool] = [] - - async def mock_handle() -> None: 
- error_handled.append(True) - pm._state = MonitorState.STOPPED - - pm._watch_pod_events = mock_watch # type: ignore[method-assign] - pm._handle_watch_error = mock_handle # type: ignore[method-assign] - - await pm._watch_pods() - assert len(error_handled) > 0 - - -@pytest.mark.asyncio -async def test_watch_pod_events_with_field_selector(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - cfg.field_selector = "status.phase=Running" - cfg.enable_state_reconciliation = False - - watch_kwargs: list[dict[str, Any]] = [] - - tracking_v1 = MagicMock() - - def track_list(namespace: str, label_selector: str) -> None: - watch_kwargs.append({"namespace": namespace, "label_selector": label_selector}) - return None - - tracking_v1.list_namespaced_pod.side_effect = track_list - - tracking_watch = MagicMock() - - def track_stream(func: Any, **kwargs: Any) -> MockWatchStream: # noqa: ARG001 - watch_kwargs.append(kwargs) - return MockWatchStream([], "rv1") - - tracking_watch.stream.side_effect = track_stream - tracking_watch.stop.return_value = None - tracking_watch.resource_version = "rv1" - - pm = make_pod_monitor( - event_metrics, kubernetes_metrics, config=cfg, - mock_v1=tracking_v1, mock_watch=tracking_watch - ) - pm._state = MonitorState.RUNNING - - await pm._watch_pod_events() - - assert any("field_selector" in kw for kw in watch_kwargs) - - -@pytest.mark.asyncio -async def test_reconciliation_loop_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - cfg.enable_state_reconciliation = True - cfg.reconcile_interval_seconds = 0 # sleep(0) yields control immediately - - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - pm._state = MonitorState.RUNNING - - hit = asyncio.Event() - - async def raising() -> ReconciliationResult: - hit.set() - raise RuntimeError("Reconcile error") - - pm._reconcile_state = raising # type: ignore[method-assign] - - task = asyncio.create_task(pm._reconciliation_loop()) - await asyncio.wait_for(hit.wait(), timeout=1.0) - pm._state = MonitorState.STOPPED - task.cancel() - with pytest.raises(asyncio.CancelledError): - await task - - -@pytest.mark.asyncio -async def test_start_with_reconciliation(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None: - cfg = PodMonitorConfig() - cfg.enable_state_reconciliation = True - - pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg) - - async def mock_watch() -> None: - return None - - async def mock_reconcile() -> None: - return None - - pm._watch_pods = mock_watch # type: ignore[method-assign] - pm._reconciliation_loop = mock_reconcile # type: ignore[method-assign] - - await pm.start() - assert pm._watch_task is not None - assert pm._reconcile_task is not None - - await pm.stop() diff --git a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py index a46b611a..eb0228b4 100644 --- a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py +++ b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py @@ -5,6 +5,7 @@ from app.db.repositories.saga_repository import SagaRepository from app.domain.enums.saga import SagaState from app.domain.events.typed import DomainEvent +from app.domain.saga import DomainResourceAllocation, DomainResourceAllocationCreate from app.domain.saga.models import Saga, SagaConfig from app.events.core import UnifiedProducer from 
app.services.saga.execution_saga import ExecutionSaga @@ -56,7 +57,21 @@ class _FakeAlloc(ResourceAllocationRepository): """Fake ResourceAllocationRepository for testing.""" def __init__(self) -> None: - pass # No special attributes needed + self.allocations: list[DomainResourceAllocation] = [] + + async def count_active(self, language: str) -> int: + return 0 + + async def create_allocation(self, create_data: DomainResourceAllocationCreate) -> DomainResourceAllocation: + alloc = DomainResourceAllocation( + allocation_id="alloc-1", + **create_data.model_dump(), + ) + self.allocations.append(alloc) + return alloc + + async def release_allocation(self, allocation_id: str) -> bool: + return True def _orch(repo: SagaRepository | None = None) -> SagaOrchestrator: @@ -74,11 +89,11 @@ async def test_handle_event_triggers_saga() -> None: fake_repo = _FakeRepo() orch = _orch(repo=fake_repo) await orch.handle_execution_requested(make_execution_requested_event(execution_id="e")) - assert len(fake_repo.saved) == 1 - saved = fake_repo.saved[0] - assert saved.execution_id == "e" - assert saved.saga_name == ExecutionSaga.get_name() - assert saved.state == SagaState.RUNNING + # The saga is created and fully executed (steps run to completion) + assert len(fake_repo.saved) >= 1 + first_saved = fake_repo.saved[0] + assert first_saved.execution_id == "e" + assert first_saved.saga_name == ExecutionSaga.get_name() @pytest.mark.asyncio diff --git a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py index 58590707..f3122dc4 100644 --- a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py +++ b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py @@ -1,9 +1,7 @@ import logging import pytest -from app.domain.enums.events import EventType from app.domain.events.typed import DomainEvent, EventMetadata, ExecutionStartedEvent -from app.events.core import EventDispatcher from app.services.sse.redis_bus import SSERedisBus pytestmark = pytest.mark.unit @@ -31,19 +29,10 @@ def _make_metadata() -> EventMetadata: async def test_route_domain_event_publishes_to_redis() -> None: fake_bus = _FakeBus() - # Register routing handlers on a dispatcher (same pattern as the DI provider) - disp = EventDispatcher(_test_logger) - for et in SSERedisBus.SSE_ROUTED_EVENTS: - disp.register_handler(et, fake_bus.route_domain_event) - - handlers = disp._handlers[EventType.EXECUTION_STARTED] - assert len(handlers) > 0 - # Event with empty execution_id is ignored - h = handlers[0] - await h(ExecutionStartedEvent(execution_id="", pod_name="p", metadata=_make_metadata())) + await fake_bus.route_domain_event(ExecutionStartedEvent(execution_id="", pod_name="p", metadata=_make_metadata())) assert fake_bus.published == [] # Proper event is published - await h(ExecutionStartedEvent(execution_id="exec-123", pod_name="p", metadata=_make_metadata())) + await fake_bus.route_domain_event(ExecutionStartedEvent(execution_id="exec-123", pod_name="p", metadata=_make_metadata())) assert fake_bus.published and fake_bus.published[-1][0] == "exec-123" diff --git a/backend/uv.lock b/backend/uv.lock index 2a9a72dc..015e47fe 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -111,33 +111,27 @@ wheels = [ [[package]] name = "aiokafka" -version = "0.13.0" +version = "0.12.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "async-timeout" }, { name = "packaging" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/87/18/d3a4f8f9ad099fc59217b8cdf66eeecde3a9ef3bb31fe676e431a3b0010f/aiokafka-0.13.0.tar.gz", hash = "sha256:7d634af3c8d694a37a6c8535c54f01a740e74cccf7cc189ecc4a3d64e31ce122", size = 598580, upload-time = "2026-01-02T13:55:18.911Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/60/17/715ac23b4f8df3ff8d7c0a6f1c5fd3a179a8a675205be62d1d1bb27dffa2/aiokafka-0.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:231ecc0038c2736118f1c95149550dbbdf7b7a12069f70c005764fa1824c35d4", size = 346168, upload-time = "2026-01-02T13:54:49.128Z" }, - { url = "https://files.pythonhosted.org/packages/00/26/71c6f4cce2c710c6ffa18b9e294384157f46b0491d5b020de300802d167e/aiokafka-0.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e2817593cab4c71c1d3b265b2446da91121a467ff7477c65f0f39a80047bc28", size = 349037, upload-time = "2026-01-02T13:54:50.48Z" }, - { url = "https://files.pythonhosted.org/packages/82/18/7b86418a4d3dc1303e89c0391942258ead31c02309e90eb631f3081eec1d/aiokafka-0.13.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b80e0aa1c811a9a12edb0b94445a0638d61a345932f785d47901d28b8aad86c8", size = 1140066, upload-time = "2026-01-02T13:54:52.33Z" }, - { url = "https://files.pythonhosted.org/packages/f9/51/45e46b4407d39b950c8493e19498aeeb5af4fc461fb54fa0247da16bfd75/aiokafka-0.13.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:79672c456bd1642769e74fc2db1c34f23b15500e978fd38411662e8ca07590ad", size = 1130088, upload-time = "2026-01-02T13:54:53.786Z" }, - { url = "https://files.pythonhosted.org/packages/49/7f/6a66f6fd6fb73e15bd34f574e38703ba36d3f9256c80e7aba007bd8a9256/aiokafka-0.13.0-cp312-cp312-win32.whl", hash = "sha256:00bb4e3d5a237b8618883eb1dd8c08d671db91d3e8e33ac98b04edf64225658c", size = 309581, upload-time = "2026-01-02T13:54:55.444Z" }, - { url = "https://files.pythonhosted.org/packages/d3/e0/a2d5a8912699dd0fee28e6fb780358c63c7a4727517fffc110cb7e43f874/aiokafka-0.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:0f0cccdf2fd16927fbe077279524950676fbffa7b102d6b117041b3461b5d927", size = 329327, upload-time = "2026-01-02T13:54:56.981Z" }, - { url = "https://files.pythonhosted.org/packages/e3/f6/a74c49759233e98b61182ba3d49d5ac9c8de0643651892acba2704fba1cc/aiokafka-0.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:39d71c40cff733221a6b2afff4beeac5dacbd119fb99eec5198af59115264a1a", size = 343733, upload-time = "2026-01-02T13:54:58.536Z" }, - { url = "https://files.pythonhosted.org/packages/cf/52/4f7e80eee2c69cd8b047c18145469bf0dc27542a5dca3f96ff81ade575b0/aiokafka-0.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:faa2f5f3d0d2283a0c1a149748cc7e3a3862ef327fa5762e2461088eedde230a", size = 346258, upload-time = "2026-01-02T13:55:00.947Z" }, - { url = "https://files.pythonhosted.org/packages/81/9b/d2766bb3b0bad53eb25a88e51a884be4b77a1706053ad717b893b4daea4b/aiokafka-0.13.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b890d535e55f5073f939585bef5301634df669e97832fda77aa743498f008662", size = 1114744, upload-time = "2026-01-02T13:55:02.475Z" }, - { url = "https://files.pythonhosted.org/packages/8f/00/12e0a39cd4809149a09b4a52b629abc9bf80e7b8bad9950040b1adae99fc/aiokafka-0.13.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e22eb8a1475b9c0f45b553b6e2dcaf4ec3c0014bf4e389e00a0a0ec85d0e3bdc", size = 1105676, upload-time = 
"2026-01-02T13:55:04.036Z" }, - { url = "https://files.pythonhosted.org/packages/38/4a/0bc91e90faf55533fe6468461c2dd31c22b0e1d274b9386f341cca3f7eb7/aiokafka-0.13.0-cp313-cp313-win32.whl", hash = "sha256:ae507c7b09e882484f709f2e7172b3a4f75afffcd896d00517feb35c619495bb", size = 308257, upload-time = "2026-01-02T13:55:05.873Z" }, - { url = "https://files.pythonhosted.org/packages/23/63/5433d1aa10c4fb4cf85bd73013263c36d7da4604b0c77ed4d1ad42fae70c/aiokafka-0.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:fec1a7e3458365a72809edaa2b990f65ca39b01a2a579f879ac4da6c9b2dbc5c", size = 326968, upload-time = "2026-01-02T13:55:07.351Z" }, - { url = "https://files.pythonhosted.org/packages/3c/cc/45b04c3a5fd3d2d5f444889ecceb80b2f78d6d66aa45e3042767e55579e2/aiokafka-0.13.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9a403785f7092c72906c37f7618f7b16a4219eba8ed0bdda90fba410a7dd50b5", size = 344503, upload-time = "2026-01-02T13:55:08.723Z" }, - { url = "https://files.pythonhosted.org/packages/76/df/0b76fe3b93558ae71b856940e384909c4c2c7a1c330423003191e4ba7782/aiokafka-0.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:256807326831b7eee253ea1017bd2b19ab1c2298ce6b20a87fde97c253c572bc", size = 347621, upload-time = "2026-01-02T13:55:10.147Z" }, - { url = "https://files.pythonhosted.org/packages/34/1a/d59932f98fd3c106e2a7c8d4d5ebd8df25403436dfc27b3031918a37385e/aiokafka-0.13.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:64d90f91291da265d7f25296ba68fc6275684eebd6d1cf05a1b2abe6c2ba3543", size = 1111410, upload-time = "2026-01-02T13:55:11.763Z" }, - { url = "https://files.pythonhosted.org/packages/7e/04/fbf3e34ab3bc21e6e760c3fcd089375052fccc04eb8745459a82a58a647b/aiokafka-0.13.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b5a33cc043c8d199bcf101359d86f2d31fd54f4b157ac12028bdc34e3e1cf74a", size = 1094799, upload-time = "2026-01-02T13:55:13.795Z" }, - { url = "https://files.pythonhosted.org/packages/85/10/509f709fd3b7c3e568a5b8044be0e80a1504f8da6ddc72c128b21e270913/aiokafka-0.13.0-cp314-cp314-win32.whl", hash = "sha256:538950384b539ba2333d35a853f09214c0409e818e5d5f366ef759eea50bae9c", size = 311553, upload-time = "2026-01-02T13:55:15.928Z" }, - { url = "https://files.pythonhosted.org/packages/2b/18/424d6a4eb6f4835a371c1e2cfafce800540b33d957c6638795d911f98973/aiokafka-0.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:c906dd42daadd14b4506a2e6c62dfef3d4919b5953d32ae5e5f0d99efd103c89", size = 330648, upload-time = "2026-01-02T13:55:17.421Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/65/ca/42a962033e6a7926dcb789168bce81d0181ef4ddabce454d830b7e62370e/aiokafka-0.12.0.tar.gz", hash = "sha256:62423895b866f95b5ed8d88335295a37cc5403af64cb7cb0e234f88adc2dff94", size = 564955, upload-time = "2024-10-26T20:53:11.227Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/d4/baf1b2389995c6c312834792329a1993a303ff703ac023250ff977c5923b/aiokafka-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b01947553ff1120fa1cb1a05f2c3e5aa47a5378c720bafd09e6630ba18af02aa", size = 375031, upload-time = "2024-10-26T20:52:40.104Z" }, + { url = "https://files.pythonhosted.org/packages/54/ac/653070a4add8beea7aa8209ab396de87c7b4f9628fff15efcdbaea40e973/aiokafka-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e3c8ec1c0606fa645462c7353dc3e4119cade20c4656efa2031682ffaad361c0", size = 370619, upload-time = "2024-10-26T20:52:41.877Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/f2/0ddaaa11876ab78e0f3b30f272c62eea70870e1a52a5afe985c7c1d098e1/aiokafka-0.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577c1c48b240e9eba57b3d2d806fb3d023a575334fc3953f063179170cc8964f", size = 1192363, upload-time = "2024-10-26T20:52:44.028Z" }, + { url = "https://files.pythonhosted.org/packages/ae/48/541ccece0e593e24ee371dec0c33c23718bc010b04e998693e4c19091258/aiokafka-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7b815b2e5fed9912f1231be6196547a367b9eb3380b487ff5942f0c73a3fb5c", size = 1213231, upload-time = "2024-10-26T20:52:46.028Z" }, + { url = "https://files.pythonhosted.org/packages/99/3f/75bd0faa77dfecce34dd1c0edd317b608518b096809736f9987dd61f4cec/aiokafka-0.12.0-cp312-cp312-win32.whl", hash = "sha256:5a907abcdf02430df0829ac80f25b8bb849630300fa01365c76e0ae49306f512", size = 347752, upload-time = "2024-10-26T20:52:47.327Z" }, + { url = "https://files.pythonhosted.org/packages/ef/97/e2513a0c10585e51d4d9b42c9dd5f5ab15dfe150620a4893a2c6c20f0f4a/aiokafka-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:fdbd69ec70eea4a8dfaa5c35ff4852e90e1277fcc426b9380f0b499b77f13b16", size = 366068, upload-time = "2024-10-26T20:52:49.132Z" }, + { url = "https://files.pythonhosted.org/packages/30/84/f1f7e603cd07e877520b5a1e48e006cbc1fe448806cabbaa98aa732f530d/aiokafka-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f9e8ab97b935ca681a5f28cf22cf2b5112be86728876b3ec07e4ed5fc6c21f2d", size = 370960, upload-time = "2024-10-26T20:52:51.235Z" }, + { url = "https://files.pythonhosted.org/packages/d7/c7/5237b3687198c2129c0bafa4a96cf8ae3883e20cc860125bafe16af3778e/aiokafka-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ed991c120fe19fd9439f564201dd746c4839700ef270dd4c3ee6d4895f64fe83", size = 366597, upload-time = "2024-10-26T20:52:52.539Z" }, + { url = "https://files.pythonhosted.org/packages/6b/67/0154551292ec1c977e5def178ae5c947773e921aefb6877971e7fdf1942e/aiokafka-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c01abf9787b1c3f3af779ad8e76d5b74903f590593bc26f33ed48750503e7f7", size = 1152905, upload-time = "2024-10-26T20:52:54.089Z" }, + { url = "https://files.pythonhosted.org/packages/d9/20/69f913a76916e94c4e783dc7d0d05a25c384b25faec33e121062c62411fe/aiokafka-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08c84b3894d97fd02fcc8886f394000d0f5ce771fab5c498ea2b0dd2f6b46d5b", size = 1171893, upload-time = "2024-10-26T20:52:56.14Z" }, + { url = "https://files.pythonhosted.org/packages/16/65/41cc1b19e7dea623ef58f3bf1e2720377c5757a76d9799d53a1b5fc39255/aiokafka-0.12.0-cp313-cp313-win32.whl", hash = "sha256:63875fed922c8c7cf470d9b2a82e1b76b4a1baf2ae62e07486cf516fd09ff8f2", size = 345933, upload-time = "2024-10-26T20:52:57.518Z" }, + { url = "https://files.pythonhosted.org/packages/bf/0d/4cb57231ff650a01123a09075bf098d8fdaf94b15a1a58465066b2251e8b/aiokafka-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:bdc0a83eb386d2384325d6571f8ef65b4cfa205f8d1c16d7863e8d10cacd995a", size = 363194, upload-time = "2024-10-26T20:52:59.434Z" }, ] [[package]] @@ -630,11 +624,11 @@ wheels = [ [[package]] name = "dishka" -version = "1.6.0" +version = "1.7.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/11/04/f3add05678a3ac1ab7736faae45b18b5365d84b1cd3cf3af64b09a1d6a5f/dishka-1.6.0.tar.gz", hash = 
"sha256:f1fa5ec7e980d4f618d0c425d1bb81d8e9414894d8ec6553b197d2298774e12f", size = 65971, upload-time = "2025-05-18T21:40:53.259Z" } +sdist = { url = "https://files.pythonhosted.org/packages/40/d7/1be31f5ef32387059190353f9fa493ff4d07a1c75fa856c7566ca45e0800/dishka-1.7.2.tar.gz", hash = "sha256:47d4cb5162b28c61bf5541860e605ed5eaf5c667122299c7ef657c86fc8d5a49", size = 68132, upload-time = "2025-09-24T21:23:05.135Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/76/6b/f9cd08543c4f55bf129a0ebce5c09e43528235dd6e7cb906761ca094979a/dishka-1.6.0-py3-none-any.whl", hash = "sha256:ab1aedee152ce7bb11cfd2673d7ce4001fe2b330d14e84535d7525a68430b2c2", size = 90789, upload-time = "2025-05-18T21:40:51.352Z" }, + { url = "https://files.pythonhosted.org/packages/b7/b9/89381173b4f336e986d72471198614806cd313e0f85c143ccb677c310223/dishka-1.7.2-py3-none-any.whl", hash = "sha256:f6faa6ab321903926b825b3337d77172ee693450279b314434864978d01fbad3", size = 94774, upload-time = "2025-09-24T21:23:03.246Z" }, ] [[package]] @@ -686,6 +680,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, ] +[[package]] +name = "fast-depends" +version = "3.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/07/f3/41e955f5f0811de6ef9f00f8462f2ade7bc4a99b93714c9b134646baa831/fast_depends-3.0.5.tar.gz", hash = "sha256:c915a54d6e0d0f0393686d37c14d54d9ec7c43d7b9def3f3fc4f7b4d52f67f2a", size = 18235, upload-time = "2025-11-30T20:26:12.92Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/dd/76697228ae63dcbaf0a0a1b20fc996433a33f184ac4f578382b681dcf5ea/fast_depends-3.0.5-py3-none-any.whl", hash = "sha256:38a3d7044d3d6d0b1bed703691275c870316426e8a9bfa6b1c89e979b15659e2", size = 25362, upload-time = "2025-11-30T20:26:10.96Z" }, +] + +[package.optional-dependencies] +pydantic = [ + { name = "pydantic" }, +] + [[package]] name = "fastapi" version = "0.128.0" @@ -736,6 +748,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/93/b44f67589e4d439913dab6720f7e3507b0fa8b8e56d06f6fc875ced26afb/fastavro-1.12.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:43ded16b3f4a9f1a42f5970c2aa618acb23ea59c4fcaa06680bdf470b255e5a8", size = 3386636, upload-time = "2025-10-10T15:42:18.974Z" }, ] +[[package]] +name = "faststream" +version = "0.6.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "fast-depends", extra = ["pydantic"] }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/cc/26deefd97a3d51205554d4fe69ffc2a9144515cda20cb7185be27e11166e/faststream-0.6.6.tar.gz", hash = "sha256:de87502e22db0372131165221728c6993b29d42ba29aaaa0a27d1249803f2ddd", size = 302712, upload-time = "2026-02-03T18:08:35.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/17/169728098799d4f5c4978f9b83d2dc41541eee02ec8547149a085acf11dd/faststream-0.6.6-py3-none-any.whl", hash = "sha256:4aca70628b526d8e27771f1f8edf9cd0a80a62f335a2721ddbbc863e6098f269", size = 507654, upload-time = "2026-02-03T18:08:34.347Z" }, +] + +[package.optional-dependencies] +kafka = [ + { name = "aiokafka" }, +] + [[package]] name = "fonttools" version 
= "4.61.1" @@ -1081,6 +1112,7 @@ dependencies = [ { name = "exceptiongroup" }, { name = "fastapi" }, { name = "fastavro" }, + { name = "faststream", extra = ["kafka"] }, { name = "fonttools" }, { name = "frozenlist" }, { name = "google-auth" }, @@ -1196,7 +1228,7 @@ requires-dist = [ { name = "aiofiles", specifier = "==25.1.0" }, { name = "aiohappyeyeballs", specifier = "==2.6.1" }, { name = "aiohttp", specifier = "==3.13.3" }, - { name = "aiokafka", specifier = "==0.13.0" }, + { name = "aiokafka", specifier = "==0.12.0" }, { name = "aiosignal", specifier = "==1.4.0" }, { name = "aiosmtplib", specifier = "==3.0.2" }, { name = "annotated-doc", specifier = "==0.0.4" }, @@ -1219,13 +1251,14 @@ requires-dist = [ { name = "contourpy", specifier = "==1.3.3" }, { name = "cycler", specifier = "==0.12.1" }, { name = "deprecated", specifier = "==1.2.14" }, - { name = "dishka", specifier = "==1.6.0" }, + { name = "dishka", specifier = "==1.7.2" }, { name = "dnspython", specifier = "==2.7.0" }, { name = "durationpy", specifier = "==0.9" }, { name = "email-validator", specifier = "==2.3.0" }, { name = "exceptiongroup", specifier = "==1.2.2" }, { name = "fastapi", specifier = "==0.128.0" }, { name = "fastavro", specifier = "==1.12.1" }, + { name = "faststream", extras = ["kafka"], specifier = "==0.6.6" }, { name = "fonttools", specifier = "==4.61.1" }, { name = "frozenlist", specifier = "==1.7.0" }, { name = "google-auth", specifier = "==2.47.0" }, diff --git a/backend/workers/dlq_processor.py b/backend/workers/dlq_processor.py index d36899fb..6f2561cc 100644 --- a/backend/workers/dlq_processor.py +++ b/backend/workers/dlq_processor.py @@ -1,202 +1,65 @@ import asyncio -import logging -import signal -from contextlib import AsyncExitStack -from datetime import datetime, timezone -from aiokafka import AIOKafkaConsumer from app.core.container import create_dlq_processor_container -from app.core.database_context import Database -from app.core.tracing import EventAttributes -from app.core.tracing.utils import extract_trace_context, get_tracer -from app.db.docs import ALL_DOCUMENTS -from app.dlq import DLQMessage, RetryPolicy, RetryStrategy +from app.core.logging import setup_logger +from app.core.tracing import init_tracing from app.dlq.manager import DLQManager -from app.domain.enums.kafka import GroupId, KafkaTopic +from app.domain.enums.kafka import GroupId +from app.events.broker import create_broker +from app.events.handlers import register_dlq_subscriber +from app.events.schema.schema_registry import SchemaRegistryManager from app.settings import Settings -from beanie import init_beanie -from opentelemetry.trace import SpanKind +from dishka.integrations.faststream import setup_dishka +from faststream import FastStream -def _configure_retry_policies(manager: DLQManager, logger: logging.Logger) -> None: - manager.set_retry_policy( - "execution-requests", - RetryPolicy( - topic="execution-requests", - strategy=RetryStrategy.EXPONENTIAL_BACKOFF, - max_retries=5, - base_delay_seconds=30, - max_delay_seconds=300, - retry_multiplier=2.0, - ), - ) - manager.set_retry_policy( - "pod-events", - RetryPolicy( - topic="pod-events", - strategy=RetryStrategy.EXPONENTIAL_BACKOFF, - max_retries=3, - base_delay_seconds=60, - max_delay_seconds=600, - retry_multiplier=3.0, - ), - ) - manager.set_retry_policy( - "resource-allocation", - RetryPolicy(topic="resource-allocation", strategy=RetryStrategy.IMMEDIATE, max_retries=3), - ) - manager.set_retry_policy( - "websocket-events", - RetryPolicy( - topic="websocket-events", 
strategy=RetryStrategy.FIXED_INTERVAL, max_retries=10, base_delay_seconds=10 - ), - ) - manager.default_retry_policy = RetryPolicy( - topic="default", - strategy=RetryStrategy.EXPONENTIAL_BACKOFF, - max_retries=4, - base_delay_seconds=60, - max_delay_seconds=1800, - retry_multiplier=2.5, - ) +def main() -> None: + """Main entry point for DLQ processor worker.""" + settings = Settings(override_path="config.dlq-processor.toml") + logger = setup_logger(settings.LOG_LEVEL) -def _configure_filters(manager: DLQManager, testing: bool, logger: logging.Logger) -> None: - if not testing: + logger.info("Starting DLQ Processor worker...") - def filter_test_events(message: DLQMessage) -> bool: - event_id = message.event.event_id or "" - return not event_id.startswith("test-") + if settings.ENABLE_TRACING: + init_tracing( + service_name=GroupId.DLQ_MANAGER, + settings=settings, + logger=logger, + service_version=settings.TRACING_SERVICE_VERSION, + enable_console_exporter=False, + sampling_rate=settings.TRACING_SAMPLING_RATE, + ) + logger.info("Tracing initialized for DLQ Processor") - manager.add_filter(filter_test_events) + # Create Kafka broker and register DLQ subscriber + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + register_dlq_subscriber(broker, settings) - def filter_old_messages(message: DLQMessage) -> bool: - max_age_days = 7 - age_seconds = (datetime.now(timezone.utc) - message.failed_at).total_seconds() - return age_seconds < (max_age_days * 24 * 3600) + # Create DI container with broker in context + container = create_dlq_processor_container(settings, broker) + setup_dishka(container, broker=broker, auto_inject=True) - manager.add_filter(filter_old_messages) + app = FastStream(broker) + @app.on_startup + async def startup() -> None: + # Resolving DLQManager triggers Database init (via dependency), + # configures retry policies/filters, and starts APScheduler retry monitor + await container.get(DLQManager) + logger.info("DLQ Processor infrastructure initialized") -async def _consume_messages( - consumer: AIOKafkaConsumer, - manager: DLQManager, - stop_event: asyncio.Event, - logger: logging.Logger, -) -> None: - """Consume DLQ messages and dispatch each to the stateless handler.""" - async for msg in consumer: - if stop_event.is_set(): - break - try: - start = asyncio.get_running_loop().time() - dlq_msg = manager.parse_kafka_message(msg) - await manager.repository.save_message(dlq_msg) + @app.on_shutdown + async def shutdown() -> None: + await container.close() + logger.info("DLQ Processor shutdown complete") - manager.metrics.record_dlq_message_received(dlq_msg.original_topic, dlq_msg.event.event_type) - manager.metrics.record_dlq_message_age( - (datetime.now(timezone.utc) - dlq_msg.failed_at).total_seconds() - ) + async def run() -> None: + await app.run() - ctx = extract_trace_context(dlq_msg.headers) - with get_tracer().start_as_current_span( - name="dlq.consume", - context=ctx, - kind=SpanKind.CONSUMER, - attributes={ - EventAttributes.KAFKA_TOPIC: manager.dlq_topic, - EventAttributes.EVENT_TYPE: dlq_msg.event.event_type, - EventAttributes.EVENT_ID: dlq_msg.event.event_id, - }, - ): - await manager.handle_message(dlq_msg) - - await consumer.commit() - manager.metrics.record_dlq_processing_duration(asyncio.get_running_loop().time() - start, "process") - except Exception as e: - logger.error(f"Error processing DLQ message: {e}") - - -async def _monitor_retries( - manager: DLQManager, - stop_event: asyncio.Event, - 
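The retry policies removed here are now configured when DLQManager is resolved through DLQProvider (per the new startup comment), parameterized by strategy, max retries, base/max delay and multiplier. The delay formula itself lives inside RetryPolicy and is not shown in this change; under the parameters above, a standard exponential-backoff computation would behave like the sketch below (function name is illustrative):

```python
# Illustrative sketch only: how base_delay_seconds, retry_multiplier and
# max_delay_seconds would typically combine. The real logic lives in app.dlq
# and is not part of this change set.
def exponential_backoff_delay(
    attempt: int,
    base_delay_seconds: float = 30.0,
    max_delay_seconds: float = 300.0,
    retry_multiplier: float = 2.0,
) -> float:
    """Delay (seconds) before retry `attempt`, counted from 0, capped at the max."""
    return min(base_delay_seconds * retry_multiplier**attempt, max_delay_seconds)


# With the execution-requests policy (base 30s, multiplier 2.0, cap 300s), the
# five allowed retries wait roughly 30, 60, 120, 240 and 300 seconds.
```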
logger: logging.Logger, -) -> None: - """Periodically process due retries and update queue metrics.""" - while not stop_event.is_set(): - try: - await manager.process_due_retries() - await manager.update_queue_metrics() - interval = 10 - except Exception as e: - logger.error(f"Error in DLQ monitor: {e}") - interval = 60 - try: - await asyncio.wait_for(stop_event.wait(), timeout=interval) - break # stop_event was set - except asyncio.TimeoutError: - continue - - -async def main(settings: Settings) -> None: - """Run the DLQ processor. - - DLQ lifecycle events (received, retried, discarded) are emitted to the - dlq_events Kafka topic for external observability. Logging is handled - internally by the DLQ manager. - """ - container = create_dlq_processor_container(settings) - logger = await container.get(logging.Logger) - logger.info("Starting DLQ Processor...") - - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - manager = await container.get(DLQManager) - - _configure_retry_policies(manager, logger) - _configure_filters(manager, testing=settings.TESTING, logger=logger) - - topic_name = f"{settings.KAFKA_TOPIC_PREFIX}{KafkaTopic.DEAD_LETTER_QUEUE}" - consumer = AIOKafkaConsumer( - topic_name, - bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS, - group_id=GroupId.DLQ_MANAGER, - enable_auto_commit=False, - auto_offset_reset="earliest", - client_id="dlq-manager-consumer", - session_timeout_ms=settings.KAFKA_SESSION_TIMEOUT_MS, - heartbeat_interval_ms=settings.KAFKA_HEARTBEAT_INTERVAL_MS, - max_poll_interval_ms=settings.KAFKA_MAX_POLL_INTERVAL_MS, - request_timeout_ms=settings.KAFKA_REQUEST_TIMEOUT_MS, - ) - - stop_event = asyncio.Event() - loop = asyncio.get_running_loop() - - def signal_handler() -> None: - logger.info("Received signal, initiating shutdown...") - stop_event.set() - - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, signal_handler) - - async with AsyncExitStack() as stack: - stack.push_async_callback(container.close) - await consumer.start() - stack.push_async_callback(consumer.stop) - - consume_task = asyncio.create_task(_consume_messages(consumer, manager, stop_event, logger)) - monitor_task = asyncio.create_task(_monitor_retries(manager, stop_event, logger)) - - logger.info("DLQ Processor running") - await stop_event.wait() - - consume_task.cancel() - monitor_task.cancel() - await asyncio.gather(consume_task, monitor_task, return_exceptions=True) - logger.info("DLQ Processor stopped") + asyncio.run(run()) if __name__ == "__main__": - asyncio.run(main(Settings(override_path="config.dlq-processor.toml"))) + main() diff --git a/backend/workers/run_coordinator.py b/backend/workers/run_coordinator.py index 03702b49..d2d5ae30 100644 --- a/backend/workers/run_coordinator.py +++ b/backend/workers/run_coordinator.py @@ -1,48 +1,16 @@ import asyncio -import logging -import signal from app.core.container import create_coordinator_container from app.core.database_context import Database from app.core.logging import setup_logger from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS from app.domain.enums.kafka import GroupId -from app.events.core import UnifiedConsumer -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas +from app.events.broker import create_broker +from app.events.handlers import register_coordinator_subscriber +from app.events.schema.schema_registry import SchemaRegistryManager from app.settings import Settings -from beanie 
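The coordinator, k8s-worker, pod-monitor, result-processor and saga entry points below repeat the bootstrap shape introduced for the DLQ processor above: build a KafkaBroker, register the worker's subscriber(s), create the DI container with the broker in context, wire dishka into FastStream, and drive startup/shutdown through lifecycle hooks. A minimal distillation of that shape, assuming nothing beyond what the rewritten entry points show (`create_worker_app` itself is hypothetical and does not exist in the codebase):

```python
# Illustrative distillation of the per-worker bootstrap used in these entry points;
# create_worker_app and run_worker are hypothetical helpers, not part of this change.
import asyncio

from dishka import AsyncContainer
from dishka.integrations.faststream import setup_dishka
from faststream import FastStream
from faststream.kafka import KafkaBroker


def create_worker_app(broker: KafkaBroker, container: AsyncContainer, warmup: type) -> FastStream:
    """Attach dishka and the shared startup/shutdown hooks to a FastStream app."""
    setup_dishka(container, broker=broker, auto_inject=True)
    app = FastStream(broker)

    @app.on_startup
    async def startup() -> None:
        # Resolving one "warmup" dependency (Database or a worker service) lets the
        # providers run their own initialization, e.g. init_beanie or schedulers.
        await container.get(warmup)

    @app.on_shutdown
    async def shutdown() -> None:
        await container.close()

    return app


def run_worker(app: FastStream) -> None:
    asyncio.run(app.run())
```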
import init_beanie - - -async def run_coordinator(settings: Settings) -> None: - """Run the execution coordinator service.""" - - container = create_coordinator_container(settings) - logger = await container.get(logging.Logger) - logger.info("Starting ExecutionCoordinator with DI container...") - - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - schema_registry = await container.get(SchemaRegistryManager) - await initialize_event_schemas(schema_registry) - - # Get consumer (triggers coordinator + dispatcher + queue_manager + scheduling via DI) - await container.get(UnifiedConsumer) - - # Shutdown event - shutdown_event = asyncio.Event() - loop = asyncio.get_running_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, shutdown_event.set) - - logger.info("ExecutionCoordinator started and running") - - try: - await shutdown_event.wait() - finally: - logger.info("Initiating graceful shutdown...") - await container.close() +from dishka.integrations.faststream import setup_dishka +from faststream import FastStream def main() -> None: @@ -50,7 +18,6 @@ def main() -> None: settings = Settings(override_path="config.coordinator.toml") logger = setup_logger(settings.LOG_LEVEL) - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger.info("Starting ExecutionCoordinator worker...") @@ -65,7 +32,31 @@ def main() -> None: ) logger.info("Tracing initialized for ExecutionCoordinator") - asyncio.run(run_coordinator(settings)) + # Create Kafka broker and register subscriber + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + register_coordinator_subscriber(broker, settings) + + # Create DI container with broker in context + container = create_coordinator_container(settings, broker) + setup_dishka(container, broker=broker, auto_inject=True) + + app = FastStream(broker) + + @app.on_startup + async def startup() -> None: + await container.get(Database) # triggers init_beanie inside provider + logger.info("ExecutionCoordinator infrastructure initialized") + + @app.on_shutdown + async def shutdown() -> None: + await container.close() + logger.info("ExecutionCoordinator shutdown complete") + + async def run() -> None: + await app.run() + + asyncio.run(run()) if __name__ == "__main__": diff --git a/backend/workers/run_event_replay.py b/backend/workers/run_event_replay.py index df648269..81aac922 100644 --- a/backend/workers/run_event_replay.py +++ b/backend/workers/run_event_replay.py @@ -1,57 +1,41 @@ import asyncio import logging -from contextlib import AsyncExitStack +import signal from app.core.container import create_event_replay_container -from app.core.database_context import Database from app.core.logging import setup_logger from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS +from app.events.broker import create_broker +from app.events.schema.schema_registry import SchemaRegistryManager from app.services.event_replay.replay_service import EventReplayService from app.settings import Settings -from beanie import init_beanie - - -async def cleanup_task(replay_service: EventReplayService, logger: logging.Logger, interval_hours: int = 6) -> None: - """Periodically clean up old replay sessions""" - while True: - try: - await asyncio.sleep(interval_hours * 3600) - removed = await replay_service.cleanup_old_sessions(older_than_hours=48) - logger.info(f"Cleaned up {removed} old 
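register_coordinator_subscriber (and the matching register_*_subscriber helpers used by the other workers) attach FastStream consumers to the broker before dishka is wired in; their bodies are not part of this change. Assuming the usual FastStream + dishka combination, such a helper would look roughly like the sketch below, where the topic name, consumer group and injected dependency are illustrative:

```python
# Sketch only: the real helpers live in app.events.handlers and their topics,
# group ids and handler signatures are not shown here.
import logging

from dishka.integrations.faststream import FromDishka
from faststream.kafka import KafkaBroker

from app.settings import Settings


def register_example_subscriber(broker: KafkaBroker, settings: Settings) -> None:
    """Register a consumer; setup_dishka(..., auto_inject=True) injects FromDishka params."""

    @broker.subscriber(
        f"{settings.KAFKA_TOPIC_PREFIX}example-topic",  # hypothetical topic name
        group_id="example-group",                       # hypothetical consumer group
    )
    async def handle(message: dict, logger: FromDishka[logging.Logger]) -> None:
        # The decoded Kafka payload arrives as `message`; container-managed
        # dependencies (here just the logger) are injected per delivered message.
        logger.info("received %s", message)
```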
replay sessions") - except Exception as e: - logger.error(f"Error during cleanup: {e}") async def run_replay_service(settings: Settings) -> None: - """Run the event replay service with cleanup task.""" + """Run the event replay service with DI-managed cleanup scheduler.""" + tmp_logger = setup_logger(settings.LOG_LEVEL) + schema_registry = SchemaRegistryManager(settings, tmp_logger) + broker = create_broker(settings, schema_registry, tmp_logger) - container = create_event_replay_container(settings) + container = create_event_replay_container(settings, broker) logger = await container.get(logging.Logger) logger.info("Starting EventReplayService with DI container...") - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - replay_service = await container.get(EventReplayService) - + # Resolving EventReplayService triggers Database init (via dependency) + # and starts the APScheduler cleanup scheduler (via EventReplayWorkerProvider) + await container.get(EventReplayService) logger.info("Event replay service initialized") - async with AsyncExitStack() as stack: - stack.push_async_callback(container.close) - - task = asyncio.create_task(cleanup_task(replay_service, logger)) - - async def _cancel_task() -> None: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - stack.push_async_callback(_cancel_task) + shutdown_event = asyncio.Event() + loop = asyncio.get_running_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler(sig, shutdown_event.set) - await asyncio.Event().wait() + try: + await shutdown_event.wait() + finally: + logger.info("Initiating graceful shutdown...") + await container.close() def main() -> None: @@ -59,7 +43,6 @@ def main() -> None: settings = Settings(override_path="config.event-replay.toml") logger = setup_logger(settings.LOG_LEVEL) - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger.info("Starting Event Replay Service...") diff --git a/backend/workers/run_k8s_worker.py b/backend/workers/run_k8s_worker.py index 6fe841c1..3044625b 100644 --- a/backend/workers/run_k8s_worker.py +++ b/backend/workers/run_k8s_worker.py @@ -1,61 +1,17 @@ import asyncio -import logging -import signal from app.core.container import create_k8s_worker_container from app.core.database_context import Database from app.core.logging import setup_logger from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS from app.domain.enums.kafka import GroupId -from app.events.core import UnifiedConsumer -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas +from app.events.broker import create_broker +from app.events.handlers import register_k8s_worker_subscriber +from app.events.schema.schema_registry import SchemaRegistryManager from app.services.k8s_worker import KubernetesWorker from app.settings import Settings -from beanie import init_beanie - - -async def run_kubernetes_worker(settings: Settings) -> None: - """Run the Kubernetes worker service.""" - - container = create_k8s_worker_container(settings) - logger = await container.get(logging.Logger) - logger.info("Starting KubernetesWorker with DI container...") - - # Bootstrap database - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - # Initialize schemas - schema_registry = await container.get(SchemaRegistryManager) - await initialize_event_schemas(schema_registry) - - # Get worker 
(triggers dispatcher creation and handler registration) - worker = await container.get(KubernetesWorker) - - # Get consumer (triggers consumer creation and start) - # Consumer runs in background via its internal consume loop - await container.get(UnifiedConsumer) - - # Bootstrap: ensure image pre-puller DaemonSet exists - # Save task to variable to prevent premature garbage collection - _daemonset_task = asyncio.create_task(worker.ensure_image_pre_puller_daemonset()) - logger.info("Image pre-puller daemonset task scheduled") - - # Shutdown event - shutdown_event = asyncio.Event() - loop = asyncio.get_running_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, shutdown_event.set) - - logger.info("KubernetesWorker started and running") - - try: - # Wait for shutdown signal - await shutdown_event.wait() - finally: - logger.info("Initiating graceful shutdown...") - await container.close() +from dishka.integrations.faststream import setup_dishka +from faststream import FastStream def main() -> None: @@ -63,7 +19,6 @@ def main() -> None: settings = Settings(override_path="config.k8s-worker.toml") logger = setup_logger(settings.LOG_LEVEL) - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger.info("Starting KubernetesWorker...") @@ -78,7 +33,37 @@ def main() -> None: ) logger.info("Tracing initialized for KubernetesWorker") - asyncio.run(run_kubernetes_worker(settings)) + # Create Kafka broker and register subscriber + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + register_k8s_worker_subscriber(broker, settings) + + # Create DI container with broker in context + container = create_k8s_worker_container(settings, broker) + setup_dishka(container, broker=broker, auto_inject=True) + + app = FastStream(broker) + + @app.on_startup + async def startup() -> None: + await container.get(Database) # triggers init_beanie inside provider + logger.info("KubernetesWorker infrastructure initialized") + + @app.after_startup + async def bootstrap() -> None: + worker = await container.get(KubernetesWorker) + await worker.ensure_image_pre_puller_daemonset() + logger.info("Image pre-puller daemonset applied") + + @app.on_shutdown + async def shutdown() -> None: + await container.close() + logger.info("KubernetesWorker shutdown complete") + + async def run() -> None: + await app.run() + + asyncio.run(run()) if __name__ == "__main__": diff --git a/backend/workers/run_pod_monitor.py b/backend/workers/run_pod_monitor.py index 0fbee75b..1f72277e 100644 --- a/backend/workers/run_pod_monitor.py +++ b/backend/workers/run_pod_monitor.py @@ -1,55 +1,15 @@ import asyncio -import logging -import signal from app.core.container import create_pod_monitor_container -from app.core.database_context import Database from app.core.logging import setup_logger from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS from app.domain.enums.kafka import GroupId -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.services.pod_monitor.monitor import MonitorState, PodMonitor +from app.events.broker import create_broker +from app.events.schema.schema_registry import SchemaRegistryManager +from app.services.pod_monitor.monitor import PodMonitor from app.settings import Settings -from beanie import init_beanie - -RECONCILIATION_LOG_INTERVAL: int = 60 - - -async def run_pod_monitor(settings: Settings) -> None: - 
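The `await container.get(Database)  # triggers init_beanie inside provider` pattern relies on the provider performing the Beanie bootstrap these entry points used to run inline. The actual DatabaseProvider is defined in app.core.providers and is not shown here; a provider of that shape could look like the following sketch, in which the client construction and the MONGODB_URL setting name are assumptions:

```python
# Sketch only: the real DatabaseProvider may differ; MONGODB_URL and the use of
# the client's default database are assumptions made for illustration.
from collections.abc import AsyncIterator

from beanie import init_beanie
from dishka import Provider, Scope, provide
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase

from app.db.docs import ALL_DOCUMENTS
from app.settings import Settings


class ExampleDatabaseProvider(Provider):
    @provide(scope=Scope.APP)
    async def database(self, settings: Settings) -> AsyncIterator[AsyncIOMotorDatabase]:
        client = AsyncIOMotorClient(settings.MONGODB_URL)  # hypothetical setting name
        db = client.get_default_database()
        # Resolving the database from the container now performs the Beanie
        # bootstrap that the old entry points ran via init_beanie directly.
        await init_beanie(database=db, document_models=ALL_DOCUMENTS)
        yield db
        client.close()
```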
"""Run the pod monitor service.""" - - container = create_pod_monitor_container(settings) - logger = await container.get(logging.Logger) - logger.info("Starting PodMonitor with DI container...") - - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - schema_registry = await container.get(SchemaRegistryManager) - await initialize_event_schemas(schema_registry) - - # Services are already started by the DI container providers - monitor = await container.get(PodMonitor) - - # Shutdown event - signal handlers just set this - shutdown_event = asyncio.Event() - loop = asyncio.get_running_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, shutdown_event.set) - - logger.info("PodMonitor started and running") - - try: - # Wait for shutdown signal or service to stop - while monitor.state == MonitorState.RUNNING and not shutdown_event.is_set(): - await asyncio.sleep(RECONCILIATION_LOG_INTERVAL) - status = await monitor.get_status() - logger.info(f"Pod monitor status: {status}") - finally: - # Container cleanup stops everything - logger.info("Initiating graceful shutdown...") - await container.close() +from dishka.integrations.faststream import setup_dishka +from faststream import FastStream def main() -> None: @@ -57,7 +17,6 @@ def main() -> None: settings = Settings(override_path="config.pod-monitor.toml") logger = setup_logger(settings.LOG_LEVEL) - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger.info("Starting PodMonitor worker...") @@ -72,7 +31,32 @@ def main() -> None: ) logger.info("Tracing initialized for PodMonitor Service") - asyncio.run(run_pod_monitor(settings)) + # Create Kafka broker (PodMonitor publishes events via KafkaEventService) + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + + # Create DI container with broker in context + container = create_pod_monitor_container(settings, broker) + setup_dishka(container, broker=broker, auto_inject=True) + + app = FastStream(broker) + + @app.on_startup + async def startup() -> None: + # Resolving PodMonitor triggers Database init (via dependency), + # starts the K8s watch loop, and starts the reconciliation scheduler + await container.get(PodMonitor) + logger.info("PodMonitor infrastructure initialized") + + @app.on_shutdown + async def shutdown() -> None: + await container.close() + logger.info("PodMonitor shutdown complete") + + async def run() -> None: + await app.run() + + asyncio.run(run()) if __name__ == "__main__": diff --git a/backend/workers/run_result_processor.py b/backend/workers/run_result_processor.py index a78a32de..2a1061a2 100644 --- a/backend/workers/run_result_processor.py +++ b/backend/workers/run_result_processor.py @@ -1,49 +1,16 @@ import asyncio -import logging -import signal from app.core.container import create_result_processor_container from app.core.database_context import Database from app.core.logging import setup_logger from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS from app.domain.enums.kafka import GroupId -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas -from app.services.result_processor.processor import ResultProcessor +from app.events.broker import create_broker +from app.events.handlers import register_result_processor_subscriber +from app.events.schema.schema_registry import SchemaRegistryManager from app.settings 
import Settings -from beanie import init_beanie - - -async def run_result_processor(settings: Settings) -> None: - """Run the result processor.""" - - container = create_result_processor_container(settings) - logger = await container.get(logging.Logger) - logger.info("Starting ResultProcessor with DI container...") - - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - schema_registry = await container.get(SchemaRegistryManager) - await initialize_event_schemas(schema_registry) - - # Triggers consumer start via DI - await container.get(ResultProcessor) - - shutdown_event = asyncio.Event() - loop = asyncio.get_running_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, shutdown_event.set) - - logger.info("ResultProcessor started and running") - - try: - await shutdown_event.wait() - finally: - logger.info("Initiating graceful shutdown...") - await container.close() - - logger.warning("ResultProcessor stopped") +from dishka.integrations.faststream import setup_dishka +from faststream import FastStream def main() -> None: @@ -51,7 +18,6 @@ def main() -> None: settings = Settings(override_path="config.result-processor.toml") logger = setup_logger(settings.LOG_LEVEL) - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger.info("Starting ResultProcessor worker...") @@ -66,7 +32,31 @@ def main() -> None: ) logger.info("Tracing initialized for ResultProcessor Service") - asyncio.run(run_result_processor(settings)) + # Create Kafka broker and register subscriber + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + register_result_processor_subscriber(broker, settings) + + # Create DI container with broker in context + container = create_result_processor_container(settings, broker) + setup_dishka(container, broker=broker, auto_inject=True) + + app = FastStream(broker) + + @app.on_startup + async def startup() -> None: + await container.get(Database) # triggers init_beanie inside provider + logger.info("ResultProcessor infrastructure initialized") + + @app.on_shutdown + async def shutdown() -> None: + await container.close() + logger.info("ResultProcessor shutdown complete") + + async def run() -> None: + await app.run() + + asyncio.run(run()) if __name__ == "__main__": diff --git a/backend/workers/run_saga_orchestrator.py b/backend/workers/run_saga_orchestrator.py index fd1c1e97..4f355f86 100644 --- a/backend/workers/run_saga_orchestrator.py +++ b/backend/workers/run_saga_orchestrator.py @@ -1,49 +1,16 @@ import asyncio -import logging -import signal from app.core.container import create_saga_orchestrator_container -from app.core.database_context import Database from app.core.logging import setup_logger from app.core.tracing import init_tracing -from app.db.docs import ALL_DOCUMENTS from app.domain.enums.kafka import GroupId -from app.events.schema.schema_registry import SchemaRegistryManager, initialize_event_schemas +from app.events.broker import create_broker +from app.events.handlers import register_saga_subscriber +from app.events.schema.schema_registry import SchemaRegistryManager from app.services.saga import SagaOrchestrator from app.settings import Settings -from beanie import init_beanie - - -async def run_saga_orchestrator(settings: Settings) -> None: - """Run the saga orchestrator.""" - - container = create_saga_orchestrator_container(settings) - logger = await 
container.get(logging.Logger) - logger.info("Starting SagaOrchestrator with DI container...") - - db = await container.get(Database) - await init_beanie(database=db, document_models=ALL_DOCUMENTS) - - schema_registry = await container.get(SchemaRegistryManager) - await initialize_event_schemas(schema_registry) - - # Triggers consumer start + timeout checker via DI - await container.get(SagaOrchestrator) - - shutdown_event = asyncio.Event() - loop = asyncio.get_running_loop() - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, shutdown_event.set) - - logger.info("Saga orchestrator started and running") - - try: - await shutdown_event.wait() - finally: - logger.info("Initiating graceful shutdown...") - await container.close() - - logger.warning("Saga orchestrator stopped") +from dishka.integrations.faststream import setup_dishka +from faststream import FastStream def main() -> None: @@ -51,7 +18,6 @@ def main() -> None: settings = Settings(override_path="config.saga-orchestrator.toml") logger = setup_logger(settings.LOG_LEVEL) - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger.info("Starting Saga Orchestrator worker...") @@ -66,7 +32,33 @@ def main() -> None: ) logger.info("Tracing initialized for Saga Orchestrator Service") - asyncio.run(run_saga_orchestrator(settings)) + # Create Kafka broker and register subscriber + schema_registry = SchemaRegistryManager(settings, logger) + broker = create_broker(settings, schema_registry, logger) + register_saga_subscriber(broker, settings) + + # Create DI container with broker in context + container = create_saga_orchestrator_container(settings, broker) + setup_dishka(container, broker=broker, auto_inject=True) + + app = FastStream(broker) + + @app.on_startup + async def startup() -> None: + # Resolving SagaOrchestrator triggers Database init (via dependency) + # and starts the APScheduler timeout checker (via SagaWorkerProvider) + await container.get(SagaOrchestrator) + logger.info("SagaOrchestrator infrastructure initialized") + + @app.on_shutdown + async def shutdown() -> None: + await container.close() + logger.info("SagaOrchestrator shutdown complete") + + async def run() -> None: + await app.run() + + asyncio.run(run()) if __name__ == "__main__":
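SagaWorkerProvider, referenced in the startup comment above, owns the APScheduler-managed timeout checker; its wiring is not part of this change. An interval job of that kind would look roughly like the sketch below, where the method name and the 30-second interval are assumptions:

```python
# Sketch only: SagaWorkerProvider's actual job wiring is not shown here;
# check_timeouts and the interval value are assumptions.
from apscheduler.schedulers.asyncio import AsyncIOScheduler

from app.services.saga import SagaOrchestrator


def start_timeout_checker(orchestrator: SagaOrchestrator, interval_seconds: int = 30) -> AsyncIOScheduler:
    """Schedule the orchestrator's timeout check on a fixed interval."""
    scheduler = AsyncIOScheduler()
    scheduler.add_job(
        orchestrator.check_timeouts,  # hypothetical coroutine on the orchestrator
        trigger="interval",
        seconds=interval_seconds,
    )
    scheduler.start()
    return scheduler
```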