From 3a427dbb113364b16f6228033e3e02c08a1622fe Mon Sep 17 00:00:00 2001 From: shangkunwang Date: Tue, 2 Jun 2026 18:29:41 +0000 Subject: [PATCH 1/3] fix: add jitter to agent batch calls to avoid SQLite database lock contention when num_concurrent is high --- MaxKernel/auto_agent/agent_client/run_batch_agent_call.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py b/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py index a63f2c9..f6927e3 100644 --- a/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py +++ b/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py @@ -2,6 +2,7 @@ import json import logging import os +import random import time from concurrent.futures import ThreadPoolExecutor, as_completed @@ -89,6 +90,11 @@ def process_problem( user_id = "user_0" session_id = f"session_{problem_id}_attempt_{attempt}_{int(time.time())}" + # Add random jitter to avoid SQLite database lock contention + jitter = random.uniform(0.1, 2.0) + logger.info(f"Sleeping for {jitter:.2f}s (jitter) to avoid DB lock.") + time.sleep(jitter) + client = AutoAgentClient( user_id=user_id, session_id=session_id, From 47fd938a4aafc0351f7d4f632db33dc9d062d862 Mon Sep 17 00:00:00 2001 From: shangkunwang Date: Tue, 2 Jun 2026 19:17:08 +0000 Subject: [PATCH 2/3] feat: add artifact versioning to pipeline for easier debugging --- .../auto_agent/subagents/pipeline_agent.py | 51 ++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/MaxKernel/auto_agent/subagents/pipeline_agent.py b/MaxKernel/auto_agent/subagents/pipeline_agent.py index ad06020..bc0b236 100644 --- a/MaxKernel/auto_agent/subagents/pipeline_agent.py +++ b/MaxKernel/auto_agent/subagents/pipeline_agent.py @@ -3,6 +3,7 @@ import logging import os import re +import shutil from typing import AsyncGenerator from google.adk.agents import BaseAgent @@ -85,6 +86,9 @@ async def _run_async_impl( logging.error( f"[{self.name}] Compilation failed. Looping back to planning." ) + self._save_iteration_files( + ctx, iteration, keys_to_save=["optimized_kernel_path"] + ) iteration += 1 continue @@ -99,6 +103,11 @@ async def _run_async_impl( logging.error( f"[{self.name}] Test generation/validation failed. Looping back to planning." ) + self._save_iteration_files( + ctx, + iteration, + keys_to_save=["optimized_kernel_path", "test_file_path"], + ) iteration += 1 continue @@ -111,6 +120,11 @@ async def _run_async_impl( test_results = ctx.session.state.get("test_results", {}) if not test_results.get("success", False): logging.error(f"[{self.name}] Tests failed. Looping back to planning.") + self._save_iteration_files( + ctx, + iteration, + keys_to_save=["optimized_kernel_path", "test_file_path"], + ) iteration += 1 continue @@ -161,9 +175,11 @@ async def _run_async_impl( ) logging.info(f"[{self.name}] Saved snapshot for iteration {iteration}") - # Step 7: Check if improvement is needed - needs_improvement = ctx.session.state.get("needs_improvement", False) + self._save_iteration_files(ctx, iteration) + # Step 7: Check if improvement is needed + # needs_improvement = ctx.session.state.get("needs_improvement", False) + needs_improvement = True if not needs_improvement: logging.info( f"[{self.name}] No further improvement needed or agent decided to stop. Stopping pipeline." @@ -193,6 +209,35 @@ async def _run_async_impl( ), ) + def _save_iteration_files( + self, + ctx: InvocationContext, + iteration: int, + keys_to_save: list[str] | None = None, + ): + """Saves artifacts with an iteration suffix.""" + if keys_to_save is None: + keys_to_save = [ + "optimized_kernel_path", + "test_file_path", + "autotune_specs_path", + "autotune_results_path", + ] + for path_key in keys_to_save: + path = ctx.session.state.get(path_key) + if path and os.path.exists(path): + directory, filename = os.path.split(path) + name, ext = os.path.splitext(filename) + new_filename = f"{name}_{iteration}{ext}" + new_path = os.path.join(directory, new_filename) + try: + shutil.copy2(path, new_path) + logging.info(f"[{self.name}] Copied {path_key} to {new_path}") + except Exception as e: + logging.error( + f"[{self.name}] Failed to copy {path_key} to {new_path}: {e}" + ) + def _initialize_state(self, ctx: InvocationContext) -> Event: """Initializes session state with standard paths and returns the event.""" # Initialize history @@ -274,6 +319,8 @@ def _initialize_state(self, ctx: InvocationContext) -> Event: "kernel_plan_path": ctx.session.state["kernel_plan_path"], "test_file_path": ctx.session.state["test_file_path"], "profiling_script_path": ctx.session.state["profiling_script_path"], + "autotune_specs_path": ctx.session.state["autotune_specs_path"], + "autotune_results_path": ctx.session.state["autotune_results_path"], } ), ) From 98a4b27e87c3e755abd9e840a5421062391569cd Mon Sep 17 00:00:00 2001 From: shangkunwang Date: Tue, 2 Jun 2026 19:19:40 +0000 Subject: [PATCH 3/3] feat: add TEST_EXECUTION_POLL_INTERVAL constant to TestRunner agent --- MaxKernel/auto_agent/subagents/testing/agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MaxKernel/auto_agent/subagents/testing/agent.py b/MaxKernel/auto_agent/subagents/testing/agent.py index c120de1..4165018 100644 --- a/MaxKernel/auto_agent/subagents/testing/agent.py +++ b/MaxKernel/auto_agent/subagents/testing/agent.py @@ -33,6 +33,7 @@ COMPILE_VALIDATION_TIMEOUT = 60 * 1 MOCK_EXECUTION_TIMEOUT = 60 * 3 TEST_EXECUTION_TIMEOUT = 60 * 5 +TEST_EXECUTION_POLL_INTERVAL = 20 class TestRunner(BaseAgent):