From 3a427dbb113364b16f6228033e3e02c08a1622fe Mon Sep 17 00:00:00 2001
From: shangkunwang <shangkunwang@google.com>
Date: Tue, 2 Jun 2026 18:29:41 +0000
Subject: [PATCH 1/3] fix: add jitter to agent batch calls to avoid SQLite
 database lock contention when num_concurrent is high

---
 MaxKernel/auto_agent/agent_client/run_batch_agent_call.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py b/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py
index a63f2c9..f6927e3 100644
--- a/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py
+++ b/MaxKernel/auto_agent/agent_client/run_batch_agent_call.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+import random
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -89,6 +90,11 @@ def process_problem(
     user_id = "user_0"
     session_id = f"session_{problem_id}_attempt_{attempt}_{int(time.time())}"
 
+    # Add random jitter to avoid SQLite database lock contention
+    jitter = random.uniform(0.1, 2.0)
+    logger.info(f"Sleeping for {jitter:.2f}s (jitter) to avoid DB lock.")
+    time.sleep(jitter)
+
     client = AutoAgentClient(
       user_id=user_id,
       session_id=session_id,

From 47fd938a4aafc0351f7d4f632db33dc9d062d862 Mon Sep 17 00:00:00 2001
From: shangkunwang <shangkunwang@google.com>
Date: Tue, 2 Jun 2026 19:17:08 +0000
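Subject: [PATCH 2/3] feat: add artifact versioning to pipeline for easier
 debugging

Copy artifacts to <name>_<iteration><ext> next to the originals: the
kernel on compilation failure, the kernel and test file on test
generation or test failure, and the kernel, test, and autotune
specs/results files after the iteration snapshot is saved.

Note: this change also hard-codes needs_improvement = True (the
session-state lookup is commented out), so the "No further improvement
needed" stop branch is never taken.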
---
 .../auto_agent/subagents/pipeline_agent.py    | 51 ++++++++++++++++++-
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/MaxKernel/auto_agent/subagents/pipeline_agent.py b/MaxKernel/auto_agent/subagents/pipeline_agent.py
index ad06020..bc0b236 100644
--- a/MaxKernel/auto_agent/subagents/pipeline_agent.py
+++ b/MaxKernel/auto_agent/subagents/pipeline_agent.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import re
+import shutil
 from typing import AsyncGenerator
 
 from google.adk.agents import BaseAgent
@@ -85,6 +86,9 @@ async def _run_async_impl(
         logging.error(
           f"[{self.name}] Compilation failed. Looping back to planning."
         )
+        self._save_iteration_files(
+          ctx, iteration, keys_to_save=["optimized_kernel_path"]
+        )
         iteration += 1
         continue
 
@@ -99,6 +103,11 @@ async def _run_async_impl(
         logging.error(
           f"[{self.name}] Test generation/validation failed. Looping back to planning."
         )
+        self._save_iteration_files(
+          ctx,
+          iteration,
+          keys_to_save=["optimized_kernel_path", "test_file_path"],
+        )
         iteration += 1
         continue
 
@@ -111,6 +120,11 @@ async def _run_async_impl(
       test_results = ctx.session.state.get("test_results", {})
       if not test_results.get("success", False):
         logging.error(f"[{self.name}] Tests failed. Looping back to planning.")
+        self._save_iteration_files(
+          ctx,
+          iteration,
+          keys_to_save=["optimized_kernel_path", "test_file_path"],
+        )
         iteration += 1
         continue
 
@@ -161,9 +175,11 @@ async def _run_async_impl(
       )
       logging.info(f"[{self.name}] Saved snapshot for iteration {iteration}")
 
-      # Step 7: Check if improvement is needed
-      needs_improvement = ctx.session.state.get("needs_improvement", False)
+      self._save_iteration_files(ctx, iteration)
 
+      # Step 7: Check if improvement is needed
+      # needs_improvement = ctx.session.state.get("needs_improvement", False)
+      needs_improvement = True
       if not needs_improvement:
         logging.info(
           f"[{self.name}] No further improvement needed or agent decided to stop. Stopping pipeline."
@@ -193,6 +209,35 @@ async def _run_async_impl(
       ),
     )
 
+  def _save_iteration_files(
+    self,
+    ctx: InvocationContext,
+    iteration: int,
+    keys_to_save: list[str] | None = None,
+  ):
+    """Snapshots artifact files as sibling `<name>_<iteration><ext>` copies.
+
+    Unset or missing paths are skipped; copy errors are logged, not raised.
+    """
+    if keys_to_save is None:
+      keys_to_save = [
+        "optimized_kernel_path",
+        "test_file_path",
+        "autotune_specs_path",
+        "autotune_results_path",
+      ]
+    for key in keys_to_save:
+      src = ctx.session.state.get(key)
+      if not src or not os.path.exists(src):
+        continue  # Nothing on disk to snapshot for this key.
+      stem, suffix = os.path.splitext(os.path.basename(src))
+      dst = os.path.join(os.path.dirname(src), f"{stem}_{iteration}{suffix}")
+      try:
+        shutil.copy2(src, dst)
+        logging.info(f"[{self.name}] Copied {key} to {dst}")
+      except Exception as exc:
+        logging.error(f"[{self.name}] Failed to copy {key} to {dst}: {exc}")
+
   def _initialize_state(self, ctx: InvocationContext) -> Event:
     """Initializes session state with standard paths and returns the event."""
     # Initialize history
@@ -274,6 +319,8 @@ def _initialize_state(self, ctx: InvocationContext) -> Event:
           "kernel_plan_path": ctx.session.state["kernel_plan_path"],
           "test_file_path": ctx.session.state["test_file_path"],
           "profiling_script_path": ctx.session.state["profiling_script_path"],
+          "autotune_specs_path": ctx.session.state["autotune_specs_path"],
+          "autotune_results_path": ctx.session.state["autotune_results_path"],
         }
       ),
     )

From 98a4b27e87c3e755abd9e840a5421062391569cd Mon Sep 17 00:00:00 2001
From: shangkunwang <shangkunwang@google.com>
Date: Tue, 2 Jun 2026 19:19:40 +0000
Subject: [PATCH 3/3] feat: add TEST_EXECUTION_POLL_INTERVAL constant to
 TestRunner agent

---
 MaxKernel/auto_agent/subagents/testing/agent.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MaxKernel/auto_agent/subagents/testing/agent.py b/MaxKernel/auto_agent/subagents/testing/agent.py
index c120de1..4165018 100644
--- a/MaxKernel/auto_agent/subagents/testing/agent.py
+++ b/MaxKernel/auto_agent/subagents/testing/agent.py
@@ -33,6 +33,7 @@
 COMPILE_VALIDATION_TIMEOUT = 60 * 1
 MOCK_EXECUTION_TIMEOUT = 60 * 3
 TEST_EXECUTION_TIMEOUT = 60 * 5
+TEST_EXECUTION_POLL_INTERVAL = 20  # presumably seconds -- TODO confirm
 
 
 class TestRunner(BaseAgent):