From 5403dbf702a5d6241594577e2664ec31cc28dd7a Mon Sep 17 00:00:00 2001 From: Reason-Wang Date: Wed, 13 Aug 2025 12:21:29 +0000 Subject: [PATCH] Add mock tests, fix training bug --- agents/agents/agents/agent_base.py | 14 +- agents/agents/agents/auto.py | 52 +-- agents/agents/agents/llm_backend.py | 38 +- agents/agents/agents/react/react_agent.py | 1 - agents/agents/agents/templates/utils.py | 20 +- agents/agents/agents/utils/tokenizer.py | 11 +- .../tests/unit/agents/mock_tests/__init__.py | 1 + .../tests/unit/agents/mock_tests/conftest.py | 155 ++++++++ .../mock_tests/test_mock_agent_integration.py | 324 ++++++++++++++++ .../agents/mock_tests/test_mock_auto_agent.py | 294 +++++++++++++++ .../agents/mock_tests/test_mock_code_agent.py | 251 +++++++++++++ .../mock_tests/test_mock_react_agent.py | 352 ++++++++++++++++++ agents/tests/unit/agents/test_vision_agent.py | 17 +- agents/tests/unit/tools/test_code_tool.py | 18 +- verl | 2 +- 15 files changed, 1477 insertions(+), 73 deletions(-) create mode 100644 agents/tests/unit/agents/mock_tests/__init__.py create mode 100644 agents/tests/unit/agents/mock_tests/conftest.py create mode 100644 agents/tests/unit/agents/mock_tests/test_mock_agent_integration.py create mode 100644 agents/tests/unit/agents/mock_tests/test_mock_auto_agent.py create mode 100644 agents/tests/unit/agents/mock_tests/test_mock_code_agent.py create mode 100644 agents/tests/unit/agents/mock_tests/test_mock_react_agent.py diff --git a/agents/agents/agents/agent_base.py b/agents/agents/agents/agent_base.py index e878a79..43b01f2 100644 --- a/agents/agents/agents/agent_base.py +++ b/agents/agents/agents/agent_base.py @@ -17,7 +17,7 @@ import warnings import logging from .chain.streaming_observer import ConsoleStreamObserver, StreamingManager -from .utils.tokenizer import create_tokenizer +from .utils.tokenizer import create_processor, create_tokenizer from .backend_config import BACKEND_CONFIGS try: from verl.protocol import DataProto @@ -43,7 +43,6 @@ def 
__init__( system_prompt: str = None, tools: List = None, max_length: int=8192, - debug: bool = False, backend: str = "transformers", backend_config: Any = None, reward_fn: Callable = None, @@ -51,6 +50,7 @@ def __init__( project_name: str = None, run_name: str = None, streaming: str = "console", + debug: bool = False, **kwargs # To pass other unused arguments ): """ @@ -65,6 +65,7 @@ def __init__( """ torch.set_printoptions(threshold=10_000) self.logger = get_logger(directory=os.path.join(AGENT_DATA_DIR, "debug"), filename=log_file, level="DEBUG" if debug else "INFO") + self.debug = debug self.backend = backend self.template = template self.max_length = max_length @@ -87,6 +88,8 @@ def __init__( # Create appropriate tokenizer for trajectory processing self.tokenizer = create_tokenizer(model_name_or_path) + + self.processor = create_processor(model_name_or_path) self._reward_fn = reward_fn @@ -105,8 +108,7 @@ def __init__( raise ValueError(f"Streaming mode {streaming} is not supported.") super().__init__() if kwargs: - # warnings.warn(f"Unused arguments for agent initialization: {kwargs}") - raise ValueError(f"Unused arguments for agent initialization: {kwargs}") + warnings.warn(f"Unused arguments for agent initialization: {kwargs}") def _init_llm_engine(self, model_name_or_path: str, backend: str): if isinstance(model_name_or_path, str): @@ -206,7 +208,7 @@ def trajectories(self): return trajectories - def tokenize_trajectories(self, tokenizer, return_action_mask: bool = False, return_reward_mask: bool = False): + def tokenize_trajectories(self, tokenizer = None, return_reward_mask: bool = False): if tokenizer is None: tokenizer = self.tokenizer @@ -318,7 +320,7 @@ def rewards(self): def get_verl_data_proto(self): - inputs, other_info_list = self.tokenize_trajectories(return_action_mask=True, return_reward_mask=True) + inputs, other_info_list = self.tokenize_trajectories(return_reward_mask=True) group_ids = np.array([info["group_id"] for info in other_info_list], 
dtype=object) # Do evaluation here reward_values, other_values = self.rewards diff --git a/agents/agents/agents/auto.py b/agents/agents/agents/auto.py index 19d3e27..84050f4 100644 --- a/agents/agents/agents/auto.py +++ b/agents/agents/agents/auto.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Callable, Dict, List, Optional, Type, Union from .specialized.think_agent import ThinkAgent from agents.agents.specialized.openai_agent import OpenAIAgent @@ -8,8 +8,7 @@ from .specialized.code_agent import CodeAgent from ..rewards.reward_base import get_reward_from_name -# Registry for agent types - will be populated dynamically -AGENT_MAPPING = {} + class AutoAgent: """ @@ -22,7 +21,7 @@ class AutoAgent: These agents are registered automatically. Additional custom agents can be registered using the register_agent method. """ - + AGENT_MAPPING = {} @classmethod def register_agent(cls, agent_type: str, agent_class: Type[BaseAgent]) -> None: """ @@ -32,7 +31,7 @@ def register_agent(cls, agent_type: str, agent_class: Type[BaseAgent]) -> None: agent_type: The name identifier for the agent type (e.g., 'react', 'code') agent_class: The agent class to instantiate for this type """ - AGENT_MAPPING[agent_type.lower()] = agent_class + cls.AGENT_MAPPING[agent_type.lower()] = agent_class @classmethod def _get_agent_class(cls, agent_type: str) -> Type[BaseAgent]: @@ -50,11 +49,11 @@ def _get_agent_class(cls, agent_type: str) -> Type[BaseAgent]: """ agent_type = agent_type.lower() - if agent_type not in AGENT_MAPPING: - available_types = list(AGENT_MAPPING.keys()) + if agent_type not in cls.AGENT_MAPPING: + available_types = list(cls.AGENT_MAPPING.keys()) raise ValueError(f"Unknown agent type: '{agent_type}'. 
Available types: {available_types}") - return AGENT_MAPPING[agent_type] + return cls.AGENT_MAPPING[agent_type] @classmethod def from_config(cls, config: Dict[str, Any]) -> BaseAgent: @@ -81,6 +80,14 @@ def from_config(cls, config: Dict[str, Any]) -> BaseAgent: An initialized agent instance. """ # Extract and validate required parameters + if config is None: + raise ValueError("Config could not be None") + + # construct a copy for agent_kwargs + agent_kwargs = {} + for k, v in config.items(): + agent_kwargs[k] = v + required_params = ["agent_type", "template", "tools", "backend"] missing_params = [param for param in required_params if not config.get(param)] @@ -88,20 +95,21 @@ def from_config(cls, config: Dict[str, Any]) -> BaseAgent: raise ValueError(f"Missing required parameters: {', '.join(missing_params)}") agent_type = config["agent_type"] + agent_kwargs.pop("agent_type") tools = get_tools_from_names(config["tools"]) agent_class = cls._get_agent_class(agent_type) + reward_name = config.get("reward_name") + if reward_name is not None: + reward_fn = get_reward_from_name(reward_name) + agent_kwargs.pop("reward_name") + else: + reward_fn = None - # construct a copy for agent_kwargs - agent_kwargs = {} - for k, v in config.items(): - agent_kwargs[k] = v - - agent_kwargs.pop("agent_type") agent_kwargs['tools'] = tools - if "reward_name" in config and config["reward_name"] is not None: - agent_kwargs.pop("reward_name") - reward_fn = get_reward_from_name(config["reward_name"]) - agent_kwargs["reward_fn"] = reward_fn + agent_kwargs['reward_fn'] = reward_fn + + if "use_agent" in agent_kwargs: + agent_kwargs.pop("use_agent") agent = agent_class(**agent_kwargs) @@ -114,11 +122,9 @@ def from_pretrained( agent_type: str, template: str, tools: Optional[List] = None, - vllm: bool = False, debug: bool = False, log_file: str = "agent", - wrapper: bool = False, - reward_name: Optional[str] = None, + reward_fn: Optional[Callable] = None, **kwargs ) -> BaseAgent: """ @@ -147,11 
+153,9 @@ def from_pretrained( "model_name_or_path": model_name_or_path, "template": template, "tools": tools or [], - "vllm": vllm, "debug": debug, "log_file": log_file, - "wrapper": wrapper, - "reward_name": reward_name, + "reward_fn": reward_fn, **kwargs } diff --git a/agents/agents/agents/llm_backend.py b/agents/agents/agents/llm_backend.py index 726620a..3a65f00 100644 --- a/agents/agents/agents/llm_backend.py +++ b/agents/agents/agents/llm_backend.py @@ -5,11 +5,11 @@ import asyncio from asyncore import loop from collections import deque +import copy from functools import partial import time from typing import Dict, Any, List, Optional, Callable, AsyncGenerator import uuid -from .templates.utils import convert_messages_to_openai_format import numpy as np from tenacity import retry, stop_after_attempt, wait_exponential import torch @@ -24,8 +24,8 @@ import logging import PIL + LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.DEBUG) try: from verl.protocol import DataProto @@ -353,6 +353,21 @@ def _process_inputs(self, prompts: List[str], vision_inputs: Dict[str, List[PIL. def generate(self, messages_list: str, **kwargs) -> str: raise NotImplementedError("Async Verl backend does not support sync generation") + + def _convert_to_openai_chat_without_tool_call_processing(self, messages: list) -> list: + """ + We use the pure generated content as the history. So we don't want any tool call to be part of the history. + This is used when models are not openai's official models like GPT-4o. 
+ """ + messages = copy.deepcopy(messages) + for message in messages: + if "tool_calls" in message: + del message["tool_calls"] + if "tool_call_id" in message: + del message["tool_call_id"] + if "tool_choice" in message: + del message["tool_choice"] + return messages async def generate_async(self, messages_list: str, **kwargs) -> str: """Generate text from prompt using Verl""" @@ -360,7 +375,7 @@ async def generate_async(self, messages_list: str, **kwargs) -> str: generation_config = {} tensors = torch.ones(len(messages_list), dtype=torch.int64) - messages_list = [convert_messages_to_openai_format(messages) for messages in messages_list] + messages_list = [self._convert_to_openai_chat_without_tool_call_processing(messages) for messages in messages_list] tools = kwargs.get("tools", None) tools_list = np.array([tools] * len(messages_list)) data = {"input_ids": tensors, "raw_prompt": np.array(messages_list), "tools": tools_list} @@ -457,6 +472,21 @@ async def _call(self, messages: List[List[Dict]], **kw) -> str: loop = asyncio.get_running_loop() return await loop.run_in_executor(None, partial(self._blocking_call, messages, **kw)) + def _convert_to_openai_chat_without_tool_call_processing(self, messages: list) -> list: + """ + We use the pure generated content as the history. So we don't want any tool call to be part of the history. + This is used when models are not openai's official models like GPT-4o. 
+ TODO: we need to add support for openai models + """ + messages = copy.deepcopy(messages) + for message in messages: + if "tool_calls" in message: + del message["tool_calls"] + if "tool_call_id" in message: + del message["tool_call_id"] + if "tool_choice" in message: + del message["tool_choice"] + return messages # Public API ‑‑ sync or async depending on caller's context def async_generate( @@ -478,7 +508,7 @@ def async_generate( else: messages_list = messages # batch print(f"[ClientBackend] messages_list: {messages_list}") - messages_list = [convert_messages_to_openai_format(messages) for messages in messages_list] + messages_list = [self._convert_to_openai_chat_without_tool_call_processing(messages) for messages in messages_list] async def _runner(): tasks = [asyncio.create_task(self._call(_input, **kwargs)) for _input in messages_list] diff --git a/agents/agents/agents/react/react_agent.py b/agents/agents/agents/react/react_agent.py index e176149..3c52228 100644 --- a/agents/agents/agents/react/react_agent.py +++ b/agents/agents/agents/react/react_agent.py @@ -123,7 +123,6 @@ def __init__(self, model_name_or_path=model_name_or_path, tools=tools, system_prompt=system_prompt, - max_length=8192, **kwargs ) diff --git a/agents/agents/agents/templates/utils.py b/agents/agents/agents/templates/utils.py index 9f62182..9e6b383 100644 --- a/agents/agents/agents/templates/utils.py +++ b/agents/agents/agents/templates/utils.py @@ -22,22 +22,6 @@ def strip_ansi(s: str) -> str: return ANSI_RE.sub('', s) -def convert_messages_to_openai_format(messages: list) -> list: - """ - Convert messages to OpenAI format. 
- TODO: add more processing for other types of content - """ - messages = copy.deepcopy(messages) - for message in messages: - # if "tool_calls" in message: - # del message["tool_calls"] - # if "tool_call_id" in message: - # del message["tool_call_id"] - if "tool_choice" in message: - del message["tool_choice"] - return messages - - def convert_messages_to_hf_format(messages: list) -> list: """ Convert messages to Hugging Face format. @@ -305,9 +289,7 @@ def compare_hf_template(tokenizer, template_name, messages=None, tools=None, add plain_highlighted_prompt = strip_ansi(highlighted_prompt) is_equal_between_implemented_prompts = implemented_prompt == plain_highlighted_prompt jinja_template = chat.template.jinja_template() - # Save jinja template to file - with open("jinja_template.jinja", "w") as f: - f.write(jinja_template) + tokenizer.chat_template = jinja_template implemented_jinja_prompt = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools, add_generation_prompt=add_generation_prompt) is_equal_between_jinja_prompts = implemented_jinja_prompt == implemented_prompt diff --git a/agents/agents/agents/utils/tokenizer.py b/agents/agents/agents/utils/tokenizer.py index 00ab8fe..ed9f675 100644 --- a/agents/agents/agents/utils/tokenizer.py +++ b/agents/agents/agents/utils/tokenizer.py @@ -1,4 +1,4 @@ -from transformers import AutoTokenizer +from transformers import AutoProcessor, AutoTokenizer def create_tokenizer(model_name_or_path: str): try: @@ -8,3 +8,12 @@ def create_tokenizer(model_name_or_path: str): tokenizer = None return tokenizer + + +def create_processor(model_name_or_path: str): + try: + processor = AutoProcessor.from_pretrained(model_name_or_path) + except OSError: + processor = None + + return processor \ No newline at end of file diff --git a/agents/tests/unit/agents/mock_tests/__init__.py b/agents/tests/unit/agents/mock_tests/__init__.py new file mode 100644 index 0000000..e1d31e9 --- /dev/null +++ 
b/agents/tests/unit/agents/mock_tests/__init__.py @@ -0,0 +1 @@ +# Mock tests package for agents diff --git a/agents/tests/unit/agents/mock_tests/conftest.py b/agents/tests/unit/agents/mock_tests/conftest.py new file mode 100644 index 0000000..9b0433b --- /dev/null +++ b/agents/tests/unit/agents/mock_tests/conftest.py @@ -0,0 +1,155 @@ +import pytest +import os +from unittest.mock import Mock, patch, AsyncMock +from typing import Dict, Any, List + + +@pytest.fixture +def mock_llm_engine(): + """Mock LLM engine for testing""" + mock_engine = Mock() + mock_engine.generate_async = AsyncMock() + mock_engine.generate = Mock() + return mock_engine + + +@pytest.fixture +def mock_tokenizer(): + """Mock tokenizer for testing""" + mock_tok = Mock() + mock_tok.encode = Mock(return_value=[1, 2, 3, 4, 5]) + mock_tok.decode = Mock(return_value="Mocked decoded text") + mock_tok.pad_token_id = 0 + mock_tok.eos_token_id = 1 + return mock_tok + + +@pytest.fixture +def mock_processor(): + """Mock processor for testing""" + mock_proc = Mock() + mock_proc.encode = Mock(return_value={"input_ids": [1, 2, 3, 4, 5]}) + mock_proc.decode = Mock(return_value="Mocked processed text") + return mock_proc + + +@pytest.fixture +def mock_tools(): + """Mock tools for testing""" + mock_code_interpreter = Mock() + mock_code_interpreter.name = "code_interpreter" + mock_code_interpreter.description = "Run Python code" + mock_code_interpreter.schema = { + "name": "code_interpreter", + "description": "Run Python code", + "parameters": { + "type": "object", + "properties": { + "code": {"type": "string", "description": "Python code to execute"} + }, + "required": ["code"] + } + } + + mock_answer = Mock() + mock_answer.name = "answer" + mock_answer.description = "Provide final answer" + mock_answer.schema = { + "name": "answer", + "description": "Provide final answer", + "parameters": { + "type": "object", + "properties": { + "text": {"type": "string", "description": "The answer text"} + }, + "required": 
["text"] + } + } + + mock_google_search = Mock() + mock_google_search.name = "google_search" + mock_google_search.description = "Search the web" + mock_google_search.schema = { + "name": "google_search", + "description": "Search the web", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "Search query"} + }, + "required": ["query"] + } + } + + return { + "code_interpreter": mock_code_interpreter, + "answer": mock_answer, + "google_search": mock_google_search + } + + +@pytest.fixture +def mock_responses(): + """Mock model responses for testing""" + return { + "code_agent": [ + "I'll solve this math problem step by step.\n```python\n# Calculate the speed\ns = 9 / 4 # 9 km in 4 hours\nprint(f'Speed: {s} km/h')\n```", + "Now let me calculate the time for s + 0.5 speed.\n```python\nnew_speed = s + 0.5\nnew_time = 9 / new_speed\nprint(f'New time: {new_time} hours')\n```", + "The walk takes 204 minutes including coffee shop time." + ], + "react_agent": [ + "Thought: I need to search for information about Python programming.\nAction: google_search\nInput: {\"query\": \"Python programming language features\"}", + "Thought: Based on the search results, I can now provide an answer.\nAction: answer\nInput: {\"text\": \"Python is a high-level programming language known for its simplicity and readability.\"}" + ], + "think_agent": [ + "Let me think about this step by step.\n\nFirst, I need to understand the problem...\n\nBased on my reasoning, the answer is 42." 
+ ] + } + + +@pytest.fixture +def test_config(): + """Provide test configuration based on environment""" + if os.environ.get('CI'): + return { + "backend": "client", + "model": "microsoft/DialoGPT-small", # Smaller CPU-compatible model + "max_steps": 2, + "num_chains": 2, + "use_mock": True + } + else: + return { + "backend": "async_vllm", + "model": "Qwen/Qwen2.5-3B-Instruct", + "max_steps": 4, + "num_chains": 5, + "use_mock": False + } + + +@pytest.fixture +def mock_chain_generation(): + """Mock chain generation methods""" + with patch('agents.agents.agent_base.ChainGeneration.run_async') as mock_run, \ + patch('agents.agents.agent_base.ChainGeneration.get_messages') as mock_get_messages, \ + patch('agents.agents.agent_base.ChainGeneration.tokenize_trajectories') as mock_tokenize: + + mock_run.return_value = None + mock_get_messages.return_value = [{"role": "assistant", "content": "Mocked response"}] + mock_tokenize.return_value = {"input_ids": [[1, 2, 3, 4, 5]], "attention_mask": [[1, 1, 1, 1, 1]]} + + yield { + "run_async": mock_run, + "get_messages": mock_get_messages, + "tokenize_trajectories": mock_tokenize + } + + +@pytest.fixture +def mock_reward_function(): + """Mock reward function for testing""" + mock_reward = Mock() + mock_reward.__call__ = Mock(return_value=0.85) + mock_reward.name = "mock_reward" + return mock_reward diff --git a/agents/tests/unit/agents/mock_tests/test_mock_agent_integration.py b/agents/tests/unit/agents/mock_tests/test_mock_agent_integration.py new file mode 100644 index 0000000..e61a9d1 --- /dev/null +++ b/agents/tests/unit/agents/mock_tests/test_mock_agent_integration.py @@ -0,0 +1,324 @@ +import pytest +from unittest.mock import Mock, patch, AsyncMock +from agents.agents.auto import AutoAgent +from agents.agents.react.react_agent import ReactAgent +from agents.agents.specialized.code_agent import CodeAgent + + +class TestMockAgentIntegration: + """Integration tests for multiple agents working together with mocked 
dependencies""" + + def test_agent_workflow_code_to_react(self, mock_tools, mock_chain_generation): + """Test workflow where CodeAgent generates code that ReactAgent uses""" + # Create CodeAgent + code_tools = [mock_tools["code_interpreter"]] + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=code_tools, + template="qwen-7b-chat", + backend="client" + ) + + # Create ReactAgent + react_tools = [mock_tools["google_search"], mock_tools["answer"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + task_info="Use code execution results to provide answers", + backend="client" + ) + + # Test that both agents can be created and configured + assert isinstance(code_agent, CodeAgent) + assert isinstance(react_agent, ReactAgent) + assert len(code_agent.tools) == 1 + assert len(react_agent.tools) == 2 + + # Test that both agents have the expected methods + assert hasattr(code_agent, 'parse') + assert hasattr(react_agent, 'parse') + assert hasattr(code_agent, 'run_async') + assert hasattr(react_agent, 'run_async') + + def test_agent_workflow_react_to_code(self, mock_tools, mock_chain_generation): + """Test workflow where ReactAgent decides to use CodeAgent""" + # Create ReactAgent with code execution capability + react_tools = [mock_tools["google_search"], mock_tools["code_interpreter"], mock_tools["answer"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + task_info="Search for information and execute code when needed", + backend="client" + ) + + # Test that ReactAgent can handle code execution tools + assert len(react_agent.tools) == 3 + tool_names = [tool.name for tool in react_agent.tools] + assert "google_search" in tool_names + assert "code_interpreter" in tool_names + assert "answer" in tool_names + + # Test system prompt includes code execution + assert "code_interpreter" in react_agent.system_prompt + + def test_auto_agent_workflow(self, mock_tools, 
mock_chain_generation): + """Test AutoAgent creating different agent types in sequence""" + # Create ReactAgent via AutoAgent + react_config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen-7b-chat", + "tools": [mock_tools["google_search"], mock_tools["answer"]], + "backend": "client" + } + + react_agent = AutoAgent.from_config(react_config) + assert isinstance(react_agent, ReactAgent) + + # Create CodeAgent via AutoAgent + code_config = { + "agent_type": "code", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen-7b-chat", + "tools": [mock_tools["code_interpreter"]], + "backend": "client" + } + + code_agent = AutoAgent.from_config(code_config) + assert isinstance(code_agent, CodeAgent) + + # Test that both agents work independently + assert react_agent.agent_type != code_agent.agent_type + assert len(react_agent.tools) != len(code_agent.tools) + + def test_agent_tool_sharing(self, mock_tools, mock_chain_generation): + """Test that agents can share common tools""" + # Create agents with overlapping tools + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=[mock_tools["code_interpreter"]], + template="qwen-7b-chat", + backend="client" + ) + + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=[mock_tools["code_interpreter"], mock_tools["answer"]], + template="qwen2.5", + backend="client" + ) + + # Test that both agents can use the shared tool + assert code_agent.tools[0].name == "code_interpreter" + assert react_agent.tools[0].name == "code_interpreter" + + # Test that the tool has the same schema in both agents + assert code_agent.tools[0].schema == react_agent.tools[0].schema + + def test_agent_response_parsing_integration(self, mock_tools, mock_chain_generation): + """Test that different agents can parse each other's response formats""" + # Create a response that could come from either agent + mixed_response = """Thought: I need to calculate something. 
+Action: code_interpreter +Input: {"code": "print(2 + 2)"}""" + + # Test ReactAgent parsing this response + react_tools = [mock_tools["code_interpreter"], mock_tools["answer"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + backend="client" + ) + + react_result = react_agent.parse([mixed_response], react_tools) + assert len(react_result) == 1 + assert react_result[0]["role"] == "assistant" + + # Test CodeAgent parsing a code-focused response + code_response = "I'll solve this step by step.\n```python\nx = 2 + 2\nprint(x)\n```" + code_tools = [mock_tools["code_interpreter"]] + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=code_tools, + template="qwen-7b-chat", + backend="client" + ) + + code_result = code_agent.parse([code_response], code_tools) + assert len(code_result) == 1 + assert code_result[0]["role"] == "assistant" + + def test_agent_backend_compatibility(self, mock_tools, mock_chain_generation): + """Test that agents work with different backends""" + backends = ["client", "transformers"] + + for backend in backends: + # Test ReactAgent with different backends + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=[mock_tools["google_search"]], + template="qwen2.5", + backend=backend + ) + assert react_agent.backend == backend + + # Test CodeAgent with different backends + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=[mock_tools["code_interpreter"]], + template="qwen-7b-chat", + backend=backend + ) + assert code_agent.backend == backend + + def test_agent_error_handling_integration(self, mock_tools, mock_chain_generation): + """Test error handling across different agent types""" + # Test ReactAgent with malformed input + react_tools = [mock_tools["google_search"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + backend="client" + ) + + malformed_response = "Thought: I need to search.\nAction: 
google_search\nInput: {invalid json" + react_result = react_agent.parse([malformed_response], react_tools) + assert len(react_result) == 1 + assert len(react_result[0]["tool_calls"]) == 0 + + # Test CodeAgent with malformed input + code_tools = [mock_tools["code_interpreter"]] + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=code_tools, + template="qwen-7b-chat", + backend="client" + ) + + malformed_code_response = "```python\nx = 1\n" # Missing closing ``` + code_result = code_agent.parse([malformed_code_response], code_tools) + assert len(code_result) == 1 + assert len(code_result[0]["tool_calls"]) == 0 + + def test_agent_template_compatibility(self, mock_tools, mock_chain_generation): + """Test that agents work with different templates""" + templates = ["qwen-7b-chat", "qwen2.5", "qwen2.5-no-tool"] + + for template in templates: + # Test ReactAgent with different templates + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=[mock_tools["google_search"]], + template=template, + backend="client" + ) + assert react_agent.template == template + + # Test CodeAgent with different templates + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=[mock_tools["code_interpreter"]], + template=template, + backend="client" + ) + assert code_agent.template == template + + def test_agent_async_operations_integration(self, mock_tools, mock_llm_engine): + """Test async operations across different agent types""" + # Mock LLM engine for both agents + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + # Test ReactAgent async operations + react_tools = [mock_tools["google_search"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + backend="client" + ) + react_agent.llm_engine = mock_llm_engine + + # Test CodeAgent async operations + code_tools = [mock_tools["code_interpreter"]] + code_agent = CodeAgent( + 
"Qwen/Qwen2.5-3B-Instruct", + tools=code_tools, + template="qwen-7b-chat", + backend="client" + ) + code_agent.llm_engine = mock_llm_engine + + # Verify both agents can use the same LLM engine + assert react_agent.llm_engine is mock_llm_engine + assert code_agent.llm_engine is mock_llm_engine + + def test_agent_system_prompt_integration(self, mock_tools, mock_chain_generation): + """Test that system prompts are properly integrated across agents""" + # Test ReactAgent system prompt + react_tools = [mock_tools["google_search"], mock_tools["answer"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + task_info="Test task for integration", + backend="client" + ) + + # Test CodeAgent system prompt + code_tools = [mock_tools["code_interpreter"]] + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=code_tools, + template="qwen-7b-chat", + backend="client" + ) + + # Verify both agents have appropriate system prompts + assert "ReAct-style agent" in react_agent.system_prompt + assert "multi-turn manner" in code_agent.system_prompt + assert "Test task for integration" in react_agent.system_prompt + + # Verify tool information is included in system prompts + for tool in react_agent.tools: + assert tool.name in react_agent.system_prompt + + for tool in code_agent.tools: + assert tool.name in code_agent.system_prompt + + def test_agent_chain_generation_integration(self, mock_tools, mock_chain_generation): + """Test that chain generation methods work across different agent types""" + # Test ReactAgent chain generation + react_tools = [mock_tools["google_search"]] + react_agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=react_tools, + template="qwen2.5", + backend="client" + ) + + # Test CodeAgent chain generation + code_tools = [mock_tools["code_interpreter"]] + code_agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=code_tools, + template="qwen-7b-chat", + backend="client" + ) + + # Verify both agents 
have chain generation methods + for agent in [react_agent, code_agent]: + assert hasattr(agent, 'run_async') + assert hasattr(agent, 'get_messages') + assert hasattr(agent, 'tokenize_trajectories') + + # Test that methods can be called (they're mocked) + messages = agent.get_messages() + assert isinstance(messages, list) + + trajectories = agent.tokenize_trajectories() + assert isinstance(trajectories, dict) diff --git a/agents/tests/unit/agents/mock_tests/test_mock_auto_agent.py b/agents/tests/unit/agents/mock_tests/test_mock_auto_agent.py new file mode 100644 index 0000000..365f795 --- /dev/null +++ b/agents/tests/unit/agents/mock_tests/test_mock_auto_agent.py @@ -0,0 +1,294 @@ +import pytest +from unittest.mock import Mock, patch, AsyncMock +from agents.agents.auto import AutoAgent +from agents.agents.react.react_agent import ReactAgent +from agents.agents.specialized.code_agent import CodeAgent +from agents.rewards import qa_f1_reward + +def test_auto_agent_registration(): + """Test agent registration functionality""" + # Test that built-in agents are registered + assert "react" in AutoAgent.AGENT_MAPPING + assert "code" in AutoAgent.AGENT_MAPPING + + # Test custom agent registration + class CustomAgent: + pass + + AutoAgent.register_agent("custom", CustomAgent) + assert "custom" in AutoAgent.AGENT_MAPPING + assert AutoAgent.AGENT_MAPPING["custom"] == CustomAgent + + +def test_auto_agent_from_config_react(): + """Test creating ReactAgent from config""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"], + "backend": "client" + } + + agent = AutoAgent.from_config(config) + + assert isinstance(agent, ReactAgent) + assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct" + assert agent.template == "qwen2.5" + assert len(agent.tools) == 2 + assert agent.backend == "client" + +def test_auto_agent_from_config_code(): + """Test creating CodeAgent from config""" 
+ config = { + "agent_type": "code", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["code_interpreter"], + "backend": "client" + } + + agent = AutoAgent.from_config(config) + + assert isinstance(agent, CodeAgent) + assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct" + assert len(agent.tools) == 1 + assert agent.backend == "client" + +def test_auto_agent_from_config_with_reward(): + """Test creating agent with reward function""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search"], + "reward_fn": qa_f1_reward, + "backend": "client" + } + + agent = AutoAgent.from_config(config) + + assert isinstance(agent, ReactAgent) + +def test_auto_agent_from_pretrained(): + """Test creating agent using from_pretrained method""" + agent = AutoAgent.from_pretrained( + model_name_or_path="Qwen/Qwen2.5-3B-Instruct", + agent_type="react", + template="qwen2.5", + tools=["google_search", "answer"], + debug=True, + backend="client" + ) + + assert isinstance(agent, ReactAgent) + assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct" + assert agent.template == "qwen2.5" + assert agent.backend == "client" + +def test_auto_agent_from_config_missing_params(): + """Test config validation with missing parameters""" + # Missing agent_type + config1 = { + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"], + "backend": "client" + } + + with pytest.raises(ValueError, match="Missing required parameter"): + AutoAgent.from_config(config1) + + # Missing template + config2 = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "tools": ["google_search", "answer"], + "backend": "client" + } + + with pytest.raises(ValueError, match="Missing required parameter"): + AutoAgent.from_config(config2) + + # Missing tools + config3 = { + "agent_type": "react", + 
"model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "backend": "client" + } + + with pytest.raises(ValueError, match="Missing required parameter"): + AutoAgent.from_config(config3) + + # Missing backend + config4 = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"] + } + + with pytest.raises(ValueError, match="Missing required parameter"): + AutoAgent.from_config(config4) + +def test_auto_agent_from_config_invalid_type(): + """Test config validation with invalid agent type""" + config = { + "agent_type": "invalid_type", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"], + "backend": "client" + } + + with pytest.raises(ValueError, match="Unknown agent type"): + AutoAgent.from_config(config) + +def test_auto_agent_tool_loading(): + """Test that tools are properly loaded from names""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"], + "backend": "client" + } + + agent = AutoAgent.from_config(config) + assert len(agent.tools) == 2 + assert agent.tools[0].name == "google_search" + assert agent.tools[1].name == "answer" + + +def test_auto_agent_debug_mode(): + """Test debug mode configuration""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search"], + "backend": "client", + "debug": True + } + + agent = AutoAgent.from_config(config) + assert agent.debug is True + +def test_auto_agent_log_file_configuration(): + """Test log file configuration""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search"], + "backend": "client", + "log_file": "test_agent" + } + + agent = AutoAgent.from_config(config) + assert 
hasattr(agent, 'logger') + +def test_auto_agent_max_length_configuration(): + """Test max length configuration""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search"], + "backend": "client", + "max_length": 4096 + } + + agent = AutoAgent.from_config(config) + assert agent.max_length == 4096 + +def test_auto_agent_task_info_configuration(): + """Test task info configuration for ReactAgent""" + task_info = "Use web search to find information and provide answers" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"], + "backend": "client", + "task_info": task_info + } + + agent = AutoAgent.from_config(config) + assert isinstance(agent, ReactAgent) + assert task_info in agent.system_prompt + +def test_auto_agent_custom_agent_registration(): + """Test custom agent registration and usage""" + class CustomTestAgent: + def __init__(self, **kwargs): + self.config = kwargs + + # Register custom agent + AutoAgent.register_agent("custom_test", CustomTestAgent) + + # Test that it's registered + assert "custom_test" in AutoAgent.AGENT_MAPPING + + # Test creating custom agent + config = { + "agent_type": "custom_test", + "model_name_or_path": "test-model", + "template": "test-template", + "tools": ["answer"], + "backend": "client" + } + + agent = AutoAgent.from_config(config) + assert isinstance(agent, CustomTestAgent) + +def test_auto_agent_error_handling(): + """Test error handling in agent creation""" + # Test with completely invalid config + with pytest.raises(ValueError): + AutoAgent.from_config({}) + + # Test with None config + with pytest.raises(ValueError): + AutoAgent.from_config(None) + +def test_auto_agent_environment_specific_config(test_config): + """Test environment-specific configuration""" + if test_config["use_mock"]: + # CI environment - use smaller model and fewer steps + 
config = { + "agent_type": "react", + "model_name_or_path": test_config["model"], + "template": "qwen2.5", + "tools": ["google_search"], + "backend": test_config["backend"] + } + + agent = AutoAgent.from_config(config) + assert agent.backend == test_config["backend"] + assert agent.model_name_or_path == test_config["model"] + +def test_auto_agent_tool_validation(): + """Test that tools are properly validated and stored""" + config = { + "agent_type": "react", + "model_name_or_path": "Qwen/Qwen2.5-3B-Instruct", + "template": "qwen2.5", + "tools": ["google_search", "answer"], + "backend": "client" + } + + agent = AutoAgent.from_config(config) + + # Verify tools are properly stored + assert len(agent.tools) == 2 + tool_names = [tool.name for tool in agent.tools] + assert "google_search" in tool_names + assert "answer" in tool_names + + # Verify tool schemas + for tool in agent.tools: + assert hasattr(tool, 'name') + assert hasattr(tool, 'description') + assert hasattr(tool, 'schema') diff --git a/agents/tests/unit/agents/mock_tests/test_mock_code_agent.py b/agents/tests/unit/agents/mock_tests/test_mock_code_agent.py new file mode 100644 index 0000000..69bf824 --- /dev/null +++ b/agents/tests/unit/agents/mock_tests/test_mock_code_agent.py @@ -0,0 +1,251 @@ +import pytest +from unittest.mock import Mock, patch, AsyncMock +from agents.agents.specialized.code_agent import CodeAgent, extract_python_code_markdown, CodeAgentSystemPrompt + +def test_code_agent_initialization(): + """Test CodeAgent initialization without GPU dependencies""" + tools = ["code_interpreter"] + + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client", # Use client backend for CI + debug=True + ) + + # Test basic initialization + assert agent is not None + assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct" + assert agent.template == "qwen2.5" + assert agent.backend == "client" + assert len(agent.tools) == 1 + assert agent.max_length == 
8192 + + # Test system prompt + assert "multi-turn manner" in agent.system_prompt + assert "python code" in agent.system_prompt.lower() + assert "code interpreter" in agent.system_prompt.lower() + + +def test_extract_python_code_markdown(): + """Test Python code extraction from markdown""" + # Test single code block + text1 = "Here's some code:\n```python\nprint('Hello')\n```\nThat's it." + result1 = extract_python_code_markdown(text1) + assert len(result1) == 1 + assert "print('Hello')" in result1[0] + + # Test multiple code blocks + text2 = "First:\n```python\nx = 1\n```\nSecond:\n```python\ny = 2\n```" + result2 = extract_python_code_markdown(text2) + assert len(result2) == 2 + assert "x = 1" in result2[0] + assert "y = 2" in result2[1] + + # Test no code blocks + text3 = "Just regular text with no code." + result3 = extract_python_code_markdown(text3) + assert len(result3) == 0 + + # Test code block with different spacing + text4 = "```python\n x = 42 \n```" + result4 = extract_python_code_markdown(text4) + assert len(result4) == 1 + assert "x = 42" in result4[0] + +def test_code_agent_parse_single_code_block(mock_tools): + """Test parsing responses with single code blocks""" + tools = [mock_tools["code_interpreter"]] + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = [ + "I'll solve this step by step.\n```python\nx = 9 / 4\nprint(f'Speed: {x} km/h')\n```" + ] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert "I'll solve this step by step" in result[0]["content"][0]["text"] + assert len(result[0]["tool_calls"]) == 1 + assert result[0]["tool_calls"][0]["function"]["name"] == "code_interpreter" + assert "x = 9 / 4" in result[0]["tool_calls"][0]["function"]["arguments"] + assert result[0]["status"] == "continue" + assert result[0]["loss"] is True + +def test_code_agent_parse_no_code_block(mock_tools): + 
"""Test parsing responses with no code blocks""" + tools = [mock_tools["code_interpreter"]] + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = [ + "I'll solve this problem step by step." + ] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert "I'll solve this problem step by step" in result[0]["content"][0]["text"] + assert len(result[0]["tool_calls"]) == 0 + assert result[0]["status"] == "terminal" + assert result[0]["loss"] is True + + +def test_code_agent_parse_multiple_code_blocks(self): + """Test parsing responses with multiple code blocks (should fail)""" + tools = ["code_interpreter"] + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = [ + "Here's the first step:\n```python\nx = 1\n```\nAnd the second:\n```python\ny = 2\n```" + ] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert len(result[0]["tool_calls"]) == 0 + assert result[0]["status"] == "terminal" + +def test_code_agent_parse_final_answer(): + """Test parsing responses with final answer""" + tools = ["code_interpreter"] + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = [ + "The final answer is 204 minutes" + ] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert "204 minutes" in result[0]["content"][0]["text"] + assert len(result[0]["tool_calls"]) == 0 + assert result[0]["status"] == "terminal" + +def test_code_agent_with_mock_llm_engine(mock_llm_engine): + """Test CodeAgent with mocked LLM engine""" + tools = ["code_interpreter"] + + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + agent = CodeAgent( + 
"Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Mock the LLM engine + agent.llm_engine = mock_llm_engine + + # Test that the agent can be created and configured + assert agent.llm_engine is not None + assert hasattr(agent.llm_engine, 'generate_async') + +def test_code_agent_tool_schema_validation(): + """Test that CodeAgent properly handles tool schemas""" + tools = ["code_interpreter"] + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Verify tool is properly stored + assert len(agent.tools) == 1 + assert agent.tools[0].name == "code_interpreter" + assert "Run Python code" in agent.tools[0].description + +def test_code_agent_error_handling(): + """Test CodeAgent error handling in parsing""" + tools = ["code_interpreter"] + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Test with malformed response + malformed_responses = [ + "```python\nx = 1\n" # Missing closing ``` + ] + + result = agent.parse(malformed_responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert len(result[0]["tool_calls"]) == 0 + assert result[0]["status"] == "terminal" + +def test_code_agent_chain_generation_integration(mock_chain_generation): + """Test CodeAgent integration with chain generation methods""" + tools = ["code_interpreter"] + + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Test that chain generation methods are available + assert hasattr(agent, 'run_async') + assert hasattr(agent, 'get_messages') + assert hasattr(agent, 'tokenize_trajectories') + +@pytest.mark.asyncio +async def test_code_agent_async_operations(mock_llm_engine): + """Test CodeAgent async operations with mocked 
dependencies""" + tools = ["code_interpreter"] + + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + agent = CodeAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + agent.llm_engine = mock_llm_engine + + # Mock the generate_async method + mock_llm_engine.generate_async.return_value = [ + "I'll solve this step by step.\n```python\nx = 9 / 4\nprint(f'Speed: {x} km/h')\n```" + ] + + # Test that async generation can be called + result = await agent.llm_engine.generate_async(["test"]) + assert len(result) == 1 + assert "```python" in result[0] diff --git a/agents/tests/unit/agents/mock_tests/test_mock_react_agent.py b/agents/tests/unit/agents/mock_tests/test_mock_react_agent.py new file mode 100644 index 0000000..84baf3a --- /dev/null +++ b/agents/tests/unit/agents/mock_tests/test_mock_react_agent.py @@ -0,0 +1,352 @@ +import pytest +from unittest.mock import Mock, patch, AsyncMock +from agents.agents.react.react_agent import ReactAgent, parse_react_step, extract_tool_calls, ReactSystemPromptTemplate + + +class TestMockReactAgent: + """Test ReactAgent with mocked dependencies for CI environments""" + + def test_react_agent_initialization(self, mock_tools): + """Test ReactAgent initialization without GPU dependencies""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + task_info = "Test search task" + + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + task_info=task_info, + backend="client" # Use client backend for CI + ) + + # Test basic initialization + assert agent is not None + assert agent.model_name_or_path == "Qwen/Qwen2.5-3B-Instruct" + assert agent.template == "qwen2.5" + assert agent.backend == "client" + assert len(agent.tools) == 2 + assert agent.max_length == 8192 + + # Test system prompt contains task info and tools + assert task_info in agent.system_prompt + assert "google_search" in 
agent.system_prompt + assert "answer" in agent.system_prompt + assert "ReAct-style agent" in agent.system_prompt + + def test_react_agent_system_prompt_formatting(self, mock_tools): + """Test that ReactAgent system prompt is correctly formatted""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + task_info = "Search for information and provide answers" + + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + task_info=task_info, + backend="client" + ) + + # Check system prompt structure + assert "Think→Act→Observe" in agent.system_prompt + assert "Thought:" in agent.system_prompt + assert "Action:" in agent.system_prompt + assert "Input:" in agent.system_prompt + assert "Answer:" in agent.system_prompt + assert task_info in agent.system_prompt + + # Check tool schemas are included + assert "google_search" in agent.system_prompt + assert "answer" in agent.system_prompt + + def test_react_agent_no_task_info(self, mock_tools): + """Test ReactAgent initialization without task info""" + tools = [mock_tools["google_search"]] + + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Should still have basic system prompt + assert "ReAct-style agent" in agent.system_prompt + assert len(agent.tools) == 1 + + def test_parse_react_step_complete(self): + """Test parsing complete ReAct step""" + text = """Thought: I need to find information about Python. +Action: google_search +Input: {"query": "Python programming language"}""" + + result = parse_react_step(text) + + assert result["thought"] == "I need to find information about Python." + assert result["action"] == "google_search" + assert result["input"] == '{"query": "Python programming language"}' + + def test_parse_react_step_missing_components(self): + """Test parsing ReAct step with missing components""" + text = "Thought: I'm thinking about something." 
+ result = parse_react_step(text) + + assert result["thought"] == "I'm thinking about something." + assert result["action"] is None + assert result["input"] is None + + def test_parse_react_step_action_only(self): + """Test parsing ReAct step with only action""" + text = "Action: search\nInput: {\"query\": \"test\"}" + result = parse_react_step(text) + + assert result["thought"] is None + assert result["action"] == "search" + assert result["input"] == '{"query": "test"}' + + def test_parse_react_step_case_insensitive(self): + """Test parsing ReAct step with different case""" + text = "THOUGHT: I need to think.\nACTION: search\nINPUT: {\"query\": \"test\"}" + result = parse_react_step(text) + + assert result["thought"] == "I need to think." + assert result["action"] == "search" + assert result["input"] == '{"query": "test"}' + + def test_parse_react_step_multiline_thought(self): + """Test parsing ReAct step with multiline thought""" + text = """Thought: I need to think about this +step by step. First, I should consider +the user's request carefully. 
+Action: search +Input: {"query": "multiline test"}""" + + result = parse_react_step(text) + + assert "step by step" in result["thought"] + assert "First, I should consider" in result["thought"] + assert result["action"] == "search" + assert result["input"] == '{"query": "multiline test"}' + + def test_extract_tool_calls_valid_json(self): + """Test extracting tool calls from valid JSON input""" + action_input = '{"name": "google_search", "arguments": {"query": "test"}}' + result = extract_tool_calls(action_input) + + assert len(result) == 1 + assert result[0]["name"] == "google_search" + assert result[0]["arguments"] == {"query": "test"} + + def test_extract_tool_calls_invalid_json(self): + """Test extracting tool calls from invalid JSON input""" + action_input = '{"name": "google_search", "arguments": {"query": "test"}' # Missing } + result = extract_tool_calls(action_input) + + assert len(result) == 0 + + def test_extract_tool_calls_none_input(self): + """Test extracting tool calls from None input""" + result = extract_tool_calls(None) + assert len(result) == 0 + + def test_extract_tool_calls_empty_string(self): + """Test extracting tool calls from empty string""" + result = extract_tool_calls("") + assert len(result) == 0 + + def test_react_agent_parse_single_tool_call(self, mock_tools): + """Test parsing responses with single tool call""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = ["""Thought: I need to search for information. +Action: google_search +Input: {"query": "test query"}"""] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert "Thought: I need to search for information." 
in result[0]["content"][0]["text"] + assert len(result[0]["tool_calls"]) == 1 + assert result[0]["tool_calls"][0]["function"]["name"] == "google_search" + assert result[0]["tool_calls"][0]["function"]["arguments"] == {"query": "test query"} + assert result[0]["loss"] is True + + def test_react_agent_parse_no_tool_call(self, mock_tools): + """Test parsing responses with no tool call""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = ["Thought: I'm thinking about this problem."] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert "Thought: I'm thinking about this problem." in result[0]["content"][0]["text"] + assert len(result[0]["tool_calls"]) == 0 + assert result[0]["loss"] is True + + def test_react_agent_parse_final_answer(self, mock_tools): + """Test parsing responses with final answer""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = ["""Thought: I have enough information now. +Action: answer +Input: {"text": "The answer is 42."}"""] + + result = agent.parse(responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert "The answer is 42." in str(result[0]["tool_calls"][0]["function"]["arguments"]) + assert result[0]["tool_calls"][0]["function"]["name"] == "answer" + + def test_react_agent_parse_multiple_responses(self, mock_tools): + """Test parsing multiple responses""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + responses = [ + """Thought: I need to search for information. 
+Action: google_search +Input: {"query": "first query"}""", + """Thought: Now I can provide an answer. +Action: answer +Input: {"text": "Final answer"}""" + ] + + result = agent.parse(responses, tools) + + assert len(result) == 2 + assert result[0]["tool_calls"][0]["function"]["name"] == "google_search" + assert result[1]["tool_calls"][0]["function"]["name"] == "answer" + + def test_react_agent_tool_schema_validation(self, mock_tools): + """Test that ReactAgent properly handles tool schemas""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Verify tools are properly stored + assert len(agent.tools) == 2 + tool_names = [tool.name for tool in agent.tools] + assert "google_search" in tool_names + assert "answer" in tool_names + + def test_react_agent_with_mock_llm_engine(self, mock_tools, mock_llm_engine): + """Test ReactAgent with mocked LLM engine""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Mock the LLM engine + agent.llm_engine = mock_llm_engine + + # Test that the agent can be created and configured + assert agent.llm_engine is not None + assert hasattr(agent.llm_engine, 'generate_async') + + def test_react_agent_chain_generation_integration(self, mock_tools, mock_chain_generation): + """Test ReactAgent integration with chain generation methods""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Test that chain generation methods are 
available + assert hasattr(agent, 'run_async') + assert hasattr(agent, 'get_messages') + assert hasattr(agent, 'tokenize_trajectories') + + @pytest.mark.asyncio + async def test_react_agent_async_operations(self, mock_tools, mock_llm_engine): + """Test ReactAgent async operations with mocked dependencies""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + + with patch('agents.agents.agent_base.BaseAgent._setup_backend') as mock_setup: + mock_setup.return_value = None + + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + agent.llm_engine = mock_llm_engine + + # Mock the generate_async method + mock_llm_engine.generate_async.return_value = [ + "Thought: I need to search.\nAction: google_search\nInput: {\"query\": \"test\"}" + ] + + # Test that async generation can be called + result = await agent.llm_engine.generate_async(["test"]) + assert len(result) == 1 + assert "Thought:" in result[0] + assert "Action:" in result[0] + + def test_react_agent_error_handling(self, mock_tools): + """Test ReactAgent error handling in parsing""" + tools = [mock_tools["google_search"], mock_tools["answer"]] + agent = ReactAgent( + "Qwen/Qwen2.5-3B-Instruct", + tools=tools, + template="qwen2.5", + backend="client" + ) + + # Test with malformed JSON in input + malformed_responses = [ + """Thought: I need to search. 
+Action: google_search +Input: {"query": "test query""" # Missing closing } + ] + + result = agent.parse(malformed_responses, tools) + + assert len(result) == 1 + assert result[0]["role"] == "assistant" + # Should handle malformed input gracefully + assert len(result[0]["tool_calls"]) == 0 diff --git a/agents/tests/unit/agents/test_vision_agent.py b/agents/tests/unit/agents/test_vision_agent.py index 1759861..0f67ab5 100644 --- a/agents/tests/unit/agents/test_vision_agent.py +++ b/agents/tests/unit/agents/test_vision_agent.py @@ -1,21 +1,19 @@ from agents.agents.react.react_agent import ReactAgent -from agents.tools import google_search_serper, answer - +from agents.tools import answer_qa import pytest @pytest.mark.asyncio(loop_scope="session") async def test_vision_agent(): - tools = [google_search_serper, answer] + tools = [answer_qa] - task_info = "Use web search to get answers." + task_info = "Answer the question based on the image." react_agent = ReactAgent( "Qwen/Qwen2.5-VL-3B-Instruct", tools=tools, template="qwen2.5-vl", task_info=task_info, - backend="async_vllm", - debug=True + backend="async_vllm" ) messages = [ @@ -41,6 +39,9 @@ async def test_vision_agent(): start_messages=messages, num_chains=10 ) - - inputs = react_agent.tokenize_trajectories(return_action_mask=True) + messages_list = react_agent.get_messages() + messages = messages_list[0]['messages'] + for message in messages: + print(f"{message['role']}: {message['content']}") + inputs = react_agent.tokenize_trajectories() print(inputs) \ No newline at end of file diff --git a/agents/tests/unit/tools/test_code_tool.py b/agents/tests/unit/tools/test_code_tool.py index 80f25d2..388845f 100644 --- a/agents/tests/unit/tools/test_code_tool.py +++ b/agents/tests/unit/tools/test_code_tool.py @@ -45,14 +45,14 @@ async def test_double_release(): await code_interpreter.release(id="x") # must return instantly -@pytest.mark.asyncio(loop_scope="session") -async def test_global_clean(): +# 
@pytest.mark.asyncio(loop_scope="session") +# async def test_global_clean(): - async def one_chain(i): - await code_interpreter(id=f"c{i}", code="x=1") - # We don't release the env here, so it will be cleaned up automatically - # await code_interpreter.release_env(id=f"c{i}") +# async def one_chain(i): +# await code_interpreter(id=f"c{i}", code="x=1") +# # We don't release the env here, so it will be cleaned up automatically +# # await code_interpreter.release_env(id=f"c{i}") - await asyncio.gather(*[ - one_chain(i) for i in range(code_interpreter.pool_size-5) - ]) \ No newline at end of file +# await asyncio.gather(*[ +# one_chain(i) for i in range(code_interpreter.pool_size-5) +# ]) \ No newline at end of file diff --git a/verl b/verl index 861f63b..237d9ca 160000 --- a/verl +++ b/verl @@ -1 +1 @@ -Subproject commit 861f63ba8097a43ababe27116842512783080586 +Subproject commit 237d9cacd2ede001c21f1a1daa44e8e8598993e1