Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .devcontainer/postinstall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ nvm install --lts # Consider version 18 if we want a consistent version rather t
nvm use --lts
npm install -g @anthropic-ai/claude-code

curl https://cursor.com/install -fsS | bash

git config --global --add safe.directory /workspaces/auto-questions

# Activate virtual environment
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ jobs:
steps:
- name: Check out repository
uses: actions/checkout@v4
- uses: actions/setup-python@v5
- id: setup-python
uses: actions/setup-python@v5
with:
python-version: "3.12.3"
- name: Install poetry
Expand All @@ -39,7 +40,6 @@ jobs:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root
- name: "Run the unit tests"
if: ${{ !cancelled() }}
Expand Down
42 changes: 0 additions & 42 deletions code_tests/integration_tests/test_adjacent_news_api.py

This file was deleted.

50 changes: 36 additions & 14 deletions code_tests/integration_tests/test_ai_models/test_general_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,15 @@ def _all_tests() -> list[ModelTest]:
test_data.get_cheap_user_message(),
),
ModelTest(
GeneralLlm(model="claude-4-6-sonnet"),
GeneralLlm(model="claude-sonnet-4-6"),
test_data.get_cheap_user_message(),
),
ModelTest(
GeneralLlm(model="claude-4-6-sonnet"),
GeneralLlm(model="anthropic/claude-sonnet-4-6"),
test_data.get_cheap_user_message(),
),
ModelTest(
GeneralLlm(model="anthropic/claude-sonnet-4-6"),
test_data.get_cheap_vision_message_data(),
),
ModelTest(
Expand Down Expand Up @@ -72,10 +76,15 @@ def _all_tests() -> list[ModelTest]:
model="openai/gpt-5",
responses_api=True,
tools=[{"type": "web_search"}],
reasoning_effort="minimal",
),
"What is the latest News on the Middle East? Do a single very quick search. Go as fast as you can. I just want headlines.",
),
ModelTest(
GeneralLlm(
model="openai/gpt-5",
),
test_data.get_cheap_user_message(),
),
]


Expand All @@ -89,7 +98,7 @@ def all_tests_with_names() -> list[tuple[str, ModelTest]]:


@pytest.mark.parametrize("test_name, test", all_tests_with_names())
def test_general_llm_instances_run(
def test_general_llm_instances_run_and_track_cost(
test_name: str,
test: ModelTest,
) -> None:
Expand Down Expand Up @@ -159,18 +168,31 @@ def test_litellm_params_work() -> None:
)


def test_citations_are_populated() -> None:
model = GeneralLlm(model="openrouter/perplexity/sonar", populate_citations=True)
response = asyncio.run(model.invoke("When did Abraham Lincoln die?"))
logger.info(f"Response: {response}")
assert response, "Response is empty"
assert "http" in response or "www." in response, "Citations are not populated"

model = GeneralLlm(model="openrouter/perplexity/sonar", populate_citations=False)
response = asyncio.run(model.invoke("When did Abraham Lincoln die?"))
@pytest.mark.parametrize(
"model_name, populate_citations",
[
("openrouter/perplexity/sonar-reasoning-pro", True),
("perplexity/sonar-reasoning-pro", True),
("openrouter/perplexity/sonar", True),
("perplexity/sonar", True),
("openrouter/perplexity/sonar", False),
],
)
def test_citations_are_populated(model_name: str, populate_citations: bool) -> None:
model = GeneralLlm(model=model_name, populate_citations=populate_citations)
response = asyncio.run(
model.invoke(
"When did Abraham Lincoln die? How did he die? Where is the Great Barrier Reef located?"
)
)
logger.info(f"Response: {response}")
assert response, "Response is empty"
assert "http" not in response and "www." not in response, "Citations are populated"
if populate_citations:
assert "http" in response or "www." in response, "Citations are not populated"
else:
assert (
"http" not in response and "www." not in response
), "Citations are populated"


async def test_exa_errors_with_prompt_too_long() -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from code_tests.unit_tests.test_ai_models.models_to_test import ModelsToTest
from forecasting_tools.ai_models.ai_utils.response_types import TextTokenCostResponse
from forecasting_tools.ai_models.deprecated_model_classes.deepseek_r1 import DeepSeekR1
from forecasting_tools.ai_models.deprecated_model_classes.perplexity import Perplexity
from forecasting_tools.ai_models.model_interfaces.ai_model import AiModel
from forecasting_tools.ai_models.model_interfaces.combined_llm_archetype import (
CombinedLlmArchetype,
Expand Down Expand Up @@ -99,18 +100,6 @@ async def find_number_of_hard_limit_exceptions_in_run(
############################### TESTS ########################################


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
def test_cost_manager_notices_cost_without_mocks(
subclass: type[AiModel],
) -> None:
if not issubclass(subclass, IncursCost):
raise ValueError(NOT_INCURS_COST_ERROR_MESSAGE)

max_cost = 10
cost = run_cheap_invoke_and_track_cost(subclass, max_cost)
assert cost > 0, "No cost was incurred"


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
async def test_cost_calculated_matches_actual_cost(
subclass: type[AiModel],
Expand All @@ -121,6 +110,10 @@ async def test_cost_calculated_matches_actual_cost(
pytest.skip(
"DeepSeekR1 does not have correct token-cost estimation due to reasoning tokens"
)
if issubclass(subclass, Perplexity):
pytest.skip(
"Perplexity does not have correct token-cost estimation due to search costs (probably)"
)
model = subclass()
direct_response = await model._mockable_direct_call_to_model(
model._get_cheap_input_for_invoke()
Expand All @@ -136,21 +129,6 @@ async def test_cost_calculated_matches_actual_cost(
), "Cost calculated does not match actual cost"


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
def test_cost_manager_notices_cost_with_mocks(
mocker: Mock, subclass: type[AiModel]
) -> None:
if not issubclass(subclass, IncursCost):
raise ValueError(NOT_INCURS_COST_ERROR_MESSAGE)

AiModelMockManager.mock_ai_model_direct_call_with_predefined_mock_value(
mocker, subclass
)
max_cost = 100
cost = run_cheap_invoke_and_track_cost(subclass, max_cost)
assert cost > 0, "No cost was incurred"


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
def test_error_thrown_when_limit_reached(mocker: Mock, subclass: type[AiModel]) -> None:
if not issubclass(subclass, IncursCost):
Expand Down
5 changes: 5 additions & 0 deletions code_tests/integration_tests/test_coherence_links.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import pytest

from forecasting_tools import MetaculusClient


@pytest.mark.skip(
reason="Skipping coherence links api tests. Is not needed and is broken"
)
def test_coherence_links_api():
client = MetaculusClient()
new_id = client.post_question_link(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ class PolicyProposal(BaseModel, Jsonable):
description="Full proposal with footnote references [^1], [^2], etc."
)
key_recommendations: list[str] = Field(
description="Top 3-5 actionable recommendations"
description="Top 3-5 actionable recommendations"
)
robustness_analysis: str = Field(
default="",
Expand Down
63 changes: 51 additions & 12 deletions forecasting_tools/ai_models/general_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,17 +169,13 @@ def __init__(

metaculus_prefix = "metaculus/"
exa_prefix = "exa/"
openai_prefix = "openai/"
anthropic_prefix = "anthropic/"
asknews_prefix = "asknews/"
self._use_metaculus_proxy = model.startswith(metaculus_prefix)
self._use_exa = model.startswith(exa_prefix)
self._use_asknews = model.startswith(asknews_prefix)
prefixes_in_operational_order = [
metaculus_prefix,
exa_prefix,
openai_prefix,
anthropic_prefix,
]

# prefix removal is to help with matching with model cost lists
Expand Down Expand Up @@ -341,14 +337,12 @@ async def _mockable_direct_call_to_model(
self._litellm_model, observed_no_cost=observed_no_cost
)

if (
response.model_extra
and "citations" in response.model_extra
and self.populate_citations
):
citations = response.model_extra.get("citations")
citations = typeguard.check_type(citations, list[str])
answer = fill_in_citations(citations, answer, use_citation_brackets=False)
if self.populate_citations:
citations = self._extract_citations(response, choices)
if citations:
answer = fill_in_citations(
citations, answer, use_citation_brackets=False
)
# TODO: Add citation support for Gemini - https://ai.google.dev/gemini-api/docs/google-search#attributing_sources_with_inline_citations

await asyncio.sleep(
Expand All @@ -366,6 +360,51 @@ async def _mockable_direct_call_to_model(

return response

@staticmethod
def _extract_citations(
response: ModelResponse, choices: list[Choices]
) -> list[str]:
if response.model_extra and "citations" in response.model_extra:
citations = response.model_extra.get("citations")
return typeguard.check_type(citations, list[str])

# OpenRouter returns Perplexity citations as url_citation annotations
# rather than in model_extra["citations"]. The annotations are the
# flat source URL list duplicated (once with titles, once without),
# NOT one-per-occurrence. All start_index/end_index are 0.
# We deduplicate to reconstruct the original indexed list where
# urls[i] corresponds to citation [i+1] in the text.
message = choices[0].message
annotations = getattr(message, "annotations", None)
if not annotations:
return []
all_urls: list[str] = []
for annotation in annotations:
if not isinstance(annotation, dict):
continue
if annotation.get("type") != "url_citation":
continue
url_info = annotation.get("url_citation", {})
url = url_info.get("url", "")
if url:
all_urls.append(url)

seen: set[str] = set()
unique_urls: list[str] = []
for url in all_urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)

num_unique = len(unique_urls)
num_total = len(all_urls)
if num_total != num_unique and num_total != num_unique * 2:
raise ValueError(
f"Expected annotations to contain each URL once or twice, "
f"but got {num_total} total URLs and {num_unique} unique URLs"
)
return unique_urls

def _normalize_response(
self, raw_response: ResponsesAPIResponse, model_response: ModelResponse
) -> ModelResponse:
Expand Down
8 changes: 8 additions & 0 deletions forecasting_tools/forecast_bots/forecast_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,14 @@ def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]:
else:
parser = GeneralLlm(model="gpt-4o-mini", temperature=0.3)

if researcher == "openai/gpt-4o-search-preview" or (
isinstance(researcher, GeneralLlm)
and researcher.model == "openai/gpt-4o-search-preview"
):
logger.warning(
"Using gpt-4o-search-preview as default researcher. This can be expensive; it is recommended you explicitly set the researcher yourself to a different model."
)

return {
"default": main_default_llm,
"summarizer": summarizer,
Expand Down
Loading
Loading