Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .devcontainer/postinstall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ nvm install --lts # Consider version 18 if we want a consistent version rather t
nvm use --lts
npm install -g @anthropic-ai/claude-code

curl https://cursor.com/install -fsS | bash

git config --global --add safe.directory /workspaces/auto-questions

# Activate virtual environment
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ jobs:
steps:
- name: Check out repository
uses: actions/checkout@v4
- uses: actions/setup-python@v5
- id: setup-python
uses: actions/setup-python@v5
with:
python-version: "3.12.3"
- name: Install poetry
Expand All @@ -39,7 +40,6 @@ jobs:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root
- name: "Run the unit tests"
if: ${{ !cancelled() }}
Expand Down
42 changes: 0 additions & 42 deletions code_tests/integration_tests/test_adjacent_news_api.py

This file was deleted.

50 changes: 36 additions & 14 deletions code_tests/integration_tests/test_ai_models/test_general_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,15 @@ def _all_tests() -> list[ModelTest]:
test_data.get_cheap_user_message(),
),
ModelTest(
GeneralLlm(model="claude-4-6-sonnet"),
GeneralLlm(model="claude-sonnet-4-6"),
test_data.get_cheap_user_message(),
),
ModelTest(
GeneralLlm(model="claude-4-6-sonnet"),
GeneralLlm(model="anthropic/claude-sonnet-4-6"),
test_data.get_cheap_user_message(),
),
ModelTest(
GeneralLlm(model="anthropic/claude-sonnet-4-6"),
test_data.get_cheap_vision_message_data(),
),
ModelTest(
Expand Down Expand Up @@ -72,10 +76,15 @@ def _all_tests() -> list[ModelTest]:
model="openai/gpt-5",
responses_api=True,
tools=[{"type": "web_search"}],
reasoning_effort="minimal",
),
"What is the latest News on the Middle East? Do a single very quick search. Go as fast as you can. I just want headlines.",
),
ModelTest(
GeneralLlm(
model="openai/gpt-5",
),
test_data.get_cheap_user_message(),
),
]


Expand All @@ -89,7 +98,7 @@ def all_tests_with_names() -> list[tuple[str, ModelTest]]:


@pytest.mark.parametrize("test_name, test", all_tests_with_names())
def test_general_llm_instances_run(
def test_general_llm_instances_run_and_track_cost(
test_name: str,
test: ModelTest,
) -> None:
Expand Down Expand Up @@ -159,18 +168,31 @@ def test_litellm_params_work() -> None:
)


def test_citations_are_populated() -> None:
model = GeneralLlm(model="openrouter/perplexity/sonar", populate_citations=True)
response = asyncio.run(model.invoke("When did Abraham Lincoln die?"))
logger.info(f"Response: {response}")
assert response, "Response is empty"
assert "http" in response or "www." in response, "Citations are not populated"

model = GeneralLlm(model="openrouter/perplexity/sonar", populate_citations=False)
response = asyncio.run(model.invoke("When did Abraham Lincoln die?"))
@pytest.mark.parametrize(
"model_name, populate_citations",
[
("openrouter/perplexity/sonar-reasoning-pro", True),
("perplexity/sonar-reasoning-pro", True),
("openrouter/perplexity/sonar", True),
("perplexity/sonar", True),
("openrouter/perplexity/sonar", False),
],
)
def test_citations_are_populated(model_name: str, populate_citations: bool) -> None:
model = GeneralLlm(model=model_name, populate_citations=populate_citations)
response = asyncio.run(
model.invoke(
"When did Abraham Lincoln die? How did he die? Where is the Great Barrier Reef located?"
)
)
logger.info(f"Response: {response}")
assert response, "Response is empty"
assert "http" not in response and "www." not in response, "Citations are populated"
if populate_citations:
assert "http" in response or "www." in response, "Citations are not populated"
else:
assert (
"http" not in response and "www." not in response
), "Citations are populated"


async def test_exa_errors_with_prompt_too_long() -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from code_tests.unit_tests.test_ai_models.models_to_test import ModelsToTest
from forecasting_tools.ai_models.ai_utils.response_types import TextTokenCostResponse
from forecasting_tools.ai_models.deprecated_model_classes.deepseek_r1 import DeepSeekR1
from forecasting_tools.ai_models.deprecated_model_classes.perplexity import Perplexity
from forecasting_tools.ai_models.model_interfaces.ai_model import AiModel
from forecasting_tools.ai_models.model_interfaces.combined_llm_archetype import (
CombinedLlmArchetype,
Expand Down Expand Up @@ -99,18 +100,6 @@ async def find_number_of_hard_limit_exceptions_in_run(
############################### TESTS ########################################


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
def test_cost_manager_notices_cost_without_mocks(
subclass: type[AiModel],
) -> None:
if not issubclass(subclass, IncursCost):
raise ValueError(NOT_INCURS_COST_ERROR_MESSAGE)

max_cost = 10
cost = run_cheap_invoke_and_track_cost(subclass, max_cost)
assert cost > 0, "No cost was incurred"


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
async def test_cost_calculated_matches_actual_cost(
subclass: type[AiModel],
Expand All @@ -121,6 +110,10 @@ async def test_cost_calculated_matches_actual_cost(
pytest.skip(
"DeepSeekR1 does not have correct token-cost estimation due to reasoning tokens"
)
if issubclass(subclass, Perplexity):
pytest.skip(
"Perplexity does not have correct token-cost estimation due to search costs (probably)"
)
model = subclass()
direct_response = await model._mockable_direct_call_to_model(
model._get_cheap_input_for_invoke()
Expand All @@ -136,21 +129,6 @@ async def test_cost_calculated_matches_actual_cost(
), "Cost calculated does not match actual cost"


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
def test_cost_manager_notices_cost_with_mocks(
mocker: Mock, subclass: type[AiModel]
) -> None:
if not issubclass(subclass, IncursCost):
raise ValueError(NOT_INCURS_COST_ERROR_MESSAGE)

AiModelMockManager.mock_ai_model_direct_call_with_predefined_mock_value(
mocker, subclass
)
max_cost = 100
cost = run_cheap_invoke_and_track_cost(subclass, max_cost)
assert cost > 0, "No cost was incurred"


@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
def test_error_thrown_when_limit_reached(mocker: Mock, subclass: type[AiModel]) -> None:
if not issubclass(subclass, IncursCost):
Expand Down
5 changes: 5 additions & 0 deletions code_tests/integration_tests/test_coherence_links.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import pytest

from forecasting_tools import MetaculusClient


@pytest.mark.skip(
reason="Skipping coherence links api tests. Is not needed and is broken"
)
def test_coherence_links_api():
client = MetaculusClient()
new_id = client.post_question_link(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ class PolicyProposal(BaseModel, Jsonable):
description="Full proposal with footnote references [^1], [^2], etc."
)
key_recommendations: list[str] = Field(
description="Top 3-5 actionable recommendations"
description="Top 3-5 actionable recommendations"
)
robustness_analysis: str = Field(
default="",
Expand Down
63 changes: 51 additions & 12 deletions forecasting_tools/ai_models/general_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,17 +169,13 @@ def __init__(

metaculus_prefix = "metaculus/"
exa_prefix = "exa/"
openai_prefix = "openai/"
anthropic_prefix = "anthropic/"
asknews_prefix = "asknews/"
self._use_metaculus_proxy = model.startswith(metaculus_prefix)
self._use_exa = model.startswith(exa_prefix)
self._use_asknews = model.startswith(asknews_prefix)
prefixes_in_operational_order = [
metaculus_prefix,
exa_prefix,
openai_prefix,
anthropic_prefix,
]

# prefix removal is to help with matching with model cost lists
Expand Down Expand Up @@ -341,14 +337,12 @@ async def _mockable_direct_call_to_model(
self._litellm_model, observed_no_cost=observed_no_cost
)

if (
response.model_extra
and "citations" in response.model_extra
and self.populate_citations
):
citations = response.model_extra.get("citations")
citations = typeguard.check_type(citations, list[str])
answer = fill_in_citations(citations, answer, use_citation_brackets=False)
if self.populate_citations:
citations = self._extract_citations(response, choices)
if citations:
answer = fill_in_citations(
citations, answer, use_citation_brackets=False
)
# TODO: Add citation support for Gemini - https://ai.google.dev/gemini-api/docs/google-search#attributing_sources_with_inline_citations

await asyncio.sleep(
Expand All @@ -366,6 +360,51 @@ async def _mockable_direct_call_to_model(

return response

@staticmethod
def _extract_citations(
response: ModelResponse, choices: list[Choices]
) -> list[str]:
if response.model_extra and "citations" in response.model_extra:
citations = response.model_extra.get("citations")
return typeguard.check_type(citations, list[str])

# OpenRouter returns Perplexity citations as url_citation annotations
# rather than in model_extra["citations"]. The annotations are the
# flat source URL list duplicated (once with titles, once without),
# NOT one-per-occurrence. All start_index/end_index are 0.
# We deduplicate to reconstruct the original indexed list where
# urls[i] corresponds to citation [i+1] in the text.
message = choices[0].message
annotations = getattr(message, "annotations", None)
if not annotations:
return []
all_urls: list[str] = []
for annotation in annotations:
if not isinstance(annotation, dict):
continue
if annotation.get("type") != "url_citation":
continue
url_info = annotation.get("url_citation", {})
url = url_info.get("url", "")
if url:
all_urls.append(url)

seen: set[str] = set()
unique_urls: list[str] = []
for url in all_urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)

num_unique = len(unique_urls)
num_total = len(all_urls)
if num_total != num_unique and num_total != num_unique * 2:
raise ValueError(
f"Expected annotations to contain each URL once or twice, "
f"but got {num_total} total URLs and {num_unique} unique URLs"
)
return unique_urls

def _normalize_response(
self, raw_response: ResponsesAPIResponse, model_response: ModelResponse
) -> ModelResponse:
Expand Down
8 changes: 8 additions & 0 deletions forecasting_tools/forecast_bots/forecast_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,14 @@ def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]:
else:
parser = GeneralLlm(model="gpt-4o-mini", temperature=0.3)

if researcher == "openai/gpt-4o-search-preview" or (
isinstance(researcher, GeneralLlm)
and researcher.model == "openai/gpt-4o-search-preview"
):
logger.warning(
"Using gpt-4o-search-preview as default researcher. This can be expensive; it is recommended you explicitly set the researcher yourself to a different model."
)

return {
"default": main_default_llm,
"summarizer": summarizer,
Expand Down
Loading
Loading