oughtinc · pre-commit-ci · Apr 6, 2026 · Apr 6, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 minimum_pre_commit_version: "2.9.0"
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v6.0.0
     hooks:
       - id: check-yaml
         args: [--allow-multiple-documents]
@@ -15,20 +15,20 @@ repos:
     hooks:
       - id: prettier
   - repo: https://github.com/asottile/reorder-python-imports
-    rev: v3.10.0
+    rev: v3.16.0
     hooks:
       - id: reorder-python-imports
         args: [--py39-plus]
-  - repo: https://github.com/psf/black
-    rev: 23.3.0
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 26.3.1
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 7.3.0
     hooks:
       - id: flake8
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.4.1
+    rev: v1.20.0
     hooks:
       - id: mypy
         additional_dependencies: [numpy, httpx, pytest, structlog, types-PyYAML]

diff --git a/ice/agent.py b/ice/agent.py
@@ -21,8 +21,7 @@
 except ImportError:
 
     class Tfew(Agent):
-        def __init__(self, *args, **kwargs):
-            ...
+        def __init__(self, *args, **kwargs): ...
 
 
 MACHINE_AGENTS = {

diff --git a/ice/agents/augmented.py b/ice/agents/augmented.py
@@ -66,7 +66,7 @@ async def classify(
         default: Optional[str] = None,
         verbose: bool = False,
     ) -> tuple[dict[str, float], Optional[str]]:
-        (machine_probs, explanation) = await self.machine.classify(
+        machine_probs, explanation = await self.machine.classify(
             prompt=prompt,
             choices=choices,
             default=default,

diff --git a/ice/cache.py b/ice/cache.py
@@ -1,6 +1,7 @@
 """
 Decorator for caching function results to disk
 """
+
 import asyncio
 import functools
 import inspect

diff --git a/ice/datasets/qasper.py b/ice/datasets/qasper.py
@@ -13,7 +13,6 @@
 from ice.paper import split_sentences
 from ice.recipes.meta.eval_paper_qa.types import PaperQaGoldStandard
 
-
 TRAIN_PATH = "/code/datasets/qasper-train-v0.3.json"
 
 VAL_PATH = "/code/datasets/qasper-dev-v0.3.json"

diff --git a/ice/evaluation/evaluate_recipe_result.py b/ice/evaluation/evaluate_recipe_result.py
@@ -138,9 +138,7 @@ def __str__(self) -> str:
         correctness = (
             "Correct"
             if self.is_correct == True
-            else "Incorrect"
-            if self.is_correct == False
-            else "Not evaluated"
+            else "Incorrect" if self.is_correct == False else "Not evaluated"
         )
         return f"""{correctness}.
     - Predicted: {self.predicted}
@@ -196,13 +194,17 @@ def evaluated_classifications(self) -> list[EvaluatedClassification]:
 
         for i in range(0, max(len(recipe_classifications), len(gold_classifications))):
             evaluated_classification = EvaluatedClassification(
-                predicted=recipe_classifications[i]
-                if i < len(recipe_classifications)
-                else None,
+                predicted=(
+                    recipe_classifications[i]
+                    if i < len(recipe_classifications)
+                    else None
+                ),
                 gold=gold_classifications[i] if i < len(gold_classifications) else None,
-                classification_eq=self.classification_eq[i]
-                if i < len(self.classification_eq)
-                else None,
+                classification_eq=(
+                    self.classification_eq[i]
+                    if i < len(self.classification_eq)
+                    else None
+                ),
             )
 
             evaluated_classifications.append(evaluated_classification)

diff --git a/ice/evaluation/evaluation_report.py b/ice/evaluation/evaluation_report.py
@@ -382,9 +382,9 @@ def make_dashboard_row_df(self):
                 classification_summary.proportion_correct
             )
 
-            row[
-                f"Classification {i+1} # evaluated"
-            ] = classification_summary.num_evaluated
+            row[f"Classification {i+1} # evaluated"] = (
+                classification_summary.num_evaluated
+            )
 
         df = pd.DataFrame([row])
         df.to_csv(
@@ -408,9 +408,9 @@ def make_experiments_evaluation_df(self):
                 "ice_commit": latest_commit_hash(),
                 "document_id": result.document_id,
                 "split": result.gold_standard.split if result.gold_standard else None,
-                "experiment": result.gold_standard.experiment
-                if result.gold_standard
-                else None,
+                "experiment": (
+                    result.gold_standard.experiment if result.gold_standard else None
+                ),
                 "total_gs_quotes": len(
                     result.evaluated_excerpts.gold_standards_in_excerpts_results
                 ),
@@ -420,9 +420,9 @@ def make_experiments_evaluation_df(self):
                 "excerpts": result.evaluated_excerpts.excerpts,
                 "gs_quotes": result.evaluated_excerpts.gold_standards_str(),
                 "answer": result.answer,
-                "gs_answer": result.gold_standard.answer
-                if result.gold_standard
-                else None,
+                "gs_answer": (
+                    result.gold_standard.answer if result.gold_standard else None
+                ),
                 "answer_rating": result.answer_rating,
                 "failure_modes": result.failure_modes,
             }

diff --git a/ice/evaluation/summarize_experiment_evals.py b/ice/evaluation/summarize_experiment_evals.py
@@ -35,13 +35,17 @@ async def summarize_experiment_evals(results_file: str):
                     row.get("classification_1"),
                     row.get("classification_2"),
                 ],
-                answer_rating=None
-                if pd.isna(row.get("answer_rating"))
-                else int(row.get("answer_rating")),
+                answer_rating=(
+                    None
+                    if pd.isna(row.get("answer_rating"))
+                    else int(row.get("answer_rating"))
+                ),
                 elicit_commit=row.get("elicit_commit"),
-                failure_modes=None
-                if pd.isna(row.get("failure_modes"))
-                else row.failure_modes.split(","),
+                failure_modes=(
+                    None
+                    if pd.isna(row.get("failure_modes"))
+                    else row.failure_modes.split(",")
+                ),
             )
             for _, row in recipe_df.iterrows()
         ]

diff --git a/ice/formatter/multi.py b/ice/formatter/multi.py
@@ -50,7 +50,7 @@ def _is_partial(**fields: Union[literal, _NotNeededSentinel]):
 
 
 def all_values_needed(
-    examples: Sequence[Mapping[str, Union[literal_or_transform, _NotNeededSentinel]]]
+    examples: Sequence[Mapping[str, Union[literal_or_transform, _NotNeededSentinel]]],
 ) -> TypeGuard[Sequence[Mapping[str, literal_or_transform]]]:
     return all(
         (
@@ -107,7 +107,7 @@ def _unparse(parses: _StdLibFormatStringParses) -> str:
 
 
 def _no_sentinels_remaining(
-    concrete_values: dict[str, Union[literal, _NotNeededSentinel]]
+    concrete_values: dict[str, Union[literal, _NotNeededSentinel]],
 ) -> TypeGuard[dict[str, literal]]:
     return all(
         (value is not _not_needed_sentinel for value in concrete_values.values())
@@ -146,7 +146,7 @@ def _format_truncate(
 
 
 def _has_stop(
-    concrete_values: Mapping[str, Union[literal, _NotNeededSentinel]]
+    concrete_values: Mapping[str, Union[literal, _NotNeededSentinel]],
 ) -> bool:
     return any(isinstance(value, StopSentinel) for value in concrete_values.values())
 

diff --git a/ice/formatter/transform/dependent.py b/ice/formatter/transform/dependent.py
@@ -9,7 +9,6 @@
 
 from ice.formatter.transform import _Transform
 
-
 T_contra = TypeVar("T_contra", contravariant=True)
 
 

diff --git a/ice/formatter/transform/value.py b/ice/formatter/transform/value.py
@@ -6,7 +6,6 @@
 
 from ice.formatter.transform import _Transform
 
-
 T_contra = TypeVar("T_contra", contravariant=True)
 
 

diff --git a/ice/json_value.py b/ice/json_value.py
@@ -6,7 +6,6 @@
 
 from fvalues import F
 
-
 JSONValue = Union[
     str, int, float, bool, None, list["JSONValue"], dict[str, "JSONValue"]
 ]

diff --git a/ice/metrics/gold_paragraphs.py b/ice/metrics/gold_paragraphs.py
@@ -1,6 +1,7 @@
 """
 Make a dataframe that contains the paragraphs that contain the gold standard quotes.
 """
+
 import asyncio
 from pathlib import Path
 from typing import Optional
@@ -72,15 +73,13 @@ def get_containing_paragraph(
         # Explanations:
         # - Quote is split across two paragraphs
         # - Document paragraphs don't include quote
-        log.warning(
-            f"""Couldn't find gold standard paragraph for quote
+        log.warning(f"""Couldn't find gold standard paragraph for quote
 
 > {quote}
 
 in {document_id}. Best recall was {best_recall:.2f}. Best paragraph was:
 
-> {best_recall_paragraph}"""
-        )
+> {best_recall_paragraph}""")
     return best_recall_paragraph
 
 

diff --git a/ice/metrics/gold_standards.py b/ice/metrics/gold_standards.py
@@ -184,8 +184,7 @@ def get_gold_standards(
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
     model_type: None = None,
-) -> list[GoldStandard[Any]]:
-    ...
+) -> list[GoldStandard[Any]]: ...
 
 
 @overload
@@ -195,8 +194,7 @@ def get_gold_standards(
     document_id: Optional[str] = None,
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
-) -> list[GoldStandard[ParsedGoldStandardType]]:
-    ...
+) -> list[GoldStandard[ParsedGoldStandardType]]: ...
 
 
 def get_gold_standards(
@@ -226,8 +224,7 @@ def get_gold_standard(
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
     model_type: None = None,
-) -> Optional[GoldStandard[Any]]:
-    ...
+) -> Optional[GoldStandard[Any]]: ...
 
 
 @overload
@@ -237,8 +234,7 @@ def get_gold_standard(
     document_id: Optional[str] = None,
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
-) -> Optional[GoldStandard[ParsedGoldStandardType]]:
-    ...
+) -> Optional[GoldStandard[ParsedGoldStandardType]]: ...
 
 
 def get_gold_standard(

diff --git a/ice/paper.py b/ice/paper.py
@@ -93,9 +93,9 @@ def parse_txt(file: Path) -> list[dict]:
                                 "number": section_title_number(current_section),
                             }
                         ],
-                        "sectionType": "abstract"
-                        if current_section == "Abstract"
-                        else "main",
+                        "sectionType": (
+                            "abstract" if current_section == "Abstract" else "main"
+                        ),
                     }
                 )
     return body

diff --git a/ice/recipes/adherence_tfew_paragraph.py b/ice/recipes/adherence_tfew_paragraph.py
@@ -24,7 +24,6 @@
 from ice.utils import map_async
 from ice.utils import max_by_value
 
-
 gpt2_tokenizer: GPT2TokenizerFast = AutoTokenizer.from_pretrained("gpt2")
 
 

diff --git a/ice/recipes/blinding_dynamic.py b/ice/recipes/blinding_dynamic.py
@@ -21,6 +21,7 @@
 - routledge-2006.pdf
 - vittengl-2009.pdf
 """
+
 import itertools
 from typing import Any
 from typing import Literal
@@ -344,9 +345,9 @@ async def run(self, paper: Paper):
         results_by_intervention: dict[str, dict[Group, dict[str, Any]]] = {}
         interventions = await self.interventions(paper)
         for intervention in interventions:
-            results_by_intervention[
-                intervention
-            ] = await self.blinding_for_intervention(paper, intervention)
+            results_by_intervention[intervention] = (
+                await self.blinding_for_intervention(paper, intervention)
+            )
 
         recipe_results: list[RecipeResult] = []
         for intervention in interventions:

diff --git a/ice/recipes/consort_flow/baseline_elicit_answer.py b/ice/recipes/consort_flow/baseline_elicit_answer.py
@@ -7,7 +7,6 @@
 from ice.apis.openai import openai_complete
 from ice.recipes.program_search.nodes.answer.types import Demonstration
 
-
 log = get_logger()
 
 

diff --git a/ice/recipes/consort_flow/baselines.py b/ice/recipes/consort_flow/baselines.py
@@ -342,9 +342,11 @@ async def _all_options(
         except TooLongRequestError:
             selections = remove_lowest_perplexity(selections)
     return PaperQaAnswer(
-        answer=["The question is not answered in the text."]
-        if do_return_list
-        else "The question is not answered in the text.",
+        answer=(
+            ["The question is not answered in the text."]
+            if do_return_list
+            else "The question is not answered in the text."
+        ),
         support_candidates=texts,
         support_labels=[False for text in texts],
         support_scores=[t[1] for t in texts_with_perplexities],

diff --git a/ice/recipes/consort_flow/golds.py b/ice/recipes/consort_flow/golds.py
@@ -54,9 +54,11 @@ def paper_to_allocation_gold_standards(
         (
             f"The {exp.name} experiment included {len(exp.arms or [])} arms: {', '.join((arm.name for arm in exp.arms or []))}. How many participants were initially allocated to the {arm.name} arm of the {exp.name} experiment?",
             texts,
-            arm.allocated.quotes
-            if arm.allocated and isinstance(arm.allocated, SampleSize)
-            else [],
+            (
+                arm.allocated.quotes
+                if arm.allocated and isinstance(arm.allocated, SampleSize)
+                else []
+            ),
         )
         for exp in gs.parsed_answer.experiments
         for arm in (exp.arms or [])

diff --git a/ice/recipes/experiments_and_arms/prompts/can_name_arms.py b/ice/recipes/experiments_and_arms/prompts/can_name_arms.py
@@ -13,7 +13,6 @@
 from ice.recipes.experiments_and_arms.prompts.utils import start_last_example
 from ice.recipes.experiments_and_arms.types import MultipartReasoningPrompt
 
-
 CAN_WE_NAME_ARMS_EXAMPLES: list[
     dict[str, Union[ValueTransform[Sequence[str]], str, int]]
 ] = [

diff --git a/ice/recipes/experiments_and_arms/prompts/cluster.py b/ice/recipes/experiments_and_arms/prompts/cluster.py
@@ -2,7 +2,6 @@
 
 from structlog.stdlib import get_logger
 
-
 log = get_logger()
 
 

diff --git a/ice/recipes/experiments_and_arms/prompts/consensus.py b/ice/recipes/experiments_and_arms/prompts/consensus.py
@@ -2,7 +2,6 @@
 
 from structlog.stdlib import get_logger
 
-
 log = get_logger()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,7 +9,6 @@

		from ice.formatter.transform import _Transform


		T_contra = TypeVar("T_contra", contravariant=True)


Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,7 +6,6 @@

		from ice.formatter.transform import _Transform


		T_contra = TypeVar("T_contra", contravariant=True)


Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,7 +6,6 @@

		from fvalues import F


		JSONValue = Union[
		    str, int, float, bool, None, list["JSONValue"], dict[str, "JSONValue"]
		]
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,7 +24,6 @@
		from ice.utils import map_async
		from ice.utils import max_by_value


		gpt2_tokenizer: GPT2TokenizerFast = AutoTokenizer.from_pretrained("gpt2")


Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -7,7 +7,6 @@
		from ice.apis.openai import openai_complete
		from ice.recipes.program_search.nodes.answer.types import Demonstration


		log = get_logger()


Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,7 +2,6 @@

		from structlog.stdlib import get_logger


		log = get_logger()


Expand Down