From 0917ea9b9b8e5b41bcfeb83257ec323ed044646c Mon Sep 17 00:00:00 2001 From: Francisco Xavier Vu Alves Pereira <54419505+Falvespereira@users.noreply.github.com> Date: Tue, 4 Mar 2025 18:04:39 -0500 Subject: [PATCH 01/78] first steps to making reading csv creating csv file --- benchtools/runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchtools/runner.py b/benchtools/runner.py index fcf2753..ecb0ad4 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -27,7 +27,12 @@ def from_txt_csv(): load a template from txt and create task objects for each row of a csv ''' # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor + textFile = open(self.dir, "r") + csvFile = pandas.read_csv(self.dir) + + + return self From 00b05e82be6a7cb4694fcbd1c49cb5b9b14ec296 Mon Sep 17 00:00:00 2001 From: Francisco Xavier Vu Alves Pereira <54419505+Falvespereira@users.noreply.github.com> Date: Wed, 5 Mar 2025 18:49:22 -0500 Subject: [PATCH 02/78] csv runner update (untested) should return the prompts as a array of strings --- benchtools/runner.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index ecb0ad4..505e483 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -29,11 +29,16 @@ def from_txt_csv(): # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor textFile = open(self.dir, "r") csvFile = pandas.read_csv(self.dir) - + x = 0 + storedPrompts = [] + while x < len(csvFile): + processed_prompt = textFile.replace("{a}", csvFile.iloc[x,1]) + processed_prompt.replace("{b}", csvFile.iloc[x, 2]) + storedPrompts.append(processed_prompt) - return self + return storedPrompts def from_yaml(): From 8e94e0ace7a3c7ce9dba690a43dd24d7e24517be Mon Sep 17 00:00:00 2001 From: Benjamin Dahrooge Date: Wed, 5 Mar 2025 23:13:15 -0500 Subject: [PATCH 03/78] Ollama and OpenAI API support complete --- .gitignore | 1 + benchtools/task.py | 89 ++++++++++++++++++++++++++++++++++------------ requirements.txt | 2 ++ 3 files changed, 69 insertions(+), 23 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e99e36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc \ No newline at end of file diff --git a/benchtools/task.py b/benchtools/task.py index d632513..f296856 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -1,30 +1,38 @@ # defines a class object for a task -from ollama import generate +from ollama import Client +from openai import OpenAI + from scorerers import exact_match -scoring_fx = {'exact_match':exact_match} +scoring_fx = {"exact_match": exact_match} + class PromptTask: - ''' + """ defines a basic prompt task with a simple scoring function - ''' - def __init__(self,prompt=None,scoring_function=None, reference=None, runner_type='ollama'): - ''' + """ + + def __init__( + self, prompt=None, scoring_function=None, reference=None, runner_type="ollama" + ): + """ init a task object Parameters ---------- dir : string or path directory containing the task assets - prompt : string + prompt : string prompt that will pass to the model scoring_function : function handle or string if string, must be name of built in eval function provided here reference: string or number solution that will be passed with the model answer - runner_type: string {ollama} - the way the runner should be called, - ''' + 
runner_type: string {ollama,openai} + define which runner should be used for the task. + to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434 + to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable + """ self.prompt = prompt if type(scoring_function) == str: self.scoring_function = scoring_fx[scoring_function] @@ -34,27 +42,62 @@ def __init__(self,prompt=None,scoring_function=None, reference=None, runner_type self.reference = reference self.runner_type = runner_type - def run(self,model): - ''' - ''' - # this should actually be a better switch structure - # these types should be documented in the constructor method (init) - if self.runner_type == 'ollma': - return generate(model,self.prompt,) + def run(self, model, api_url=None): + """ + run the task on the model + Parameters + ---------- + model : string + the model to run the task on + api_url : string + the url of the api to use for the task + """ + match self.runner_type: + case "ollama": + client = Client( + host=api_url if api_url else "http://localhost:11434", + ) + response = client.chat( + model, + messages=[ + { + "role": "user", + "content": self.prompt, + }, + ], + ) + return response["message"]["content"] + case "openai": + client = OpenAI( + base_url=api_url if api_url else "https://api.openai.com/v1", + ) + chat_completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": self.prompt, + } + ], + ) + return chat_completion.choices[0].message.content + case _: + print(f"Runner type {self.runner_type} not supported") + return None - def score(self,response): - ''' + def score(self, response): + """ score the response using the defined function Parameters ---------- response : string the value to score - ''' - return self.scoring_function(response,self.reference) - + """ + return self.scoring_function(response, self.reference) + # additional classes for other types of tasks -# likely an agent task that can pass environment assets \ No newline at end of file +# likely an agent task that can pass environment assets diff --git a/requirements.txt b/requirements.txt index f6c04f8..a1eb96b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ pyyaml pandas click +ollama +openai \ No newline at end of file From d19bb604b9c455bda0244413ee1bb1a97116f7f9 Mon Sep 17 00:00:00 2001 From: Emory Salaun Date: Thu, 6 Mar 2025 15:05:59 -0500 Subject: [PATCH 04/78] From_Yaml and minor fixes. From_Yaml has been implemented and tested on the test yml file. Minor changes to the imports in runner.py. Initialization of the tasks list for Bench() has been fixed as well. --- benchtools/runner.py | 51 ++++++++++++++++++++++++++++++++------ demobench/miscops/task.yml | 6 ++--- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index fcf2753..2d927fa 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,16 +1,21 @@ # module to run benchmarks import pandas -import yaml +import yaml # requires pyyaml +from itertools import product +from task import PromptTask +from pathlib import Path class Bench(): ''' ''' - def __init__(dir, target_dir): + + + def __init__(self, dir, target_dir): ''' ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. 
# loading will - self.tasks + self.tasks = [] @@ -30,8 +35,38 @@ def from_txt_csv(): return self - - def from_yaml(): - ''' - laod from a yaml - ''' \ No newline at end of file + + def from_yaml(self, yaml_file): + """ + Load tasks from a YAML file and generate PromptTask objects. + + Parameters + ---------- + yaml_file : str + Path to the YAML file containing task templates and values. + + Returns + ------- + self : Bench + The Bench instance with tasks populated. + """ + with open(yaml_file, 'r') as file: + data = yaml.safe_load(file) + + self.tasks = [] + + for entry in data: + template = entry["template"] # Extract template + values_dict = entry["values"] # Extract values dictionary + + # Generate all possible value combinations using itertools.product + keys = values_dict.keys() + value_combinations = product(*values_dict.values()) + + # Create a PromptTask for each combination + for values in value_combinations: + value_mapping = dict(zip(keys, values)) # Pair keys with values + filled_prompt = template.format(**value_mapping) # Format the template + self.tasks.append(PromptTask(prompt=filled_prompt)) # Store task + + return self diff --git a/demobench/miscops/task.yml b/demobench/miscops/task.yml index 9fdd6e6..206b397 100644 --- a/demobench/miscops/task.yml +++ b/demobench/miscops/task.yml @@ -1,7 +1,7 @@ - template: "find the product of {a} and {b}" values: - - a: 2,3,5 - - b: 3,4,5 + a: [2,3,5] + b: [3,4,5] - template: "what is the name for the following symbol? {symb}" values: - - symb: ["@","$","#"] \ No newline at end of file + symb: ["@","$","#"] \ No newline at end of file From 8b92d09e54cc5e3ed6f6d89db33517cc1ee83d3f Mon Sep 17 00:00:00 2001 From: Ritta Neg Mfa Date: Thu, 6 Mar 2025 15:45:55 -0500 Subject: [PATCH 05/78] log file. --- log_file.py | 22 ++++++++++++++++++++++ .vscode/settings.json | 5 +++++ 2 files changed, 27 insertions(+) create mode 100644 log_file.py create mode 100644 .vscode/settings.json diff --git a/ log_file.py b/ log_file.py new file mode 100644 index 0000000..9bf90cd --- /dev/null +++ b/ log_file.py @@ -0,0 +1,22 @@ +import logging +import os +#agent_log.text that is if we will like to name it that way , it could be anything +agent_name = os.path.join(os.getcwd(), 'agent_log.txt') +logging.basicConfig(filename=agent_name, + level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s') + +def log_agent_interaction(agent_input, agent_output): + """ + Logs the agent's input and output to a file. + + Parameters: + agent_input (str): The input provided to the agent. + agent_output (str): The output generated by the agent. 
+ """ + logging.info(f'Input: {agent_input}') + logging.info(f'Output: {agent_output}') + + + + + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..7146af2 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "cSpell.words": [ + "levelname" + ] +} \ No newline at end of file From 90083a6a28f578b94382ad7e66e27134f6961348 Mon Sep 17 00:00:00 2001 From: Ayman Sandouk <111829133+AymanBx@users.noreply.github.com> Date: Thu, 6 Mar 2025 15:53:23 -0500 Subject: [PATCH 06/78] Delete .vscode directory --- .vscode/settings.json | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 7146af2..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "cSpell.words": [ - "levelname" - ] -} \ No newline at end of file From 5b50d5ba481b9fd5c8e3f980a4820fe231808404 Mon Sep 17 00:00:00 2001 From: Victoria Ryan <123494653+victoriaryan2@users.noreply.github.com> Date: Thu, 6 Mar 2025 15:59:51 -0500 Subject: [PATCH 07/78] Update runner.py yaml --- benchtools/runner.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index fcf2753..6046bee 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -31,7 +31,25 @@ def from_txt_csv(): return self - def from_yaml(): - ''' - laod from a yaml - ''' \ No newline at end of file + def from_yaml(self, yaml_file): + with open(yaml_file, 'r') as file: + data = yaml.safe_load(file) + self.tasks = [] + for each in data: + template = each.get('template', '') + values = each.get('values', []) + processed_values = [] + for val in values: + for key, value in val.items(): + if isinstance(value, str): + processed_values.append((key, list(map(int, value.split(','))))) + else: + processed_values.append((key, value)) + keys = [key for key, _ in processed_values] + value_lists = [value for _, value in processed_values] + value_combinations = list(zip(*value_lists)) + for combination in value_combinations: + value_dict = dict(zip(keys, combination)) + temp = template.format(**value_dict) + self.tasks.append(temp) + return self.tasks From 84ab6dedf001da1e0a09872eb1f5c2dd5cf869a8 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 6 Mar 2025 16:50:59 -0500 Subject: [PATCH 08/78] initialze bench object --- benchtools/runner.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index fcf2753..d1c4f16 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,22 +1,32 @@ # module to run benchmarks import pandas import yaml +import os class Bench(): ''' ''' - def __init__(dir, target_dir): + def __init__(self, dir, target_dir): ''' ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will - self.tasks + task_folder = os.listdir(dir) + for file in task_folder: + if file.endswith("csv"): + self.tasks = self.from_txt_csv() + elif file.endswith("yml"): + self.tasks = self.from_yaml() + # Both functions should have the same type return. 
porobably should be a list of PRompt_Task + + def run(self, model): ''' + ''' for task in self.tasks: task.run(model) From 36d5230ae2759cf72f9a2434bda365b09debf41f Mon Sep 17 00:00:00 2001 From: Francisco Xavier Vu Alves Pereira <54419505+Falvespereira@users.noreply.github.com> Date: Thu, 6 Mar 2025 17:53:15 -0500 Subject: [PATCH 09/78] update to include the answers --- benchtools/runner.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 505e483..a84bb70 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,7 +1,7 @@ # module to run benchmarks import pandas import yaml - +import task class Bench(): ''' ''' @@ -22,23 +22,24 @@ def run(self, model): task.run(model) # possibly private method? - def from_txt_csv(): + def from_txt_csv(task_folder): ''' load a template from txt and create task objects for each row of a csv ''' # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor - textFile = open(self.dir, "r") - csvFile = pandas.read_csv(self.dir) + textFile = open(task_folder + "task.txt", "r") + csvFile = pandas.read_csv(task_folder + "values.csv") + answers = pandas.read_csv(task_folder + "results") x = 0 - storedPrompts = [] + storedTasks = [] + storedAnswers = [] while x < len(csvFile): processed_prompt = textFile.replace("{a}", csvFile.iloc[x,1]) processed_prompt.replace("{b}", csvFile.iloc[x, 2]) - storedPrompts.append(processed_prompt) - - + storedAnswers.append(answers[x,1]) + storedTasks.append(processed_prompt) - return storedPrompts + return storedTasks, storedAnswers def from_yaml(): From 19bb281c5f581734ba14cbe54c92b542b32c88c2 Mon Sep 17 00:00:00 2001 From: Benjamin Dahrooge Date: Thu, 6 Mar 2025 18:00:06 -0500 Subject: [PATCH 10/78] Runner returns prompt tuple, response --- benchtools/task.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchtools/task.py b/benchtools/task.py index f296856..e18de38 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -1,4 +1,3 @@ -# defines a class object for a task from ollama import Client from openai import OpenAI @@ -34,7 +33,7 @@ def __init__( to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable """ self.prompt = prompt - if type(scoring_function) == str: + if type(scoring_function) is str: self.scoring_function = scoring_fx[scoring_function] else: self.scoring_function = scoring_function @@ -67,7 +66,7 @@ def run(self, model, api_url=None): }, ], ) - return response["message"]["content"] + return (self.prompt, response["message"]["content"]) case "openai": client = OpenAI( base_url=api_url if api_url else "https://api.openai.com/v1", @@ -81,7 +80,7 @@ def run(self, model, api_url=None): } ], ) - return chat_completion.choices[0].message.content + return (self.prompt, chat_completion.choices[0].message.content) case _: print(f"Runner type {self.runner_type} not supported") return None From fb98657cf88c4622690526d7e76cd28bea64f0b7 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 6 Mar 2025 20:23:59 -0500 Subject: [PATCH 11/78] Planning out the run function and the task object creation --- benchtools/runner.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index d1c4f16..d5cea9e 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -2,6 +2,7 @@ import pandas import yaml import os +from log_file.py 
import log_agent_interaction class Bench(): ''' @@ -11,12 +12,13 @@ def __init__(self, dir, target_dir): ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will + task_folder = os.listdir(dir) for file in task_folder: if file.endswith("csv"): - self.tasks = self.from_txt_csv() + self.tasks = self.from_txt_csv(dir) elif file.endswith("yml"): - self.tasks = self.from_yaml() + self.tasks = self.from_yaml(dir) # Both functions should have the same type return. porobably should be a list of PRompt_Task @@ -29,7 +31,16 @@ def run(self, model): ''' for task in self.tasks: - task.run(model) + (prompt, response) = task.run(model) + log_agent_interaction(prompt, response) + task.score() + + + + + + + # possibly private method? def from_txt_csv(): From 5b17a73e2773de3b89cb5f1584ab7d9215a36abc Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 18 Oct 2025 18:07:52 +0000 Subject: [PATCH 12/78] Added .gitignore with pyhon template --- .gitignore | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..976618d --- /dev/null +++ b/.gitignore @@ -0,0 +1,219 @@ +.vscode/ + +## Python template for .gitignore sourced from Github's [gitignore](https://github.com/github/gitignore) +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. 
However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml \ No newline at end of file From d6fa598a4467b413f52e8967e78135a987e536cb Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 18 Oct 2025 18:09:57 +0000 Subject: [PATCH 13/78] Added necessary files to make the project into an installable python package --- benchtools/__init__.py | 0 setup.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 benchtools/__init__.py create mode 100644 setup.py diff --git a/benchtools/__init__.py b/benchtools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3439fe2 --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup, find_namespace_packages + +setup( + name='benchtools', + version='0.1', + packages=find_namespace_packages(), + install_requires=[ + 'Click' + ], + entry_points={ + 'console_scripts': [ + 'benchtool = benchtools.cli:cli', + ], + }, +) \ No newline at end of file From 0b0c068692351aa536af7745e967b283bca1ac7a Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 18 Oct 2025 18:18:40 +0000 Subject: [PATCH 14/78] Modifications to metadata in project.toml + fixing the cli to call the correct cli function --- project.toml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/project.toml b/project.toml index 8cbb88d..2106de6 100644 --- a/project.toml +++ b/project.toml @@ -13,19 +13,18 @@ dependencies = [ ] requires-python = ">=3.10" authors = [ + {name = "Ayman Sandouk, email = "ayman_sandouk@uri.edu"}, {name = "Sarah M Brown, email = "brownsarahm@uri.edu"}, - {name = "Tzu-Ping Chung", email = "tzu-ping@example.com"}, - {name = "Another person"}, - {email = "different.person@example.com"}, ] maintainers = [ + {name = "Ayman Sandouk, email = "ayman_sandouk@uri.edu"}, {name = "Sarah M Brown", email = "brownsarahm@uri.edu"} ] description = "" readme = "README.md" license = "MIT" # or license = {file = "LICENSE.txt"} for legacy declaration license-files = ["LICEN[CS]E.*"] -keywords = ["benchmark"] +keywords = ["benchmark", "machine-learning", "ai", "llm"] [project.optional-dependencies] @@ -38,11 +37,11 @@ cli = [ [project.urls] Homepage = "https://example.com" Documentation = "https://readthedocs.org" -Repository = "https://github.com/evalexplain/benchtools" -"Bug Tracker" = "https://github.com/evalexplain/benchtools/issues" +Repository = "https://github.com/ml4sts/benchtools" +"Bug Tracker" = "https://github.com/ml4sts/benchtools/issues" [project.scripts] -bench-cli = "benchtools:cli" +bench-cli = "benchtools.cli:main" From 8d9aaaf6829da6696002df6c337359a46b03c2b3 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 18 Oct 2025 18:29:08 +0000 Subject: [PATCH 15/78] cli.py: Adding init function and subfunctions for it --- benchtools/cli.py | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 8862f89..36fe414 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,7 +1,7 @@ +import os import click # from task import PromptTask # from runner import Bench -import os @click.group() def cli(): @@ -10,6 +10,8 @@ def cli(): """ pass + +## AS: Come back to this soon @click.command() @click.argument('bench_name') def generate_demo_bench(bench_name): @@ 
-34,6 +36,44 @@ def generate_demo_bench(bench_name): click.echo("Subfolders 'Tasks' and 'Report' created.") +## Sub-functions for the init method + +### If user calls init without a benchmark name as an argument +def get_benchmark_name(): + return input("Enter the name of your benchmark/project (will be used as folder and repo name)...\n") + + +### Generate an about path from the description of the user +def create_about(bench_name, bench_path, text): + about_path = os.path.join(bench_path, bench_name, "about.md") + about_text= f""" + # {bench_name} + {text} + + Generated by BenchTools + """ + with open(about_path, 'w') as file: + file.write(about_text) + + + + +# Initialize the benchmark +@click.command() +@click.argument('benchmark_name', required = False) +@click.option('--path', '-P', default=".", help="The path where the benchmark repository will be") +@click.option('--about', '-A', default="", help="The Content that goes in the about.md file") +def init(benchmark_name, path, about): + """Initializing a new benchmark.""" + # new_benchmark = PromptTask() + if not benchmark_name: + benchmark_name = get_benchmark_name() + click.echo("Creating " + benchmark_name + " in " + path) + os.mkdir(os.path.join(path, benchmark_name)) + create_about(benchmark_name, path, about) + + + # What us creating a new task @click.command() @click.argument('task_name') @@ -51,6 +91,7 @@ def run_task(task_name): click.echo(result) +cli.add_command(init) cli.add_command(add_task) cli.add_command(run_task) cli.add_command(generate_demo_bench) From 62d3b42ef8ebcc970cc0cc69b337759b6c178844 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 22 Oct 2025 05:43:01 +0000 Subject: [PATCH 16/78] add git init function --- benchtools/cli.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 36fe414..2ba066b 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -55,6 +55,18 @@ def create_about(bench_name, bench_path, text): with open(about_path, 'w') as file: file.write(about_text) + +### Initialize git repository +def init_repo(bench_name, bench_path): + current_dir = os.getcwd() + benchmark_path = os.path.join(bench_path, bench_name) + os.chdir(benchmark_path) + try: + os.system("git init .") + print("DONE!") + except: + print("git might not be initialized in your system. Please run \"git init . \" when setup") + os.chdir(current_dir) @@ -63,7 +75,8 @@ def create_about(bench_name, bench_path, text): @click.argument('benchmark_name', required = False) @click.option('--path', '-P', default=".", help="The path where the benchmark repository will be") @click.option('--about', '-A', default="", help="The Content that goes in the about.md file") -def init(benchmark_name, path, about): +@click.option('--no-git', default=False, help="Don't make benchmark a git repository. 
Default is False") +def init(benchmark_name, path, about, no_git): """Initializing a new benchmark.""" # new_benchmark = PromptTask() if not benchmark_name: @@ -71,7 +84,9 @@ def init(benchmark_name, path, about): click.echo("Creating " + benchmark_name + " in " + path) os.mkdir(os.path.join(path, benchmark_name)) create_about(benchmark_name, path, about) - + # Initialize a git repo + if not no_git: + init_repo(benchmark_name, path) # What us creating a new task From 097e08e171ecb8285fac2510180d5bc4bfd6a5c7 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 22 Oct 2025 05:52:01 +0000 Subject: [PATCH 17/78] small modifications to CLI functions to test --- benchtools/cli.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 2ba066b..314ae90 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -91,19 +91,20 @@ def init(benchmark_name, path, about, no_git): # What us creating a new task @click.command() -@click.argument('task_name') +@click.argument('task_name', required = True) +# @click.option() def add_task(task_name): - """Creating a new task.""" - new_task = PromptTask() + """Setting up a new task.""" + # new_task = PromptTask() click.echo("Adding " + task_name) - + @click.command() -@click.argument('task_name') +@click.argument('task_name', required = True) def run_task(task_name): - """Running a task.""" + """Running the tasks and generating logs""" - click.echo(result) + click.echo(f"Running {task_name} now") cli.add_command(init) From b9f07ccbe900cfa0f5addb0bdb6bef6bce5ff4d4 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 22 Oct 2025 07:23:08 +0000 Subject: [PATCH 18/78] CLI: Create .gitignore in init_repo function --- benchtools/cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 314ae90..b84a99e 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,5 +1,6 @@ import os import click +import requests # from task import PromptTask # from runner import Bench @@ -63,10 +64,15 @@ def init_repo(bench_name, bench_path): os.chdir(benchmark_path) try: os.system("git init .") - print("DONE!") except: print("git might not be initialized in your system. Please run \"git init . 
\" when setup") + # Get python gitignore template and create .gitignore + ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore") + if ignore_text.status_code == 200: + with open(".gitignore", 'a') as f: + f.write(ignore_text.text) os.chdir(current_dir) + From dad2eae10076512a0046336835df7a9e0f0af0be Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 23 Oct 2025 16:36:40 +0000 Subject: [PATCH 19/78] README: Adding some usage info for CLI --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index ab14c84..2af7773 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,20 @@ a python library designed to help people design and run LLM benchmarks **warning** currently just an outline, has not yet run + +## Usage + +### CLI + +``` +benchtool init +``` +To generate a folder structure for the Benchmark + +The system asks conceptual questions about the benchmark to align user's thoughts with the BetterBench checklist + + + ## Orientation to the Repo - the `benchtools` folder is the code for the library From 65c219ecb6ba025af748321a0e6fbe96b4cec34d Mon Sep 17 00:00:00 2001 From: abdel-elj Date: Wed, 5 Mar 2025 00:52:50 +0200 Subject: [PATCH 20/78] Adding CLI group to cli.py --- benchtools/cli.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/benchtools/cli.py b/benchtools/cli.py index 81c53b5..7dd07b1 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,2 +1,30 @@ import click + +@click.group() +def cli(): + """Benchmark Command-line Interface.""" + pass + +@click.command() +@click.argument('task_name') +def add_task(task_name): + """Creating a new task.""" + + click.echo(result) + + +@click.command() +@click.argument('task_name') +def run_task(task_name): + """Running a task.""" + + click.echo(result) + + +cli.add_command(add_task) +cli.add_command(run_task) + + +if __name__ == "__main__": + cli() From 73cea6788fa9cb8bc48215413a914bdc8e2127e0 Mon Sep 17 00:00:00 2001 From: abdel-elj Date: Wed, 5 Mar 2025 01:24:26 +0200 Subject: [PATCH 21/78] Initialize class for adding new task in cli.py --- benchtools/cli.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 7dd07b1..1361e8a 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,5 +1,6 @@ import click - +from task import PromptTask +from runner import Bench @click.group() def cli(): @@ -10,8 +11,8 @@ def cli(): @click.argument('task_name') def add_task(task_name): """Creating a new task.""" - - click.echo(result) + new_task = PromptTask() + click.echo("Adding " + task_name) @click.command() From 7630e35934a5af8f96b555181c97b07774c1f9cf Mon Sep 17 00:00:00 2001 From: Emory Salaun Date: Thu, 6 Mar 2025 17:53:38 -0500 Subject: [PATCH 22/78] Generates Folder Structure For Benchmark Generates main folder and two subfolders (reports and tasks) for further filling. 
Co-Authored-By: abdel-elj <196731936+abdel-elj@users.noreply.github.com> Co-Authored-By: nadia-sousa <98436401+nadia-sousa@users.noreply.github.com> --- benchtools/cli.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/benchtools/cli.py b/benchtools/cli.py index 1361e8a..b97a367 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,12 +1,32 @@ import click from task import PromptTask from runner import Bench +import os @click.group() def cli(): """Benchmark Command-line Interface.""" pass +@click.command() +@click.argument('bench_name') +def generate_demo_bench(bench_name): + current_dir = os.getcwd() + parent_dir = os.path.abspath(os.path.join(current_dir, "..")) + new_folder_path = os.path.join(parent_dir, bench_name) + os.makedirs(new_folder_path, exist_ok=True) + click.echo(f"Folder '{bench_name}' created at {new_folder_path}") + + tasks_folder = os.path.join(new_folder_path, "Tasks") + report_folder = os.path.join(new_folder_path, "Report") + + os.makedirs(tasks_folder, exist_ok=True) + os.makedirs(report_folder, exist_ok=True) + + click.echo("Subfolders 'Tasks' and 'Report' created.") + + + @click.command() @click.argument('task_name') def add_task(task_name): @@ -25,6 +45,7 @@ def run_task(task_name): cli.add_command(add_task) cli.add_command(run_task) +cli.add_command(generate_demo_bench) if __name__ == "__main__": From 1b62f8314ed29858424934ec776abc7d2aa9666f Mon Sep 17 00:00:00 2001 From: AymanBx Date: Tue, 1 Jul 2025 14:58:03 +0000 Subject: [PATCH 23/78] Small modifications and comments --- benchtools/cli.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index b97a367..8862f89 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,32 +1,40 @@ import click -from task import PromptTask -from runner import Bench +# from task import PromptTask +# from runner import Bench import os @click.group() def cli(): - """Benchmark Command-line Interface.""" + """ + BenchTools is a tool that helps researchers set up benchmarks. 
+ """ pass @click.command() @click.argument('bench_name') def generate_demo_bench(bench_name): + """ + Generate a demo benchmark + """ + + # Set up directory for the demo bench current_dir = os.getcwd() parent_dir = os.path.abspath(os.path.join(current_dir, "..")) - new_folder_path = os.path.join(parent_dir, bench_name) - os.makedirs(new_folder_path, exist_ok=True) - click.echo(f"Folder '{bench_name}' created at {new_folder_path}") + # Change this to use the current path or possibly take a path as an argument + demo_bench_path = os.path.join(parent_dir, bench_name) + os.makedirs(demo_bench_path, exist_ok=True) - tasks_folder = os.path.join(new_folder_path, "Tasks") - report_folder = os.path.join(new_folder_path, "Report") + tasks_folder = os.path.join(demo_bench_path, "Tasks") + report_folder = os.path.join(demo_bench_path, "Report") os.makedirs(tasks_folder, exist_ok=True) os.makedirs(report_folder, exist_ok=True) + click.echo(f"Folder '{bench_name}' created at {demo_bench_path}") click.echo("Subfolders 'Tasks' and 'Report' created.") - +# What us creating a new task @click.command() @click.argument('task_name') def add_task(task_name): From cf43336788a1ed39f06c1bdc8e26516436bbba2d Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 6 Mar 2025 16:50:59 -0500 Subject: [PATCH 24/78] initialze bench object --- benchtools/runner.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 4d92cc1..1d7d973 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,33 +1,31 @@ # module to run benchmarks -import pandas -import yaml # requires pyyaml import os +import yaml # requires pyyaml +import pandas from task import PromptTask from pathlib import Path from log_file.py import log_agent_interaction +from itertools import product class Bench(): ''' ''' - - def __init__(self, dir, target_dir): ''' ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will - - task_folder = os.listdir(dir) for file in task_folder: if file.endswith("csv"): - self.tasks = self.from_txt_csv(dir) + self.tasks = self.from_txt_csv() elif file.endswith("yml"): - self.tasks = self.from_yaml(dir) + self.tasks = self.from_yaml() # Both functions should have the same type return. porobably should be a list of PRompt_Task + def run(self, model): From f4e3322f760fd49a8b8928b95799434d01b712c4 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 6 Mar 2025 20:23:59 -0500 Subject: [PATCH 25/78] Planning out the run function and the task object creation --- benchtools/runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 1d7d973..4e5af38 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -6,7 +6,7 @@ from pathlib import Path from log_file.py import log_agent_interaction from itertools import product - +from log_file.py import log_agent_interaction class Bench(): ''' @@ -16,12 +16,13 @@ def __init__(self, dir, target_dir): ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will + task_folder = os.listdir(dir) for file in task_folder: if file.endswith("csv"): - self.tasks = self.from_txt_csv() + self.tasks = self.from_txt_csv(dir) elif file.endswith("yml"): - self.tasks = self.from_yaml() + self.tasks = self.from_yaml(dir) # Both functions should have the same type return. 
porobably should be a list of PRompt_Task From 85eab252be67014b6caf90c91eeda8333b881483 Mon Sep 17 00:00:00 2001 From: Francisco Xavier Vu Alves Pereira <54419505+Falvespereira@users.noreply.github.com> Date: Thu, 6 Mar 2025 17:53:15 -0500 Subject: [PATCH 26/78] update to include the answers --- benchtools/runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchtools/runner.py b/benchtools/runner.py index 4e5af38..3176ce3 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,5 +1,6 @@ # module to run benchmarks import os +import task import yaml # requires pyyaml import pandas from task import PromptTask @@ -8,6 +9,7 @@ from itertools import product from log_file.py import log_agent_interaction + class Bench(): ''' ''' From b172d6f110e4fea874be19f3c4fec422388854fd Mon Sep 17 00:00:00 2001 From: Benjamin Dahrooge Date: Wed, 5 Mar 2025 23:13:15 -0500 Subject: [PATCH 27/78] Ollama and OpenAI API support complete --- benchtools/task.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/benchtools/task.py b/benchtools/task.py index e18de38..ecc9470 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -1,3 +1,7 @@ +<<<<<<< HEAD +======= +# defines a class object for a task +>>>>>>> ad67394 (Ollama and OpenAI API support complete) from ollama import Client from openai import OpenAI @@ -26,6 +30,12 @@ def __init__( scoring_function : function handle or string if string, must be name of built in eval function provided here reference: string or number +<<<<<<< HEAD +======= + solution that will be passed with the model answer to the scoring function + runner_type: string {ollama} + the way the runner should be called, +>>>>>>> ad67394 (Ollama and OpenAI API support complete) solution that will be passed with the model answer runner_type: string {ollama,openai} define which runner should be used for the task. 
From dcdf391ece0e82a72bd8b58c06b33ab1281d474a Mon Sep 17 00:00:00 2001 From: Victoria Ryan <123494653+victoriaryan2@users.noreply.github.com> Date: Thu, 6 Mar 2025 15:59:51 -0500 Subject: [PATCH 28/78] Update runner.py yaml --- benchtools/runner.py | 39 +++++++++++++++++++++++---------------- benchtools/task.py | 6 ------ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 3176ce3..a66ed7b 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -103,20 +103,27 @@ def from_yaml(self, yaml_file): self.tasks.append(PromptTask(prompt=filled_prompt)) # Store task return self + - -# processed_values = [] -# for val in values: -# for key, value in val.items(): -# if isinstance(value, str): -# processed_values.append((key, list(map(int, value.split(','))))) -# else: -# processed_values.append((key, value)) -# keys = [key for key, _ in processed_values] -# value_lists = [value for _, value in processed_values] -# value_combinations = list(zip(*value_lists)) -# for combination in value_combinations: -# value_dict = dict(zip(keys, combination)) -# temp = template.format(**value_dict) -# self.tasks.append(temp) -# return self.tasks + def from_yaml(self, yaml_file): + with open(yaml_file, 'r') as file: + data = yaml.safe_load(file) + self.tasks = [] + for each in data: + template = each.get('template', '') + values = each.get('values', []) + processed_values = [] + for val in values: + for key, value in val.items(): + if isinstance(value, str): + processed_values.append((key, list(map(int, value.split(','))))) + else: + processed_values.append((key, value)) + keys = [key for key, _ in processed_values] + value_lists = [value for _, value in processed_values] + value_combinations = list(zip(*value_lists)) + for combination in value_combinations: + value_dict = dict(zip(keys, combination)) + temp = template.format(**value_dict) + self.tasks.append(temp) + return self.tasks diff --git a/benchtools/task.py b/benchtools/task.py index ecc9470..881d232 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -1,7 +1,4 @@ -<<<<<<< HEAD -======= # defines a class object for a task ->>>>>>> ad67394 (Ollama and OpenAI API support complete) from ollama import Client from openai import OpenAI @@ -30,12 +27,9 @@ def __init__( scoring_function : function handle or string if string, must be name of built in eval function provided here reference: string or number -<<<<<<< HEAD -======= solution that will be passed with the model answer to the scoring function runner_type: string {ollama} the way the runner should be called, ->>>>>>> ad67394 (Ollama and OpenAI API support complete) solution that will be passed with the model answer runner_type: string {ollama,openai} define which runner should be used for the task. From 87d789a33a7e400b312a8d0f9766408bcf666b50 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 5 Nov 2025 03:42:58 +0000 Subject: [PATCH 29/78] Miscops: Fixed YAML syntax --- demobench/miscops/task.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/demobench/miscops/task.yml b/demobench/miscops/task.yml index 206b397..4787603 100644 --- a/demobench/miscops/task.yml +++ b/demobench/miscops/task.yml @@ -1,7 +1,9 @@ - template: "find the product of {a} and {b}" values: - a: [2,3,5] - b: [3,4,5] + a: [2,3,5] + b: [3,4,5] + result: [6,12,25] - template: "what is the name for the following symbol? 
{symb}" values: - symb: ["@","$","#"] \ No newline at end of file + symb: ["@","$","#"] + result: ["at", "dollar sign", "pound"] \ No newline at end of file From 841e9a74d4f835af2fc767a2b38bb37de3b84ef8 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 5 Nov 2025 03:43:27 +0000 Subject: [PATCH 30/78] Add Task: added results to col3 --- demobench/add/values.csv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demobench/add/values.csv b/demobench/add/values.csv index 641cea4..183572a 100644 --- a/demobench/add/values.csv +++ b/demobench/add/values.csv @@ -1,4 +1,4 @@ -a,b -2,3 -4,5 -8,9 \ No newline at end of file +a,b,res +2,3,5 +4,5,9 +8,9,17 \ No newline at end of file From fcf41baf8cb9faf7adf3f4d719c49942e957dfe7 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 5 Nov 2025 03:44:19 +0000 Subject: [PATCH 31/78] Runner: small mods to extract data functions --- benchtools/runner.py | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index a66ed7b..6f274cf 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -57,19 +57,19 @@ def from_txt_csv(task_folder): textFile = open(task_folder + "task.txt", "r") csvFile = pandas.read_csv(task_folder + "values.csv") answers = pandas.read_csv(task_folder + "results") - x = 0 + storedTasks = [] storedAnswers = [] - while x < len(csvFile): + for x in range(len(csvFile)): processed_prompt = textFile.replace("{a}", csvFile.iloc[x,1]) processed_prompt.replace("{b}", csvFile.iloc[x, 2]) - storedAnswers.append(answers[x,1]) storedTasks.append(processed_prompt) + storedAnswers.append(csvFile.iloc[x, 3]) return storedTasks, storedAnswers - def from_yaml(self, yaml_file): + def from_yaml(yaml_file): """ Load tasks from a YAML file and generate PromptTask objects. 
@@ -86,11 +86,13 @@ def from_yaml(self, yaml_file): with open(yaml_file, 'r') as file: data = yaml.safe_load(file) - self.tasks = [] + storedTasks = [] + storedAnswers = [] for entry in data: template = entry["template"] # Extract template values_dict = entry["values"] # Extract values dictionary + storedAnswers = entry["result"] # Generate all possible value combinations using itertools.product keys = values_dict.keys() @@ -100,30 +102,6 @@ def from_yaml(self, yaml_file): for values in value_combinations: value_mapping = dict(zip(keys, values)) # Pair keys with values filled_prompt = template.format(**value_mapping) # Format the template - self.tasks.append(PromptTask(prompt=filled_prompt)) # Store task + storedTasks.append(filled_prompt) # Store task - return self - - - def from_yaml(self, yaml_file): - with open(yaml_file, 'r') as file: - data = yaml.safe_load(file) - self.tasks = [] - for each in data: - template = each.get('template', '') - values = each.get('values', []) - processed_values = [] - for val in values: - for key, value in val.items(): - if isinstance(value, str): - processed_values.append((key, list(map(int, value.split(','))))) - else: - processed_values.append((key, value)) - keys = [key for key, _ in processed_values] - value_lists = [value for _, value in processed_values] - value_combinations = list(zip(*value_lists)) - for combination in value_combinations: - value_dict = dict(zip(keys, combination)) - temp = template.format(**value_dict) - self.tasks.append(temp) - return self.tasks + return storedTasks, storedAnswers \ No newline at end of file From a95a9e8f72e11f4b107b7800f773c3b47f231186 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 5 Nov 2025 03:44:45 +0000 Subject: [PATCH 32/78] toml: added dependancies --- project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/project.toml b/project.toml index 8cbb88d..30c83d4 100644 --- a/project.toml +++ b/project.toml @@ -33,6 +33,8 @@ gui = ["PyQt5"] cli = [ "rich", "click", + "pyyaml", + "pandas", ] [project.urls] From 253b229165cd6fa5600736b2a48d800ed49b5548 Mon Sep 17 00:00:00 2001 From: Ayman Sandouk <111829133+AymanBx@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:40:50 +0300 Subject: [PATCH 33/78] Change option flags from uppercase to lowercase --- benchtools/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index b84a99e..b71be42 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -79,8 +79,8 @@ def init_repo(bench_name, bench_path): # Initialize the benchmark @click.command() @click.argument('benchmark_name', required = False) -@click.option('--path', '-P', default=".", help="The path where the benchmark repository will be") -@click.option('--about', '-A', default="", help="The Content that goes in the about.md file") +@click.option('--path', '-p', default=".", help="The path where the benchmark repository will be") +@click.option('--about', '-a', default="", help="The Content that goes in the about.md file") @click.option('--no-git', default=False, help="Don't make benchmark a git repository. 
Default is False") def init(benchmark_name, path, about, no_git): """Initializing a new benchmark.""" From 7e1e8e49e95395a5b477c4e8437489155ad75d2c Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 9 Nov 2025 05:48:51 +0000 Subject: [PATCH 34/78] CLI: Added setup_task function --- benchtools/cli.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index b71be42..b9ed116 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,5 +1,6 @@ import os import click +import shutil import requests # from task import PromptTask # from runner import Bench @@ -63,7 +64,8 @@ def init_repo(bench_name, bench_path): benchmark_path = os.path.join(bench_path, bench_name) os.chdir(benchmark_path) try: - os.system("git init .") + os.system("git init . -q") + os.system("git branch -m main") except: print("git might not be initialized in your system. Please run \"git init . \" when setup") # Get python gitignore template and create .gitignore @@ -73,27 +75,57 @@ def init_repo(bench_name, bench_path): f.write(ignore_text.text) os.chdir(current_dir) + +# Create a benchmarks folder with tasks in them +def setup_task(tasks_path, task_name, task_path): + + click.echo(f"Setting up {task_name}...", nl=False) + task_folder = os.path.join(tasks_path, task_name) + os.mkdir(task_folder) # TODO: check if folder exists and handle + + # Path could be absolute or relative, check and work accordingly + if not task_path.startswith('/'): + if task_path.startswith('./'): + # TODO: Path could have one or more `../` use relpath to fix this block + task_path = task_path[2:] + task_path = os.path.join(os.getcwd(), task_path) + # print(f" path {task_path}\n\n") # Debugging - + # could be a single file or a folder check and work accordignly + if os.path.isdir(task_path): + for sub in os.listdir(task_path): + shutil.copy2(os.path.join(task_path, sub), task_folder) + else: + shutil.copy2(task_path, task_folder) + click.echo("Success") + # Initialize the benchmark @click.command() -@click.argument('benchmark_name', required = False) +@click.argument('benchmark_name', required=False) @click.option('--path', '-p', default=".", help="The path where the benchmark repository will be") @click.option('--about', '-a', default="", help="The Content that goes in the about.md file") -@click.option('--no-git', default=False, help="Don't make benchmark a git repository. Default is False") -def init(benchmark_name, path, about, no_git): +@click.option('--no-git', is_flag=True, help="Don't make benchmark a git repository. 
Default is False") +@click.option('--tasks', '-t', type=(str, str), multiple=True, default=[], help="Add benchmark tasks to your benchmark ") +def init(benchmark_name, path, about, no_git, tasks): """Initializing a new benchmark.""" # new_benchmark = PromptTask() if not benchmark_name: benchmark_name = get_benchmark_name() click.echo("Creating " + benchmark_name + " in " + path) - os.mkdir(os.path.join(path, benchmark_name)) + bench_path = os.path.join(path, benchmark_name) + os.mkdir(bench_path) create_about(benchmark_name, path, about) # Initialize a git repo if not no_git: init_repo(benchmark_name, path) + # Create a benchmarks folder with tasks in them + tasks_path = os.path.join(bench_path, "benchmarks") + os.mkdir(tasks_path) + for task_name, task_path in tasks: + setup_task(tasks_path, task_name, task_path) + # What us creating a new task @click.command() From 8fb542b5743515e0549ed5549453ac1c49c861e0 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 9 Nov 2025 07:40:43 +0000 Subject: [PATCH 35/78] CLI: run command --- benchtools/cli.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index b9ed116..9fba22e 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -2,8 +2,8 @@ import click import shutil import requests +from benchtools.runner import Bench # from task import PromptTask -# from runner import Bench @click.group() def cli(): @@ -113,12 +113,17 @@ def init(benchmark_name, path, about, no_git, tasks): if not benchmark_name: benchmark_name = get_benchmark_name() click.echo("Creating " + benchmark_name + " in " + path) + + # Create the benchmark folder bench_path = os.path.join(path, benchmark_name) os.mkdir(bench_path) - create_about(benchmark_name, path, about) + # Initialize a git repo if not no_git: - init_repo(benchmark_name, path) + init_repo(benchmark_name, path) # TODO: bench_path? + + # Create about.md + create_about(benchmark_name, path, about) # TODO: bench_path? # Create a benchmarks folder with tasks in them tasks_path = os.path.join(bench_path, "benchmarks") @@ -126,6 +131,11 @@ def init(benchmark_name, path, about, no_git, tasks): for task_name, task_path in tasks: setup_task(tasks_path, task_name, task_path) + to_run = input(" Would you like to run the benchmark? y/n? 
") + print() + if to_run in ['y', 'Y', 'yes', "YES", 'Yes', 'yyes']: + benchmark = Bench(bench_path, "something") + # What us creating a new task @click.command() @@ -137,6 +147,16 @@ def add_task(task_name): click.echo("Adding " + task_name) +@click.command() +@click.argument('benchmark',type=str, required = True) +def run(benchmark: str): + """Running the benchmark and generating logs""" + bench_path = os.path.join(os.getcwd(), benchmark) + + # TODO: Fix relPath case + click.echo(f"Running {benchmark.rsplit('/',maxsplit=1)[1]} now") + benchmark = Bench(benchmark, "something") + @click.command() @click.argument('task_name', required = True) def run_task(task_name): @@ -146,6 +166,7 @@ def run_task(task_name): cli.add_command(init) +cli.add_command(run) cli.add_command(add_task) cli.add_command(run_task) cli.add_command(generate_demo_bench) From acde0524aeb24552b0b8d6d21ffdf69b227ca537 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 9 Nov 2025 07:41:42 +0000 Subject: [PATCH 36/78] Runner: Expect more than one task and add them to task list --- benchtools/runner.py | 150 +++++++++++++++++++++---------------------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 6f274cf..5265af7 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,40 +1,48 @@ # module to run benchmarks import os -import task import yaml # requires pyyaml import pandas -from task import PromptTask from pathlib import Path -from log_file.py import log_agent_interaction from itertools import product -from log_file.py import log_agent_interaction +# from benchtools.task import PromptTask +# from log_file.py import log_agent_interaction class Bench(): ''' ''' - def __init__(self, dir, target_dir): + def __init__(self, bench_dir, target_dir): ''' ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will - - task_folder = os.listdir(dir) - for file in task_folder: - if file.endswith("csv"): - self.tasks = self.from_txt_csv(dir) - elif file.endswith("yml"): - self.tasks = self.from_yaml(dir) + self.tasks = [] + + tasks_folder = os.path.join(bench_dir, "benchmarks") + tasks = os.listdir(tasks_folder) # Both functions should have the same type return. porobably should be a list of PRompt_Task + # TODO: use Prompt_Task + for task in tasks: + content = os.listdir(os.path.join(tasks_folder,task)) + for file in content: + if file.endswith("csv"): + self.tasks.append((task, from_txt_csv(os.path.join(tasks_folder,task)))) + elif file.endswith("yml"): + self.tasks.append((task, from_yaml(os.path.join(tasks_folder,task,file)))) + for name, (prompts, answers) in self.tasks: + print("Task: " + name) + print("Prompts: ", end='') + print(prompts) + print("Answers: ", end='') + print(answers) + def run(self, model): ''' - - ''' for task in self.tasks: (prompt, response) = task.run(model) @@ -44,64 +52,56 @@ def run(self, model): - - - - - # possibly private method? 
- def from_txt_csv(task_folder): - ''' - load a template from txt and create task objects for each row of a csv - ''' - # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor - textFile = open(task_folder + "task.txt", "r") - csvFile = pandas.read_csv(task_folder + "values.csv") - answers = pandas.read_csv(task_folder + "results") - - storedTasks = [] - storedAnswers = [] - for x in range(len(csvFile)): - processed_prompt = textFile.replace("{a}", csvFile.iloc[x,1]) - processed_prompt.replace("{b}", csvFile.iloc[x, 2]) - storedTasks.append(processed_prompt) - storedAnswers.append(csvFile.iloc[x, 3]) - - return storedTasks, storedAnswers - - - def from_yaml(yaml_file): - """ - Load tasks from a YAML file and generate PromptTask objects. - - Parameters - ---------- - yaml_file : str - Path to the YAML file containing task templates and values. - - Returns - ------- - self : Bench - The Bench instance with tasks populated. - """ - with open(yaml_file, 'r') as file: - data = yaml.safe_load(file) - - storedTasks = [] - storedAnswers = [] - - for entry in data: - template = entry["template"] # Extract template - values_dict = entry["values"] # Extract values dictionary - storedAnswers = entry["result"] - - # Generate all possible value combinations using itertools.product - keys = values_dict.keys() - value_combinations = zip(*values_dict.values()) - - # Create a PromptTask for each combination - for values in value_combinations: - value_mapping = dict(zip(keys, values)) # Pair keys with values - filled_prompt = template.format(**value_mapping) # Format the template - storedTasks.append(filled_prompt) # Store task - - return storedTasks, storedAnswers \ No newline at end of file +# possibly private method? # TODO: Fix csv indexing? +def from_txt_csv(task_folder): + ''' + load a template from txt and create task objects for each row of a csv + ''' + # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor + prompt = "" + with open(os.path.join(task_folder, "task.txt"), "r") as f: + prompt = f.read() + + csvFile = pandas.read_csv(os.path.join(task_folder, "values.csv")) + # answers = pandas.read_csv(os.path.join(task_folder, "results")) + storedTasks = [] + storedAnswers = [] + for x in range(len(csvFile)): + processed_prompt = prompt.replace("{a}", str(csvFile.iloc[x,0])) + processed_prompt = processed_prompt.replace("{b}", str(csvFile.iloc[x, 1])) + storedTasks.append(processed_prompt) + print("Prompt: "+ processed_prompt) + storedAnswers.append(str(csvFile.iloc[x, 2])) + + return (storedTasks, storedAnswers) + + +def from_yaml(yaml_file): + """ + Load tasks from a YAML file and generate PromptTask objects. + Parameters + ---------- + yaml_file : str + Path to the YAML file containing task templates and values. + Returns + ------- + self : Bench + The Bench instance with tasks populated. 
+ """ + with open(yaml_file, 'r') as file: + data = yaml.safe_load(file) + storedTasks = [] + storedAnswers = [] + for entry in data: + template = entry["template"] # Extract template + values_dict = entry["values"] # Extract values dictionary + storedAnswers = entry["result"] + # Generate all possible value combinations using itertools.product + keys = values_dict.keys() + value_combinations = zip(*values_dict.values()) + # Create a PromptTask for each combination + for values in value_combinations: + value_mapping = dict(zip(keys, values)) # Pair keys with values + filled_prompt = template.format(**value_mapping) # Format the template + storedTasks.append(filled_prompt) # Store task + return (storedTasks, storedAnswers) \ No newline at end of file From 2aa21df678b71ae81717162b6e479c960cefdc2b Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 15 Nov 2025 01:19:19 +0000 Subject: [PATCH 37/78] Runner: Store results correctly --- benchtools/runner.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 5265af7..3b4c032 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -92,10 +92,10 @@ def from_yaml(yaml_file): data = yaml.safe_load(file) storedTasks = [] storedAnswers = [] - for entry in data: - template = entry["template"] # Extract template - values_dict = entry["values"] # Extract values dictionary - storedAnswers = entry["result"] + for sub_task in data: + template = sub_task["template"] # Extract template + values_dict = sub_task["values"] # Extract values dictionary + answers = sub_task["result"] # Generate all possible value combinations using itertools.product keys = values_dict.keys() value_combinations = zip(*values_dict.values()) @@ -104,4 +104,6 @@ def from_yaml(yaml_file): value_mapping = dict(zip(keys, values)) # Pair keys with values filled_prompt = template.format(**value_mapping) # Format the template storedTasks.append(filled_prompt) # Store task + for answer in answers: + storedAnswers.append(answer) return (storedTasks, storedAnswers) \ No newline at end of file From 97d473ed61ce40795fd24efeb28c18ced9a2a7ea Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 4 Dec 2025 00:13:11 +0000 Subject: [PATCH 38/78] Task/Runner: Runner now uses improved task class --- benchtools/runner.py | 89 ++++------------------ benchtools/task.py | 171 +++++++++++++++++++++++++++++++++---------- 2 files changed, 147 insertions(+), 113 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 3b4c032..e751195 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,109 +1,52 @@ # module to run benchmarks import os -import yaml # requires pyyaml -import pandas from pathlib import Path from itertools import product -# from benchtools.task import PromptTask +from benchtools.task import Task # from log_file.py import log_agent_interaction class Bench(): ''' ''' - def __init__(self, bench_dir, target_dir): + def __init__(self, bench_dir): ''' ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will + self.directory = bench_dir self.tasks = [] tasks_folder = os.path.join(bench_dir, "benchmarks") tasks = os.listdir(tasks_folder) - # Both functions should have the same type return. 
porobably should be a list of PRompt_Task - # TODO: use Prompt_Task for task in tasks: content = os.listdir(os.path.join(tasks_folder,task)) for file in content: if file.endswith("csv"): - self.tasks.append((task, from_txt_csv(os.path.join(tasks_folder,task)))) + self.tasks.append(Task('csv', task, os.path.join(tasks_folder,task))) elif file.endswith("yml"): - self.tasks.append((task, from_yaml(os.path.join(tasks_folder,task,file)))) + self.tasks.append(Task('yml', task, os.path.join(tasks_folder,task,file))) - for name, (prompts, answers) in self.tasks: + for task in self.tasks: + name, prompts, answers = task.name, task.sub_tasks, task.answers print("Task: " + name) print("Prompts: ", end='') print(prompts) print("Answers: ", end='') print(answers) - + print("Responses: ", end='') + task.run("gemma3") + print(task.responses) - def run(self, model): + + def run(self, model, api_url=None): ''' ''' for task in self.tasks: - (prompt, response) = task.run(model) - log_agent_interaction(prompt, response) - task.score() - - - - -# possibly private method? # TODO: Fix csv indexing? -def from_txt_csv(task_folder): - ''' - load a template from txt and create task objects for each row of a csv - ''' - # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor - prompt = "" - with open(os.path.join(task_folder, "task.txt"), "r") as f: - prompt = f.read() - - csvFile = pandas.read_csv(os.path.join(task_folder, "values.csv")) - # answers = pandas.read_csv(os.path.join(task_folder, "results")) - storedTasks = [] - storedAnswers = [] - for x in range(len(csvFile)): - processed_prompt = prompt.replace("{a}", str(csvFile.iloc[x,0])) - processed_prompt = processed_prompt.replace("{b}", str(csvFile.iloc[x, 1])) - storedTasks.append(processed_prompt) - print("Prompt: "+ processed_prompt) - storedAnswers.append(str(csvFile.iloc[x, 2])) - - return (storedTasks, storedAnswers) - - -def from_yaml(yaml_file): - """ - Load tasks from a YAML file and generate PromptTask objects. - Parameters - ---------- - yaml_file : str - Path to the YAML file containing task templates and values. - Returns - ------- - self : Bench - The Bench instance with tasks populated. 
- """ - with open(yaml_file, 'r') as file: - data = yaml.safe_load(file) - storedTasks = [] - storedAnswers = [] - for sub_task in data: - template = sub_task["template"] # Extract template - values_dict = sub_task["values"] # Extract values dictionary - answers = sub_task["result"] - # Generate all possible value combinations using itertools.product - keys = values_dict.keys() - value_combinations = zip(*values_dict.values()) - # Create a PromptTask for each combination - for values in value_combinations: - value_mapping = dict(zip(keys, values)) # Pair keys with values - filled_prompt = template.format(**value_mapping) # Format the template - storedTasks.append(filled_prompt) # Store task - for answer in answers: - storedAnswers.append(answer) - return (storedTasks, storedAnswers) \ No newline at end of file + task.run(model, api_url) + # log_agent_interaction(prompt, response) + # task.score() + print(task.responses) \ No newline at end of file diff --git a/benchtools/task.py b/benchtools/task.py index 881d232..322ea73 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -1,19 +1,21 @@ # defines a class object for a task -from ollama import Client -from openai import OpenAI +# from openai import OpenAI +import os +import yaml # requires pyyaml +import pandas +from ollama import chat, ChatResponse, Client -from scorerers import exact_match +# from scorerers import exact_match +# scoring_fx = {"exact_match": exact_match} -scoring_fx = {"exact_match": exact_match} - -class PromptTask: +class Task: """ defines a basic prompt task with a simple scoring function """ def __init__( - self, prompt=None, scoring_function=None, reference=None, runner_type="ollama" + self, data_type, name, path, scoring_function=None, reference=None, runner_type="ollama" ): """ init a task object @@ -22,8 +24,7 @@ def __init__( ---------- dir : string or path directory containing the task assets - prompt : string - prompt that will pass to the model + scoring_function : function handle or string if string, must be name of built in eval function provided here reference: string or number @@ -36,7 +37,20 @@ def __init__( to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434 to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable """ - self.prompt = prompt + self.name = name + self.sub_tasks = [] + self.answers = [] + match data_type: + case 'csv': + prompt, answer = from_txt_csv(path) + self.sub_tasks=(prompt) + self.answers=(answer) + case 'yml': + prompt, answer = from_yaml(path) + self.sub_tasks=(prompt) + self.answers=(answer) + + if type(scoring_function) is str: self.scoring_function = scoring_fx[scoring_function] else: @@ -44,6 +58,9 @@ def __init__( self.reference = reference self.runner_type = runner_type + self.responses = [] + + def run(self, model, api_url=None): """ @@ -56,38 +73,54 @@ def run(self, model, api_url=None): api_url : string the url of the api to use for the task """ - match self.runner_type: - case "ollama": - client = Client( - host=api_url if api_url else "http://localhost:11434", - ) - response = client.chat( - model, - messages=[ + + for sub_task in self.sub_tasks: + print(sub_task) + + match self.runner_type: + case "ollama": + response: ChatResponse = chat(model=model, messages=[ { - "role": "user", - "content": self.prompt, + 'role': 'user', + 'content':sub_task, }, - ], - ) - return (self.prompt, response["message"]["content"]) - case "openai": - client = OpenAI( - 
base_url=api_url if api_url else "https://api.openai.com/v1", - ) - chat_completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": self.prompt, - } - ], - ) - return (self.prompt, chat_completion.choices[0].message.content) - case _: - print(f"Runner type {self.runner_type} not supported") - return None + ]) + # print("response: " + response.message.content) + self.responses.append(response.message.content) + + case "ollama_api": + client = Client( + host=api_url if api_url else "http://localhost:11434", + ) + response = client.chat( + model, + messages=[ + { + "role": "user", + "content": sub_task, + }, + ], + ) + self.responses.append(response["message"]["content"]) + + case "openai": + client = OpenAI( + base_url=api_url if api_url else "https://api.openai.com/v1", + ) + chat_completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": sub_task, + } + ], + ) + self.responses.append(chat_completion.choices[0].message.content) + case _: + print(f"Runner type {self.runner_type} not supported") + return None + def score(self, response): """ @@ -104,3 +137,61 @@ def score(self, response): # additional classes for other types of tasks # likely an agent task that can pass environment assets + + +# possibly private method? # TODO: Fix csv indexing? +def from_txt_csv(task_folder): + ''' + load a template from txt and create task objects for each row of a csv + ''' + # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor + prompt = "" + with open(os.path.join(task_folder, "task.txt"), "r") as f: + prompt = f.read() + + csvFile = pandas.read_csv(os.path.join(task_folder, "values.csv")) + # answers = pandas.read_csv(os.path.join(task_folder, "results")) + storedTasks = [] + storedAnswers = [] + for x in range(len(csvFile)): + processed_prompt = prompt.replace("{a}", str(csvFile.iloc[x,0])) + processed_prompt = processed_prompt.replace("{b}", str(csvFile.iloc[x, 1])) + storedTasks.append(processed_prompt) + print("Prompt: "+ processed_prompt) + storedAnswers.append(str(csvFile.iloc[x, 2])) + + return (storedTasks, storedAnswers) + + +def from_yaml(yaml_file): + """ + Load tasks from a YAML file and generate PromptTask objects. + Parameters + ---------- + yaml_file : str + Path to the YAML file containing task templates and values. + Returns + ------- + self : Bench + The Bench instance with tasks populated. 
+ """ + with open(yaml_file, 'r') as file: + data = yaml.safe_load(file) + storedTasks = [] + storedAnswers = [] + for sub_task in data: + template = sub_task["template"] # Extract template + values_dict = sub_task["values"] # Extract values dictionary + answers = sub_task["result"] + # Generate all possible value combinations using itertools.product + keys = values_dict.keys() + value_combinations = zip(*values_dict.values()) + # Create a PromptTask for each combination + for values in value_combinations: + value_mapping = dict(zip(keys, values)) # Pair keys with values + filled_prompt = template.format(**value_mapping) # Format the template + print("Prompt: "+ filled_prompt) + storedTasks.append(filled_prompt) # Store task + for answer in answers: + storedAnswers.append(answer) + return (storedTasks, storedAnswers) \ No newline at end of file From c95607e2fb8c24423202028bee3123d917a2055b Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 4 Dec 2025 06:31:46 +0000 Subject: [PATCH 39/78] CLI: small mods on args and commands --- benchtools/cli.py | 81 ++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 9fba22e..d509921 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -44,10 +44,10 @@ def generate_demo_bench(bench_name): def get_benchmark_name(): return input("Enter the name of your benchmark/project (will be used as folder and repo name)...\n") - +# TODO: Move to designer ### Generate an about path from the description of the user def create_about(bench_name, bench_path, text): - about_path = os.path.join(bench_path, bench_name, "about.md") + about_path = os.path.join(bench_path, "about.md") about_text= f""" # {bench_name} {text} @@ -58,11 +58,11 @@ def create_about(bench_name, bench_path, text): file.write(about_text) +# TODO: Move to designer ### Initialize git repository -def init_repo(bench_name, bench_path): +def init_repo(bench_path): current_dir = os.getcwd() - benchmark_path = os.path.join(bench_path, bench_name) - os.chdir(benchmark_path) + os.chdir(bench_path) try: os.system("git init . -q") os.system("git branch -m main") @@ -76,6 +76,7 @@ def init_repo(bench_name, bench_path): os.chdir(current_dir) +# TODO: Move to designer? # Create a benchmarks folder with tasks in them def setup_task(tasks_path, task_name, task_path): @@ -100,62 +101,76 @@ def setup_task(tasks_path, task_name, task_path): click.echo("Success") +# TODO: Import from designer # Initialize the benchmark @click.command() @click.argument('benchmark_name', required=False) -@click.option('--path', '-p', default=".", help="The path where the benchmark repository will be") -@click.option('--about', '-a', default="", help="The Content that goes in the about.md file") -@click.option('--no-git', is_flag=True, help="Don't make benchmark a git repository. Default is False") -@click.option('--tasks', '-t', type=(str, str), multiple=True, default=[], help="Add benchmark tasks to your benchmark ") +@click.option('-p', '--path', help="The path where the new benchmark repository will be placed", default=".", type=str) +@click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) +@click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) +@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). 
Format: ", default=[], type=(str, str), multiple=True) def init(benchmark_name, path, about, no_git, tasks): """Initializing a new benchmark.""" - # new_benchmark = PromptTask() + if not benchmark_name: - benchmark_name = get_benchmark_name() + benchmark_name = input("Enter the name of your benchmark/project (will be used as folder and repo name)...\n") click.echo("Creating " + benchmark_name + " in " + path) # Create the benchmark folder - bench_path = os.path.join(path, benchmark_name) - os.mkdir(bench_path) - + if path.startswith('/'): + abs_path = path + else: + abs_path = os.path.abspath(path) + bench_path = os.path.join(abs_path, benchmark_name) + + + # TODO: Move to designer? + os.mkdir(bench_path) # Initialize a git repo if not no_git: - init_repo(benchmark_name, path) # TODO: bench_path? + init_repo(bench_path) # Create about.md - create_about(benchmark_name, path, about) # TODO: bench_path? + create_about(benchmark_name, bench_path, about) # Create a benchmarks folder with tasks in them tasks_path = os.path.join(bench_path, "benchmarks") os.mkdir(tasks_path) + for task_name, task_path in tasks: setup_task(tasks_path, task_name, task_path) - to_run = input(" Would you like to run the benchmark? y/n? ") + to_run = input("Would you like to run the benchmark? y/n? ") print() if to_run in ['y', 'Y', 'yes', "YES", 'Yes', 'yyes']: - benchmark = Bench(bench_path, "something") + benchmark = Bench(bench_path) -# What us creating a new task @click.command() -@click.argument('task_name', required = True) -# @click.option() -def add_task(task_name): - """Setting up a new task.""" - # new_task = PromptTask() - click.echo("Adding " + task_name) +@click.argument('benchmark_path', required = True, type=str) +@click.argument('task_name', required = True, type=str) +@click.argument('task_path', required = True, type=str) +def add_task(benchmark_path, task_name, task_path): + """Set up a new task.""" + bench_path = os.path.abspath(benchmark_path) + if os.path.exists(bench_path): + tasks_path = os.path.join(bench_path, "benchmarks") + if not os.path.exists(tasks_path): + os.mkdir(tasks_path) + setup_task(tasks_path, task_name, task_path) + else: + click.echo("No benchmark reposiory at " + bench_path) @click.command() -@click.argument('benchmark',type=str, required = True) -def run(benchmark: str): +@click.argument('benchmark_path', required = True, type=str) +def run(benchmark_path: str): """Running the benchmark and generating logs""" - bench_path = os.path.join(os.getcwd(), benchmark) + bench_path = os.path.abspath(benchmark_path) - # TODO: Fix relPath case click.echo(f"Running {benchmark.rsplit('/',maxsplit=1)[1]} now") - benchmark = Bench(benchmark, "something") + benchmark = Bench(bench_path) + @click.command() @click.argument('task_name', required = True) @@ -166,10 +181,10 @@ def run_task(task_name): cli.add_command(init) -cli.add_command(run) cli.add_command(add_task) -cli.add_command(run_task) -cli.add_command(generate_demo_bench) +cli.add_command(run) +# cli.add_command(run_task) +# cli.add_command(generate_demo_bench) if __name__ == "__main__": From 3e39319fad3923c81a039046a0b41af5497b2d98 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 6 Dec 2025 02:37:30 +0000 Subject: [PATCH 40/78] CLI: changing the name of function to benchtool to match bash command --- benchtools/cli.py | 16 ++++++---------- project.toml | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index d509921..0725164 100644 --- a/benchtools/cli.py +++ 
b/benchtools/cli.py @@ -6,7 +6,7 @@ # from task import PromptTask @click.group() -def cli(): +def benchtool(): """ BenchTools is a tool that helps researchers set up benchmarks. """ @@ -180,12 +180,8 @@ def run_task(task_name): click.echo(f"Running {task_name} now") -cli.add_command(init) -cli.add_command(add_task) -cli.add_command(run) -# cli.add_command(run_task) -# cli.add_command(generate_demo_bench) - - -if __name__ == "__main__": - cli() +benchtool.add_command(init) +benchtool.add_command(add_task) +benchtool.add_command(run) +# benchtool.add_command(run_task) +# benchtool.add_command(generate_demo_bench) \ No newline at end of file diff --git a/project.toml b/project.toml index 339ffa1..bae4c7a 100644 --- a/project.toml +++ b/project.toml @@ -44,6 +44,6 @@ Repository = "https://github.com/ml4sts/benchtools" [project.scripts] -bench-cli = "benchtools.cli:main" +benchtool = "benchtools.cli:benchtool" diff --git a/setup.py b/setup.py index 3439fe2..244a5c9 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ ], entry_points={ 'console_scripts': [ - 'benchtool = benchtools.cli:cli', + 'benchtool = benchtools.cli:benchtool', ], }, ) \ No newline at end of file From 3d913341bf48378f7f4f43ceb484882eb724177b Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 6 Dec 2025 07:11:27 +0000 Subject: [PATCH 41/78] Designer: Finally added modularity so that the bench class can be used either through the CLI or imported into a new code --- benchtools/cli.py | 182 ++++++++++------------------------------- benchtools/designer.py | 65 ++++++++++++++- benchtools/runner.py | 63 +++++++++----- 3 files changed, 151 insertions(+), 159 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 0725164..7a0fa87 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,7 +1,5 @@ import os import click -import shutil -import requests from benchtools.runner import Bench # from task import PromptTask @@ -12,176 +10,86 @@ def benchtool(): """ pass - -## AS: Come back to this soon -@click.command() -@click.argument('bench_name') -def generate_demo_bench(bench_name): - """ - Generate a demo benchmark - """ - - # Set up directory for the demo bench - current_dir = os.getcwd() - parent_dir = os.path.abspath(os.path.join(current_dir, "..")) - # Change this to use the current path or possibly take a path as an argument - demo_bench_path = os.path.join(parent_dir, bench_name) - os.makedirs(demo_bench_path, exist_ok=True) - - tasks_folder = os.path.join(demo_bench_path, "Tasks") - report_folder = os.path.join(demo_bench_path, "Report") - - os.makedirs(tasks_folder, exist_ok=True) - os.makedirs(report_folder, exist_ok=True) - - click.echo(f"Folder '{bench_name}' created at {demo_bench_path}") - click.echo("Subfolders 'Tasks' and 'Report' created.") - - -## Sub-functions for the init method - -### If user calls init without a benchmark name as an argument -def get_benchmark_name(): - return input("Enter the name of your benchmark/project (will be used as folder and repo name)...\n") - -# TODO: Move to designer -### Generate an about path from the description of the user -def create_about(bench_name, bench_path, text): - about_path = os.path.join(bench_path, "about.md") - about_text= f""" - # {bench_name} - {text} - - Generated by BenchTools - """ - with open(about_path, 'w') as file: - file.write(about_text) - - -# TODO: Move to designer -### Initialize git repository -def init_repo(bench_path): - current_dir = os.getcwd() - os.chdir(bench_path) - try: - os.system("git init . 
-q") - os.system("git branch -m main") - except: - print("git might not be initialized in your system. Please run \"git init . \" when setup") - # Get python gitignore template and create .gitignore - ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore") - if ignore_text.status_code == 200: - with open(".gitignore", 'a') as f: - f.write(ignore_text.text) - os.chdir(current_dir) - - -# TODO: Move to designer? -# Create a benchmarks folder with tasks in them -def setup_task(tasks_path, task_name, task_path): - - click.echo(f"Setting up {task_name}...", nl=False) - task_folder = os.path.join(tasks_path, task_name) - os.mkdir(task_folder) # TODO: check if folder exists and handle - - # Path could be absolute or relative, check and work accordingly - if not task_path.startswith('/'): - if task_path.startswith('./'): - # TODO: Path could have one or more `../` use relpath to fix this block - task_path = task_path[2:] - task_path = os.path.join(os.getcwd(), task_path) - # print(f" path {task_path}\n\n") # Debugging - - # could be a single file or a folder check and work accordignly - if os.path.isdir(task_path): - for sub in os.listdir(task_path): - shutil.copy2(os.path.join(task_path, sub), task_folder) - else: - shutil.copy2(task_path, task_folder) - click.echo("Success") - - # TODO: Import from designer # Initialize the benchmark -@click.command() -@click.argument('benchmark_name', required=False) +@benchtool.command() +@click.argument('benchmark-name', required=False) @click.option('-p', '--path', help="The path where the new benchmark repository will be placed", default=".", type=str) @click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) @click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) @click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) -def init(benchmark_name, path, about, no_git, tasks): +def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): """Initializing a new benchmark.""" if not benchmark_name: - benchmark_name = input("Enter the name of your benchmark/project (will be used as folder and repo name)...\n") - click.echo("Creating " + benchmark_name + " in " + path) + benchmark_name = click.prompt("Enter the name of your benchmark/project (will be used as folder and repo name)", type=str) - # Create the benchmark folder + + # TODO: Handle existing benchmark + if not os.path.exists(path): + try: + path = path[:-1] if path.endswith('/') else path + split_path = path.rsplit('/', 1) + if split_path[1] == benchmark_name: + path = path[0] + else: + raise ValueError("The passed path doesn't exist.") + except Exception as e: + # click.echo("The passed path doesn't exist.", nl=False) + # path = click.prompt("Enter an existing path where the new benchmark folder will be created.") + click.echo("The passed path doesn't exist.") + exit(4356) + + # Handle passed path to setup an absolute benchmark path if path.startswith('/'): abs_path = path else: abs_path = os.path.abspath(path) bench_path = os.path.join(abs_path, benchmark_name) - - # TODO: Move to designer? 
- os.mkdir(bench_path) - # Initialize a git repo - if not no_git: - init_repo(bench_path) - - # Create about.md - create_about(benchmark_name, bench_path, about) + click.echo(f"Creating {benchmark_name} in {bench_path}") + benchmark = Bench(benchmark_name, bench_path) + if benchmark.build(about, no_git, tasks): + click.echo(f"Built {benchmark_name} benchmark successfully!") - # Create a benchmarks folder with tasks in them - tasks_path = os.path.join(bench_path, "benchmarks") - os.mkdir(tasks_path) - - for task_name, task_path in tasks: - setup_task(tasks_path, task_name, task_path) + # TODO: Call betterbench CLI here - to_run = input("Would you like to run the benchmark? y/n? ") - print() - if to_run in ['y', 'Y', 'yes', "YES", 'Yes', 'yyes']: - benchmark = Bench(bench_path) + # Run? + to_run = click.prompt("Would you like to run the benchmark?", type=click.Choice(['y','n'], case_sensitive=False), show_choices=True) + if to_run in ['y', 'Y']: + benchmark.run() -@click.command() -@click.argument('benchmark_path', required = True, type=str) -@click.argument('task_name', required = True, type=str) -@click.argument('task_path', required = True, type=str) +@benchtool.command() +@click.argument('benchmark-path', required = True, type=str) +@click.argument('task-name', required = True, type=str) +@click.argument('task-path', required = True, type=str) def add_task(benchmark_path, task_name, task_path): """Set up a new task.""" bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): - tasks_path = os.path.join(bench_path, "benchmarks") - if not os.path.exists(tasks_path): - os.mkdir(tasks_path) - setup_task(tasks_path, task_name, task_path) + bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path + benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) + benchmark.new_task(task_name, task_path) else: click.echo("No benchmark reposiory at " + bench_path) -@click.command() +@benchtool.command() @click.argument('benchmark_path', required = True, type=str) def run(benchmark_path: str): """Running the benchmark and generating logs""" bench_path = os.path.abspath(benchmark_path) - - click.echo(f"Running {benchmark.rsplit('/',maxsplit=1)[1]} now") - benchmark = Bench(bench_path) + if os.path.exists(bench_path): + bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path + benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) + click.echo(f"Running {benchmark.bench_name} now") + benchmark.run() -@click.command() +@benchtool.command() @click.argument('task_name', required = True) def run_task(task_name): """Running the tasks and generating logs""" - click.echo(f"Running {task_name} now") - - -benchtool.add_command(init) -benchtool.add_command(add_task) -benchtool.add_command(run) -# benchtool.add_command(run_task) -# benchtool.add_command(generate_demo_bench) \ No newline at end of file + click.echo(f"Running {task_name} now") \ No newline at end of file diff --git a/benchtools/designer.py b/benchtools/designer.py index e9c035e..24eaa22 100644 --- a/benchtools/designer.py +++ b/benchtools/designer.py @@ -1,4 +1,67 @@ # module to build benchmarks -# should create folder structure +# should create folder structure +import os +import shutil +import requests +# from pathlib import Path # ??? 
+### Create benchmark skeleton +def build_dir(bench_path): + + os.mkdir(bench_path) + # Create a benchmarks folder with tasks in them + tasks_path = os.path.join(bench_path, "benchmarks") + os.mkdir(tasks_path) + +### Generate an about path from the description of the user +def create_about(bench_name, bench_path, text): + about_path = os.path.join(bench_path, "about.md") + about_text= f""" + # {bench_name} + {text} + + Generated by BenchTools + """ + with open(about_path, 'w') as file: + file.write(about_text) + +### Initialize git repository +def init_repo(bench_path): + current_dir = os.getcwd() + os.chdir(bench_path) + try: + os.system("git init . -q") + os.system("git branch -m main") + except: + print("git might not be initialized in your system. Please run \"git init . \" when setup") + # Get python gitignore template and create .gitignore + ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore") + if ignore_text.status_code == 200: + with open(".gitignore", 'a') as f: + f.write(ignore_text.text) + os.chdir(current_dir) + + +# Create a benchmarks folder with tasks in them +def setup_task(tasks_path, task_name, task_path): + + print(f"Setting up {task_name}...", end='') + task_folder = os.path.join(tasks_path, task_name) + os.mkdir(task_folder) # TODO: check if folder exists and handle + + # Path could be absolute or relative, check and work accordingly + if not task_path.startswith('/'): + if task_path.startswith('./'): + # TODO: Path could have one or more `../` use relpath to fix this block + task_path = task_path[2:] + task_path = os.path.join(os.getcwd(), task_path) + # print(f" path {task_path}\n\n") # Debugging + + # could be a single file or a folder check and work accordignly + if os.path.isdir(task_path): + for sub in os.listdir(task_path): + shutil.copy2(os.path.join(task_path, sub), task_folder) + else: + shutil.copy2(task_path, task_folder) + print("Success") \ No newline at end of file diff --git a/benchtools/runner.py b/benchtools/runner.py index e751195..9ec0768 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,52 +1,73 @@ # module to run benchmarks import os -from pathlib import Path -from itertools import product from benchtools.task import Task +from benchtools.designer import build_dir, init_repo, create_about, setup_task # from log_file.py import log_agent_interaction class Bench(): ''' ''' - def __init__(self, bench_dir): + def __init__(self, name, path): ''' ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. 
# loading will - self.directory = bench_dir + self.bench_name = name + self.bench_path = path + self.tasks_folder = os.path.join(self.bench_path, 'benchmarks') self.tasks = [] + self.built = os.path.exists(self.bench_path) - tasks_folder = os.path.join(bench_dir, "benchmarks") - tasks = os.listdir(tasks_folder) + + def build(self, about_text, no_git, new_tasks) -> bool: + + # Create benchmark skeleton + build_dir(self.bench_path) + + # Create about.md + create_about(self.bench_name, self.bench_path, about_text) + + # Initialize a git repo + if not no_git: + init_repo(self.bench_path) + + for task_name, task_path in new_tasks: + self.new_task(task_name, task_path) + + self.built = True + return self.built + + + def new_task(self, task_name, task_path): + if self.built: + self.tasks.append(setup_task(self.tasks_folder, task_name, task_path)) + + + def run(self, model='gemma3', api_url=None): + ''' + ''' + tasks = os.listdir(self.tasks_folder) for task in tasks: - content = os.listdir(os.path.join(tasks_folder,task)) + task_folder = os.path.join(self.tasks_folder,task) + content = os.listdir(task_folder) for file in content: if file.endswith("csv"): - self.tasks.append(Task('csv', task, os.path.join(tasks_folder,task))) + self.tasks.append(Task('csv', task, task_folder)) elif file.endswith("yml"): - self.tasks.append(Task('yml', task, os.path.join(tasks_folder,task,file))) + self.tasks.append(Task('yml', task, os.path.join(self.task_folder,file))) for task in self.tasks: + print("\n\n\n") name, prompts, answers = task.name, task.sub_tasks, task.answers print("Task: " + name) print("Prompts: ", end='') print(prompts) print("Answers: ", end='') print(answers) + task.run(model, api_url) print("Responses: ", end='') - task.run("gemma3") print(task.responses) - - - - - def run(self, model, api_url=None): - ''' - ''' - for task in self.tasks: - task.run(model, api_url) # log_agent_interaction(prompt, response) - # task.score() - print(task.responses) \ No newline at end of file + # task.score() \ No newline at end of file From dd8c2a9ea6672e664bf9e2db94fa76ef350b9226 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 28 Jan 2026 23:02:23 +0000 Subject: [PATCH 42/78] Logger: Moved log_file.py to be modified for new techniques --- log_file.py => benchtools/logger.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) rename log_file.py => benchtools/logger.py (71%) diff --git a/ log_file.py b/benchtools/logger.py similarity index 71% rename from log_file.py rename to benchtools/logger.py index 9bf90cd..ee17169 100644 --- a/ log_file.py +++ b/benchtools/logger.py @@ -1,8 +1,11 @@ -import logging import os -#agent_log.text that is if we will like to name it that way , it could be anything -agent_name = os.path.join(os.getcwd(), 'agent_log.txt') -logging.basicConfig(filename=agent_name, +import logging + +log_folder: str + +def init_logger(log_path, task_name): + log_file = os.path.join(log_path, f'{task_name}_log.txt') + logging.basicConfig(filename=log_file, level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s') def log_agent_interaction(agent_input, agent_output): From c0df28aceb66139a8c1dda7c9d790bfa2fc8628f Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 28 Jan 2026 23:05:16 +0000 Subject: [PATCH 43/78] task.py: incorporating logger --- benchtools/task.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/benchtools/task.py b/benchtools/task.py index 322ea73..275968c 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ 
-4,6 +4,7 @@ import yaml # requires pyyaml import pandas from ollama import chat, ChatResponse, Client +from benchtools.logger import init_logger, log_agent_interaction # from scorerers import exact_match # scoring_fx = {"exact_match": exact_match} @@ -15,7 +16,7 @@ class Task: """ def __init__( - self, data_type, name, path, scoring_function=None, reference=None, runner_type="ollama" + self, data_type, task_name, path, log_path, scoring_function=None, reference=None, runner_type="ollama" ): """ init a task object @@ -37,18 +38,19 @@ def __init__( to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434 to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable """ - self.name = name + self.name = task_name + self.log_path = log_path self.sub_tasks = [] self.answers = [] match data_type: case 'csv': - prompt, answer = from_txt_csv(path) - self.sub_tasks=(prompt) - self.answers=(answer) + prompts, answers = from_txt_csv(path) + self.sub_tasks += prompts + self.answers += answers case 'yml': - prompt, answer = from_yaml(path) - self.sub_tasks=(prompt) - self.answers=(answer) + prompts, answers = from_yaml(path) + self.sub_tasks += prompts + self.answers += answers if type(scoring_function) is str: @@ -60,6 +62,7 @@ def __init__( self.runner_type = runner_type self.responses = [] + init_logger(self.log_path, self.name) def run(self, model, api_url=None): @@ -75,7 +78,7 @@ def run(self, model, api_url=None): """ for sub_task in self.sub_tasks: - print(sub_task) + # print(sub_task) match self.runner_type: case "ollama": @@ -120,6 +123,8 @@ def run(self, model, api_url=None): case _: print(f"Runner type {self.runner_type} not supported") return None + + log_agent_interaction(sub_task, response.message.content) def score(self, response): From 50cd021ec15a2ecce86737853a3d2c9a795112cc Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 28 Jan 2026 23:06:01 +0000 Subject: [PATCH 44/78] runner.py: adding logs folder and incorporating logger --- benchtools/runner.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 9ec0768..70857fd 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -16,6 +16,7 @@ def __init__(self, name, path): self.bench_name = name self.bench_path = path self.tasks_folder = os.path.join(self.bench_path, 'benchmarks') + self.log_folder = os.path.join(self.bench_path, 'logs') self.tasks = [] self.built = os.path.exists(self.bench_path) @@ -33,18 +34,18 @@ def build(self, about_text, no_git, new_tasks) -> bool: init_repo(self.bench_path) for task_name, task_path in new_tasks: - self.new_task(task_name, task_path) + self.add_task(task_name, task_path) self.built = True return self.built - def new_task(self, task_name, task_path): + def add_task(self, task_name, task_path): if self.built: self.tasks.append(setup_task(self.tasks_folder, task_name, task_path)) - def run(self, model='gemma3', api_url=None): + def run(self, tasks_torun=[], model='gemma3', api_url=None): ''' ''' tasks = os.listdir(self.tasks_folder) @@ -53,21 +54,25 @@ def run(self, model='gemma3', api_url=None): content = os.listdir(task_folder) for file in content: if file.endswith("csv"): - self.tasks.append(Task('csv', task, task_folder)) + self.tasks.append(Task('csv', task, task_folder, self.log_folder)) elif file.endswith("yml"): - self.tasks.append(Task('yml', task, os.path.join(self.task_folder,file))) - + 
self.tasks.append(Task('yml', task, os.path.join(task_folder,file), self.log_folder)) + + tasks_torun = self.tasks if tasks_torun == [] else tasks_torun + print(tasks_torun) + print(self.tasks) for task in self.tasks: - print("\n\n\n") - name, prompts, answers = task.name, task.sub_tasks, task.answers - print("Task: " + name) - print("Prompts: ", end='') - print(prompts) - print("Answers: ", end='') - print(answers) - task.run(model, api_url) - print("Responses: ", end='') - print(task.responses) + if task.name in tasks_torun: + print("\n\n\n") + name, prompts, answers = task.name, task.sub_tasks, task.answers + print("Task: " + name) + print("Prompts: ", end='') + print(prompts) + print("Answers: ", end='') + print(answers) + task.run(model, api_url) + print("Responses: ", end='') + print(task.responses) # log_agent_interaction(prompt, response) # task.score() \ No newline at end of file From edbd188f801a253ebd1605eef1ec60058e9e17c6 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 28 Jan 2026 23:08:10 +0000 Subject: [PATCH 45/78] cli.py: Making all cli functions create a Bench object to work with --- benchtools/cli.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 7a0fa87..4faa0fe 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -10,7 +10,6 @@ def benchtool(): """ pass -# TODO: Import from designer # Initialize the benchmark @benchtool.command() @click.argument('benchmark-name', required=False) @@ -59,6 +58,7 @@ def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): if to_run in ['y', 'Y']: benchmark.run() +## TODO: Is it computationally better to use pickle to save the object in the benchmark folder?? @benchtool.command() @click.argument('benchmark-path', required = True, type=str) @@ -70,13 +70,13 @@ def add_task(benchmark_path, task_name, task_path): if os.path.exists(bench_path): bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) - benchmark.new_task(task_name, task_path) + benchmark.add_task(task_name, task_path) else: click.echo("No benchmark reposiory at " + bench_path) @benchtool.command() -@click.argument('benchmark_path', required = True, type=str) +@click.argument('benchmark-path', required = True, type=str) def run(benchmark_path: str): """Running the benchmark and generating logs""" bench_path = os.path.abspath(benchmark_path) @@ -88,8 +88,13 @@ def run(benchmark_path: str): @benchtool.command() +@click.argument('benchmark-path', required = True, type=str) @click.argument('task_name', required = True) -def run_task(task_name): +def run_task(benchmark_path: str, task_name): """Running the tasks and generating logs""" - - click.echo(f"Running {task_name} now") \ No newline at end of file + bench_path = os.path.abspath(benchmark_path) + if os.path.exists(bench_path): + bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path + benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) + click.echo(f"Running {task_name} now") + benchmark.run([task_name]) \ No newline at end of file From dcdc84edb821a0ec5a225d5e53f68358aaf6e528 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 28 Jan 2026 23:09:14 +0000 Subject: [PATCH 46/78] designer.py: creating logs folder. 
And first step to loading openai datasets from hf --- benchtools/designer.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/benchtools/designer.py b/benchtools/designer.py index 24eaa22..5ca6d28 100644 --- a/benchtools/designer.py +++ b/benchtools/designer.py @@ -3,6 +3,7 @@ import os import shutil import requests +from datasets import load_dataset # from pathlib import Path # ??? @@ -13,6 +14,8 @@ def build_dir(bench_path): # Create a benchmarks folder with tasks in them tasks_path = os.path.join(bench_path, "benchmarks") os.mkdir(tasks_path) + log_path = os.path.join(bench_path, "logs") + os.mkdir(log_path) ### Generate an about path from the description of the user def create_about(bench_name, bench_path, text): @@ -44,12 +47,18 @@ def init_repo(bench_path): # Create a benchmarks folder with tasks in them -def setup_task(tasks_path, task_name, task_path): +def setup_task(tasks_path: str, task_name: str, task_path: str): print(f"Setting up {task_name}...", end='') task_folder = os.path.join(tasks_path, task_name) os.mkdir(task_folder) # TODO: check if folder exists and handle + if task_path.startswith('openai'): + download_dataset(task_folder, task_path) + print("Success") + return + + # Path could be absolute or relative, check and work accordingly if not task_path.startswith('/'): if task_path.startswith('./'): @@ -64,4 +73,20 @@ def setup_task(tasks_path, task_name, task_path): shutil.copy2(os.path.join(task_path, sub), task_folder) else: shutil.copy2(task_path, task_folder) - print("Success") \ No newline at end of file + print("Success") + +def download_dataset(task_folder: str, hf_path: str): + with open(os.path.join(task_folder, 'task.txt'), 'w') as f: + f.write('{p}') + + dataset = load_dataset(hf_path) + dataset_test = dataset['test'] + + with open(os.path.join(task_folder, 'values.csv'), 'w') as f: + f.write('p,res') + for row in dataset_test: + prompt = row['prompt'] + answer = row['canonical_solution'] + f.write(f"{prompt,answer}") + + From 5f3ff64a1a2386d9f61d7735d643de8d47712ece Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 28 Jan 2026 23:09:57 +0000 Subject: [PATCH 47/78] README: added some example runs --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 2af7773..8d088bc 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,25 @@ a python library designed to help people design and run LLM benchmarks ## Usage +``` +benchtool init new_test -p ../ -t add ../datasets/add/ -t Gaps ../datasets/miscops/ -a "this is a demo for benchtools" +``` +``` +Creating new_test in ../ +Setting up add...Success +Setting up Gaps...Success +Would you like to run the benchmark? y/n? n +``` +``` +benchtool add-task ../new_test/ FillIn ../datasets/miscops/ +``` +``` +Setting up FillIn...Success +``` +``` +benchtool run testRuns/111 +``` ### CLI ``` From f477710473a2e45459cccd5b3bb068f31190a520 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 11 Feb 2026 23:38:41 +0000 Subject: [PATCH 48/78] docs: Setting up documentation with sphinx. 
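
Docs are built with Sphinx (MyST markdown sources under docs/source) and
published by the new GitHub Actions workflow. For a local preview, mirroring
the steps the CI job runs (assuming the documentation dependencies in
requirements.txt install cleanly in your environment):

    pip install -r requirements.txt
    pip install -e . --no-deps --force-reinstall
    cd docs
    make html    # HTML output lands in docs/build/html (Makefile sets BUILDDIR = build)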
--- .github/workflows/publish.yml | 45 +++++++++++++++++ docs/Makefile | 20 ++++++++ docs/index.md | 15 ------ docs/make.bat | 35 +++++++++++++ docs/source/cli.md | 30 ++++++++++++ docs/source/conf.py | 92 +++++++++++++++++++++++++++++++++++ docs/source/index.md | 56 +++++++++++++++++++++ docs/source/pylib.md | 1 + requirements.txt | 6 ++- 9 files changed, 284 insertions(+), 16 deletions(-) create mode 100644 .github/workflows/publish.yml create mode 100644 docs/Makefile delete mode 100644 docs/index.md create mode 100644 docs/make.bat create mode 100644 docs/source/cli.md create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.md create mode 100644 docs/source/pylib.md diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..c45d8c9 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,45 @@ +name: deploydocs + +# Only run this when the main branch changes +on: + push: + branches: + - main + - doc + # If your git repository has the Jupyter Book within some-subfolder next to + # unrelated files, you can make this run only if a file within that specific + # folder has been modified. + # + # paths: + # - some-subfolder/** + +# This job installs dependencies, build the book, and pushes it to `gh-pages` +jobs: + deploy-book: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4.1.1 + + # Install dependencies + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install -e . --no-deps --force-reinstall + + # Build the book + - name: Build the docs + run: | + cd docs + make html + + # Push the book's HTML to github-pages + - name: GitHub Pages action + uses: peaceiris/actions-gh-pages@v3.6.1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: docs/_build/html \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 2cb05f4..0000000 --- a/docs/index.md +++ /dev/null @@ -1,15 +0,0 @@ -# Benchtools - -a library for building and running benchmarks - -## Use - -*it does not run yet* - - -benchtools allows you to express templated tasks in multiple ways: -- a yaml format -- a txt file with tempalte and a csv file of values - -a benchmark can consist of tasks that all fit a single format above or a mixture of meta-tasks each represented as a folder -and then the specific tasks in one of the forms above \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/cli.md b/docs/source/cli.md new file mode 100644 index 0000000..23aa9e1 --- /dev/null +++ b/docs/source/cli.md @@ -0,0 +1,30 @@ +# CLI + + +``` +benchtool init new_test -p ../ -t add ../datasets/add/ -t Gaps ../datasets/miscops/ -a "this is a demo for benchtools" +``` +``` +Creating new_test in ../ +Setting up add...Success +Setting up Gaps...Success +Would you like to run the benchmark? y/n? n +``` +``` +benchtool add-task ../new_test/ FillIn ../datasets/miscops/ +``` +``` +Setting up FillIn...Success +``` + +``` +benchtool run testRuns/111 +``` + +```{eval-rst} +.. click:: benchtools.cli:benchtool + :prog: benchtool + :nested: full + :commands: + +``` \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..02150d5 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,92 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'BenchTools' +copyright = '2026, Ayman Sandouk, Sarah M Brown' +author = 'Ayman Sandouk, Sarah M Brown' +release = '0.0.1' + + +import os +import sys +sys.path.insert(0, os.path.abspath('../../benchtools/')) + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "myst_nb", + 'sphinx.ext.intersphinx', + "sphinx_design", + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx_click' +] + +templates_path = ['_templates'] +exclude_patterns = ["README.md", 'demobench/*', 'build/*', '_build', 'Thumbs.db', "*import_posts*"] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'pydata_sphinx_theme' + +html_theme_options = { + "show_nav_level": 2, + "header_links_before_dropdown": 6, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/ml4sts/benchtools", + "icon": "fa-brands fa-github", + }, + { + "name": "Course", + "url": "https://evalai2eval.github.io/", + "icon": "fa-solid fa-school", + }], + "secondary_sidebar_items": { + "**/*": ["page-toc", "edit-this-page", "sourcelink"], + } +} + + +# MyST config +myst_enable_extensions = [ + # "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "fieldlist", + "html_admonition", + "html_image", + # "linkify", + "replacements", + "smartquotes", + "strikethrough", + "substitution", + # "tasklist", +] + +# html_favicon = "_static/favicon.ico" +# change this to change the site title +html_title = project + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +# html_extra_path = ["feed.xml"] +# map pages to which sidebar they should have +# "page_file_name": ["list.html", "of.html", "sidebar.html", "files.html"] +html_sidebars = { + "*": [], + "**/*": ["sidebar-nav-bs",] +} diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..47108a6 --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,56 @@ +.. BenchTools documentation master file, created by + sphinx-quickstart on Wed Feb 11 16:55:24 2026. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + + + +# BenchTools + +A library for building and running benchmarks + + +## Install + +You can install after cloning to work locally or directly from github. + +### By clone + +You can clone first +``` +git clone https://github.com/ml4sts/benchtools.git +``` + +and then install +``` +pip install benchtools +``` +(possibly `pip3`) + +if you clone in order to develop, you may want to install with pip's `-e` option + +``` +pip install -e benchtools +``` + +To update, pull and install again. 
+ +## Usage + +benchtools allows you to express templated tasks in multiple ways: +- a yaml format +- a txt file with tempalte and a csv file of values + +a benchmark can consist of tasks that all fit a single format above or a mixture of meta-tasks each represented as a folder +and then the specific tasks in one of the forms above + +There are two main ways to use BenchTools. The user can mix and match between the two methods. + +```{toctree} +:caption: Contents: +:maxdepth: 2 + +cli.md +pylib.md + +``` \ No newline at end of file diff --git a/docs/source/pylib.md b/docs/source/pylib.md new file mode 100644 index 0000000..a39eabd --- /dev/null +++ b/docs/source/pylib.md @@ -0,0 +1 @@ +# BenchTools as a Python Library \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a1eb96b..599e084 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,8 @@ pyyaml pandas click ollama -openai \ No newline at end of file +openai +sphinx +sphinx-click +sphinx-design +pydata_sphinx_theme \ No newline at end of file From 29416c8514f9fd5aa78b7a9f6631f2f48534a1af Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 11 Feb 2026 23:43:57 +0000 Subject: [PATCH 49/78] doc: Adding myst to requirments --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 599e084..fd725ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,8 @@ pandas click ollama openai +myst-nb +myst-parser sphinx sphinx-click sphinx-design From 1085a81763c602562db0d2d4836cf44fa223d614 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 12 Feb 2026 00:22:34 +0000 Subject: [PATCH 50/78] Fixing workflow with correct build directory name --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c45d8c9..a22d3a5 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -42,4 +42,4 @@ jobs: uses: peaceiris/actions-gh-pages@v3.6.1 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: docs/_build/html \ No newline at end of file + publish_dir: docs/build/html \ No newline at end of file From 3accbee92e98db0f21c8921104cb9dd685446200 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 12 Feb 2026 05:16:08 +0000 Subject: [PATCH 51/78] requirements: Adding datasets --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fd725ab..111da9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ pandas click ollama openai +datasets myst-nb myst-parser sphinx From 0d0685583090292d48383f9ec70334b5e0e0b668 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 12 Feb 2026 05:17:32 +0000 Subject: [PATCH 52/78] doc: Adding some docstrings to experiment with sphinx --- benchtools/cli.py | 23 ++++++++++++++++++++++- benchtools/designer.py | 5 ++++- docs/source/index.md | 13 +++++++++++-- docs/source/pylib.md | 24 +++++++++++++++++++++++- 4 files changed, 60 insertions(+), 5 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 4faa0fe..0c43839 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -18,7 +18,28 @@ def benchtool(): @click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) @click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). 
Format: ", default=[], type=(str, str), multiple=True) def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): - """Initializing a new benchmark.""" + """ + Initializes a new benchmark. + + Even though the command doesn't have any required arguments. If the argument wasn't passed the interface will ask for a name and wouldn't continue without one. + + This command is the starting point. With this, the process of creating a benchmark structure and guiding the user into the correct mindset of a benchmark. + + After running this command, the folder structure of the benchmark will be created. Task files will be loaded, the user will be asked a series of questions to demonstrate the correct mindset of benchmarking, and finally, the user will be given the choice to run the benchmark or not. + + :param benchmark_name: The name of the benchmark and the folder to be created. + :type benchmark_name: str. + :param path: The path in which the benchmark folder will be created. Default is `.`. + :type path: str. + :param about: A description of the benchmark, its purpose and its goal. Will be used to create an `about.md`. + :type about: str. + :param no_git: For the user to choose not to initialize a git repository for the benchmark. + :type no_git: bool. + :param tasks: A list of tasks can be provided from the get go to be loaded into the benchmark folder. The list consists of tuples of , + :type tasks: str,str. + + + """ if not benchmark_name: benchmark_name = click.prompt("Enter the name of your benchmark/project (will be used as folder and repo name)", type=str) diff --git a/benchtools/designer.py b/benchtools/designer.py index 5ca6d28..694d9a2 100644 --- a/benchtools/designer.py +++ b/benchtools/designer.py @@ -7,8 +7,11 @@ # from pathlib import Path # ??? -### Create benchmark skeleton def build_dir(bench_path): + """ + Create benchmark skeleton + :param bench_path: Path to where the folder will be created + """ os.mkdir(bench_path) # Create a benchmarks folder with tasks in them diff --git a/docs/source/index.md b/docs/source/index.md index 47108a6..533269b 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,7 +1,7 @@ -.. BenchTools documentation master file, created by + @@ -53,4 +53,13 @@ There are two main ways to use BenchTools. The user can mix and match between th cli.md pylib.md +``` + + +```{eval-rst} +.. click:: benchtools.cli:benchtool + :prog: benchtools + :nested: full + :commands: + ``` \ No newline at end of file diff --git a/docs/source/pylib.md b/docs/source/pylib.md index a39eabd..4e52ba2 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -1 +1,23 @@ -# BenchTools as a Python Library \ No newline at end of file +# BenchTools as a Python Library + + +## Design Benchmark Directory + +```{eval-rst} +.. automodule:: benchtools.designer + :members: +``` + +## Build Directory + +```{eval-rst} +.. autoclass:: benchtools.designer.build_dir + :members: +``` + +## Build Directory + +```{eval-rst} +.. 
automodule:: benchtools.designer.build_dir + :members: +``` \ No newline at end of file From 6f182475d1249cface0043ae294e0e5470475af6 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Thu, 12 Feb 2026 10:03:17 -0500 Subject: [PATCH 53/78] denote missing dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 244a5c9..cd9dac9 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ version='0.1', packages=find_namespace_packages(), install_requires=[ - 'Click' + 'Click', 'ollama' ], entry_points={ 'console_scripts': [ From cd20fb988d8984a7d180aee04e1bac3989c2a314 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Thu, 12 Feb 2026 10:25:50 -0500 Subject: [PATCH 54/78] begin documentation and more dependencies --- benchtools/cli.py | 19 ++++++++----------- benchtools/designer.py | 20 ++++++++++++++++++++ benchtools/logger.py | 17 +++++++++++++++-- benchtools/runner.py | 36 +++++++++++++++++++++++++++++++++++- setup.py | 2 +- 5 files changed, 79 insertions(+), 15 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 4faa0fe..202555d 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -16,9 +16,10 @@ def benchtool(): @click.option('-p', '--path', help="The path where the new benchmark repository will be placed", default=".", type=str) @click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) @click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) -@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) -def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): - """Initializing a new benchmark.""" +@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=tuple(str, str), multiple=True) +def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:tuple(str,str)): + """Initialize a new benchmark. + """ if not benchmark_name: benchmark_name = click.prompt("Enter the name of your benchmark/project (will be used as folder and repo name)", type=str) @@ -39,12 +40,8 @@ def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): click.echo("The passed path doesn't exist.") exit(4356) - # Handle passed path to setup an absolute benchmark path - if path.startswith('/'): - abs_path = path - else: - abs_path = os.path.abspath(path) - bench_path = os.path.join(abs_path, benchmark_name) + # create full path + bench_path = os.path.join(path, benchmark_name) click.echo(f"Creating {benchmark_name} in {bench_path}") benchmark = Bench(benchmark_name, bench_path) @@ -61,8 +58,8 @@ def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): ## TODO: Is it computationally better to use pickle to save the object in the benchmark folder?? @benchtool.command() -@click.argument('benchmark-path', required = True, type=str) -@click.argument('task-name', required = True, type=str) +@click.argument('benchmark-path', required = True, type=str, help="The path to the benchmark repository where the task will be added.") +@click.argument('task-name', required = True, type=str, help="The name of the task to be added. 
This will be used as the folder name for the task and should be unique within the benchmark.") @click.argument('task-path', required = True, type=str) def add_task(benchmark_path, task_name, task_path): """Set up a new task.""" diff --git a/benchtools/designer.py b/benchtools/designer.py index 5ca6d28..9f7ab85 100644 --- a/benchtools/designer.py +++ b/benchtools/designer.py @@ -9,6 +9,18 @@ ### Create benchmark skeleton def build_dir(bench_path): + ''' + Create benchmrk skeleton + + Parameters: + ----------- + bench_path: str + The path to the benchmark folder. This folder will be created if it does not exist. + + Returns: + -------- + + ''' os.mkdir(bench_path) # Create a benchmarks folder with tasks in them @@ -31,6 +43,14 @@ def create_about(bench_name, bench_path, text): ### Initialize git repository def init_repo(bench_path): + ''' + Initialize the benchmark folder as git repo with gitiginore for python + + Parameters: + ----------- + bench_path: str + The path to the benchmark folder + ''' current_dir = os.getcwd() os.chdir(bench_path) try: diff --git a/benchtools/logger.py b/benchtools/logger.py index ee17169..e8efdba 100644 --- a/benchtools/logger.py +++ b/benchtools/logger.py @@ -4,6 +4,16 @@ log_folder: str def init_logger(log_path, task_name): + '''' + Initializes the logger for a specific task. + + Parameters: + ------------- + log_path: str + The path to the log folder where the log file will be created. + task_name: str + The name of the task for which the logger is being initialized. This will be used to name the log file. + ''' log_file = os.path.join(log_path, f'{task_name}_log.txt') logging.basicConfig(filename=log_file, level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s') @@ -13,8 +23,11 @@ def log_agent_interaction(agent_input, agent_output): Logs the agent's input and output to a file. Parameters: - agent_input (str): The input provided to the agent. - agent_output (str): The output generated by the agent. + ------------- + agent_input: string + The input provided to the agent. + agent_output: string + The output generated by the agent. """ logging.info(f'Input: {agent_input}') logging.info(f'Output: {agent_output}') diff --git a/benchtools/runner.py b/benchtools/runner.py index 70857fd..540525c 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -10,10 +10,18 @@ class Bench(): ''' def __init__(self, name, path): ''' + initialize the benchmark object with the name and path to the benchmark folder. + + Parameters: + ----------- + name: str + name of the benchmark will be used for folder + path: str or buffer + path to the benchmark folder. If the folder does not exist, it will be created ''' # load tasks from file strucutre and instantiate task objects for each, store those in a list. # loading will - self.bench_name = name + self.bench_name = name.strip().replace(" ", "_").lower() self.bench_path = path self.tasks_folder = os.path.join(self.bench_path, 'benchmarks') self.log_folder = os.path.join(self.bench_path, 'logs') @@ -22,6 +30,22 @@ def __init__(self, name, path): def build(self, about_text, no_git, new_tasks) -> bool: + ''' + + Parameters: + ----------- + about_text: str + description of the benchmark to be included in the about.md file + no_git: bool + whether to initialize a git repository in the benchmark folder + new_tasks: list of tuples (task_name, task_path) + list of tasks to be added to the benchmark. 
Each task is represented as a tuple containing + + Returns: + -------- + self.built : bool + True if the benchmark was successfully built, False otherwise + ''' # Create benchmark skeleton build_dir(self.bench_path) @@ -41,12 +65,22 @@ def build(self, about_text, no_git, new_tasks) -> bool: def add_task(self, task_name, task_path): + if self.built: self.tasks.append(setup_task(self.tasks_folder, task_name, task_path)) def run(self, tasks_torun=[], model='gemma3', api_url=None): ''' + Run the benchmark by running each task in the benchmark and logging the interactions. + Parameters: +----------- +tasks_torun: list of str + A list of task names to run. If empty, all tasks will be run. +model: str default 'gemma3' + The name of the model to use for running the tasks. Default is 'gemma3'. +api_url: str + The URL of the API to use for running the tasks. If None, the default API ''' tasks = os.listdir(self.tasks_folder) for task in tasks: diff --git a/setup.py b/setup.py index cd9dac9..b6b981f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ version='0.1', packages=find_namespace_packages(), install_requires=[ - 'Click', 'ollama' + 'Click', 'ollama', 'pandas', 'yaml' ], entry_points={ 'console_scripts': [ From d26ab3a4f954dfff9aab1e7f95fb7a3452744bd2 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Thu, 12 Feb 2026 21:58:53 -0500 Subject: [PATCH 55/78] get it to run enough to give help function and some cleanup --- benchtools/cli.py | 51 ++++++++++++++------------ benchtools/runner.py | 4 +- benchtools/task.py | 2 +- demobench/{ => tasks}/add/task.txt | 0 demobench/{ => tasks}/add/values.csv | 0 demobench/{ => tasks}/miscops/task.yml | 0 setup.py | 2 +- 7 files changed, 32 insertions(+), 27 deletions(-) rename demobench/{ => tasks}/add/task.txt (100%) rename demobench/{ => tasks}/add/values.csv (100%) rename demobench/{ => tasks}/miscops/task.yml (100%) diff --git a/benchtools/cli.py b/benchtools/cli.py index 60d10a2..19df0f9 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -16,27 +16,22 @@ def benchtool(): @click.option('-p', '--path', help="The path where the new benchmark repository will be placed", default=".", type=str) @click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) @click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) -@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) -def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): +@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", + default=[], type=(str, str), multiple=True) +def init(benchmark_name, path, about, no_git, tasks): """ Initializes a new benchmark. - Even though the command doesn't have any required arguments. If the argument wasn't passed the interface will ask for a name and wouldn't continue without one. + Even though the command doesn't have any required arguments. If the + argument wasn't passed the interface will ask for a name and wouldn't continue without one. - This command is the starting point. With this, the process of creating a benchmark structure and guiding the user into the correct mindset of a benchmark. + This command is the starting point. With this, the process of creating a benchmark + structure and guiding the user into the correct mindset of a benchmark. 
- After running this command, the folder structure of the benchmark will be created. Task files will be loaded, the user will be asked a series of questions to demonstrate the correct mindset of benchmarking, and finally, the user will be given the choice to run the benchmark or not. - - :param benchmark_name: The name of the benchmark and the folder to be created. - :type benchmark_name: str. - :param path: The path in which the benchmark folder will be created. Default is `.`. - :type path: str. - :param about: A description of the benchmark, its purpose and its goal. Will be used to create an `about.md`. - :type about: str. - :param no_git: For the user to choose not to initialize a git repository for the benchmark. - :type no_git: bool. - :param tasks: A list of tasks can be provided from the get go to be loaded into the benchmark folder. The list consists of tuples of , - :type tasks: str,str. + After running this command, the folder structure of the benchmark will be created. + Task files will be loaded, the user will be asked a series of questions to demonstrate + the correct mindset of benchmarking, and finally, the user will be given the choice to + run the benchmark or not. """ @@ -71,18 +66,24 @@ def init(benchmark_name:str, path:str, about:str, no_git:bool, tasks:(str,str)): # TODO: Call betterbench CLI here # Run? - to_run = click.prompt("Would you like to run the benchmark?", type=click.Choice(['y','n'], case_sensitive=False), show_choices=True) - if to_run in ['y', 'Y']: + to_run = click.confirm("Do you want to run the benchmark now?", default=True) + if to_run: benchmark.run() ## TODO: Is it computationally better to use pickle to save the object in the benchmark folder?? + @benchtool.command() -@click.argument('benchmark-path', required = True, type=str, help="The path to the benchmark repository where the task will be added.") -@click.argument('task-name', required = True, type=str, help="The name of the task to be added. This will be used as the folder name for the task and should be unique within the benchmark.") +@click.argument('benchmark-path', required = True, type=str,) +# +@click.argument('task-name', required = True, type=str,) @click.argument('task-path', required = True, type=str) def add_task(benchmark_path, task_name, task_path): - """Set up a new task.""" + """ + Set up a new task. + + # TODO explain arguments or convert to options. 
to use help + """ bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path @@ -95,7 +96,9 @@ def add_task(benchmark_path, task_name, task_path): @benchtool.command() @click.argument('benchmark-path', required = True, type=str) def run(benchmark_path: str): - """Running the benchmark and generating logs""" + """ + Running the benchmark and generating logs + """ bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path @@ -108,7 +111,9 @@ def run(benchmark_path: str): @click.argument('benchmark-path', required = True, type=str) @click.argument('task_name', required = True) def run_task(benchmark_path: str, task_name): - """Running the tasks and generating logs""" + """ + Running the tasks and generating logs + """ bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path diff --git a/benchtools/runner.py b/benchtools/runner.py index 540525c..dd79d01 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,4 +1,4 @@ -# module to run benchmarks +# module to run tasks import os from benchtools.task import Task from benchtools.designer import build_dir, init_repo, create_about, setup_task @@ -23,7 +23,7 @@ def __init__(self, name, path): # loading will self.bench_name = name.strip().replace(" ", "_").lower() self.bench_path = path - self.tasks_folder = os.path.join(self.bench_path, 'benchmarks') + self.tasks_folder = os.path.join(self.bench_path, 'tasks') self.log_folder = os.path.join(self.bench_path, 'logs') self.tasks = [] self.built = os.path.exists(self.bench_path) diff --git a/benchtools/task.py b/benchtools/task.py index 275968c..3bc83e1 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -54,7 +54,7 @@ def __init__( if type(scoring_function) is str: - self.scoring_function = scoring_fx[scoring_function] + self.scoring_function = scoring_function[scoring_function] else: self.scoring_function = scoring_function diff --git a/demobench/add/task.txt b/demobench/tasks/add/task.txt similarity index 100% rename from demobench/add/task.txt rename to demobench/tasks/add/task.txt diff --git a/demobench/add/values.csv b/demobench/tasks/add/values.csv similarity index 100% rename from demobench/add/values.csv rename to demobench/tasks/add/values.csv diff --git a/demobench/miscops/task.yml b/demobench/tasks/miscops/task.yml similarity index 100% rename from demobench/miscops/task.yml rename to demobench/tasks/miscops/task.yml diff --git a/setup.py b/setup.py index b6b981f..5050cf6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ version='0.1', packages=find_namespace_packages(), install_requires=[ - 'Click', 'ollama', 'pandas', 'yaml' + 'Click', 'ollama', 'pandas', 'pyyaml', 'datasets' ], entry_points={ 'console_scripts': [ From 53acd888c32416521ca28abd5c4a9b377c146664 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 14 Feb 2026 17:08:36 +0000 Subject: [PATCH 56/78] Some bug fixes and cleaning up --- benchtools/cli.py | 26 +++++++++++++++++++------- benchtools/designer.py | 2 +- benchtools/runner.py | 26 +++++++++++++------------- benchtools/task.py | 4 ++-- demobench/logs/.gitkeep | 0 5 files changed, 35 insertions(+), 23 deletions(-) create mode 100644 demobench/logs/.gitkeep diff --git a/benchtools/cli.py b/benchtools/cli.py index 19df0f9..262439e 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ 
-16,8 +16,7 @@ def benchtool(): @click.option('-p', '--path', help="The path where the new benchmark repository will be placed", default=".", type=str) @click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) @click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) -@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", - default=[], type=(str, str), multiple=True) +@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) def init(benchmark_name, path, about, no_git, tasks): """ Initializes a new benchmark. @@ -38,7 +37,7 @@ def init(benchmark_name, path, about, no_git, tasks): if not benchmark_name: benchmark_name = click.prompt("Enter the name of your benchmark/project (will be used as folder and repo name)", type=str) - + benchmark_name = benchmark_name.strip().replace(" ", "_").lower() # TODO: Handle existing benchmark if not os.path.exists(path): @@ -60,10 +59,13 @@ def init(benchmark_name, path, about, no_git, tasks): click.echo(f"Creating {benchmark_name} in {bench_path}") benchmark = Bench(benchmark_name, bench_path) + + # Build the benchmark folder if benchmark.build(about, no_git, tasks): click.echo(f"Built {benchmark_name} benchmark successfully!") # TODO: Call betterbench CLI here + # betterbench() # Run? to_run = click.confirm("Do you want to run the benchmark now?", default=True) @@ -74,15 +76,17 @@ def init(benchmark_name, path, about, no_git, tasks): @benchtool.command() -@click.argument('benchmark-path', required = True, type=str,) -# -@click.argument('task-name', required = True, type=str,) +@click.argument('benchmark-path', required = True, type=str) +@click.argument('task-name', required = True, type=str) @click.argument('task-path', required = True, type=str) def add_task(benchmark_path, task_name, task_path): """ Set up a new task. # TODO explain arguments or convert to options. to use help + benchmark-path: "The path to the benchmark repository where the task will be added." + task-name: "The name of the task to be added. This will be used as the folder name for the task and should be unique within the benchmark." + task-path "The relative path to the dataset used for the task. OR any dataset from huggingface that starts with `openai/`" """ bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): @@ -98,6 +102,7 @@ def add_task(benchmark_path, task_name, task_path): def run(benchmark_path: str): """ Running the benchmark and generating logs + , help="The path to the benchmark repository where all the task reside." """ bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): @@ -113,10 +118,17 @@ def run(benchmark_path: str): def run_task(benchmark_path: str, task_name): """ Running the tasks and generating logs + + , help="The path to the benchmark repository where all the task reside." 
+ , help="The name of the specific task you would like to run" """ bench_path = os.path.abspath(benchmark_path) if os.path.exists(bench_path): bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) click.echo(f"Running {task_name} now") - benchmark.run([task_name]) \ No newline at end of file + benchmark.run([task_name]) + +# For debugging +if __name__ == '__main__': + init() \ No newline at end of file diff --git a/benchtools/designer.py b/benchtools/designer.py index 35e01fb..87c1989 100644 --- a/benchtools/designer.py +++ b/benchtools/designer.py @@ -23,7 +23,7 @@ def build_dir(bench_path): os.mkdir(bench_path) # Create a benchmarks folder with tasks in them - tasks_path = os.path.join(bench_path, "benchmarks") + tasks_path = os.path.join(bench_path, "tasks") os.mkdir(tasks_path) log_path = os.path.join(bench_path, "logs") os.mkdir(log_path) diff --git a/benchtools/runner.py b/benchtools/runner.py index dd79d01..682938f 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,4 +1,4 @@ -# module to run tasks +# module to create and run benchmarks import os from benchtools.task import Task from benchtools.designer import build_dir, init_repo, create_about, setup_task @@ -10,7 +10,7 @@ class Bench(): ''' def __init__(self, name, path): ''' - initialize the benchmark object with the name and path to the benchmark folder. + Initialize the benchmark object with the name and path to the benchmark folder. Parameters: ----------- @@ -74,13 +74,13 @@ def run(self, tasks_torun=[], model='gemma3', api_url=None): ''' Run the benchmark by running each task in the benchmark and logging the interactions. Parameters: ------------ -tasks_torun: list of str - A list of task names to run. If empty, all tasks will be run. -model: str default 'gemma3' - The name of the model to use for running the tasks. Default is 'gemma3'. -api_url: str - The URL of the API to use for running the tasks. If None, the default API + ----------- + tasks_torun: list of str + A list of task names to run. If empty, all tasks will be run. + model: str default 'gemma3' + The name of the model to use for running the tasks. Default is 'gemma3'. + api_url: str + The URL of the API to use for running the tasks. 
If None, the default API ''' tasks = os.listdir(self.tasks_folder) for task in tasks: @@ -92,12 +92,12 @@ def run(self, tasks_torun=[], model='gemma3', api_url=None): elif file.endswith("yml"): self.tasks.append(Task('yml', task, os.path.join(task_folder,file), self.log_folder)) - tasks_torun = self.tasks if tasks_torun == [] else tasks_torun - print(tasks_torun) - print(self.tasks) + tasks_torun = [task.name for task in self.tasks] if tasks_torun == [] else tasks_torun + # print(tasks_torun) + # print(self.tasks) for task in self.tasks: if task.name in tasks_torun: - print("\n\n\n") + print("\n") name, prompts, answers = task.name, task.sub_tasks, task.answers print("Task: " + name) print("Prompts: ", end='') diff --git a/benchtools/task.py b/benchtools/task.py index 3bc83e1..254d600 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -162,7 +162,7 @@ def from_txt_csv(task_folder): processed_prompt = prompt.replace("{a}", str(csvFile.iloc[x,0])) processed_prompt = processed_prompt.replace("{b}", str(csvFile.iloc[x, 1])) storedTasks.append(processed_prompt) - print("Prompt: "+ processed_prompt) + # print("Prompt: "+ processed_prompt) # Debugging storedAnswers.append(str(csvFile.iloc[x, 2])) return (storedTasks, storedAnswers) @@ -195,7 +195,7 @@ def from_yaml(yaml_file): for values in value_combinations: value_mapping = dict(zip(keys, values)) # Pair keys with values filled_prompt = template.format(**value_mapping) # Format the template - print("Prompt: "+ filled_prompt) + # print("Prompt: "+ filled_prompt) # Debugging storedTasks.append(filled_prompt) # Store task for answer in answers: storedAnswers.append(answer) diff --git a/demobench/logs/.gitkeep b/demobench/logs/.gitkeep new file mode 100644 index 0000000..e69de29 From 134bef0bd90b8b1b2c5aa2f87e5ebf6f6e9d8abc Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 14 Feb 2026 17:25:13 +0000 Subject: [PATCH 57/78] Some documenting --- benchtools/runner.py | 26 ++++++++++++++++++++++++++ docs/source/cli.md | 1 + docs/source/pylib.md | 15 ++++----------- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 682938f..c1eaa64 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -7,6 +7,32 @@ class Bench(): ''' + Benchmark with multiple tasks + + ... + + Attributes + ---------- + bench_name : str + Name of the benchmark. + bench_path: str + Path to where the benchmark folder and all its content reside + task_folder: + Path to tasks folder insise benchmark folder + log folder: + Path to logs folder inside benchmark folder + tasks: tuple + A tas + is_built: bool + + Methods + ------- + build() + Build the benchmark directory. + add_task() + Add new tasks to the benchmark + run() + Run one task or all tasks of the benchmark. ''' def __init__(self, name, path): ''' diff --git a/docs/source/cli.md b/docs/source/cli.md index 23aa9e1..71f5589 100644 --- a/docs/source/cli.md +++ b/docs/source/cli.md @@ -21,6 +21,7 @@ Setting up FillIn...Success benchtool run testRuns/111 ``` + ```{eval-rst} .. click:: benchtools.cli:benchtool :prog: benchtool diff --git a/docs/source/pylib.md b/docs/source/pylib.md index 4e52ba2..61a1413 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -1,23 +1,16 @@ # BenchTools as a Python Library -## Design Benchmark Directory +## Creating a Benchmark object ```{eval-rst} -.. automodule:: benchtools.designer +.. automodule:: benchtools.runner :members: ``` -## Build Directory +## Benchmark class ```{eval-rst} -.. 
autoclass:: benchtools.designer.build_dir - :members: -``` - -## Build Directory - -```{eval-rst} -.. automodule:: benchtools.designer.build_dir +.. autoclass:: benchtools.runner.Benchmark :members: ``` \ No newline at end of file From 6b5a5eca05e3818062ad64e73d0a1b3dab1d7134 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 14 Feb 2026 17:26:02 +0000 Subject: [PATCH 58/78] BetterBench: Introducing betterbench interactive session --- benchtools/betterbench.py | 145 ++++++++++++++++++++++++++++++++++++++ benchtools/cli.py | 19 +++++ 2 files changed, 164 insertions(+) create mode 100644 benchtools/betterbench.py diff --git a/benchtools/betterbench.py b/benchtools/betterbench.py new file mode 100644 index 0000000..bcbb290 --- /dev/null +++ b/benchtools/betterbench.py @@ -0,0 +1,145 @@ +import os +import json +import click +import dataclasses +from dataclasses import dataclass + +class EnhancedJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o): + return dataclasses.asdict(o) + #if it is a function, use its string name + elif hasattr(o, '__call__'): + return o.__name__ + return super().default(o) + +@dataclass +class ChecklistQuestion: + response: str + justification: str + score: int + criteria: list[str] + skipped: bool + + +def calculate_score(response: str, justification: str) -> int: + if response == 'no': + return 0 + else: + TODO + + +Checklist = [ + # Design + "The tested capability, characteristic, or concept is defined", + "How tested capability or concept translates to benchmark task is described", + "How knowing about the tested concept is helpful in the real world is described", + "How benchmark score should or shouldn't be interpreted/used is described", + "Domain experts are involved", + "Use cases and/or user personas are described", # Has n/a + "Domain literature is integrated", + "Informed performance metric choice", + "Metric floors and ceilings are included" + "Human performance level is included", # Has n/a + "Random performance level is included", # Has n/a + "Automatic evaluation is possible and validated", + "Differences to related benchmarks are explained", + "Input sensitivity is addressed", + # Implementation + "The evaluation code is available", + "The evaluation data or generation mechanism is accessible", + "The evaluation of models via API is supported", + "The evaluation of local models is supported", + "A globally unique identifier is added or evaluation instances are encrypted", + "A task to identify if model is included trained on benchmark data", + "A script to replicate results is explicitly included", + "Statistical significance or uncertainty quantification of benchmark results is reported", + "Need for warnings for sensitive/harmful content is assessed", + "A build status (or equivalent) is implemented", + "Release requirements are specified", + # Documentation + "Requirements file or equivalent is available", + "Quick-start guide or demo is available", + "In-line code comments are used", + "Code documentation is available", + "Accompanying paper is accepted at peer-reviewed venue", + "Benchmark construction process is documented", + "Test tasks & rationale are documented", + "Assumptions of normative properties are documented", + "Limitations are documented", + "Data collection, test environment design, or prompt design process is documented", + "Evaluation metric is documented", + "Applicable license is specified", + # Maintenance + "Code usability was checked within the last year", + "Maintained feedback channel for 
users is available", + "Contact person is listed" +] + + + +def betterbench(checklist_path="/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/testRuns/111/betterbench.json") -> dict: + """ + The checklist below is based on the benchmark quality assessment proposed in BetterBench. It is supposed to help authors identify if they adhere to best practices in their benchmark development. If you want to have your benchmark added to the BetterBench Repository, please also fill out the justifications. These should be about one sentence long each, and include the page numbers of your paper or your webpage where the information can be found. You can also copy-paste quotes from any of your publicly available materials here as evidence. In this case, please also add a link to the source. + Reuel et. al. + + :param checklist_path: _description_, defaults to "/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/testRuns/1111/betterbench.json" + :type checklist_path: str, optional + :return: _description_ + :rtype: dict + """ + + checklist={} + if os.path.exists(checklist_path): + with open(checklist_path) as f: + checklist = json.load(f) + + if not checklist: + checklist = {} + for question in Checklist: + item = ChecklistQuestion( + skipped=True, + response="", + justification="", + score=0, + criteria=[] + ) + checklist[question] = item + + + click.echo("Entering interactive session for BetterBench!") + click.echo("This interactive session is meant to help you think about your benchmark in through the standards develope by reuel et. al. that are the BetterBench Checklist!") + click.echo("This interactive session is optional and you can always come back to it with the `betterbench resume` command") + + # TODO: check if want to change answer on any questions + + # Loop until user opts out + for question, vals in checklist.items(): + # print(question) + # print(vals) + choice = click.prompt(f"{question}?\nEnter to skip. q to end this session...", type=click.Choice(["yes", "no", "n/a", 'q', ''], case_sensitive=False), show_choices=True, default='') + + # Check for user opt out + if choice == 'q': + break + elif choice == '': + continue + else: + justification = click.prompt("Justification? ") + + score = calculate_score(choice, justification) + checklist[question]['response'] = choice + checklist[question]['justification'] = justification + checklist[question]['score'] = score + + + + + json.dump(checklist, open(checklist_path, "w"), indent=4, cls=EnhancedJSONEncoder) + + exit(0) + + +def get_score() -> int: + return 99 + \ No newline at end of file diff --git a/benchtools/cli.py b/benchtools/cli.py index 262439e..24142ef 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,6 +1,7 @@ import os import click from benchtools.runner import Bench +from benchtools.betterbench import betterbench, get_score # from task import PromptTask @click.group() @@ -129,6 +130,24 @@ def run_task(benchmark_path: str, task_name): click.echo(f"Running {task_name} now") benchmark.run([task_name]) +@benchtool.command() +@click.argument('benchmark-path', required = True, type=str) +def score(benchmark_path: str): + """ + Running the tasks and generating logs + + , help="The path to the benchmark repository where all the task reside." 
+ , help="The name of the specific task you would like to run" + """ + bench_path = os.path.abspath(benchmark_path) + if os.path.exists(bench_path): + bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path + benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) + click.echo(f"Scoring {benchmark.bench_name} now...") + score = get_score() + click.echo(f"Score: {score}") + + # For debugging if __name__ == '__main__': init() \ No newline at end of file From 9443fa4e47a61103db6567a88309bd6086662cc8 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 14 Feb 2026 13:26:29 -0500 Subject: [PATCH 59/78] Logger: Now task objects have their separate loggers. --- benchtools/logger.py | 22 +++++++++++++++------- benchtools/task.py | 6 +++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/benchtools/logger.py b/benchtools/logger.py index e8efdba..41cb479 100644 --- a/benchtools/logger.py +++ b/benchtools/logger.py @@ -1,8 +1,6 @@ import os import logging -log_folder: str - def init_logger(log_path, task_name): '''' Initializes the logger for a specific task. @@ -15,10 +13,20 @@ def init_logger(log_path, task_name): The name of the task for which the logger is being initialized. This will be used to name the log file. ''' log_file = os.path.join(log_path, f'{task_name}_log.txt') - logging.basicConfig(filename=log_file, - level=logging.INFO, format= '%(asctime)s - %(levelname)s - %(message)s') + # print(f"\nLOGPATH: {log_file}\n") # Debugging + + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + handler = logging.FileHandler(log_file) + handler.setFormatter(formatter) + + logger = logging.getLogger(task_name) + logger.setLevel(logging.INFO) # TODO add as an argument to the init functuion to use more options + logger.addHandler(handler) + + # print(logger) # Debugging + return logger -def log_agent_interaction(agent_input, agent_output): +def log_agent_interaction(logger, agent_input, agent_output): """ Logs the agent's input and output to a file. @@ -29,8 +37,8 @@ def log_agent_interaction(agent_input, agent_output): agent_output: string The output generated by the agent. """ - logging.info(f'Input: {agent_input}') - logging.info(f'Output: {agent_output}') + logger.info(f'Input: {agent_input}') + logger.info(f'Output: {agent_output}') diff --git a/benchtools/task.py b/benchtools/task.py index 254d600..9f52130 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -62,7 +62,7 @@ def __init__( self.runner_type = runner_type self.responses = [] - init_logger(self.log_path, self.name) + self.logger = init_logger(self.log_path, self.name) def run(self, model, api_url=None): @@ -123,8 +123,8 @@ def run(self, model, api_url=None): case _: print(f"Runner type {self.runner_type} not supported") return None - - log_agent_interaction(sub_task, response.message.content) + + log_agent_interaction(self.logger, sub_task, response.message.content) def score(self, response): From 470327103d876490415837f4fdc04da64aab59f5 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 12:49:02 -0500 Subject: [PATCH 60/78] fix typo --- docs/source/pylib.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/pylib.md b/docs/source/pylib.md index 61a1413..a2ae02b 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -11,6 +11,6 @@ ## Benchmark class ```{eval-rst} -.. autoclass:: benchtools.runner.Benchmark +.. 
autoclass:: benchtools.runner.Bench :members: ``` \ No newline at end of file From 7a006b2a2bffb0622934cd5eda6e69c639c37194 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 12:49:26 -0500 Subject: [PATCH 61/78] remove unnecessary --- benchtools/runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index c1eaa64..f1da32c 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -9,7 +9,6 @@ class Bench(): ''' Benchmark with multiple tasks - ... Attributes ---------- From 65c86f53e407be78353a3bf9ad8b8e949a65f40f Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 13:17:22 -0500 Subject: [PATCH 62/78] demo running form cwd, not up one, very conufsing --- docs/source/cli.md | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/docs/source/cli.md b/docs/source/cli.md index 71f5589..d43609f 100644 --- a/docs/source/cli.md +++ b/docs/source/cli.md @@ -1,21 +1,15 @@ # CLI +```bash +benchtool init new_test -p . -t add ../datasets/add/ -t Gaps ../datasets/miscops/ -a "this is a demo for benchtools" ``` -benchtool init new_test -p ../ -t add ../datasets/add/ -t Gaps ../datasets/miscops/ -a "this is a demo for benchtools" -``` -``` -Creating new_test in ../ -Setting up add...Success -Setting up Gaps...Success -Would you like to run the benchmark? y/n? n -``` -``` + +```bash benchtool add-task ../new_test/ FillIn ../datasets/miscops/ ``` -``` -Setting up FillIn...Success -``` + + ``` benchtool run testRuns/111 From 2249d30c991b4511eeb5995f1ae0bea6db01d4c5 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 13:19:41 -0500 Subject: [PATCH 63/78] start as notebook to have a demo --- docs/source/pylib.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/source/pylib.md b/docs/source/pylib.md index a2ae02b..e0241c1 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -1,5 +1,25 @@ +--- +jupytext: + formats: ipynb,md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.15.1 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + + # BenchTools as a Python Library +```{code-cell} +import benchtools +``` + + ## Creating a Benchmark object From f010754d2e47293934988ace735871609376c665 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 13:20:10 -0500 Subject: [PATCH 64/78] doc tone fixes --- benchtools/cli.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 24142ef..074edec 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -22,13 +22,9 @@ def init(benchmark_name, path, about, no_git, tasks): """ Initializes a new benchmark. - Even though the command doesn't have any required arguments. If the - argument wasn't passed the interface will ask for a name and wouldn't continue without one. + Benchmark-name is required, if not provided, requested interactively. - This command is the starting point. With this, the process of creating a benchmark - structure and guiding the user into the correct mindset of a benchmark. - - After running this command, the folder structure of the benchmark will be created. + this command creates the folder for the benchmark. Task files will be loaded, the user will be asked a series of questions to demonstrate the correct mindset of benchmarking, and finally, the user will be given the choice to run the benchmark or not. 
@@ -37,7 +33,7 @@ def init(benchmark_name, path, about, no_git, tasks): """ if not benchmark_name: - benchmark_name = click.prompt("Enter the name of your benchmark/project (will be used as folder and repo name)", type=str) + benchmark_name = click.prompt("Enter the name of your benchmark (will be used as folder and repo name)", type=str) benchmark_name = benchmark_name.strip().replace(" ", "_").lower() # TODO: Handle existing benchmark From d5dda912729789c58431b3426ea4d62f2991e507 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 13:21:38 -0500 Subject: [PATCH 65/78] "correct mindset" is dimunitiive --- benchtools/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchtools/cli.py b/benchtools/cli.py index 074edec..9791640 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -25,10 +25,6 @@ def init(benchmark_name, path, about, no_git, tasks): Benchmark-name is required, if not provided, requested interactively. this command creates the folder for the benchmark. - Task files will be loaded, the user will be asked a series of questions to demonstrate - the correct mindset of benchmarking, and finally, the user will be given the choice to - run the benchmark or not. - """ From 603b6658aefe8fc8c23f947e6da74bcc50c28ca1 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 14:23:39 -0500 Subject: [PATCH 66/78] start refactor --- benchtools/__init__.py | 1 + benchtools/cli.py | 6 +- benchtools/designer.py | 314 +++++++++++++++++++++++++++++------------ benchtools/runner.py | 85 +---------- docs/source/pylib.md | 4 +- 5 files changed, 230 insertions(+), 180 deletions(-) diff --git a/benchtools/__init__.py b/benchtools/__init__.py index e69de29..c3152e0 100644 --- a/benchtools/__init__.py +++ b/benchtools/__init__.py @@ -0,0 +1 @@ +from .designer import Bench \ No newline at end of file diff --git a/benchtools/cli.py b/benchtools/cli.py index 9791640..62fd420 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,6 +1,6 @@ import os import click -from benchtools.runner import Bench +from benchtools.designer import Bench from benchtools.betterbench import betterbench, get_score # from task import PromptTask @@ -54,8 +54,8 @@ def init(benchmark_name, path, about, no_git, tasks): benchmark = Bench(benchmark_name, bench_path) # Build the benchmark folder - if benchmark.build(about, no_git, tasks): - click.echo(f"Built {benchmark_name} benchmark successfully!") + if benchmark.write(about, no_git, tasks): + click.echo(f"Created {benchmark_name} benchmark successfully!") # TODO: Call betterbench CLI here # betterbench() diff --git a/benchtools/designer.py b/benchtools/designer.py index 87c1989..897d802 100644 --- a/benchtools/designer.py +++ b/benchtools/designer.py @@ -3,109 +3,239 @@ import os import shutil import requests +import yaml from datasets import load_dataset # from pathlib import Path # ??? -def build_dir(bench_path): - ''' - Create benchmrk skeleton +about_template = """# {bench_name} - Parameters: - ----------- - bench_path: str - The path to the benchmark folder. This folder will be created if it does not exist. 
+{text} - Returns: - -------- - - ''' - - os.mkdir(bench_path) - # Create a benchmarks folder with tasks in them - tasks_path = os.path.join(bench_path, "tasks") - os.mkdir(tasks_path) - log_path = os.path.join(bench_path, "logs") - os.mkdir(log_path) - -### Generate an about path from the description of the user -def create_about(bench_name, bench_path, text): - about_path = os.path.join(bench_path, "about.md") - about_text= f""" - # {bench_name} - {text} - - Generated by BenchTools - """ - with open(about_path, 'w') as file: - file.write(about_text) - -### Initialize git repository -def init_repo(bench_path): +Generated by BenchTools +""" + +class Bench(): ''' - Initialize the benchmark folder as git repo with gitiginore for python + Benchmark with multiple tasks + - Parameters: - ----------- + Attributes + ---------- + bench_name : str + Name of the benchmark. bench_path: str - The path to the benchmark folder + Path to where the benchmark folder and all its content reside + task_folder: + Path to tasks folder insise benchmark folder + log folder: + Path to logs folder inside benchmark folder + tasks: tuple + A tas + is_built: bool + + Methods + ------- + build() + Build the benchmark directory. + add_task() + Add new tasks to the benchmark + run() + Run one task or all tasks of the benchmark. ''' - current_dir = os.getcwd() - os.chdir(bench_path) - try: - os.system("git init . -q") - os.system("git branch -m main") - except: - print("git might not be initialized in your system. Please run \"git init . \" when setup") - # Get python gitignore template and create .gitignore - ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore") - if ignore_text.status_code == 200: - with open(".gitignore", 'a') as f: - f.write(ignore_text.text) - os.chdir(current_dir) - - -# Create a benchmarks folder with tasks in them -def setup_task(tasks_path: str, task_name: str, task_path: str): - - print(f"Setting up {task_name}...", end='') - task_folder = os.path.join(tasks_path, task_name) - os.mkdir(task_folder) # TODO: check if folder exists and handle - - if task_path.startswith('openai'): - download_dataset(task_folder, task_path) - print("Success") - return + def __init__(self, name, path = '.',concept = None): + ''' + Initialize the benchmark object with the name and path to the benchmark folder. + + Parameters: + ----------- + name: str + name of the benchmark will be used for folder + path: str or buffer + path where the benchmark will be stored + + ''' + # load tasks from file structre and instantiate task objects for each, store those in a list. + # loading will + self.display_name = name.strip() + self.bench_concept = concept if concept else f'a benchmark about {name.strip()}' + self.bench_name = name.strip().replace(" ", "_").lower() + self.base_path = path + self.bench_path = os.path.join(path, self.bench_name) + self.tasks_folder = os.path.join(self.bench_path, 'tasks') + self.tasks = [] + self.written = os.path.exists(self.bench_path) + + @classmethod + def load(bench_path): + ''' + Load a benchmark from a given path. The path should point to the benchmark folder. + + Parameters: + ----------- + bench_path: str + The path to the benchmark folder. The folder should contain the about.md file, tasks folder and logs folder. + + Returns: + -------- + Bench + An instance of the Bench class with the loaded benchmark. 
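+
+        Note: loading is only partially implemented; the current body only
+        checks that bench_path exists and raises ValueError otherwise.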
+ ''' + if not os.path.exists(bench_path): + raise ValueError("The passed path doesn't exist.") + + + + def initialize_dir(self, about_body=None, no_git=False): + ''' + write out the benchmark folder + + Parameters: + ----------- + about_text: str + description of the benchmark to be included in the about.md file + no_git: bool + whether to initialize a git repository in the benchmark folder + new_tasks: list of tuples (task_name, task_source) + list of tasks to be added to the benchmark. Each task is represented as a tuple containing + + Returns: + -------- + self.built : bool + True if the benchmark was successfully built, False otherwise + ''' + + # Create benchmark skeleton + os.mkdir(self.bench_path) + # Create a benchmarks folder with tasks in them + tasks_path = os.path.join(self.bench_path, "tasks") + os.mkdir(tasks_path) + log_path = os.path.join(self.bench_path, "logs") + os.mkdir(log_path) + + # Create about.md + about_path = os.path.join(self.bench_path, "about.md") + if not about_body: + about_body = "a {self.bench_name}." + about_text= about_template.format({'bench_name':self.bench_name, + 'text':about_body}) + with open(about_path, 'w') as file: + file.write(about_text) + + # Initialize a git repo + if not no_git: + self.init_repo(self.bench_path) + + for task_name, task_source in new_tasks: + self.add_task(task_name, task_source) + + self.write() + self.written = True + return self.written + + + def write(self): + info = {'bench_name': self.bench_name, + 'bench_concept': self.bench_concept, + 'bench_path': self.bench_path, + 'tasks': self.tasks} + with open(os.path.join(self.bench_path, 'info.yml'), 'w') as f: + yaml.dump(info, f) + + + ### Initialize git repository + def init_repo(bench_path): + ''' + Initialize the benchmark folder as git repo with gitiginore for python + + Parameters: + ----------- + bench_path: str + The path to the benchmark folder + ''' + current_dir = os.getcwd() + os.chdir(bench_path) + try: + os.system("git init . -q") + os.system("git branch -m main") + except: + print("git might not be initialized in your system. Please run \"git init . 
\" when setup") + # Get python gitignore template and create .gitignore + ignore_text = requests.get("https://raw.githubusercontent.com/github/gitignore/refs/heads/main/Python.gitignore") + if ignore_text.status_code == 200: + with open(".gitignore", 'a') as f: + f.write(ignore_text.text) + os.chdir(current_dir) + + + def add_task(self, task_name, task_source): + + self.tasks.append(task_name) + # setup_task(self.tasks_folder, task_name, task_source)) - # Path could be absolute or relative, check and work accordingly - if not task_path.startswith('/'): - if task_path.startswith('./'): - # TODO: Path could have one or more `../` use relpath to fix this block - task_path = task_path[2:] - task_path = os.path.join(os.getcwd(), task_path) - # print(f" path {task_path}\n\n") # Debugging - - # could be a single file or a folder check and work accordignly - if os.path.isdir(task_path): - for sub in os.listdir(task_path): - shutil.copy2(os.path.join(task_path, sub), task_folder) - else: - shutil.copy2(task_path, task_folder) - print("Success") - -def download_dataset(task_folder: str, hf_path: str): - with open(os.path.join(task_folder, 'task.txt'), 'w') as f: - f.write('{p}') - - dataset = load_dataset(hf_path) - dataset_test = dataset['test'] - - with open(os.path.join(task_folder, 'values.csv'), 'w') as f: - f.write('p,res') - for row in dataset_test: - prompt = row['prompt'] - answer = row['canonical_solution'] - f.write(f"{prompt,answer}") + # Create a benchmarks folder with tasks in them + def initialize_task_dir(tasks_path, task_name: str, task_source=None, + is_huggingface=False): + ''' + Initialize a new task folder in the benchmark repo + + Parameters: + ----------- + tasks_path: str + The path to the tasks folder inside the benchmark folder + task_name: str + The name of the task to be added. This will be used for the task folder name + task_source: str or buffer + The source of the task data. This can be a path to a local file or folder, + or a Hugging Face dataset identifier. + The content + is_huggingface: bool + Whether the task source is a Hugging Face dataset. If True, the task_source + should be like ownser/dataset_name + ''' + + print(f"Setting up {task_name}...", end='') + task_folder = os.path.join(tasks_path, task_name) + os.mkdir(task_folder) # TODO: check if folder exists and handle + + if is_huggingface: + download_dataset(task_folder, task_source) + print("Success") + return + + + # Path could be absolute or relative, check and work accordingly + # if not task_source.startswith('/'): + # if task_source.startswith('./'): + # # TODO: Path could have one or more `../` use relpath to fix this block + # task_source = task_source[2:] + # task_source = os.path.join(os.getcwd(), task_source) + # print(f" path {task_source}\n\n") # Debugging + + # could be a single file or a folder check and work accordignly + if os.path.isdir(task_source): + for sub in os.listdir(task_source): + shutil.copy2(os.path.join(task_source, sub), task_folder) + else: + shutil.copy2(task_source, task_folder) + print("Success") + + def download_dataset(task_folder: str, hf_path: str): + ''' + dataset must have columns 'prompt' and 'canonical_solution' for now, can be expanded in the future. 
+ ''' + with open(os.path.join(task_folder, 'task.txt'), 'w') as f: + f.write('{p}') + + dataset = load_dataset(hf_path) + dataset_test = dataset['test'] + + with open(os.path.join(task_folder, 'values.csv'), 'w') as f: + f.write('p,res') + for row in dataset_test: + prompt = row['prompt'] + answer = row['canonical_solution'] + f.write(f"{prompt,answer}") diff --git a/benchtools/runner.py b/benchtools/runner.py index f1da32c..6913ef3 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -5,95 +5,12 @@ # from log_file.py import log_agent_interaction -class Bench(): +class BenchRunner(): ''' - Benchmark with multiple tasks - - - Attributes - ---------- - bench_name : str - Name of the benchmark. - bench_path: str - Path to where the benchmark folder and all its content reside - task_folder: - Path to tasks folder insise benchmark folder - log folder: - Path to logs folder inside benchmark folder - tasks: tuple - A tas - is_built: bool - - Methods - ------- - build() - Build the benchmark directory. - add_task() - Add new tasks to the benchmark - run() - Run one task or all tasks of the benchmark. ''' - def __init__(self, name, path): - ''' - Initialize the benchmark object with the name and path to the benchmark folder. - Parameters: - ----------- - name: str - name of the benchmark will be used for folder - path: str or buffer - path to the benchmark folder. If the folder does not exist, it will be created - ''' - # load tasks from file strucutre and instantiate task objects for each, store those in a list. - # loading will - self.bench_name = name.strip().replace(" ", "_").lower() - self.bench_path = path - self.tasks_folder = os.path.join(self.bench_path, 'tasks') - self.log_folder = os.path.join(self.bench_path, 'logs') - self.tasks = [] - self.built = os.path.exists(self.bench_path) - def build(self, about_text, no_git, new_tasks) -> bool: - ''' - - Parameters: - ----------- - about_text: str - description of the benchmark to be included in the about.md file - no_git: bool - whether to initialize a git repository in the benchmark folder - new_tasks: list of tuples (task_name, task_path) - list of tasks to be added to the benchmark. 
Each task is represented as a tuple containing - - Returns: - -------- - self.built : bool - True if the benchmark was successfully built, False otherwise - ''' - - # Create benchmark skeleton - build_dir(self.bench_path) - - # Create about.md - create_about(self.bench_name, self.bench_path, about_text) - - # Initialize a git repo - if not no_git: - init_repo(self.bench_path) - - for task_name, task_path in new_tasks: - self.add_task(task_name, task_path) - - self.built = True - return self.built - - - def add_task(self, task_name, task_path): - - if self.built: - self.tasks.append(setup_task(self.tasks_folder, task_name, task_path)) - def run(self, tasks_torun=[], model='gemma3', api_url=None): ''' diff --git a/docs/source/pylib.md b/docs/source/pylib.md index e0241c1..b107fdd 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -16,7 +16,9 @@ kernelspec: # BenchTools as a Python Library ```{code-cell} -import benchtools +from benchtools import Bench + +Bench('Tiniest Demo', concept ='the simplest bench') ``` From b8d7f622e6f4da266c74a43f7cd270db9bb7778a Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 14:56:37 -0500 Subject: [PATCH 67/78] rename, start class methods, and move runner all to run method to make the task objects more usable in design --- benchtools/task.py | 174 ++++++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/benchtools/task.py b/benchtools/task.py index 9f52130..ab0aaed 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -16,56 +16,110 @@ class Task: """ def __init__( - self, data_type, task_name, path, log_path, scoring_function=None, reference=None, runner_type="ollama" + self, task_name, prompt, reference, scoring_function=None, prompt_variants = None, ): """ - init a task object + init a task object from a prompt and reference, and a scoring function. If no scoring function is provided, defaults to exact match. Parameters ---------- dir : string or path directory containing the task assets - + prompt: string + prompt for task or overall description scoring_function : function handle or string if string, must be name of built in eval function provided here - reference: string or number + reference: string or number or list of solution that will be passed with the model answer to the scoring function - runner_type: string {ollama} - the way the runner should be called, - solution that will be passed with the model answer - runner_type: string {ollama,openai} - define which runner should be used for the task. 
- to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434 - to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable """ self.name = task_name - self.log_path = log_path - self.sub_tasks = [] - self.answers = [] - match data_type: - case 'csv': - prompts, answers = from_txt_csv(path) - self.sub_tasks += prompts - self.answers += answers - case 'yml': - prompts, answers = from_yaml(path) - self.sub_tasks += prompts - self.answers += answers + if prompt_variants: + self.sub_tasks = prompt_variants + self.description = prompt + else: + self.sub_tasks = [prompt] + self.description = f"a basic prompt task with: {prompt}" + + + self.reference = reference if type(scoring_function) is str: self.scoring_function = scoring_function[scoring_function] else: self.scoring_function = scoring_function - self.reference = reference - self.runner_type = runner_type - self.responses = [] + + # self.responses = [] - self.logger = init_logger(self.log_path, self.name) + # self.logger = init_logger(self.log_path, self.name) + - def run(self, model, api_url=None): + @classmethod + def from_txt_csv(cls, task_name, task_folder): + ''' + load a template from txt and create task objects for each row of a csv + + folder must contain a task.txt file with the template, and a values.csv file with the values to fill in the template, and the reference answers. The csv should be structured as follows: + ''' + # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor + prompt = "" + with open(os.path.join(task_folder, "task.txt"), "r") as f: + prompt = f.read() + + value_answer_df = pandas.read_csv(os.path.join(task_folder, "values.csv")) + # answers = pandas.read_csv(os.path.join(task_folder, "results")) + storedTasks = [] + storedAnswers = [] + for x in range(len(value_answer_df)): + processed_prompt = prompt.replace("{a}", str(value_answer_df.iloc[x,0])) + processed_prompt = processed_prompt.replace("{b}", str(value_answer_df.iloc[x, 1])) + storedTasks.append(processed_prompt) + # print("Prompt: "+ processed_prompt) # Debugging + storedAnswers.append(str(value_answer_df.iloc[x, 2])) + + description = f"a template based task with template: {prompt} and values like:\n\n {value_answer_df.head().to_markdown()}" + + return cls(task_name, prompt_variants = storedTasks, reference=storedAnswers) + + + + @classmethod + def from_yaml(yaml_file): + """ + Load tasks from a YAML file and generate PromptTask objects. + Parameters + ---------- + yaml_file : str + Path to the YAML file containing task templates and values. + Returns + ------- + self : Bench + The Bench instance with tasks populated. 
+ """ + with open(yaml_file, 'r') as file: + data = yaml.safe_load(file) + storedTasks = [] + storedAnswers = [] + for sub_task in data: + template = sub_task["template"] # Extract template + values_dict = sub_task["values"] # Extract values dictionary + answers = sub_task["result"] + # Generate all possible value combinations using itertools.product + keys = values_dict.keys() + value_combinations = zip(*values_dict.values()) + # Create a PromptTask for each combination + for values in value_combinations: + value_mapping = dict(zip(keys, values)) # Pair keys with values + filled_prompt = template.format(**value_mapping) # Format the template + # print("Prompt: "+ filled_prompt) # Debugging + storedTasks.append(filled_prompt) # Store task + for answer in answers: + storedAnswers.append(answer) + return (storedTasks, storedAnswers) + + def run(self, model,runner_type="ollama", api_url=None): """ run the task on the model @@ -75,12 +129,16 @@ def run(self, model, api_url=None): the model to run the task on api_url : string the url of the api to use for the task + runner_type: string {ollama,openai} + define which runner should be used for the task. + to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434 + to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable """ for sub_task in self.sub_tasks: # print(sub_task) - match self.runner_type: + match runner_type: case "ollama": response: ChatResponse = chat(model=model, messages=[ { @@ -142,61 +200,3 @@ def score(self, response): # additional classes for other types of tasks # likely an agent task that can pass environment assets - - -# possibly private method? # TODO: Fix csv indexing? -def from_txt_csv(task_folder): - ''' - load a template from txt and create task objects for each row of a csv - ''' - # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor - prompt = "" - with open(os.path.join(task_folder, "task.txt"), "r") as f: - prompt = f.read() - - csvFile = pandas.read_csv(os.path.join(task_folder, "values.csv")) - # answers = pandas.read_csv(os.path.join(task_folder, "results")) - storedTasks = [] - storedAnswers = [] - for x in range(len(csvFile)): - processed_prompt = prompt.replace("{a}", str(csvFile.iloc[x,0])) - processed_prompt = processed_prompt.replace("{b}", str(csvFile.iloc[x, 1])) - storedTasks.append(processed_prompt) - # print("Prompt: "+ processed_prompt) # Debugging - storedAnswers.append(str(csvFile.iloc[x, 2])) - - return (storedTasks, storedAnswers) - - -def from_yaml(yaml_file): - """ - Load tasks from a YAML file and generate PromptTask objects. - Parameters - ---------- - yaml_file : str - Path to the YAML file containing task templates and values. - Returns - ------- - self : Bench - The Bench instance with tasks populated. 
- """ - with open(yaml_file, 'r') as file: - data = yaml.safe_load(file) - storedTasks = [] - storedAnswers = [] - for sub_task in data: - template = sub_task["template"] # Extract template - values_dict = sub_task["values"] # Extract values dictionary - answers = sub_task["result"] - # Generate all possible value combinations using itertools.product - keys = values_dict.keys() - value_combinations = zip(*values_dict.values()) - # Create a PromptTask for each combination - for values in value_combinations: - value_mapping = dict(zip(keys, values)) # Pair keys with values - filled_prompt = template.format(**value_mapping) # Format the template - # print("Prompt: "+ filled_prompt) # Debugging - storedTasks.append(filled_prompt) # Store task - for answer in answers: - storedAnswers.append(answer) - return (storedTasks, storedAnswers) \ No newline at end of file From d0910ea32473f41f6fad789a8f12bdf828ea2a15 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 16:22:08 -0500 Subject: [PATCH 68/78] continue refactor and start test case --- benchtools/__init__.py | 3 +- benchtools/{designer.py => benchmark.py} | 109 +++++++-------------- benchtools/cli.py | 1 - benchtools/runner.py | 4 +- benchtools/scorerers.py | 8 -- benchtools/scorers.py | 19 ++++ benchtools/task.py | 119 +++++++++++++++++++---- docs/source/pylib.md | 29 +++++- setup.py | 2 +- 9 files changed, 184 insertions(+), 110 deletions(-) rename benchtools/{designer.py => benchmark.py} (62%) delete mode 100644 benchtools/scorerers.py create mode 100644 benchtools/scorers.py diff --git a/benchtools/__init__.py b/benchtools/__init__.py index c3152e0..24ee3b5 100644 --- a/benchtools/__init__.py +++ b/benchtools/__init__.py @@ -1 +1,2 @@ -from .designer import Bench \ No newline at end of file +from .benchmark import Bench +from .task import Task \ No newline at end of file diff --git a/benchtools/designer.py b/benchtools/benchmark.py similarity index 62% rename from benchtools/designer.py rename to benchtools/benchmark.py index 897d802..edf0679 100644 --- a/benchtools/designer.py +++ b/benchtools/benchmark.py @@ -4,7 +4,6 @@ import shutil import requests import yaml -from datasets import load_dataset # from pathlib import Path # ??? @@ -30,20 +29,19 @@ class Bench(): Path to tasks folder insise benchmark folder log folder: Path to logs folder inside benchmark folder - tasks: tuple - A tas + tasks: list of Task objects + is_built: bool Methods ------- - build() Build the benchmark directory. add_task() Add new tasks to the benchmark run() Run one task or all tasks of the benchmark. ''' - def __init__(self, name, path = '.',concept = None): + def __init__(self, name, path = '.',concept = None, tasks = []): ''' Initialize the benchmark object with the name and path to the benchmark folder. @@ -54,16 +52,19 @@ def __init__(self, name, path = '.',concept = None): path: str or buffer path where the benchmark will be stored + tasks: list of Task objects + list of tasks to be included in the benchmark. Each task should be an instance of the + ''' # load tasks from file structre and instantiate task objects for each, store those in a list. 
# loading will self.display_name = name.strip() - self.bench_concept = concept if concept else f'a benchmark about {name.strip()}' + self.concept = concept if concept else f'a benchmark about {name.strip()}' self.bench_name = name.strip().replace(" ", "_").lower() self.base_path = path self.bench_path = os.path.join(path, self.bench_name) self.tasks_folder = os.path.join(self.bench_path, 'tasks') - self.tasks = [] + self.tasks = tasks # initialize a task object for each task. self.written = os.path.exists(self.bench_path) @classmethod @@ -84,12 +85,25 @@ def load(bench_path): if not os.path.exists(bench_path): raise ValueError("The passed path doesn't exist.") + task_folder = os.path.join(bench_path, 'tasks') + + with open(os.path.join(bench_path, 'info.yml'), 'r') as f: + info = yaml.safe_load(f) + + task_list = os.listdir(task_folder) + tasks = [] + for task in task_list: + # load the tasks + task_path = os.path.join(task_folder, task) + content = os.listdir(task_path) + + bench = cls(info['bench_name'], bench_path, info['concept']) + return bench - - def initialize_dir(self, about_body=None, no_git=False): + def initialize_dir(self, no_git=False): ''' - write out the benchmark folder + write out the benchmark folder initially Parameters: ----------- @@ -116,8 +130,8 @@ def initialize_dir(self, about_body=None, no_git=False): # Create about.md about_path = os.path.join(self.bench_path, "about.md") - if not about_body: - about_body = "a {self.bench_name}." + if self: + about_body = "*{self.concept}*" about_text= about_template.format({'bench_name':self.bench_name, 'text':about_body}) with open(about_path, 'w') as file: @@ -137,11 +151,18 @@ def initialize_dir(self, about_body=None, no_git=False): def write(self): info = {'bench_name': self.bench_name, - 'bench_concept': self.bench_concept, + 'concept': self.concept, 'bench_path': self.bench_path, 'tasks': self.tasks} with open(os.path.join(self.bench_path, 'info.yml'), 'w') as f: yaml.dump(info, f) + + # likely also writ the tasks and the about, if need to be updated + + + def write_tasks(self): + for task in self.tasks: + task.write(self.tasks_folder) ### Initialize git repository @@ -174,68 +195,6 @@ def add_task(self, task_name, task_source): self.tasks.append(task_name) # setup_task(self.tasks_folder, task_name, task_source)) - # Create a benchmarks folder with tasks in them - def initialize_task_dir(tasks_path, task_name: str, task_source=None, - is_huggingface=False): - ''' - Initialize a new task folder in the benchmark repo - - Parameters: - ----------- - tasks_path: str - The path to the tasks folder inside the benchmark folder - task_name: str - The name of the task to be added. This will be used for the task folder name - task_source: str or buffer - The source of the task data. This can be a path to a local file or folder, - or a Hugging Face dataset identifier. - The content - is_huggingface: bool - Whether the task source is a Hugging Face dataset. 
If True, the task_source - should be like ownser/dataset_name - ''' - - print(f"Setting up {task_name}...", end='') - task_folder = os.path.join(tasks_path, task_name) - os.mkdir(task_folder) # TODO: check if folder exists and handle - - if is_huggingface: - download_dataset(task_folder, task_source) - print("Success") - return - - # Path could be absolute or relative, check and work accordingly - # if not task_source.startswith('/'): - # if task_source.startswith('./'): - # # TODO: Path could have one or more `../` use relpath to fix this block - # task_source = task_source[2:] - # task_source = os.path.join(os.getcwd(), task_source) - # print(f" path {task_source}\n\n") # Debugging - - # could be a single file or a folder check and work accordignly - if os.path.isdir(task_source): - for sub in os.listdir(task_source): - shutil.copy2(os.path.join(task_source, sub), task_folder) - else: - shutil.copy2(task_source, task_folder) - print("Success") - - def download_dataset(task_folder: str, hf_path: str): - ''' - dataset must have columns 'prompt' and 'canonical_solution' for now, can be expanded in the future. - ''' - with open(os.path.join(task_folder, 'task.txt'), 'w') as f: - f.write('{p}') - - dataset = load_dataset(hf_path) - dataset_test = dataset['test'] - - with open(os.path.join(task_folder, 'values.csv'), 'w') as f: - f.write('p,res') - for row in dataset_test: - prompt = row['prompt'] - answer = row['canonical_solution'] - f.write(f"{prompt,answer}") diff --git a/benchtools/cli.py b/benchtools/cli.py index 62fd420..43bea14 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -65,7 +65,6 @@ def init(benchmark_name, path, about, no_git, tasks): if to_run: benchmark.run() -## TODO: Is it computationally better to use pickle to save the object in the benchmark folder?? @benchtool.command() diff --git a/benchtools/runner.py b/benchtools/runner.py index 6913ef3..9f5bd33 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -7,11 +7,9 @@ class BenchRunner(): ''' + TODO: this might not be how this ends up, but thi si scurrent thign ''' - - - def run(self, tasks_torun=[], model='gemma3', api_url=None): ''' Run the benchmark by running each task in the benchmark and logging the interactions. 
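
# Editor's sketch (illustrative only, not part of this patch): one way a batch runner
# like BenchRunner.run could drive the Task objects from benchtools/task.py, assuming
# each task exposes run(model, runner_type, api_url) returning the model's response
# text and score(response) returning a number, as sketched in patch 67.
def _sketch_batch_run(tasks, model='gemma3', api_url=None):
    results = {}
    for task in tasks:
        response = task.run(model, runner_type='ollama', api_url=api_url)
        results[task.name] = task.score(response)
    return results
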
diff --git a/benchtools/scorerers.py b/benchtools/scorerers.py deleted file mode 100644 index 8bb43a7..0000000 --- a/benchtools/scorerers.py +++ /dev/null @@ -1,8 +0,0 @@ -# built in default scoring functions - - -def exact_match(response, reference): - ''' - ''' - return response == reference - diff --git a/benchtools/scorers.py b/benchtools/scorers.py new file mode 100644 index 0000000..e2fc42e --- /dev/null +++ b/benchtools/scorers.py @@ -0,0 +1,19 @@ +# built in default scoring functions + +def exact_match(response, reference): + ''' + score 1 if the response exactly matches the reference, 0 otherwise + ''' + return int(response == reference) + +def contains(response, reference): + ''' + score 1 if the reference is contained in the response, 0 otherwise + ''' + if isinstance(reference, list): + return int(any(ref in response for ref in reference)) + else: + return int(reference in response) + +scoring_fx_list = {"exact_match": exact_match, + "contains":contains} diff --git a/benchtools/task.py b/benchtools/task.py index ab0aaed..9940fb7 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -6,8 +6,9 @@ from ollama import chat, ChatResponse, Client from benchtools.logger import init_logger, log_agent_interaction -# from scorerers import exact_match -# scoring_fx = {"exact_match": exact_match} +from datasets import load_dataset + +from scorers import scoring_fx_list, contains, exact_match class Task: @@ -15,9 +16,8 @@ class Task: defines a basic prompt task with a simple scoring function """ - def __init__( - self, task_name, prompt, reference, scoring_function=None, prompt_variants = None, - ): + def __init__(self, task_name, prompt, reference=None, scoring_function=None, + prompt_variants = None, storage_type = 'yaml' ): """ init a task object from a prompt and reference, and a scoring function. If no scoring function is provided, defaults to exact match. @@ -37,24 +37,22 @@ def __init__( if prompt_variants: self.sub_tasks = prompt_variants self.description = prompt + self.reference = reference else: self.sub_tasks = [prompt] + self.reference = [reference] self.description = f"a basic prompt task with: {prompt}" - - self.reference = reference - if type(scoring_function) is str: - self.scoring_function = scoring_function[scoring_function] + self.storage_type = storage_type + if scoring_function: + if isinstance(scoring_function, str): + self.scoring_function = scoring_fx_list.get(scoring_function, exact_match) + if isinstance(scoring_function, callable): + self.scoring_function = scoring_function else: - self.scoring_function = scoring_function - - - # self.responses = [] + self.scoring_function = exact_match - # self.logger = init_logger(self.log_path, self.name) - - @classmethod def from_txt_csv(cls, task_name, task_folder): @@ -81,12 +79,12 @@ def from_txt_csv(cls, task_name, task_folder): description = f"a template based task with template: {prompt} and values like:\n\n {value_answer_df.head().to_markdown()}" - return cls(task_name, prompt_variants = storedTasks, reference=storedAnswers) + return cls(task_name, prommpt =description, prompt_variants = storedTasks, + reference=storedAnswers, storage_type ='csv') - @classmethod - def from_yaml(yaml_file): + def from_yaml(cls, task_name, yaml_file): """ Load tasks from a YAML file and generate PromptTask objects. 
Parameters @@ -109,6 +107,7 @@ def from_yaml(yaml_file): # Generate all possible value combinations using itertools.product keys = values_dict.keys() value_combinations = zip(*values_dict.values()) + # T # Create a PromptTask for each combination for values in value_combinations: value_mapping = dict(zip(keys, values)) # Pair keys with values @@ -117,8 +116,88 @@ def from_yaml(yaml_file): storedTasks.append(filled_prompt) # Store task for answer in answers: storedAnswers.append(answer) - return (storedTasks, storedAnswers) + + description = f"a template based task with template:" + return cls(task_name,description , prompt_variants = storedTasks, reference=storedAnswers, + storage_type ='yaml') + + @staticmethod + def from_hf_dataset(task_folder: str, hf_path: str): + ''' + dataset must have columns 'prompt' and 'canonical_solution' for now, can be expanded in the future. + ''' + with open(os.path.join(task_folder, 'task.txt'), 'w') as f: + f.write('{p}') + + dataset = load_dataset(hf_path) + dataset_test = dataset['test'] + + with open(os.path.join(task_folder, 'values.csv'), 'w') as f: + f.write('p,res') + for row in dataset_test: + prompt = row['prompt'] + answer = row['canonical_solution'] + f.write(f"{prompt,answer}") + + def write(self, target_path): + ''' + write the task + ''' + # choose the writer and call it + + # Create a benchmarks folder with tasks in them + def initialize_task_dir(tasks_path, task_name: str, task_source=None, + is_huggingface=False): + ''' + Initialize a new task folder in the benchmark repo + + *probably to be deprecated* + + Parameters: + ----------- + tasks_path: str + The path to the tasks folder inside the benchmark folder + task_name: str + The name of the task to be added. This will be used for the task folder name + task_source: str or buffer + The source of the task data. This can be a path to a local file or folder, + or a Hugging Face dataset identifier. + The content + is_huggingface: bool + Whether the task source is a Hugging Face dataset. 
If True, the task_source + should be like ownser/dataset_name + ''' + + print(f"Setting up {task_name}...", end='') + task_folder = os.path.join(tasks_path, task_name) + os.mkdir(task_folder) # TODO: check if folder exists and handle + + if is_huggingface: + download_dataset(task_folder, task_source) + print("Success") + return + + + # Path could be absolute or relative, check and work accordingly + # if not task_source.startswith('/'): + # if task_source.startswith('./'): + # # TODO: Path could have one or more `../` use relpath to fix this block + # task_source = task_source[2:] + # task_source = os.path.join(os.getcwd(), task_source) + # print(f" path {task_source}\n\n") # Debugging + + # could be a single file or a folder check and work accordignly + if os.path.isdir(task_source): + for sub in os.listdir(task_source): + shutil.copy2(os.path.join(task_source, sub), task_folder) + else: + shutil.copy2(task_source, task_folder) + print("Success") + + + + def run(self, model,runner_type="ollama", api_url=None): """ run the task on the model diff --git a/docs/source/pylib.md b/docs/source/pylib.md index b107fdd..2b12344 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -18,7 +18,34 @@ kernelspec: ```{code-cell} from benchtools import Bench -Bench('Tiniest Demo', concept ='the simplest bench') +tiny_bench = Bench('Tiniest Demo', concept ='a simplest test') +``` + + +```{code-cell} +from benchtools import Task + +tt = Task('greeting','Hello','hi', 'contains') +``` + + +```{code-cell} +tiny_bench.add_task(tt) +``` + + +```{code-cell} +response = tt.run() +``` + + +```{code-cell} +tt.score(response) +``` + + +```{code-cell} +tiny_bench.run() ``` diff --git a/setup.py b/setup.py index 5050cf6..d87338e 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='benchtools', - version='0.1', + version='0.2', packages=find_namespace_packages(), install_requires=[ 'Click', 'ollama', 'pandas', 'pyyaml', 'datasets' From cc2b61dc3029cf14029a88322aaa6e4044cd1679 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sat, 14 Feb 2026 16:53:32 -0500 Subject: [PATCH 69/78] more demo use --- benchtools/task.py | 6 +++--- docs/source/pylib.md | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/benchtools/task.py b/benchtools/task.py index 9940fb7..2b26300 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -55,7 +55,7 @@ def __init__(self, task_name, prompt, reference=None, scoring_function=None, @classmethod - def from_txt_csv(cls, task_name, task_folder): + def from_txt_csv(cls, task_name, source_folder): ''' load a template from txt and create task objects for each row of a csv @@ -63,10 +63,10 @@ def from_txt_csv(cls, task_name, task_folder): ''' # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor prompt = "" - with open(os.path.join(task_folder, "task.txt"), "r") as f: + with open(os.path.join(source_folder, "task.txt"), "r") as f: prompt = f.read() - value_answer_df = pandas.read_csv(os.path.join(task_folder, "values.csv")) + value_answer_df = pandas.read_csv(os.path.join(source_folder, "values.csv")) # answers = pandas.read_csv(os.path.join(task_folder, "results")) storedTasks = [] storedAnswers = [] diff --git a/docs/source/pylib.md b/docs/source/pylib.md index 2b12344..43dd0e2 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -29,25 +29,52 @@ tt = Task('greeting','Hello','hi', 'contains') ``` +```{code-cell} +response = tt.run() +``` 
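
The built-in scorers can also be called directly; a minimal illustration (assuming `contains` and `exact_match` are importable from `benchtools/scorers.py` as defined in this patch series):

```{code-cell}
from benchtools.scorers import contains, exact_match

# contains() scores 1 when the reference appears in the response text
contains('hi there!', 'hi'), exact_match('hi there!', 'hi')
```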
+ + +```{code-cell} +tt.score(response) +``` + ```{code-cell} tiny_bench.add_task(tt) ``` ```{code-cell} -response = tt.run() +tiny_bench.run() ``` ```{code-cell} -tt.score(response) +pre_built = Bench('math Demo', concept ='a math test') +add_task = Task.from_txt_csv('../../demobench/add') +pre_built.add_task(add_task) ``` ```{code-cell} -tiny_bench.run() +symb_task = Task.from_yaml('../../demobench/miscops') +pre_built.add_task(symb_task) ``` +```{code-cell} +pre_built.write() +``` + +```{code-cell} +pre_built.run() +``` + + +```{code-cell} +demo_bench = Bench.load('../../demobench') +``` + + + ## Creating a Benchmark object From 6581b95fd19f673c5a31c0ebe509d376fce323cd Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 15 Feb 2026 05:43:22 +0000 Subject: [PATCH 70/78] Bug fixes to get the CLI to reach . Tasks still need to be initialized in the Bench object --- benchtools/benchmark.py | 25 +++++++++++++++---------- benchtools/cli.py | 14 ++++++++------ benchtools/task.py | 2 +- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index edf0679..ed76558 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -41,7 +41,7 @@ class Bench(): run() Run one task or all tasks of the benchmark. ''' - def __init__(self, name, path = '.',concept = None, tasks = []): + def __init__(self, name, path='.', concept = None, tasks=[]): ''' Initialize the benchmark object with the name and path to the benchmark folder. @@ -125,15 +125,15 @@ def initialize_dir(self, no_git=False): # Create a benchmarks folder with tasks in them tasks_path = os.path.join(self.bench_path, "tasks") os.mkdir(tasks_path) - log_path = os.path.join(self.bench_path, "logs") - os.mkdir(log_path) + log_path = os.path.join(self.bench_path, "logs") # Do we want a log dir? + os.mkdir(log_path) # Do we want a log dir? # Create about.md about_path = os.path.join(self.bench_path, "about.md") - if self: - about_body = "*{self.concept}*" - about_text= about_template.format({'bench_name':self.bench_name, - 'text':about_body}) + if self: # if self.concept? 
+ about_body = f"*{self.concept}*" + about_text= about_template.format(bench_name=self.bench_name, + text = about_body) with open(about_path, 'w') as file: file.write(about_text) @@ -141,7 +141,7 @@ def initialize_dir(self, no_git=False): if not no_git: self.init_repo(self.bench_path) - for task_name, task_source in new_tasks: + for task_name, task_source in self.tasks: self.add_task(task_name, task_source) self.write() @@ -166,7 +166,7 @@ def write_tasks(self): ### Initialize git repository - def init_repo(bench_path): + def init_repo(self, bench_path): ''' Initialize the benchmark folder as git repo with gitiginore for python @@ -191,10 +191,15 @@ def init_repo(bench_path): def add_task(self, task_name, task_source): - + self.tasks.append(task_name) # setup_task(self.tasks_folder, task_name, task_source)) + def run(self): + + self.run_task(tasks) + def run_tasks(self, tasks): + self.runner = BenchRunner(tasks) diff --git a/benchtools/cli.py b/benchtools/cli.py index 43bea14..5bc5162 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,6 +1,6 @@ import os import click -from benchtools.designer import Bench +from benchtools.benchmark import Bench from benchtools.betterbench import betterbench, get_score # from task import PromptTask @@ -30,7 +30,7 @@ def init(benchmark_name, path, about, no_git, tasks): if not benchmark_name: benchmark_name = click.prompt("Enter the name of your benchmark (will be used as folder and repo name)", type=str) - benchmark_name = benchmark_name.strip().replace(" ", "_").lower() + benchmark_name = benchmark_name.strip() # Strip from surrouding whitespace # TODO: Handle existing benchmark if not os.path.exists(path): @@ -48,13 +48,15 @@ def init(benchmark_name, path, about, no_git, tasks): exit(4356) # create full path - bench_path = os.path.join(path, benchmark_name) + folder_name = benchmark_name.replace(" ", "_").lower() + bench_path = os.path.join(path, folder_name) - click.echo(f"Creating {benchmark_name} in {bench_path}") - benchmark = Bench(benchmark_name, bench_path) + # TODO: get concept? Is it different that about? 
+ click.echo(f"Creating {benchmark_name} Benchmark in {bench_path}") + benchmark = Bench(benchmark_name, path, about, tasks) # Build the benchmark folder - if benchmark.write(about, no_git, tasks): + if benchmark.initialize_dir(no_git): click.echo(f"Created {benchmark_name} benchmark successfully!") # TODO: Call betterbench CLI here diff --git a/benchtools/task.py b/benchtools/task.py index 2b26300..1a296e5 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -8,7 +8,7 @@ from datasets import load_dataset -from scorers import scoring_fx_list, contains, exact_match +from benchtools.scorers import scoring_fx_list, contains, exact_match class Task: From 1241c2f07804f751b12ae17a1549f24e4d4e1e2d Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 15 Feb 2026 06:23:36 +0000 Subject: [PATCH 71/78] Benchmark: run(): after reading the docs and the issues, I think this is how this should look like --- benchtools/benchmark.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index ed76558..4aa366b 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -195,11 +195,11 @@ def add_task(self, task_name, task_source): self.tasks.append(task_name) # setup_task(self.tasks_folder, task_name, task_source)) - def run(self): - - self.run_task(tasks) + def run(self, runner): + for task in self.tasks(): + self.run_task(task, runner) - def run_tasks(self, tasks): - self.runner = BenchRunner(tasks) + def run_task(self, task): + task.run(runner) From a77166b8ce0c0b8d1e714bed022dc3a2e610ff16 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 15 Feb 2026 07:11:06 +0000 Subject: [PATCH 72/78] CLI: got rid of absolute paths using the Bench.load logic. Applying Task logic in pylib.md to loading tasks into the Bench --- benchtools/benchmark.py | 11 +++++---- benchtools/cli.py | 51 +++++++++++++++++++++-------------------- benchtools/runner.py | 2 +- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index 4aa366b..7ec0f4c 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -68,7 +68,7 @@ def __init__(self, name, path='.', concept = None, tasks=[]): self.written = os.path.exists(self.bench_path) @classmethod - def load(bench_path): + def load(cls, bench_path): ''' Load a benchmark from a given path. The path should point to the benchmark folder. 
@@ -96,8 +96,9 @@ def load(bench_path): # load the tasks task_path = os.path.join(task_folder, task) content = os.listdir(task_path) + # TODO: Look at content to create Task objects and add them to tasks - bench = cls(info['bench_name'], bench_path, info['concept']) + bench = cls(info['bench_name'], bench_path, info['concept'], tasks) return bench @@ -191,9 +192,11 @@ def init_repo(self, bench_path): def add_task(self, task_name, task_source): + # TODO: Look at content to create Task objects and add them to tasks + # setup_task(self.tasks_folder, task_name, task_source)) - self.tasks.append(task_name) - # setup_task(self.tasks_folder, task_name, task_source)) + # self.tasks.append(task) + continue def run(self, runner): for task in self.tasks(): diff --git a/benchtools/cli.py b/benchtools/cli.py index 5bc5162..6692e85 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -1,6 +1,8 @@ import os import click +from benchtools.task import Task from benchtools.benchmark import Bench +from benchtools.runner import BenchRunner from benchtools.betterbench import betterbench, get_score # from task import PromptTask @@ -18,7 +20,7 @@ def benchtool(): @click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) @click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) @click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) -def init(benchmark_name, path, about, no_git, tasks): +def init(benchmark_name, path, about, no_git, task_sources): """ Initializes a new benchmark. @@ -50,8 +52,12 @@ def init(benchmark_name, path, about, no_git, tasks): # create full path folder_name = benchmark_name.replace(" ", "_").lower() bench_path = os.path.join(path, folder_name) - - # TODO: get concept? Is it different that about? + + tasks = [] + if task_sources: + # TODO: Look at content to create Task objects and add them to tasks + continue + click.echo(f"Creating {benchmark_name} Benchmark in {bench_path}") benchmark = Bench(benchmark_name, path, about, tasks) @@ -63,9 +69,12 @@ def init(benchmark_name, path, about, no_git, tasks): # betterbench() # Run? - to_run = click.confirm("Do you want to run the benchmark now?", default=True) - if to_run: - benchmark.run() + if tasks: + to_run = click.confirm("Do you want to run the benchmark now?", default=True) + if to_run: + # TODO: Get runner info and create runner object? + # benchmark.run(runner) + benchmark.run() @@ -98,12 +107,9 @@ def run(benchmark_path: str): Running the benchmark and generating logs , help="The path to the benchmark repository where all the task reside." """ - bench_path = os.path.abspath(benchmark_path) - if os.path.exists(bench_path): - bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path - benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) - click.echo(f"Running {benchmark.bench_name} now") - benchmark.run() + benchmark = Bench.load(benchmark_path) + click.echo(f"Running {benchmark.bench_name} now") + benchmark.run() @benchtool.command() @@ -116,12 +122,10 @@ def run_task(benchmark_path: str, task_name): , help="The path to the benchmark repository where all the task reside." 
, help="The name of the specific task you would like to run" """ - bench_path = os.path.abspath(benchmark_path) - if os.path.exists(bench_path): - bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path - benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) - click.echo(f"Running {task_name} now") - benchmark.run([task_name]) + + benchmark = Bench.load(benchmark_path) + click.echo(f"Running {task_name} now") + benchmark.run([task_name]) @benchtool.command() @click.argument('benchmark-path', required = True, type=str) @@ -132,13 +136,10 @@ def score(benchmark_path: str): , help="The path to the benchmark repository where all the task reside." , help="The name of the specific task you would like to run" """ - bench_path = os.path.abspath(benchmark_path) - if os.path.exists(bench_path): - bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path - benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) - click.echo(f"Scoring {benchmark.bench_name} now...") - score = get_score() - click.echo(f"Score: {score}") + benchmark = Bench.load(benchmark_path) + click.echo(f"Scoring {benchmark.bench_name} now...") + score = get_score() + click.echo(f"Score: {score}") # For debugging diff --git a/benchtools/runner.py b/benchtools/runner.py index 9f5bd33..810f2ea 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,7 +1,7 @@ # module to create and run benchmarks import os from benchtools.task import Task -from benchtools.designer import build_dir, init_repo, create_about, setup_task +# from benchtools.designer import build_dir, init_repo, create_about, setup_task # from log_file.py import log_agent_interaction From 7a4804cbaa2d6f741fe3110a18e85364c56ec526 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 15 Feb 2026 07:21:35 +0000 Subject: [PATCH 73/78] bug fix --- benchtools/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index 7ec0f4c..a3368bc 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -202,7 +202,7 @@ def run(self, runner): for task in self.tasks(): self.run_task(task, runner) - def run_task(self, task): + def run_task(self, task, runner): task.run(runner) From 96270137b6a0234693befbbc09340abe83542b02 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sun, 15 Feb 2026 16:35:19 -0500 Subject: [PATCH 74/78] compelte refactor closes #31 --- .gitignore | 7 + benchtools/benchmark.py | 158 +++++++-- benchtools/cli.py | 98 ++++-- benchtools/logger.py | 4 +- benchtools/runner.py | 2 +- benchtools/task.py | 324 ++++++++++++------ demobench/logs/.gitkeep | 0 demobench/tasks/miscops/task.yml | 9 - {demobench => demos/folderbench}/README.md | 0 .../folderbench/tasks/add/template.txt | 0 .../folderbench}/tasks/add/values.csv | 2 +- demos/folderbench/tasks/symbols/template.txt | 1 + demos/folderbench/tasks/symbols/values.csv | 4 + demos/listbench/info.yml | 2 + demos/listbench/tasks.yml | 13 + docs/source/cli.md | 9 +- docs/source/pylib.md | 37 +- 17 files changed, 461 insertions(+), 209 deletions(-) delete mode 100644 demobench/logs/.gitkeep delete mode 100644 demobench/tasks/miscops/task.yml rename {demobench => demos/folderbench}/README.md (100%) rename demobench/tasks/add/task.txt => demos/folderbench/tasks/add/template.txt (100%) rename {demobench => demos/folderbench}/tasks/add/values.csv (56%) create mode 100644 demos/folderbench/tasks/symbols/template.txt create mode 100644 demos/folderbench/tasks/symbols/values.csv create mode 100644 
demos/listbench/info.yml create mode 100644 demos/listbench/tasks.yml diff --git a/.gitignore b/.gitignore index c9aaf46..8d3ce44 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,11 @@ .vscode/ +# site and test related outputs +logs/* +docs/source/logs +demos/logs/* +demos/folderbench/logs/* +demos/listbench/logs/* + ## Python template for .gitignore sourced from Github's [gitignore](https://github.com/github/gitignore) # Byte-compiled / optimized / DLL files diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index a3368bc..8ce234f 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -5,7 +5,8 @@ import requests import yaml # from pathlib import Path # ??? - +from .task import Task +from pathlib import PurePath about_template = """# {bench_name} @@ -41,7 +42,7 @@ class Bench(): run() Run one task or all tasks of the benchmark. ''' - def __init__(self, name, path='.', concept = None, tasks=[]): + def __init__(self, name, base_path='.', bench_path=None, concept = None, tasks=[]): ''' Initialize the benchmark object with the name and path to the benchmark folder. @@ -61,14 +62,24 @@ def __init__(self, name, path='.', concept = None, tasks=[]): self.display_name = name.strip() self.concept = concept if concept else f'a benchmark about {name.strip()}' self.bench_name = name.strip().replace(" ", "_").lower() - self.base_path = path - self.bench_path = os.path.join(path, self.bench_name) + + if bench_path: + self.base_path = PurePath(bench_path).parent + self.bench_path = bench_path + else: + self.base_path = base_path + self.bench_path = os.path.join(base_path, self.bench_name) + self.tasks_folder = os.path.join(self.bench_path, 'tasks') - self.tasks = tasks # initialize a task object for each task. + if tasks: + self.tasks = {t.name:t for t in tasks} # initialize a task object for each task. + else: + self.tasks = {} + self.written = os.path.exists(self.bench_path) @classmethod - def load(cls, bench_path): + def from_folders(cls, bench_path): ''' Load a benchmark from a given path. The path should point to the benchmark folder. @@ -85,21 +96,69 @@ def load(cls, bench_path): if not os.path.exists(bench_path): raise ValueError("The passed path doesn't exist.") - task_folder = os.path.join(bench_path, 'tasks') - with open(os.path.join(bench_path, 'info.yml'), 'r') as f: - info = yaml.safe_load(f) + content = os.listdir(bench_path) + if 'tasks.yml' in content: + with open(os.path.join(bench_path, 'info.yml'), 'r') as f: + info = yaml.safe_load(f) + else: + info = {} + info['bench_name'] = PurePath(bench_path).parts[-1] + info['concept'] = f'a benchmark about {info["bench_name"]}' + + if 'tasks' in content: + task_folder = os.path.join(bench_path, 'tasks') + task_list = os.listdir(task_folder) + tasks = [] + for task_dir in task_list: + # load the tasks + task_path = os.path.join(task_folder, task_dir) + task = Task.from_txt_csv(task_path) + tasks.append(task) + else: + tasks = [] + + + return cls(name = info['bench_name'], + bench_path = bench_path, + concept = info['concept'], tasks=tasks) + + @classmethod + def from_yaml(cls, bench_path): + """ + Load tasks from a YAML file and generate Task objects. + + Parameters + ---------- + bench_path : str + Path to the YAML file containing task templates and values. + Returns + ------- + self : Bench + The Bench instance with tasks populated. 
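+
+        Notes
+        -----
+        Editor's note (inferred from the loader below, not normative): bench_path is
+        expected to be a benchmark folder containing an info.yml plus a tasks.yml whose
+        content is a list of task dictionaries in the shape read by Task.from_dict
+        (name, template, values, reference, description, and optionally a scoring
+        function).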
+ """ + # load the info + info = Bench.load_info(bench_path) + + # load the tasks + yaml_file = os.path.join(bench_path, 'tasks.yml') + with open(yaml_file, 'r') as file: + task_list = yaml.safe_load(file) + + tasks =[] + for task_dict in task_list: + tasks.append(Task.from_dict(task_dict)) + - task_list = os.listdir(task_folder) - tasks = [] - for task in task_list: - # load the tasks - task_path = os.path.join(task_folder, task) - content = os.listdir(task_path) - # TODO: Look at content to create Task objects and add them to tasks + return cls(name = info['bench_name'], bench_path =bench_path, + concept= info['concept'], tasks=tasks) - bench = cls(info['bench_name'], bench_path, info['concept'], tasks) - return bench + @staticmethod + def load_info(bench_path): + with open(os.path.join(bench_path, 'info.yml'), 'r') as f: + info = yaml.safe_load(f) + + return info def initialize_dir(self, no_git=False): @@ -117,7 +176,7 @@ def initialize_dir(self, no_git=False): Returns: -------- - self.built : bool + self.written: bool True if the benchmark was successfully built, False otherwise ''' @@ -131,8 +190,8 @@ def initialize_dir(self, no_git=False): # Create about.md about_path = os.path.join(self.bench_path, "about.md") - if self: # if self.concept? - about_body = f"*{self.concept}*" + + about_body = f"*{self.concept}*" about_text= about_template.format(bench_name=self.bench_name, text = about_body) with open(about_path, 'w') as file: @@ -142,8 +201,18 @@ def initialize_dir(self, no_git=False): if not no_git: self.init_repo(self.bench_path) - for task_name, task_source in self.tasks: - self.add_task(task_name, task_source) + task_types = set([task.storage_type for task in self.tasks.values()]) + if 'csv' in task_types: + for task_name, task_object in self.tasks.items(): + task_object.write(tasks_path) + + if task_types == {'yaml'}: + task_list = [] + for task in self.tasks.values(): + task_list.append(task.get_dict()) + + with open(os.path.join(tasks_path,'tasks.yml'), 'w') as file: + yaml.dump(task_list, file) self.write() self.written = True @@ -153,12 +222,11 @@ def initialize_dir(self, no_git=False): def write(self): info = {'bench_name': self.bench_name, 'concept': self.concept, - 'bench_path': self.bench_path, - 'tasks': self.tasks} + 'tasks': list(self.tasks.keys())} with open(os.path.join(self.bench_path, 'info.yml'), 'w') as f: yaml.dump(info, f) - # likely also writ the tasks and the about, if need to be updated + # likely also write the tasks and the about, if need to be updated def write_tasks(self): @@ -191,18 +259,40 @@ def init_repo(self, bench_path): os.chdir(current_dir) - def add_task(self, task_name, task_source): + def add_task(self, task_object): # TODO: Look at content to create Task objects and add them to tasks # setup_task(self.tasks_folder, task_name, task_source)) # self.tasks.append(task) - continue + self.tasks[task_object.name] = task_object - def run(self, runner): - for task in self.tasks(): - self.run_task(task, runner) - - def run_task(self, task, runner): - task.run(runner) + def run(self,model='gemma3',runner_type="ollama", api_url=None,): + ''' + Run the benchmark by running each task in the benchmark and logging the interactions. + Parameters: + ----------- + model: str default 'gemma3' + The name of the model to use for running the tasks. Default is 'gemma3'. 
+ ''' + if not self.written: + raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") + # TODO deal with results + for name, task in self.tasks.items(): + self.run_task(task, model,runner_type, api_url) + + def run_task(self, target_task=None, model='gemma3',runner_type="ollama", api_url=None): + if not(target_task): + # TODO: use a generator and make this have a state + target_task = list[self.tasks.keys()][0] + + if isinstance(target_task, str): + task_object = self.tasks[target_task] + elif isinstance(target_task, Task): + task_object = target_task + else: + raise ValueError("target_task should be either a string (task name) or a Task object.") + + logging_path = os.path.join(self.bench_path, 'logs') + return task_object.run(model,runner_type, api_url,logging_path) diff --git a/benchtools/cli.py b/benchtools/cli.py index 6692e85..55f6c7b 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -3,7 +3,7 @@ from benchtools.task import Task from benchtools.benchmark import Bench from benchtools.runner import BenchRunner -from benchtools.betterbench import betterbench, get_score +# from benchtools.betterbench import betterbench, get_score # from task import PromptTask @click.group() @@ -16,11 +16,14 @@ def benchtool(): # Initialize the benchmark @benchtool.command() @click.argument('benchmark-name', required=False) -@click.option('-p', '--path', help="The path where the new benchmark repository will be placed", default=".", type=str) -@click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", default="", type=str) -@click.option('--no-git', help="Don't make benchmark a git repository. Default is False", is_flag=True) -@click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) -def init(benchmark_name, path, about, no_git, task_sources): +@click.option('-p', '--path', help="The path where the new benchmark repository will be placed", + default=".", type=str) +@click.option('-a', '--about', help="Benchmark describtion. Content will go in the about.md file", + default="", type=str) +@click.option('--no-git', help="Don't make benchmark a git repository. Default is False", + is_flag=True) +# @click.option('-t', '--tasks', help="Add benchmark tasks to your benchmark (can add multiple). Format: ", default=[], type=(str, str), multiple=True) +def init(benchmark_name, path, about, no_git): """ Initializes a new benchmark. @@ -53,13 +56,14 @@ def init(benchmark_name, path, about, no_git, task_sources): folder_name = benchmark_name.replace(" ", "_").lower() bench_path = os.path.join(path, folder_name) - tasks = [] - if task_sources: - # TODO: Look at content to create Task objects and add them to tasks - continue + # tasks = [] + # if task_sources: + # # TODO: Look at content to create Task objects and add them to tasks + # continue click.echo(f"Creating {benchmark_name} Benchmark in {bench_path}") - benchmark = Bench(benchmark_name, path, about, tasks) + benchmark = Bench(name =benchmark_name, bench_path = bench_path, + concept = about) # Build the benchmark folder if benchmark.initialize_dir(no_git): @@ -69,7 +73,7 @@ def init(benchmark_name, path, about, no_git, task_sources): # betterbench() # Run? - if tasks: + if benchmark.tasks: to_run = click.confirm("Do you want to run the benchmark now?", default=True) if to_run: # TODO: Get runner info and create runner object? 
@@ -79,37 +83,42 @@ def init(benchmark_name, path, about, no_git, task_sources): @benchtool.command() -@click.argument('benchmark-path', required = True, type=str) @click.argument('task-name', required = True, type=str) -@click.argument('task-path', required = True, type=str) -def add_task(benchmark_path, task_name, task_path): +@click.option('-p','--benchmark-path', default='.', help="The path to the benchmark repository where the task will be added.", type=str) +@click.option('-s','task-source', type=str,help="The relative path to content that already exists`", required=True) +@click.option('-t','--task-type', type=click.Choice(['folders', 'list']), help="The type of the task content being added. Options are csv or yml", required=True) +def add_task(task_name, bench_path, task_source,task_type): """ Set up a new task. - # TODO explain arguments or convert to options. to use help - benchmark-path: "The path to the benchmark repository where the task will be added." - task-name: "The name of the task to be added. This will be used as the folder name for the task and should be unique within the benchmark." - task-path "The relative path to the dataset used for the task. OR any dataset from huggingface that starts with `openai/`" """ - bench_path = os.path.abspath(benchmark_path) + if os.path.exists(bench_path): - bench_path = bench_path[:-1] if bench_path.endswith('/') else bench_path - benchmark = Bench(bench_path.rsplit('/',1)[1], bench_path) - benchmark.add_task(task_name, task_path) + benchmark = Bench.load(bench_path) + if task_source: + if os.path.isdir(task_source): + task = Task.from_txt_csv(task_source) + elif os.path.isfile(task_source): + task = Task.from_yaml(task_source) + elif task_type: + match task_type: + case 'folders': + storage_type = 'csv' + case 'list': + storage_type = 'yaml' + task = Task(name=task_name, template= "fill in your prompt template here", + description = "add a description of your task here", + storage_type=storage_type) + else: + click.echo("Invalid task content type. Either provide content with --task-source or specify the type of task content with --type.") + exit(4356) + + # TODO: handle adding to benchmark with metadata + # benchmark.add_task(task) + task.write(bench_path) + click.echo(f"Added {task_name} to {benchmark.bench_name} benchmark successfully!") else: click.echo("No benchmark reposiory at " + bench_path) - - -@benchtool.command() -@click.argument('benchmark-path', required = True, type=str) -def run(benchmark_path: str): - """ - Running the benchmark and generating logs - , help="The path to the benchmark repository where all the task reside." - """ - benchmark = Bench.load(benchmark_path) - click.echo(f"Running {benchmark.bench_name} now") - benchmark.run() @benchtool.command() @@ -127,6 +136,25 @@ def run_task(benchmark_path: str, task_name): click.echo(f"Running {task_name} now") benchmark.run([task_name]) +@benchtool.command() +@click.argument('benchmark-path', required = True, type=str) +def run(benchmark_path: str): + """ + Running the benchmark and generating logs + , help="The path to the benchmark repository where all the task reside." 
+ """ + # check folder to see if folder or yaml type to load benchmark + if os.path.isdir(benchmark_path): + content = os.listdir(benchmark_path) + if 'tasks.yml' in content: + benchmark = Bench.from_yaml(benchmark_path) + else: + benchmark = Bench.from_folders(benchmark_path) + click.echo(f"Running {benchmark.bench_name} now") + benchmark.run() + + + @benchtool.command() @click.argument('benchmark-path', required = True, type=str) def score(benchmark_path: str): diff --git a/benchtools/logger.py b/benchtools/logger.py index 41cb479..328958a 100644 --- a/benchtools/logger.py +++ b/benchtools/logger.py @@ -26,9 +26,9 @@ def init_logger(log_path, task_name): # print(logger) # Debugging return logger -def log_agent_interaction(logger, agent_input, agent_output): +def log_interaction(logger, agent_input, agent_output): """ - Logs the agent's input and output to a file. + Logs the aevent to Parameters: ------------- diff --git a/benchtools/runner.py b/benchtools/runner.py index 810f2ea..71c6069 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -7,7 +7,7 @@ class BenchRunner(): ''' - TODO: this might not be how this ends up, but thi si scurrent thign + unused currenlty; possibly resurected for batch runs? ''' def run(self, tasks_torun=[], model='gemma3', api_url=None): diff --git a/benchtools/task.py b/benchtools/task.py index 1a296e5..f373ea1 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -2,10 +2,10 @@ # from openai import OpenAI import os import yaml # requires pyyaml -import pandas +import pandas as pd from ollama import chat, ChatResponse, Client -from benchtools.logger import init_logger, log_agent_interaction - +from benchtools.logger import init_logger, log_interaction +from pathlib import PurePath from datasets import load_dataset from benchtools.scorers import scoring_fx_list, contains, exact_match @@ -16,8 +16,8 @@ class Task: defines a basic prompt task with a simple scoring function """ - def __init__(self, task_name, prompt, reference=None, scoring_function=None, - prompt_variants = None, storage_type = 'yaml' ): + def __init__(self, task_name, template, reference=None, scoring_function=None, + variant_values = None, storage_type = 'yaml', description = None): """ init a task object from a prompt and reference, and a scoring function. If no scoring function is provided, defaults to exact match. @@ -26,125 +26,203 @@ def __init__(self, task_name, prompt, reference=None, scoring_function=None, dir : string or path directory containing the task assets prompt: string - prompt for task or overall description + prompt template scoring_function : function handle or string if string, must be name of built in eval function provided here - reference: string or number or list of - solution that will be passed with the model answer to the scoring function + reference: string, number, or list of strings or numbers the same shape as variant values, + solution that will be passed with the model answer to the scoring function, + variant_values: + dicttionary or list of dictiornaries with values to fill in a template, if the task is a template based task. If provided, the prompt will be used as a template and the values in variant_values will be used to fill in the template to create the final prompts for the task. The reference should then be a list of answers corresponding to each prompt variant. 
""" self.name = task_name + self.id = task_name.strip().replace(" ", "_").lower() - if prompt_variants: - self.sub_tasks = prompt_variants - self.description = prompt - self.reference = reference - else: - self.sub_tasks = [prompt] - self.reference = [reference] - self.description = f"a basic prompt task with: {prompt}" - + self.template = template + self.variant_values = variant_values + self.description = description + self.reference = reference + self.storage_type = storage_type if scoring_function: if isinstance(scoring_function, str): self.scoring_function = scoring_fx_list.get(scoring_function, exact_match) - if isinstance(scoring_function, callable): + elif callable(scoring_function): self.scoring_function = scoring_function + else: + # throw an error that scoring is not valid + raise ValueError(f"Scoring function {scoring_function} is not valid, must be a string name"+ + "of a built in function or a function handle") else: self.scoring_function = exact_match + def generate_prompts(self): + ''' + if the task is a template based task, generate the prompts by filling + in the template with the variant values + ''' + # TODO: consider if this could be a generator function if there are a lot of variants, to avoid memory issues. For now, we will assume that the number of variants is small enough to generate all prompts at once. + if self.variant_values: + prompt_list = [] + for value_set in self.variant_values: + prompt = self.template + prompt = prompt.format(**value_set) + prompt_list.append(prompt) + return prompt_list + else: + return [self.template] + @classmethod - def from_txt_csv(cls, task_name, source_folder): + def from_txt_csv(cls, source_folder, task_name = None, scoring_function = None): ''' load a template from txt and create task objects for each row of a csv - folder must contain a task.txt file with the template, and a values.csv file with the values to fill in the template, and the reference answers. The csv should be structured as follows: + folder must contain a template.txt file with the template, and a values.csv file with the values to fill in the template, and the reference answers. 
The csv should be structured as follows: ''' - # using pandas to load the csv is easy, then use python string formatting to set up the final prompt to apss to the task constructor + + + if not task_name: + # get the folder name if not provided + task_name = PurePath(source_folder).parts[-1] + # decide if using this: .replace("_", " ").title() + prompt = "" - with open(os.path.join(source_folder, "task.txt"), "r") as f: + with open(os.path.join(source_folder, "template.txt"), "r") as f: prompt = f.read() - value_answer_df = pandas.read_csv(os.path.join(source_folder, "values.csv")) - # answers = pandas.read_csv(os.path.join(task_folder, "results")) - storedTasks = [] - storedAnswers = [] - for x in range(len(value_answer_df)): - processed_prompt = prompt.replace("{a}", str(value_answer_df.iloc[x,0])) - processed_prompt = processed_prompt.replace("{b}", str(value_answer_df.iloc[x, 1])) - storedTasks.append(processed_prompt) - # print("Prompt: "+ processed_prompt) # Debugging - storedAnswers.append(str(value_answer_df.iloc[x, 2])) + values_file = os.path.join(source_folder, "values.csv") + # load and strip whitespace from column names + value_answer_df = pd.read_csv(values_file).rename(columns=lambda x: x.strip()) - description = f"a template based task with template: {prompt} and values like:\n\n {value_answer_df.head().to_markdown()}" - - return cls(task_name, prommpt =description, prompt_variants = storedTasks, - reference=storedAnswers, storage_type ='csv') - + variant_values = value_answer_df.drop(columns='reference').to_dict(orient='records') + reference = value_answer_df['reference'].tolist() + + # TODO: improve this + if os.path.exists(os.path.join(source_folder, "description.txt")): + with open(os.path.join(source_folder, "description.txt"), "r") as f: + description = f.read() + else: + description = f"a template based task with template: {prompt} and values like:\n\n {value_answer_df.head().to_markdown()}" + return cls(task_name, template= prompt, variant_values = variant_values, description = description, + reference=reference, storage_type ='csv', scoring_function=scoring_function) + @classmethod - def from_yaml(cls, task_name, yaml_file): - """ - Load tasks from a YAML file and generate PromptTask objects. - Parameters - ---------- - yaml_file : str - Path to the YAML file containing task templates and values. - Returns - ------- - self : Bench - The Bench instance with tasks populated. - """ + def from_yaml(cls, source_folder, task_name = None, scoring_function = None): + ''' + load a task from a yaml file. 
The yaml file should have the following structure: + name: string + template: string + values: list of dicts (optional) + reference: string, number, or list of strings or numbers the same shape as variant values (optional) + scoring_function: string or function handle (optional) + ''' + yaml_file = os.path.join(source_folder, "task_info.yml") with open(yaml_file, 'r') as file: - data = yaml.safe_load(file) - storedTasks = [] - storedAnswers = [] - for sub_task in data: - template = sub_task["template"] # Extract template - values_dict = sub_task["values"] # Extract values dictionary - answers = sub_task["result"] - # Generate all possible value combinations using itertools.product - keys = values_dict.keys() - value_combinations = zip(*values_dict.values()) - # T - # Create a PromptTask for each combination - for values in value_combinations: - value_mapping = dict(zip(keys, values)) # Pair keys with values - filled_prompt = template.format(**value_mapping) # Format the template - # print("Prompt: "+ filled_prompt) # Debugging - storedTasks.append(filled_prompt) # Store task - for answer in answers: - storedAnswers.append(answer) + task_dict = yaml.safe_load(file) - description = f"a template based task with template:" + return cls(task_dict['name'], template= task_dict['template'], + variant_values = task_dict['values'], + description = task_dict.get('description', None), + reference=task_dict['reference'], + storage_type ='yaml', + scoring_function=task_dict.get('scorer', None) or scoring_function) - return cls(task_name,description , prompt_variants = storedTasks, reference=storedAnswers, - storage_type ='yaml') + @classmethod + def from_dict(cls, task_dict): + ''' + load a task from a dictionary, which could be useful for loading from yaml or json files. The dictionary should have the following structure: + { + "template": string, + "values": list of dicts (optional), + "reference": string, number, or list of strings or numbers the same shape as variant values (optional), + "scoring_function": string or function handle (optional) + } + ''' + compact_values = task_dict.get("values", None) + print('in') + if compact_values: + expanded_values = pd.DataFrame(compact_values).to_dict(orient='records') + else: + expanded_values = None + + + return cls(task_dict.get("name", "unnamed_task"), + template = task_dict.get("template", ""), + variant_values=expanded_values, + reference = task_dict.get("reference", None), + scoring_function = task_dict.get("scoring_function", None), + description = task_dict.get("description", None), + storage_type='yaml') - @staticmethod - def from_hf_dataset(task_folder: str, hf_path: str): + @classmethod + def from_hf_dataset(cls,task_name, hf_path, prompt_column='prompt', answer_column='canonical_solution'): ''' dataset must have columns 'prompt' and 'canonical_solution' for now, can be expanded in the future. 
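        Examples
        --------
        An illustrative call only (the dataset id below is an assumption used for
        this sketch, not something bundled with benchtools); the dataset is
        expected to expose a 'test' split containing the named columns:

            task = Task.from_hf_dataset('humaneval_demo', 'openai/openai_humaneval',
                                        prompt_column='prompt',
                                        answer_column='canonical_solution')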
''' - with open(os.path.join(task_folder, 'task.txt'), 'w') as f: - f.write('{p}') - - dataset = load_dataset(hf_path) - dataset_test = dataset['test'] - with open(os.path.join(task_folder, 'values.csv'), 'w') as f: - f.write('p,res') - for row in dataset_test: - prompt = row['prompt'] - answer = row['canonical_solution'] - f.write(f"{prompt,answer}") + dataset = load_dataset(hf_path) + dataset_test = dataset['test'] + + stored_tasks = dataset_test[prompt_column] + stored_answers = dataset_test[answer_column] + + description = f"a task base don the Hugging Face dataset {hf_path} with prompt column {prompt_column} and answer column {answer_column}" + + return cls(task_name, prommpt =description, variant_values = stored_tasks, + reference=stored_answers, storage_type ='csv') def write(self, target_path): ''' write the task ''' # choose the writer and call it + match self.storage_type: + case 'yaml': + self.write_yaml(target_path) + case 'csv': + self.write_csv(target_path) + + def get_dict(self): + + task_dict = { + "name": self.name, + "template": self.template, + "values": self.variant_values, + "reference": self.reference, + "scorer": self.scoring_function.__name__ if callable(self.scoring_function) else self.scoring_function, + "description": self.description + } + return task_dict + + def write_yaml(self, target_path): + ''' + write the task to a yaml file + ''' + data = self.get_dict() + task_path = os.path.join(target_path, self.id) + os.makedirs(task_path, exist_ok=True) + with open(os.path.join(task_path,'task_info.yml'), 'w') as file: + yaml.dump(data, file) + + def write_csv(self, target_folder): + ''' + write the task to a csv file with a task.txt template file + ''' + # write the template + with open(os.path.join(target_folder, 'template.txt'), 'w') as f: + f.write(self.template) + + + with open(os.path.join(target_folder, 'description.txt'), 'w') as f: + f.write(self.description) + + # write the values and answers to a csv + value_answer_df = pd.DataFrame(self.variant_values) + value_answer_df.to_csv(os.path.join(target_folder, 'values.csv'), index=False) + + # Create a benchmarks folder with tasks in them def initialize_task_dir(tasks_path, task_name: str, task_source=None, @@ -169,38 +247,38 @@ def initialize_task_dir(tasks_path, task_name: str, task_source=None, should be like ownser/dataset_name ''' - print(f"Setting up {task_name}...", end='') + # print(f"Setting up {task_name}...", end='') task_folder = os.path.join(tasks_path, task_name) os.mkdir(task_folder) # TODO: check if folder exists and handle - if is_huggingface: - download_dataset(task_folder, task_source) - print("Success") - return + # if is_huggingface: + # download_dataset(task_folder, task_source) + # print("Success") + # return - # Path could be absolute or relative, check and work accordingly - # if not task_source.startswith('/'): - # if task_source.startswith('./'): - # # TODO: Path could have one or more `../` use relpath to fix this block - # task_source = task_source[2:] - # task_source = os.path.join(os.getcwd(), task_source) - # print(f" path {task_source}\n\n") # Debugging + # # Path could be absolute or relative, check and work accordingly + # # if not task_source.startswith('/'): + # # if task_source.startswith('./'): + # # # TODO: Path could have one or more `../` use relpath to fix this block + # # task_source = task_source[2:] + # # task_source = os.path.join(os.getcwd(), task_source) + # # print(f" path {task_source}\n\n") # Debugging - # could be a single file or a folder check and work 
accordignly - if os.path.isdir(task_source): - for sub in os.listdir(task_source): - shutil.copy2(os.path.join(task_source, sub), task_folder) - else: - shutil.copy2(task_source, task_folder) - print("Success") + # # could be a single file or a folder check and work accordignly + # if os.path.isdir(task_source): + # for sub in os.listdir(task_source): + # shutil.copy2(os.path.join(task_source, sub), task_folder) + # else: + # shutil.copy2(task_source, task_folder) + # print("Success") - def run(self, model,runner_type="ollama", api_url=None): + def run(self, model='gemma3',runner_type="ollama", api_url=None,logging_path = None): """ - run the task on the model + run the task on the stated model and log the interactions. Parameters ---------- @@ -213,8 +291,15 @@ def run(self, model,runner_type="ollama", api_url=None): to use the Ollama runner, the script expects the model to be installed, and `ollama serve` running on localhost:11434 to use OpenAI runner, you must have an API key set in your OPENAI_API_KEY environment variable """ + responses = [] - for sub_task in self.sub_tasks: + if not logging_path: + logging_path = 'logs' + if not os.path.exists(logging_path): + os.mkdir(logging_path) + self.logger = init_logger(logging_path, self.name) + + for sub_task in self.generate_prompts(): # print(sub_task) match runner_type: @@ -226,7 +311,7 @@ def run(self, model,runner_type="ollama", api_url=None): }, ]) # print("response: " + response.message.content) - self.responses.append(response.message.content) + responses.append(response.message.content) case "ollama_api": client = Client( @@ -241,7 +326,7 @@ def run(self, model,runner_type="ollama", api_url=None): }, ], ) - self.responses.append(response["message"]["content"]) + responses.append(response["message"]["content"]) case "openai": client = OpenAI( @@ -256,12 +341,21 @@ def run(self, model,runner_type="ollama", api_url=None): } ], ) - self.responses.append(chat_completion.choices[0].message.content) + responses.append(chat_completion.choices[0].message.content) case _: - print(f"Runner type {self.runner_type} not supported") + print(f"Runner type {runner_type} not supported") return None - log_agent_interaction(self.logger, sub_task, response.message.content) + log_interaction(self.logger, sub_task, response.message.content) + + + if self.variant_values: + self.responses = responses + # dict(zip([str(v) for v in self.variant_values], responses)) + else: + self.responses = responses + + return self.responses def score(self, response): @@ -273,7 +367,13 @@ def score(self, response): response : string the value to score """ - return self.scoring_function(response, self.reference) + if isinstance(self.reference, list) and isinstance(response, list): + # TODO: error if the lengths don't match + # if there are multiple reference answers, score against each + scores = [self.scoring_function(resp, ref) for resp,ref in zip(self.reference)] + return scores + else: + return self.scoring_function(response, self.reference) # additional classes for other types of tasks diff --git a/demobench/logs/.gitkeep b/demobench/logs/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/demobench/tasks/miscops/task.yml b/demobench/tasks/miscops/task.yml deleted file mode 100644 index 4787603..0000000 --- a/demobench/tasks/miscops/task.yml +++ /dev/null @@ -1,9 +0,0 @@ -- template: "find the product of {a} and {b}" - values: - a: [2,3,5] - b: [3,4,5] - result: [6,12,25] -- template: "what is the name for the following symbol? 
{symb}" - values: - symb: ["@","$","#"] - result: ["at", "dollar sign", "pound"] \ No newline at end of file diff --git a/demobench/README.md b/demos/folderbench/README.md similarity index 100% rename from demobench/README.md rename to demos/folderbench/README.md diff --git a/demobench/tasks/add/task.txt b/demos/folderbench/tasks/add/template.txt similarity index 100% rename from demobench/tasks/add/task.txt rename to demos/folderbench/tasks/add/template.txt diff --git a/demobench/tasks/add/values.csv b/demos/folderbench/tasks/add/values.csv similarity index 56% rename from demobench/tasks/add/values.csv rename to demos/folderbench/tasks/add/values.csv index 183572a..81a9795 100644 --- a/demobench/tasks/add/values.csv +++ b/demos/folderbench/tasks/add/values.csv @@ -1,4 +1,4 @@ -a,b,res +a,b,reference 2,3,5 4,5,9 8,9,17 \ No newline at end of file diff --git a/demos/folderbench/tasks/symbols/template.txt b/demos/folderbench/tasks/symbols/template.txt new file mode 100644 index 0000000..39c2ebb --- /dev/null +++ b/demos/folderbench/tasks/symbols/template.txt @@ -0,0 +1 @@ +what is the name for the following symbol? {symb} \ No newline at end of file diff --git a/demos/folderbench/tasks/symbols/values.csv b/demos/folderbench/tasks/symbols/values.csv new file mode 100644 index 0000000..0ba7c0f --- /dev/null +++ b/demos/folderbench/tasks/symbols/values.csv @@ -0,0 +1,4 @@ +symb, reference +@, at +#, pound +$, dollar sign \ No newline at end of file diff --git a/demos/listbench/info.yml b/demos/listbench/info.yml new file mode 100644 index 0000000..47997ff --- /dev/null +++ b/demos/listbench/info.yml @@ -0,0 +1,2 @@ +bench_name: list bench +concept: simple example of a benchmark defined by a list diff --git a/demos/listbench/tasks.yml b/demos/listbench/tasks.yml new file mode 100644 index 0000000..d458844 --- /dev/null +++ b/demos/listbench/tasks.yml @@ -0,0 +1,13 @@ +- name: product + template: "find the product of {a} and {b}" + values: + a: [2,3,5] + b: [3,4,5] + reference: [6,12,25] + scorer: "exact_match" +- name: symbol + template: "what is the name for the following symbol? {symb}" + values: + symb: ["@","$","#"] + reference: ["at", "dollar sign", "pound"] + scorer: "contains" \ No newline at end of file diff --git a/docs/source/cli.md b/docs/source/cli.md index d43609f..230cc73 100644 --- a/docs/source/cli.md +++ b/docs/source/cli.md @@ -1,18 +1,23 @@ # CLI +We an initialize without tasks + ```bash -benchtool init new_test -p . 
-t add ../datasets/add/ -t Gaps ../datasets/miscops/ -a "this is a demo for benchtools" +cd demos +benchtool init testbench -a "to test a simple example" --no-git ``` + ```bash +cd testbench benchtool add-task ../new_test/ FillIn ../datasets/miscops/ ``` ``` -benchtool run testRuns/111 +benchtool run demos/folderbench ``` diff --git a/docs/source/pylib.md b/docs/source/pylib.md index 43dd0e2..6ecff7c 100644 --- a/docs/source/pylib.md +++ b/docs/source/pylib.md @@ -15,63 +15,74 @@ kernelspec: # BenchTools as a Python Library +## A tiny examle + +we can create a tiny benchmark programmatically ```{code-cell} from benchtools import Bench -tiny_bench = Bench('Tiniest Demo', concept ='a simplest test') +tiny_bench = Bench('Tiniest Demo', concept ='the simplest test') ``` ```{code-cell} from benchtools import Task -tt = Task('greeting','Hello','hi', 'contains') +tt = Task('greeting','Hello there','hi', 'contains') ``` - + ```{code-cell} response = tt.run() ``` - ```{code-cell} tt.score(response) ``` ```{code-cell} tiny_bench.add_task(tt) +add_task = Task.from_txt_csv('../../demos/folderbench/tasks/add') +tiny_bench.add_task(add_task) ``` +For demo purposes we delete the folder, if it exists, before running. +```{code-cell} +%%bash +rm -rf tiniest_demo +``` ```{code-cell} +tiny_bench.initialize_dir() tiny_bench.run() ``` ```{code-cell} -pre_built = Bench('math Demo', concept ='a math test') -add_task = Task.from_txt_csv('../../demobench/add') -pre_built.add_task(add_task) +pre_built_yml = Bench.from_yaml('../../demos/listbench') +pre_built_yml.written ``` +we can access individual tasks: ```{code-cell} -symb_task = Task.from_yaml('../../demobench/miscops') -pre_built.add_task(symb_task) +pre_built_yml.tasks['product'].variant_values ``` + + ```{code-cell} -pre_built.write() +pre_built_yml.run() ``` ```{code-cell} -pre_built.run() +demo_bench = Bench.from_yaml('../../demos/listbench') ``` -```{code-cell} + From b4d3be0c08f4c123ec770c1707575d21d6439fb2 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sun, 15 Feb 2026 22:57:54 +0000 Subject: [PATCH 75/78] BetterBench: Betterbench interactive session --- benchtools/assets/.gitignore | 216 +++++++++++++++++++++++++ benchtools/assets/betterbench.yml | 252 ++++++++++++++++++++++++++++++ benchtools/benchmark.py | 1 + benchtools/betterbench.py | 214 ++++++++++++------------- benchtools/cli.py | 36 +++-- requirements.txt | 1 + 6 files changed, 604 insertions(+), 116 deletions(-) create mode 100644 benchtools/assets/.gitignore create mode 100644 benchtools/assets/betterbench.yml diff --git a/benchtools/assets/.gitignore b/benchtools/assets/.gitignore new file mode 100644 index 0000000..e15106e --- /dev/null +++ b/benchtools/assets/.gitignore @@ -0,0 +1,216 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. 
+# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/benchtools/assets/betterbench.yml b/benchtools/assets/betterbench.yml new file mode 100644 index 0000000..86b5f3e --- /dev/null +++ b/benchtools/assets/betterbench.yml @@ -0,0 +1,252 @@ +# Design +"The tested capability, characteristic, or concept is defined": + - "Tested concept, capability, or characteristic not explicitly mentioned." + - "Tested concept explicitly mentioned and need for definition acknowledged, but definition not provided." + - "Tested concept, capability, or characteristic explicitly mentioned but not defined." + - "Tested concept, capability, or characteristic explicitly mentioned and defined." + +"How tested capability or concept translates to benchmark task is described": + - "No description of how the tested capability or concept translates to the benchmark task." + - "Acknowledgement that not describing how the tested capability or concept translates to the benchmark task is an issue, but no description provided." + - "Description of how tested capability or concept translates to benchmark tasks provided for some but not all tasks." + - "Description of how tested capability or concept translates to benchmark tasks provided for all tasks." + +"How knowing about the tested concept is helpful in the real world is described": + - "No description of how knowing about the tested concept is helpful in the real world." + - "Acknowledgement that not describing how knowing about the tested concept is helpful in the real world is an issue, but no description provided." + - "Limited description of how knowing about the tested concept is helpful in the real world." + - "Full description of how knowing about the tested concept is helpful in the real world." + +"How benchmark score should or shouldn't be interpreted/used is described": + - "The benchmark does not comment on how the benchmark scores should or shouldn't be interpreted." + - "The benchmark acknowledges that the benchmark scores need to be interpreted but gives no guidance with respect to how or how not to do that." + - "The benchmark describes how scores should or shouldn't be interpreted or used, but not both." + - "The benchmark describes how scores should and shouldn't be interpreted or used." + +"Domain experts are involved": + - "None of the authors has a background in the benchmark domain and no external experts were consulted during the design process." + - "The benchmark mentions domain experts but doesn't specify any further details." + - "The benchmark mentions that domain experts were consulted but not how their insights influenced the benchmark design." + - "At least one of the co-authors has a professional or academic background in the benchmark domain or the benchmark specified how external experts were consulted and how that influenced the design process." 
+ +# has n/a +"Use cases and/or user personas are described": + - "The benchmark does not include any description of use cases or user personas." + - "The benchmark acknowledges the importance of use cases or user personas but does not explicitly formulate or describe them." + - "The benchmark provides a partial description of use cases or user personas." + - "The benchmark fully describes use cases and user personas, specifying the cultural and geographic context, types of human-model interactions (if applicable), and representing different user types that might interact with the AI system (if applicable)." + - "For AI systems that do not involve direct human interaction, such as those used in industrial automation or scientific simulations, defining user personas is not relevant. However, real-world use cases should still be specified; in more theoretical benchmarks, this use case might be to advance research." + +"Domain literature is integrated": + - "The benchmark does not reference domain-specific literature." + - "The benchmark mentions the need to integrate domain literature but did not address it in the background section or design process." + - "The benchmark references domain literature in the background or related work section but does not describe how that domain literature informed the benchmark design process." + - "The benchmark references domain literature throughout the paper and describes how that domain literature informed the benchmark design process." + +"Informed performance metric choice": + - "The benchmark does not mention an evaluation metric or does not explain the choice of metric." + - "The benchmark acknowledges the need for an informed metric choice but does not justify their metric choice." + - "The benchmark provides an explanation for the choice of some but not all of their metrics." + - "The benchmark provides an explanation for the choice of all of their metrics." + +"Metric floors and ceilings are included": + - "The benchmark doesn't provide any floors or ceilings." + - "Floors and ceilings are shown in the results figure but not explicitly mentioned in the text." + - "The benchmark provides floors and ceilings for some but not all evaluation metrics." + - "The benchmark provides floors and ceilings for all evaluation metrics." + +# has n/a +"Human performance level is included": + - "The benchmark does not state human performance and does not explain why this isn't applicable here." + - "The benchmark mentions human performance in passing but does not provide a measurement or explanation." + - "The benchmark states human performance but does not explain how it was obtained." + - "The benchmark states human performance and explains how it was obtained." + - "The benchmark task cannot be completed by a human, and hence reporting human performance is not possible." + +# has n/a +"Random performance level is included": + - "The benchmark does not state random performance and does not explain why this isn't applicable here." + - "The benchmark mentions random performance but does not provide quantitative random performance on the benchmark task(s)." + - "The benchmark states random performance for some but not all tasks." + - "The benchmark states random performance for all tasks." + - "Measuring random performance on the benchmark task is not possible, and hence reporting random performance is not possible." 
+ +"Automatic evaluation is possible and validated": + - "The benchmark does not provide any form of automatic evaluation and relies entirely on human evaluation." + - "The benchmark mentions the benefits of automatic evaluation but provides no none or limited automatic valuation." + - "The benchmark includes an automatic evaluation method but does not offer any validation." + - "The benchmark includes an automatic evaluation method and describes how it was validated as well as the results of the validation." + +"Differences to related benchmarks are explained": + - "The benchmarks do not explain any differences or relevance to existing benchmarks." + - "The benchmark briefly mentions existing benchmarks but provides no explanations of differences or added value." + - "The benchmark provides an explanation of how it fills a gap or expands on existing benchmarks for some but not all mentioned related benchmarks." + - "The benchmark provides an explanation of how it fills a gap or expands on existing benchmarks for all mentioned related benchmarks." + +"Input sensitivity is addressed": + - "The benchmark does not mention or address input sensitivity." + - "The benchmark mentions the issue of input sensitivity but does not describe experiments to test for it." + - "The benchmark includes some input variations with the same semantic meaning but lacks thorough descriptions or details on the number of variations and their design." + - "The benchmark contains multiple input variations with the same semantic meaning, providing detailed descriptions of all relevant details such as the number of variations per prompt and how they were designed." + + +# Implementation +"The evaluation code is available": + - "The evaluation code is not publicly available." + - "The benchmark mentions the availability of evaluation code but does not provide access to it." + - "The evaluation code is publicly available for some metrics described by the benchmark." + - "The evaluation code is publicly available for all metrics described by the benchmark." + +"The evaluation data or generation mechanism is accessible": + - "No access to evaluation data, prompts, or data/environment generation mechanism is provided." + - "The existence of evaluation data, prompts, or data/environment generation mechanism is mentioned, but no concrete access is provided." + - "Partial access to evaluation data, prompts, or data/environment generation mechanism is provided, allowing for limited evaluation." + - "Full access to evaluation data, prompts, or data/environment generation mechanism is provided, enabling comprehensive evaluation." + +"The evaluation of models via API is supported": + - "The benchmark does not support evaluation of models via API calls." + - "The benchmark mentions the possibility of API evaluation but does not provide concrete implementation details." + - "The benchmark supports evaluation of models via one API." + - "The benchmark supports evaluation of models via two or more APIs to different models." + +"The evaluation of local models is supported": + - "The benchmark requires users to write their own code to evaluate a local model." + - "The benchmark mentions that local evaluation should be possible but doesn't provide corresponding code." + - "The benchmark provides minimal support for local model evaluation, requiring significant user effort." + - "The benchmark provides full support for local model evaluation with user-friendly code." 
+ +"A globally unique identifier is added or evaluation instances are encrypted": + - "The benchmark does not include a GUID or encryption of evaluation instances." + - "The benchmark acknowledges the risk of contamination but does not address it." + - "The benchmark partially implements a GUID or encryption, but not consistently across all relevant files." + - "The benchmark consistently includes a GUID or encryption across all relevant files and repositories." + +"A task to identify if model is trained on benchmark data": + - "The benchmark does not include a 'training_on_test_set' task." + - "The benchmark mentions the possibility that models were trained on its data but does not provide a way to check it." + - "The benchmark includes a partial or limited implementation of a 'training_on_test_set' task that only tests for part of the data used." + - "The benchmark includes a comprehensive 'training_on_test_set' task." + +"A script to replicate results is explicitly included": + - "[No description provided]" + - "The issue of result replicability is mentioned in the benchmark paper but not addressed." + - "A script to reproduce some results in the benchmark paper is available." + - "A script to reproduce all results in the benchmark paper is available." + +"Statistical significance or uncertainty quantification of benchmark results is reported": + - "No statistical significance testing or variance reporting is provided for the benchmark results." + - "The need for valid benchmarks and/or statistical significance or uncertainty estimation is mentioned but not addressed." + - "Benchmark developers bound the expected variation across model training runs." + - "Benchmark developers run statistical significance tests on the benchmark results for at least one model and provide variance bounds or other uncertainty estimations. In cases where the benchmark is perfectly deterministic, this is explicitly stated." + +"Need for warnings for sensitive/harmful content is assessed": + - "The benchmark does not mention that they checked for the presence or absence of sensitive/harmful content in the evaluation tasks or expected output." + - "The benchmark mentions the general possibility of sensitive/harmful content but does not provide clear statements or warnings." + - "The benchmark explicitly states the presence or absence of sensitive/harmful content for either the evaluation tasks or the expected output." + - "The benchmark explicitly states the presence or absence of sensitive/harmful content for both the evaluation tasks and the expected output." + +"A build status (or equivalent) is implemented": + - "The benchmark neither references nor implements any form of build status or equivalent." + - "The benchmark mentions the need for working evaluation code but does not implement it in any meaningful way." + - "The benchmark partially implements a build status or equivalent by providing the information in a less accessible manner." + - "The benchmark fully implements a build status or equivalent, clearly displaying the status of the most recent build and providing easy access to the information." + +"Release requirements are specified": + - "The benchmark does not specify any release requirements for benchmark users." + - "The benchmark briefly mentions the issue of potential gameability or misuse by benchmark users but does not provide specific details." + - "The benchmark states dos and don'ts how to use the benchmark but does not specify these as requirements for use." 
+ - "The benchmark provides a set of release requirements for benchmark users." + + +# Documentation: +"Requirements file or equivalent is available": + - "No requirements file or equivalent is provided." + - "A requirements file is mentioned but not provided." + - "A requirements file is provided but may be missing some dependencies or versions." + - "A complete and accurate requirements file specifying all necessary dependencies and versions is provided." + +"Quick-start guide or demo is available": + - "No quick-start guide or demo code is provided." + - "A quick-start guide or demo code is mentioned but not provided." + - "A quick-start guide or demo code is provided but may be missing some steps or details." + - "A comprehensive, step-by-step quick-start guide or demo code is provided." + +"In-line code comments are used": + - "No in-line code comments are provided." + - "In-line code comments are sparse and do not adequately explain the purpose, inputs, outputs, or functionality of the code." + - "Informative in-line code comments are present for most of the code but may be lacking in detail or clarity for some code segments." + - "Comprehensive and informative in-line code comments are provided for all relevant code segments, clearly explaining their purpose, inputs, outputs, and functionality." + +"Code documentation is available": + - "No code documentation is provided." + - "Code documentation is mentioned but not provided." + - "Code documentation is minimal or incomplete, lacking important details about the repository structure and functions." + - "Comprehensive code documentation is provided, including a clear overview of the folder structure, files in the repo, and detailed explanations of all relevant functions." + +"Accompanying paper is accepted at peer-reviewed venue": + - "The benchmark/its associated paper has not been accepted at a peer-reviewed venue." + - "The benchmark/its associated paper has been submitted to a peer-reviewed venue but is still under review or awaiting acceptance." + - "The benchmark/its associated paper has been accepted at a peer-reviewed workshop or symposium." + - "The benchmark/its associated paper has been accepted at a peer-reviewed journal, conference, or similar high-profile venue." + +"Benchmark construction process is documented": + - "No documentation of the benchmark construction process is provided." + - "The benchmark construction process is briefly mentioned but lacks sufficient detail about the decisions made, rationale, and trade-offs considered." + - "The benchmark construction process is documented, including some decisions made and their rationale, but the description lacks depth or fails to address important aspects such as trade-offs or compromises." + - "The benchmark construction process is comprehensively documented, providing a detailed account of the specific decisions made at each stage, the rationale behind them, and any trade-offs or compromises considered." + +"Test tasks & rationale are documented": + - "No documentation of test task categories or rationale is provided." + - "Test task categories are mentioned but they are neither defined in detail and a rationale for their selection is missing or inadequate." + - "Test task categories are defined, but the rationale for their selection is not provided." 
+ - "Test task categories are clearly defined, and a comprehensive rationale is provided, explaining their relevance to the benchmark's objectives, what they measure, and their importance for evaluating the targeted concept or capability." + +"Assumptions of normative properties are documented": + - "No documentation of normative assumptions is provided, even though the benchmark measures culturally-dependent properties." + - "The potential influence and importance of cultural context on the benchmark is acknowledged but normative assumptions aren't stated." + - "Normative assumptions are stated, but the explanation of how they are conceptualized and operationalized within the benchmark is incomplete or lacks clarity." + - "Normative assumptions are explicitly and clearly stated, defining the cultural context and values that the benchmark adheres to, and explaining how the measured properties are conceptualized and operationalized within the benchmark." + +"Limitations are documented": + - "No documentation of the benchmark's limitations is provided." + - "Limitations of AI evaluations more broadly are briefly mentioned but without any detail and not applied to the specific benchmark." + - "Either limitations regarding the applicability and use of the benchmark or limitations of the benchmark design are discussed, but not both." + - "Both limitations regarding the applicability and use of the benchmark and limitations of the benchmark design are comprehensively discussed." + +"Data collection, test environment design, or prompt design process is documented": + + - "No documentation of the data collection or environment/prompt design process is provided." + - "The data collection or environment/prompt design process is briefly mentioned but no information about the sources, selection criteria, preprocessing steps, or prompt validation is provided." + - "The data collection or environment/prompt design process is documented, including some information about the sources, selection criteria, preprocessing steps, or prompt validation, but the description lacks depth or fails to address important aspects." + - "The data collection or environment/prompt design process is comprehensively documented, providing a detailed account of the sources of the data, the criteria for selection, any preprocessing steps, and, if applicable, how the prompts were created, validated, and why they were used to elicit the desired responses." + +"Evaluation metric is documented": + - "No documentation of the evaluation metrics is provided." + - "The evaluation metrics are mentioned but not clearly defined, and the exact formulas or processes used to calculate them are not provided." + - "The evaluation metrics are defined, but the documentation lacks some important details, such as any parameters or thresholds employed." + - "The evaluation metrics are clearly specified. The exact formulas or processes used to calculate these metrics, along with any parameters or thresholds employed, are comprehensively documented." + +"Applicable license is specified": + - "No license is specified for the benchmark." + - "A license is mentioned but not clearly specified or linked to in the code repository or paper." + - "A license is specified but lacks some important details about the conditions under which the benchmark can be used, modified, or distributed." 
+ - "The applicable license for the benchmark is clearly specified in the code repository or paper, providing comprehensive information about the conditions under which the benchmark can be used, modified, and distributed." + +# Maintenance: +"Code usability was checked within the last year": + - "No updates to the main files of the public code within the last year, and no explicit statement of a usability check in the README file." + - "Updates to minor files in the repo were made (e.g., README file) but an explicit statement of a usability check in the README file is not reported." + - "Updates to the main files of the public code were made within the last year, but the build status check failed and wasn't fixed." + - "Updates to the main files of the public code within the last year, accompanied by a successful build status check, or an explicit statement of a usability check in the README file, including the date of the check was provided." + +"Maintained feedback channel for users is available": + - "No acknowledgment or response to GitHub issues that are older than three months." + - "GitHub issues are mentioned as a way to provide feedback but there are GitHub issues that were not responded to and that are older than three months." + - "All GitHub issues are acknowledged within three months, but not all are addressed or resolved or were closed because the issue/feature request won't be attended to." + - "All GitHub issues are acknowledged and addressed within three months, or it is clearly stated if an issue cannot be fixed or if a feature request won't be fulfilled. Alternatively, there are no open issues." + +"Contact person is listed": + - "It is not disclosed who developed the benchmark." + - "The benchmark developers are disclosed but no explicit contact details are provided." + - "Contact details are provided but are incomplete or difficult to find, e.g., only as part of terms of service on a website." + - "Contact details of the person responsible for the benchmark are easily accessible, such as a corresponding author in the associated paper, a contact person listed on GitHub or the website, or an available online feedback form." \ No newline at end of file diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index 8ce234f..a09a8dc 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -266,6 +266,7 @@ def add_task(self, task_object): # self.tasks.append(task) self.tasks[task_object.name] = task_object + def run(self,model='gemma3',runner_type="ollama", api_url=None,): ''' Run the benchmark by running each task in the benchmark and logging the interactions. 
diff --git a/benchtools/betterbench.py b/benchtools/betterbench.py index bcbb290..c40d858 100644 --- a/benchtools/betterbench.py +++ b/benchtools/betterbench.py @@ -1,143 +1,145 @@ import os -import json +import yaml +# import json import click -import dataclasses +# import dataclasses from dataclasses import dataclass +from click_prompt import choice_option -class EnhancedJSONEncoder(json.JSONEncoder): - def default(self, o): - if dataclasses.is_dataclass(o): - return dataclasses.asdict(o) - #if it is a function, use its string name - elif hasattr(o, '__call__'): - return o.__name__ - return super().default(o) +# class EnhancedJSONEncoder(json.JSONEncoder): +# def default(self, o): +# if dataclasses.is_dataclass(o): +# return dataclasses.asdict(o) +# #if it is a function, use its string name +# elif hasattr(o, '__call__'): +# return o.__name__ +# return super().default(o) + +# We'll see if this is needed: @dataclass -class ChecklistQuestion: - response: str +class Question: + question_text: str justification: str - score: int criteria: list[str] + NA: bool + +@dataclass +class ChecklistItem: + # question: Question # Again, we'll see... skipped: bool + response: str + justification: str + score: int + # criteria: list[str] + +# def calculate_score(response: str, justification: str) -> int: +# if response == 'no': +# return 0 +# else: +# TODO + +with open("benchtools/assets/betterbench.yml", 'r') as f: # NEED manifest? + main_checklist = yaml.safe_load(f) -def calculate_score(response: str, justification: str) -> int: - if response == 'no': - return 0 - else: - TODO - - -Checklist = [ - # Design - "The tested capability, characteristic, or concept is defined", - "How tested capability or concept translates to benchmark task is described", - "How knowing about the tested concept is helpful in the real world is described", - "How benchmark score should or shouldn't be interpreted/used is described", - "Domain experts are involved", - "Use cases and/or user personas are described", # Has n/a - "Domain literature is integrated", - "Informed performance metric choice", - "Metric floors and ceilings are included" - "Human performance level is included", # Has n/a - "Random performance level is included", # Has n/a - "Automatic evaluation is possible and validated", - "Differences to related benchmarks are explained", - "Input sensitivity is addressed", - # Implementation - "The evaluation code is available", - "The evaluation data or generation mechanism is accessible", - "The evaluation of models via API is supported", - "The evaluation of local models is supported", - "A globally unique identifier is added or evaluation instances are encrypted", - "A task to identify if model is included trained on benchmark data", - "A script to replicate results is explicitly included", - "Statistical significance or uncertainty quantification of benchmark results is reported", - "Need for warnings for sensitive/harmful content is assessed", - "A build status (or equivalent) is implemented", - "Release requirements are specified", - # Documentation - "Requirements file or equivalent is available", - "Quick-start guide or demo is available", - "In-line code comments are used", - "Code documentation is available", - "Accompanying paper is accepted at peer-reviewed venue", - "Benchmark construction process is documented", - "Test tasks & rationale are documented", - "Assumptions of normative properties are documented", - "Limitations are documented", - "Data collection, test environment design, or prompt design 
process is documented", - "Evaluation metric is documented", - "Applicable license is specified", - # Maintenance - "Code usability was checked within the last year", - "Maintained feedback channel for users is available", - "Contact person is listed" -] - - - -def betterbench(checklist_path="/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/testRuns/111/betterbench.json") -> dict: +def better_session(bench_path="/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/testRuns/111/") -> dict: +# def betterbench(checklist_path) -> dict: """ The checklist below is based on the benchmark quality assessment proposed in BetterBench. It is supposed to help authors identify if they adhere to best practices in their benchmark development. If you want to have your benchmark added to the BetterBench Repository, please also fill out the justifications. These should be about one sentence long each, and include the page numbers of your paper or your webpage where the information can be found. You can also copy-paste quotes from any of your publicly available materials here as evidence. In this case, please also add a link to the source. Reuel et. al. - :param checklist_path: _description_, defaults to "/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/testRuns/1111/betterbench.json" - :type checklist_path: str, optional - :return: _description_ - :rtype: dict + To understand methodology and justification of questions please view [BetterBench Methodology](https://betterbench.stanford.edu/methodology.html) + + ---- + checklist_path: Path to Benchmark's betterbench checklist file + """ - checklist={} + # Intro + click.echo("Entering interactive session for BetterBench!") + click.echo("This interactive session is meant guide the benchmark to follow the standards developed by reuel et. al. named the BetterBench Checklist!") + click.echo("This interactive session is optional and you can always come back to it with the `benchtool betterbench resume ` command") + + # Load existing BetterBench checklist if applicable + bench_checklist={} + checklist_path = os.path.join(bench_path, "betterbench.yml") if os.path.exists(checklist_path): - with open(checklist_path) as f: - checklist = json.load(f) + with open(checklist_path, 'r') as f: + bench_checklist = yaml.safe_load(f) - if not checklist: - checklist = {} - for question in Checklist: - item = ChecklistQuestion( + if not bench_checklist: + # Create checklist items and add them to new checklist + bench_checklist={} + for question in main_checklist.keys(): + # print(question) # Debugging + item = ChecklistItem( skipped=True, response="", justification="", score=0, - criteria=[] ) - checklist[question] = item - - - click.echo("Entering interactive session for BetterBench!") - click.echo("This interactive session is meant to help you think about your benchmark in through the standards develope by reuel et. al. that are the BetterBench Checklist!") - click.echo("This interactive session is optional and you can always come back to it with the `betterbench resume` command") + bench_checklist[question] = yaml.dump(item) + + # Save empty checklist into the benchmark repo + if os.path.exists(bench_path): + with open(checklist_path, 'w') as f: + yaml.dump(bench_checklist, f) + # TODO: check if want to change answer on any questions # Loop until user opts out - for question, vals in checklist.items(): - # print(question) - # print(vals) - choice = click.prompt(f"{question}?\nEnter to skip. 
q to end this session...", type=click.Choice(["yes", "no", "n/a", 'q', ''], case_sensitive=False), show_choices=True, default='') + for question, criteria in main_checklist.items(): + # TODO: add if(bench_checklist[skipped]) + # print(question) # DEbugging + # # print(vals) + if len(criteria) == 4: + choice = click.prompt(f"{question}?\nEnter to skip. q to end this session...", type=click.Choice(["yes", "no", 'q', ''], case_sensitive=False), show_choices=True, default='') + else: + choice = click.prompt(f"{question}?\nEnter to skip. q to end this session...", type=click.Choice(["yes", "no", "n/a", 'q', ''], case_sensitive=False), show_choices=True, default='') + # TODO: check for n/a # Check for user opt out - if choice == 'q': - break - elif choice == '': - continue - else: - justification = click.prompt("Justification? ") + match choice: + case 'q': + break + case 'no': + item = ChecklistItem( + skipped=False, + response=choice, + justification=criteria[0], + score=0, + ) + bench_checklist[question] = yaml.dump(item) + print(bench_checklist[question]) + case 'yes': + score = click.prompt(f"Please pick score level:\n0- {criteria[0]}\n5- {criteria[1]}\n10- {criteria[2]}\n15- {criteria[3]}\n", type=click.Choice([0, 5, 10, 15]), show_choices=True, default=5) + justification = click.prompt("Justification? ") + item = ChecklistItem( + skipped=False, + response=choice, + justification=justification, + score=score, + ) + bench_checklist[question] = yaml.dump(item) + print(bench_checklist[question]) + case '': + continue + - score = calculate_score(choice, justification) - checklist[question]['response'] = choice - checklist[question]['justification'] = justification - checklist[question]['score'] = score + # score = calculate_score(choice, justification) + # checklist[question]['response'] = choice + # checklist[question]['justification'] = justification + # checklist[question]['score'] = score - - json.dump(checklist, open(checklist_path, "w"), indent=4, cls=EnhancedJSONEncoder) - - exit(0) + print(checklist_path) #debugging + # Save current checklist into the benchmark repo + if os.path.exists(checklist_path): + with open(checklist_path, 'w') as f: + yaml.dump(bench_checklist, f) def get_score() -> int: diff --git a/benchtools/cli.py b/benchtools/cli.py index 55f6c7b..9fbf837 100644 --- a/benchtools/cli.py +++ b/benchtools/cli.py @@ -3,7 +3,7 @@ from benchtools.task import Task from benchtools.benchmark import Bench from benchtools.runner import BenchRunner -# from benchtools.betterbench import betterbench, get_score +from benchtools.betterbench import better_session, get_score # from task import PromptTask @click.group() @@ -70,7 +70,7 @@ def init(benchmark_name, path, about, no_git): click.echo(f"Created {benchmark_name} benchmark successfully!") # TODO: Call betterbench CLI here - # betterbench() + betterbench(bench_path) # Run? if benchmark.tasks: @@ -154,21 +154,37 @@ def run(benchmark_path: str): benchmark.run() - -@benchtool.command() -@click.argument('benchmark-path', required = True, type=str) -def score(benchmark_path: str): +@click.group() +def betterbench(): """ - Running the tasks and generating logs + Launch the BenchBench interactive tool + """ + pass + +@betterbench.command() +@click.argument('bench-path', required = True, type=str) +def resume(bench_path: str): + """ + Running the betterbench interactive session + """ + # benchmark = Bench.load(bench_path) # IS this needed? Maybe just check if written? 
+ better_session(bench_path) - , help="The path to the benchmark repository where all the task reside." - , help="The name of the specific task you would like to run" + + +@betterbench.command() +@click.argument('bench-path', required = True, type=str) +def score(bench_path: str): """ - benchmark = Bench.load(benchmark_path) + Running the betterbench scoring function + """ + # benchmark = Bench.load(bench_path) # IS this needed? Maybe just check if written? click.echo(f"Scoring {benchmark.bench_name} now...") score = get_score() click.echo(f"Score: {score}") +benchtool.add_command(betterbench) + # For debugging if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index 111da9a..b728209 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ pyyaml pandas click +click-prompt ollama openai datasets From 05d56ba988dcb0cb1c7160f082c88097908f6031 Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sun, 15 Feb 2026 18:12:00 -0500 Subject: [PATCH 76/78] fix run --- benchtools/betterbench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchtools/betterbench.py b/benchtools/betterbench.py index c40d858..594ea71 100644 --- a/benchtools/betterbench.py +++ b/benchtools/betterbench.py @@ -4,7 +4,7 @@ import click # import dataclasses from dataclasses import dataclass -from click_prompt import choice_option +# from click_prompt import choice_option # class EnhancedJSONEncoder(json.JSONEncoder): From d1ca4b09d0696099a68ca580cb3206e435505c5d Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sun, 15 Feb 2026 18:12:13 -0500 Subject: [PATCH 77/78] fix syntax --- project.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/project.toml b/project.toml index bae4c7a..2fad435 100644 --- a/project.toml +++ b/project.toml @@ -13,17 +13,17 @@ dependencies = [ ] requires-python = ">=3.10" authors = [ - {name = "Ayman Sandouk, email = "ayman_sandouk@uri.edu"}, - {name = "Sarah M Brown, email = "brownsarahm@uri.edu"}, + {name = "Ayman Sandouk", email = "ayman_sandouk@uri.edu"}, + {name = "Sarah M Brown", email = "brownsarahm@uri.edu"}, ] maintainers = [ - {name = "Ayman Sandouk, email = "ayman_sandouk@uri.edu"}, + {name = "Ayman Sandouk", email = "ayman_sandouk@uri.edu"}, {name = "Sarah M Brown", email = "brownsarahm@uri.edu"} ] description = "" readme = "README.md" license = "MIT" # or license = {file = "LICENSE.txt"} for legacy declaration -license-files = ["LICEN[CS]E.*"] + keywords = ["benchmark", "machine-learning", "ai", "llm"] From d67f01a3499105c74e8f2d8779a2497c7235c7de Mon Sep 17 00:00:00 2001 From: Sarah M Brown Date: Sun, 15 Feb 2026 18:21:13 -0500 Subject: [PATCH 78/78] typo and rm path --- benchtools/betterbench.py | 2 +- docs/source/cli.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchtools/betterbench.py b/benchtools/betterbench.py index 594ea71..f21ecac 100644 --- a/benchtools/betterbench.py +++ b/benchtools/betterbench.py @@ -43,7 +43,7 @@ class ChecklistItem: main_checklist = yaml.safe_load(f) -def better_session(bench_path="/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/testRuns/111/") -> dict: +def better_session(bench_path) -> dict: # def betterbench(checklist_path) -> dict: """ The checklist below is based on the benchmark quality assessment proposed in BetterBench. It is supposed to help authors identify if they adhere to best practices in their benchmark development. If you want to have your benchmark added to the BetterBench Repository, please also fill out the justifications. 
These should be about one sentence long each, and include the page numbers of your paper or your webpage where the information can be found. You can also copy-paste quotes from any of your publicly available materials here as evidence. In this case, please also add a link to the source. diff --git a/docs/source/cli.md b/docs/source/cli.md index 230cc73..8a72bd1 100644 --- a/docs/source/cli.md +++ b/docs/source/cli.md @@ -16,7 +16,7 @@ benchtool add-task ../new_test/ FillIn ../datasets/miscops/ -``` +```bash benchtool run demos/folderbench ```
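As a usage sketch in the style of `docs/source/cli.md`, the `betterbench` command group added in this series could be driven roughly as follows; the benchmark path is illustrative, and the exact output of `score` depends on `get_score()`, which is not shown in full here.

```bash
# resume (or start) the interactive BetterBench checklist session for a benchmark repo
benchtool betterbench resume demos/folderbench

# report the BetterBench score for the same benchmark
benchtool betterbench score demos/folderbench
```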