Skip to content

Commit 2a7f385

Browse files
Eval run method improvements (#19)
* Improved validations
* Fix type hint
* Change `run_local` to `run`, `function` to `callable`, and allow for runtime on HL evals
* Fix error messages
* More logging tweaks

---------

Co-authored-by: fern-api <115122769+fern-api[bot]@users.noreply.github.com>
1 parent 881371e commit 2a7f385

File tree

6 files changed

+62
-47
lines changed

6 files changed

+62
-47
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ client.prompts.log(
4141
messages=[{"role": "user", "content": "What really happened at Roswell?"}],
4242
inputs={"person": "Trump"},
4343
created_at=datetime.datetime.fromisoformat(
44-
"2024-07-19 00:29:35.178000+00:00",
44+
"2024-07-18 23:29:35.178000+00:00",
4545
),
4646
provider_latency=6.5931549072265625,
4747
output_message={
@@ -88,7 +88,7 @@ async def main() -> None:
8888
],
8989
inputs={"person": "Trump"},
9090
created_at=datetime.datetime.fromisoformat(
91-
"2024-07-19 00:29:35.178000+00:00",
91+
"2024-07-18 23:29:35.178000+00:00",
9292
),
9393
provider_latency=6.5931549072265625,
9494
output_message={

reference.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ client.prompts.log(
5656
messages=[{"role": "user", "content": "What really happened at Roswell?"}],
5757
inputs={"person": "Trump"},
5858
created_at=datetime.datetime.fromisoformat(
59-
"2024-07-19 00:29:35.178000+00:00",
59+
"2024-07-18 23:29:35.178000+00:00",
6060
),
6161
provider_latency=6.5931549072265625,
6262
output_message={
@@ -6258,10 +6258,10 @@ client.flows.log(
62586258
output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
62596259
trace_status="incomplete",
62606260
start_time=datetime.datetime.fromisoformat(
6261-
"2024-07-08 22:40:35+00:00",
6261+
"2024-07-08 21:40:35+00:00",
62626262
),
62636263
end_time=datetime.datetime.fromisoformat(
6264-
"2024-07-08 22:40:39+00:00",
6264+
"2024-07-08 21:40:39+00:00",
62656265
),
62666266
)
62676267

src/humanloop/client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(
3737
follow_redirects=follow_redirects,
3838
httpx_client=httpx_client,
3939
)
40-
self.evaluations.run_local = partial(_run_eval, client=self) # type: ignore[attr-defined]
40+
self.evaluations.run = partial(_run_eval, client=self) # type: ignore[attr-defined]
4141

4242

4343
class AsyncHumanloop(AsyncBaseHumanloop):

src/humanloop/eval_utils.py

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from .requests import HumanEvaluatorRequestParams as HumanEvaluatorDict
3535

3636

37-
# Responses are Pydantic models
37+
# Responses are Pydantic models and we leverage them for improved request validation
3838
from .types import FlowKernelRequest as Flow
3939
from .types import PromptKernelRequest as Prompt
4040
from .types import ToolKernelRequest as Tool
@@ -65,7 +65,7 @@
6565
FileType = Literal["flow", "prompt", "tool", "evaluator"]
6666

6767

68-
# ANSI escape codes for colors
68+
# ANSI escape codes for logging colors
6969
YELLOW = "\033[93m"
7070
CYAN = "\033[96m"
7171
GREEN = "\033[92m"
@@ -81,23 +81,23 @@ class Identifiers(TypedDict):
8181
"""The path of the File on Humanloop."""
8282

8383

84-
class File(Identifiers, total=False):
84+
class File(Identifiers):
8585
"""A File on Humanloop (Flow, Prompt, Tool, Evaluator)."""
8686
type: NotRequired[FileType]
8787
"""The type of File this function relates to on Humanloop."""
8888
version: NotRequired[Version]
89-
"""The contents uniquely define the version of the File on Humanloop"""
90-
function: Callable
89+
"""The contents uniquely define the version of the File on Humanloop."""
90+
callable: Callable
9191
"""The function being evaluated.
92-
It will be called using your Dataset `inputs` as follows: `output = function(**datapoint.inputs)`.
93-
If `messages` are defined in your Dataset, then `output = function(**datapoint.inputs, messages=datapoint.messages)`.
92+
It will be called using your Dataset `inputs` as follows: `output = callable(**datapoint.inputs)`.
93+
If `messages` are defined in your Dataset, then `output = callable(**datapoint.inputs, messages=datapoint.messages)`.
9494
It should return a single string output. If not, you must provide a `custom_logger`.
9595
"""
9696
custom_logger: NotRequired[Callable]
9797
"""function that logs the output of your function to Humanloop, replacing the default logging.
9898
If provided, it will be called as follows:
9999
```
100-
output = function(**datapoint.inputs).
100+
output = callable(**datapoint.inputs).
101101
log = custom_logger(client, output)
102102
```
103103
Inside the custom_logger, you can use the Humanloop `client` to log the output of your function.
@@ -119,12 +119,12 @@ class Evaluator(Identifiers):
119119
"""The type of arguments the Evaluator expects - only required for local Evaluators."""
120120
return_type: NotRequired[EvaluatorReturnTypeEnum]
121121
"""The type of return value the Evaluator produces - only required for local Evaluators."""
122-
function: NotRequired[Callable]
122+
callable: NotRequired[Callable]
123123
"""The function to run on the logs to produce the judgment - only required for local Evaluators."""
124124
custom_logger: NotRequired[Callable]
125125
"""optional function that logs the output judgment from your Evaluator to Humanloop, if provided, it will be called as follows:
126126
```
127-
judgment = function(log_dict)
127+
judgment = callable(log_dict)
128128
log = custom_logger(client, judgment)
129129
```
130130
Inside the custom_logger, you can use the Humanloop `client` to log the judgment to Humanloop.
@@ -157,7 +157,7 @@ def _run_eval(
157157
dataset: Dataset,
158158
evaluators: Optional[Sequence[Evaluator]] = None,
159159
# logs: typing.Sequence[dict] | None = None,
160-
workers: int = 5,
160+
workers: int = 4,
161161
) -> List[EvaluatorCheck]:
162162
"""
163163
Evaluate your function for a given `Dataset` and set of `Evaluators`.
@@ -173,23 +173,30 @@ def _run_eval(
173173

174174
# Get or create the file on Humanloop
175175
version = file.pop("version", {})
176+
176177
# Raise error if one of path or id not provided
177178
if not file.get("path") and not file.get("id"):
178179
raise ValueError("You must provide a path or id in your `file`.")
179180

180-
try:
181-
function_ = file.pop("function")
182-
except KeyError as _:
183-
raise ValueError("You must provide a `function` for your `file` to run a local eval.")
184-
181+
# Determine the `type` of the `file` to Evaluate - if not `type` provided, default to `flow`
185182
try:
186183
type_ = file.pop("type")
187-
logger.info(f"{CYAN}Evaluating your {type_} function corresponding to `{file['path']}` on Humanloop{RESET} \n\n")
184+
logger.info(
185+
f"{CYAN}Evaluating your {type_} function corresponding to `{file['path']}` on Humanloop{RESET} \n\n")
188186
except KeyError as _:
189-
# Default to flows if not type specified
190187
type_ = "flow"
191188
logger.warning("No `file` type specified, defaulting to flow.")
192189

190+
# If a `callable` is provided, Logs will be generated locally, otherwise Logs will be generated on Humanloop.
191+
function_ = None
192+
try:
193+
function_ = file.pop("callable")
194+
except KeyError as _:
195+
if type_ == "flow":
196+
raise ValueError("You must provide a `callable` for your Flow `file` to run a local eval.")
197+
else:
198+
logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.")
199+
193200
custom_logger = file.pop("custom_logger", None)
194201
file_dict = {**file, **version}
195202

@@ -234,8 +241,11 @@ def _run_eval(
234241
if evaluators:
235242
for evaluator in evaluators:
236243
# If a callable is provided for an Evaluator, we treat it as External
237-
eval_function = evaluator.get("function")
244+
eval_function = evaluator.get("callable")
238245
if eval_function is not None:
246+
# TODO: support the case where `file` logs generated on Humanloop but Evaluator logs generated locally
247+
if function_ is None:
248+
raise ValueError(f"Local Evaluators are only supported when generating Logs locally using your {type_}'s `callable`. Please provide a `callable` for your file in order to run Evaluators locally.")
239249
local_evaluators.append(evaluator)
240250
spec = ExternalEvaluator(
241251
arguments_type=evaluator["args_type"],
@@ -306,7 +316,7 @@ def process_datapoint(datapoint: Datapoint):
306316
log = function_(client=client, output=output)
307317
else:
308318
if not isinstance(output, str):
309-
raise ValueError("Your File function must return a string if you do not provide a custom logger.")
319+
raise ValueError(f"Your {type_}'s `callable` must return a string if you do not provide a custom logger.")
310320
log = log_func(
311321
inputs=datapoint.inputs,
312322
output=output,
@@ -322,13 +332,13 @@ def process_datapoint(datapoint: Datapoint):
322332
start_time=start_time,
323333
end_time=datetime.now(),
324334
)
325-
logger.warning(msg=f"\nFile function failed for Datapoint: {datapoint.id}. \n Error: {str(e)}")
335+
logger.warning(msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}")
326336

327337
# Apply local Evaluators
328338
for local_evaluator in local_evaluators:
329339
try:
330340
start_time = datetime.now()
331-
eval_function = local_evaluator["function"]
341+
eval_function = local_evaluator["callable"]
332342
if local_evaluator["args_type"] == "target_required":
333343
judgment = eval_function(log.dict(), datapoint.target)
334344
else:
@@ -359,20 +369,25 @@ def process_datapoint(datapoint: Datapoint):
359369

360370
# Execute the function and send the logs to Humanloop in parallel
361371
total_datapoints = len(hl_dataset.datapoints)
362-
logger.info(f"\n{CYAN}Navigate to your Evals:{RESET} {evaluation.url}")
363-
logger.info(f"{CYAN}Version Id: {hl_file.version_id}{RESET}")
372+
logger.info(f"\n{CYAN}Navigate to your evals:{RESET}\n{evaluation.url}")
373+
logger.info(f"{CYAN}{type_} version Id: {hl_file.version_id}{RESET}")
364374
logger.info(f"{CYAN}Run Id: {batch_id}{RESET}")
365-
logger.info(f"{CYAN}\nRunning function for File {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
366-
367-
completed_tasks = 0
368-
with ThreadPoolExecutor(max_workers=workers) as executor:
369-
futures = [
370-
executor.submit(process_datapoint, datapoint)
371-
for datapoint in hl_dataset.datapoints
372-
]
373-
for _ in as_completed(futures):
374-
completed_tasks += 1
375-
_progress_bar(total_datapoints, completed_tasks)
375+
376+
# Generate locally if a file `callable` is provided
377+
if function_:
378+
logger.info(f"{CYAN}\nRunning {hl_file.name} {type_} callable over {hl_dataset.name}{RESET} Dataset using {workers} workers")
379+
completed_tasks = 0
380+
with ThreadPoolExecutor(max_workers=workers) as executor:
381+
futures = [
382+
executor.submit(process_datapoint, datapoint)
383+
for datapoint in hl_dataset.datapoints
384+
]
385+
for _ in as_completed(futures):
386+
completed_tasks += 1
387+
_progress_bar(total_datapoints, completed_tasks)
388+
else:
389+
# TODO: trigger run when updated API is available
390+
logger.info(f"{CYAN}\nRunning {type_} {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
376391

377392
# Wait for the Evaluation to complete then print the results
378393
complete = False

src/humanloop/flows/client.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,10 +197,10 @@ def log(
197197
output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
198198
trace_status="incomplete",
199199
start_time=datetime.datetime.fromisoformat(
200-
"2024-07-08 22:40:35+00:00",
200+
"2024-07-08 21:40:35+00:00",
201201
),
202202
end_time=datetime.datetime.fromisoformat(
203-
"2024-07-08 22:40:39+00:00",
203+
"2024-07-08 21:40:39+00:00",
204204
),
205205
)
206206
"""
@@ -1366,10 +1366,10 @@ async def main() -> None:
13661366
output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
13671367
trace_status="incomplete",
13681368
start_time=datetime.datetime.fromisoformat(
1369-
"2024-07-08 22:40:35+00:00",
1369+
"2024-07-08 21:40:35+00:00",
13701370
),
13711371
end_time=datetime.datetime.fromisoformat(
1372-
"2024-07-08 22:40:39+00:00",
1372+
"2024-07-08 21:40:39+00:00",
13731373
),
13741374
)
13751375

src/humanloop/prompts/client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def log(
236236
messages=[{"role": "user", "content": "What really happened at Roswell?"}],
237237
inputs={"person": "Trump"},
238238
created_at=datetime.datetime.fromisoformat(
239-
"2024-07-19 00:29:35.178000+00:00",
239+
"2024-07-18 23:29:35.178000+00:00",
240240
),
241241
provider_latency=6.5931549072265625,
242242
output_message={
@@ -2117,7 +2117,7 @@ async def main() -> None:
21172117
],
21182118
inputs={"person": "Trump"},
21192119
created_at=datetime.datetime.fromisoformat(
2120-
"2024-07-19 00:29:35.178000+00:00",
2120+
"2024-07-18 23:29:35.178000+00:00",
21212121
),
21222122
provider_latency=6.5931549072265625,
21232123
output_message={

0 commit comments

Comments (0)