Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,43 @@ Now answer the question.`,
scorers: [Levenshtein],
});
```

## Controlling Variant Concurrency

When comparing many variants or models with rate limits, you can use `parallelLimit` to control how many variants execute concurrently:

```ts
evalite.each(
[
{ name: "GPT-4o", input: { model: "gpt-4o" } },
{ name: "GPT-4o mini", input: { model: "gpt-4o-mini" } },
{ name: "Claude Sonnet 3.5", input: { model: "claude-3-5-sonnet" } },
{ name: "Claude Haiku", input: { model: "claude-3-5-haiku" } },
],
{ parallelLimit: 2 } // Only 2 variants run at once
)("Compare models", {
data: async () => [
{ input: "What's the capital of France?", expected: "Paris" },
// ...
],
task: async (input, variant) => {
return generateText({
model: openai(variant.model),
prompt: input,
});
},
scorers: [Factuality, Levenshtein],
});
```

### Use Cases

- **API rate limiting**: Stay within provider concurrent request limits
- **Cost control**: Run expensive models sequentially or in small batches
- **Resource management**: Control memory/CPU usage for large evaluations

<Aside type="note">
Each variant still respects the global `maxConcurrency` setting for its
individual test cases. The `parallelLimit` only controls how many variants run
concurrently.
</Aside>
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { evalite } from "evalite";
import { Levenshtein } from "autoevals";
import { setTimeout } from "node:timers/promises";

evalite.each([
{ name: "A", input: { id: 1 } },
{ name: "B", input: { id: 2 } },
{ name: "C", input: { id: 3 } },
])("No Limit Test", {
data: () => {
return [
{
input: "test",
expected: "test-value",
},
];
},
task: async (input, variant) => {
await setTimeout(20);
return `${input}-value-${variant.id}`;
},
scorers: [Levenshtein],
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { evalite } from "evalite";
import { Levenshtein } from "autoevals";
import { setTimeout } from "node:timers/promises";

evalite.each(
[
{ name: "V1", input: { id: 1 } },
{ name: "V2", input: { id: 2 } },
{ name: "V3", input: { id: 3 } },
],
{ parallelLimit: 1 }
)("Sequential Test", {
data: () => {
return [
{
input: "test",
expected: "test-result",
},
];
},
task: async (input, variant) => {
await setTimeout(30);
return `${input}-result-${variant.id}`;
},
scorers: [Levenshtein],
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { evalite } from "evalite";
import { Levenshtein } from "autoevals";
import { setTimeout } from "node:timers/promises";

evalite.each(
[
{ name: "Variant 1", input: { id: 1 } },
{ name: "Variant 2", input: { id: 2 } },
{ name: "Variant 3", input: { id: 3 } },
{ name: "Variant 4", input: { id: 4 } },
],
{ parallelLimit: 2 }
)("Parallel Limit Test", {
data: () => {
return [
{
input: "test",
expected: "test-output",
},
];
},
task: async (input, variant) => {
// Simulate work
await setTimeout(50);
return `${input}-output-${variant.id}`;
},
scorers: [Levenshtein],
});
78 changes: 78 additions & 0 deletions packages/evalite-tests/tests/parallel-limit.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { expect, it } from "vitest";
import { getEvalsAsRecordViaStorage, loadFixture } from "./test-utils.js";

it("Should enforce parallelLimit for variants", async () => {
await using fixture = await loadFixture("parallel-limit");

await fixture.run({ mode: "run-once-and-exit" });

const evals = await getEvalsAsRecordViaStorage(fixture.storage);

// All 4 variants should complete successfully
expect(evals["Parallel Limit Test [Variant 1]"]).toBeDefined();
expect(evals["Parallel Limit Test [Variant 2]"]).toBeDefined();
expect(evals["Parallel Limit Test [Variant 3]"]).toBeDefined();
expect(evals["Parallel Limit Test [Variant 4]"]).toBeDefined();

// Verify outputs are correct
expect(
evals["Parallel Limit Test [Variant 1]"]?.[0]?.results[0]?.output
).toBe("test-output-1");
expect(
evals["Parallel Limit Test [Variant 2]"]?.[0]?.results[0]?.output
).toBe("test-output-2");
expect(
evals["Parallel Limit Test [Variant 3]"]?.[0]?.results[0]?.output
).toBe("test-output-3");
expect(
evals["Parallel Limit Test [Variant 4]"]?.[0]?.results[0]?.output
).toBe("test-output-4");
});

it("Should run sequentially with parallelLimit: 1", async () => {
await using fixture = await loadFixture("parallel-limit-sequential");

await fixture.run({ mode: "run-once-and-exit" });

const evals = await getEvalsAsRecordViaStorage(fixture.storage);

// All 3 variants should complete successfully
expect(evals["Sequential Test [V1]"]).toBeDefined();
expect(evals["Sequential Test [V2]"]).toBeDefined();
expect(evals["Sequential Test [V3]"]).toBeDefined();

// Verify outputs
expect(evals["Sequential Test [V1]"]?.[0]?.results[0]?.output).toBe(
"test-result-1"
);
expect(evals["Sequential Test [V2]"]?.[0]?.results[0]?.output).toBe(
"test-result-2"
);
expect(evals["Sequential Test [V3]"]?.[0]?.results[0]?.output).toBe(
"test-result-3"
);
});

it("Should run all variants concurrently when no parallelLimit is specified", async () => {
await using fixture = await loadFixture("parallel-limit-none");

await fixture.run({ mode: "run-once-and-exit" });

const evals = await getEvalsAsRecordViaStorage(fixture.storage);

// All 3 variants should complete successfully
expect(evals["No Limit Test [A]"]).toBeDefined();
expect(evals["No Limit Test [B]"]).toBeDefined();
expect(evals["No Limit Test [C]"]).toBeDefined();

// Verify outputs
expect(evals["No Limit Test [A]"]?.[0]?.results[0]?.output).toBe(
"test-value-1"
);
expect(evals["No Limit Test [B]"]?.[0]?.results[0]?.output).toBe(
"test-value-2"
);
expect(evals["No Limit Test [C]"]?.[0]?.results[0]?.output).toBe(
"test-value-3"
);
});
43 changes: 37 additions & 6 deletions packages/evalite/src/evalite.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import { mkdir, writeFile } from "fs/promises";
import path from "path";
import { describe, inject, it } from "vitest";
import { afterAll, beforeAll, describe, inject, it } from "vitest";
import { reportTraceLocalStorage } from "./traces.js";
import { writeFileQueueLocalStorage } from "./write-file-queue-local-storage.js";
import { createEvaliteFileIfNeeded } from "./utils.js";
import type { Evalite } from "./types.js";
import { FILES_LOCATION } from "./backend-only-constants.js";
import { createScorer } from "./index.js";
import { serializeAnnotation } from "./reporter/events.js";
import { Semaphore } from "./semaphore.js";

const joinArrayOfUnknownResults = (results: unknown[]): unknown => {
return results.reduce((acc, result) => {
Expand Down Expand Up @@ -121,6 +122,9 @@ const runTask = async <TInput, TOutput, TExpected, TVariant = undefined>(
};
};

// Registry for variant group semaphores to control concurrency
const variantGroupSemaphores = new Map<string, Semaphore>();

export const evalite = <TInput, TOutput, TExpected = TOutput>(
evalName: string,
opts: Evalite.RunnerOpts<TInput, TOutput, TExpected>
Expand All @@ -137,20 +141,35 @@ evalite.skip = <TInput, TOutput, TExpected>(
evalite.experimental_skip = evalite.skip;

evalite.each = <TVariant>(
variants: Array<{ name: string; input: TVariant }>
variants: Array<{ name: string; input: TVariant }>,
opts?: Evalite.EachOpts
) => {
return <TInput, TOutput, TExpected = TOutput>(
evalName: string,
opts: Evalite.RunnerOpts<TInput, TOutput, TExpected, TVariant>
evalOpts: Evalite.RunnerOpts<TInput, TOutput, TExpected, TVariant>
) => {
// Create or get semaphore for this variant group if parallelLimit is specified
let semaphore: Semaphore | undefined;
if (opts?.parallelLimit) {
const groupKey = evalName;
if (!variantGroupSemaphores.has(groupKey)) {
variantGroupSemaphores.set(groupKey, new Semaphore(opts.parallelLimit));
}
semaphore = variantGroupSemaphores.get(groupKey);
}

for (const variant of variants) {
registerEvalite(
evalName,
{
...opts,
task: (input) => opts.task(input, variant.input),
...evalOpts,
task: (input) => evalOpts.task(input, variant.input),
},
{ variantName: variant.name, variantGroup: evalName }
{
variantName: variant.name,
variantGroup: evalName,
semaphore: semaphore,
}
);
}
};
Expand Down Expand Up @@ -189,6 +208,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
modifier?: "only" | "skip";
variantName?: string;
variantGroup?: string;
semaphore?: Semaphore;
} = {}
) {
const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
Expand All @@ -206,6 +226,17 @@ function registerEvalite<TInput, TOutput, TExpected>(
: evalName;

return describeFn(fullEvalName, async () => {
// Acquire semaphore permit before running this variant
if (vitestOpts.semaphore) {
beforeAll(async () => {
await vitestOpts.semaphore!.acquire();
});

afterAll(() => {
vitestOpts.semaphore!.release();
});
}

const datasetResult = await datasetPromise;

if (!datasetResult.success) {
Expand Down
59 changes: 59 additions & 0 deletions packages/evalite/src/semaphore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
* A simple semaphore implementation for controlling concurrent execution.
* Used to limit the number of variants that can run simultaneously.
*/
export class Semaphore {
private permits: number;
private queue: Array<() => void> = [];

constructor(permits: number) {
if (permits <= 0) {
throw new Error("Semaphore permits must be positive");
}
this.permits = permits;
}

/**
* Acquire a permit. If no permits are available, the promise will resolve
* when a permit becomes available.
*/
async acquire(): Promise<void> {
if (this.permits > 0) {
this.permits--;
return Promise.resolve();
}

return new Promise<void>((resolve) => {
this.queue.push(() => {
this.permits--;
resolve();
});
});
}

/**
* Release a permit, allowing the next waiting task to proceed.
*/
release(): void {
const next = this.queue.shift();
if (next) {
next();
} else {
this.permits++;
}
}

/**
* Get the number of available permits (for testing).
*/
available(): number {
return this.permits;
}

/**
* Get the number of tasks waiting for permits (for testing).
*/
queueLength(): number {
return this.queue.length;
}
}
12 changes: 12 additions & 0 deletions packages/evalite/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,18 @@ export declare namespace Evalite {
DataShape<TInput, TExpected>[]
>;

export type EachOpts = {
/**
* Maximum number of variants to run concurrently.
* If not specified, all variants run concurrently (respecting global maxConcurrency).
* @example
* ```ts
* evalite.each(variants, { parallelLimit: 2 })("Compare", { ... })
* ```
*/
parallelLimit?: number;
};

export type RunnerOpts<TInput, TOutput, TExpected, TVariant = undefined> = {
data:
| DataShape<TInput, TExpected>[]
Expand Down