mattpocock · justsml · Jan 9, 2026
diff --git a/apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx b/apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx
@@ -81,3 +81,43 @@ Now answer the question.`,
   scorers: [Levenshtein],
 });
 ```
+
+## Controlling Variant Concurrency
+
+When comparing many variants or models with rate limits, you can use `parallelLimit` to control how many variants execute concurrently:
+
+```ts
+evalite.each(
+  [
+    { name: "GPT-4o", input: { model: "gpt-4o" } },
+    { name: "GPT-4o mini", input: { model: "gpt-4o-mini" } },
+    { name: "Claude Sonnet 3.5", input: { model: "claude-3-5-sonnet" } },
+    { name: "Claude Haiku", input: { model: "claude-3-5-haiku" } },
+  ],
+  { parallelLimit: 2 } // Only 2 variants run at once
+)("Compare models", {
+  data: async () => [
+    { input: "What's the capital of France?", expected: "Paris" },
+    // ...
+  ],
+  task: async (input, variant) => {
+    return generateText({
+      model: openai(variant.model),
+      prompt: input,
+    });
+  },
+  scorers: [Factuality, Levenshtein],
+});
+```
+
+### Use Cases
+
+- **API rate limiting**: Stay within provider concurrent request limits
+- **Cost control**: Run expensive models sequentially or in small batches
+- **Resource management**: Control memory/CPU usage for large evaluations
+
+<Aside type="note">
+  Each variant still respects the global `maxConcurrency` setting for its
+  individual test cases. The `parallelLimit` only controls how many variants run
+  concurrently.
+</Aside>
diff --git a/packages/evalite-tests/tests/fixtures/parallel-limit-none/none.eval.ts b/packages/evalite-tests/tests/fixtures/parallel-limit-none/none.eval.ts
@@ -0,0 +1,23 @@
+import { evalite } from "evalite";
+import { Levenshtein } from "autoevals";
+import { setTimeout } from "node:timers/promises";
+
+evalite.each([
+  { name: "A", input: { id: 1 } },
+  { name: "B", input: { id: 2 } },
+  { name: "C", input: { id: 3 } },
+])("No Limit Test", {
+  data: () => {
+    return [
+      {
+        input: "test",
+        expected: "test-value",
+      },
+    ];
+  },
+  task: async (input, variant) => {
+    await setTimeout(20);
+    return `${input}-value-${variant.id}`;
+  },
+  scorers: [Levenshtein],
+});
diff --git a/packages/evalite-tests/tests/fixtures/parallel-limit-sequential/sequential.eval.ts b/packages/evalite-tests/tests/fixtures/parallel-limit-sequential/sequential.eval.ts
@@ -0,0 +1,26 @@
+import { evalite } from "evalite";
+import { Levenshtein } from "autoevals";
+import { setTimeout } from "node:timers/promises";
+
+evalite.each(
+  [
+    { name: "V1", input: { id: 1 } },
+    { name: "V2", input: { id: 2 } },
+    { name: "V3", input: { id: 3 } },
+  ],
+  { parallelLimit: 1 }
+)("Sequential Test", {
+  data: () => {
+    return [
+      {
+        input: "test",
+        expected: "test-result",
+      },
+    ];
+  },
+  task: async (input, variant) => {
+    await setTimeout(30);
+    return `${input}-result-${variant.id}`;
+  },
+  scorers: [Levenshtein],
+});
diff --git a/packages/evalite-tests/tests/fixtures/parallel-limit/parallel-limit.eval.ts b/packages/evalite-tests/tests/fixtures/parallel-limit/parallel-limit.eval.ts
@@ -0,0 +1,28 @@
+import { evalite } from "evalite";
+import { Levenshtein } from "autoevals";
+import { setTimeout } from "node:timers/promises";
+
+evalite.each(
+  [
+    { name: "Variant 1", input: { id: 1 } },
+    { name: "Variant 2", input: { id: 2 } },
+    { name: "Variant 3", input: { id: 3 } },
+    { name: "Variant 4", input: { id: 4 } },
+  ],
+  { parallelLimit: 2 }
+)("Parallel Limit Test", {
+  data: () => {
+    return [
+      {
+        input: "test",
+        expected: "test-output",
+      },
+    ];
+  },
+  task: async (input, variant) => {
+    // Simulate work
+    await setTimeout(50);
+    return `${input}-output-${variant.id}`;
+  },
+  scorers: [Levenshtein],
+});
diff --git a/packages/evalite-tests/tests/parallel-limit.test.ts b/packages/evalite-tests/tests/parallel-limit.test.ts
@@ -0,0 +1,78 @@
+import { expect, it } from "vitest";
+import { getEvalsAsRecordViaStorage, loadFixture } from "./test-utils.js";
+
+it("Should enforce parallelLimit for variants", async () => {
+  await using fixture = await loadFixture("parallel-limit");
+
+  await fixture.run({ mode: "run-once-and-exit" });
+
+  const evals = await getEvalsAsRecordViaStorage(fixture.storage);
+
+  // All 4 variants should complete successfully
+  expect(evals["Parallel Limit Test [Variant 1]"]).toBeDefined();
+  expect(evals["Parallel Limit Test [Variant 2]"]).toBeDefined();
+  expect(evals["Parallel Limit Test [Variant 3]"]).toBeDefined();
+  expect(evals["Parallel Limit Test [Variant 4]"]).toBeDefined();
+
+  // Verify outputs are correct
+  expect(
+    evals["Parallel Limit Test [Variant 1]"]?.[0]?.results[0]?.output
+  ).toBe("test-output-1");
+  expect(
+    evals["Parallel Limit Test [Variant 2]"]?.[0]?.results[0]?.output
+  ).toBe("test-output-2");
+  expect(
+    evals["Parallel Limit Test [Variant 3]"]?.[0]?.results[0]?.output
+  ).toBe("test-output-3");
+  expect(
+    evals["Parallel Limit Test [Variant 4]"]?.[0]?.results[0]?.output
+  ).toBe("test-output-4");
+});
+
+it("Should run sequentially with parallelLimit: 1", async () => {
+  await using fixture = await loadFixture("parallel-limit-sequential");
+
+  await fixture.run({ mode: "run-once-and-exit" });
+
+  const evals = await getEvalsAsRecordViaStorage(fixture.storage);
+
+  // All 3 variants should complete successfully
+  expect(evals["Sequential Test [V1]"]).toBeDefined();
+  expect(evals["Sequential Test [V2]"]).toBeDefined();
+  expect(evals["Sequential Test [V3]"]).toBeDefined();
+
+  // Verify outputs
+  expect(evals["Sequential Test [V1]"]?.[0]?.results[0]?.output).toBe(
+    "test-result-1"
+  );
+  expect(evals["Sequential Test [V2]"]?.[0]?.results[0]?.output).toBe(
+    "test-result-2"
+  );
+  expect(evals["Sequential Test [V3]"]?.[0]?.results[0]?.output).toBe(
+    "test-result-3"
+  );
+});
+
+it("Should run all variants concurrently when no parallelLimit is specified", async () => {
+  await using fixture = await loadFixture("parallel-limit-none");
+
+  await fixture.run({ mode: "run-once-and-exit" });
+
+  const evals = await getEvalsAsRecordViaStorage(fixture.storage);
+
+  // All 3 variants should complete successfully
+  expect(evals["No Limit Test [A]"]).toBeDefined();
+  expect(evals["No Limit Test [B]"]).toBeDefined();
+  expect(evals["No Limit Test [C]"]).toBeDefined();
+
+  // Verify outputs
+  expect(evals["No Limit Test [A]"]?.[0]?.results[0]?.output).toBe(
+    "test-value-1"
+  );
+  expect(evals["No Limit Test [B]"]?.[0]?.results[0]?.output).toBe(
+    "test-value-2"
+  );
+  expect(evals["No Limit Test [C]"]?.[0]?.results[0]?.output).toBe(
+    "test-value-3"
+  );
+});
diff --git a/packages/evalite/src/evalite.ts b/packages/evalite/src/evalite.ts
@@ -1,13 +1,14 @@
 import { mkdir, writeFile } from "fs/promises";
 import path from "path";
-import { describe, inject, it } from "vitest";
+import { afterAll, beforeAll, describe, inject, it } from "vitest";
 import { reportTraceLocalStorage } from "./traces.js";
 import { writeFileQueueLocalStorage } from "./write-file-queue-local-storage.js";
 import { createEvaliteFileIfNeeded } from "./utils.js";
 import type { Evalite } from "./types.js";
 import { FILES_LOCATION } from "./backend-only-constants.js";
 import { createScorer } from "./index.js";
 import { serializeAnnotation } from "./reporter/events.js";
+import { Semaphore } from "./semaphore.js";
 
 const joinArrayOfUnknownResults = (results: unknown[]): unknown => {
   return results.reduce((acc, result) => {
@@ -121,6 +122,9 @@ const runTask = async <TInput, TOutput, TExpected, TVariant = undefined>(
   };
 };
 
+// Registry for variant group semaphores to control concurrency
+const variantGroupSemaphores = new Map<string, Semaphore>();
+
 export const evalite = <TInput, TOutput, TExpected = TOutput>(
   evalName: string,
   opts: Evalite.RunnerOpts<TInput, TOutput, TExpected>
@@ -137,20 +141,35 @@ evalite.skip = <TInput, TOutput, TExpected>(
 evalite.experimental_skip = evalite.skip;
 
 evalite.each = <TVariant>(
-  variants: Array<{ name: string; input: TVariant }>
+  variants: Array<{ name: string; input: TVariant }>,
+  opts?: Evalite.EachOpts
 ) => {
   return <TInput, TOutput, TExpected = TOutput>(
     evalName: string,
-    opts: Evalite.RunnerOpts<TInput, TOutput, TExpected, TVariant>
+    evalOpts: Evalite.RunnerOpts<TInput, TOutput, TExpected, TVariant>
   ) => {
+    // Create or get semaphore for this variant group if parallelLimit is specified
+    let semaphore: Semaphore | undefined;
+    if (opts?.parallelLimit) {
+      const groupKey = evalName;
+      if (!variantGroupSemaphores.has(groupKey)) {
+        variantGroupSemaphores.set(groupKey, new Semaphore(opts.parallelLimit));
+      }
+      semaphore = variantGroupSemaphores.get(groupKey);
+    }
+
     for (const variant of variants) {
       registerEvalite(
         evalName,
         {
-          ...opts,
-          task: (input) => opts.task(input, variant.input),
+          ...evalOpts,
+          task: (input) => evalOpts.task(input, variant.input),
         },
-        { variantName: variant.name, variantGroup: evalName }
+        {
+          variantName: variant.name,
+          variantGroup: evalName,
+          semaphore: semaphore,
+        }
       );
     }
   };
@@ -189,6 +208,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
     modifier?: "only" | "skip";
     variantName?: string;
     variantGroup?: string;
+    semaphore?: Semaphore;
   } = {}
 ) {
   const describeFn = vitestOpts.modifier === "skip" ? describe.skip : describe;
@@ -206,6 +226,17 @@ function registerEvalite<TInput, TOutput, TExpected>(
     : evalName;
 
   return describeFn(fullEvalName, async () => {
+    // Acquire semaphore permit before running this variant
+    if (vitestOpts.semaphore) {
+      beforeAll(async () => {
+        await vitestOpts.semaphore!.acquire();
+      });
+
+      afterAll(() => {
+        vitestOpts.semaphore!.release();
+      });
+    }
+
     const datasetResult = await datasetPromise;
 
     if (!datasetResult.success) {

diff --git a/packages/evalite/src/semaphore.ts b/packages/evalite/src/semaphore.ts
@@ -0,0 +1,59 @@
+/**
+ * A simple semaphore implementation for controlling concurrent execution.
+ * Used to limit the number of variants that can run simultaneously.
+ */
+export class Semaphore {
+  private permits: number;
+  private queue: Array<() => void> = [];
+
+  constructor(permits: number) {
+    if (permits <= 0) {
+      throw new Error("Semaphore permits must be positive");
+    }
+    this.permits = permits;
+  }
+
+  /**
+   * Acquire a permit. If no permits are available, the promise will resolve
+   * when a permit becomes available.
+   */
+  async acquire(): Promise<void> {
+    if (this.permits > 0) {
+      this.permits--;
+      return Promise.resolve();
+    }
+
+    return new Promise<void>((resolve) => {
+      this.queue.push(() => {
+        this.permits--;
+        resolve();
+      });
+    });
+  }
+
+  /**
+   * Release a permit, allowing the next waiting task to proceed.
+   */
+  release(): void {
+    const next = this.queue.shift();
+    if (next) {
+      next();
+    } else {
+      this.permits++;
+    }
+  }
+
+  /**
+   * Get the number of available permits (for testing).
+   */
+  available(): number {
+    return this.permits;
+  }
+
+  /**
+   * Get the number of tasks waiting for permits (for testing).
+   */
+  queueLength(): number {
+    return this.queue.length;
+  }
+}
diff --git a/packages/evalite/src/types.ts b/packages/evalite/src/types.ts
@@ -207,6 +207,18 @@ export declare namespace Evalite {
     DataShape<TInput, TExpected>[]
   >;
 
+  export type EachOpts = {
+    /**
+     * Maximum number of variants to run concurrently.
+     * If not specified, all variants run concurrently (respecting global maxConcurrency).
+     * @example
+     * ```ts
+     * evalite.each(variants, { parallelLimit: 2 })("Compare", { ... })
+     * ```
+     */
+    parallelLimit?: number;
+  };
+
   export type RunnerOpts<TInput, TOutput, TExpected, TVariant = undefined> = {
     data:
       | DataShape<TInput, TExpected>[]