Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions .github/workflows/interact-debug-e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
name: Interact Debug E2E Tests

on:
  push:
    branches:
      - master
  workflow_dispatch:
    inputs:
      branch:
        type: string
        description: Branch to run tests on
        default: master
      browser:
        type: choice
        description: Browser to run tests in
        options:
          - chromium
          - firefox
          - webkit
          - all
        default: chromium

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
  # Resolve the browser exactly once. On push events github.event.inputs does
  # not exist, so github.event.inputs.browser evaluates to '' — with the
  # previous per-step conditions (browser != '' / browser == 'all') NEITHER
  # test step matched on push and no tests ran at all, while the "all
  # browsers" install still fired. Falling back to 'chromium' here makes push
  # runs install and test chromium only, and keeps workflow_dispatch behavior
  # unchanged.
  BROWSER: ${{ github.event.inputs.browser || 'chromium' }}

jobs:
  interact-debug-e2e:
    # env context is not available in job-level `name`, so the fallback is
    # repeated inline here.
    name: Interact Debug E2E (${{ github.event.inputs.browser || 'chromium' }})
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}

      - name: Setup Node.js
        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
        with:
          node-version: 24
          registry-url: https://registry.npmjs.org

      - name: Cache Yarn dependencies
        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
        with:
          path: |
            .yarn/cache
            .yarn/install-state.gz
            node_modules
            **/node_modules
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
          restore-keys: |
            ${{ runner.os }}-yarn-

      - name: Enable Corepack
        run: |
          corepack enable
          corepack prepare yarn@4.10.3 --activate
          yarn set version 4.10.3

      - name: Install dependencies
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: NPQ_PKG_MGR=yarn npx npq install --immutable

      # Build order matters: interact-debug depends on interact, which
      # depends on motion.
      - name: Build motion package
        run: yarn workspace @wix/motion build

      - name: Build interact package
        run: yarn workspace @wix/interact build

      - name: Build interact-debug package
        run: yarn workspace @wix/interact-debug build

      - name: Cache Playwright browsers
        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
        id: playwright-cache
        with:
          path: ~/.cache/ms-playwright
          key: ${{ runner.os }}-playwright-${{ env.BROWSER }}-${{ hashFiles('packages/interact-debug/package.json') }}
          restore-keys: |
            ${{ runner.os }}-playwright-${{ env.BROWSER }}-
            ${{ runner.os }}-playwright-

      - name: Install Playwright browsers (all)
        if: steps.playwright-cache.outputs.cache-hit != 'true' && env.BROWSER == 'all'
        working-directory: packages/interact-debug
        run: npx playwright install --with-deps chromium firefox webkit

      - name: Install Playwright browsers (selected)
        if: steps.playwright-cache.outputs.cache-hit != 'true' && env.BROWSER != 'all'
        working-directory: packages/interact-debug
        run: npx playwright install --with-deps ${{ env.BROWSER }}

      # Browsers were restored from cache, but OS-level dependencies are not
      # cached — install them for the selected browser(s).
      - name: Install Playwright system deps
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        run: npx playwright install-deps ${{ env.BROWSER != 'all' && env.BROWSER || '' }}

      - name: Run E2E tests (selected browser)
        if: env.BROWSER != 'all'
        working-directory: packages/interact-debug
        run: npx playwright test --project=${{ env.BROWSER }}

      - name: Run E2E tests (all browsers)
        if: env.BROWSER == 'all'
        working-directory: packages/interact-debug
        run: npx playwright test

      - name: Upload Playwright report
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: interact-debug-playwright-report
          path: packages/interact-debug/playwright-report/
          retention-days: 14
140 changes: 140 additions & 0 deletions packages/interact-debug/bin/run-eval-compare.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env npx tsx
/**
* Runs the evaluation three times with different levels of rules context:
* 1. no-rules: Base prompt only (no @wix/interact documentation)
* 2. partial: Only the core overview rule (full-lean.md)
* 3. full-rules: All 7 rule files (the default)
*
* Writes a comparison JSON and prints a side-by-side table.
*/
import {
runEvaluation,
formatReport,
scenarios,
buildSystemPromptFromFiles,
buildSystemPrompt,
} from '../src/eval';
import type { EvalReport } from '../src/eval';
import { writeFile } from 'node:fs/promises';
import { resolve } from 'node:path';

// Baseline system prompt for the 'no-rules' variant: only the raw output
// contract (single-file HTML, no commentary), with no @wix/interact
// documentation attached. Used as the control in the rules-context comparison.
const BARE_PROMPT = `You are an expert web developer. Generate a complete, single-file HTML document that implements the requested animation scenario.

Requirements:
- Output ONLY the HTML document. No explanations, no markdown fences, no commentary.
- The HTML must be a complete document (<!DOCTYPE html>, <html>, <head>, <body>).
- All CSS goes inside a <style> tag in <head>.
- All JavaScript goes inside a <script type="module"> tag at the end of <body>.`;

// A named prompt configuration: one row/column in the comparison output.
type Variant = { name: string; systemPrompt: string };

/**
 * Runs the evaluation once per variant and prints a side-by-side comparison
 * table, then persists a per-variant JSON summary.
 *
 * Fix: the comparison table previously hard-coded the variant names
 * ('no-rules' / 'partial' / 'full-rules') in three separate places, duplicating
 * the `variants` array — adding, removing, or renaming a variant would silently
 * desynchronize the table. Header and rows are now derived from `variants`.
 */
async function main() {
  // The context variants under comparison. All output below is derived from
  // this array, so changing it is the only edit needed to add a variant.
  const variants: Variant[] = [
    { name: 'no-rules', systemPrompt: BARE_PROMPT },
    { name: 'partial', systemPrompt: await buildSystemPromptFromFiles(['full-lean.md']) },
    { name: 'full-rules', systemPrompt: await buildSystemPrompt() },
  ];

  const reports: Record<string, EvalReport> = {};
  const startTime = Date.now();

  for (const variant of variants) {
    console.log(`\n${'='.repeat(60)}`);
    console.log(
      ` Running variant: ${variant.name} (prompt size: ${(variant.systemPrompt.length / 1024).toFixed(1)}KB)`,
    );
    console.log(`${'='.repeat(60)}\n`);

    const report = await runEvaluation(scenarios, {
      systemPrompt: variant.systemPrompt,
      // Per-scenario progress line as results stream in.
      onResult: (result) => {
        const status = result.success
          ? `score=${result.scores?.aggregate.toFixed(2)}, valid=${result.validation?.valid}, errs=${result.validation?.errors.length}`
          : `FAILED: ${result.error?.slice(0, 80)}`;
        console.log(
          ` [${result.scenario.id}] ${status} (${(result.durationMs / 1000).toFixed(0)}s)`,
        );
      },
    });

    reports[variant.name] = report;
    console.log('\n' + formatReport(report));
  }

  // Print comparison table — one column per variant, in `variants` order.
  const LABEL_WIDTH = 25;
  const COL_WIDTH = 12;
  console.log('\n' + '='.repeat(80));
  console.log(' COMPARISON SUMMARY');
  console.log('='.repeat(80) + '\n');

  const header = [
    'Metric'.padEnd(LABEL_WIDTH),
    ...variants.map((v) => v.name.padEnd(COL_WIDTH)),
  ].join('');
  console.log(header);
  console.log('-'.repeat(header.length));

  const metrics: [string, (r: EvalReport) => string][] = [
    ['Generated', (r) => `${r.summary.generated}/${r.summary.total}`],
    ['Valid', (r) => `${r.summary.valid}/${r.summary.total}`],
    ['Avg aggregate', (r) => r.summary.averageAggregate.toFixed(3)],
    ['Total errors', (r) => String(r.summary.totalErrors)],
    ['Total warnings', (r) => String(r.summary.totalWarnings)],
  ];

  // Per-dimension averages across scenarios that produced scores.
  const dimensions = ['complexity', 'weight', 'a11y', 'coherence', 'bestPractices', 'validation'];
  for (const dim of dimensions) {
    metrics.push([
      `Avg ${dim}`,
      (r) => {
        const scores = r.results
          .filter((x) => x.scores)
          .map((x) => x.scores!.dimensions.find((d) => d.dimension === dim)?.score ?? 0);
        return scores.length > 0
          ? (scores.reduce((a, b) => a + b, 0) / scores.length).toFixed(3)
          : '-';
      },
    ]);
  }

  for (const [name, fn] of metrics) {
    console.log(
      [
        name.padEnd(LABEL_WIDTH),
        ...variants.map((v) => fn(reports[v.name]).padEnd(COL_WIDTH)),
      ].join(''),
    );
  }

  console.log(`\nTotal time: ${((Date.now() - startTime) / 1000 / 60).toFixed(1)} minutes`);

  // Write comparison JSON next to the package root.
  // NOTE(review): __dirname is CommonJS-only; if this package runs as ESM
  // under tsx it will be undefined — confirm module mode, or derive the path
  // from import.meta.url instead.
  const outPath = resolve(__dirname, '..', 'eval-comparison.json');
  const output: Record<string, unknown> = {};
  for (const v of variants) {
    const r = reports[v.name];
    output[v.name] = {
      promptSizeKB: +(v.systemPrompt.length / 1024).toFixed(1),
      summary: r.summary,
      perScenario: r.results.map((x) => ({
        scenario: x.scenario.id,
        success: x.success,
        aggregate: x.scores?.aggregate,
        valid: x.validation?.valid,
        errors: x.validation?.errors.length,
        warnings: x.validation?.warnings.length,
      })),
    };
  }
  await writeFile(outPath, JSON.stringify(output, null, 2));
  console.log(`\nComparison written to ${outPath}`);
}

main().catch((err) => {
  console.error('Eval comparison failed:', err);
  process.exit(1);
});
90 changes: 90 additions & 0 deletions packages/interact-debug/bin/run-eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env npx tsx
import { runEvaluation, formatReport, scenarios } from '../src/eval';
import { writeFile, mkdir } from 'node:fs/promises';
import { resolve } from 'node:path';

// Output path for the slim JSON summary, resolved relative to the package root.
const RESULTS_FILE = resolve(__dirname, '..', 'eval-results.json');
// Each scenario is executed this many times to estimate score variance.
const RUNS_PER_SCENARIO = 3;

/**
 * Entry point: evaluates every scenario RUNS_PER_SCENARIO times, streams
 * per-run and per-scenario progress to the console, dumps each run's raw HTML
 * output for inspection, and writes a slimmed-down JSON report to disk.
 */
async function main() {
  console.log(
    `Starting evaluation of ${scenarios.length} scenarios (${RUNS_PER_SCENARIO} runs each)...\n`,
  );
  const wallClockStart = Date.now();

  const rawOutputDir = resolve(__dirname, '..', 'eval-raw-output');
  await mkdir(rawOutputDir, { recursive: true });

  const report = await runEvaluation(scenarios, {
    runsPerScenario: RUNS_PER_SCENARIO,
    // Per-run progress line plus a best-effort dump of the generated HTML.
    onRun: async (scenario, runIdx, run) => {
      const summary = run.success
        ? `score=${run.scores?.aggregate.toFixed(2)}, valid=${run.validation?.valid}, errs=${run.validation?.errors.length}`
        : `FAILED: ${run.error?.slice(0, 80)}`;
      console.log(
        ` [${scenario.id}] run ${runIdx + 1}/${RUNS_PER_SCENARIO}: ${summary} (${(run.durationMs / 1000).toFixed(0)}s)`,
      );

      if (!run.rawOutput) return;
      // Best-effort dump: a failed write must not abort the evaluation.
      await writeFile(
        resolve(rawOutputDir, `${scenario.id}-run${runIdx}.html`),
        run.rawOutput,
      ).catch(() => {});
    },
    // Aggregated line once all runs for a scenario have finished.
    onResult: (result) => {
      const stats = result.scoreStats;
      const summary = result.success
        ? `mean=${stats?.mean.toFixed(2)}, std=${stats?.stddev.toFixed(2)}, rate=${(result.successRate * 100).toFixed(0)}%`
        : `ALL FAILED`;
      console.log(` => [${result.scenario.id}] ${summary}\n`);
    },
  });

  console.log('\n' + formatReport(report));
  console.log(`Total time: ${((Date.now() - wallClockStart) / 1000 / 60).toFixed(1)} minutes`);

  // Strip bulky fields (raw HTML, full diagnostic lists) down to counts and
  // aggregates before persisting.
  const slimResults = report.results.map((r) => ({
    scenario: r.scenario.id,
    success: r.success,
    successRate: r.successRate,
    error: r.error,
    scoreStats: r.scoreStats,
    validation: r.validation
      ? {
          valid: r.validation.valid,
          errors: r.validation.errors.length,
          warnings: r.validation.warnings.length,
          infos: r.validation.infos.length,
        }
      : undefined,
    scores: r.scores
      ? {
          aggregate: r.scores.aggregate,
          dimensions: r.scores.dimensions.map((d) => ({
            dimension: d.dimension,
            score: d.score,
          })),
        }
      : undefined,
    durationMs: r.durationMs,
    runs: r.runs.map((run) => ({
      success: run.success,
      aggregate: run.scores?.aggregate,
      valid: run.validation?.valid,
      errors: run.validation?.errors.length,
      durationMs: run.durationMs,
    })),
  }));

  await writeFile(
    RESULTS_FILE,
    JSON.stringify(
      {
        timestamp: report.timestamp,
        runsPerScenario: report.runsPerScenario,
        summary: report.summary,
        results: slimResults,
      },
      null,
      2,
    ),
  );
  console.log(`\nResults written to ${RESULTS_FILE}`);
}

void main().catch((err) => {
  console.error('Eval failed:', err);
  process.exit(1);
});
Loading
Loading