Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions .github/workflows/interact-debug-e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
name: Interact Debug E2E Tests

on:
  push:
    branches:
      - master
  workflow_dispatch:
    inputs:
      branch:
        type: string
        description: Branch to run tests on
        default: master
      browser:
        type: choice
        description: Browser to run tests in
        options:
          - chromium
          - firefox
          - webkit
          - all
        default: chromium

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
  # Resolve the browser exactly once. On push events github.event.inputs does
  # not exist, so github.event.inputs.browser evaluates to '' — with the
  # previous per-step conditions (browser != '' / browser == 'all') NEITHER
  # test step matched on push and no tests ran at all, while the "all
  # browsers" install still fired. Falling back to 'chromium' here makes push
  # runs install and test chromium only, and keeps workflow_dispatch behavior
  # unchanged.
  BROWSER: ${{ github.event.inputs.browser || 'chromium' }}

jobs:
  interact-debug-e2e:
    # env context is not available in job-level `name`, so the fallback is
    # repeated inline here.
    name: Interact Debug E2E (${{ github.event.inputs.browser || 'chromium' }})
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}

      - name: Setup Node.js
        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
        with:
          node-version: 24
          registry-url: https://registry.npmjs.org

      - name: Cache Yarn dependencies
        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
        with:
          path: |
            .yarn/cache
            .yarn/install-state.gz
            node_modules
            **/node_modules
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
          restore-keys: |
            ${{ runner.os }}-yarn-

      - name: Enable Corepack
        run: |
          corepack enable
          corepack prepare yarn@4.10.3 --activate
          yarn set version 4.10.3

      - name: Install dependencies
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: NPQ_PKG_MGR=yarn npx npq install --immutable

      # Build order matters: interact-debug depends on interact, which
      # depends on motion.
      - name: Build motion package
        run: yarn workspace @wix/motion build

      - name: Build interact package
        run: yarn workspace @wix/interact build

      - name: Build interact-debug package
        run: yarn workspace @wix/interact-debug build

      - name: Cache Playwright browsers
        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
        id: playwright-cache
        with:
          path: ~/.cache/ms-playwright
          key: ${{ runner.os }}-playwright-${{ env.BROWSER }}-${{ hashFiles('packages/interact-debug/package.json') }}
          restore-keys: |
            ${{ runner.os }}-playwright-${{ env.BROWSER }}-
            ${{ runner.os }}-playwright-

      - name: Install Playwright browsers (all)
        if: steps.playwright-cache.outputs.cache-hit != 'true' && env.BROWSER == 'all'
        working-directory: packages/interact-debug
        run: npx playwright install --with-deps chromium firefox webkit

      - name: Install Playwright browsers (selected)
        if: steps.playwright-cache.outputs.cache-hit != 'true' && env.BROWSER != 'all'
        working-directory: packages/interact-debug
        run: npx playwright install --with-deps ${{ env.BROWSER }}

      # Browsers were restored from cache, but OS-level dependencies are not
      # cached — install them for the selected browser(s).
      - name: Install Playwright system deps
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        run: npx playwright install-deps ${{ env.BROWSER != 'all' && env.BROWSER || '' }}

      - name: Run E2E tests (selected browser)
        if: env.BROWSER != 'all'
        working-directory: packages/interact-debug
        run: npx playwright test --project=${{ env.BROWSER }}

      - name: Run E2E tests (all browsers)
        if: env.BROWSER == 'all'
        working-directory: packages/interact-debug
        run: npx playwright test

      - name: Upload Playwright report
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: interact-debug-playwright-report
          path: packages/interact-debug/playwright-report/
          retention-days: 14
140 changes: 140 additions & 0 deletions packages/interact-debug/bin/run-eval-compare.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env npx tsx
/**
* Runs the evaluation three times with different levels of rules context:
* 1. no-rules: Base prompt only (no @wix/interact documentation)
* 2. partial: Only the core overview rule (full-lean.md)
* 3. full-rules: All 7 rule files (the default)
*
* Writes a comparison JSON and prints a side-by-side table.
*/
import {
runEvaluation,
formatReport,
scenarios,
buildSystemPromptFromFiles,
buildSystemPrompt,
} from '../src/eval';
import type { EvalReport } from '../src/eval';
import { writeFile } from 'node:fs/promises';
import { resolve } from 'node:path';

// Baseline system prompt for the 'no-rules' variant: only the raw output
// contract (single-file HTML, no commentary), with no @wix/interact
// documentation attached. Used as the control in the rules-context comparison.
const BARE_PROMPT = `You are an expert web developer. Generate a complete, single-file HTML document that implements the requested animation scenario.

Requirements:
- Output ONLY the HTML document. No explanations, no markdown fences, no commentary.
- The HTML must be a complete document (<!DOCTYPE html>, <html>, <head>, <body>).
- All CSS goes inside a <style> tag in <head>.
- All JavaScript goes inside a <script type="module"> tag at the end of <body>.`;

// A named prompt configuration: one row/column in the comparison output.
type Variant = { name: string; systemPrompt: string };

/**
 * Runs the evaluation once per variant and prints a side-by-side comparison
 * table, then persists a per-variant JSON summary.
 *
 * Fix: the comparison table previously hard-coded the variant names
 * ('no-rules' / 'partial' / 'full-rules') in three separate places, duplicating
 * the `variants` array — adding, removing, or renaming a variant would silently
 * desynchronize the table. Header and rows are now derived from `variants`.
 */
async function main() {
  // The context variants under comparison. All output below is derived from
  // this array, so changing it is the only edit needed to add a variant.
  const variants: Variant[] = [
    { name: 'no-rules', systemPrompt: BARE_PROMPT },
    { name: 'partial', systemPrompt: await buildSystemPromptFromFiles(['full-lean.md']) },
    { name: 'full-rules', systemPrompt: await buildSystemPrompt() },
  ];

  const reports: Record<string, EvalReport> = {};
  const startTime = Date.now();

  for (const variant of variants) {
    console.log(`\n${'='.repeat(60)}`);
    console.log(
      ` Running variant: ${variant.name} (prompt size: ${(variant.systemPrompt.length / 1024).toFixed(1)}KB)`,
    );
    console.log(`${'='.repeat(60)}\n`);

    const report = await runEvaluation(scenarios, {
      systemPrompt: variant.systemPrompt,
      // Per-scenario progress line as results stream in.
      onResult: (result) => {
        const status = result.success
          ? `score=${result.scores?.aggregate.toFixed(2)}, valid=${result.validation?.valid}, errs=${result.validation?.errors.length}`
          : `FAILED: ${result.error?.slice(0, 80)}`;
        console.log(
          ` [${result.scenario.id}] ${status} (${(result.durationMs / 1000).toFixed(0)}s)`,
        );
      },
    });

    reports[variant.name] = report;
    console.log('\n' + formatReport(report));
  }

  // Print comparison table — one column per variant, in `variants` order.
  const LABEL_WIDTH = 25;
  const COL_WIDTH = 12;
  console.log('\n' + '='.repeat(80));
  console.log(' COMPARISON SUMMARY');
  console.log('='.repeat(80) + '\n');

  const header = [
    'Metric'.padEnd(LABEL_WIDTH),
    ...variants.map((v) => v.name.padEnd(COL_WIDTH)),
  ].join('');
  console.log(header);
  console.log('-'.repeat(header.length));

  const metrics: [string, (r: EvalReport) => string][] = [
    ['Generated', (r) => `${r.summary.generated}/${r.summary.total}`],
    ['Valid', (r) => `${r.summary.valid}/${r.summary.total}`],
    ['Avg aggregate', (r) => r.summary.averageAggregate.toFixed(3)],
    ['Total errors', (r) => String(r.summary.totalErrors)],
    ['Total warnings', (r) => String(r.summary.totalWarnings)],
  ];

  // Per-dimension averages across scenarios that produced scores.
  const dimensions = ['complexity', 'weight', 'a11y', 'coherence', 'bestPractices', 'validation'];
  for (const dim of dimensions) {
    metrics.push([
      `Avg ${dim}`,
      (r) => {
        const scores = r.results
          .filter((x) => x.scores)
          .map((x) => x.scores!.dimensions.find((d) => d.dimension === dim)?.score ?? 0);
        return scores.length > 0
          ? (scores.reduce((a, b) => a + b, 0) / scores.length).toFixed(3)
          : '-';
      },
    ]);
  }

  for (const [name, fn] of metrics) {
    console.log(
      [
        name.padEnd(LABEL_WIDTH),
        ...variants.map((v) => fn(reports[v.name]).padEnd(COL_WIDTH)),
      ].join(''),
    );
  }

  console.log(`\nTotal time: ${((Date.now() - startTime) / 1000 / 60).toFixed(1)} minutes`);

  // Write comparison JSON next to the package root.
  // NOTE(review): __dirname is CommonJS-only; if this package runs as ESM
  // under tsx it will be undefined — confirm module mode, or derive the path
  // from import.meta.url instead.
  const outPath = resolve(__dirname, '..', 'eval-comparison.json');
  const output: Record<string, unknown> = {};
  for (const v of variants) {
    const r = reports[v.name];
    output[v.name] = {
      promptSizeKB: +(v.systemPrompt.length / 1024).toFixed(1),
      summary: r.summary,
      perScenario: r.results.map((x) => ({
        scenario: x.scenario.id,
        success: x.success,
        aggregate: x.scores?.aggregate,
        valid: x.validation?.valid,
        errors: x.validation?.errors.length,
        warnings: x.validation?.warnings.length,
      })),
    };
  }
  await writeFile(outPath, JSON.stringify(output, null, 2));
  console.log(`\nComparison written to ${outPath}`);
}

main().catch((err) => {
  console.error('Eval comparison failed:', err);
  process.exit(1);
});
90 changes: 90 additions & 0 deletions packages/interact-debug/bin/run-eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env npx tsx
import { runEvaluation, formatReport, scenarios } from '../src/eval';
import { writeFile, mkdir } from 'node:fs/promises';
import { resolve } from 'node:path';

// Output path for the slim JSON summary, resolved relative to the package root.
const RESULTS_FILE = resolve(__dirname, '..', 'eval-results.json');
// Each scenario is executed this many times to estimate score variance.
const RUNS_PER_SCENARIO = 3;

/**
 * Entry point: evaluates every scenario RUNS_PER_SCENARIO times, streams
 * per-run and per-scenario progress to the console, dumps each run's raw HTML
 * output for inspection, and writes a slimmed-down JSON report to disk.
 */
async function main() {
  console.log(
    `Starting evaluation of ${scenarios.length} scenarios (${RUNS_PER_SCENARIO} runs each)...\n`,
  );
  const wallClockStart = Date.now();

  const rawOutputDir = resolve(__dirname, '..', 'eval-raw-output');
  await mkdir(rawOutputDir, { recursive: true });

  const report = await runEvaluation(scenarios, {
    runsPerScenario: RUNS_PER_SCENARIO,
    // Per-run progress line plus a best-effort dump of the generated HTML.
    onRun: async (scenario, runIdx, run) => {
      const summary = run.success
        ? `score=${run.scores?.aggregate.toFixed(2)}, valid=${run.validation?.valid}, errs=${run.validation?.errors.length}`
        : `FAILED: ${run.error?.slice(0, 80)}`;
      console.log(
        ` [${scenario.id}] run ${runIdx + 1}/${RUNS_PER_SCENARIO}: ${summary} (${(run.durationMs / 1000).toFixed(0)}s)`,
      );

      if (!run.rawOutput) return;
      // Best-effort dump: a failed write must not abort the evaluation.
      await writeFile(
        resolve(rawOutputDir, `${scenario.id}-run${runIdx}.html`),
        run.rawOutput,
      ).catch(() => {});
    },
    // Aggregated line once all runs for a scenario have finished.
    onResult: (result) => {
      const stats = result.scoreStats;
      const summary = result.success
        ? `mean=${stats?.mean.toFixed(2)}, std=${stats?.stddev.toFixed(2)}, rate=${(result.successRate * 100).toFixed(0)}%`
        : `ALL FAILED`;
      console.log(` => [${result.scenario.id}] ${summary}\n`);
    },
  });

  console.log('\n' + formatReport(report));
  console.log(`Total time: ${((Date.now() - wallClockStart) / 1000 / 60).toFixed(1)} minutes`);

  // Strip bulky fields (raw HTML, full diagnostic lists) down to counts and
  // aggregates before persisting.
  const slimResults = report.results.map((r) => ({
    scenario: r.scenario.id,
    success: r.success,
    successRate: r.successRate,
    error: r.error,
    scoreStats: r.scoreStats,
    validation: r.validation
      ? {
          valid: r.validation.valid,
          errors: r.validation.errors.length,
          warnings: r.validation.warnings.length,
          infos: r.validation.infos.length,
        }
      : undefined,
    scores: r.scores
      ? {
          aggregate: r.scores.aggregate,
          dimensions: r.scores.dimensions.map((d) => ({
            dimension: d.dimension,
            score: d.score,
          })),
        }
      : undefined,
    durationMs: r.durationMs,
    runs: r.runs.map((run) => ({
      success: run.success,
      aggregate: run.scores?.aggregate,
      valid: run.validation?.valid,
      errors: run.validation?.errors.length,
      durationMs: run.durationMs,
    })),
  }));

  await writeFile(
    RESULTS_FILE,
    JSON.stringify(
      {
        timestamp: report.timestamp,
        runsPerScenario: report.runsPerScenario,
        summary: report.summary,
        results: slimResults,
      },
      null,
      2,
    ),
  );
  console.log(`\nResults written to ${RESULTS_FILE}`);
}

void main().catch((err) => {
  console.error('Eval failed:', err);
  process.exit(1);
});
Loading
Loading