Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 136 additions & 6 deletions skills/event-prospecting/scripts/extract_event.mjs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#!/usr/bin/env node
// extract_event.mjs — read recon.json, dispatch to platform-specific extractor,
// write people.jsonl (one speaker per line) and seed_companies.txt.
//
// Usage: node extract_event.mjs <output-dir>
// extract_event.mjs — enhanced with deduplication, CSV export, and stats

import { execFileSync } from 'child_process';
import { readFileSync, writeFileSync } from 'fs';
Expand All @@ -23,12 +20,16 @@ function slugify(s) {
return (s || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '');
}

// Canonicalize a LinkedIn profile URL to `https://www.linkedin.com/in/<slug>/`.
// Falsy input yields null; URLs without a recognizable /in/<slug> segment are
// passed through unchanged so no data is lost.
function normalizeLinkedIn(url) {
  if (!url) return null;
  const match = /linkedin\.com\/in\/([\w-]+)/i.exec(url);
  if (!match) return url;
  return `https://www.linkedin.com/in/${match[1]}/`;
}

function extractFromNextData(paths) {
// Build a JS expression that walks __NEXT_DATA__ for each path and unions the arrays.
const js = `(() => {
const data = JSON.parse(document.getElementById('__NEXT_DATA__').textContent);
function get(obj, path) {
// path like '.props.pageProps.foo[0].bar' — naive parser, sufficient
const tokens = path.match(/\\.[a-zA-Z_$][\\w$]*|\\[\\d+\\]/g) || [];
let cur = obj;
for (const t of tokens) {
Expand All @@ -38,6 +39,135 @@ function extractFromNextData(paths) {
}
return cur;
}
function pickImage(s) {
const re = /portrait|headshot|photo|image|picture|avatar|thumbnail/i;
const keys = Object.keys(s).filter(k => re.test(k));
keys.sort((a, b) => (/mono|grey|gray|black/i.test(a) ? 1 : 0) - (/mono|grey|gray|black/i.test(b) ? 1 : 0));
function unwrap(v) {
if (!v) return null;
if (typeof v === 'string') return v;
if (Array.isArray(v)) return v.map(unwrap).find(Boolean) || null;
if (typeof v === 'object') return v.url || v.src || (v.asset && v.asset.url) || (v.fields && v.fields.file && v.fields.file.url) || null;
return null;
}
for (const k of keys) {
const got = unwrap(s[k]);
if (got) return got;
}
return null;
}
const all = [];
${JSON.stringify(paths)}.forEach(p => {
const arr = get(data, p);
if (Array.isArray(arr)) all.push(...arr);
});
return all.map(s => ({
name: s.name || s.fullName || null,
title: s.title || s.role || null,
company: s.companyName || s.company || s.org || null,
linkedin: s.linkedInProfile || s.linkedinUrl || s.linkedin || null,
bio: s.bio || s.description || null,
image: pickImage(s),
}));
})()`;
browse('goto', recon.url);
browse('wait', 'timeout', '2000');
const evalRes = JSON.parse(browse('eval', js));
return evalRes.result || [];
}

// Fallback extractor: render the event page, grab its markdown, and treat each
// level-2..4 heading as a candidate speaker block (name, then title, then
// company on successive lines). A LinkedIn link anywhere in the block is
// captured and canonicalized.
function extractFromMarkdown() {
  browse('goto', recon.url);
  browse('wait', 'timeout', '2500');
  const markdown = JSON.parse(browse('get', 'markdown')).markdown || '';
  const sections = markdown.split(/\n#{2,4} /);
  const speakers = [];
  for (const section of sections) {
    const rows = section.split(/\n+/).map(r => r.trim()).filter(Boolean);
    if (rows.length < 2) continue;          // need at least a name and one detail line
    const [name, title, company] = rows;
    if (!/^[A-Z]/.test(name)) continue;     // headings not starting uppercase are nav/junk
    const li = section.match(/linkedin\.com\/in\/([\w-]+)/i);
    speakers.push({
      name,
      title: title || null,
      company: company || null,
      linkedin: li ? `https://www.linkedin.com/in/${li[1]}/` : null,
    });
  }
  return speakers;
}

// Extract speakers with the strategy recon chose, then normalize fields,
// assign unique slugs, and deduplicate.
let people = recon.strategy === 'next-data-eval'
  ? extractFromNextData(recon.nextDataPaths || [])
  : extractFromMarkdown();

// Resolve protocol-relative (//cdn...) and root-relative (/img...) image URLs
// against the event page's origin; absolute URLs pass through untouched.
const eventOrigin = (() => { try { return new URL(recon.url).origin; } catch { return null; } })();
const resolveImage = src => !src ? null : /^https?:\/\//i.test(src) ? src : src.startsWith('//') ? 'https:' + src : src.startsWith('/') && eventOrigin ? eventOrigin + src : src;

// Give each person a unique slug; repeated names get -2, -3, … suffixes.
const slugCounts = new Map();
people = people.map(p => {
  const base = slugify(p.name);
  const n = (slugCounts.get(base) || 0) + 1;
  slugCounts.set(base, n);
  return { ...p, linkedin: normalizeLinkedIn(p.linkedin), image: resolveImage(p.image), slug: n === 1 ? base : `${base}-${n}` };
});

// Deduplicate. A LinkedIn URL is a strong identity (compared case-insensitively);
// without one, fall back to a name+company composite key so two different people
// who share a name but work at different companies are NOT collapsed.
const seen = new Set();
people = people.filter(p => {
  const key = p.linkedin
    ? p.linkedin.toLowerCase()
    : `${slugify(p.name)}|${slugify(p.company || '')}`;
  if (seen.has(key)) return false;
  seen.add(key);
  return true;
});
Comment on lines +116 to +123
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dedup key uses name alone, dropping distinct people

Medium Severity

The deduplication key falls back to p.name alone when p.linkedin is absent. Two genuinely different speakers who share the same name but work at different companies (e.g., "David Chen" at Company A and "David Chen" at Company B) will be collapsed — the second is silently dropped. Using a composite key that incorporates p.company would prevent this data loss.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 9d0c250. Configure here.


// Filter out speakers employed by the event host or by the user's own
// company — neither is a prospecting target.
const hostOrg = (() => {
  try {
    const hostname = new URL(recon.url).hostname.replace(/^www\./, '').toLowerCase();
    const labels = hostname.split('.');
    const sld = labels.length >= 2 ? labels[labels.length - 2] : labels[0];
    // "acmeconf" -> "acme": strip a trailing event-ish suffix, but only keep
    // the stem when it is long enough to still name a real organization.
    const stem = sld.replace(/(?:sessions?|conf(?:erence)?|summit|events?)$/, '');
    return stem !== sld && stem.length >= 5 ? stem : sld;
  } catch {
    return null;
  }
})();

// Optional CLI override: `--user-company <name>`.
const userCompanyArg = (() => {
  const flagIndex = process.argv.indexOf('--user-company');
  return flagIndex === -1 ? null : process.argv[flagIndex + 1];
})();

const dropList = new Set(
  [hostOrg, userCompanyArg].filter(Boolean).map(slugify).filter(Boolean)
);

people = people.filter(p => !p.company || !dropList.has(slugify(p.company)));

// Persist one JSON object per line (JSONL) for downstream enrichment steps.
const peopleFile = join(outDir, 'people.jsonl');
writeFileSync(peopleFile, people.map(p => JSON.stringify(p)).join('\n') + '\n');

// CSV export (RFC 4180): every populated field is double-quoted with embedded
// quotes doubled (`"` -> `""`); null/undefined values become empty fields
// instead of the misleading literal string "null".
const csvCell = v => v == null ? '' : `"${String(v).replace(/"/g, '""')}"`;
const csv = ['name,title,company,linkedin,image'].concat(
  people.map(p => [p.name, p.title, p.company, p.linkedin, p.image].map(csvCell).join(','))
).join('\n');
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CSV export doesn't escape quotes or handle nulls

Medium Severity

The CSV export interpolates field values directly into double-quoted strings without escaping embedded double quotes (per RFC 4180, " must be escaped as ""). Additionally, null values from fields like p.title, p.company, p.linkedin, and p.image are coerced to the literal string "null" in the CSV, making them indistinguishable from actual data.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 9d0c250. Configure here.

writeFileSync(join(outDir, 'people.csv'), csv);

// Seed list: unique employer names, alphabetized, one per line.
const companySet = new Set();
for (const p of people) {
  if (p.company) companySet.add(p.company);
}
const companies = [...companySet].sort();
writeFileSync(join(outDir, 'seed_companies.txt'), companies.join('\n') + '\n');

// Coverage stats for the run, reported on stderr so stdout stays machine-readable.
let missingLinkedIn = 0;
let missingImages = 0;
for (const p of people) {
  if (!p.linkedin) missingLinkedIn += 1;
  if (!p.image) missingImages += 1;
}
const stats = { total: people.length, missingLinkedIn, missingImages };

console.error(`Stats:`, stats);
console.log(JSON.stringify({ peopleCount: people.length, companyCount: companies.length, stats, peopleFile }, null, 2)); else cur = cur[parseInt(t.slice(1, -1), 10)];
}
Comment on lines +166 to +168
return cur;
}
function pickImage(s) {
// Detect image fields by KEY NAME regex (across Next.js / Sanity / Sessionize / custom CMS shapes).
// Matches anything containing portrait/headshot/photo/image/picture/avatar/thumbnail (case-insensitive).
Comment on lines +167 to 173
Expand Down
Loading