From 9d0c2502a2b2f18bff46e42b64288b3055f6756c Mon Sep 17 00:00:00 2001 From: krataratha Date: Tue, 5 May 2026 10:49:37 +0530 Subject: [PATCH] Improve extract_event.mjs with new features Enhanced event extraction script with deduplication, CSV export, and statistics reporting. --- .../scripts/extract_event.mjs | 142 +++++++++++++++++- 1 file changed, 136 insertions(+), 6 deletions(-) diff --git a/skills/event-prospecting/scripts/extract_event.mjs b/skills/event-prospecting/scripts/extract_event.mjs index ae3a383..926139d 100644 --- a/skills/event-prospecting/scripts/extract_event.mjs +++ b/skills/event-prospecting/scripts/extract_event.mjs @@ -1,8 +1,5 @@ #!/usr/bin/env node -// extract_event.mjs — read recon.json, dispatch to platform-specific extractor, -// write people.jsonl (one speaker per line) and seed_companies.txt. -// -// Usage: node extract_event.mjs +// extract_event.mjs — enhanced with deduplication, CSV export, and stats import { execFileSync } from 'child_process'; import { readFileSync, writeFileSync } from 'fs'; @@ -23,12 +20,16 @@ function slugify(s) { return (s || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, ''); } +function normalizeLinkedIn(url) { + if (!url) return null; + const m = url.match(/linkedin\.com\/in\/([\w-]+)/i); + return m ? `https://www.linkedin.com/in/${m[1]}/` : url; +} + function extractFromNextData(paths) { - // Build a JS expression that walks __NEXT_DATA__ for each path and unions the arrays. const js = `(() => { const data = JSON.parse(document.getElementById('__NEXT_DATA__').textContent); function get(obj, path) { - // path like '.props.pageProps.foo[0].bar' — naive parser, sufficient const tokens = path.match(/\\.[a-zA-Z_$][\\w$]*|\\[\\d+\\]/g) || []; let cur = obj; for (const t of tokens) { @@ -38,6 +39,135 @@ function extractFromNextData(paths) { } return cur; } + function pickImage(s) { + const re = /portrait|headshot|photo|image|picture|avatar|thumbnail/i; + const keys = Object.keys(s).filter(k => re.test(k)); + keys.sort((a, b) => (/mono|grey|gray|black/i.test(a) ? 1 : 0) - (/mono|grey|gray|black/i.test(b) ? 1 : 0)); + function unwrap(v) { + if (!v) return null; + if (typeof v === 'string') return v; + if (Array.isArray(v)) return v.map(unwrap).find(Boolean) || null; + if (typeof v === 'object') return v.url || v.src || (v.asset && v.asset.url) || (v.fields && v.fields.file && v.fields.file.url) || null; + return null; + } + for (const k of keys) { + const got = unwrap(s[k]); + if (got) return got; + } + return null; + } + const all = []; + ${JSON.stringify(paths)}.forEach(p => { + const arr = get(data, p); + if (Array.isArray(arr)) all.push(...arr); + }); + return all.map(s => ({ + name: s.name || s.fullName || null, + title: s.title || s.role || null, + company: s.companyName || s.company || s.org || null, + linkedin: s.linkedInProfile || s.linkedinUrl || s.linkedin || null, + bio: s.bio || s.description || null, + image: pickImage(s), + })); + })()`; + browse('goto', recon.url); + browse('wait', 'timeout', '2000'); + const evalRes = JSON.parse(browse('eval', js)); + return evalRes.result || []; +} + +function extractFromMarkdown() { + browse('goto', recon.url); + browse('wait', 'timeout', '2500'); + const md = JSON.parse(browse('get', 'markdown')).markdown || ''; + const blocks = md.split(/\n#{2,4} /); + const out = []; + for (const b of blocks) { + const lines = b.split(/\n+/).map(l => l.trim()).filter(Boolean); + if (lines.length < 2) continue; + const name = lines[0]; + if (!/^[A-Z]/.test(name)) continue; + const linkedinMatch = b.match(/linkedin\.com\/in\/([\w-]+)/i); + out.push({ + name, + title: lines[1] || null, + company: lines[2] || null, + linkedin: linkedinMatch ? `https://www.linkedin.com/in/${linkedinMatch[1]}/` : null, + }); + } + return out; +} + +let people = recon.strategy === 'next-data-eval' + ? extractFromNextData(recon.nextDataPaths || []) + : extractFromMarkdown(); + +const eventOrigin = (() => { try { return new URL(recon.url).origin; } catch { return null; } })(); +const resolveImage = src => !src ? null : /^https?:\/\//i.test(src) ? src : src.startsWith('//') ? 'https:' + src : src.startsWith('/') && eventOrigin ? eventOrigin + src : src; + +const slugCounts = new Map(); +people = people.map(p => { + const base = slugify(p.name); + const n = (slugCounts.get(base) || 0) + 1; + slugCounts.set(base, n); + return { ...p, linkedin: normalizeLinkedIn(p.linkedin), image: resolveImage(p.image), slug: n === 1 ? base : `${base}-${n}` }; +}); + +// 🔥 NEW: Deduplicate +const seen = new Set(); +people = people.filter(p => { + const key = p.linkedin || p.name; + if (seen.has(key)) return false; + seen.add(key); + return true; +}); + +// Filtering +const hostOrg = (() => { + try { + const h = new URL(recon.url).hostname.replace(/^www\./, '').toLowerCase(); + const parts = h.split('.'); + const sld = parts.length >= 2 ? parts[parts.length - 2] : parts[0]; + const stripped = sld.replace(/(?:sessions?|conf(?:erence)?|summit|events?)$/, ''); + return (stripped !== sld && stripped.length >= 5) ? stripped : sld; + } catch { return null; } +})(); + +const userCompanyArg = (() => { + const i = process.argv.indexOf('--user-company'); + return i !== -1 ? process.argv[i + 1] : null; +})(); + +const dropList = new Set([hostOrg && slugify(hostOrg), userCompanyArg && slugify(userCompanyArg)].filter(Boolean)); + +people = people.filter(p => !p.company || !dropList.has(slugify(p.company))); + +// Write JSONL +const peopleFile = join(outDir, 'people.jsonl'); +writeFileSync(peopleFile, people.map(p => JSON.stringify(p)).join('\n') + '\n'); + +// 🔥 NEW: CSV Export +const csv = ['name,title,company,linkedin,image'].concat( + people.map(p => `"${p.name}","${p.title}","${p.company}","${p.linkedin}","${p.image}"`) +).join('\n'); +writeFileSync(join(outDir, 'people.csv'), csv); + +// Unique companies +const companies = [...new Set(people.map(p => p.company).filter(Boolean))].sort(); +writeFileSync(join(outDir, 'seed_companies.txt'), companies.join('\n') + '\n'); + +// 🔥 NEW: Stats +const stats = { + total: people.length, + missingLinkedIn: people.filter(p => !p.linkedin).length, + missingImages: people.filter(p => !p.image).length +}; + +console.error(`Stats:`, stats); +console.log(JSON.stringify({ peopleCount: people.length, companyCount: companies.length, stats, peopleFile }, null, 2)); else cur = cur[parseInt(t.slice(1, -1), 10)]; + } + return cur; + } function pickImage(s) { // Detect image fields by KEY NAME regex (across Next.js / Sanity / Sessionize / custom CMS shapes). // Matches anything containing portrait/headshot/photo/image/picture/avatar/thumbnail (case-insensitive).