From 9d0c2502a2b2f18bff46e42b64288b3055f6756c Mon Sep 17 00:00:00 2001
From: krataratha <kratarathtatran@icloud.com>
Date: Tue, 5 May 2026 10:49:37 +0530
Subject: [PATCH] Improve extract_event.mjs with new features

Enhanced event extraction script with deduplication, CSV export, and statistics reporting.
---
 .../scripts/extract_event.mjs                 | 142 +++++++++++++++++-
 1 file changed, 136 insertions(+), 6 deletions(-)
diff --git a/skills/event-prospecting/scripts/extract_event.mjs b/skills/event-prospecting/scripts/extract_event.mjs
index ae3a383..926139d 100644
--- a/skills/event-prospecting/scripts/extract_event.mjs
+++ b/skills/event-prospecting/scripts/extract_event.mjs
@@ -1,8 +1,5 @@
 #!/usr/bin/env node
-// extract_event.mjs — read recon.json, dispatch to platform-specific extractor,
-// write people.jsonl (one speaker per line) and seed_companies.txt.
-//
-// Usage: node extract_event.mjs <output-dir>
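+// extract_event.mjs — write people.jsonl, people.csv and seed_companies.txt for an event, with deduplication and stats. Usage: node extract_event.mjs <output-dir> [--user-company <name>]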
 
 import { execFileSync } from 'child_process';
 import { readFileSync, writeFileSync } from 'fs';
@@ -23,12 +20,16 @@ function slugify(s) {
   return (s || '').toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '');
 }
 
+function normalizeLinkedIn(url) { // Canonicalise a profile link to https://www.linkedin.com/in/<handle>/; returns null for missing input.
+  if (!url || typeof url !== 'string') return null; // missing or non-string values carry no usable link
+  const m = url.match(/linkedin\.com\/in\/([^\/?#\s]+)/i); // handle runs up to the next "/", "?", "#" or whitespace, so percent-encoded handles stay whole
+  return m ? `https://www.linkedin.com/in/${m[1]}/` : url; // strings that are not /in/ profile links pass through unchanged
+}
+
 function extractFromNextData(paths) {
-  // Build a JS expression that walks __NEXT_DATA__ for each path and unions the arrays.
   const js = `(() => {
     const data = JSON.parse(document.getElementById('__NEXT_DATA__').textContent);
     function get(obj, path) {
-      // path like '.props.pageProps.foo[0].bar' — naive parser, sufficient
       const tokens = path.match(/\\.[a-zA-Z_$][\\w$]*|\\[\\d+\\]/g) || [];
       let cur = obj;
       for (const t of tokens) {
@@ -38,6 +39,135 @@ function extractFromNextData(paths) {
       }
       return cur;
     }
+    function pickImage(s) {
+      const re = /portrait|headshot|photo|image|picture|avatar|thumbnail/i;
+      const keys = Object.keys(s).filter(k => re.test(k));
+      keys.sort((a, b) => (/mono|grey|gray|black/i.test(a) ? 1 : 0) - (/mono|grey|gray|black/i.test(b) ? 1 : 0));
+      function unwrap(v) {
+        if (!v) return null;
+        if (typeof v === 'string') return v;
+        if (Array.isArray(v)) return v.map(unwrap).find(Boolean) || null;
+        if (typeof v === 'object') return v.url || v.src || (v.asset && v.asset.url) || (v.fields && v.fields.file && v.fields.file.url) || null;
+        return null;
+      }
+      for (const k of keys) {
+        const got = unwrap(s[k]);
+        if (got) return got;
+      }
+      return null;
+    }
+    const all = [];
+    ${JSON.stringify(paths)}.forEach(p => {
+      const arr = get(data, p);
+      if (Array.isArray(arr)) all.push(...arr);
+    });
+    return all.map(s => ({
+      name: s.name || s.fullName || null,
+      title: s.title || s.role || null,
+      company: s.companyName || s.company || s.org || null,
+      linkedin: s.linkedInProfile || s.linkedinUrl || s.linkedin || null,
+      bio: s.bio || s.description || null,
+      image: pickImage(s),
+    }));
+  })()`;
+  browse('goto', recon.url);
+  browse('wait', 'timeout', '2000');
+  const evalRes = JSON.parse(browse('eval', js));
+  return evalRes.result || [];
+}
+
+function extractFromMarkdown() {
+  // Fallback extractor: load the event page, fetch its markdown rendering via browse(),
+  // and treat every section opened by a level 2-4 heading as one candidate speaker.
+  browse('goto', recon.url);
+  browse('wait', 'timeout', '2500');
+  const { markdown } = JSON.parse(browse('get', 'markdown'));
+  return (markdown || '').split(/\n#{2,4} /).flatMap(section => {
+    // The first three non-empty rows of a section are read as name, title and company.
+    const [name, title, company] = section.split(/\n+/).map(row => row.trim()).filter(Boolean);
+    // Skip sections with fewer than two rows or whose first row does not start with A-Z.
+    if (title === undefined || !/^[A-Z]/.test(name)) return [];
+    // The first linkedin.com/in/<handle> link found in the section becomes the canonical profile URL.
+    const profile = section.match(/linkedin\.com\/in\/([\w-]+)/i);
+    return [{
+      name,
+      title,
+      company: company || null,
+      linkedin: profile ? `https://www.linkedin.com/in/${profile[1]}/` : null,
+    }];
+  });
+}
+
+// Pick the extractor named by the recon strategy; anything other than 'next-data-eval' uses the markdown fallback.
+let people = recon.strategy === 'next-data-eval' ? extractFromNextData(recon.nextDataPaths || []) : extractFromMarkdown();
+
+const eventOrigin = (() => { try { return new URL(recon.url).origin; } catch { return null; } })(); // null when recon.url does not parse
+const resolveImage = src => { if (!src) return null; if (/^https?:\/\//i.test(src)) return src; if (src.startsWith('//')) return 'https:' + src; return src.startsWith('/') && eventOrigin ? eventOrigin + src : src; };
+
+// Give each speaker a slug (repeated names get -2, -3, …) and normalise the linkedin and image fields.
+const slugCounts = new Map();
+people = people.map(person => {
+  const stem = slugify(person.name);
+  const seenSoFar = (slugCounts.get(stem) || 0) + 1;
+  slugCounts.set(stem, seenSoFar);
+  return { ...person, linkedin: normalizeLinkedIn(person.linkedin), image: resolveImage(person.image), slug: seenSoFar > 1 ? `${stem}-${seenSoFar}` : stem };
+});
+
+// Deduplicate: keep only the first entry per identity key (the LinkedIn URL when present, otherwise the name).
+const seen = new Set();
+people = people.filter(person => {
+  const identity = person.linkedin || person.name;
+  const isFirst = !seen.has(identity);
+  seen.add(identity);
+  return isFirst;
+});
+
+// Filtering: drop speakers whose company slug matches the event host (guessed from the URL) or the --user-company value.
+const hostOrg = (() => {
+  try {
+    const labels = new URL(recon.url).hostname.replace(/^www\./, '').toLowerCase().split('.');
+    // Use the second-to-last host label (or the only one); trim an event-style suffix when at least 5 characters remain.
+    const sld = labels[Math.max(labels.length - 2, 0)];
+    const trimmed = sld.replace(/(?:sessions?|conf(?:erence)?|summit|events?)$/, '');
+    return trimmed.length >= 5 ? trimmed : sld;
+  } catch { return null; }
+})();
+
+const userCompanyArg = (() => {
+  const flagAt = process.argv.indexOf('--user-company');
+  return flagAt === -1 ? null : process.argv[flagAt + 1];
+})();
+
+const dropList = new Set([hostOrg, userCompanyArg].filter(Boolean).map(org => slugify(org)).filter(Boolean));
+
+people = people.filter(({ company }) => !(company && dropList.has(slugify(company))));
+
+// people.jsonl: one JSON object per speaker per line, newline-terminated.
+const peopleFile = join(outDir, 'people.jsonl');
+writeFileSync(peopleFile, `${people.map(person => JSON.stringify(person)).join('\n')}\n`);
+
+// CSV export (RFC 4180 style): every cell is quoted, embedded double quotes are doubled, and null/undefined become empty cells.
+const csvColumns = ['name', 'title', 'company', 'linkedin', 'image'];
+const csvCell = value => `"${(value == null ? '' : String(value)).replace(/"/g, '""')}"`;
+const csv = [csvColumns.join(','), ...people.map(p => csvColumns.map(col => csvCell(p[col])).join(','))].join('\n');
+writeFileSync(join(outDir, 'people.csv'), csv + '\n'); // newline-terminated, like people.jsonl and seed_companies.txt
+
+// seed_companies.txt: distinct non-empty company names in default sort order, one per line.
+const companies = Array.from(new Set(people.flatMap(({ company }) => (company ? [company] : [])))).sort();
+writeFileSync(join(outDir, 'seed_companies.txt'), `${companies.join('\n')}\n`);
+
+// Stats: summary counts, logged to stderr and embedded in the JSON summary on stdout. NOTE(review): the console.log line below is fused with a stray "else cur = …" fragment — looks like paste residue; confirm before applying.
+const stats = {
+  total: people.length, // speakers remaining after deduplication and filtering
+  missingLinkedIn: people.filter(p => !p.linkedin).length, // entries with no linkedin value
+  missingImages: people.filter(p => !p.image).length // entries with no image value
+};
+
+console.error(`Stats:`, stats);
+console.log(JSON.stringify({ peopleCount: people.length, companyCount: companies.length, stats, peopleFile }, null, 2));        else cur = cur[parseInt(t.slice(1, -1), 10)];
+      }
+      return cur;
+    }
     function pickImage(s) {
       // Detect image fields by KEY NAME regex (across Next.js / Sanity / Sessionize / custom CMS shapes).
       // Matches anything containing portrait/headshot/photo/image/picture/avatar/thumbnail (case-insensitive).