bear-toolbox/fetch_linkedin.py at main · jokebear-bot/bear-toolbox · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
🔗 LinkedIn 简历抓取
使用 Playwright 访问用户领英页面
"""

import asyncio
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from stealth_browser import StealthBrowser

async def fetch_linkedin(
    profile_url: str = "https://www.your-linkedin-profile",
    screenshot_path: str = "/tmp/linkedin_profile.png",
    html_path: str = "/tmp/linkedin_content.html",
) -> None:
    """Open a LinkedIn profile page, print a short summary and save artifacts.

    The page is loaded in a headless ``StealthBrowser``; a screenshot and the
    full page HTML are written to disk, and a best-effort extraction of the
    name, headline, "about" text and work experience is printed to stdout.

    Args:
        profile_url: URL of the profile page to open. The default is a
            placeholder and must be replaced with a real profile URL.
        screenshot_path: File the page screenshot is written to.
        html_path: File the full page HTML is written to (UTF-8).

    Exceptions raised by the browser helper or by file I/O are not caught
    here; they propagate to the caller.
    """
    preview_chars = 300   # maximum length of the printed "about" excerpt
    preview_entries = 3   # maximum number of experience entries printed

    print("=" * 60)
    print("🔗 抓取 LinkedIn 简历")
    print("=" * 60)

    async with StealthBrowser(headless=True) as browser:
        print("\n🚀 正在打开 LinkedIn...")

        # Load the profile page, then pause for a fixed 3 seconds
        # (presumably to let late-rendering content settle — TODO confirm).
        await browser.goto(profile_url, wait_for="body")
        await asyncio.sleep(3)

        print("✅ 页面已加载")

        # Screenshot of the page as loaded.
        await browser.screenshot(screenshot_path)
        print(f"📸 已保存截图: {screenshot_path}")

        # Best-effort extraction in the page context. The selectors are loose
        # substring matches on class names, so results depend on the page's
        # current markup.
        # NOTE(review): education entries are collected by the script but are
        # not printed below.
        profile_info = await browser.page.evaluate("""
            () => {
                const data = {
                    name: '',
                    headline: '',
                    about: '',
                    experience: [],
                    education: []
                };

                // 姓名
                const nameEl = document.querySelector('h1');
                if (nameEl) data.name = nameEl.innerText.trim();

                // 头衔
                const headlineEl = document.querySelector('[class*="headline"], .pv-top-card__headline');
                if (headlineEl) data.headline = headlineEl.innerText.trim();

                // About
                const aboutEl = document.querySelector('[class*="about"] [class*="summary"], [class*="inline-show-more-text"]');
                if (aboutEl) data.about = aboutEl.innerText.trim();

                // 工作经历
                const expItems = document.querySelectorAll('[class*="experience"], .pv-experience-section__summary-item');
                expItems.forEach(item => {
                    const title = item.querySelector('h3, [class*="title"]')?.innerText?.trim();
                    const company = item.querySelector('[class*="company"], p')?.innerText?.trim();
                    if (title || company) {
                        data.experience.push({ title, company });
                    }
                });

                // 教育
                const eduItems = document.querySelectorAll('[class*="education"]');
                eduItems.forEach(item => {
                    const school = item.querySelector('h3, [class*="school"]')?.innerText?.trim();
                    const degree = item.querySelector('[class*="degree"]')?.innerText?.trim();
                    if (school) {
                        data.education.push({ school, degree });
                    }
                });

                return data;
            }
        """)

        print("\n📄 提取的信息:")
        # The script always returns these keys (as '' when nothing matched),
        # so fall back on emptiness rather than on key absence.
        print(f"姓名: {profile_info.get('name') or 'N/A'}")
        print(f"头衔: {profile_info.get('headline') or 'N/A'}")

        about = profile_info.get('about') or ''
        if about:
            # Only mark the excerpt as truncated when it really was.
            ellipsis = "..." if len(about) > preview_chars else ""
            print(f"\n关于:\n{about[:preview_chars]}{ellipsis}")

        experience = profile_info.get('experience') or []
        if experience:
            print(f"\n工作经历 ({len(experience)} 条):")
            for entry in experience[:preview_entries]:
                # An entry is pushed when either field is set, so the other
                # may come back missing or None; print it as empty instead.
                title = entry.get('title') or ''
                company = entry.get('company') or ''
                print(f"  - {title} @ {company}")

        # Save the complete page HTML for offline inspection.
        content = await browser.get_content()
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"\n📄 完整 HTML 已保存到 {html_path}")

    print("\n" + "=" * 60)
    print("✅ 完成")
    print("=" * 60)

if __name__ == "__main__":
    # Script entry point: run the scraper and report any failure.
    try:
        asyncio.run(fetch_linkedin())
    except Exception as e:
        # Top-level boundary: show the error and the full traceback, then
        # exit non-zero so shells, cron jobs and CI can detect the failure
        # (previously the script exited with status 0 even after an error).
        print(f"\n❌ 错误: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)