-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_linkedin.py
More file actions
110 lines (90 loc) · 3.99 KB
/
fetch_linkedin.py
File metadata and controls
110 lines (90 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
🔗 LinkedIn 简历抓取
使用 Playwright 访问用户领英页面
"""
import asyncio
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from stealth_browser import StealthBrowser
# JavaScript evaluated inside the loaded page to collect profile fields.
# It returns an object with string fields `name`, `headline`, `about` and
# array fields `experience` ({title, company}) and `education`
# ({school, degree}).  All selectors are best-effort class-substring matches,
# so string fields may be empty and per-item fields may be missing.
_EXTRACT_PROFILE_JS = """
() => {
    const data = {
        name: '',
        headline: '',
        about: '',
        experience: [],
        education: []
    };
    // 姓名
    const nameEl = document.querySelector('h1');
    if (nameEl) data.name = nameEl.innerText.trim();
    // 头衔
    const headlineEl = document.querySelector('[class*="headline"], .pv-top-card__headline');
    if (headlineEl) data.headline = headlineEl.innerText.trim();
    // About
    const aboutEl = document.querySelector('[class*="about"] [class*="summary"], [class*="inline-show-more-text"]');
    if (aboutEl) data.about = aboutEl.innerText.trim();
    // 工作经历
    const expItems = document.querySelectorAll('[class*="experience"], .pv-experience-section__summary-item');
    expItems.forEach(item => {
        const title = item.querySelector('h3, [class*="title"]')?.innerText?.trim();
        const company = item.querySelector('[class*="company"], p')?.innerText?.trim();
        if (title || company) {
            data.experience.push({ title, company });
        }
    });
    // 教育
    const eduItems = document.querySelectorAll('[class*="education"]');
    eduItems.forEach(item => {
        const school = item.querySelector('h3, [class*="school"]')?.innerText?.trim();
        const degree = item.querySelector('[class*="degree"]')?.innerText?.trim();
        if (school) {
            data.education.push({ school, degree });
        }
    });
    return data;
}
"""

# Maximum number of characters of the "about" text shown in the summary.
_ABOUT_PREVIEW_CHARS = 300
# Maximum number of experience entries shown in the summary.
_EXPERIENCE_PREVIEW_ITEMS = 3


def _print_profile_summary(profile_info):
    """Print name, headline, an "about" preview and the first experiences.

    Args:
        profile_info: Mapping shaped like the object returned by
            ``_EXTRACT_PROFILE_JS``.
    """
    print("\n📄 提取的信息:")
    # The page script always sets these keys (to '' when nothing matched),
    # so fall back on falsy values rather than on a missing key.
    print(f"姓名: {profile_info.get('name') or 'N/A'}")
    print(f"头衔: {profile_info.get('headline') or 'N/A'}")

    about = profile_info.get('about')
    if about:
        # Only mark the text as truncated when something was actually cut.
        ellipsis = "..." if len(about) > _ABOUT_PREVIEW_CHARS else ""
        print(f"\n关于:\n{about[:_ABOUT_PREVIEW_CHARS]}{ellipsis}")

    experience = profile_info.get('experience')
    if experience:
        print(f"\n工作经历 ({len(experience)} 条):")
        for exp in experience[:_EXPERIENCE_PREVIEW_ITEMS]:
            # A field the page script could not find may arrive as None
            # (key present, value null); print it as empty, not "None".
            title = exp.get('title') or ''
            company = exp.get('company') or ''
            print(f" - {title} @ {company}")


async def fetch_linkedin(profile_url="https://www.your-linkedin-profile"):
    """Open a LinkedIn profile page and dump a summary, screenshot and HTML.

    Opens ``profile_url`` in a headless ``StealthBrowser``, saves a screenshot
    to ``/tmp/linkedin_profile.png``, evaluates ``_EXTRACT_PROFILE_JS`` in the
    page, prints a short summary of the result, and writes the page content
    returned by ``browser.get_content()`` to ``/tmp/linkedin_content.html``.

    Args:
        profile_url: Address of the profile page to open.  The default is the
            placeholder the script shipped with and must be replaced with a
            real profile URL to get meaningful output.

    Returns:
        None.  Results are reported on stdout and written to the files above.
    """
    print("=" * 60)
    print("🔗 抓取 LinkedIn 简历")
    print("=" * 60)

    async with StealthBrowser(headless=True) as browser:
        print("\n🚀 正在打开 LinkedIn...")
        await browser.goto(profile_url, wait_for="body")
        # Fixed pause after navigation; presumably lets client-side rendering
        # settle before the page is captured -- TODO confirm it is sufficient.
        await asyncio.sleep(3)
        print("✅ 页面已加载")

        screenshot = "/tmp/linkedin_profile.png"
        await browser.screenshot(screenshot)
        print(f"📸 已保存截图: {screenshot}")

        profile_info = await browser.page.evaluate(_EXTRACT_PROFILE_JS)
        _print_profile_summary(profile_info)

        # Keep the full page content so selectors can be debugged offline.
        html_path = '/tmp/linkedin_content.html'
        content = await browser.get_content()
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"\n📄 完整 HTML 已保存到 {html_path}")

    print("\n" + "=" * 60)
    print("✅ 完成")
    print("=" * 60)
if __name__ == "__main__":
    # Script entry point: run the scrape once and report any failure.
    try:
        asyncio.run(fetch_linkedin())
    except Exception as e:
        print(f"\n❌ 错误: {e}")
        import traceback
        traceback.print_exc()
        # Exit non-zero so shells and schedulers can tell the run failed;
        # without this the script reported success after printing the error.
        sys.exit(1)