# Email_audit/email_generator.py at master · weiwill88/Email_audit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
import pandas as pd
import random
from datetime import datetime, timedelta
import json
import requests
from pathlib import Path
import numpy as np
import logging
from tqdm import tqdm
import sys
import time
import re

class EmailGenerator:
    def __init__(self, api_key="sk-uuvzcgizeroljmrdrwnxlrcpuslkldgtsfrernccqxxuhosd"):
        # 设置日志
        self.setup_logging()
        self.logger = logging.getLogger('EmailGenerator')

        self.logger.info("初始化邮件生成器...")
        self.api_key = api_key
        self.api_url = "https://api.siliconflow.cn/v1/chat/completions"
        self.load_config()

        self.email_folders = {
            "sent": "Top of Personal Folders/发件箱",
            "inbox": "Top of Personal Folders/收件箱"
        }
        self.my_email = "zhang.ming@wisdomtech.com"
        self.my_name = "张明"

        # 添加邮件类型分布配置
        self.email_distribution = {
            "daily": 0.70,    # 日常工作邮件
            "system": 0.15,   # 系统通知邮件
            "business": 0.10, # 外部商务邮件
            "suspicious": 0.05 # 可疑行为邮件
        }

        # 添加风险等级分布
        self.risk_distribution = {
            "normal": 0.60,    # 正常邮件
            "low_risk": 0.25,  # 轻微可疑
            "medium_risk": 0.10, # 中度可疑
            "high_risk": 0.05   # 高度可疑
        }

    def setup_logging(self):
        """设置日志配置"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('output/email_generation.log', encoding='utf-8'),
                logging.StreamHandler(sys.stdout)
            ]
        )

    def load_config(self):
        """加载配置信息"""
        self.logger.info("加载配置信息...")
        # 人物关系和邮箱映射
        self.contacts = {
            "internal": {
                "李强": "li.qiang@wisdomtech.com",
                "王婷": "wang.ting@wisdomtech.com",
                "刘洋": "liu.yang@wisdomtech.com",
                "周静": "zhou.jing@wisdomtech.com",
                "赵海": "zhao.hai@wisdomtech.com",
                "孙琳": "sun.lin@wisdomtech.com",
                "陈明": "chen.ming@wisdomtech.com"
            },
            "client": {
                "赵静": "zhao.jing@clientA.com",
                "王刚": "wang.gang@clientA.com",
                "李晓": "li.xiao@clientB.com",
                "张伟": "zhang.wei@clientB.com",
                "陈总": "chen@clientC.com",
                "吴经理": "wu@clientC.com"
            },
            "competitor": {
                "陈涛": "hr_chen@competitor.com",
                "杨军": "yang@competitor.com",
                "郑阳": "zheng@competitor.com"
            }
        }

        # 项目信息
        self.projects = {
            "A公司ERP升级": {
                "code": "AERP2023",
                "docs": ["系统架构设计", "数据库设计", "接口文档", "测试报告"],
                "contacts": ["赵静", "王刚"],
                "duration": "12个月",
                "importance": "high",
                "sensitive_data": ["客户数据", "财务信息", "业务流程"]
            },
            "B集团数据迁移": {
                "code": "BMIG2023",
                "docs": ["迁移方案", "数据映射", "安全策略", "进度报告"],
                "contacts": ["李晓", "张伟"],
                "duration": "6个月",
                "importance": "medium",
                "sensitive_data": ["历史数据", "用户信息"]
            },
            "C企业数字化转型": {
                "code": "CDIT2023",
                "docs": ["项目建议书", "实施方案", "成本预算", "合同文件"],
                "contacts": ["陈总", "吴经理"],
                "duration": "18个月",
                "importance": "high",
                "sensitive_data": ["战略规划", "核心业务数据"]
            }
        }

        # 邮件模板
        self.templates = {
            "normal": {
                "工作汇报": [
                    "关于{project}项目{phase}的周报",
                    "{project}项目{phase}进展情况",
                    "{project}{phase}阶段总结"
                ],
                "会议通知": [
                    "{project}项目{phase}评审会议",
                    "{project}周例会通知",
                    "{project}阶段性会议安排"
                ],
                "文档传输": [
                    "{project}{doc}文档提交",
                    "更新{project}的{doc}",
                    "{project}{doc}最新版本"
                ]
            },
            "suspicious": {
                "数据泄露": [
                    "个人资料备份",
                    "文档整理",
                    "项目资料归档"
                ],
                "工作交接": [
                    "工作交接清单",
                    "项目文档移交",
                    "历史资料整理"
                ],
                "竞争对手": [
                    "咨询回复",
                    "简历确认",
                    "入职材料"
                ]
            },
            "system_notice": {
                "权限变更": [
                    "系统权限变更通知",
                    "账号访问权限更新",
                    "系统维护通知"
                ],
                "系统更新": [
                    "系统版本更新通知",
                    "安全补丁安装通知",
                    "系统功能优化通知"
                ]
            },
            "business_communication": {
                "商务谈判": [
                    "关于{project}项目合作意向的讨论",
                    "{project}项目报价方案",
                    "商务合作洽谈会议安排"
                ],
                "合同管理": [
                    "{project}合同条款确认",
                    "合同签署相关事宜",
                    "合同变更说明"
                ]
            }
        }

        # 关键时间节点
        self.key_dates = {
            "2023-01-15": "晋升项目经理",
            "2023-06-01": "负责核心客户项目",
            "2023-09-05": "开始接触竞争对手",
            "2024-01-05": "准备离职",
            "2024-03-01": "最后工作日"
        }

        self.email_templates = {
            "normal": [
                "尊敬的{recipient}：\n\n关于{project}项目的最新进展，{content}...",
                "您好，{recipient}：\n\n{project}项目目前进展顺利，{content}...",
                # 添加更多模板...
            ],
            "suspicious": [
                "Hi，{recipient}\n\n{content}...",
                # 添加更多模板...
            ]
        }

    def generate_attachment_info(self, project, is_suspicious, event_type=None):
        """生成更真实的附件信息"""
        if not is_suspicious:
            # 正常附件
            doc_type = random.choice(["docx", "pdf", "pptx", "xlsx"])
            doc_name = random.choice(self.projects[project]["docs"])
            size = random.randint(100, 5000)
            return f"{doc_name}.{doc_type} (application/{doc_type}, {size}字节)"
        else:
            if event_type == "high_risk":
                # 高风险附件，伪装成普通文档
                doc_type = random.choice(["zip", "rar"])
                size = random.randint(10000, 50000)
                names = ["工作总结", "项目资料", "培训文档", "技术文档"]
                return f"{random.choice(names)}_{datetime.now().strftime('%Y%m')}.{doc_type} (application/{doc_type}, {size}字节)"
            else:
                # 普通可疑附件
                if random.random() < 0.4:
                    # 加密压缩包
                    size = random.randint(5000, 15000)
                    return f"backup_{random.randint(1,100)}.zip (application/zip, {size}字节)"
                else:
                    # 伪装成正常文档
                    doc_type = random.choice(["docx", "pdf", "xlsx"])
                    size = random.randint(1000, 8000)
                    names = ["会议纪要", "工作报告", "项目进展", "技术方案"]
                    return f"{random.choice(names)}_{random.randint(1,100)}.{doc_type} (application/{doc_type}, {size}字节)"

    def generate_email_content(self, scenario, max_retries=3, retry_delay=2):
        """使用模板+API生成邮件内容"""
        # 首先尝试使用模板
        try:
            template = random.choice(
                self.email_templates["suspicious" if scenario['type'] == "suspicious" else "normal"]
            )

            # 构造基本内容
            base_content = {
                "recipient": scenario['recipient'].split()[0],  # 取收件人姓名
                "project": scenario['project'],
                "project_code": self.projects[scenario['project']]['code'],
            }

            # 仅对核心内容调用API
            prompt = f"""
            请生成一段不超过50字的{scenario['type']}类型的项目进展描述，项目为{scenario['project']}({base_content['project_code']})。
            如果是可疑类型，请使用含蓄的表达。
            """

            for attempt in range(max_retries):
                try:
                    payload = {
                        "model": "THUDM/glm-4-9b-chat",
                        "messages": [{"role": "user", "content": prompt}],
                        "stream": False,
                        "max_tokens": 100,  # 减少token数量
                        "temperature": 0.7,
                        "top_p": 0.7
                    }

                    response = requests.post(
                        self.api_url,
                        json=payload,
                        headers={"Authorization": f"Bearer {self.api_key}",
                                "Content-Type": "application/json"},
                        timeout=(5, 30)  # 缩短超时时间
                    )

                    if response.status_code == 200:
                        content = response.json()['choices'][0]['message']['content'].strip()
                        if content:
                            base_content["content"] = content
                            return template.format(**base_content)

                    # 如果API调用失败，使用备用内容
                    if attempt == max_retries - 1:
                        base_content["content"] = self._get_fallback_content(scenario)
                        return template.format(**base_content)

                    wait_time = retry_delay * (2 ** attempt)
                    time.sleep(wait_time)

                except Exception as e:
                    self.logger.warning(f"API调用失败: {str(e)}")
                    if attempt == max_retries - 1:
                        base_content["content"] = self._get_fallback_content(scenario)
                        return template.format(**base_content)
                    time.sleep(retry_delay * (2 ** attempt))

            return template.format(**base_content)

        except Exception as e:
            self.logger.error(f"生成邮件内容失败: {str(e)}")
            return "生成邮件内容失败"

    def _get_fallback_content(self, scenario):
        """生成备用内容"""
        normal_contents = [
            f"按计划推进中，已完成{random.choice(['需求分析', '概要设计', '详细设计'])}阶段",
            f"完成了{random.choice(self.projects[scenario['project']]['docs'])}的编写",
            "团队正在积极推进相关工作"
        ]
        suspicious_contents = [
            "需要和您详细沟通一些事情",
            "有一些想法想请教您",
            "希望能得到您的建议"
        ]
        return random.choice(suspicious_contents if scenario['type'] == "suspicious" else normal_contents)

    def generate_dataset(self, num_emails=200, batch_size=10):
        """Generate the simulated mailbox and return it as a DataFrame.

        The key-date emails from ``self._generate_key_date_emails`` are always
        included. The remaining ``num_emails - len(key_emails)`` emails (never
        fewer than zero) are spread over 2023-01-01 .. 2024-12-31 and produced
        in batches, with a 2 second pause between batches.

        Args:
            num_emails: Target number of emails, key-date emails included.
            batch_size: Emails generated between pauses; values below 1 are
                treated as 1.

        Returns:
            A DataFrame sorted by timestamp with the columns 发件人, 时间戳, 主题,
            邮件线程, 附件数量, 附件信息, 文件夹路径, 文件夹类型, 收件人 and 邮件正文.
            时间戳 is formatted as ``%Y-%m-%d %H:%M:%S``.
        """
        self.logger.info(f"开始生成{num_emails}封模拟邮件...")
        emails = []

        # Two-year window for regular mail.
        start_date = datetime(2023, 1, 1)
        end_date = datetime(2024, 12, 31)

        # Candidate days: every weekday, and each weekend day with probability 0.3.
        dates = []
        current_date = start_date
        while current_date <= end_date:
            weight = 1.0 if current_date.weekday() < 5 else 0.3
            if random.random() < weight:
                dates.append(current_date)
            current_date += timedelta(days=1)

        self.logger.info("生成关键时间点邮件...")
        key_emails = self._generate_key_date_emails()
        emails.extend(key_emails)

        # Clamp at zero so a small num_emails does not produce a negative count.
        remaining_emails = max(num_emails - len(key_emails), 0)
        self.logger.info(f"分批生成剩余{remaining_emails}封常规邮件...")

        # Sample exactly as many dates as will be used, so the chosen days
        # cover the whole window instead of dropping the latest ones. Fall
        # back to sampling with replacement when more emails are requested
        # than candidate days exist (random.sample would raise ValueError).
        if remaining_emails <= len(dates):
            selected_dates = sorted(random.sample(dates, remaining_emails))
        else:
            selected_dates = sorted(random.choices(dates, k=remaining_emails))

        batch_size = max(int(batch_size), 1)  # range() rejects a zero step
        type_names = list(self.email_distribution.keys())
        type_weights = list(self.email_distribution.values())

        with tqdm(total=remaining_emails, desc="生成邮件") as pbar:
            for batch_start in range(0, remaining_emails, batch_size):
                for email_date in selected_dates[batch_start:batch_start + batch_size]:
                    # Draw the email category according to the configured shares.
                    email_type = random.choices(type_names, weights=type_weights)[0]
                    emails.append(self._generate_single_email(
                        email_date,
                        force_suspicious=(email_type == "suspicious"),
                        email_type=email_type
                    ))
                    pbar.update(1)

                # Pause between batches, but not after the last one.
                if batch_start + batch_size < remaining_emails:
                    time.sleep(2)

        # Sort chronologically.
        df = pd.DataFrame(emails)
        df['时间戳'] = pd.to_datetime(df['时间戳'])
        df = df.sort_values('时间戳')

        # Fixed column set and order for the exported report.
        output_df = pd.DataFrame({
            '发件人': df['发件人'],
            '时间戳': df['时间戳'].dt.strftime('%Y-%m-%d %H:%M:%S'),
            '主题': df['主题'],
            '邮件线程': df['邮件线程'],
            '附件数量': df['附件数量'],
            '附件信息': df['附件信息'],
            '文件夹路径': df['文件夹路径'],
            '文件夹类型': df['文件夹类型'],
            '收件人': df['收件人'],
            '邮件正文': df['邮件正文']
        })

        # Summary of sent versus received mail.
        sent_count = len(output_df[output_df['文件夹路径'] == self.email_folders["sent"]])
        inbox_count = len(output_df[output_df['文件夹路径'] == self.email_folders["inbox"]])
        self.logger.info(f"生成完成 - 发件箱: {sent_count}封, 收件箱: {inbox_count}封")

        return output_df

    def _generate_key_date_emails(self):
        """生成关键时间点的邮件"""
        key_emails = []

        # 按照时间线设定生成邮件
        key_events = {
            # 铺垫期
            "2023-01-15": {"event": "晋升项目经理", "type": "normal",
                          "actions": ["申请更高权限", "接手重要项目"]},
            "2023-02-05": {"event": "推动项目资料云端化", "type": "suspicious",
                          "actions": ["建议升级文档系统", "推动远程访问"]},
            "2023-03-10": {"event": "负责制定部门备份策略", "type": "suspicious",
                          "actions": ["获取数据备份权限", "制定备份方案"]},

            # 布局期
            "2023-07-05": {"event": "优化项目文档结构", "type": "suspicious",
                          "actions": ["重组文档结构", "建立知识库"]},
            "2023-09-03": {"event": "参加行业技术峰会", "type": "suspicious",
                          "actions": ["与竞争对手初次接触", "交流技术经验"]},
            "2023-11-08": {"event": "负责灾备系统升级", "type": "suspicious",
                          "actions": ["升级备份系统", "优化数据存储"]},

            # 实施期
            "2024-01-05": {"event": "开始工作交接", "type": "high_risk",
                          "actions": ["整理历史文档", "移交项目材料"]},
            "2024-02-15": {"event": "交接文档准备", "type": "high_risk",
                          "actions": ["准备交接清单", "整理核心资料"]},
            "2024-03-01": {"event": "最后工作日", "type": "high_risk",
                          "actions": ["完成工作交接", "提交总结报告"]}
        }

        for date, event_info in key_events.items():
            # 为每个关键事件生成多封相关邮件
            for action in event_info["actions"]:
                email = self._generate_single_email(
                    pd.to_datetime(date),
                    force_suspicious=(event_info["type"] in ["suspicious", "high_risk"]),
                    key_event=action,
                    event_type=event_info["type"]
                )
                key_emails.append(email)

        return key_emails

    def _generate_single_email(self, date, force_suspicious=None, key_event=None, event_type=None, email_type="daily"):
        """生成单个邮件，增加伪装技巧"""
        # 确定邮件类型
        is_suspicious = force_suspicious if force_suspicious is not None else (email_type == "suspicious")

        # 添加伪装策略
        disguise_strategies = {
            "normal_business": {
                "subject_style": "formal",
                "content_style": "professional",
                "attachment_type": "standard"
            },
            "data_leak": {
                "subject_style": "casual",
                "content_style": "indirect",
                "attachment_type": "disguised"
            },
            "competitor_contact": {
                "subject_style": "business",
                "content_style": "ambiguous",
                "attachment_type": "encrypted"
            }
        }

        # 选择伪装策略
        strategy = disguise_strategies["normal_business"]
        if is_suspicious:
            strategy = random.choice([
                disguise_strategies["data_leak"],
                disguise_strategies["competitor_contact"]
            ])

        # 选择项目
        project = random.choice(list(self.projects.keys()))

        # 选择收发件人
        recipient = self._select_recipient(project, is_suspicious, event_type)
        is_outgoing = random.random() < 0.5  # 50%概率是发出的邮件

        # 构建场景
        scenario = {
            "time_period": date.strftime("%Y-%m-%d"),
            "type": email_type,
            "project": project,
            "recipient": recipient if is_outgoing else self.my_name,
            "purpose": key_event if key_event else "regular_communication"
        }

        # 生成邮件内容
        content = self.generate_email_content(scenario)

        # 添加多级转发链
        if email_type in ["suspicious", "business"]:
            content = self._add_forwarding_chain(content, is_suspicious)

        # 构建邮件记录，字段顺序与pst_parser.py保持一致
        email = {
            '发件人': f"{self.my_name} <{self.my_email}>" if is_outgoing else recipient,
            '时间戳': self._generate_timestamp(date, is_suspicious).strftime('%Y-%m-%d %H:%M:%S'),
            '主题': self._generate_subject(project, is_suspicious, event_type),
            '邮件线程': self.normalize_subject(self._generate_subject(project, is_suspicious, event_type)),
            '附件数量': random.randint(1, 3) if is_suspicious else random.randint(0, 1),
            '附件信息': self.generate_attachment_info(project, is_suspicious, event_type),
            '文件夹路径': self.email_folders["sent"] if is_outgoing else self.email_folders["inbox"],
            '文件夹类型': self.get_folder_type(self.email_folders["sent"] if is_outgoing else self.email_folders["inbox"]),
            '收件人': recipient if is_outgoing else f"{self.my_name} <{self.my_email}>",
            '邮件正文': content
        }

        return email

    def _select_recipient(self, project, is_suspicious, event_type=None):
        """选择收件人，增加隐蔽性"""
        if is_suspicious:
            if event_type == "high_risk":
                # 高风险行为：使用个人邮箱或通过中间人
                if random.random() < 0.4:
                    return "张明 <zhangming_2023@gmail.com>"
                else:
                    # 通过可信任的内部人员中转
                    trusted = ["王婷", "周静"]
                    contact_name = random.choice(trusted)
                    return f"{contact_name} <{self.contacts['internal'][contact_name]}>"
            else:
                # 普通可疑行为：通过正常业务掩护
                if random.random() < 0.3:
                    # 通过客户方中转
                    contact = random.choice(list(self.contacts["client"].items()))
                    return f"{contact[0]} <{contact[1]}>"
                elif random.random() < 0.4:
                    # 直接与竞争对手联系（伪装成正常商务往来）
                    contact = random.choice(list(self.contacts["competitor"].items()))
                    return f"{contact[0]} <{contact[1]}>"

        # 正常邮件
        project_contacts = self.projects[project]["contacts"]
        contact_name = random.choice(project_contacts)
        for category in self.contacts.values():
            if contact_name in category:
                return f"{contact_name} <{category[contact_name]}>"

        return "项目组 <project_team@wisdomtech.com>"

    def _generate_subject(self, project, is_suspicious, event_type=None):
        """生成更具隐蔽性的邮件主题"""
        if is_suspicious:
            if event_type == "high_risk":
                templates = {
                    "数据处理": ["数据清理报告", "存储空间整理", "历史记录归档"],
                    "工作交接": ["工作总结", "项目情况汇总", "经验分享"],
                    "会议沟通": ["业务交流", "技术探讨", "行业分享"]
                }
            else:
                templates = {
                    "文档管理": ["文档结构优化", "知识库更新", "资料整理总结"],
                    "系统维护": ["系统升级方案", "运维报告", "性能优化建议"],
                    "项目管理": ["项目进展汇报", "阶段性总结", "工作计划"]
                }

            category = random.choice(list(templates.keys()))
            template = random.choice(templates[category])
        else:
            category = random.choice(list(self.templates["normal"].keys()))
            template = random.choice(self.templates["normal"][category])

        return template.format(
            project=project,
            phase=random.choice(["需求分析", "设计", "开发", "测试", "部署"]),
            doc=random.choice(self.projects[project]["docs"])
        )

    def _add_forwarding_chain(self, email_content, is_suspicious):
        """添加多级转发链"""
        if is_suspicious and random.random() < 0.3:
            # 通过可信中间人转发
            trusted_contacts = ["王婷", "周静", "李强"]
            forwarding_chain = []

            # 随机生成2-3级转发链
            chain_length = random.randint(2, 3)
            for _ in range(chain_length):
                forwarder = random.choice(trusted_contacts)
                forwarding_chain.append(forwarder)

            # 构建转发内容
            for forwarder in reversed(forwarding_chain):
                email_content = f"转发: \n发件人: {forwarder}\n\n{email_content}"

        return email_content

    def _generate_timestamp(self, date, is_suspicious):
        """生成更真实的时间戳"""
        base_time = datetime.strptime(date.strftime("%Y-%m-%d"), "%Y-%m-%d")

        if is_suspicious:
            # 可疑时间特征
            if random.random() < 0.4:
                # 非工作时间
                hour = random.choice(list(range(0, 7)) + list(range(19, 24)))
                minute = random.randint(0, 59)
            else:
                # 正常工作时间，降低可疑性
                hour = random.randint(9, 17)
                minute = random.randint(0, 59)
        else:
            # 正常工作时间
            hour = random.randint(9, 17)
            minute = random.randint(0, 59)

        return base_time + timedelta(hours=hour, minutes=minute)

    def get_folder_type(self, folder_path):
        """Classify a folder path as inbox, sent box or other.

        Args:
            folder_path: Folder path string.

        Returns:
            ``"收件箱"`` or ``"发件箱"`` when the lower-cased path contains that
            name (inbox is checked first), otherwise ``"其他"``.
        """
        lowered = folder_path.lower()
        for marker in ("收件箱", "发件箱"):
            if marker in lowered:
                return marker
        return "其他"

    def normalize_subject(self, subject):
        """Normalise a subject line into a thread key.

        Args:
            subject: Raw subject; may be empty or ``None``.

        Returns:
            ``"无主题"`` for an empty subject. Otherwise the subject lower-cased,
            stripped of surrounding whitespace, and with one leading ``re:``,
            ``fw:`` or ``fwd:`` prefix (plus following whitespace) removed.
        """
        if not subject:
            return "无主题"
        cleaned = subject.lower().strip()
        # The pattern is anchored at the start, so only the first prefix is dropped.
        return re.sub(r'^(re:|fw:|fwd:)\s*', '', cleaned)

def main():
    """Generate 100 simulated emails, save them as CSV and print a summary.

    The report is written to ``output/metadata_report.csv``. Any exception is
    logged with its traceback and re-raised.
    """
    try:
        # Create the output directory before building the generator:
        # EmailGenerator() configures logging with a file handler at
        # output/email_generation.log, which fails if the directory is missing.
        output_dir = Path("output")
        output_dir.mkdir(exist_ok=True)

        generator = EmailGenerator()
        df = generator.generate_dataset(100)

        # Save as CSV.
        output_path = output_dir / "metadata_report.csv"
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"\n模拟邮件数据已生成：{output_path}")

        # Print summary statistics.
        print("\n数据统计：")
        print(f"总邮件数：{len(df)}")
        print(f"包含附件的邮件数：{len(df[df['附件数量'] > 0])}")
        print(f"时间范围：{df['时间戳'].min()} 至 {df['时间戳'].max()}")

    except Exception as e:
        logging.error(f"生成过程出错: {str(e)}", exc_info=True)
        raise

# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()