-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathparse_fl_logs.py
More file actions
114 lines (92 loc) · 4.13 KB
/
Copy pathparse_fl_logs.py
File metadata and controls
114 lines (92 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
"""
从 fl.py 运行日志中解析 issue 匹配信息
用法: python parse_fl_logs.py <log_file>
"""
import re
import sys
import json
# Log line announcing which instance the following lines belong to.
_INSTANCE_RE = re.compile(r'Instance ID: ([^\s]+)')
# Log line carrying the title extracted from the problem description.
_EXPECTED_TITLE_RE = re.compile(r'Extracted title from problem description: (.+)$')
# Log line reporting the best matching PR/issue and its similarity score.
# The score group accepts exactly one number (optionally with an exponent),
# so punctuation after the score ("0.85.") never reaches float().
_BEST_MATCH_RE = re.compile(
    r'Found best match (?:PR|Issue) #(\d+), similarity: '
    r'((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
)


def parse_log_file(log_path):
    """Parse a log file and extract issue-match records.

    The file is scanned line by line for three kinds of lines:

    * ``Instance ID: <id>`` sets the instance for the records that follow.
    * ``Extracted title from problem description: <title>`` sets the
      expected title for the records that follow.
    * ``Found best match PR|Issue #<n>, similarity: <score>`` creates a
      record, provided an instance ID has already been seen.

    After a "Found best match" line, the first following line that is
    non-empty and does not start with ``[`` is stored as the record's
    ``matched_title``.

    Args:
        log_path: Path of the log file. It is read as UTF-8 and undecodable
            bytes are ignored.

    Returns:
        A list of dicts in log order with the keys ``instance_id``,
        ``expected_title`` (``None`` if no title was seen yet),
        ``matched_issue`` (the number as a string) and ``similarity``
        (float). ``matched_title`` is present only when a title line was
        found for the record.

    Raises:
        OSError: If the file cannot be opened.
    """
    matches = []
    current_instance = None
    # NOTE(review): the expected title is carried over to later instances
    # until a new one is logged -- confirm this is intended.
    current_expected_title = None
    # Record still waiting for its title line. Kept as a local instead of a
    # marker key inside the dict so that no bookkeeping leaks into the result.
    awaiting_title = None

    with open(log_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Instance ID line.
            instance_match = _INSTANCE_RE.search(line)
            if instance_match:
                current_instance = instance_match.group(1)
                # A new instance begins: whatever follows can no longer be
                # the title of the previous instance's match.
                awaiting_title = None
                continue

            # Expected title line.
            title_match = _EXPECTED_TITLE_RE.search(line)
            if title_match:
                current_expected_title = title_match.group(1).strip()
                continue

            # Best-match line; only recorded once an instance is known.
            best_match = _BEST_MATCH_RE.search(line)
            if best_match and current_instance:
                record = {
                    'instance_id': current_instance,
                    'expected_title': current_expected_title,
                    'matched_issue': best_match.group(1),
                    'similarity': float(best_match.group(2)),
                }
                matches.append(record)
                # The matched title is expected on one of the next lines.
                awaiting_title = record
                continue

            # Title of the matched issue (usually the line right after
            # "Found best match"); blank lines and lines starting with "["
            # are skipped while waiting.
            if awaiting_title is not None:
                title_line = line.strip()
                if title_line and not title_line.startswith('['):
                    awaiting_title['matched_title'] = title_line
                    awaiting_title = None

    return matches
def main():
    """Command-line entry point: parse a log and report low-similarity matches.

    Reads the log path from ``sys.argv[1]``, parses it with
    ``parse_log_file``, prints every match whose similarity is below 0.5
    (lowest first), writes all matches to ``parsed_issue_matches.json`` in
    the current working directory and prints summary statistics.

    Prints a usage message and exits with status 1 when no log path is given.
    """
    if len(sys.argv) < 2:
        print("用法: python parse_fl_logs.py <log_file>")
        print("\n示例:")
        print(" # 运行并记录日志:")
        print(" python kgcompass/fl.py instance_id repo_path output_dir 2>&1 | tee run.log")
        print(" # 解析日志:")
        print(" python parse_fl_logs.py run.log")
        sys.exit(1)

    log_file = sys.argv[1]
    print(f"解析日志文件: {log_file}")
    matches = parse_log_file(log_file)
    print(f"\n找到 {len(matches)} 个 issue 匹配记录\n")

    # Sort by similarity, lowest first.
    matches.sort(key=lambda x: x['similarity'])

    # Matches below this similarity are reported as suspicious.
    threshold = 0.5
    low_similarity = [m for m in matches if m['similarity'] < threshold]

    if low_similarity:
        print(f"低相似度匹配(< {threshold}):{len(low_similarity)} 个\n")
        print("=" * 100)
        for i, match in enumerate(low_similarity, 1):
            print(f"\n{i}. {match['instance_id']}")
            # 'expected_title' is always present but may be None, so a
            # .get() default would never apply; fall back with `or`.
            print(f" 预期标题: {match.get('expected_title') or 'N/A'}")
            print(f" 匹配 Issue: #{match['matched_issue']}")
            print(f" 匹配标题: {match.get('matched_title', 'N/A')}")
            print(f" 相似度: {match['similarity']:.3f}")
    else:
        print(f"所有匹配的相似度都 >= {threshold}")

    # Save the full result set.
    output_file = "parsed_issue_matches.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(matches, f, indent=2, ensure_ascii=False)
    print(f"\n完整结果已保存到: {output_file}")

    # Summary statistics.
    print("\n统计信息:")
    print(f" 总匹配数: {len(matches)}")
    print(f" 低相似度 (< {threshold}): {len(low_similarity)}")
    if matches:
        avg_similarity = sum(m['similarity'] for m in matches) / len(matches)
        print(f" 平均相似度: {avg_similarity:.3f}")
        print(f" 最低相似度: {min(m['similarity'] for m in matches):.3f}")
        print(f" 最高相似度: {max(m['similarity'] for m in matches):.3f}")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()