-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathverify_bug_with_llm.py
More file actions
773 lines (648 loc) · 36.9 KB
/
verify_bug_with_llm.py
File metadata and controls
773 lines (648 loc) · 36.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
#!/usr/bin/env python3
"""
Bug Verification Module
This module handles verification of potential bugs found during test execution
by using LLMs to analyze whether they are real issues or false positives.
"""
import logging
import re
import traceback
import time
from feedback import call_anthropic_api, call_gpt_api, call_deepseek_api, reset_llm_metrics, get_llm_metrics_summary
# Configure logging at import time: basicConfig installs a default INFO-level
# handler on the root logger (it does nothing if the root logger already has
# handlers), and every function in this module logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("bug_verification")
def _default_verification(confidence, reasoning):
    """Return the verdict used when no LLM judgment is available.

    The bug is reported as real only when the pre-verification confidence
    is above 0.7.
    """
    return {
        "is_real_bug": confidence > 0.7,
        "confidence": confidence,
        "reasoning": reasoning,
    }


def _parse_verification_response(response):
    """Extract ``(is_real_bug, confidence, reasoning)`` from an LLM reply.

    The structured ``VERDICT / CONFIDENCE / REASONING`` format is preferred;
    when no verdict line is present the text is classified heuristically.
    """
    # Tolerate quotes and markdown emphasis between the label and the verdict
    # (e.g. `**VERDICT:** "REAL BUG"`).
    verdict_match = re.search(
        r'VERDICT:[\s*"\']*(REAL BUG|FALSE POSITIVE)', response, re.IGNORECASE
    )
    if verdict_match:
        is_real_bug = verdict_match.group(1).upper() == "REAL BUG"

        confidence_match = re.search(
            r'CONFIDENCE:[\s*]*(\d+(?:\.\d+)?)', response, re.IGNORECASE
        )
        if confidence_match:
            # The prompt asks for a 1-10 score; convert to 0-1 and cap at 0.95.
            verification_confidence = min(float(confidence_match.group(1)) / 10, 0.95)
        else:
            verification_confidence = 0.7  # Default when no score was given

        # Only the label is case-insensitive: the lookahead stays strict so
        # the words "confidence:" / "verdict:" inside the reasoning text do
        # not truncate it.
        reasoning_match = re.search(
            r'(?i:REASONING):(.+?)(?=VERDICT:|CONFIDENCE:|\Z)', response, re.DOTALL
        )
        reasoning = reasoning_match.group(1).strip() if reasoning_match else response[:500]
        return is_real_bug, verification_confidence, reasoning

    # Unstructured reply. The negated phrase must be tested first because
    # "not a real bug" also contains the substring "real bug".
    lowered = response.lower()
    reasoning = response[:500]
    if re.search(r'not\s+a\s+real\s+bug', lowered):
        return False, 0.8, reasoning
    if re.search(r'real\s+bug', lowered):
        return True, 0.8, reasoning
    if "false positive" in lowered:
        return False, 0.8, reasoning

    # No explicit statement: count weaker signals in each direction.
    positive_signals = ("real issue", "actual bug", "code defect", "exposes a problem",
                        "defect in the class", "vulnerability", "should be fixed")
    negative_signals = ("unreasonable test", "test method issue", "test environment problem",
                        "not a bug", "expected behavior", "by design", "unreasonable expectation",
                        "edge case that", "not realistic", "documented limitation")
    pos_count = sum(1 for signal in positive_signals if signal in lowered)
    neg_count = sum(1 for signal in negative_signals if signal in lowered)
    is_real_bug = pos_count > neg_count
    # Confidence grows with the margin between the two counts, from 0.6 up to 0.9.
    verification_confidence = 0.6 + min(0.3, 0.05 * abs(pos_count - neg_count))
    return is_real_bug, verification_confidence, reasoning


def verify_bug_with_llm(bug_info, test_method, source_code, class_name):
    """
    Use an LLM to decide whether a detected bug is real or a false positive.

    Cheap pre-filters run first (known false-positive assertion patterns,
    memory errors, already high confidence); only the remaining cases are
    sent to the LLM. Anthropic is tried first and GPT is used when that call
    raises.

    Parameters:
    bug_info (dict): Information about the detected bug. The keys read are
        "type", "error", "severity" and "confidence"; ``None`` is treated
        as an empty dict.
    test_method (str): The test method code that triggered the bug
    source_code (str): The source code of the class being tested
    class_name (str): Name of the class being tested

    Returns:
    dict: "is_real_bug" (bool), "confidence" (0-1 float) and "reasoning"
        (str). Results that come from an LLM reply additionally carry
        "full_response" (the reply truncated to 1000 characters).
    """
    bug_info = bug_info or {}
    bug_type = bug_info.get("type", "unknown")
    # `or ""` guards against an explicit None, which would break the
    # substring checks below.
    error_message = bug_info.get("error") or ""
    severity = bug_info.get("severity", "medium")
    try:
        confidence = float(bug_info.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5

    # Defensive checks for inputs
    if not test_method or not source_code:
        logger.warning("Missing test method or source code for bug verification")
        return _default_verification(confidence, "Insufficient data for verification")

    # Pre-filter common false positives without an LLM call: an assertion
    # that expected null/empty but got a value, in a test about null/empty
    # input, usually means the test expectation itself is wrong.
    if bug_type == "assertion_failure" and "but was: <" in error_message:
        expects_empty = any(
            marker in error_message
            for marker in ("expected: <null>", "expected: <[]>", "expected: <>")
        )
        lowered_test = test_method.lower()
        if expects_empty and ("null" in lowered_test or "empty" in lowered_test):
            return {
                "is_real_bug": False,
                "confidence": 0.9,
                "reasoning": "This is a common false positive for empty/null tests - the test expectation is likely incorrect"
            }

    # Auto-verify high-confidence memory errors
    if bug_type == "memory_error" or "OutOfMemoryError" in error_message or "StackOverflowError" in error_message:
        return {
            "is_real_bug": True,
            "confidence": 0.95,
            "reasoning": "Memory errors are almost always real bugs, typically indicating infinite recursion or excessive memory allocation"
        }

    # If confidence is already very high, skip verification
    if confidence > 0.9:
        logger.info(f"Skipping verification for high-confidence bug: {bug_type}")
        return {
            "is_real_bug": True,
            "confidence": confidence,
            "reasoning": "High confidence pre-verification"
        }

    # Prepare analysis prompt
    prompt = f"""
You are a professional Java analysis expert specializing in identifying real bugs and false positives in unit tests.
I will provide you with the source code of a Java class, a test method, and information about a potential bug found during testing.
Please analyze whether this is a real bug in the source code or just a false positive caused by testing environment or code issues.
Class name: {class_name}
Source code:
```java
{source_code}
```
Test method:
```java
{test_method}
```
Issue found:
- Bug type: {bug_type}
- Severity: {severity}
- Error message: {error_message}
Please analyze whether the issue found by this test method is a real bug in the source code or a false positive due to test code issues or environment problems.
Please provide your response in this specific format:
1. VERDICT: "REAL BUG" or "FALSE POSITIVE"
2. CONFIDENCE: A number between 1-10
3. REASONING: Your detailed analysis and reasoning
The analysis should particularly consider:
1. Whether the error is caused by the test code itself (e.g., test environment configuration, test dependencies, etc.)
2. Whether the issue actually exposes a defect in the class being tested
3. Whether the test method is reasonable or if it's testing unreasonable/extreme edge cases
4. Whether the test expectations match the intended behavior of the class
For CONFIDENCE score, use these guidelines:
- 9-10: Very confident in the assessment
- 7-8: Confident but with some uncertainty
- 5-6: Moderately confident
- 1-4: Significant uncertainty
"""

    try:
        # Attempt to use Anthropic first for verification
        logger.info(f"Verifying potential {bug_type} bug in class {class_name}")
        response = call_anthropic_api(prompt)
        if not response or len(response) < 50:
            logger.warning(f"Insufficient response from API for bug verification: {response}")
            # Default to pre-verification confidence
            return _default_verification(confidence, "Verification failed - insufficient API response")
    except Exception as e:
        logger.warning(f"Failed to call Anthropic API, falling back to GPT: {str(e)}")
        try:
            # Fall back to GPT
            response = call_gpt_api(prompt)
            if not response or len(response) < 50:
                raise ValueError("Insufficient response")
        except Exception as e2:
            logger.error(f"Failed to verify bug with LLM: {str(e2)}")
            # If both fail, rely on the existing confidence score
            return _default_verification(confidence, "Unable to perform LLM verification")

    is_real_bug, verification_confidence, reasoning = _parse_verification_response(response)

    # Log the verification result
    result_type = "REAL BUG" if is_real_bug else "FALSE POSITIVE"
    logger.info(f"Bug verification result: {result_type} with confidence {verification_confidence:.2f}")
    return {
        "is_real_bug": is_real_bug,
        "confidence": verification_confidence,
        "reasoning": reasoning[:500] if reasoning else "No detailed reasoning provided",
        "full_response": response[:1000]  # Store truncated response for debugging
    }
def _mark_false_positive(method, confidence, reasoning):
    """Flag *method* as a verified false positive and return the entry to report.

    Dict entries are updated in place; any other value is wrapped in a new
    dict so that it is not silently dropped from the results.
    """
    if isinstance(method, dict):
        method["verified"] = True
        method["is_real_bug"] = False
        method["verification_confidence"] = confidence
        method["verification_reasoning"] = reasoning
        return method
    return {
        "code": method,
        "verified": True,
        "is_real_bug": False,
        "verification_confidence": confidence,
        "verification_reasoning": reasoning,
        "bug_info": [],
    }


def _build_batch_prompt(batch, source_code, class_name, package_name):
    """Build the LLM prompt asking for a verdict on every method in *batch*."""
    prompt = f"""You are a Java testing expert. You need to analyze the following test methods to determine if they likely identify real bugs in the code under test.
Source class: {package_name}.{class_name}
Source code snippet:
```java
{source_code[:2500]}
```
Potential bug-finding test methods:
"""
    for i, method in enumerate(batch):
        if isinstance(method, dict) and "code" in method:
            method_code = method["code"]
            method_bugs = method.get("bug_info", [])
            if method_bugs:
                # List at most three bug types; the rest are only counted.
                bug_info = ", ".join(
                    str(bug.get("type", "Unknown")) if isinstance(bug, dict) else str(bug)
                    for bug in method_bugs[:3]
                )
                if len(method_bugs) > 3:
                    bug_info += f", and {len(method_bugs) - 3} more"
            else:
                bug_info = "Unknown issue"
            prompt += f"\nMethod {i+1}:\n```java\n{method_code}\n```\n\nDetected issues: {bug_info}\n"
        else:
            prompt += f"\nMethod {i+1}:\n```java\n{method}\n```\n\n"
    prompt += """
For each method, determine if it's testing a real bug or potential issue in the code, rather than just a feature or expected behavior.
Criteria for a real bug:
- The test identifies an actual flaw, exception, or unexpected behavior
- The behavior being tested violates the expected contract or reasonable assumptions for the class
- It's not just testing a documented limitation or expected boundary condition
For each method, provide:
1. Is it likely detecting a real bug/issue? Please answer with a Yes/No
2. A brief explanation of your reasoning
3. A "confidence" score from 1-10 on whether this is a genuine bug
Then provide a final list of real bugs in this exact format:
REAL_BUGS: [comma-separated method numbers]
For example, if methods 2, 5, and 8 are real bugs, end your response with:
REAL_BUGS: 2, 5, 8
"""
    return prompt


def _extract_real_bug_indices(result, batch_len):
    """Return the sorted 0-based indices of the methods the LLM called real bugs."""
    indices = set()

    # Preferred format. Brackets and markdown emphasis after the label are
    # skipped so "REAL_BUGS: [1, 2]" parses, and the last occurrence wins
    # because the prompt asks for the list at the end of the reply.
    explicit_lists = re.findall(r"REAL_BUGS:[\s*\[]*([\d,\s]*)", result)
    if explicit_lists:
        logger.info("Found explicit REAL_BUGS format in response")
        indices.update(int(num) - 1 for num in re.findall(r"\d+", explicit_lists[-1]))
    else:
        # Fallback strategy 1: "Method N ... real bug" on a single line,
        # ignoring negated statements such as "Method 1 is not a real bug".
        list_matches = [
            match.group(1)
            for match in re.finditer(r"Method\s+(\d+)(.*?)real bug", result, re.IGNORECASE)
            if not re.search(r"\b(?:not|isn't|unlikely)\b|\bno\s*$", match.group(2), re.IGNORECASE)
        ]
        if list_matches:
            logger.info(f"Found {len(list_matches)} methods in 'list' format")
            indices.update(int(num) - 1 for num in list_matches)

        # Fallback strategy 2: individual Yes/No judgments
        if not any(0 <= idx < batch_len for idx in indices):
            logger.info("Attempting to extract from individual Yes/No judgments")
            method_judgments = re.findall(
                r"Method\s+(\d+).*?(?::|is)\s*(Yes|No|yes|no|TRUE|FALSE|True|False)",
                result,
                re.IGNORECASE | re.DOTALL
            )
            indices.update(
                int(num) - 1
                for num, judgment in method_judgments
                if judgment.lower() in ("yes", "true")
            )

    return sorted(idx for idx in indices if 0 <= idx < batch_len)


def _extract_confidence_scores(result, batch_len):
    """Map 0-based method index to the LLM's 1-10 confidence, scaled to 0-1."""
    scores = {}
    confidence_pattern = r"Method\s+(\d+).*?[Cc]onfidence:?\s*(\d+)(?:\s*/\s*10)?"
    for method_num, score in re.findall(confidence_pattern, result, re.IGNORECASE | re.DOTALL):
        idx = int(method_num) - 1
        if 0 <= idx < batch_len:
            # Clamp in case the model answers on another scale (e.g. percent).
            scores[idx] = min(max(float(score) / 10.0, 0.0), 1.0)
    return scores


def _extract_method_reasoning(result, method_number):
    """Return the LLM's explanation for one method, or '' when none is found."""
    # (?!\d) keeps "Method 1" from matching the start of "Method 10".
    method_pattern = (
        r"Method\s+" + re.escape(str(method_number)) + r"(?!\d)"
        r".*?(?:Yes|No).*?(?:Reason(?:ing)?:|explanation)?\s*(.*?)(?=Method\s+\d+|$|REAL_BUGS:)"
    )
    reasoning_match = re.search(method_pattern, result, re.IGNORECASE | re.DOTALL)
    if not reasoning_match:
        return ""
    # Drop the confidence line so only the explanation remains.
    return re.sub(r'Confidence:?\s*\d+(/10)?', '', reasoning_match.group(1).strip()).strip()


def filter_verified_bug_methods(bug_methods, source_code, class_name, package_name, test_code=None):
    """
    Use an LLM to verify, in batches, which bug-finding methods are likely real bugs.

    Methods already flagged "verified" are passed through unchanged. Methods
    that call known-incompatible APIs or mention unresolved symbols are
    dropped. Methods matching common false-positive patterns are marked
    without an LLM call. Everything else is sent to the LLM in batches of
    five.

    Parameters:
    bug_methods (list): Potential bug-finding methods; each item is either a
        dict with a "code" entry (optionally "bug_info", "verified") or a
        plain value that is treated as the method code
    source_code (str): Source code of the class (first 2500 characters are
        included in the prompt)
    class_name (str): Class name
    package_name (str): Package name
    test_code (str): Optional full test code for context (currently unused)

    Returns:
    list: Dicts carrying "verified", "is_real_bug" and
        "verification_confidence" (plus "verification_reasoning" where one
        is available). On an unexpected error the results gathered so far
        are returned.
    """
    verified_methods = []
    try:
        if not bug_methods:
            return []
        logger.info(f"Batch filtering {len(bug_methods)} potential bug methods...")

        # Check for methods that were already verified and skip them
        methods_to_verify = []
        for method in bug_methods:
            if isinstance(method, dict) and "code" in method and method.get("verified", False):
                verified_methods.append(method)
            else:
                methods_to_verify.append(method)
        if not methods_to_verify:
            logger.info("All methods already verified, returning cached results")
            return verified_methods

        # Continue only verifying unverified methods
        logger.info(f"Verifying {len(methods_to_verify)} unverified methods")

        # Drop methods that obviously cannot compile against the class
        incompatible_methods = ("setValuesList", "setDeprecated", "addValuesList",
                                "privateMethod", "inaccessible")
        compiler_incompatible = set()
        for idx, method in enumerate(methods_to_verify):
            method_code = method["code"] if isinstance(method, dict) and "code" in method else str(method)
            offending = next((name for name in incompatible_methods if name in method_code), None)
            if offending is not None:
                logger.warning(f"Method {idx+1} uses incompatible API call: {offending}")
                compiler_incompatible.add(idx)
            # Don't exclude constructor tests unless they reference undefined symbols
            if "cannot find symbol" in method_code or "cannot resolve symbol" in method_code:
                logger.warning(f"Method {idx+1} references undefined symbols")
                compiler_incompatible.add(idx)

        # Auto-mark common false positive patterns; the rest go to the LLM
        pending = []
        for idx, method in enumerate(methods_to_verify):
            if idx in compiler_incompatible:
                continue
            if isinstance(method, dict) and "code" in method:
                method_code = method["code"]
                if "expected:" in method_code and "but was:" in method_code:
                    verified_methods.append(_mark_false_positive(
                        method, 0.9,
                        "Assertion failure is due to mismatched expectations, not a real bug in the code."))
                    continue
                if "Expected exception to be thrown" in method_code:
                    verified_methods.append(_mark_false_positive(
                        method, 0.9,
                        "Test expects exception that is not thrown - likely due to changed behavior."))
                    continue
            pending.append(method)

        # If all methods have been processed, or no methods remain, return verified results
        if not pending:
            return verified_methods

        # Batch process remaining methods in smaller chunks to avoid LLM context limits
        batch_size = 5
        all_batches = [pending[i:i+batch_size] for i in range(0, len(pending), batch_size)]
        for batch_idx, batch in enumerate(all_batches):
            logger.info(f"Processing batch {batch_idx+1}/{len(all_batches)} with {len(batch)} methods")
            try:
                prompt = _build_batch_prompt(batch, source_code, class_name, package_name)
                result = call_anthropic_api(prompt, max_tokens=8192)
                if not result or len(result) < 100:
                    logger.warning("Insufficient response from LLM for batch bug verification")
                    for method in batch:
                        verified_methods.append(_mark_false_positive(
                            method, 0.7,
                            "Automated assessment: likely false positive due to insufficient LLM response"))
                    continue

                verified_indices = _extract_real_bug_indices(result, len(batch))
                if verified_indices:
                    logger.info(f"Detected real bugs in methods: {[i+1 for i in verified_indices]}")
                else:
                    logger.warning("No real bugs detected in this batch")
                confidence_scores = _extract_confidence_scores(result, len(batch))

                # Collect the batch's results first so that an error half-way
                # through cannot leave the batch partially (or doubly) reported.
                batch_results = []
                for idx, method in enumerate(batch):
                    if not isinstance(method, dict):
                        batch_results.append({
                            "code": method,
                            "verified": True,
                            "is_real_bug": idx in verified_indices,
                            "verification_confidence": confidence_scores.get(idx, 0.5),
                            "bug_info": []
                        })
                        continue
                    method_copy = method.copy()
                    method_copy["verified"] = True
                    if idx in verified_indices:
                        method_copy["is_real_bug"] = True
                        method_copy["verification_confidence"] = confidence_scores.get(idx, 0.7)
                        method_copy["verification_reasoning"] = (
                            _extract_method_reasoning(result, idx + 1)
                            or "LLM verification identified this as a real bug"
                        )
                    else:
                        # The LLM score rates "is a genuine bug", so invert it
                        # to express confidence in the false-positive verdict.
                        method_copy["is_real_bug"] = False
                        method_copy["verification_confidence"] = 1.0 - confidence_scores.get(idx, 0.3)
                        method_copy["verification_reasoning"] = "LLM verification determined this is likely a false positive"
                    batch_results.append(method_copy)
                verified_methods.extend(batch_results)

                # Add a short delay between batch requests to avoid rate limiting
                if batch_idx < len(all_batches) - 1:
                    time.sleep(1)
            except Exception as e:
                logger.error(f"Error in batch LLM verification of bugs: {str(e)}")
                logger.error(traceback.format_exc())
                # Treat the whole batch as false positives due to the error
                for method in batch:
                    verified_methods.append(_mark_false_positive(
                        method, 0.8,
                        "Default assessment due to verification error: likely false positive"))

        # Tally up verified bugs vs false positives
        verified_real_bugs = sum(1 for m in verified_methods if m.get("is_real_bug", False))
        verified_false_positives = sum(
            1 for m in verified_methods
            if m.get("verified", False) and not m.get("is_real_bug", False)
        )
        logger.info(f"Verified {len(verified_methods)} methods: {verified_real_bugs} real bugs, {verified_false_positives} false positives")
        return verified_methods
    except Exception as e:
        logger.error(f"Failed to filter bug methods: {str(e)}")
        logger.error(traceback.format_exc())
        return verified_methods
def merge_verified_bug_tests(base_test, verified_bug_methods, class_name, package_name, project_dir, source_code):
    """
    Merge verified bug-finding test methods into a base test class.

    Only dict entries whose "is_real_bug" flag is true (or absent) are
    considered. A method is skipped when its signature already exists in the
    base test or when the first 40 characters of its normalized body occur in
    the base test; a method whose name is already taken is renamed with a
    numeric suffix. Accepted methods are inserted before the last closing
    brace of the base test.

    Parameters:
    base_test (str): Base test code
    verified_bug_methods (list): List of verified bug-finding methods; the
        merged entries get their "name" and "code" keys updated in place
    class_name (str): Class name (unused here)
    package_name (str): Package name (unused here)
    project_dir (str): Project directory (unused here)
    source_code (str): Source code (unused here)

    Returns:
    tuple: (enhanced_test, bug_info_dict). On failure the base test is
        returned unchanged together with an {"error": ...} dict.
    """
    # Import traceback locally so it is guaranteed to be available in the
    # error handler below (kept from the original to avoid an earlier error).
    import traceback

    if base_test is None:
        logger.error("Base test is None, cannot merge")
        return None, {"error": "Base test is None"}
    if not verified_bug_methods:
        logger.info("No verified bug methods to merge")
        return base_test, {"merged_methods": 0}

    try:
        # Keep only the methods that are real bugs
        real_bug_methods = [m for m in verified_bug_methods
                            if isinstance(m, dict) and m.get("is_real_bug", True)]
        if not real_bug_methods:
            logger.info("No real bug methods to merge after filtering")
            return base_test, {"merged_methods": 0, "message": "No real bugs after filtering"}
        logger.info(f"Merging {len(real_bug_methods)} verified real bug methods into base test")

        # Names already queued in this call; two incoming methods sharing a
        # name would otherwise both be merged and clash in the output class.
        queued_names = set()

        def name_taken(name):
            """Return True if *name* is used by the base test or a queued method."""
            if name in queued_names:
                return True
            return re.search(r'void\s+' + re.escape(name) + r'\s*\(', base_test) is not None

        methods_to_merge = []
        for method in real_bug_methods:
            method_code = method.get("code", "")
            if not method_code:
                continue

            # Extract the method name and signature
            name_match = re.search(r'void\s+(\w+)\s*\(', method_code)
            if not name_match:
                continue
            method_name = name_match.group(1)
            # Try to extract the full signature (including the parameter list)
            full_signature_match = re.search(r'void\s+(\w+\s*\([^)]*\))', method_code)
            method_signature = full_signature_match.group(1).strip() if full_signature_match else method_name + "()"

            # 1. Skip when the exact signature already exists in the base test
            if re.search(r'void\s+' + re.escape(method_signature), base_test):
                logger.info(f"Method with signature '{method_signature}' already exists in base test, skipping")
                continue

            # 2. Same name but different signature (or a name queued earlier):
            #    rename to avoid a conflict
            if name_taken(method_name):
                suffix = 1
                while name_taken(f"{method_name}_{suffix}"):
                    suffix += 1
                new_name = f"{method_name}_{suffix}"
                logger.info(f"Renaming method from '{method_name}' to '{new_name}' to avoid conflict")
                # A callable replacement keeps an absent visibility modifier
                # (unmatched group 1) from being an issue.
                method_code = re.sub(
                    r'(public\s+|private\s+|protected\s+)?void\s+' + re.escape(method_name) + r'\s*\(',
                    lambda m: (m.group(1) or "") + "void " + new_name + "(",
                    method_code
                )
                # Update any references to the method's own name
                method_code = method_code.replace(f"Method {method_name}", f"Method {new_name}")
                method_code = method_code.replace(f"Test {method_name}", f"Test {new_name}")
                method_name = new_name

            # 3. Skip when a method with the same content seems to exist:
            #    strip the header, comments and whitespace, then look for the
            #    first 40 characters of the body as a fingerprint.
            cleaned_method_body = re.sub(r'@Test.*?void\s+\w+\s*\([^{]*\{', '', method_code, flags=re.DOTALL)
            cleaned_method_body = re.sub(r'//.*?$', '', cleaned_method_body, flags=re.MULTILINE)
            cleaned_method_body = re.sub(r'/\*.*?\*/', '', cleaned_method_body, flags=re.DOTALL)
            cleaned_method_body = re.sub(r'\s+', ' ', cleaned_method_body).strip()
            if len(cleaned_method_body) > 40 and cleaned_method_body[:40] in base_test:
                logger.info(f"Method body appears similar to existing code, skipping")
                continue

            # Passed every check: queue the method for merging
            method["name"] = method_name  # Updated name (in case it was renamed)
            method["code"] = method_code  # Updated code (in case it was modified)
            queued_names.add(method_name)
            methods_to_merge.append(method)

        # Nothing to merge: return the original test
        if not methods_to_merge:
            logger.info("No methods to merge after duplication checks")
            return base_test, {"merged_methods": 0}

        # The last closing brace is taken to be the end of the test class
        class_end = base_test.rfind('}')
        if class_end <= 0:
            logger.error("Could not find class end in base test")
            return base_test, {"error": "Could not find class end"}

        pieces = [base_test[:class_end]]
        for method in methods_to_merge:
            bug_type = method.get("bug_type", "unknown")
            verification = method.get("verification_confidence", 0.8)
            severity = method.get("severity", "medium")
            # Indent the method so it sits inside the class body
            method_code = "\n " + method["code"].replace("\n", "\n ")
            if "// Verified real bug" not in method_code:
                note = ("\n // Verified real bug test: " +
                        f"Type: {bug_type}, Severity: {severity}, " +
                        f"Confidence: {verification:.2f}")
                # Append the note after the complete annotation, including any
                # argument list, so @Test(timeout = ...) is not split apart.
                method_code = re.sub(
                    r'@Test\b(?:\s*\([^)]*\))?',
                    lambda m: m.group(0) + note,
                    method_code,
                    count=1
                )
            pieces.append(method_code + "\n")

        # Always restore the class's closing brace: the text so far ends with
        # the last merged method's own "}", so checking for a trailing brace
        # would wrongly conclude that the class is already closed.
        pieces.append("\n}")
        # Keep whatever followed the class (typically a trailing newline)
        pieces.append(base_test[class_end + 1:])
        enhanced_test = "".join(pieces)

        bug_info = {
            "merged_methods": len(methods_to_merge),
            "method_names": [m.get("name", "unknown") for m in methods_to_merge],
            "real_bugs": True
        }
        logger.info(f"Successfully merged {len(methods_to_merge)} verified bug methods")
        return enhanced_test, bug_info
    except Exception as e:
        logger.error(f"Error merging bug methods: {str(e)}")
        logger.error(traceback.format_exc())
        return base_test, {"error": str(e)}
def attempt_to_fix_test_expectations(method_code, reasoning):
    """
    Attempt to fix test expectations that don't match the actual behavior.

    Three targeted rewrites are tried first (URL special characters,
    BigDecimal empty string, large decimal number), followed by a generic
    rewrite of a failing assertion. A rewrite only counts as a fix when it
    actually changes the code; otherwise the next strategy is tried.

    Parameters:
    method_code (str): The test method code
    reasoning (str): Verification reasoning from the LLM (only used by the
        generic assertion rewrite)

    Returns:
    str or None: Fixed method code or None if couldn't fix
    """
    if not method_code:
        return None

    # Extract method name for better logging
    name_match = re.search(r'void\s+(\w+)\s*\(', method_code)
    method_name = name_match.group(1) if name_match else "unknown"

    # Fix for URL special characters test
    if ("testCreateURLWithSpecialCharacters" in method_code or
            "URLSpecialChars" in method_code) and "assertThrows" in method_code and "ParseException" in method_code:
        logger.info(f"Fixing URL special characters test: {method_name}")
        # URL class actually accepts special characters, so test should expect success
        fixed = re.sub(
            r'assertThrows\(\s*ParseException\.class,\s*\(\)\s*->\s*\{?\s*(?:URL\s+)?\w+\s*=\s*TypeHandler\.createURL\("([^"]+)"\)\s*;?\s*\}?\s*\)',
            r'URL url = TypeHandler.createURL("\1");\n assertNotNull(url);\n // Test originally expected ParseException, but URLs accept these special characters',
            method_code
        )
        if fixed != method_code:
            return fixed

    # Fix for BigDecimal empty string test
    if ("testCreateValueWithEmptyBigDecimal" in method_code or
            "EmptyBigDecimal" in method_code) and "NumberFormatException" in method_code:
        logger.info(f"Fixing BigDecimal empty string test: {method_name}")
        # TypeHandler wraps NumberFormatException in ParseException
        fixed = method_code.replace(
            "NumberFormatException.class",
            "ParseException.class"
        )
        if fixed != method_code:
            return fixed

    # Fix for large decimal number test
    if ("testCreateNumberWithLargeDecimal" in method_code or
            "LargeDecimal" in method_code) and "throws ParseException" in method_code:
        logger.info(f"Fixing large decimal test: {method_name}")
        # Wrap the body in assertThrows and comment out the original
        # BigInteger assertion. The rewrite is only applied when both the
        # signature and the assertion are present, because a partial rewrite
        # would leave an unclosed lambda or comment in the test.
        rewritten, signature_hits = re.subn(
            r'(void\s+\w+\s*\([^)]*\))\s*throws\s+ParseException\s*\{',
            r'\1 {\n assertThrows(ParseException.class, () -> {',
            method_code,
            count=1
        )
        marker = "assertEquals(new BigInteger"
        if signature_hits and marker in rewritten:
            head, tail = rewritten.split(marker, 1)
            if ");" in tail:
                # Close the comment after the commented-out assertion only;
                # every other statement keeps its plain ");" ending.
                tail = tail.replace(");", ");*/", 1)
                return (head +
                        "});\n /* Original expectation was incorrect:\n assertEquals(new BigInteger" +
                        tail)

    # Generic fix for assertions that consistently fail in the same way
    if "expected:" in method_code and "but was:" in method_code and reasoning:
        # Try to find what value it expects vs what it gets
        expected_actual_pattern = r"expected:.*?<([^>]+)>.*?but was:.*?<([^>]+)>"
        match = re.search(expected_actual_pattern, method_code)
        if match:
            expected = match.group(1)
            actual = match.group(2)
            lowered_reasoning = reasoning.lower()
            # Only rewrite when the reasoning says the actual behavior is correct
            if "actual behavior is correct" in lowered_reasoning or "expected value is incorrect" in lowered_reasoning:
                logger.info(f"Fixing assertion in {method_name}: actual value {actual} appears to be correct")
                # Find the assertion statement
                assertion_pattern = r'(assert\w+\([^;]*expected:.*?<' + re.escape(expected) + r'>.*?but was:.*?<' + re.escape(actual) + r'>[^;]*;)'
                assertion_match = re.search(assertion_pattern, method_code)
                if assertion_match:
                    assertion_line = assertion_match.group(1)
                    # Collapse the statement onto one line so the whole of it
                    # ends up inside the "//" comment.
                    commented = " ".join(assertion_line.split())
                    fixed_assertion = f"// Original assertion failed: {commented}\n assertEquals({actual}, "
                    # Re-use the last call in the assertion as the value under test
                    method_calls = re.findall(r'(\w+\([^)]*\))', assertion_line)
                    if method_calls:
                        fixed_assertion += f"{method_calls[-1]});"
                    else:
                        # Fall back to a basic assertion
                        fixed_assertion += "actual);"
                    return method_code.replace(assertion_line, fixed_assertion)

    # Couldn't find a way to fix this method
    logger.info(f"Could not automatically fix expectations for method: {method_name}")
    return None