FailMapper/static_analysis.py at main · nuwaLab/FailMapper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import json
import argparse
import re
from file_analyzer import analyze_java_file
from dependency_analyzer import analyze_java_project
from indirect_dependency_analyzer import EnhancedJavaDependencyAnalyzer
from boundary_exception_analyzer import analyze_boundary_and_exception

class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)

def clean_unicode_surrogates(text):
    """clean the Unicode surrogate characters in the string"""
    if isinstance(text, str):
        # remove the Unicode surrogate character range (\uD800-\uDFFF)
        cleaned = re.sub(r'[\uD800-\uDFFF]', '', text)
        return cleaned
    return text

def clean_data_recursive(data):
    """recursively clean the Unicode surrogate characters in the data structure"""
    if isinstance(data, dict):
        return {key: clean_data_recursive(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [clean_data_recursive(item) for item in data]
    elif isinstance(data, set):
        return {clean_data_recursive(item) for item in data}
    elif isinstance(data, str):
        return clean_unicode_surrogates(data)
    else:
        return data

def analyze_project(project_path: str) -> dict:
    project_info = {}
    total_files = 0
    successful_files = 0
    failed_files = 0

    print(f"starting to analyze project: {project_path}")

    for root, dirs, files in os.walk(project_path):
        for file in files:
            if 'Test' in file:
                continue
            # Skip module-info.java files as they use Java 9+ module syntax which javalang can't parse
            if file == 'module-info.java':
                continue
            if file.endswith('.java'):
                total_files += 1
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, project_path)

                print(f"analyzing: {relative_path}")

                try:
                    result = analyze_java_file(file_path)

                    # check if fallback method is used
                    if "parsing_method" in result:
                        if result["parsing_method"] == "regex_fallback":
                            print(f"  -> using regex fallback parsing")
                        elif result["parsing_method"] == "preprocessed_javalang":
                            print(f"  -> using preprocessed javalang parsing")
                        else:
                            print(f"  -> using {result['parsing_method']} parsing")

                    project_info[relative_path] = result
                    successful_files += 1

                except Exception as e:
                    failed_files += 1
                    print(f"  -> parsing failed: {str(e)}")

                    # create the smallest error record
                    project_info[relative_path] = {
                        "error": str(e),
                        "classes": [],
                        "interfaces": [],
                        "enums": [],
                        "parsing_method": "failed"
                    }

    print(f"\nanalysis completed:")
    print(f"  total files: {total_files}")
    print(f"  successful parsing: {successful_files}")
    print(f"  parsing failed: {failed_files}")
    print(f"  success rate: {(successful_files/total_files*100):.1f}%" if total_files > 0 else "  success rate: 0%")

    return project_info

def save_json(data: dict, file_path: str):
    # clean the Unicode surrogate characters in the data
    cleaned_data = clean_data_recursive(data)
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, ensure_ascii=False, indent=2, cls=SetEncoder)
    except UnicodeEncodeError as e:
        print(f"Unicode encoding error: {e}")
        print("Falling back to ASCII encoding...")
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, ensure_ascii=True, indent=2, cls=SetEncoder)


def main():
    parser = argparse.ArgumentParser(description="Analyze Java project and generate static analysis results.")
    parser.add_argument("project_path", help="Path to the Java project")
    parser.add_argument("--output_dir", default="../results/static_analysis", help="Directory to save output files")
    args = parser.parse_args()

    project_name = os.path.basename(args.project_path)
    output_dir = os.path.join(args.output_dir, project_name)
    os.makedirs(output_dir, exist_ok=True)

    # file analysis (including data flow graph)
    output_file_dfg = os.path.join(output_dir, f"{project_name}_dfg.json")
    project_info_dfg = analyze_project(args.project_path)
    save_json(project_info_dfg, output_file_dfg)
    print(f"data flow graph analysis completed, results saved to {output_file_dfg}")

    # dependency analysis
    output_file_dep = os.path.join(output_dir, f"{project_name}_dependency.json")
    project_info_dep = analyze_java_project(args.project_path)
    save_json(project_info_dep, output_file_dep)
    print(f"dependency analysis completed, results saved to {output_file_dep}")

    # indirect dependency analysis
    output_file_idc = os.path.join(output_dir, f"{project_name}_IDC.json")
    analyzer = EnhancedJavaDependencyAnalyzer(args.project_path)
    analyzer.analyze()
    analyzer.save_to_json(output_file_idc)
    print(f"indirect dependency analysis completed, results saved to {output_file_idc}")

    # merge all analysis results
    combined_results = {
        "data_flow_graph": project_info_dfg,
        "dependencies": project_info_dep,
        "indirect_dependencies": {k: list(v) for k, v in analyzer.dependencies.items()}
    }
    output_file_combined = os.path.join(output_dir, f"{project_name}_combined_analysis.json")
    save_json(combined_results, output_file_combined)
    print(f"all analysis results merged and saved to {output_file_combined}")

if __name__ == "__main__":
    main()