forked from reposense/RepoSense
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFileInfoExtractor.java
More file actions
230 lines (195 loc) · 9.68 KB
/
FileInfoExtractor.java
File metadata and controls
230 lines (195 loc) · 9.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
package reposense.authorship;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import reposense.authorship.model.FileInfo;
import reposense.authorship.model.LineInfo;
import reposense.git.GitCheckout;
import reposense.git.GitDiff;
import reposense.git.GitRevList;
import reposense.git.exception.CommitNotFoundException;
import reposense.model.RepoConfiguration;
import reposense.system.LogsManager;
import reposense.util.FileUtil;
/**
 * Extracts out all the relevant {@code FileInfo} from the repository.
 *
 * <p>The repository is first checked out at the end of the configured date range. If a commit exists before the
 * start of the range, only the files that differ from that commit are returned and each of their lines is marked
 * as tracked (inserted within the range) or untracked. Otherwise every non-binary, whitelisted file is returned.
 */
public class FileInfoExtractor {
    private static final Logger logger = LogsManager.getLogger(FileInfoExtractor.class);

    private static final String MESSAGE_START_EXTRACTING_FILE_INFO = "Extracting relevant file info from %s (%s)...";
    private static final String MESSAGE_DIFF_EXCEEDS_FILE_LENGTH =
            "Diff result of %s refers to more lines than the %d line(s) read from the file, "
                    + "the remaining inserted lines are ignored.";

    // Regex that separates the per-file sections of a full diff result.
    private static final String DIFF_FILE_CHUNK_SEPARATOR = "\ndiff --git a/.*\n";
    // Separates the hunks ("@@ -a,b +c,d @@ ...") inside one file section.
    private static final String LINE_CHUNKS_SEPARATOR = "\n@@ ";
    private static final String LINE_INSERTED_SYMBOL = "+";
    private static final String STARTING_LINE_NUMBER_GROUP_NAME = "startingLineNumber";
    private static final String FILE_CHANGED_GROUP_NAME = "filePath";
    // What follows "+++ " in the diff header when the file no longer exists in the new revision.
    private static final String FILE_DELETED_SYMBOL = "/dev/null";
    // Prefix put in front of the new file's path in the "+++ " diff header.
    private static final String NEW_FILE_PATH_PREFIX = "b/";
    private static final String MATCH_GROUP_FAIL_MESSAGE_FORMAT = "Failed to match the %s group for:\n%s";
    // Lines of the modified files list that start with this marker describe binary files.
    private static final String BINARY_FILE_LINE_DIFF_RESULT = "-\t-\t";
    // Index of the file path among the tab-separated fields of a modified files list entry.
    private static final int MODIFIED_FILE_PATH_FIELD_INDEX = 2;

    private static final int LINE_CHANGED_HEADER_INDEX = 0;
    private static final Pattern STARTING_LINE_NUMBER_PATTERN = Pattern.compile(
            "-(\\d)+(,)?(\\d)* \\+(?<startingLineNumber>\\d+)(,)?(\\d)* @@");
    // The group captures the path INCLUDING its "b/" or "/" prefix, so that the deleted-file marker
    // "/dev/null" can be told apart from a real file located at "dev/null".
    private static final Pattern FILE_CHANGED_PATTERN = Pattern.compile("\n(\\+){3} (?<filePath>b?/.*)\n");

    /**
     * Extracts a list of relevant files given in {@code config}.
     *
     * @param config Configuration of the repository to analyze.
     * @return The relevant {@code FileInfo}s sorted by path; an empty list if no commit can be checked out for
     *         the configured until date.
     */
    public static List<FileInfo> extractFileInfos(RepoConfiguration config) {
        logger.info(String.format(MESSAGE_START_EXTRACTING_FILE_INFO, config.getLocation(), config.getBranch()));

        List<FileInfo> fileInfos = new ArrayList<>();

        // checks out to the latest commit of the date range to ensure the FileInfo generated correspond to the
        // git blame file analyze output
        try {
            GitCheckout.checkoutDate(config.getRepoRoot(), config.getBranch(), config.getUntilDate());
        } catch (CommitNotFoundException cnfe) {
            // no commit to check out to, hence there is nothing to extract
            return fileInfos;
        }

        String lastCommitHash = GitRevList.getCommitHashBeforeDate(
                config.getRepoRoot(), config.getBranch(), config.getSinceDate());

        if (!lastCommitHash.isEmpty()) {
            fileInfos = getEditedFileInfos(config, lastCommitHash);
        } else {
            // no commit before the since date, every file is taken into account
            getAllFileInfo(config, fileInfos);
        }

        fileInfos.sort(Comparator.comparing(FileInfo::getPath));
        return fileInfos;
    }

    /**
     * Generates a list of relevant {@code FileInfo} for all files that were edited in between the current
     * commit and the {@code lastCommitHash} commit, marks each {@code LineInfo} for each {@code FileInfo} on
     * whether they have been inserted within the commit range or not, and returns it.
     *
     * <p>Deleted files, files with an invalid path, binary files and files outside the whitelisted formats are
     * left out.
     *
     * @param config Configuration of the repository to analyze.
     * @param lastCommitHash Hash of the commit the current working tree is compared against.
     * @return The {@code FileInfo}s of the edited files, empty if there is no difference.
     */
    public static List<FileInfo> getEditedFileInfos(RepoConfiguration config, String lastCommitHash) {
        List<FileInfo> fileInfos = new ArrayList<>();
        String fullDiffResult = GitDiff.diffCommit(config.getRepoRoot(), lastCommitHash);

        // no diff between the 2 commits, return an empty list
        if (fullDiffResult.isEmpty()) {
            return fileInfos;
        }

        String[] fileDiffResultList = fullDiffResult.split(DIFF_FILE_CHUNK_SEPARATOR);
        Set<Path> nonBinaryFilesSet = getNonBinaryFilesList(config);

        for (String fileDiffResult : fileDiffResultList) {
            Matcher filePathMatcher = FILE_CHANGED_PATTERN.matcher(fileDiffResult);

            // diff result does not have the markers to indicate that file has any line changes, skip it
            if (!filePathMatcher.find()) {
                continue;
            }

            String prefixedFilePath = filePathMatcher.group(FILE_CHANGED_GROUP_NAME);

            // file is deleted, skip it as well
            if (prefixedFilePath.equals(FILE_DELETED_SYMBOL)) {
                continue;
            }

            String filePath = stripNewFilePathPrefix(prefixedFilePath);

            if (!isValidAndNonBinaryFile(filePath, nonBinaryFilesSet)) {
                continue;
            }

            if (config.getFileTypeManager().isInsideWhitelistedFormats(filePath)) {
                FileInfo currentFileInfo = generateFileInfo(config.getRepoRoot(), filePath);
                setLinesToTrack(currentFileInfo, fileDiffResult);
                fileInfos.add(currentFileInfo);
            }
        }

        return fileInfos;
    }

    /**
     * Returns a {@code Set} of non-binary files for the repo {@code repoConfig}.
     *
     * <p>Each entry of the modified files list is expected to consist of three tab-separated fields with the
     * file path being the last one; entries with fewer fields (e.g. blank lines) are ignored.
     *
     * @param repoConfig Configuration of the repository to analyze.
     * @return The paths of all listed files that are neither binary nor have an invalid path.
     */
    public static Set<Path> getNonBinaryFilesList(RepoConfiguration repoConfig) {
        List<String> modifiedFileList = GitDiff.getModifiedFilesList(Paths.get(repoConfig.getRepoRoot()));

        // Gets rid of binary files and files with invalid directory name.
        return modifiedFileList.stream()
                .filter(file -> !file.startsWith(BINARY_FILE_LINE_DIFF_RESULT))
                .map(rawNonBinaryFile -> rawNonBinaryFile.split("\t"))
                // guards against malformed entries which would otherwise fail the whole extraction
                .filter(fields -> fields.length > MODIFIED_FILE_PATH_FIELD_INDEX)
                .map(fields -> fields[MODIFIED_FILE_PATH_FIELD_INDEX])
                .filter(FileUtil::isValidPath)
                .map(filteredFile -> Paths.get(filteredFile))
                .collect(Collectors.toCollection(HashSet::new));
    }

    /**
     * Removes the leading {@code b/} (or the leading {@code /}, if there is no {@code b}) from the
     * {@code prefixedFilePath} captured by {@code FILE_CHANGED_PATTERN}.
     */
    private static String stripNewFilePathPrefix(String prefixedFilePath) {
        if (prefixedFilePath.startsWith(NEW_FILE_PATH_PREFIX)) {
            return prefixedFilePath.substring(NEW_FILE_PATH_PREFIX.length());
        }
        // the pattern guarantees that the path starts with "/" when it does not start with "b/"
        return prefixedFilePath.substring(1);
    }

    /**
     * Analyzes the {@code fileDiffResult} and marks each {@code LineInfo} in {@code FileInfo} on whether they were
     * inserted in between the commit range.
     *
     * <p>Only lines starting with {@code +} advance the position in the file; all other lines of a section are
     * skipped. NOTE(review): this presumes that the diff is generated without context lines - confirm against
     * {@code GitDiff.diffCommit}.
     *
     * <p>If the diff refers to more lines than {@code fileInfo} holds (e.g. because the file could not be read
     * completely), a warning is logged and the surplus lines are ignored instead of failing the analysis.
     */
    private static void setLinesToTrack(FileInfo fileInfo, String fileDiffResult) {
        String[] linesChangedChunk = fileDiffResult.split(LINE_CHUNKS_SEPARATOR);
        List<LineInfo> lineInfos = fileInfo.getLines();
        int totalLines = lineInfos.size();
        int fileLinePointer = 0;

        // skips the header, index starts from 1
        for (int sectionIndex = 1; sectionIndex < linesChangedChunk.length; sectionIndex++) {
            String[] linesChanged = linesChangedChunk[sectionIndex].split("\n");
            int startingLineNumber = getStartingLineNumber(linesChanged[LINE_CHANGED_HEADER_INDEX]);

            // mark all untouched lines between sections as untracked
            int untouchedLinesEnd = Math.min(startingLineNumber - 1, totalLines);
            while (fileLinePointer < untouchedLinesEnd) {
                lineInfos.get(fileLinePointer++).setTracked(false);
            }

            // skips the header, index starts from 1
            for (int lineIndex = 1; lineIndex < linesChanged.length; lineIndex++) {
                if (!linesChanged[lineIndex].startsWith(LINE_INSERTED_SYMBOL)) {
                    continue;
                }

                if (fileLinePointer >= totalLines) {
                    // every line that was read is already marked at this point
                    logger.warning(String.format(MESSAGE_DIFF_EXCEEDS_FILE_LENGTH, fileInfo.getPath(), totalLines));
                    return;
                }

                // set line added to be tracked
                lineInfos.get(fileLinePointer++).setTracked(true);
            }
        }

        // set all remaining lines in file that were untouched to be untracked
        while (fileLinePointer < totalLines) {
            lineInfos.get(fileLinePointer++).setTracked(false);
        }
    }

    /**
     * Generates the {@code FileInfo} for each non-binary file of the repository that is inside the whitelisted
     * formats of {@code config} and inserts it into {@code fileInfos}.
     */
    private static void getAllFileInfo(RepoConfiguration config, List<FileInfo> fileInfos) {
        Set<Path> nonBinaryFilesList = getNonBinaryFilesList(config);

        for (Path relativePath : nonBinaryFilesList) {
            if (config.getFileTypeManager().isInsideWhitelistedFormats(relativePath.toString())) {
                fileInfos.add(generateFileInfo(config.getRepoRoot(), relativePath.toString()));
            }
        }
    }

    /**
     * Generates and returns a {@code FileInfo} with a list of {@code LineInfo} for each line content in the
     * {@code relativePath} file.
     *
     * <p>If the file cannot be read, the error is logged and the {@code FileInfo} is returned with the lines
     * read so far (possibly none).
     *
     * @param repoRoot Root directory of the repository.
     * @param relativePath Path of the file, relative to {@code repoRoot}.
     */
    public static FileInfo generateFileInfo(String repoRoot, String relativePath) {
        FileInfo fileInfo = new FileInfo(relativePath);
        Path path = Paths.get(repoRoot, fileInfo.getPath());

        try (BufferedReader br = new BufferedReader(new FileReader(path.toFile()))) {
            String line;
            int lineNum = 1;
            while ((line = br.readLine()) != null) {
                fileInfo.addLine(new LineInfo(lineNum++, line));
            }
        } catch (IOException ioe) {
            logger.log(Level.SEVERE, ioe.getMessage(), ioe);
        }

        return fileInfo;
    }

    /**
     * Returns the starting line changed number, within the file diff result, by matching the pattern inside
     * {@code linesChanged}.
     *
     * @throws AssertionError if {@code linesChanged} does not contain a valid chunk header.
     */
    private static int getStartingLineNumber(String linesChanged) {
        Matcher chunkHeaderMatcher = STARTING_LINE_NUMBER_PATTERN.matcher(linesChanged);

        if (!chunkHeaderMatcher.find()) {
            logger.severe(String.format(MATCH_GROUP_FAIL_MESSAGE_FORMAT, "line changed", linesChanged));
            throw new AssertionError("Should not have error matching line number pattern inside chunk header!");
        }

        return Integer.parseInt(chunkHeaderMatcher.group(STARTING_LINE_NUMBER_GROUP_NAME));
    }

    /**
     * Returns true if {@code filePath} is valid and the file is not in binary.
     */
    private static boolean isValidAndNonBinaryFile(String filePath, Set<Path> nonBinaryFilesSet) {
        return FileUtil.isValidPath(filePath) && nonBinaryFilesSet.contains(Paths.get(filePath));
    }
}