Skip to content

Commit 7cda24b

Browse files
committed
Make the inline-comment count regex-based; hopefully it will still work
1 parent 267254f commit 7cda24b

1 file changed

Lines changed: 114 additions & 108 deletions

File tree

Lines changed: 114 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
import os
22
import re
3-
from pygments import highlight
4-
from pygments.lexers import get_lexer_for_filename
5-
from pygments.token import Token
63

74

85
def count_inline_comments(file_path):
96
"""
10-
Count inline comments in a source code file.
7+
Count inline comments in a source code file using regex patterns.
118
129
An inline comment is a comment that appears on the same line as code,
1310
not on a line by itself.
@@ -19,140 +16,149 @@ def count_inline_comments(file_path):
1916
int: Number of inline comments found
2017
2118
Raises:
22-
ValueError: If the file extension is not supported by Pygments
19+
ValueError: If the file extension is not supported
2320
FileNotFoundError: If the file doesn't exist
2421
"""
2522
if not os.path.exists(file_path):
2623
raise FileNotFoundError(f"File not found: {file_path}")
2724

28-
try:
29-
# Get the appropriate lexer for the file
30-
lexer = get_lexer_for_filename(file_path)
31-
except Exception:
32-
raise ValueError(f"Unsupported file extension: {file_path}")
25+
# Get file extension
26+
_, ext = os.path.splitext(file_path)
27+
28+
# Define comment patterns for different languages
29+
comment_patterns = {
30+
'.py': r'#',
31+
'.js': r'//',
32+
'.go': r'//',
33+
'.rb': r'#',
34+
'.java': r'//',
35+
'.cpp': r'//',
36+
'.c': r'//',
37+
'.cs': r'//',
38+
'.php': r'//',
39+
'.swift': r'//',
40+
'.kt': r'//',
41+
'.scala': r'//',
42+
'.rs': r'//',
43+
'.ts': r'//',
44+
'.jsx': r'//',
45+
'.tsx': r'//',
46+
}
47+
48+
if ext not in comment_patterns:
49+
raise ValueError(f"Unsupported file extension: {ext}")
50+
51+
comment_marker = comment_patterns[ext]
3352

34-
# Read the file content
3553
try:
3654
with open(file_path, 'r', encoding='utf-8') as f:
3755
content = f.read()
3856
except UnicodeDecodeError:
39-
# Try with different encoding if UTF-8 fails
4057
with open(file_path, 'r', encoding='latin-1') as f:
4158
content = f.read()
4259

4360
if not content.strip():
4461
return 0
4562

46-
# Tokenize the content
47-
tokens = list(lexer.get_tokens(content))
48-
49-
# Group tokens by line
5063
lines = content.splitlines()
51-
line_tokens = {i + 1: [] for i in range(len(lines))}
52-
53-
current_line = 1
54-
current_pos = 0
55-
56-
for token_type, token_value in tokens:
57-
if token_value == '\n':
58-
current_line += 1
59-
current_pos = 0
60-
elif token_value:
61-
# Find which line this token belongs to
62-
token_lines = token_value.count('\n')
63-
if token_lines == 0:
64-
line_tokens[current_line].append((token_type, token_value))
65-
else:
66-
# Multi-line token
67-
parts = token_value.split('\n')
68-
for i, part in enumerate(parts):
69-
if part:
70-
line_tokens[current_line + i].append((token_type, part))
71-
current_line += token_lines
72-
7364
inline_comment_count = 0
7465

75-
# Check each line for inline comments
76-
for line_num, line_token_list in line_tokens.items():
77-
if not line_token_list:
78-
continue
79-
80-
# Check if this line has both code and comments
81-
has_code = False
82-
has_comment = False
83-
84-
for token_type, token_value in line_token_list:
85-
# Skip whitespace tokens
86-
if token_type in (Token.Text, Token.Text.Whitespace) and token_value.strip() == '':
87-
continue
88-
89-
# Check if it's a comment token
90-
if token_type in Token.Comment:
91-
has_comment = True
92-
elif token_type not in (Token.Text, Token.Text.Whitespace):
93-
# Non-whitespace, non-comment token = code
94-
has_code = True
95-
96-
# If the line has both code and comments, it contains an inline comment
97-
if has_code and has_comment:
66+
for line in lines:
67+
if _has_inline_comment(line, comment_marker):
9868
inline_comment_count += 1
9969

10070
return inline_comment_count
10171

10272

103-
# Alternative simpler implementation using regex patterns
104-
def count_inline_comments_regex(file_path):
73+
def _has_inline_comment(line, comment_marker):
10574
"""
106-
Alternative implementation using regex patterns for comment detection.
107-
This is simpler but less accurate than the Pygments-based approach.
75+
Check if a line has an inline comment (comment on same line as code).
76+
77+
Args:
78+
line (str): The line to check
79+
comment_marker (str): The comment marker for the language (e.g., '//', '#')
80+
81+
Returns:
82+
bool: True if the line has an inline comment, False otherwise
10883
"""
109-
if not os.path.exists(file_path):
110-
raise FileNotFoundError(f"File not found: {file_path}")
84+
# Remove leading/trailing whitespace
85+
line = line.strip()
11186

112-
# Get file extension
113-
_, ext = os.path.splitext(file_path)
87+
# Empty line or line with only whitespace
88+
if not line:
89+
return False
11490

115-
# Define comment patterns for different languages
116-
comment_patterns = {
117-
'.py': r'#.*',
118-
'.js': r'//.*',
119-
'.go': r'//.*',
120-
'.rb': r'#.*',
121-
'.java': r'//.*',
122-
'.cpp': r'//.*',
123-
'.c': r'//.*',
124-
'.cs': r'//.*',
125-
'.php': r'//.*',
126-
'.swift': r'//.*',
127-
'.kt': r'//.*',
128-
'.scala': r'//.*',
129-
}
91+
# Line starts with comment marker (full-line comment)
92+
if line.startswith(comment_marker):
93+
return False
13094

131-
if ext not in comment_patterns:
132-
raise ValueError(f"Unsupported file extension: {ext}")
95+
# Find comment marker in the line
96+
comment_index = line.find(comment_marker)
13397

134-
comment_pattern = comment_patterns[ext]
98+
# No comment marker found
99+
if comment_index == -1:
100+
return False
135101

136-
try:
137-
with open(file_path, 'r', encoding='utf-8') as f:
138-
lines = f.readlines()
139-
except UnicodeDecodeError:
140-
with open(file_path, 'r', encoding='latin-1') as f:
141-
lines = f.readlines()
102+
# Check if there's non-whitespace code before the comment
103+
code_before_comment = line[:comment_index].strip()
142104

143-
inline_comment_count = 0
105+
# Handle string literals that might contain comment markers
106+
if _is_comment_in_string(line, comment_index):
107+
return False
144108

145-
for line in lines:
146-
line = line.strip()
147-
if not line:
148-
continue
149-
150-
# Find comment in the line
151-
comment_match = re.search(comment_pattern, line)
152-
if comment_match:
153-
# Check if there's code before the comment
154-
code_before_comment = line[:comment_match.start()].strip()
155-
if code_before_comment:
156-
inline_comment_count += 1
157-
158-
return inline_comment_count
109+
# If there's code before the comment, it's an inline comment
110+
return bool(code_before_comment)
111+
112+
113+
def _is_comment_in_string(line, comment_index):
114+
"""
115+
Check if the comment marker is inside a string literal.
116+
This is a simplified check that handles basic cases.
117+
118+
Args:
119+
line (str): The line to check
120+
comment_index (int): Index of the comment marker
121+
122+
Returns:
123+
bool: True if the comment marker is likely inside a string
124+
"""
125+
# Count quotes before the comment marker
126+
line_before_comment = line[:comment_index]
127+
128+
# Count single and double quotes (basic check)
129+
single_quotes = line_before_comment.count("'")
130+
double_quotes = line_before_comment.count('"')
131+
132+
# Simple heuristic: if odd number of quotes, we're likely inside a string
133+
# This is not perfect but handles most common cases
134+
in_single_quote_string = single_quotes % 2 == 1
135+
in_double_quote_string = double_quotes % 2 == 1
136+
137+
return in_single_quote_string or in_double_quote_string
138+
139+
140+
# More robust string detection (optional, more complex)
141+
def _is_comment_in_string_robust(line, comment_index):
142+
"""
143+
More robust check for comment markers inside strings.
144+
Handles escaped quotes and mixed quote types.
145+
"""
146+
i = 0
147+
in_single_string = False
148+
in_double_string = False
149+
150+
while i < comment_index:
151+
char = line[i]
152+
153+
if char == '"' and not in_single_string:
154+
# Check if it's escaped
155+
if i == 0 or line[i-1] != '\\':
156+
in_double_string = not in_double_string
157+
elif char == "'" and not in_double_string:
158+
# Check if it's escaped
159+
if i == 0 or line[i-1] != '\\':
160+
in_single_string = not in_single_string
161+
162+
i += 1
163+
164+
return in_single_string or in_double_string

0 commit comments

Comments
 (0)