Make the count-inline-comments test regex-based; hopefully it will still work

CodyKoInABox · CodyKoInABox · commit 7cda24b5b274 · 2025-05-24T12:57:41.000-03:00
diff --git a/tests/spice/analyzers/test_count_inline_comments.py b/tests/spice/analyzers/test_count_inline_comments.py
@@ -1,13 +1,10 @@
 import os
 import re
-from pygments import highlight
-from pygments.lexers import get_lexer_for_filename
-from pygments.token import Token
 
 
 def count_inline_comments(file_path):
     """
-    Count inline comments in a source code file.
+    Count inline comments in a source code file using regex patterns.
     
     An inline comment is a comment that appears on the same line as code,
     not on a line by itself.
@@ -19,140 +16,149 @@ def count_inline_comments(file_path):
         int: Number of inline comments found
         
     Raises:
-        ValueError: If the file extension is not supported by Pygments
+        ValueError: If the file extension is not supported
         FileNotFoundError: If the file doesn't exist
     """
     if not os.path.exists(file_path):
         raise FileNotFoundError(f"File not found: {file_path}")
     
-    try:
-        # Get the appropriate lexer for the file
-        lexer = get_lexer_for_filename(file_path)
-    except Exception:
-        raise ValueError(f"Unsupported file extension: {file_path}")
+    # Get file extension
+    _, ext = os.path.splitext(file_path)
+    
+    # Define comment patterns for different languages
+    comment_patterns = {
+        '.py': r'#',
+        '.js': r'//',
+        '.go': r'//',
+        '.rb': r'#',
+        '.java': r'//',
+        '.cpp': r'//',
+        '.c': r'//',
+        '.cs': r'//',
+        '.php': r'//',
+        '.swift': r'//',
+        '.kt': r'//',
+        '.scala': r'//',
+        '.rs': r'//',
+        '.ts': r'//',
+        '.jsx': r'//',
+        '.tsx': r'//',
+    }
+    
+    if ext not in comment_patterns:
+        raise ValueError(f"Unsupported file extension: {ext}")
+    
+    comment_marker = comment_patterns[ext]
     
-    # Read the file content
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             content = f.read()
     except UnicodeDecodeError:
-        # Try with different encoding if UTF-8 fails
         with open(file_path, 'r', encoding='latin-1') as f:
             content = f.read()
     
     if not content.strip():
         return 0
     
-    # Tokenize the content
-    tokens = list(lexer.get_tokens(content))
-    
-    # Group tokens by line
     lines = content.splitlines()
-    line_tokens = {i + 1: [] for i in range(len(lines))}
-    
-    current_line = 1
-    current_pos = 0
-    
-    for token_type, token_value in tokens:
-        if token_value == '\n':
-            current_line += 1
-            current_pos = 0
-        elif token_value:
-            # Find which line this token belongs to
-            token_lines = token_value.count('\n')
-            if token_lines == 0:
-                line_tokens[current_line].append((token_type, token_value))
-            else:
-                # Multi-line token
-                parts = token_value.split('\n')
-                for i, part in enumerate(parts):
-                    if part:
-                        line_tokens[current_line + i].append((token_type, part))
-                current_line += token_lines
-    
     inline_comment_count = 0
     
-    # Check each line for inline comments
-    for line_num, line_token_list in line_tokens.items():
-        if not line_token_list:
-            continue
-            
-        # Check if this line has both code and comments
-        has_code = False
-        has_comment = False
-        
-        for token_type, token_value in line_token_list:
-            # Skip whitespace tokens
-            if token_type in (Token.Text, Token.Text.Whitespace) and token_value.strip() == '':
-                continue
-            
-            # Check if it's a comment token
-            if token_type in Token.Comment:
-                has_comment = True
-            elif token_type not in (Token.Text, Token.Text.Whitespace):
-                # Non-whitespace, non-comment token = code
-                has_code = True
-        
-        # If the line has both code and comments, it contains an inline comment
-        if has_code and has_comment:
+    for line in lines:
+        if _has_inline_comment(line, comment_marker):
             inline_comment_count += 1
     
     return inline_comment_count
 
 
-# Alternative simpler implementation using regex patterns
-def count_inline_comments_regex(file_path):
+def _has_inline_comment(line, comment_marker):
     """
-    Alternative implementation using regex patterns for comment detection.
-    This is simpler but less accurate than the Pygments-based approach.
+    Check if a line has an inline comment (comment on same line as code).
+    
+    Args:
+        line (str): The line to check
+        comment_marker (str): The comment marker for the language (e.g., '//', '#')
+        
+    Returns:
+        bool: True if the line has an inline comment, False otherwise
     """
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")
+    # Remove leading/trailing whitespace
+    line = line.strip()
     
-    # Get file extension
-    _, ext = os.path.splitext(file_path)
+    # Empty line or line with only whitespace
+    if not line:
+        return False
     
-    # Define comment patterns for different languages
-    comment_patterns = {
-        '.py': r'#.*',
-        '.js': r'//.*',
-        '.go': r'//.*',
-        '.rb': r'#.*',
-        '.java': r'//.*',
-        '.cpp': r'//.*',
-        '.c': r'//.*',
-        '.cs': r'//.*',
-        '.php': r'//.*',
-        '.swift': r'//.*',
-        '.kt': r'//.*',
-        '.scala': r'//.*',
-    }
+    # Line starts with comment marker (full-line comment)
+    if line.startswith(comment_marker):
+        return False
     
-    if ext not in comment_patterns:
-        raise ValueError(f"Unsupported file extension: {ext}")
+    # Find comment marker in the line
+    comment_index = line.find(comment_marker)
     
-    comment_pattern = comment_patterns[ext]
+    # No comment marker found
+    if comment_index == -1:
+        return False
     
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
-    except UnicodeDecodeError:
-        with open(file_path, 'r', encoding='latin-1') as f:
-            lines = f.readlines()
+    # Check if there's non-whitespace code before the comment
+    code_before_comment = line[:comment_index].strip()
     
-    inline_comment_count = 0
+    # Handle string literals that might contain comment markers
+    if _is_comment_in_string(line, comment_index):
+        return False
     
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-            
-        # Find comment in the line
-        comment_match = re.search(comment_pattern, line)
-        if comment_match:
-            # Check if there's code before the comment
-            code_before_comment = line[:comment_match.start()].strip()
-            if code_before_comment:
-                inline_comment_count += 1
-    
-    return inline_comment_count
+    # If there's code before the comment, it's an inline comment
+    return bool(code_before_comment)
+
+
+def _is_comment_in_string(line, comment_index):
+    """
+    Check if the comment marker is inside a string literal.
+    
+    Tracks the quote that opened the string, so an apostrophe inside a
+    double-quoted string or a backslash-escaped quote is not miscounted.
+    
+    Args:
+        line (str): The line to check
+        comment_index (int): Index of the comment marker
+        
+    Returns:
+        bool: True if the comment marker is inside an open string
+    """
+    open_quote = None  # quote char of the enclosing string, or None
+    escaped = False  # previous char was an unescaped backslash in a string
+    for char in line[:comment_index]:
+        if escaped:
+            escaped = False
+        elif char == "\\" and open_quote is not None:
+            escaped = True
+        elif char in "'\"" and open_quote in (None, char):
+            # The matching quote closes the string; outside one, any quote opens it
+            open_quote = None if open_quote else char
+    return open_quote is not None
+
+
+# More robust string detection (optional, more complex)
+def _is_comment_in_string_robust(line, comment_index):
+    """
+    More robust check for comment markers inside strings.
+    Handles escaped quotes (including runs of backslashes) and mixed quote types.
+    """
+    i = 0
+    in_single_string = False
+    in_double_string = False
+    
+    while i < comment_index:
+        char = line[i]
+        
+        if char == '\\' and (in_single_string or in_double_string):
+            # Skip the escaped character so an escaped backslash cannot hide a closing quote
+            i += 2
+            continue
+        if char == '"' and not in_single_string:
+            in_double_string = not in_double_string
+        elif char == "'" and not in_double_string:
+            in_single_string = not in_single_string
+        
+        i += 1
+    
+    return in_single_string or in_double_string