From 9aec69e4b09b5a4f04219cb694960216f641be7d Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 24 Mar 2026 06:48:52 -0700 Subject: [PATCH 1/2] fix: resolve C# parser crash on startup due to missing/wrong grammar key Two bugs combined to raise `ValueError: Language 'c_sharp' is not available in tree-sitter-language-pack` on GraphBuilder init, breaking indexing even for non-C# projects: 1. Version pin too broad (>=0.6.0): versions before 0.13.0 don't ship tree-sitter-c-sharp as a transitive dependency, so `import tree_sitter_c_sharp` raises ModuleNotFoundError at startup. Bumped floor to >=0.13.0. 2. C# special-case used a hard-coded direct import and the wrong fallback key ('c_sharp') instead of delegating to get_language(). The pack's C# key also changed between major versions (0.x uses "csharp", 1.x uses "c_sharp"), so the fallback path was also broken. Replaced the manual import block with a version-agnostic loop that tries both keys in order, and added an except clause that re-raises ValueError unchanged (so LanguageNotFoundError from the 1.x pack propagates cleanly). Also added a TODO comment in GraphBuilder.__init__ tracking the planned Fix 3 (graceful per-parser degradation) for future work. 
Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 4 +-- src/codegraphcontext/tools/graph_builder.py | 7 ++++ .../utils/tree_sitter_manager.py | 33 ++++++++++++++----- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ffc245bf..005269a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "inquirerpy>=0.3.4", "python-dotenv>=1.0.0", "tree-sitter>=0.21.0", - "tree-sitter-language-pack>=0.6.0", + "tree-sitter-language-pack>=0.13.0", "pyyaml", "pytest", "nbformat", @@ -39,7 +39,7 @@ dependencies = [ [project.optional-dependencies] parsing = [ "tree-sitter>=0.21.0", - "tree-sitter-language-pack>=0.6.0", + "tree-sitter-language-pack>=0.13.0", ] dev = [ "pytest>=7.4.0", diff --git a/src/codegraphcontext/tools/graph_builder.py b/src/codegraphcontext/tools/graph_builder.py index c3e45e81..2bbc5ab6 100644 --- a/src/codegraphcontext/tools/graph_builder.py +++ b/src/codegraphcontext/tools/graph_builder.py @@ -101,6 +101,13 @@ def __init__(self, db_manager: DatabaseManager, job_manager: JobManager, loop: a self.job_manager = job_manager self.loop = loop self.driver = self.db_manager.get_driver() + # TODO (Fix 3 – graceful degradation): Wrap each TreeSitterParser(...) + # construction in a try/except so that a single unavailable grammar + # (e.g. a future language whose package isn't installed) doesn't crash + # the entire GraphBuilder. Suggested approach: extract a + # _make_parser_safe(lang) helper that logs a warning and returns None + # on failure, then filter None values out of self.parsers. 
+ # See: https://github.com/Shashankss1205/CodeGraphContext/issues self.parsers = { '.py': TreeSitterParser('python'), '.ipynb': TreeSitterParser('python'), diff --git a/src/codegraphcontext/utils/tree_sitter_manager.py b/src/codegraphcontext/utils/tree_sitter_manager.py index f3f9d7a7..4c18bdef 100644 --- a/src/codegraphcontext/utils/tree_sitter_manager.py +++ b/src/codegraphcontext/utils/tree_sitter_manager.py @@ -128,22 +128,37 @@ def get_language_safe(self, lang: str) -> Language: return self._language_cache[canonical_name] try: - # Special handling for C# which is available as tree_sitter_c_sharp + # tree-sitter-language-pack changed its C# key across major versions: + # 0.x (>=0.13.0): uses "csharp" (delegates to tree-sitter-c-sharp pkg) + # 1.x: uses "c_sharp" (native binary, no separate pkg) + # Try both names so the loader is version-agnostic. + # All other languages use a stable name matching our canonical form. if canonical_name == "c_sharp": - import tree_sitter_c_sharp - # tree_sitter_c_sharp.language() returns a PyCapsule, wrap it in Language - capsule = tree_sitter_c_sharp.language() - language = Language(capsule) + for pack_name in ("c_sharp", "csharp"): + try: + language = get_language(pack_name) + break + except Exception: + continue + else: + raise ValueError( + "Language 'c_sharp' (C#) is not available. " + "Ensure tree-sitter-language-pack>=0.13.0 is installed " + "(earlier versions do not ship the C# grammar)." + ) else: # Load the language from tree-sitter-language-pack language = get_language(canonical_name) - + self._language_cache[canonical_name] = language return language - except (KeyError, ModuleNotFoundError): + except ValueError: + raise # pass through ValueError unchanged (ours or pack's LanguageNotFoundError) + except (KeyError, LookupError, ModuleNotFoundError) as e: raise ValueError( - f"Language '{canonical_name}' is not available in tree-sitter-language-pack. " - f"This may be due to a missing or experimental grammar." 
+ f"Language '{canonical_name}' is not available. " + f"Ensure tree-sitter-language-pack>=0.13.0 is installed. " + f"Error: {e}" ) except Exception as e: raise Exception( From 8fd71ab3e96f5f55c369924a4e4badee85de1d24 Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 24 Mar 2026 16:41:27 -0700 Subject: [PATCH 2/2] fix(graph_builder): add graceful parser degradation for unavailable grammars Implement Fix 3 from PR #2: extract _make_parser_safe() helper that wraps each TreeSitterParser construction in a try/except. Parsers that fail to load (e.g. C# when tree-sitter-language-pack cannot find the grammar) are skipped with a warning instead of crashing the entire GraphBuilder init. This unblocks the e2e CI tests, which were failing because some tree-sitter-language-pack versions do not provide the C# grammar under either key. Co-Authored-By: Claude Sonnet 4.6 --- src/codegraphcontext/tools/graph_builder.py | 83 +++++++++++---------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/src/codegraphcontext/tools/graph_builder.py b/src/codegraphcontext/tools/graph_builder.py index 2bbc5ab6..9acf0141 100644 --- a/src/codegraphcontext/tools/graph_builder.py +++ b/src/codegraphcontext/tools/graph_builder.py @@ -101,47 +101,54 @@ def __init__(self, db_manager: DatabaseManager, job_manager: JobManager, loop: a self.job_manager = job_manager self.loop = loop self.driver = self.db_manager.get_driver() - # TODO (Fix 3 – graceful degradation): Wrap each TreeSitterParser(...) - # construction in a try/except so that a single unavailable grammar - # (e.g. a future language whose package isn't installed) doesn't crash - # the entire GraphBuilder. Suggested approach: extract a - # _make_parser_safe(lang) helper that logs a warning and returns None - # on failure, then filter None values out of self.parsers. 
- # See: https://github.com/Shashankss1205/CodeGraphContext/issues - self.parsers = { - '.py': TreeSitterParser('python'), - '.ipynb': TreeSitterParser('python'), - '.js': TreeSitterParser('javascript'), - '.jsx': TreeSitterParser('javascript'), - '.mjs': TreeSitterParser('javascript'), - '.cjs': TreeSitterParser('javascript'), - '.go': TreeSitterParser('go'), - '.ts': TreeSitterParser('typescript'), - '.tsx': TreeSitterParser('typescript'), - '.cpp': TreeSitterParser('cpp'), - '.h': TreeSitterParser('cpp'), - '.hpp': TreeSitterParser('cpp'), - '.hh': TreeSitterParser('cpp'), - '.rs': TreeSitterParser('rust'), - '.c': TreeSitterParser('c'), - # '.h': TreeSitterParser('c'), # Need to write an algo for distinguishing C vs C++ headers - '.java': TreeSitterParser('java'), - '.rb': TreeSitterParser('ruby'), - '.cs': TreeSitterParser('c_sharp'), - '.php': TreeSitterParser('php'), - '.kt': TreeSitterParser('kotlin'), - '.scala': TreeSitterParser('scala'), - '.sc': TreeSitterParser('scala'), - '.swift': TreeSitterParser('swift'), - '.hs': TreeSitterParser('haskell'), - '.dart': TreeSitterParser('dart'), - '.pl': TreeSitterParser('perl'), - '.pm': TreeSitterParser('perl'), - '.ex': TreeSitterParser('elixir'), - '.exs': TreeSitterParser('elixir'), + raw_parsers = { + '.py': 'python', + '.ipynb': 'python', + '.js': 'javascript', + '.jsx': 'javascript', + '.mjs': 'javascript', + '.cjs': 'javascript', + '.go': 'go', + '.ts': 'typescript', + '.tsx': 'typescript', + '.cpp': 'cpp', + '.h': 'cpp', + '.hpp': 'cpp', + '.hh': 'cpp', + '.rs': 'rust', + '.c': 'c', + # '.h': 'c', # Need to write an algo for distinguishing C vs C++ headers + '.java': 'java', + '.rb': 'ruby', + '.cs': 'c_sharp', + '.php': 'php', + '.kt': 'kotlin', + '.scala': 'scala', + '.sc': 'scala', + '.swift': 'swift', + '.hs': 'haskell', + '.dart': 'dart', + '.pl': 'perl', + '.pm': 'perl', + '.ex': 'elixir', + '.exs': 'elixir', } + self.parsers = {} + for ext, lang in raw_parsers.items(): + parser = 
self._make_parser_safe(lang) + if parser is not None: + self.parsers[ext] = parser self.create_schema() + @staticmethod + def _make_parser_safe(lang: str) -> Optional['TreeSitterParser']: + """Try to construct a TreeSitterParser for *lang*, returning None on failure.""" + try: + return TreeSitterParser(lang) + except Exception as e: + warning_logger(f"Skipping parser for '{lang}': {e}") + return None + # A general schema creation based on common features across languages def create_schema(self): """Create constraints and indexes in Neo4j."""