From 653bcda386c0f342429123779759bf9227a2dfe5 Mon Sep 17 00:00:00 2001 From: Shane McCarron Date: Tue, 16 Jun 2026 06:51:28 -0500 Subject: [PATCH 1/6] fix(perl): stop extracting config strings as call targets The Perl branch of extract_scripting_callee blindly returned the text of child(0) of every call node. In config-heavy Perl (.cgi/.pl with embedded log4perl-style config), tree-sitter-perl misparses dotted config tokens (e.g. "log4perl.appender.File.utf8") into call-shaped nodes, and that dotted string was emitted as a callee_name, later matched by the generic short-name resolver to unrelated project subs. Now the Perl branch pulls the real name token (method/function field, else child(0)) and validates it as a bare Perl sub/method identifier via perl_is_identifier_callee: must start with a letter or '_' and contain only [A-Za-z0-9_:] (allowing the '::' package separator). Any '.', whitespace, quote, or '/' disqualifies it and NULL is returned so no CALLS edge forms. Gated to CBM_LANG_PERL; other languages are untouched. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Shane McCarron --- internal/cbm/extract_calls.c | 44 +++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/internal/cbm/extract_calls.c b/internal/cbm/extract_calls.c index 98b924b8..051241c7 100644 --- a/internal/cbm/extract_calls.c +++ b/internal/cbm/extract_calls.c @@ -355,6 +355,31 @@ static char *extract_swift_callee(CBMArena *a, TSNode node, const char *source, return NULL; } +// A Perl sub/method name is an identifier: it starts with a letter or '_', +// contains only [A-Za-z0-9_] plus the '::' package separator, and is never a +// string/config literal. tree-sitter-perl mis-parses config lines in .cgi / +// heredoc-heavy files into call-shaped nodes whose "callee" is a dotted config +// token (e.g. "log4perl.appender.File.utf8"); rejecting non-identifier text +// here stops those from becoming bogus CALLS edges. Any '.', whitespace, quote, +// or '/' disqualifies the token. +static bool perl_is_identifier_callee(const char *name) { + if (!name || !name[0]) { + return false; + } + unsigned char c0 = (unsigned char)name[0]; + if (!(isalpha(c0) || c0 == '_')) { + return false; + } + for (const char *p = name; *p; p++) { + unsigned char c = (unsigned char)*p; + if (isalnum(c) || c == '_' || c == ':') { + continue; + } + return false; // '.', space, quote, '/', etc. → not a sub/method name + } + return true; +} + // Callee extraction for scripting languages (Elixir, Perl, PHP, Kotlin, MATLAB). static char *extract_scripting_callee(CBMArena *a, TSNode node, const char *source, CBMLanguage lang, const char *nk) { @@ -367,7 +392,24 @@ static char *extract_scripting_callee(CBMArena *a, TSNode node, const char *sour return NULL; } if (lang == CBM_LANG_PERL && ts_node_child_count(node) > 0) { - return cbm_node_text(a, ts_node_child(node, 0), source); + // Pull the actual sub/method name token rather than blindly taking + // child(0). Grammar (verified against the vendored parser): + // method_call_expression : field `method` ($obj->m / Class->m) + // function_call_expression : field `function` (foo(); name with '.' + // from a config-string misparse lands here) + // ambiguous_function_call_expression : field `function` + // func1op_call_expression : builtin keyword as child(0) (no field) + TSNode name_node = ts_node_child_by_field_name(node, TS_FIELD("method")); + if (ts_node_is_null(name_node)) { + name_node = ts_node_child_by_field_name(node, TS_FIELD("function")); + } + if (ts_node_is_null(name_node)) { + name_node = ts_node_child(node, 0); + } + char *pn = cbm_node_text(a, name_node, source); + // Reject anything that is not a bare Perl sub/method identifier (config + // strings, quoted literals, paths) so no spurious CALLS edge is emitted. + return perl_is_identifier_callee(pn) ? pn : NULL; } if (lang == CBM_LANG_PHP) { TSNode func_node = ts_node_child_by_field_name(node, TS_FIELD("function")); From d28e7448a9196a97502f4016a36d1a7fa734b0ea Mon Sep 17 00:00:00 2001 From: Shane McCarron Date: Tue, 16 Jun 2026 07:00:26 -0500 Subject: [PATCH 2/6] fix(resolver): don't match Perl builtins to project subs Perl builtins (push/shift/keys/sprintf/...) carry no language or call-kind awareness through the generic name-matcher in cbm_registry_resolve. When a project defines a sub whose name collides with a builtin, an invocation of the builtin was wired to that sub by same-module / suffix matching - a false-positive CALLS edge. Adds cbm_perl_is_builtin (curated, sorted bsearch set of 94 perlfunc core builtins) and applies it in both call-resolution passes (sequential resolve_single_call and parallel resolve_file_calls), gated on the file language == CBM_LANG_PERL and only AFTER LSP resolution has declined, so a genuine LSP-resolved call is never suppressed. The file language is threaded into both resolvers via a new trailing CBMLanguage parameter; every other language reaches cbm_registry_resolve unchanged (byte-identical behavior). Co-Authored-By: Claude Opus 4.8 Signed-off-by: Shane McCarron --- src/pipeline/pass_calls.c | 13 ++++++++++-- src/pipeline/pass_parallel.c | 13 ++++++++++-- src/pipeline/pipeline.h | 6 ++++++ src/pipeline/registry.c | 39 ++++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 4 deletions(-) diff --git a/src/pipeline/pass_calls.c b/src/pipeline/pass_calls.c index 15d691d3..dd7ddaed 100644 --- a/src/pipeline/pass_calls.c +++ b/src/pipeline/pass_calls.c @@ -336,7 +336,7 @@ static const cbm_gbuf_node_t *calls_find_source(cbm_pipeline_ctx_t *ctx, const c static int resolve_single_call(cbm_pipeline_ctx_t *ctx, CBMCall *call, const CBMResolvedCallArray *lsp_calls, const char *rel, const char *module_qn, const char **imp_keys, const char **imp_vals, - int imp_count) { + int imp_count, CBMLanguage lang) { const cbm_gbuf_node_t *source_node = calls_find_source(ctx, rel, call->enclosing_func_qn); if (!source_node) { return 0; @@ -361,6 +361,15 @@ static int resolve_single_call(cbm_pipeline_ctx_t *ctx, CBMCall *call, } } + /* Perl builtin guard (#459 follow-up). Applied only after LSP resolution + * declined, so a real LSP-resolved call is never suppressed. A Perl builtin + * (push/shift/keys/...) resolved by the generic short-name matcher to a + * project sub sharing the name is almost always a false positive. Gated to + * Perl — other languages reach cbm_registry_resolve unchanged. */ + if (lang == CBM_LANG_PERL && cbm_perl_is_builtin(call->callee_name)) { + return 0; + } + cbm_resolution_t res = cbm_registry_resolve(ctx->registry, call->callee_name, module_qn, imp_keys, imp_vals, imp_count); if (!res.qualified_name || res.qualified_name[0] == '\0') { @@ -440,7 +449,7 @@ int cbm_pipeline_pass_calls(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *file } total_calls++; if (resolve_single_call(ctx, call, &result->resolved_calls, rel, module_qn, imp_keys, - imp_vals, imp_count)) { + imp_vals, imp_count, files[i].language)) { resolved++; } else { unresolved++; diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 180ee85f..7ef1dbe0 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -1693,7 +1693,7 @@ static void lsp_idx_free_key(const char *key, void *value, void *ud) { /* Resolve calls for one file and emit CALLS/HTTP_CALLS/ASYNC_CALLS edges. */ static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CBMFileResult *result, const char *rel, const char *module_qn, const char **imp_keys, - const char **imp_vals, int imp_count) { + const char **imp_vals, int imp_count, CBMLanguage lang) { /* Build a per-file hash index of resolved_calls keyed by * "caller_qn|callee_short" for O(1) lookup. cbm_pipeline_find_lsp_ * resolution would otherwise do an O(N) linear scan over @@ -1791,6 +1791,15 @@ static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CB res.candidate_count = 1; ws->lsp_overrides++; } + } else if (lang == CBM_LANG_PERL && cbm_perl_is_builtin(call->callee_name)) { + /* Perl builtin guard (#459 follow-up), mirroring the sequential + * pass (pass_calls.c). LSP resolution already declined above + * (lsp == NULL here), so suppress the generic short-name match for + * Perl builtins (push/shift/keys/...). Leaves res empty → no edge. + * Gated to Perl; every other language resolves unchanged. */ + atomic_fetch_add_explicit(&rc->time_ns_rc_resolve, extract_now_ns() - _rc_t0, + memory_order_relaxed); + continue; } else { res = cbm_registry_resolve(rc->registry, call->callee_name, module_qn, imp_keys, imp_vals, imp_count); @@ -2328,7 +2337,7 @@ static void resolve_worker(int worker_id, void *ctx_ptr) { /* ── CALLS resolution ──────────────────────────────────── */ _ph_t0 = extract_now_ns(); - resolve_file_calls(rc, ws, result, rel, module_qn, imp_keys, imp_vals, imp_count); + resolve_file_calls(rc, ws, result, rel, module_qn, imp_keys, imp_vals, imp_count, lang); atomic_fetch_add_explicit(&rc->time_ns_calls, extract_now_ns() - _ph_t0, memory_order_relaxed); diff --git a/src/pipeline/pipeline.h b/src/pipeline/pipeline.h index 1aa5ab5f..ee385635 100644 --- a/src/pipeline/pipeline.h +++ b/src/pipeline/pipeline.h @@ -167,6 +167,12 @@ void cbm_registry_resolve_cache_end(void); /* Check if a qualified name exists in the registry. */ bool cbm_registry_exists(const cbm_registry_t *r, const char *qn); +/* True if `name` is one of the curated Perl core builtins (perlfunc). Used by + * the call-resolution passes to suppress generic-resolver CALLS edges from Perl + * builtin invocations (push/shift/keys/...) to project subs that merely share + * the name. Perl-scoped: callers gate on the file language. */ +bool cbm_perl_is_builtin(const char *name); + /* Get the label of a qualified name, or NULL if not found. */ const char *cbm_registry_label_of(const cbm_registry_t *r, const char *qn); diff --git a/src/pipeline/registry.c b/src/pipeline/registry.c index 6533e9e2..5dac5799 100644 --- a/src/pipeline/registry.c +++ b/src/pipeline/registry.c @@ -356,6 +356,45 @@ static cbm_resolution_t empty_result(void) { return r; } +/* ── Perl builtin guard (#459 follow-up: call-graph noise) ────────── + * Curated subset of perlfunc core builtins. When a Perl CALL resolves + * only by the generic short-name matcher (no LSP, no import, after the + * same-module/name-lookup chain), a builtin name like `push`/`shift`/ + * `keys` must NOT be wired to a project sub that merely shares the name + * — that is virtually always a false positive. A genuine intra-project + * call is resolved by earlier (LSP/textual) stages before this guard. + * MUST stay sorted ASCII-ascending for bsearch. */ +static const char *const PERL_BUILTINS[] = { + "abs", "atan2", "binmode", "bless", "caller", "chdir", "chmod", "chomp", + "chop", "chown", "chr", "chroot", "close", "closedir", "cos", "defined", + "delete", "die", "do", "each", "eof", "eval", "exec", "exists", + "exit", "fork", "gmtime", "goto", "grep", "hex", "index", "int", + "join", "keys", "last", "lc", "lcfirst", "length", "local", "localtime", + "log", "lstat", "map", "mkdir", "my", "next", "oct", "open", + "opendir", "ord", "our", "pop", "pos", "print", "printf", "push", + "quotemeta", "rand", "read", "readdir", "readline", "redo", "ref", "rename", + "require", "return", "reverse", "rindex", "rmdir", "say", "scalar", "seek", + "shift", "sin", "sleep", "sort", "splice", "split", "sprintf", "sqrt", + "srand", "stat", "substr", "system", "time", "uc", "ucfirst", "undef", + "unlink", "unshift", "values", "wantarray", "warn", "write", +}; + +static int perl_builtin_cmp(const void *key, const void *elem) { + return strcmp((const char *)key, *(const char *const *)elem); +} + +/* True if `name` is one of the curated Perl core builtins. Used to suppress + * generic-resolver CALLS edges from Perl builtin invocations to project subs + * that happen to share the builtin's name. Perl-scoped: callers gate on the + * file language so no other language's resolution is affected. */ +bool cbm_perl_is_builtin(const char *name) { + if (!name || !name[0]) { + return false; + } + return bsearch(name, PERL_BUILTINS, sizeof(PERL_BUILTINS) / sizeof(PERL_BUILTINS[0]), + sizeof(PERL_BUILTINS[0]), perl_builtin_cmp) != NULL; +} + /* ── Lifecycle ──────────────────────────────────────────────────── */ cbm_registry_t *cbm_registry_new(void) { From f416fee55e4f04fdaf8383b0bdc40780cabfabde Mon Sep 17 00:00:00 2001 From: Shane McCarron Date: Tue, 16 Jun 2026 07:02:25 -0500 Subject: [PATCH 3/6] fix(resolver): suppress generic short-name matching for Perl method calls A Perl method call ($obj->m / Class->m) carries no receiver type at the structural tier, so the generic short-name matcher in cbm_registry_resolve would wire $dbh->commit / $cgi->param / $logger->log to any project sub sharing the bare method name - the dominant source of false-positive CALLS edges in CPAN/framework-heavy Perl. Resolving such a call correctly is the LSP's job, not the bare-name matcher's. Adds a CBMCall.is_method flag (zero-init false, so all other languages and existing call sites are unaffected). method_call_expression is added to the Perl call node set and handle_calls sets is_method=true only for that node type when the file language is Perl. Both call-resolution passes then skip generic resolution for Perl method calls (combined with the builtin guard from the prior commit). Genuine intra-project function calls (non-method, non-builtin) still resolve as before. LSP-resolved method calls are unaffected because the guard runs only after LSP resolution declines. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Shane McCarron --- internal/cbm/cbm.h | 1 + internal/cbm/extract_calls.c | 8 ++++++++ internal/cbm/lang_specs.c | 2 +- src/pipeline/pass_calls.c | 14 ++++++++------ src/pipeline/pass_parallel.c | 12 +++++++----- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/internal/cbm/cbm.h b/internal/cbm/cbm.h index cc3607ee..7b89bd4f 100644 --- a/internal/cbm/cbm.h +++ b/internal/cbm/cbm.h @@ -236,6 +236,7 @@ typedef struct { int loop_depth; // enclosing loop nesting at the call site int branch_depth; // enclosing branch nesting at the call site int start_line; // 1-based source line of the call (for def range-match) + bool is_method; // Perl-only: arrow/method call ($obj->m). Default false. } CBMCall; typedef struct { diff --git a/internal/cbm/extract_calls.c b/internal/cbm/extract_calls.c index 051241c7..ba24cccd 100644 --- a/internal/cbm/extract_calls.c +++ b/internal/cbm/extract_calls.c @@ -1176,6 +1176,14 @@ void handle_calls(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, Walk call.loop_depth = state->loop_depth; // enclosing loop nesting at this call call.branch_depth = state->branch_depth; // enclosing branch nesting at this call call.start_line = (int)ts_node_start_point(node).row + TS_LINE_OFFSET; + // Perl-only: flag arrow/method calls ($obj->m / Class->m). The + // generic short-name resolver cannot place a method without a known + // receiver type, so the call-resolution pass suppresses those edges. + // Default false for every other language (struct is zero-init). + if (ctx->language == CBM_LANG_PERL && + strcmp(ts_node_type(node), "method_call_expression") == 0) { + call.is_method = true; + } TSNode args = ts_node_child_by_field_name(node, TS_FIELD("arguments")); if (!ts_node_is_null(args)) { diff --git a/internal/cbm/lang_specs.c b/internal/cbm/lang_specs.c index 68d2afad..26d25b3d 100644 --- a/internal/cbm/lang_specs.c +++ b/internal/cbm/lang_specs.c @@ -585,7 +585,7 @@ static const char *perl_func_types[] = {"subroutine_declaration_statement", NULL static const char *perl_module_types[] = {"source_file", NULL}; static const char *perl_call_types[] = {"ambiguous_function_call_expression", "function_call_expression", "func1op_call_expression", - NULL}; + "method_call_expression", NULL}; static const char *perl_import_types[] = {"use_statement", "require_statement", "require", NULL}; static const char *perl_branch_types[] = {"if_statement", "unless_statement", "for_statement", "foreach_statement", "while_statement", NULL}; diff --git a/src/pipeline/pass_calls.c b/src/pipeline/pass_calls.c index dd7ddaed..c90d0ff3 100644 --- a/src/pipeline/pass_calls.c +++ b/src/pipeline/pass_calls.c @@ -361,12 +361,14 @@ static int resolve_single_call(cbm_pipeline_ctx_t *ctx, CBMCall *call, } } - /* Perl builtin guard (#459 follow-up). Applied only after LSP resolution - * declined, so a real LSP-resolved call is never suppressed. A Perl builtin - * (push/shift/keys/...) resolved by the generic short-name matcher to a - * project sub sharing the name is almost always a false positive. Gated to - * Perl — other languages reach cbm_registry_resolve unchanged. */ - if (lang == CBM_LANG_PERL && cbm_perl_is_builtin(call->callee_name)) { + /* Perl call-graph noise guards (#459 follow-up). Applied only after LSP + * resolution declined, so a real LSP-resolved call is never suppressed. + * Gated to Perl — other languages reach cbm_registry_resolve unchanged. + * 1. Builtins (push/shift/keys/...): a generic short-name match to a + * project sub sharing the name is almost always a false positive. + * 2. Method calls ($obj->m): without a resolved receiver type the bare + * short-name match is wrong; that resolution is the LSP's job. */ + if (lang == CBM_LANG_PERL && (call->is_method || cbm_perl_is_builtin(call->callee_name))) { return 0; } diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 7ef1dbe0..38c6ded8 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -1791,11 +1791,13 @@ static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CB res.candidate_count = 1; ws->lsp_overrides++; } - } else if (lang == CBM_LANG_PERL && cbm_perl_is_builtin(call->callee_name)) { - /* Perl builtin guard (#459 follow-up), mirroring the sequential - * pass (pass_calls.c). LSP resolution already declined above - * (lsp == NULL here), so suppress the generic short-name match for - * Perl builtins (push/shift/keys/...). Leaves res empty → no edge. + } else if (lang == CBM_LANG_PERL && + (call->is_method || cbm_perl_is_builtin(call->callee_name))) { + /* Perl call-graph noise guards (#459 follow-up), mirroring the + * sequential pass (pass_calls.c). LSP resolution already declined + * above (lsp == NULL here), so suppress the generic short-name + * match for Perl builtins (push/shift/keys/...) and for method + * calls with an unknown receiver. Leaves res empty → no edge. * Gated to Perl; every other language resolves unchanged. */ atomic_fetch_add_explicit(&rc->time_ns_rc_resolve, extract_now_ns() - _rc_t0, memory_order_relaxed); From 41f4ff8a8324306b3c8e675a6b6ab3c980d96493 Mon Sep 17 00:00:00 2001 From: Shane McCarron Date: Tue, 16 Jun 2026 07:21:11 -0500 Subject: [PATCH 4/6] test(perl): cover call-graph noise fixes (builtins, methods, config strings) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hermetic tests for the three Perl call-graph noise fixes: test_extraction.c (extraction tier): - config string is never emitted as a callee; genuine call still extracted - builtin calls (push/keys) extracted but never flagged is_method - arrow/method calls ($self->commit / $dbh->commit) set is_method=true, while the genuine function call (helper) does not - a JS method call never sets is_method (flag is Perl-only — other languages unaffected) test_registry.c (resolver tier): - cbm_perl_is_builtin recognizes core builtins (incl. first/last of the sorted set) and rejects project subs, case variants, empty, and NULL Co-Authored-By: Claude Opus 4.8 Signed-off-by: Shane McCarron --- tests/test_extraction.c | 114 ++++++++++++++++++++++++++++++++++++++++ tests/test_registry.c | 28 ++++++++++ 2 files changed, 142 insertions(+) diff --git a/tests/test_extraction.c b/tests/test_extraction.c index 0308372c..4bad219b 100644 --- a/tests/test_extraction.c +++ b/tests/test_extraction.c @@ -2814,6 +2814,114 @@ TEST(complexity_access_depth_and_params) { PASS(); } +/* ═══════════════════════════════════════════════════════════════════ + * Perl call-graph noise (#459 follow-up) + * ═══════════════════════════════════════════════════════════════════ */ + +/* Count calls whose callee_name is exactly `name` (has_call is substring). */ +static int count_calls_exact(CBMFileResult *r, const char *name) { + int n = 0; + for (int i = 0; i < r->calls.count; i++) { + if (r->calls.items[i].callee_name && strcmp(r->calls.items[i].callee_name, name) == 0) + n++; + } + return n; +} + +/* (b) A dotted config string must never be extracted as a callee. */ +TEST(extract_perl_config_string_not_a_callee) { + CBMFileResult *r = extract("package C;\n" + "sub run {\n" + " my $cfg = { \"log4perl.appender.File.utf8\" => 1 };\n" + " helper();\n" + "}\n" + "sub helper { return 1; }\n" + "1;\n", + CBM_LANG_PERL, "t", "app.pl"); + ASSERT_NOT_NULL(r); + ASSERT_FALSE(r->has_error); + /* No callee may contain a '.' (config/string tokens are rejected). */ + for (int i = 0; i < r->calls.count; i++) { + ASSERT_TRUE(strchr(r->calls.items[i].callee_name, '.') == NULL); + } + /* (d) The genuine intra-file function call is still extracted. */ + ASSERT_TRUE(count_calls_exact(r, "helper") >= 1); + cbm_free_result(r); + PASS(); +} + +/* (a) A Perl builtin call is extracted as a non-method callee. Suppression of + * the resulting CALLS edge happens in the resolver (see test_registry.c / + * end-to-end); extraction itself keeps the bare builtin token. */ +TEST(extract_perl_builtin_call_is_function_not_method) { + CBMFileResult *r = extract("package B;\n" + "sub run {\n" + " my @x;\n" + " push @x, 1;\n" + " keys %h;\n" + "}\n" + "1;\n", + CBM_LANG_PERL, "t", "b.pl"); + ASSERT_NOT_NULL(r); + ASSERT_FALSE(r->has_error); + /* push / keys are extracted (they are valid identifiers) ... */ + ASSERT_TRUE(has_call(r, "push")); + /* ... and crucially are NOT flagged as method calls. */ + for (int i = 0; i < r->calls.count; i++) { + ASSERT_FALSE(r->calls.items[i].is_method); + } + cbm_free_result(r); + PASS(); +} + +/* (c) An arrow/method call is extracted with is_method=true so the resolver + * can suppress generic short-name matching for it. */ +TEST(extract_perl_method_call_flags_is_method) { + CBMFileResult *r = extract("package M;\n" + "sub run {\n" + " my $self = shift;\n" + " $self->commit();\n" + " $dbh->commit();\n" + " helper();\n" + "}\n" + "sub helper { return 1; }\n" + "1;\n", + CBM_LANG_PERL, "t", "m.pl"); + ASSERT_NOT_NULL(r); + ASSERT_FALSE(r->has_error); + /* Every "commit" call is a method call (is_method set). */ + int commit_calls = 0; + for (int i = 0; i < r->calls.count; i++) { + if (strcmp(r->calls.items[i].callee_name, "commit") == 0) { + commit_calls++; + ASSERT_TRUE(r->calls.items[i].is_method); + } + /* The genuine function call is NOT a method. */ + if (strcmp(r->calls.items[i].callee_name, "helper") == 0) { + ASSERT_FALSE(r->calls.items[i].is_method); + } + } + ASSERT_TRUE(commit_calls >= 1); + /* (d) genuine intra-file function call still extracted. */ + ASSERT_TRUE(count_calls_exact(r, "helper") >= 1); + cbm_free_result(r); + PASS(); +} + +/* Other languages must be unaffected: a JS method call never sets is_method + * (the flag is Perl-only). */ +TEST(extract_non_perl_method_call_not_flagged_is_method) { + CBMFileResult *r = + extract("function run(o){ o.commit(); helper(); }\n", CBM_LANG_JAVASCRIPT, "t", "x.js"); + ASSERT_NOT_NULL(r); + ASSERT_FALSE(r->has_error); + for (int i = 0; i < r->calls.count; i++) { + ASSERT_FALSE(r->calls.items[i].is_method); + } + cbm_free_result(r); + PASS(); +} + /* ═══════════════════════════════════════════════════════════════════ * Suite * ═══════════════════════════════════════════════════════════════════ */ @@ -2822,6 +2930,12 @@ SUITE(extraction) { /* Initialize extraction library */ cbm_init(); + /* Perl call-graph noise (#459 follow-up) */ + RUN_TEST(extract_perl_config_string_not_a_callee); + RUN_TEST(extract_perl_builtin_call_is_function_not_method); + RUN_TEST(extract_perl_method_call_flags_is_method); + RUN_TEST(extract_non_perl_method_call_not_flagged_is_method); + /* R box-module imports + member calls */ RUN_TEST(extract_r_box_use_imports_issue218); RUN_TEST(extract_r_dollar_call_issue219); diff --git a/tests/test_registry.c b/tests/test_registry.c index 2c2181a0..320c7256 100644 --- a/tests/test_registry.c +++ b/tests/test_registry.c @@ -626,6 +626,30 @@ TEST(fuzzy_no_import_map_passthrough) { PASS(); } +/* ── Perl builtin guard (#459 follow-up: call-graph noise) ───────── */ + +TEST(perl_builtin_set_recognizes_core_builtins) { + /* Representative core builtins from across the sorted set. */ + ASSERT_TRUE(cbm_perl_is_builtin("push")); + ASSERT_TRUE(cbm_perl_is_builtin("shift")); + ASSERT_TRUE(cbm_perl_is_builtin("keys")); + ASSERT_TRUE(cbm_perl_is_builtin("sprintf")); + ASSERT_TRUE(cbm_perl_is_builtin("abs")); /* first element */ + ASSERT_TRUE(cbm_perl_is_builtin("write")); /* last element */ + ASSERT_TRUE(cbm_perl_is_builtin("wantarray")); + PASS(); +} + +TEST(perl_builtin_set_rejects_project_subs) { + /* Genuine project sub names and edge inputs must NOT be flagged. */ + ASSERT_FALSE(cbm_perl_is_builtin("helper")); + ASSERT_FALSE(cbm_perl_is_builtin("process_request")); + ASSERT_FALSE(cbm_perl_is_builtin("Push")); /* case-sensitive */ + ASSERT_FALSE(cbm_perl_is_builtin("")); + ASSERT_FALSE(cbm_perl_is_builtin(NULL)); + PASS(); +} + /* ── Suite ─────────────────────────────────────────────────────── */ SUITE(registry) { @@ -684,4 +708,8 @@ SUITE(registry) { RUN_TEST(fuzzy_resolve_confidence_distance); RUN_TEST(fuzzy_penalty_unreachable_import); RUN_TEST(fuzzy_no_import_map_passthrough); + + /* Perl builtin guard */ + RUN_TEST(perl_builtin_set_recognizes_core_builtins); + RUN_TEST(perl_builtin_set_rejects_project_subs); } From fbdb063e9c26a666083572664530684421980914 Mon Sep 17 00:00:00 2001 From: Shane McCarron Date: Tue, 16 Jun 2026 09:14:18 -0500 Subject: [PATCH 5/6] fix(perl): address QA round 1 Round 1 (Claude panel + DO DeepSeek) findings: - Rework the Perl noise guard so it suppresses only WEAK generic matches (suffix_match/unique_name) and KEEPS high-confidence same_module/import_map. The prior guard ran before cbm_registry_resolve and dropped genuine same-file calls to builtin-named subs (e.g. a project sub log/index/open called as a bare function) that pre-PR resolved via same_module. Extracted the decision into pure, unit-tested cbm_perl_suppress_generic_match() shared by the sequential (pass_calls.c) and parallel (pass_parallel.c) resolvers; corrected the inaccurate comments (Perl has no LSP/textual stage before the guard). - Tighten perl_is_identifier_callee to require '::' pairs (reject a lone ':', ':::', or trailing '::'). - Add resolver-contract tests covering weak-match suppression, same_module/ import_map retention, genuine-call survival, non-Perl no-op, and NULL strategy. Verified on a real Perl monorepo: .cgi builtin/CPAN/config-string noise stays eliminated while same_module edges to builtin-named subs are recovered. Signed-off-by: Shane McCarron --- internal/cbm/extract_calls.c | 11 ++++++++++- src/pipeline/pass_calls.c | 24 +++++++++++++----------- src/pipeline/pass_parallel.c | 24 +++++++++++++----------- src/pipeline/pipeline.h | 7 +++++++ src/pipeline/registry.c | 25 +++++++++++++++++++++++++ tests/test_registry.c | 29 +++++++++++++++++++++++++++++ 6 files changed, 97 insertions(+), 23 deletions(-) diff --git a/internal/cbm/extract_calls.c b/internal/cbm/extract_calls.c index ba24cccd..583fbe6d 100644 --- a/internal/cbm/extract_calls.c +++ b/internal/cbm/extract_calls.c @@ -372,7 +372,16 @@ static bool perl_is_identifier_callee(const char *name) { } for (const char *p = name; *p; p++) { unsigned char c = (unsigned char)*p; - if (isalnum(c) || c == '_' || c == ':') { + if (isalnum(c) || c == '_') { + continue; + } + if (c == ':') { + // Only the '::' package separator is allowed: require an adjacent + // pair, and reject a lone ':', ':::', or a trailing '::'. + if (p[1] != ':' || p[2] == ':' || p[2] == '\0') { + return false; + } + p++; // consume the second ':'; the loop's p++ moves past the pair continue; } return false; // '.', space, quote, '/', etc. → not a sub/method name diff --git a/src/pipeline/pass_calls.c b/src/pipeline/pass_calls.c index c90d0ff3..25beb884 100644 --- a/src/pipeline/pass_calls.c +++ b/src/pipeline/pass_calls.c @@ -361,22 +361,24 @@ static int resolve_single_call(cbm_pipeline_ctx_t *ctx, CBMCall *call, } } - /* Perl call-graph noise guards (#459 follow-up). Applied only after LSP - * resolution declined, so a real LSP-resolved call is never suppressed. - * Gated to Perl — other languages reach cbm_registry_resolve unchanged. - * 1. Builtins (push/shift/keys/...): a generic short-name match to a - * project sub sharing the name is almost always a false positive. - * 2. Method calls ($obj->m): without a resolved receiver type the bare - * short-name match is wrong; that resolution is the LSP's job. */ - if (lang == CBM_LANG_PERL && (call->is_method || cbm_perl_is_builtin(call->callee_name))) { - return 0; - } - cbm_resolution_t res = cbm_registry_resolve(ctx->registry, call->callee_name, module_qn, imp_keys, imp_vals, imp_count); if (!res.qualified_name || res.qualified_name[0] == '\0') { return 0; } + + /* Perl call-graph noise guard (#476). Perl has no LSP resolver, so the + * generic registry chain is the only resolver; for builtins (push/shift/ + * keys/...) and method calls ($obj->m with an unresolved receiver), a *weak* + * cross-file short-name match to a project sub sharing the name is almost + * always a false positive. Suppress only those weak matches; KEEP the + * high-confidence same_module / import_map strategies so a genuine + * same-file or imported call to a builtin-named sub still resolves. Gated + * to Perl — other languages are unaffected. */ + if (cbm_perl_suppress_generic_match(lang == CBM_LANG_PERL, call->is_method, call->callee_name, + res.strategy)) { + return 0; + } const cbm_gbuf_node_t *target_node = cbm_gbuf_find_by_qn(ctx->gbuf, res.qualified_name); if (!target_node || source_node->id == target_node->id) { return 0; diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 38c6ded8..a3c25630 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -1791,17 +1791,6 @@ static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CB res.candidate_count = 1; ws->lsp_overrides++; } - } else if (lang == CBM_LANG_PERL && - (call->is_method || cbm_perl_is_builtin(call->callee_name))) { - /* Perl call-graph noise guards (#459 follow-up), mirroring the - * sequential pass (pass_calls.c). LSP resolution already declined - * above (lsp == NULL here), so suppress the generic short-name - * match for Perl builtins (push/shift/keys/...) and for method - * calls with an unknown receiver. Leaves res empty → no edge. - * Gated to Perl; every other language resolves unchanged. */ - atomic_fetch_add_explicit(&rc->time_ns_rc_resolve, extract_now_ns() - _rc_t0, - memory_order_relaxed); - continue; } else { res = cbm_registry_resolve(rc->registry, call->callee_name, module_qn, imp_keys, imp_vals, imp_count); @@ -1814,6 +1803,19 @@ static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CB atomic_fetch_add_explicit(&rc->time_ns_rc_hint, extract_now_ns() - _rc_t0, memory_order_relaxed); + /* Perl call-graph noise guard (#476), mirroring the sequential pass + * (pass_calls.c). Perl has no LSP resolver; for builtins (push/shift/ + * keys/...) and method calls ($obj->m, unresolved receiver), suppress + * only WEAK cross-file short-name matches and keep the high-confidence + * same_module / import_map strategies so a genuine same-file or + * imported call to a builtin-named sub still resolves. Placed after the + * field-type hint so a hint cannot re-introduce a suppressed edge. + * Gated to Perl — other languages are unaffected. */ + if (cbm_perl_suppress_generic_match(lang == CBM_LANG_PERL, call->is_method, + call->callee_name, res.strategy)) { + continue; + } + if (!res.qualified_name || res.qualified_name[0] == '\0') { if (cbm_service_pattern_route_method(call->callee_name) != NULL) { cbm_resolution_t fake_res = {.qualified_name = call->callee_name, diff --git a/src/pipeline/pipeline.h b/src/pipeline/pipeline.h index ee385635..ae7d125a 100644 --- a/src/pipeline/pipeline.h +++ b/src/pipeline/pipeline.h @@ -173,6 +173,13 @@ bool cbm_registry_exists(const cbm_registry_t *r, const char *qn); * the name. Perl-scoped: callers gate on the file language. */ bool cbm_perl_is_builtin(const char *name); +/* Decide whether a resolved Perl call edge is generic-resolver noise to drop + * (#476): true only for Perl, only for a builtin/method call, and only when the + * match used a weak short-name strategy — high-confidence same_module/import_map + * matches are kept. Pure; unit-tested in test_registry.c. */ +bool cbm_perl_suppress_generic_match(bool is_perl, bool is_method, const char *callee_name, + const char *strategy); + /* Get the label of a qualified name, or NULL if not found. */ const char *cbm_registry_label_of(const cbm_registry_t *r, const char *qn); diff --git a/src/pipeline/registry.c b/src/pipeline/registry.c index 5dac5799..221e02a5 100644 --- a/src/pipeline/registry.c +++ b/src/pipeline/registry.c @@ -395,6 +395,31 @@ bool cbm_perl_is_builtin(const char *name) { sizeof(PERL_BUILTINS[0]), perl_builtin_cmp) != NULL; } +/* Decide whether a *resolved* Perl call edge is generic-resolver noise that + * should be suppressed (#476). Returns true only for Perl, only for a builtin + * invocation or a method call, and only when the registry landed the match via + * a WEAK short-name strategy. High-confidence same_module / import_map matches + * are KEPT so a genuine same-file or imported call to a builtin-named sub still + * resolves. `strategy` is the cbm_resolution_t.strategy of a non-empty match; + * NULL/empty (no match) returns false. Pure + side-effect-free so the + * suppression contract is unit-testable without a full pipeline. */ +bool cbm_perl_suppress_generic_match(bool is_perl, bool is_method, const char *callee_name, + const char *strategy) { + if (!is_perl) { + return false; + } + if (!(is_method || cbm_perl_is_builtin(callee_name))) { + return false; + } + if (!strategy || !strategy[0]) { + return false; + } + if (strcmp(strategy, "same_module") == 0 || strcmp(strategy, "import_map") == 0) { + return false; /* high-confidence — keep the genuine edge */ + } + return true; /* weak short-name match (suffix_match / unique_name / …) → drop */ +} + /* ── Lifecycle ──────────────────────────────────────────────────── */ cbm_registry_t *cbm_registry_new(void) { diff --git a/tests/test_registry.c b/tests/test_registry.c index 320c7256..69e78638 100644 --- a/tests/test_registry.c +++ b/tests/test_registry.c @@ -650,6 +650,33 @@ TEST(perl_builtin_set_rejects_project_subs) { PASS(); } +TEST(perl_suppress_drops_weak_builtin_and_method_matches) { + /* #476: a builtin/method call that landed via a WEAK short-name strategy is + * generic-resolver noise and must be suppressed. */ + ASSERT_TRUE(cbm_perl_suppress_generic_match(true, false, "push", "suffix_match")); + ASSERT_TRUE(cbm_perl_suppress_generic_match(true, false, "keys", "unique_name")); + ASSERT_TRUE(cbm_perl_suppress_generic_match(true, true, "commit", "suffix_match")); + ASSERT_TRUE(cbm_perl_suppress_generic_match(true, true, "log", "unique_name")); + PASS(); +} + +TEST(perl_suppress_keeps_high_confidence_and_genuine_calls) { + /* #476: high-confidence strategies are kept so a genuine same-file/imported + * call to a builtin-named sub still resolves (criterion d). */ + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "log", "same_module")); + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "open", "import_map")); + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, true, "commit", "same_module")); + /* A genuine non-builtin function call is never suppressed (edge survives). */ + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "helper", "suffix_match")); + /* Non-Perl languages are never affected. */ + ASSERT_FALSE(cbm_perl_suppress_generic_match(false, false, "push", "suffix_match")); + ASSERT_FALSE(cbm_perl_suppress_generic_match(false, true, "commit", "suffix_match")); + /* No match (NULL/empty strategy) → nothing to suppress. */ + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "push", NULL)); + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, true, "commit", "")); + PASS(); +} + /* ── Suite ─────────────────────────────────────────────────────── */ SUITE(registry) { @@ -712,4 +739,6 @@ SUITE(registry) { /* Perl builtin guard */ RUN_TEST(perl_builtin_set_recognizes_core_builtins); RUN_TEST(perl_builtin_set_rejects_project_subs); + RUN_TEST(perl_suppress_drops_weak_builtin_and_method_matches); + RUN_TEST(perl_suppress_keeps_high_confidence_and_genuine_calls); } From 1612523684831346df84eb43ca12242d264ffb7c Mon Sep 17 00:00:00 2001 From: Shane McCarron Date: Tue, 16 Jun 2026 09:45:53 -0500 Subject: [PATCH 6/6] fix(perl): address QA round 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 (Claude panel) caught a regression introduced by the round-1 refactor: cbm_perl_suppress_generic_match whitelisted only the exact strategies "same_module" and "import_map", but resolve_import_map can also return "import_map_suffix" (confidence 0.85 — a genuine import-based resolution, not a weak short-name guess). A '::'-qualified Perl builtin/method call resolved via the import-suffix fallback was therefore dropped, contradicting the helper's documented contract and partially missing acceptance criterion (d). Add import_map_suffix to the kept (high-confidence) set so only the weak short-name strategies (suffix_match / unique_name) are suppressed; update the doc comment and add a unit-test case asserting import_map_suffix is retained. Deferred as advisory (non-blocking, noted on the PR): a hypothetical leading-'::' (main:: shorthand) under-extraction in perl_is_identifier_callee, and a colon-edge-case coverage gap (logic correct by inspection). Signed-off-by: Shane McCarron --- src/pipeline/registry.c | 13 ++++++++----- tests/test_registry.c | 3 +++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/pipeline/registry.c b/src/pipeline/registry.c index 221e02a5..2bf3f217 100644 --- a/src/pipeline/registry.c +++ b/src/pipeline/registry.c @@ -398,9 +398,11 @@ bool cbm_perl_is_builtin(const char *name) { /* Decide whether a *resolved* Perl call edge is generic-resolver noise that * should be suppressed (#476). Returns true only for Perl, only for a builtin * invocation or a method call, and only when the registry landed the match via - * a WEAK short-name strategy. High-confidence same_module / import_map matches - * are KEPT so a genuine same-file or imported call to a builtin-named sub still - * resolves. `strategy` is the cbm_resolution_t.strategy of a non-empty match; + * a WEAK short-name strategy. High-confidence import/same-module strategies + * (same_module, import_map, import_map_suffix) are KEPT so a genuine same-file + * or imported call to a builtin-named sub still resolves — only the weak + * short-name guesses (suffix_match, unique_name) are dropped. `strategy` is the + * cbm_resolution_t.strategy of a non-empty match; * NULL/empty (no match) returns false. Pure + side-effect-free so the * suppression contract is unit-testable without a full pipeline. */ bool cbm_perl_suppress_generic_match(bool is_perl, bool is_method, const char *callee_name, @@ -414,8 +416,9 @@ bool cbm_perl_suppress_generic_match(bool is_perl, bool is_method, const char *c if (!strategy || !strategy[0]) { return false; } - if (strcmp(strategy, "same_module") == 0 || strcmp(strategy, "import_map") == 0) { - return false; /* high-confidence — keep the genuine edge */ + if (strcmp(strategy, "same_module") == 0 || strcmp(strategy, "import_map") == 0 || + strcmp(strategy, "import_map_suffix") == 0) { + return false; /* high-confidence import/same-module match — keep the genuine edge */ } return true; /* weak short-name match (suffix_match / unique_name / …) → drop */ } diff --git a/tests/test_registry.c b/tests/test_registry.c index 69e78638..46cfc3d8 100644 --- a/tests/test_registry.c +++ b/tests/test_registry.c @@ -665,6 +665,9 @@ TEST(perl_suppress_keeps_high_confidence_and_genuine_calls) { * call to a builtin-named sub still resolves (criterion d). */ ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "log", "same_module")); ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "open", "import_map")); + /* import_map_suffix is a genuine import resolution (conf 0.85), not a weak + * short-name guess — a '::'-qualified call resolved this way must be kept. */ + ASSERT_FALSE(cbm_perl_suppress_generic_match(true, true, "Foo::Bar::m", "import_map_suffix")); ASSERT_FALSE(cbm_perl_suppress_generic_match(true, true, "commit", "same_module")); /* A genuine non-builtin function call is never suppressed (edge survives). */ ASSERT_FALSE(cbm_perl_suppress_generic_match(true, false, "helper", "suffix_match"));