Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/ecp-analyzer/src/post_process/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
pub mod class_membership;
pub mod imports_edges;
pub mod overrides;
pub mod schema_field_mirrors;
189 changes: 189 additions & 0 deletions crates/ecp-analyzer/src/post_process/schema_field_mirrors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
//! `SchemaField` Node emission + `MirrorsField` heuristic edge bucketing
//! (T4-7).
//!
//! Bridges the gap between per-file `RawSchemaField` (T4-1..T4-6 detectors)
//! and the queryable graph layer: each `RawSchemaField` becomes a
//! `NodeKind::SchemaField` Node connected to its owning class via
//! `HasProperty`, and cross-framework mirrors (e.g. Pydantic `User.email`
//! vs SQLAlchemy `User.email`) are linked by the heuristic `MirrorsField`
//! edge.
//!
//! ## Algorithm
//!
//! 1. **Promote**: iterate `LocalGraph.schema_fields`, emit one
//! `SchemaField` Node + one `HasProperty` edge (Class → SchemaField)
//! per field. SymbolTable lookup resolves the owning class name in
//! the same file; misses silently drop the field (no false-positive
//! edge to a wrong class).
//!
//! 2. **Bucket**: group emitted SchemaField Nodes by
//! `(name.to_ascii_lowercase(), SchemaType)` so ASCII case-variants and
//! type-compatible mirrors share a bucket.
//!
//! 3. **Pair within bucket**: apply the 4-point strict rubric per pair —
//! exact case-sensitive name, identical `SchemaType` (granted by
//! bucket key), identical owner-class name, and bidirectional top-1.
//! For k=2 the top-1 check is trivial; for k≥3 with a uniform
//! `(name, type, owner_class)` triple, D3 cluster semantics emit all
//! `k×(k−1)/2` pairs at confidence 0.9. Different owner-class within
//! the same bucket is currently dropped (BlindSpot emission is a
//! documented follow-up — see `test_partial_match_emits_blindspot`).
//!
//! ## Perf
//!
//! - O(N) bucket build; O(k²) pairwise emission where k = #fields sharing
//! `(name_lc, type)`. Typical k < 10 (real corpora rarely have more
//! than a handful of cross-framework mirrors per identifier).
//! - Offline-only — runs once at build time, never on `ecp` hot paths.

use crate::resolution::index::SymbolTable;
use ecp_core::analyzer::types::{LocalGraph, SchemaType};
use ecp_core::graph::{Edge, Node, NodeKind, RelType};
use ecp_core::pool::StringPool;
use rustc_hash::FxHashMap;

/// Promote every `RawSchemaField` to a `SchemaField` node, connect it to its
/// owning class with a `HasProperty` edge, and emit heuristic `MirrorsField`
/// edges between matching fields.
///
/// # Arguments
/// - `local_graphs`: per-file graphs. A graph's position in this slice is
///   stored as `file_idx` on every node created for it.
/// - `symbol_table`: resolves `owner_class` inside the field's own file.
///   Fields whose owner cannot be resolved are skipped entirely (no node,
///   no edge).
/// - `string_pool`: receives the node UIDs, the field names and the two edge
///   `reason` strings.
/// - `nodes` / `edges`: appended to in place; existing entries are left as-is.
///
/// # Returns
/// The number of `MirrorsField` edges pushed (for telemetry). `HasProperty`
/// edges and `SchemaField` nodes are not included in the count.
///
/// # Panics
/// Panics if a node position or a `local_graphs` position does not fit in
/// `u32`; a silently wrapped index would point edges at the wrong nodes.
pub fn emit_edges(
    local_graphs: &[LocalGraph],
    symbol_table: &SymbolTable,
    string_pool: &mut StringPool,
    nodes: &mut Vec<Node>,
    edges: &mut Vec<Edge>,
) -> usize {
    /// Convert a `Vec` position into the `u32` index space used for node and
    /// file indices. Overflow is treated as a broken invariant rather than
    /// being truncated by an `as` cast.
    fn graph_idx(pos: usize) -> u32 {
        <u32 as std::convert::TryFrom<usize>>::try_from(pos)
            .expect("graph index does not fit in u32")
    }

    /// Bucket entry: (node_idx, owner_class, exact_name).
    type BucketEntry<'a> = (u32, &'a str, &'a str);

    let reason_has_property = string_pool.add("post_process:schema_field:has_property");
    let reason_mirror = string_pool.add("post_process:schema_field:mirrors_field");

    let mut buckets: FxHashMap<(String, SchemaType), Vec<BucketEntry<'_>>> = FxHashMap::default();

    // Phase 1 — emit SchemaField Nodes + HasProperty edges, populate buckets.
    //
    // Graphs without schema fields are skipped up front so the path string
    // is only built for files that actually contribute nodes.
    for (lg_idx, local_graph) in local_graphs.iter().enumerate() {
        let Some(ref schema_fields) = local_graph.schema_fields else {
            continue;
        };
        if schema_fields.is_empty() {
            continue;
        }
        // Normalize Windows separators so lookups and UIDs use `/` paths.
        let path_str = local_graph.file_path.to_string_lossy().replace('\\', "/");
        let file_idx = graph_idx(lg_idx);

        for raw_sf in schema_fields.iter() {
            let owner_name = &*raw_sf.owner_class;
            let field_name = &*raw_sf.name;

            // Resolve the owning class to an existing Node idx within this
            // file. On a miss the field is dropped: emitting it would either
            // leave the node unattached or require guessing an owner.
            let Some(class_idx) = symbol_table.lookup_in_file(&path_str, owner_name) else {
                continue;
            };

            // UID includes the framework so that mirrors sharing
            // (owner, name, path) across frameworks still get distinct UIDs.
            let uid_str = format!(
                "SchemaField:{:?}:{}.{}:{}",
                raw_sf.framework, owner_name, field_name, path_str
            );
            let uid_ref = string_pool.add(&uid_str);
            let name_ref = string_pool.add(field_name);

            // The new node lands at the current tail of `nodes`.
            let sf_idx = graph_idx(nodes.len());
            nodes.push(Node {
                uid: uid_ref,
                name: name_ref,
                file_idx,
                kind: NodeKind::SchemaField,
                span: raw_sf.span,
                community_id: 0,
            });

            // HasProperty: <Class> -> <SchemaField>, emitted at full
            // confidence (unlike the heuristic MirrorsField edges below).
            edges.push(Edge {
                source: class_idx,
                target: sf_idx,
                rel_type: RelType::HasProperty,
                confidence: 1.0,
                reason: reason_has_property,
            });

            // Bucket key: (ASCII-lowercased name, type). Lowercasing groups
            // `email` with `Email`; the type component keeps `email: str`
            // apart from `email: int`.
            buckets
                .entry((field_name.to_ascii_lowercase(), raw_sf.type_class))
                .or_default()
                .push((sf_idx, owner_name, field_name));
        }
    }

    // Phase 2 — pairwise MirrorsField emission within each bucket.
    //
    // Rubric (per spec line 540 + D3 cluster semantics):
    // - exact case-sensitive name match
    // - identical SchemaType (granted by bucket key)
    // - identical owner-class name (e.g. both "User")
    // - bidirectional top-1 (trivial for k=2; D3 covers k≥3 uniform)
    //
    // Implementation: sub-group each bucket by (exact_name, owner_class).
    // Every sub-group of size ≥ 2 emits all of its pairs as `MirrorsField`
    // at confidence 0.9.
    //
    // BlindSpot emission for partial matches (3/4) is a documented
    // follow-up — see ignored test `test_partial_match_emits_blindspot`.
    //
    // NOTE(review): the sub-group key does not include `framework`, so two
    // fields from the same framework with equal (name, type, owner) — e.g.
    // same-named classes in different files — are paired as well. Confirm
    // against the spec whether mirrors must span different frameworks.
    let mut mirror_count = 0usize;
    for entries in buckets.values() {
        if entries.len() < 2 {
            continue;
        }
        // Sub-group by (exact_name, owner_class).
        let mut sub: FxHashMap<(&str, &str), Vec<u32>> = FxHashMap::default();
        for &(idx, owner, name) in entries {
            sub.entry((name, owner)).or_default().push(idx);
        }
        for group in sub.values() {
            if group.len() < 2 {
                continue;
            }
            // `group` keeps insertion order and node indices only grow in
            // phase 1, so each pair has the earlier-created node as `source`.
            // The order in which buckets and groups are visited follows the
            // hash maps' iteration order, not node order.
            for (i, &source) in group.iter().enumerate() {
                for &target in &group[i + 1..] {
                    edges.push(Edge {
                        source,
                        target,
                        rel_type: RelType::MirrorsField,
                        confidence: 0.9,
                        reason: reason_mirror,
                    });
                    mirror_count += 1;
                }
            }
        }
    }

    mirror_count
}
35 changes: 13 additions & 22 deletions crates/ecp-analyzer/src/python/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ use ecp_core::analyzer::types::{
RawRoute, RawTxScope,
};
use ecp_core::graph::{FileCategory, NodeKind};
use ecp_core::pool::StringPool;
use std::path::Path;
use streaming_iterator::StreamingIterator;
use tree_sitter::{Node, Query, QueryCursor};
Expand Down Expand Up @@ -1070,27 +1069,19 @@ impl LanguageProvider for PythonProvider {

crate::framework_helpers::stamp_owner_class_by_span(&mut nodes);

// The local pool is dropped after this block. `StrRef` fields inside
// `RawSchemaField` carry byte offsets relative to this pool — they
// remain valid Copy values, but string resolution requires the builder
// to re-intern them (see TODO: builder pass for T4-schema integration).
// Since the builder currently ignores `schema_fields`, no caller
// dereferences these StrRefs today.
let schema_fields = {
let mut pool = StringPool::new();
let fields = crate::schema_field::extract_schema_fields(
&tree,
source,
&self.query,
&[
crate::python::schema_extractors::PYDANTIC_CONFIG,
crate::python::schema_extractors::SQLALCHEMY_CONFIG,
],
&imports,
&mut pool,
);
(!fields.is_empty()).then(|| fields.into_boxed_slice())
};
// T4-7 refactor: `RawSchemaField` now stores owned `Box<str>` so the
// per-file parser scope can drop cleanly without dangling-pool risk.
let fields = crate::schema_field::extract_schema_fields(
&tree,
source,
&self.query,
&[
crate::python::schema_extractors::PYDANTIC_CONFIG,
crate::python::schema_extractors::SQLALCHEMY_CONFIG,
],
&imports,
);
let schema_fields = (!fields.is_empty()).then(|| fields.into_boxed_slice());

Ok(LocalGraph {
content_hash: [0; 8],
Expand Down
15 changes: 15 additions & 0 deletions crates/ecp-analyzer/src/resolution/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1233,6 +1233,21 @@ impl GraphBuilder {
&mut edges,
);

// T4-7: promote `RawSchemaField` → `SchemaField` Nodes + `HasProperty`
// (Class → SchemaField) + `MirrorsField` heuristic edges across
// detected cross-framework mirrors. Runs after overrides because it
// appends new Nodes; before File-node loop so SchemaField nodes
// occupy idx range BEFORE files[] (keeps file_node_idx contiguous).
// SchemaField nodes are NOT registered in SymbolTable — nothing
// references them by name; reverse traversal goes via HasProperty.
crate::post_process::schema_field_mirrors::emit_edges(
&self.local_graphs,
&symbol_table,
&mut string_pool,
&mut nodes,
&mut edges,
);

// Append one `NodeKind::File` node per LocalGraph at the tail of
// `nodes` (idx >= raw-node count). Doing it here — AFTER all passes
// that index symbols by SymbolTable + use raw node idx ranges —
Expand Down
16 changes: 8 additions & 8 deletions crates/ecp-analyzer/src/schema_field/extract.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use super::config::SchemaFieldConfig;
use crate::framework_helpers::has_import_from;
use ecp_core::analyzer::types::{RawImport, RawSchemaField, SchemaType};
use ecp_core::pool::StringPool;
use rustc_hash::FxHashMap;
use streaming_iterator::StreamingIterator;
use tree_sitter::{Query, QueryCursor, Tree};
Expand All @@ -10,10 +9,6 @@ use tree_sitter::{Query, QueryCursor, Tree};
/// each match to the first `SchemaFieldConfig` whose import-gate is satisfied,
/// and emit a `RawSchemaField` for every accepted match.
///
/// `pool` is used to intern `name` and `owner_class` strings. The caller
/// typically passes the same pool used for the rest of the file's `LocalGraph`
/// to maximise dedup across nodes.
///
/// The caller is responsible for supplying a query whose capture names align
/// with the `owner_capture`, `name_capture`, and `type_capture` fields of at
/// least one config. Captures not referenced by any config are silently
Expand All @@ -24,13 +19,18 @@ use tree_sitter::{Query, QueryCursor, Tree};
/// A config fires only when `has_import_from(imports, config.import_gate)`
/// returns `true`. When no config's gate is satisfied by the file's imports,
/// this function returns an empty `Vec` — no false positives.
///
/// # Ownership
/// `RawSchemaField` stores `name` / `owner_class` as owned `Box<str>` so the
/// per-file parser scope can be dropped without dangling the strings — the
/// pre-T4-7 design interned into a transient `StringPool` which was dropped
/// at scope exit, leaving `RawSchemaField` carrying unreachable `StrRef`s.
pub fn extract_schema_fields(
tree: &Tree,
source: &[u8],
query: &Query,
configs: &[SchemaFieldConfig],
imports: &[RawImport],
pool: &mut StringPool,
) -> Vec<RawSchemaField> {
// Identify which configs are live for this file once, not per-match.
// Empty import_gate is vacuously satisfied — language built-ins (e.g.
Expand Down Expand Up @@ -121,9 +121,9 @@ pub fn extract_schema_fields(
);

out.push(RawSchemaField {
name: pool.add(name),
name: name.into(),
type_class,
owner_class: pool.add(owner),
owner_class: owner.into(),
framework: config.framework,
span,
});
Expand Down
29 changes: 10 additions & 19 deletions crates/ecp-analyzer/src/typescript/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ use ecp_core::analyzer::lang_spec::LangSpec;
use ecp_core::analyzer::provider::LanguageProvider;
use ecp_core::analyzer::types::{LocalGraph, RawFrameworkRef, RawImport, RawNode, RawRoute};
use ecp_core::graph::NodeKind;
use ecp_core::pool::StringPool;
use std::path::Path;
use streaming_iterator::StreamingIterator;
use tree_sitter::{Parser, Query, QueryCursor};
Expand Down Expand Up @@ -524,24 +523,16 @@ impl LanguageProvider for TypeScriptProvider {

crate::framework_helpers::stamp_owner_class_by_span(&mut nodes);

// The local pool is dropped after this block. `StrRef` fields inside
// `RawSchemaField` carry byte offsets relative to this pool — they
// remain valid Copy values, but string resolution requires the builder
// to re-intern them (see TODO: builder pass for T4-schema integration).
// Since the builder currently ignores `schema_fields`, no caller
// dereferences these StrRefs today.
let schema_fields = {
let mut pool = StringPool::new();
let fields = crate::schema_field::extract_schema_fields(
&tree,
source,
&self.query,
&[crate::typescript::schema_extractors::TS_INTERFACE_CONFIG],
&imports,
&mut pool,
);
(!fields.is_empty()).then(|| fields.into_boxed_slice())
};
// T4-7 refactor: `RawSchemaField` now stores owned `Box<str>` so the
// per-file parser scope can drop cleanly without dangling-pool risk.
let fields = crate::schema_field::extract_schema_fields(
&tree,
source,
&self.query,
&[crate::typescript::schema_extractors::TS_INTERFACE_CONFIG],
&imports,
);
let schema_fields = (!fields.is_empty()).then(|| fields.into_boxed_slice());

Ok(LocalGraph {
content_hash: [0; 8],
Expand Down
Loading
Loading