diff --git a/internal/redundancy/redundancy.go b/internal/redundancy/redundancy.go
index 2b2dd20..ce50833 100644
--- a/internal/redundancy/redundancy.go
+++ b/internal/redundancy/redundancy.go
@@ -16,15 +16,21 @@ type Result struct {
 // Checker detects redundant content at three levels: exact, normalized, and similar.
 // It is safe for concurrent use.
 type Checker struct {
-	mu         sync.Mutex
-	threshold  float64
-	raw        []string
-	normalized []string
+	mu            sync.Mutex
+	threshold     float64
+	raw           []string
+	normalized    []string
+	exactIdx      map[string]bool // O(1) exact lookup
+	normalizedIdx map[string]bool // O(1) normalized lookup
 }
 
 // NewChecker creates a Checker with the given Jaccard similarity threshold.
 func NewChecker(threshold float64) *Checker {
-	return &Checker{threshold: threshold}
+	return &Checker{
+		threshold:     threshold,
+		exactIdx:      make(map[string]bool),
+		normalizedIdx: make(map[string]bool),
+	}
 }
 
 // normalize sorts whitespace-separated tokens so order-independent comparison works.
@@ -38,12 +44,44 @@ func normalize(s string) string {
 func (c *Checker) Record(content string) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
+	if c.exactIdx == nil {
+		// Zero-value Checker (not built by NewChecker): allocate both indexes before the first write.
+		c.exactIdx = make(map[string]bool)
+		c.normalizedIdx = make(map[string]bool)
+	}
+	norm := normalize(content)
 	c.raw = append(c.raw, content)
-	c.normalized = append(c.normalized, normalize(content))
+	c.normalized = append(c.normalized, norm)
+	c.exactIdx[content] = true
+	c.normalizedIdx[norm] = true
 }
 
+// ExactIndex returns a copy of the exact-match index: every raw string passed to Record maps to true.
+// The copy is made while holding the Checker's lock; changing it does not affect the Checker.
+func (c *Checker) ExactIndex() map[string]bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	out := make(map[string]bool, len(c.exactIdx))
+	for k, v := range c.exactIdx {
+		out[k] = v
+	}
+	return out
+}
+
+// NormalizedIndex returns a copy of the normalized index: the normalize() form of every recorded string maps to true.
+// The copy is made while holding the Checker's lock; changing it does not affect the Checker.
+func (c *Checker) NormalizedIndex() map[string]bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	out := make(map[string]bool, len(c.normalizedIdx))
+	for k, v := range c.normalizedIdx {
+		out[k] = v
+	}
+	return out
+}
+
 // Check tests content against all previously recorded strings.
 // It does NOT record the content — call Record separately if desired.
 func (c *Checker) Check(content string) Result {
 	return c.CheckWithThreshold(content, c.threshold)
 }
@@ -55,16 +93,17 @@ func (c *Checker) CheckWithThreshold(content string, threshold float64) Result {
 	defer c.mu.Unlock()
 
 	norm := normalize(content)
-	normTokens := strings.Fields(norm)
 
-	for i, r := range c.raw {
-		if r == content {
-			return Result{IsRedundant: true, Kind: "exact", Similarity: 1.0}
-		}
-		if c.normalized[i] == norm {
-			return Result{IsRedundant: true, Kind: "normalized", Similarity: 1.0}
-		}
-		sim := jaccard(normTokens, strings.Fields(c.normalized[i]))
+	if c.exactIdx[content] {
+		return Result{IsRedundant: true, Kind: "exact", Similarity: 1.0}
+	}
+	if c.normalizedIdx[norm] {
+		return Result{IsRedundant: true, Kind: "normalized", Similarity: 1.0}
+	}
+
+	normTokens := strings.Fields(norm)
+	for _, n := range c.normalized {
+		sim := jaccard(normTokens, strings.Fields(n))
 		if sim >= threshold {
 			return Result{IsRedundant: true, Kind: "similar", Similarity: sim}
 		}
diff --git a/internal/redundancy/redundancy_test.go b/internal/redundancy/redundancy_test.go
index 9dc0be5..3a57f00 100644
--- a/internal/redundancy/redundancy_test.go
+++ b/internal/redundancy/redundancy_test.go
@@ -1,6 +1,7 @@
 package redundancy_test
 
 import (
+	"strings"
 	"testing"
 
 	"github.com/pythondatascrape/engram/internal/redundancy"
@@ -45,3 +46,50 @@ func TestFirstCallNotRedundant(t *testing.T) {
 	result := c.Check("lang=go")
 	require.False(t, result.IsRedundant)
 }
+
+// TestExactLookupUsesIndex verifies that exact duplicates are found via O(1) map lookup.
+// The Checker must expose an ExactIndex() map for inspection.
+func TestExactLookupUsesIndex(t *testing.T) {
+	c := redundancy.NewChecker(0.9)
+	c.Record("lang=go arch=monolith")
+	c.Record("lang=rust framework=axum")
+	idx := c.ExactIndex()
+	require.True(t, idx["lang=go arch=monolith"], "exact index must contain recorded entry")
+	require.True(t, idx["lang=rust framework=axum"], "exact index must contain second entry")
+	require.False(t, idx["lang=go"], "non-recorded string must not appear in exact index")
+}
+
+// TestNormalizedLookupUsesIndex verifies that Record stores the sorted-token form in the normalized index.
+func TestNormalizedLookupUsesIndex(t *testing.T) {
+	c := redundancy.NewChecker(0.9)
+	c.Record("lang=go arch=monolith")
+	idx := c.NormalizedIndex()
+	// normalized form of "lang=go arch=monolith" is tokens sorted: "arch=monolith lang=go"
+	require.True(t, idx["arch=monolith lang=go"], "normalized index must contain sorted-token form")
+}
+
+// TestLargeExactIndex checks that an exact duplicate is still reported as "exact" with 10,000 recorded entries.
+func TestLargeExactIndex(t *testing.T) {
+	c := redundancy.NewChecker(0.9)
+	const n = 10_000
+	for i := 0; i < n; i++ {
+		c.Record(strings.Repeat("x", i+1))
+	}
+	target := strings.Repeat("x", n/2)
+	result := c.Check(target)
+	require.True(t, result.IsRedundant)
+	require.Equal(t, "exact", result.Kind)
+}
+
+// TestLargeNormalizedIndex checks that a reordered duplicate is reported as "normalized" with 10,000 recorded entries.
+func TestLargeNormalizedIndex(t *testing.T) {
+	c := redundancy.NewChecker(0.9)
+	const n = 10_000
+	for i := 0; i < n; i++ {
+		c.Record("a " + strings.Repeat("y", i+1))
+	}
+	// Same two tokens as a recorded entry but in the opposite order, so only the normalized index can match.
+	result := c.Check(strings.Repeat("y", n/2) + " a")
+	require.True(t, result.IsRedundant)
+	require.Equal(t, "normalized", result.Kind)
+}