Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 45 additions & 15 deletions internal/redundancy/redundancy.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ type Result struct {
// Checker detects redundant content at three levels: exact, normalized, and similar.
// It is safe for concurrent use.
type Checker struct {
	// mu is held by the methods that read or write the slices and maps below.
	mu sync.Mutex
	// threshold is the Jaccard similarity threshold supplied to NewChecker.
	threshold float64
	// raw holds every recorded string exactly as it was passed to Record.
	raw []string
	// normalized holds normalize(content) for each entry of raw, in the same order.
	normalized []string
	// exactIdx has one key per recorded string, for O(1) exact lookup.
	exactIdx map[string]bool
	// normalizedIdx has one key per normalized string, for O(1) normalized lookup.
	normalizedIdx map[string]bool
}

// NewChecker creates a Checker with the given Jaccard similarity threshold.
// Both lookup indexes are allocated here so that Record can write to them
// without a nil-map panic.
func NewChecker(threshold float64) *Checker {
	return &Checker{
		threshold:     threshold,
		exactIdx:      make(map[string]bool),
		normalizedIdx: make(map[string]bool),
	}
}

// normalize sorts whitespace-separated tokens so order-independent comparison works.
Expand All @@ -38,12 +44,35 @@ func normalize(s string) string {
func (c *Checker) Record(content string) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Normalize once and reuse the result for both the slice and the index.
	norm := normalize(content)

	// raw and normalized grow in lockstep: exactly one entry each per call.
	c.raw = append(c.raw, content)
	c.normalized = append(c.normalized, norm)

	// Mirror the new entry into the lookup indexes.
	c.exactIdx[content] = true
	c.normalizedIdx[norm] = true
}

// Check tests content against all previously recorded strings.
// It does NOT record the content — call Record separately if desired.
func (c *Checker) ExactIndex() map[string]bool {
c.mu.Lock()
Comment on lines 54 to +57
defer c.mu.Unlock()
out := make(map[string]bool, len(c.exactIdx))
for k, v := range c.exactIdx {
out[k] = v
}
return out
}

func (c *Checker) NormalizedIndex() map[string]bool {
c.mu.Lock()
defer c.mu.Unlock()
out := make(map[string]bool, len(c.normalizedIdx))
for k, v := range c.normalizedIdx {
out[k] = v
}
return out
}

func (c *Checker) Check(content string) Result {
return c.CheckWithThreshold(content, c.threshold)
}
Expand All @@ -55,16 +84,17 @@ func (c *Checker) CheckWithThreshold(content string, threshold float64) Result {
defer c.mu.Unlock()

norm := normalize(content)
normTokens := strings.Fields(norm)

for i, r := range c.raw {
if r == content {
return Result{IsRedundant: true, Kind: "exact", Similarity: 1.0}
}
if c.normalized[i] == norm {
return Result{IsRedundant: true, Kind: "normalized", Similarity: 1.0}
}
sim := jaccard(normTokens, strings.Fields(c.normalized[i]))
if c.exactIdx[content] {
return Result{IsRedundant: true, Kind: "exact", Similarity: 1.0}
}
if c.normalizedIdx[norm] {
return Result{IsRedundant: true, Kind: "normalized", Similarity: 1.0}
}

normTokens := strings.Fields(norm)
for _, n := range c.normalized {
sim := jaccard(normTokens, strings.Fields(n))
if sim >= threshold {
return Result{IsRedundant: true, Kind: "similar", Similarity: sim}
}
Expand Down
48 changes: 48 additions & 0 deletions internal/redundancy/redundancy_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package redundancy_test

import (
"strings"
"testing"

"github.com/pythondatascrape/engram/internal/redundancy"
Expand Down Expand Up @@ -45,3 +46,50 @@ func TestFirstCallNotRedundant(t *testing.T) {
result := c.Check("lang=go")
require.False(t, result.IsRedundant)
}

// TestExactLookupUsesIndex records two strings and inspects the map returned
// by ExactIndex: both recorded strings must be present, and a string that was
// never recorded must be absent.
func TestExactLookupUsesIndex(t *testing.T) {
	checker := redundancy.NewChecker(0.9)
	for _, entry := range []string{"lang=go arch=monolith", "lang=rust framework=axum"} {
		checker.Record(entry)
	}

	exact := checker.ExactIndex()

	require.True(t, exact["lang=go arch=monolith"], "exact index must contain recorded entry")
	require.True(t, exact["lang=rust framework=axum"], "exact index must contain second entry")
	require.False(t, exact["lang=go"], "non-recorded string must not appear in exact index")
}

// TestNormalizedLookupUsesIndex records one string and checks that the map
// returned by NormalizedIndex is keyed by its sorted-token form.
func TestNormalizedLookupUsesIndex(t *testing.T) {
	checker := redundancy.NewChecker(0.9)
	checker.Record("lang=go arch=monolith")

	// Sorting the tokens of "lang=go arch=monolith" gives "arch=monolith lang=go".
	const sortedForm = "arch=monolith lang=go"
	normalized := checker.NormalizedIndex()
	require.True(t, normalized[sortedForm], "normalized index must contain sorted-token form")
}

// TestLargeExactIndex ensures exact match stays fast with many entries (index must not degrade).
func TestLargeExactIndex(t *testing.T) {
Comment on lines +71 to +72
c := redundancy.NewChecker(0.9)
const n = 10_000
for i := 0; i < n; i++ {
c.Record(strings.Repeat("x", i+1))
}
target := strings.Repeat("x", n/2)
result := c.Check(target)
require.True(t, result.IsRedundant)
require.Equal(t, "exact", result.Kind)
}

// TestLargeNormalizedIndex ensures normalized match stays fast with many entries.
func TestLargeNormalizedIndex(t *testing.T) {
Comment on lines +84 to +85
c := redundancy.NewChecker(0.9)
const n = 10_000
for i := 0; i < n; i++ {
c.Record(strings.Repeat("y", i+1))
}
// Check reverse-order of a recorded entry — same tokens, different order (single token so same).
target := strings.Repeat("y", n/2)
result := c.Check(target)
require.True(t, result.IsRedundant)
Comment on lines +89 to +94
}
Loading