From 72fddd8366d4c0c8ce53d702b1492036426eecc1 Mon Sep 17 00:00:00 2001 From: Luis Negrin Date: Mon, 13 Apr 2026 16:50:45 -0400 Subject: [PATCH 1/2] Utilize bulk scoring interface during HNSW graph builder NeighborArray#isWorstNonDiverse --- lucene/CHANGES.txt | 2 + .../lucene/util/hnsw/NeighborArray.java | 40 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f0274c83c98d..5ae26ec8b3b1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -132,6 +132,8 @@ Optimizations * GITHUB#15597, GITHUB#15777: Reduce memory usage of NeighborArray (Viliam Durina) +* GITHUB#15606: Utilize bulk scoring for NeighborArray#isWorstNonDiverse (Luis Negrin) + Bug Fixes --------------------- * GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java index 3ddba160b44e..70af4af097dc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java @@ -292,13 +292,16 @@ private int findWorstNonDiverse(UpdateableRandomVectorScorer scorer) throws IOEx int[] uncheckedIndexes = sort(scorer); assert uncheckedIndexes != null : "We will always have something unchecked"; int uncheckedCursor = uncheckedIndexes.length - 1; + int[] bulkScoreNodes = new int[size]; + float[] bulkScores = new float[size]; for (int i = size - 1; i > 0; i--) { if (uncheckedCursor < 0) { // no unchecked node left break; } scorer.setScoringOrdinal(nodes[i]); - if (isWorstNonDiverse(i, uncheckedIndexes, uncheckedCursor, scorer)) { + if (isWorstNonDiverse( + i, uncheckedIndexes, uncheckedCursor, scorer, bulkScoreNodes, bulkScores)) { return i; } if (i == uncheckedIndexes[uncheckedCursor]) { @@ -309,31 +312,26 @@ private int findWorstNonDiverse(UpdateableRandomVectorScorer scorer) throws IOEx } private boolean isWorstNonDiverse( - int candidateIndex, int[] uncheckedIndexes, int uncheckedCursor, RandomVectorScorer scorer) + int candidateIndex, + int[] uncheckedIndexes, + int uncheckedCursor, + RandomVectorScorer scorer, + int[] bulkScoreNodes, + float[] bulkScores) throws IOException { float minAcceptedSimilarity = scores[candidateIndex]; if (candidateIndex == uncheckedIndexes[uncheckedCursor]) { // the candidate itself is unchecked - for (int i = candidateIndex - 1; i >= 0; i--) { - float neighborSimilarity = scorer.score(nodes[i]); - // candidate node is too similar to node i given its score relative to the base node - if (neighborSimilarity >= minAcceptedSimilarity) { - return true; - } - } - } else { - // else we just need to make sure candidate does not violate diversity with the (newly - // inserted) unchecked nodes - assert candidateIndex > uncheckedIndexes[uncheckedCursor]; - for (int i = uncheckedCursor; i >= 0; i--) { - float neighborSimilarity = scorer.score(nodes[uncheckedIndexes[i]]); - // candidate node is too similar to node i given its score relative to the base node - if (neighborSimilarity >= minAcceptedSimilarity) { - return true; - } - } + return scorer.bulkScore(nodes, bulkScores, candidateIndex) >= minAcceptedSimilarity; + } + // else we just need to make sure candidate does not violate diversity with the (newly + // inserted) unchecked nodes + assert candidateIndex > uncheckedIndexes[uncheckedCursor]; + for (int i = uncheckedCursor; i >= 0; i--) { + bulkScoreNodes[i] = nodes[uncheckedIndexes[i]]; } - return false; + return scorer.bulkScore(bulkScoreNodes, bulkScores, uncheckedCursor + 1) + >= minAcceptedSimilarity; } public int maxSize() { From e3e5261224930a9a32fa56437b5043e195c441da Mon Sep 17 00:00:00 2001 From: Luis Negrin Date: Thu, 28 May 2026 14:15:38 -0400 Subject: [PATCH 2/2] Refactor: populate uncheckedNodes upfront in findWorstNonDiverse --- .../org/apache/lucene/util/hnsw/NeighborArray.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java index 70af4af097dc..41ff20ac105f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java @@ -292,7 +292,10 @@ private int findWorstNonDiverse(UpdateableRandomVectorScorer scorer) throws IOEx int[] uncheckedIndexes = sort(scorer); assert uncheckedIndexes != null : "We will always have something unchecked"; int uncheckedCursor = uncheckedIndexes.length - 1; - int[] bulkScoreNodes = new int[size]; + int[] uncheckedNodes = new int[uncheckedIndexes.length]; + for (int i = uncheckedCursor; i >= 0; i--) { + uncheckedNodes[i] = nodes[uncheckedIndexes[i]]; + } float[] bulkScores = new float[size]; for (int i = size - 1; i > 0; i--) { if (uncheckedCursor < 0) { @@ -301,7 +304,7 @@ private int findWorstNonDiverse(UpdateableRandomVectorScorer scorer) throws IOEx } scorer.setScoringOrdinal(nodes[i]); if (isWorstNonDiverse( - i, uncheckedIndexes, uncheckedCursor, scorer, bulkScoreNodes, bulkScores)) { + i, uncheckedIndexes, uncheckedCursor, scorer, uncheckedNodes, bulkScores)) { return i; } if (i == uncheckedIndexes[uncheckedCursor]) { @@ -316,7 +319,7 @@ private boolean isWorstNonDiverse( int[] uncheckedIndexes, int uncheckedCursor, RandomVectorScorer scorer, - int[] bulkScoreNodes, + int[] uncheckedNodes, float[] bulkScores) throws IOException { float minAcceptedSimilarity = scores[candidateIndex]; @@ -327,10 +330,7 @@ private boolean isWorstNonDiverse( // else we just need to make sure candidate does not violate diversity with the (newly // inserted) unchecked nodes assert candidateIndex > uncheckedIndexes[uncheckedCursor]; - for (int i = uncheckedCursor; i >= 0; i--) { - bulkScoreNodes[i] = nodes[uncheckedIndexes[i]]; - } - return scorer.bulkScore(bulkScoreNodes, bulkScores, uncheckedCursor + 1) + return scorer.bulkScore(uncheckedNodes, bulkScores, uncheckedCursor + 1) >= minAcceptedSimilarity; }