Skip to content
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ Optimizations

* GITHUB#15597, GITHUB#15777: Reduce memory usage of NeighborArray (Viliam Durina)

* GITHUB#15606: Utilize bulk scoring for NeighborArray#isWorstNonDiverse (Luis Negrin)

Bug Fixes
---------------------
* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
Expand Down
40 changes: 19 additions & 21 deletions lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java
Original file line number Diff line number Diff line change
Expand Up @@ -292,13 +292,19 @@ private int findWorstNonDiverse(UpdateableRandomVectorScorer scorer) throws IOEx
int[] uncheckedIndexes = sort(scorer);
assert uncheckedIndexes != null : "We will always have something unchecked";
int uncheckedCursor = uncheckedIndexes.length - 1;
int[] uncheckedNodes = new int[uncheckedIndexes.length];
for (int i = uncheckedCursor; i >= 0; i--) {
uncheckedNodes[i] = nodes[uncheckedIndexes[i]];
}
float[] bulkScores = new float[size];
for (int i = size - 1; i > 0; i--) {
if (uncheckedCursor < 0) {
// no unchecked node left
break;
}
scorer.setScoringOrdinal(nodes[i]);
if (isWorstNonDiverse(i, uncheckedIndexes, uncheckedCursor, scorer)) {
if (isWorstNonDiverse(
i, uncheckedIndexes, uncheckedCursor, scorer, uncheckedNodes, bulkScores)) {
return i;
}
if (i == uncheckedIndexes[uncheckedCursor]) {
Expand All @@ -309,31 +315,23 @@ private int findWorstNonDiverse(UpdateableRandomVectorScorer scorer) throws IOEx
}

private boolean isWorstNonDiverse(
int candidateIndex, int[] uncheckedIndexes, int uncheckedCursor, RandomVectorScorer scorer)
int candidateIndex,
int[] uncheckedIndexes,
int uncheckedCursor,
RandomVectorScorer scorer,
int[] uncheckedNodes,
float[] bulkScores)
throws IOException {
float minAcceptedSimilarity = scores[candidateIndex];
if (candidateIndex == uncheckedIndexes[uncheckedCursor]) {
// the candidate itself is unchecked
for (int i = candidateIndex - 1; i >= 0; i--) {
float neighborSimilarity = scorer.score(nodes[i]);
// candidate node is too similar to node i given its score relative to the base node
if (neighborSimilarity >= minAcceptedSimilarity) {
return true;
}
}
} else {
// else we just need to make sure candidate does not violate diversity with the (newly
// inserted) unchecked nodes
assert candidateIndex > uncheckedIndexes[uncheckedCursor];
for (int i = uncheckedCursor; i >= 0; i--) {
float neighborSimilarity = scorer.score(nodes[uncheckedIndexes[i]]);
// candidate node is too similar to node i given its score relative to the base node
if (neighborSimilarity >= minAcceptedSimilarity) {
return true;
}
}
return scorer.bulkScore(nodes, bulkScores, candidateIndex) >= minAcceptedSimilarity;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like we only want to score uncheckedIndexes right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @benwtrent , thanks for the review!
Not exactly, it depends on whether the candidate itself is checked or unchecked.

Case 1 (candidate is unchecked): brand new node, never been diversity-checked against anyone, so we score against all better neighbors (0..candidateIndex-1) not just unchecked ones.
Case 2 (candidate is checked): already survived diversity checks against all checked nodes, so we only score against the newly added unchecked nodes.

This is the same logic as before my change, I only replaced the per-node scorer.score() loop with bulkScore.

That being said, looking at it again I made two small improvements: renamed bulkScoreNodes to uncheckedNodes to make it clearer what it actually contains, and moved the population of uncheckedNodes up into findWorstNonDiverse so it's built once and reused across calls to isWorstNonDiverse rather than being recreated every time.

}
return false;
// else we just need to make sure candidate does not violate diversity with the (newly
// inserted) unchecked nodes
assert candidateIndex > uncheckedIndexes[uncheckedCursor];
return scorer.bulkScore(uncheckedNodes, bulkScores, uncheckedCursor + 1)
>= minAcceptedSimilarity;
}

public int maxSize() {
Expand Down
Loading