Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
ec4255f
utils
naknomum Jul 25, 2025
abf604e
wip Embedding object and (alleged) persistence in db, including pgvec…
naknomum Jul 25, 2025
ee3a649
more futzing with vector-vs-datanucleus
naknomum Jul 28, 2025
eaf3ee2
more wranglin with datanucleus and vectors
naknomum Jul 30, 2025
0864d2c
oh yeah this too
naknomum Jul 30, 2025
104f7fd
WIP: import embeddins json from ML world
naknomum Jul 30, 2025
d820855
more embedding import work
naknomum Jul 30, 2025
62fcd75
actual indexing of embeddings into opensearch; more emb/vector method…
naknomum Jul 31, 2025
d68e177
actually name the individual and link to enc sighhhhhh
naknomum Jul 31, 2025
7720715
enable knn on annotation index creation
naknomum Jul 31, 2025
0e32c77
generate vector-based match query (how many levels is too deep for js…
naknomum Jul 31, 2025
5acc4a7
actual getMatches() !!!
naknomum Jul 31, 2025
b6b0ed7
utility to test embedding-based matching
naknomum Jul 31, 2025
5715f08
lets not blow up when we dont have an annotation/embedding match (and…
naknomum Aug 7, 2025
dbef9cc
step 0.00 of MLService (and linting)
naknomum Aug 14, 2025
88077c1
Merge branch 'main' into 1183_vectors
naknomum Aug 14, 2025
e5f8218
JSONArray -> List<Integer> util
naknomum Aug 27, 2025
2dba4d2
hooray: IAException; more MLService config/conn/etc
naknomum Aug 27, 2025
c3a1ac9
ok, now with Double
naknomum Aug 27, 2025
c69c8df
MLService jobs requeue into detection queue
naknomum Aug 27, 2025
5df5764
bunch of ways to compare annotations
naknomum Aug 27, 2025
43272aa
ma.findAnnotation(ann) seems useful
naknomum Aug 27, 2025
72cef84
WIP creating actual annotations from MLService results
naknomum Aug 27, 2025
902ed40
wip
naknomum Aug 29, 2025
c14e7bc
Merge branch 'main' into 1183_vectors
naknomum Sep 30, 2025
f0a1f0b
Merge branch 'main' into 1183_vectors
naknomum Oct 9, 2025
31922f4
apply new /extract endpoint for annotations [wip]
naknomum Oct 9, 2025
d4ae7d9
silly trailing slash!
naknomum Oct 9, 2025
f332747
even *more* jsony
naknomum Oct 9, 2025
eb647b3
grrr
naknomum Oct 9, 2025
93b44db
utility function to compare vectors
naknomum Oct 10, 2025
09a21fb
findEmbeddingByVector()
naknomum Oct 10, 2025
434357c
create Embedding from results, add to Annotation if applicable
naknomum Oct 10, 2025
6217c7a
Merge branch 'main' into 1183_vectors
naknomum Oct 14, 2025
943a53e
send for embeddings when new annotations made
naknomum Oct 15, 2025
5dae83a
short circuit; linting
naknomum Oct 15, 2025
bdb1289
cleanup and convenience method
naknomum Oct 15, 2025
01cac00
lets have a Task for embedding calls; *temporarily* short-circuit the…
naknomum Oct 15, 2025
d68389e
let embedding extraction fake a callback (to initiate identification)…
naknomum Oct 15, 2025
ec412fc
simple embeddings listing in obrowse
naknomum Oct 16, 2025
148f896
handle embedding extraction "completion" differently [wip]
naknomum Oct 16, 2025
f86a33a
fill out annot map for triggering ident kickoff [wip]
naknomum Oct 16, 2025
b33f104
bugfix
naknomum Oct 16, 2025
2e9b778
oops actually use the boolean we made for this express purpose
naknomum Oct 16, 2025
0364086
release the annots!
naknomum Oct 16, 2025
1640038
wip: sending annots to wbia using own id
naknomum Oct 23, 2025
4a137fb
check ids when forced [wip]
naknomum Oct 23, 2025
fa92554
bugfix
naknomum Oct 23, 2025
1bdacba
test of force-id for images
naknomum Nov 5, 2025
575d3e4
attempt to gracefully move on to ident if embedding not enabled
naknomum Nov 5, 2025
cbeebc9
JsonProperties @-aliasing will never not be confusing to me
naknomum Nov 14, 2025
496832f
Merge branch 'main' into 1183_vectors
naknomum Dec 10, 2025
5c68b03
docker-compose differences for psql vector support and OpenSearch vec…
naknomum Dec 10, 2025
d0af815
bugfix: dont fall into the trap of using ann.theta... ann.getTheta() …
naknomum Dec 10, 2025
079a479
unit test for new Annotation comparing methods
naknomum Dec 10, 2025
6e998b3
unit tests for basic Embedding stuff; some bugfixes said tests necess…
naknomum Dec 10, 2025
068a766
Merge branch 'main' into 1183_vectors
naknomum Dec 11, 2025
9e0ffa2
Merge branch 'main' into 1183_vectors
naknomum Dec 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions devops/development/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
services:
db:
image: postgres:13.4
# pgvector support in the db image is not strictly needed yet: DataNucleus cannot persist the
# java PGvector type to the database ... but the image change is noted here anyway
# image: postgres:13.4
image: pgvector/pgvector:pg13
healthcheck:
test: [ "CMD-SHELL", "pg_isready -U postgres || exit 1" ]
interval: 10s
Expand Down Expand Up @@ -62,7 +65,7 @@ services:
JAVA_OPTS: "-Djava.awt.headless=true -Xms4096m -Xmx4096m"

opensearch:
image: opensearchproject/opensearch:2.15.0
image: opensearchproject/opensearch:3.1.0
healthcheck:
test: [ "CMD-SHELL", "curl --silent --fail 127.0.0.1:9200/_cluster/health || exit 1" ]
interval: 10s
Expand Down
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,12 @@
<version>2.0</version>
</dependency>

<dependency>
<groupId>com.pgvector</groupId>
<artifactId>pgvector</artifactId>
<version>0.1.6</version>
</dependency>

<dependency>
<groupId>org.opensearch.client</groupId>
<artifactId>opensearch-java</artifactId>
Expand Down
235 changes: 235 additions & 0 deletions src/main/java/org/ecocean/Annotation.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
Expand All @@ -19,6 +20,7 @@
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.ecocean.api.ApiException;
import org.ecocean.ia.IA;
import org.ecocean.ia.MLService;
import org.ecocean.ia.Task;
import org.ecocean.identity.IBEISIA;
import org.ecocean.media.Feature;
Expand Down Expand Up @@ -51,6 +53,7 @@ public Annotation() {}
private Boolean isOfInterest = null; // aka AoI (Annotation of Interest)
protected String identificationStatus;
private ArrayList<Feature> features;
private Set<Embedding> embeddings;
protected String acmId;

// this is used to decide "should we match against this" problem is: that is not very (IA-)algorithm agnostic
Expand Down Expand Up @@ -156,6 +159,24 @@ public JSONObject opensearchMapping() {
// all case-insensitive keyword-ish types
// map.put("fubar", keywordNormalType);

// embeddings have some metadata (algorithm etc)
// and then the vector that is the embedding
JSONObject embMap = new JSONObject();
embMap.put("type", "nested");
embMap.put("dynamic", false);
JSONObject embProps = new JSONObject();
embProps.put("method", keywordType);
embProps.put("methodVersion", keywordType);
JSONObject embVect = new JSONObject();
// https://docs.opensearch.org/docs/latest/vector-search/creating-vector-index/
embVect.put("type", "knn_vector");
embVect.put("dimension", Embedding.getVectorDimension());
embVect.put("space_type", "l2");
// TODO: set any remaining knn_vector options (method, engine, etc) as needed
embProps.put("vector", embVect);
embMap.put("properties", embProps);
map.put("embeddings", embMap);

return map;
}

Expand Down Expand Up @@ -194,6 +215,28 @@ public void opensearchDocumentSerializer(JsonGenerator jgen, Shepherd myShepherd
if (tod > 0) jgen.writeNumberField("encounterIndividualTimeOfDeath", tod);
}
}
jgen.writeArrayFieldStart("embeddings");
if (this.embeddings != null)
for (Embedding emb : this.embeddings) {
jgen.writeStartObject();
jgen.writeStringField("id", emb.getId());
jgen.writeStringField("method", emb.getMethod());
jgen.writeStringField("methodVersion", emb.getMethodVersion());
jgen.writeNumberField("created", emb.getCreated());

float[] vecFloat = emb.vectorToFloatArray();
// System.out.println("[INFO] indexing emb " + emb.getId() + " vector length " + ((vecFloat == null) ? "null" : vecFloat.length));
if ((vecFloat != null) && (vecFloat.length > 0)) {
jgen.writeFieldName("vector");
jgen.writeStartArray();
for (int i = 0; i < vecFloat.length; i++) {
jgen.writeNumber(vecFloat[i]);
}
jgen.writeEndArray();
}
jgen.writeEndObject();
}
jgen.writeEndArray();
}

// TODO should this also be limited by matchAgainst and acmId?
Expand Down Expand Up @@ -684,6 +727,44 @@ public int[] getBbox() {
return bbox;
}

// true when both annotations have a bbox and the two arrays match in length and content;
// a null other, or a missing bbox on either side, is never considered equal
public boolean equalsBbox(Annotation other) {
    if (other == null) return false;
    int[] ours = this.getBbox();
    if (ours == null) return false;
    int[] theirs = other.getBbox();
    if (theirs == null) return false;
    return Arrays.equals(ours, theirs);
}

// compares rotation only, using an exact == on the two getTheta() values (no tolerance);
// a null other is never equal
public boolean equalsTheta(Annotation other) {
    return (other != null) && (this.getTheta() == other.getTheta());
}

// combines theta + bbox: both must match (theta is checked first, bbox only if theta matched)
public boolean equalsShape(Annotation other) {
    return equalsTheta(other) && equalsBbox(other);
}

// null-tolerant comparison of iaClass values; a null other is never equal,
// but two annotations which *both* lack an iaClass count as equal (sketchy?)
public boolean equalsIAClass(Annotation other) {
    if (other == null) return false;
    if (iaClass == null) return (other.getIAClass() == null);
    return iaClass.equals(other.getIAClass());
}

// null-tolerant comparison of viewpoint values; a null other is never equal,
// but two annotations which *both* lack a viewpoint count as equal (sketchy?)
public boolean equalsViewpoint(Annotation other) {
    if (other == null) return false;
    if (viewpoint == null) return (other.getViewpoint() == null);
    return viewpoint.equals(other.getViewpoint());
}

// the bbox rendered by Arrays.toString(), e.g. "[1, 2, 3, 4]" ("null" when there is no bbox)
public String getBboxAsString() {
    int[] box = this.getBbox();
    return Arrays.toString(box);
}
Expand All @@ -694,6 +775,7 @@ public String toString() {
.append("species", species)
.append("iaClass", iaClass)
.append("bbox", getBbox())
.append("numEmbed", numberEmbeddings())
.toString();
}

Expand Down Expand Up @@ -1011,6 +1093,71 @@ public ArrayList<Annotation> getMatchingSet(Shepherd myShepherd, JSONObject task
return anns;
}

// a variation of matchingSet query, but includes the vector stuff - thus returns actual matches(!)
// method and methodVersion are used to determine *which* embedding to use; if method is null it
// will use the 1st embedding found
// returns null when this annot has no embedding (or only one without vector data) to match with
//
// the resulting query is the matching-set query with one extra entry appended to its bool filter:
// a nested query on "embeddings" whose must-clauses are the knn search (k=5) on
// "embeddings.vector" plus a term clause for each of method / methodVersion that is non-null
public JSONObject getMatchQuery(Shepherd myShepherd, JSONObject taskParams, boolean useClauses,
    String method, String methodVersion) {
    Embedding emb = getEmbeddingByMethod(method, methodVersion);
    if (emb == null) return null;
    float[] vector = emb.vectorToFloatArray();
    // an embedding without vector data cannot drive a knn query, so treat it like "no embedding"
    if ((vector == null) || (vector.length < 1)) return null;

    JSONObject query = getMatchingSetQuery(myShepherd, taskParams, useClauses);

    JSONObject knnParams = new JSONObject();
    knnParams.put("vector", new JSONArray(vector));
    knnParams.put("k", 5);
    JSONObject knn = new JSONObject();
    knn.put("knn", new JSONObject().put("embeddings.vector", knnParams));

    JSONArray must = new JSONArray();
    must.put(knn);
    // values are set via put() (not concatenated into JSON text) so that quotes, backslashes, etc
    // in method/methodVersion are escaped properly and cannot alter the query structure
    if (method != null) {
        must.put(new JSONObject().put("term",
            new JSONObject().put("embeddings.method", method)));
    }
    if (methodVersion != null) {
        must.put(new JSONObject().put("term",
            new JSONObject().put("embeddings.methodVersion", methodVersion)));
    }

    JSONObject nestedBody = new JSONObject();
    nestedBody.put("path", "embeddings");
    nestedBody.put("query", new JSONObject().put("bool", new JSONObject().put("must", must)));
    JSONObject nested = new JSONObject().put("nested", nestedBody);

    // this relies on getMatchingSetQuery() having produced query.bool.filter as an array
    query.getJSONObject("query").getJSONObject("bool").getJSONArray("filter").put(nested);
    return query;
}

// finds annotations based on embedding vector matches
// null means we didnt have an embedding to query with; otherwise a (possibly empty) list
// note: a failed query is logged rather than thrown; the list is then built from whatever
// OpenSearch.getHits() yields for the (possibly null) response
public List<Annotation> getMatches(Shepherd myShepherd, JSONObject taskParams,
    boolean useClauses, String method, String methodVersion) {
    JSONObject query = getMatchQuery(myShepherd, taskParams, useClauses, method, methodVersion);
    if (query == null) return null;

    List<Annotation> anns = new ArrayList<Annotation>();
    OpenSearch os = new OpenSearch();
    long startTime = System.currentTimeMillis();
    JSONObject queryRes = null;
    int hitSize = -1; // stays -1 when the response carries no hits.total section
    try {
        int pageSize = 10000;
        try {
            pageSize = os.getSettings("annotation").optInt("max_result_window", 10000);
        } catch (Exception ignored) {
            // best effort: keep the default page size when the index settings cannot be read
        }
        // NOTE(review): presumably this discards any existing point-in-time so the query sees
        // current data -- confirm against OpenSearch.deletePit()
        os.deletePit("annotation");
        queryRes = os.queryPit("annotation", query, 0, pageSize, null, null);
        // walk hits.total.value null-safely; a missing section should not get reported
        // (via NullPointerException) as if the query itself had failed
        JSONObject hitsSection = (queryRes == null) ? null : queryRes.optJSONObject("hits");
        JSONObject total = (hitsSection == null) ? null : hitsSection.optJSONObject("total");
        if (total != null) hitSize = total.optInt("value");
    } catch (Exception ex) {
        System.out.println("getMatches() exception: " + ex);
        ex.printStackTrace();
    }
    JSONArray hits = OpenSearch.getHits(queryRes);
    for (int i = 0; i < hits.length(); i++) {
        JSONObject hit = hits.optJSONObject(i);
        if (hit == null) continue;
        // the document _id is used as the annotation id; hits we cannot load are skipped
        Annotation ann = myShepherd.getAnnotation(hit.optString("_id", null));
        if (ann != null) anns.add(ann);
    }
    System.out.println("getMatches() results: hitSize=" + hitSize + "; hits length=" +
        hits.length() + "; anns size=" + anns.size() + "; " +
        (System.currentTimeMillis() - startTime) + "ms");
    return anns;
}

/*
sorta weird to have this in here, but it is inherently linked with getMatchingSetXXX() above ...
this is a string that uniquely identifies the matchingSet, dependent of content (e.g. cant be based on content uuids)
Expand Down Expand Up @@ -1419,6 +1566,18 @@ public static Base createFromApi(JSONObject payload, List<File> files, Shepherd
return ann;
}

// hands this annotation (plus its species and the given task) to MLService.initiateRequest();
// an IOException is logged here rather than propagated to the caller
public void queueForEmbeddingExtraction(Task task, Shepherd myShepherd) {
    try {
        MLService service = new MLService();
        service.initiateRequest(this, this.getSpecies(myShepherd), task);
    } catch (IOException ioEx) {
        String msg = "[ERROR] queueForEmbeddingExtraction() failed on " + this + ": " + ioEx;
        System.out.println(msg);
        ioEx.printStackTrace();
    }
}

public static Object validateFieldValue(String fieldName, JSONObject data)
throws ApiException {
if (data == null) throw new ApiException("empty payload");
Expand Down Expand Up @@ -1651,6 +1810,82 @@ private Rectangle getRect(Annotation ann) {
return null;
}

// the embeddings attached to this annotation; this is the internal set itself (not a copy)
// and will be null if none has ever been set/added
public Set<Embedding> getEmbeddings() {
    return this.embeddings;
}

// how many embeddings we have, as reported by Util.collectionSize()
public int numberEmbeddings() {
    return Util.collectionSize(this.embeddings);
}

// adds emb to our set (creating the set when needed) and makes sure emb points back at this
// annotation; a null emb adds nothing. always returns the (non-null) set of embeddings
public Set<Embedding> addEmbedding(Embedding emb) {
    if (embeddings == null) {
        embeddings = new HashSet<Embedding>();
    }
    if (emb != null) {
        embeddings.add(emb);
        // only touch the back-reference when it is not already (id-)equal to us
        boolean alreadyLinked = this.equals(emb.getAnnotation());
        if (!alreadyLinked) emb.setAnnotation(this);
    }
    return embeddings;
}

// true when emb is in our set (per Set.contains()); false when we have no set at all
public boolean hasEmbedding(Embedding emb) {
    return (embeddings != null) && embeddings.contains(emb);
}

// since embeddings is a set, there isnt really an order so...
// which one comes back is pretty much random (first in iteration); null if we have none
public Embedding getAnEmbedding() {
    final String anyMethod = null;
    final String anyVersion = null;
    return getEmbeddingByMethod(anyMethod, anyVersion);
}

// convenience: find by method only, accepting any methodVersion
public Embedding getEmbeddingByMethod(String method) {
    final String anyVersion = null;
    return getEmbeddingByMethod(method, anyVersion);
}

// returns the first embedding (in set-iteration order) which matches, or null if none does
//   - null method: the very first embedding is returned (methodVersion is not looked at)
//   - null methodVersion: any version of the given method is accepted
// suppose we could order by created?
public Embedding getEmbeddingByMethod(String method, String methodVersion) {
    if (numberEmbeddings() < 1) return null;
    for (Embedding candidate : embeddings) {
        if (method == null) return candidate;
        if (method.equals(candidate.getMethod()) &&
            ((methodVersion == null) || methodVersion.equals(candidate.getMethodVersion()))) {
            return candidate;
        }
    }
    return null;
}

// this will match only vector (not other properties), as decided by Embedding.hasEqualVector()
// returns the first of our embeddings which matches; null if none (or if find is null)
public Embedding findEmbeddingByVector(Embedding find) {
    if ((find == null) || (numberEmbeddings() < 1)) return null;
    for (Embedding candidate : embeddings) {
        if (candidate.hasEqualVector(find)) return candidate;
    }
    return null;
}

/*
public void loadEmbeddingVectors(Shepherd myShepherd) {
if (embeddings == null) return;
for (Embedding emb : this.embeddings) {
emb.loadVector(myShepherd);
}
}
*/

// need these two so we can use things like List.contains()
// note: this basically is "id-equivalence" rather than *content* equivalence, so will not compare semantic similarity of 2 annots
// an annotation always equals itself (as the Object.equals() contract requires); two distinct
// objects are equal only when both have a non-null id and those ids are equal
// NOTE: hashCode() is not stable for an annotation without an id, so hash-based collections
// remain unreliable for such objects
@Override public boolean equals(final Object o2) {
    if (this == o2) return true; // reflexive, even when id has not been set
    if (!(o2 instanceof Annotation)) return false; // instanceof is false for null as well
    Annotation two = (Annotation)o2;
    if ((this.id == null) || (two.getId() == null)) return false;
    return this.id.equals(two.getId());
}

public int hashCode() {
if (id == null) return Util.generateUUID().hashCode(); // random(ish) so we dont get two users with no uuid equals! :/
return id.hashCode();
Expand Down
Loading
Loading