diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..365ca53 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,38 @@ +name: CI + +on: + push: + branches: ["main"] + pull_request: + +jobs: + quality: + runs-on: ubuntu-latest + strategy: + matrix: + node-version: [20] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ matrix.node-version }} + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Build + run: npm run build + + - name: Lint + run: npm run lint + + - name: Test + run: npm test -- --runInBand + + - name: Coverage Threshold + run: npm run test:coverage diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..b8474f1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on Keep a Changelog, +and this project adheres to Semantic Versioning. + +## [Unreleased] + +### Added +- CI workflow for build, lint, test, and coverage. +- Persistence-focused test coverage for IndexedDB-backed index behavior. +- API reference and tuning guidance in README. + +### Changed +- Lint gate now targets published library sources (`src/**`, excluding benchmark CLI code). +- README persistence example now loads from the same DB name it saved to. + +### Fixed +- `HNSWWithDB.deleteIndex()` now awaits DB re-initialization. +- `HNSWWithDB` now surfaces initialization/load/delete errors instead of silently swallowing them. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9a9b4aa..1b6ebcc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,8 @@ Thanks for taking the time to improve this HNSW implementation. The checklist be - Add or update tests under `tests/` that capture the bug fix or feature so regressions are caught automatically. 
The `tests/HNSW.test.ts` suite shows how to build deterministic indices for verification. - Before committing, run the full set of quality gates: - `npm test` – runs the Jest harness - - `npm run lint` – checks the TypeScript sources with TSLint + - `npm run lint` – checks published TypeScript sources with TSLint + - `npm run lint:bench` – optional lint pass for benchmark CLI sources - `npm run build` – ensures the TypeScript compiler can emit the distributable files ## 4. Commit with context diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..65f5d40 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 deepfates.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index f97ac9c..7d67fa9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # HNSW +[![npm version](https://img.shields.io/npm/v/hnsw)](https://www.npmjs.com/package/hnsw) +[![license](https://img.shields.io/npm/l/hnsw)](./LICENSE) + This is a small Typescript package that implements the Hierarchical Navigable Small Worlds algorithm for approximate nearest neighbor search. I wrote this package because I wanted to do efficient vector search directly in the client browser. All the other implementations I found for TS were either bindings for libraries written in other languages, or dealt with WASM compilation complexity. @@ -61,8 +64,8 @@ const data = [ await index.buildIndex(data); await index.saveIndex(); -// Load the index -const index2 = await HNSWWithDB.create(16, 200, 'my-index-2', 50); +// Load the same index from disk +const index2 = await HNSWWithDB.create(16, 200, 'my-index', 50); await index2.loadIndex(); // Search for nearest neighbors @@ -77,6 +80,52 @@ Notes: - The `metric` determines how scores are computed: `cosine` uses cosine similarity and `euclidean` uses an inverse-distance similarity (higher is better in both cases). - `efSearch` controls query-time exploration and should be at least `k` for best recall. +## API Reference + +### `new HNSW(M, efConstruction, d?, metric?, efSearch?)` + +- `M`: Max neighbors stored per node and layer. Higher values usually improve recall at the cost of more memory. +- `efConstruction`: Build-time exploration depth. Higher values improve index quality at the cost of longer build time. +- `d`: Vector dimension. If omitted, it is inferred from the first inserted vector. +- `metric`: `cosine` or `euclidean`. +- `efSearch`: Query-time exploration depth. Higher values improve recall at the cost of higher query latency. + +### `buildIndex(data, options?)` + +- `data`: Array of `{ id, vector }`. +- `options.onProgress(current, total)`: Optional progress callback. +- `options.progressInterval`: Callback cadence (default `10000`). 
+ +### `searchKNN(query, k, options?)` + +- Returns up to `k` results with shape `{ id, score }`. +- `options.efSearch`: Per-query override. Effective search breadth is `max(k, efSearch)`. + +### `toJSON()` / `HNSW.fromJSON(json)` + +- Serialize and restore in-memory indices for transport or persistence. + +### `HNSWWithDB.create(M, efConstruction, dbName, efSearch?)` + +- Creates an IndexedDB-backed index (browser/runtime with IndexedDB support). +- `saveIndex()`: Persist current graph. +- `loadIndex()`: Load previously persisted graph (no-op if missing). +- `deleteIndex()`: Delete persisted graph and reinitialize DB. +- `close()`: Close the active IndexedDB connection. + +## Tuning Guide + +- Start with `M=16`, `efConstruction=200`, `efSearch=50`. +- Increase `efSearch` first when recall is too low. +- Increase `M` for tougher datasets when memory budget allows. +- Keep `efSearch >= k` for better recall consistency. + +## Limitations + +- This implementation prioritizes simplicity over peak throughput and memory efficiency. +- IndexedDB support depends on environment support for IndexedDB APIs. +- Benchmark tools under `src/bench` are maintained as CLI utilities and are not part of the runtime API surface. + ## Benchmarks A lightweight benchmark harness is available to validate recall/latency tradeoffs and the impact of parameters like `efSearch`, `M`, and `efConstruction`. 
diff --git a/jestconfig.json b/jestconfig.json index 118ffd8..ea225f4 100644 --- a/jestconfig.json +++ b/jestconfig.json @@ -1,7 +1,15 @@ { - "transform": { - "^.+\\.(t|j)sx?$": "ts-jest" - }, - "testRegex": "(/tests/.*|(\\.|/)(test|spec))\\.(jsx?|tsx?)$", - "moduleFileExtensions": ["ts", "tsx", "js", "jsx", "json", "node"] - } \ No newline at end of file + "transform": { + "^.+\\.(t|j)sx?$": "ts-jest" + }, + "testRegex": "(/tests/.*|(\\.|/)(test|spec))\\.(jsx?|tsx?)$", + "moduleFileExtensions": ["ts", "tsx", "js", "jsx", "json", "node"], + "coverageThreshold": { + "global": { + "branches": 65, + "functions": 80, + "lines": 80, + "statements": 80 + } + } +} diff --git a/package-lock.json b/package-lock.json index 9a5c0a0..86b9fe0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,6 +13,8 @@ }, "devDependencies": { "@types/jest": "^29.5.1", + "@types/node": "^20.11.30", + "fake-indexeddb": "^6.2.5", "jest": "^29.5.0", "prettier": "^2.8.8", "ts-jest": "^29.1.0", @@ -1059,10 +1061,14 @@ } }, "node_modules/@types/node": { - "version": "20.2.5", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.2.5.tgz", - "integrity": "sha512-JJulVEQXmiY9Px5axXHeYGLSjhkZEnD+MDPDGbCbIAbMslkKwmygtZFy1X6s/075Yo94sf8GuSlFfPzysQrWZQ==", - "dev": true + "version": "20.19.33", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.33.tgz", + "integrity": "sha512-Rs1bVAIdBs5gbTIKza/tgpMuG1k3U/UMJLWecIMxNdJFDMzcM5LOiLVRYh3PilWEYDIeUDv7bpiHPLPsbydGcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } }, "node_modules/@types/prettier": { "version": "2.7.2", @@ -1670,6 +1676,16 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/fake-indexeddb": { + "version": "6.2.5", + "resolved": "https://registry.npmjs.org/fake-indexeddb/-/fake-indexeddb-6.2.5.tgz", + "integrity": "sha512-CGnyrvbhPlWYMngksqrSSUT1BAVP49dZocrHuK0SvtR0D5TMs5wP0o3j7jexDJW01KSadjBp1M/71o/KR3nD1w==", + "dev": true, + "license": "Apache-2.0", 
+ "engines": { + "node": ">=18" + } + }, "node_modules/fast-json-stable-stringify": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", @@ -3621,6 +3637,13 @@ "node": ">=12.20" } }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, "node_modules/update-browserslist-db": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.11.tgz", diff --git a/package.json b/package.json index 06c8d86..68a9f67 100644 --- a/package.json +++ b/package.json @@ -3,6 +3,9 @@ "version": "1.0.3", "description": "A TypeScript implementation of HNSW (Hierarchical Navigable Small World) algorithm for approximate nearest neighbor search", "homepage": "https://github.com/deepfates/hnsw#readme", + "bugs": { + "url": "https://github.com/deepfates/hnsw/issues" + }, "repository": { "type": "git", "url": "https://github.com/deepfates/hnsw.git" @@ -11,13 +14,15 @@ "types": "dist/index.d.ts", "scripts": { "test": "jest --config jestconfig.json", + "test:coverage": "jest --config jestconfig.json --coverage --runInBand", "build": "tsc", "bench": "node dist/bench/run.js", "bench:download": "node dist/bench/download.js", "bench:report": "node dist/bench/report.js", "bench:compare": "node dist/bench/compare.js", "format": "prettier --write \"src/**/*.ts\"", - "lint": "tslint -p tsconfig.json", + "lint": "tslint -p tsconfig.lint.json", + "lint:bench": "tslint -p tsconfig.json src/bench/**/*.ts", "prepare": "npm run build", "prepublishOnly": "npm test && npm run lint", "preversion": "npm run lint", @@ -26,7 +31,12 @@ }, "keywords": [ "nearest neighbor", - "vector search" + "vector search", + "hnsw", + "ann", + "similarity search", + 
"indexeddb", + "browser" ], "author": "deepfates.com", "license": "MIT", @@ -36,6 +46,7 @@ "devDependencies": { "@types/jest": "^29.5.1", "@types/node": "^20.11.30", + "fake-indexeddb": "^6.2.5", "jest": "^29.5.0", "prettier": "^2.8.8", "ts-jest": "^29.1.0", diff --git a/src/db.ts b/src/db.ts index 1bbe974..b97bfbb 100644 --- a/src/db.ts +++ b/src/db.ts @@ -1,10 +1,13 @@ import { HNSW } from './main'; import { openDB, deleteDB, DBSchema, IDBPDatabase } from 'idb'; +import { cosineSimilarity, euclideanSimilarity } from './similarity'; + +type SerializedIndex = ReturnType; interface HNSWDB extends DBSchema { 'hnsw-index': { key: string; - value: any; + value: SerializedIndex; }; } @@ -17,6 +20,9 @@ export class HNSWWithDB extends HNSW { this.dbName = dbName; } + /** + * Creates an IndexedDB-backed HNSW instance. + */ static async create(M: number, efConstruction: number, dbName: string, efSearch = 50) { const instance = new HNSWWithDB(M, efConstruction, dbName, efSearch); await instance.initDB(); @@ -31,25 +37,39 @@ export class HNSWWithDB extends HNSW { }); } - async saveIndex() { + /** + * Closes the current IndexedDB connection if open. + */ + close() { if (!this.db) { - // console.error('Database is not initialized'); return; } + this.db.close(); + this.db = null; + } + + /** + * Persists the current graph to IndexedDB. + */ + async saveIndex() { + if (!this.db) { + throw new Error('Database is not initialized'); + } await this.db.put('hnsw-index', this.toJSON(), 'hnsw'); } + /** + * Loads a persisted graph from IndexedDB if present. 
+ */ async loadIndex() { if (!this.db) { - // console.error('Database is not initialized'); - return; + throw new Error('Database is not initialized'); } - const loadedHNSW: HNSW | undefined = await this.db.get('hnsw-index', 'hnsw'); + const loadedHNSW = await this.db.get('hnsw-index', 'hnsw'); if (!loadedHNSW) { - // console.error('No saved HNSW index found'); return; } @@ -60,23 +80,22 @@ export class HNSWWithDB extends HNSW { this.efSearch = hnsw.efSearch; this.metric = hnsw.metric; this.d = hnsw.d; - this.similarityFunction = (this as any).getMetric(hnsw.metric); + this.similarityFunction = hnsw.metric === 'cosine' ? cosineSimilarity : euclideanSimilarity; this.levelMax = hnsw.levelMax; this.entryPointId = hnsw.entryPointId; this.nodes = hnsw.nodes; } + /** + * Deletes persisted graph data and re-initializes the backing DB. + */ async deleteIndex() { if (!this.db) { - // console.error('Database is not initialized'); - return; + throw new Error('Database is not initialized'); } - try { - await deleteDB(this.dbName); - this.initDB(); - } catch (error) { - // console.error('Failed to delete index:', error); - } + this.close(); + await deleteDB(this.dbName); + await this.initDB(); } } diff --git a/src/main.ts b/src/main.ts index 6d8af8c..01b856b 100644 --- a/src/main.ts +++ b/src/main.ts @@ -17,6 +17,9 @@ export class HNSW { nodes: Map; // Map of nodes probs: number[]; // Probabilities for the levels + /** + * Creates an in-memory HNSW index. + */ constructor(M = 16, efConstruction = 200, d: number | null = null, metric = 'cosine', efSearch?: number) { this.metric = metric as Metric; this.d = d; @@ -225,6 +228,9 @@ export class HNSW { } } + /** + * Adds a single vector to the graph. 
+ */ async addPoint(id: number, vector: Float32Array | number[]) { if (this.d !== null && vector.length !== this.d) { throw new Error('All vectors must be of the same dimension'); @@ -241,6 +247,9 @@ export class HNSW { await this.addNodeToGraph(node); } + /** + * Returns up to k nearest neighbors for the query vector. + */ searchKNN( query: Float32Array | number[], k: number, @@ -273,6 +282,9 @@ export class HNSW { return results; } + /** + * Rebuilds the graph from the provided data. + */ async buildIndex( data: { id: number; vector: Float32Array | number[] }[], options?: { @@ -304,6 +316,9 @@ export class HNSW { } } + /** + * Serializes the current in-memory index. + */ toJSON() { const entries = Array.from(this.nodes.entries()); return { @@ -328,6 +343,9 @@ export class HNSW { }; } + /** + * Restores an index from serialized JSON produced by toJSON(). + */ static fromJSON(json: any): HNSW { // efSearch defaults to efConstruction if not present (backward compatibility) const hnsw = new HNSW(json.M, json.efConstruction, json.d ?? null, json.metric ?? 
'cosine', json.efSearch); diff --git a/tests/HNSW.test.ts b/tests/HNSW.test.ts index 430e25d..12809de 100644 --- a/tests/HNSW.test.ts +++ b/tests/HNSW.test.ts @@ -1,4 +1,6 @@ +import 'fake-indexeddb/auto'; import { HNSW } from '../src'; +import { HNSWWithDB } from '../src'; import { Node } from '../src/node'; const createSequentialData = (count: number, dimensions = 5) => @@ -122,4 +124,58 @@ describe('HNSW', () => { expect(restoredResults).toEqual(originalResults); }); + + it('throws when adding vectors with inconsistent dimensions', async () => { + const hnsw = new HNSW(16, 32, 3, 'cosine', 16); + await hnsw.addPoint(1, [1, 2, 3]); + await expect(hnsw.addPoint(2, [1, 2])).rejects.toThrow('All vectors must be of the same dimension'); + }); + + it('returns all candidates when k is larger than index size', async () => { + const hnsw = await buildBasicIndex(baseData, { levelSequence: Array(baseData.length).fill(0) }); + const results = hnsw.searchKNN([6, 7, 8, 9, 10], 10); + expect(results.length).toBe(baseData.length); + }); + + it('invokes progress callback for final partial interval', async () => { + const hnsw = new HNSW(16, 32, 5, 'cosine', 16); + const onProgress = jest.fn(); + await hnsw.buildIndex(baseData, { onProgress, progressInterval: 3 }); + expect(onProgress).toHaveBeenCalledWith(3, 5); + expect(onProgress).toHaveBeenCalledWith(5, 5); + }); +}); + +describe('HNSWWithDB', () => { + const dbName = () => `hnsw-test-${Date.now()}-${Math.random().toString(16).slice(2)}`; + + it('saves and loads persisted indices', async () => { + const name = dbName(); + const index = await HNSWWithDB.create(16, 32, name, 24); + await index.buildIndex(baseData); + const baseline = index.searchKNN([6, 7, 8, 9, 10], 3); + await index.saveIndex(); + + const loaded = await HNSWWithDB.create(16, 32, name, 24); + await loaded.loadIndex(); + const restored = loaded.searchKNN([6, 7, 8, 9, 10], 3); + + expect(restored).toEqual(baseline); + index.close(); + loaded.close(); + }); + + 
it('deleteIndex removes persisted state and re-initializes DB', async () => { + const name = dbName(); + const index = await HNSWWithDB.create(16, 32, name, 24); + await index.buildIndex(baseData); + await index.saveIndex(); + await index.deleteIndex(); + index.close(); + + const loaded = await HNSWWithDB.create(16, 32, name, 24); + await loaded.loadIndex(); + expect(loaded.searchKNN([6, 7, 8, 9, 10], 3)).toEqual([]); + loaded.close(); + }); }); diff --git a/tsconfig.lint.json b/tsconfig.lint.json new file mode 100644 index 0000000..7412b23 --- /dev/null +++ b/tsconfig.lint.json @@ -0,0 +1,4 @@ +{ + "extends": "./tsconfig.json", + "exclude": ["node_modules", "dist", "tests", "src/bench"] +}