diff --git a/packages/test/src/test/ai-provider-cactus/Cactus_Integrity.test.ts b/packages/test/src/test/ai-provider-cactus/Cactus_Integrity.test.ts new file mode 100644 index 000000000..1dafd51b9 --- /dev/null +++ b/packages/test/src/test/ai-provider-cactus/Cactus_Integrity.test.ts @@ -0,0 +1,121 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { _testOnly } from "@workglow/cactus/ai"; +import { describe, expect, it } from "vitest"; + +const { + CACTUS_HASH_PLACEHOLDER, + CactusIntegrityError, + isHashPlaceholder, + sha256Hex, + verifySha256, +} = _testOnly; + +// Known SHA-256 of the ASCII string "abc" — RFC 6234 test vector. +const SHA256_ABC = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"; + +function asciiBytes(s: string): Uint8Array { + const out = new Uint8Array(s.length); + for (let i = 0; i < s.length; i++) out[i] = s.charCodeAt(i); + return out; +} + +describe("sha256Hex", () => { + it('matches the known RFC 6234 vector for "abc"', async () => { + const hex = await sha256Hex(asciiBytes("abc")); + expect(hex).toBe(SHA256_ABC); + }); + + it("accepts ArrayBuffer input", async () => { + // `Uint8Array.prototype.buffer` is typed `ArrayBufferLike` in newer + // lib.dom.d.ts (it can be SharedArrayBuffer). Materialize a concrete + // ArrayBuffer so the test exercises that branch of sha256Hex's input. 
+ const src = asciiBytes("abc"); + const buf = new ArrayBuffer(src.byteLength); + new Uint8Array(buf).set(src); + const hex = await sha256Hex(buf); + expect(hex).toBe(SHA256_ABC); + }); + + it("produces 64 lowercase hex chars", async () => { + const hex = await sha256Hex(new Uint8Array([0])); + expect(hex).toMatch(/^[0-9a-f]{64}$/); + }); +}); + +describe("verifySha256", () => { + const ctx = { url: "https://example/asset", filename: "asset.bin" }; + + it("passes when the digest matches", async () => { + await expect(verifySha256(asciiBytes("abc"), SHA256_ABC, ctx)).resolves.toBeUndefined(); + }); + + it("accepts uppercase expected hex (normalized to lowercase)", async () => { + await expect( + verifySha256(asciiBytes("abc"), SHA256_ABC.toUpperCase(), ctx) + ).resolves.toBeUndefined(); + }); + + it("throws CactusIntegrityError when the digest does not match", async () => { + const wrong = "0".repeat(64); + await expect(verifySha256(asciiBytes("abc"), wrong, ctx)).rejects.toBeInstanceOf( + CactusIntegrityError + ); + }); + + it("throws plain Error when expected hash is too short", async () => { + await expect(verifySha256(asciiBytes("abc"), "a".repeat(63), ctx)).rejects.toThrow( + /Invalid catalog SHA-256/ + ); + }); + + it("throws plain Error when expected hash is too long", async () => { + await expect(verifySha256(asciiBytes("abc"), "a".repeat(65), ctx)).rejects.toThrow( + /Invalid catalog SHA-256/ + ); + }); + + it("throws plain Error when expected hash contains non-hex characters", async () => { + const bad = "z" + "a".repeat(63); + await expect(verifySha256(asciiBytes("abc"), bad, ctx)).rejects.toThrow( + /non-hex characters/ + ); + }); + + it("skips verification when expected is the placeholder sentinel", async () => { + await expect( + verifySha256(asciiBytes("garbage"), CACTUS_HASH_PLACEHOLDER, ctx) + ).resolves.toBeUndefined(); + }); +}); + +describe("isHashPlaceholder", () => { + it("recognizes the placeholder", () => { + 
expect(isHashPlaceholder(CACTUS_HASH_PLACEHOLDER)).toBe(true); + }); + + it("rejects real-looking hashes", () => { + expect(isHashPlaceholder(SHA256_ABC)).toBe(false); + }); +}); + +describe("CactusIntegrityError", () => { + it("carries url, filename, expected, actual on the instance", () => { + const err = new CactusIntegrityError({ + url: "u", + filename: "f", + expected: "e", + actual: "a", + }); + expect(err.name).toBe("CactusIntegrityError"); + expect(err.url).toBe("u"); + expect(err.filename).toBe("f"); + expect(err.expected).toBe("e"); + expect(err.actual).toBe("a"); + expect(err.message).toMatch(/Integrity check failed for f/); + }); +}); diff --git a/providers/cactus/src/ai.browser.ts b/providers/cactus/src/ai.browser.ts index 60f992589..fd8d4d405 100644 --- a/providers/cactus/src/ai.browser.ts +++ b/providers/cactus/src/ai.browser.ts @@ -21,6 +21,13 @@ export * from "./ai/registerCactus.browser"; import { CactusQueuedProvider } from "./ai/CactusQueuedProvider.browser"; import { CACTUS_RUN_FN_SPECS } from "./ai/common/Cactus_Capabilities"; +import { + CACTUS_HASH_PLACEHOLDER, + CactusIntegrityError, + isHashPlaceholder, + sha256Hex, + verifySha256, +} from "./ai/common/Cactus_Integrity"; import { CACTUS_RUN_FNS } from "./ai/common/Cactus_JobRunFns.browser"; import { cactusConfigJson, cactusEngines } from "./ai/common/Cactus_Runtime.browser"; @@ -33,6 +40,11 @@ import { cactusConfigJson, cactusEngines } from "./ai/common/Cactus_Runtime.brow * and their runtime state copies are distinct module instances. Reading * the runtime state via `_testOnly` (rather than `@workglow/cactus/ai-runtime`) * guarantees the test observes the same Map that the run-fns mutate. + * + * The `Cactus_Integrity` symbols (sha256Hex, verifySha256, CactusIntegrityError, + * isHashPlaceholder, CACTUS_HASH_PLACEHOLDER) are pure/stateless helpers exposed + * here for unit testing only. They are not part of the stable public API; depend + * on the catalog's `sha256` field instead. 
*/ export const _testOnly = { CactusQueuedProvider, @@ -40,4 +52,10 @@ export const _testOnly = { CACTUS_RUN_FNS, cactusEngines, cactusConfigJson, + // Integrity helpers (test-only): + CACTUS_HASH_PLACEHOLDER, + CactusIntegrityError, + isHashPlaceholder, + sha256Hex, + verifySha256, } as const; diff --git a/providers/cactus/src/ai.ts b/providers/cactus/src/ai.ts index f9e8f827b..2937dd8ef 100644 --- a/providers/cactus/src/ai.ts +++ b/providers/cactus/src/ai.ts @@ -21,6 +21,13 @@ export * from "./ai/registerCactus"; import { CactusQueuedProvider } from "./ai/CactusQueuedProvider"; import { CACTUS_RUN_FN_SPECS } from "./ai/common/Cactus_Capabilities"; +import { + CACTUS_HASH_PLACEHOLDER, + CactusIntegrityError, + isHashPlaceholder, + sha256Hex, + verifySha256, +} from "./ai/common/Cactus_Integrity"; import { CACTUS_RUN_FNS } from "./ai/common/Cactus_JobRunFns"; import { cactusConfigJson, cactusEngines } from "./ai/common/Cactus_Runtime"; @@ -33,6 +40,11 @@ import { cactusConfigJson, cactusEngines } from "./ai/common/Cactus_Runtime"; * — their copies of `Cactus_Runtime.ts` are distinct module instances. Reading * the runtime state via `_testOnly` (rather than `@workglow/cactus/ai-runtime`) * guarantees the test observes the same Map that the run-fns mutate. + * + * The `Cactus_Integrity` symbols (sha256Hex, verifySha256, CactusIntegrityError, + * isHashPlaceholder, CACTUS_HASH_PLACEHOLDER) are pure/stateless helpers exposed + * here for unit testing only. They are not part of the stable public API; depend + * on the catalog's `sha256` field instead. 
*/ export const _testOnly = { CactusQueuedProvider, @@ -40,4 +52,10 @@ export const _testOnly = { CACTUS_RUN_FNS, cactusEngines, cactusConfigJson, + // Integrity helpers (test-only): + CACTUS_HASH_PLACEHOLDER, + CactusIntegrityError, + isHashPlaceholder, + sha256Hex, + verifySha256, } as const; diff --git a/providers/cactus/src/ai/common/Cactus_Download.browser.ts b/providers/cactus/src/ai/common/Cactus_Download.browser.ts index 682aeecb8..74b8e9aeb 100644 --- a/providers/cactus/src/ai/common/Cactus_Download.browser.ts +++ b/providers/cactus/src/ai/common/Cactus_Download.browser.ts @@ -9,7 +9,8 @@ import type { ModelDownloadTaskRunInput, ModelDownloadTaskRunOutput, } from "@workglow/ai"; -import { getCactusCatalogEntry } from "./Cactus_ModelCatalog"; +import { CactusIntegrityError } from "./Cactus_Integrity"; +import { assetSpecsOf, getCactusCatalogEntry } from "./Cactus_ModelCatalog"; import type { CactusModelConfig } from "./Cactus_ModelSchema"; import { fetchAssetBytes, markModelCached } from "./Cactus_Runtime.browser"; @@ -23,14 +24,29 @@ export const Cactus_Download: AiProviderRunFn< const entry = getCactusCatalogEntry(model_id); if (!entry) throw new Error(`Unknown Cactus model_id: ${model_id}`); - const assets = [entry.assets.weights, entry.assets.vocab, entry.assets.config]; - for (let i = 0; i < assets.length; i++) { + const specs = assetSpecsOf(entry); + for (let i = 0; i < specs.length; i++) { + const spec = specs[i]; emit({ type: "phase", - message: `Downloading ${assets[i]}`, - progress: Math.round(((i + 0.5) / assets.length) * 99), + message: `Downloading ${spec.filename}`, + progress: Math.round(((i + 0.5) / specs.length) * 99), }); - await fetchAssetBytes(model, assets[i]); + try { + await fetchAssetBytes(model, spec); + } catch (err) { + // Surface whatever the integrity layer phrased — it knows whether the + // mismatch was a SHA-256 digest or a byte-length pre-check, and the + // error message is already shaped correctly for both. 
+ // StreamPhase.progress is required (number | undefined); pass undefined + // on the error path because there is no meaningful percentage to report. + emit({ + type: "phase", + message: err instanceof CactusIntegrityError ? err.message : String(err), + progress: undefined, + }); + throw err; + } } markModelCached(model_id); emit({ type: "finish", data: { model: input.model! } }); diff --git a/providers/cactus/src/ai/common/Cactus_Download.ts b/providers/cactus/src/ai/common/Cactus_Download.ts index d6c014a52..d5fbe5bd7 100644 --- a/providers/cactus/src/ai/common/Cactus_Download.ts +++ b/providers/cactus/src/ai/common/Cactus_Download.ts @@ -9,7 +9,8 @@ import type { ModelDownloadTaskRunInput, ModelDownloadTaskRunOutput, } from "@workglow/ai"; -import { getCactusCatalogEntry } from "./Cactus_ModelCatalog"; +import { CactusIntegrityError } from "./Cactus_Integrity"; +import { assetSpecsOf, getCactusCatalogEntry } from "./Cactus_ModelCatalog"; import type { CactusModelConfig } from "./Cactus_ModelSchema"; import { fetchAssetBytes, markModelCached } from "./Cactus_Runtime"; @@ -23,14 +24,29 @@ export const Cactus_Download: AiProviderRunFn< const entry = getCactusCatalogEntry(model_id); if (!entry) throw new Error(`Unknown Cactus model_id: ${model_id}`); - const assets = [entry.assets.weights, entry.assets.vocab, entry.assets.config]; - for (let i = 0; i < assets.length; i++) { + const specs = assetSpecsOf(entry); + for (let i = 0; i < specs.length; i++) { + const spec = specs[i]; emit({ type: "phase", - message: `Downloading ${assets[i]}`, - progress: Math.round(((i + 0.5) / assets.length) * 99), + message: `Downloading ${spec.filename}`, + progress: Math.round(((i + 0.5) / specs.length) * 99), }); - await fetchAssetBytes(model, assets[i]); + try { + await fetchAssetBytes(model, spec); + } catch (err) { + // Surface whatever the integrity layer phrased — it knows whether the + // mismatch was a SHA-256 digest or a byte-length pre-check, and the + // error message is 
already shaped correctly for both. + // StreamPhase.progress is required (number | undefined); pass undefined + // on the error path because there is no meaningful percentage to report. + emit({ + type: "phase", + message: err instanceof CactusIntegrityError ? err.message : String(err), + progress: undefined, + }); + throw err; + } } markModelCached(model_id); emit({ type: "finish", data: { model: input.model! } }); diff --git a/providers/cactus/src/ai/common/Cactus_Integrity.ts b/providers/cactus/src/ai/common/Cactus_Integrity.ts new file mode 100644 index 000000000..2248e06e5 --- /dev/null +++ b/providers/cactus/src/ai/common/Cactus_Integrity.ts @@ -0,0 +1,124 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * SHA-256 integrity verification for Cactus model assets. + * + * The trust boundary for locally-executed model weights is anchored at the + * catalog: every byte loaded from disk, Cache Storage, or the network must + * hash to the catalog-pinned digest. Anything else is treated as adversarial + * and refused. + */ + +/** Sentinel value used in the catalog while real hashes are not yet populated. */ +export const CACTUS_HASH_PLACEHOLDER = "TODO_FILL_AT_RELEASE"; + +/** + * Raised whenever a Cactus asset fails an integrity check. The `expected` + * and `actual` fields are deliberately label-agnostic strings so the same + * error type covers both SHA-256 mismatches ("abc123...") and byte-length + * mismatches ("22000000 bytes"). The constructor message embeds them + * verbatim, so callers should phrase each side with whatever unit makes + * sense at the call site. 
+ */ +export class CactusIntegrityError extends Error { + readonly url: string; + readonly filename: string; + readonly expected: string; + readonly actual: string; + constructor(opts: { url: string; filename: string; expected: string; actual: string }) { + super( + `Integrity check failed for ${opts.filename} from ${opts.url}: ` + + `expected ${opts.expected}, got ${opts.actual}` + ); + this.name = "CactusIntegrityError"; + this.url = opts.url; + this.filename = opts.filename; + this.expected = opts.expected; + this.actual = opts.actual; + } +} + +export async function sha256Hex(bytes: Uint8Array | ArrayBuffer): Promise<string> { + // Copy into a fresh ArrayBuffer so we hand crypto.subtle.digest a concrete + // `BufferSource` whose backing buffer is `ArrayBuffer` (not `ArrayBufferLike`). + // The recent lib.dom tightening on Uint8Array's default generic argument made + // the previous `new Uint8Array(bytes)` path no longer assignable to digest's + // parameter type. + const src = + bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes as ArrayBuffer); + const buf = new ArrayBuffer(src.byteLength); + new Uint8Array(buf).set(src); + const digest = await globalThis.crypto.subtle.digest("SHA-256", buf); + const view = new Uint8Array(digest); + let s = ""; + for (let i = 0; i < view.length; i++) { + s += view[i].toString(16).padStart(2, "0"); + } + return s; +} + +/** + * Returns `true` if `expected` is the well-known placeholder that means + * "maintainer has not populated a real hash yet." In that case callers SHOULD + * skip verification but MUST log a clear warning — this is intended for + * pre-release dev only and must never reach a tagged release. + */ +export function isHashPlaceholder(expected: string): boolean { + return expected === CACTUS_HASH_PLACEHOLDER; +} + +/** + * Hashes `bytes` and throws `CactusIntegrityError` if it does not match + * `expected`. 
Throws a plain `Error` if `expected` is malformed (not 64 hex + * chars), since that is a catalog-author bug, not a content bug. + * + * When the hashes mismatch, both `expected` and `actual` are lowercase hex + * SHA-256 strings; the resulting error message reads + * `expected <hex>, got <hex>`. + * + * If `expected` is the `TODO_FILL_AT_RELEASE` placeholder, verification is + * skipped and a one-time warning is logged. This keeps developers unblocked + * before the real hashes land while making the gap impossible to miss. + */ +export async function verifySha256( + bytes: Uint8Array | ArrayBuffer, + expected: string, + ctx: { url: string; filename: string } +): Promise<void> { + if (isHashPlaceholder(expected)) { + warnPlaceholderOnce(ctx.filename); + return; + } + if (typeof expected !== "string" || expected.length !== 64) { + throw new Error( + `Invalid catalog SHA-256 for ${ctx.filename}: must be 64 hex chars (got length ${ + typeof expected === "string" ? expected.length : typeof expected + })` + ); + } + const expectedLc = expected.toLowerCase(); + if (!/^[0-9a-f]{64}$/.test(expectedLc)) { + throw new Error( + `Invalid catalog SHA-256 for ${ctx.filename}: contains non-hex characters` + ); + } + const actual = await sha256Hex(bytes); + if (actual !== expectedLc) { + throw new CactusIntegrityError({ ...ctx, expected: expectedLc, actual }); + } +} + +const _warnedFiles = new Set<string>(); +function warnPlaceholderOnce(filename: string): void { + if (_warnedFiles.has(filename)) return; + _warnedFiles.add(filename); + // eslint-disable-next-line no-console + console.warn( + `[@workglow/cactus] SHA-256 catalog entry for "${filename}" is a placeholder; ` + + `integrity verification is DISABLED. 
This must be populated before release.` + ); +} diff --git a/providers/cactus/src/ai/common/Cactus_ModelCatalog.ts b/providers/cactus/src/ai/common/Cactus_ModelCatalog.ts index 8d70e8e28..635b5eb73 100644 --- a/providers/cactus/src/ai/common/Cactus_ModelCatalog.ts +++ b/providers/cactus/src/ai/common/Cactus_ModelCatalog.ts @@ -10,6 +10,29 @@ import { CACTUS_DEFAULT_REVISION, CACTUS_NEEDLE_26M, } from "./Cactus_Constants"; +import { CACTUS_HASH_PLACEHOLDER } from "./Cactus_Integrity"; + +/** + * A single asset file in a Cactus model catalog entry. + * + * `sha256` is the lowercase-hex digest of the canonical asset bytes at the + * pinned `revision` in `CactusCatalogEntry`. It anchors the trust boundary: + * any byte that fails this check is treated as adversarial and refused. + */ +export interface CactusAssetSpec { + readonly filename: string; + /** + * Lowercase hex SHA-256, exactly 64 characters. + * + * The literal string `"TODO_FILL_AT_RELEASE"` is accepted as a placeholder + * during pre-release development; in that case `verifySha256` skips the + * check and logs a one-time warning. The placeholder MUST be replaced + * with a real hash before a tagged release. + */ + readonly sha256: string; + /** Expected byte length — used as a cheap pre-check before hashing. */ + readonly size: number; +} export interface CactusCatalogEntry { readonly model_id: string; @@ -18,13 +41,30 @@ export interface CactusCatalogEntry { readonly hf_repo: string; readonly revision: string; readonly assets: { - readonly weights: string; - readonly vocab: string; - readonly config: string; + readonly weights: CactusAssetSpec; + readonly vocab: CactusAssetSpec; + readonly config: CactusAssetSpec; }; readonly capabilities: readonly Capability[]; } +/** + * Asserts that `s` is a lowercase hex SHA-256 (64 hex chars). 
+ * + * Invoked at module-load time on every non-placeholder catalog entry (see + * the bottom of this file) so malformed hashes surface immediately as an + * import-time error rather than the first time `verifySha256` runs against + * fetched bytes. + */ +export function assertHexSha256(s: string, ctxLabel?: string): asserts s is string { + if (typeof s !== "string" || s.length !== 64 || !/^[0-9a-f]{64}$/.test(s)) { + throw new Error( + `Invalid SHA-256 in catalog${ctxLabel ? ` (${ctxLabel})` : ""}: ` + + `expected 64 lowercase hex chars, got ${JSON.stringify(s)}` + ); + } +} + export const CACTUS_CATALOG: readonly CactusCatalogEntry[] = [ { model_id: CACTUS_NEEDLE_26M, @@ -34,9 +74,25 @@ export const CACTUS_CATALOG: readonly CactusCatalogEntry[] = [ hf_repo: CACTUS_DEFAULT_HF_REPO, revision: CACTUS_DEFAULT_REVISION, assets: { - weights: "needle.safetensors", - vocab: "vocab.txt", - config: "config.json", + // MAINTAINER: replace with sha256 of the asset at the pinned revision; + // see providers/cactus/scripts/hash-catalog.ts (planned follow-up). + // Verification is skipped while the value is the literal placeholder, but + // a clear warning is logged so this can never silently ship to release. + weights: { + filename: "needle.safetensors", + sha256: CACTUS_HASH_PLACEHOLDER, + size: 0, + }, + vocab: { + filename: "vocab.txt", + sha256: CACTUS_HASH_PLACEHOLDER, + size: 0, + }, + config: { + filename: "config.json", + sha256: CACTUS_HASH_PLACEHOLDER, + size: 0, + }, }, capabilities: ["tool-use"], }, @@ -46,6 +102,31 @@ export function getCactusCatalogEntry(model_id: string): CactusCatalogEntry | un return CACTUS_CATALOG.find((e) => e.model_id === model_id); } -export function cactusAssetUrl(entry: CactusCatalogEntry, filename: string): string { +/** Returns all three asset specs in fixed order: weights, vocab, config. 
*/ +export function assetSpecsOf(entry: CactusCatalogEntry): readonly CactusAssetSpec[] { + return [entry.assets.weights, entry.assets.vocab, entry.assets.config]; +} + +export function cactusAssetUrl( + entry: CactusCatalogEntry, + filenameOrSpec: string | CactusAssetSpec +): string { + const filename = + typeof filenameOrSpec === "string" ? filenameOrSpec : filenameOrSpec.filename; return `https://huggingface.co/${entry.hf_repo}/resolve/${entry.revision}/${filename}`; } + +// ============================================================================ +// Module-load invariant: every non-placeholder catalog entry has a valid +// 64-char lowercase hex SHA-256. Catches catalog-author bugs immediately. +// +// Placeholder entries are intentionally skipped — `verifySha256` warns and +// no-ops on them during pre-release development. +// ============================================================================ +for (const entry of CACTUS_CATALOG) { + for (const asset of assetSpecsOf(entry)) { + if (asset.sha256 !== CACTUS_HASH_PLACEHOLDER) { + assertHexSha256(asset.sha256, `${entry.model_id}/${asset.filename}`); + } + } +} diff --git a/providers/cactus/src/ai/common/Cactus_Runtime.browser.ts b/providers/cactus/src/ai/common/Cactus_Runtime.browser.ts index e09a28601..b0df64f85 100644 --- a/providers/cactus/src/ai/common/Cactus_Runtime.browser.ts +++ b/providers/cactus/src/ai/common/Cactus_Runtime.browser.ts @@ -11,10 +11,13 @@ */ import { CACTUS_CACHE_NAME } from "./Cactus_Constants"; +import { CactusIntegrityError, verifySha256 } from "./Cactus_Integrity"; import { + assetSpecsOf, cactusAssetUrl, - getCactusCatalogEntry, + type CactusAssetSpec, type CactusCatalogEntry, + getCactusCatalogEntry, } from "./Cactus_ModelCatalog"; import type { CactusModelConfig } from "./Cactus_ModelSchema"; @@ -28,6 +31,51 @@ export interface CactusModelCacheInfo { readonly file_sizes: Record | null; } +// ============================================================================ +// 
Path-safety allowlists (defense-in-depth, mirror of Cactus_Runtime.ts) +// +// The browser variant does not touch the filesystem, but applying the same +// validation keeps both code paths in sync, hardens cache-key inputs, and +// silences static analyzers that flag any use of user-supplied identifiers +// in URL/path-shaped strings. +// +// TODO: lift these helpers into a shared module if/when a third caller +// appears. Duplicated for now to avoid churn during the active PR. +// ============================================================================ + +const MODEL_ID_RE = /^[A-Za-z0-9_-]{1,64}$/; +const FILENAME_RE = /^[A-Za-z0-9_.-]+$/; +// Match the Node variant's limit so an asset that validates here also +// validates there. The Node atomic-write path writes to `${filename}.tmp` +// before renaming, so the source filename must leave room for that suffix +// (most filesystems cap a path component at 255 bytes). +const MAX_FILENAME_LEN = 251; + +function assertSafeModelId(model_id: string): void { + if (typeof model_id !== "string" || !MODEL_ID_RE.test(model_id)) { + throw new Error( + `Invalid Cactus model_id ${JSON.stringify(model_id)}: ` + + `must match ${MODEL_ID_RE} (alphanumeric, underscore, hyphen; 1-64 chars).` + ); + } +} + +function assertSafeFilename(filename: string): void { + if ( + typeof filename !== "string" || + filename.length === 0 || + filename.length > MAX_FILENAME_LEN || + filename === "." || + filename === ".." 
|| + !FILENAME_RE.test(filename) + ) { + throw new Error( + `Invalid Cactus asset filename ${JSON.stringify(filename)}: ` + + `must match ${FILENAME_RE} (no path separators, no '..'), 1-${MAX_FILENAME_LEN} chars.` + ); + } +} + let _sdk: NeedleSdkModule | undefined; let _sdkInitPromise: Promise | undefined; @@ -62,7 +110,7 @@ export function getCactusSdk(): NeedleSdkModule { // ============================================================================ function assetFilenames(entry: CactusCatalogEntry): string[] { - return [entry.assets.weights, entry.assets.vocab, entry.assets.config]; + return assetSpecsOf(entry).map((s) => s.filename); } async function getRemoteAssetSize( @@ -81,29 +129,91 @@ async function getRemoteAssetSize( } } -async function fetchAssetBytesBrowser(url: string): Promise<Uint8Array> { +async function fetchAssetBytesBrowser( + url: string, + spec: CactusAssetSpec +): Promise<Uint8Array> { + assertSafeFilename(spec.filename); const cachesApi = (globalThis as unknown as { caches: CacheStorage }).caches; const cache = await cachesApi.open(CACTUS_CACHE_NAME); const hit = await cache.match(url); if (hit) { - return new Uint8Array(await hit.arrayBuffer()); + const bytes = new Uint8Array(await hit.arrayBuffer()); + try { + // Cheap pre-check: a wrong-size cached entry cannot match the catalog. + // Throwing CactusIntegrityError here flows through the same catch + // branch that deletes the cache entry and falls through to refetch — + // so size and hash mismatches are handled uniformly. 
+ if (spec.size > 0 && bytes.byteLength !== spec.size) { + throw new CactusIntegrityError({ + url, + filename: spec.filename, + expected: `${spec.size} bytes`, + actual: `${bytes.byteLength} bytes`, + }); + } + await verifySha256(bytes, spec.sha256, { url, filename: spec.filename }); + return bytes; + } catch (err) { + if (err instanceof CactusIntegrityError) { + try { + await cache.delete(url); + } catch { + /* best effort */ + } + } else { + throw err; + } + } } const resp = await fetch(url); if (!resp.ok) throw new Error(`Cactus asset fetch failed (${resp.status}) for ${url}`); - // Clone first — Response bodies can only be consumed once. - await cache.put(url, resp.clone()); - return new Uint8Array(await resp.arrayBuffer()); + const contentType = resp.headers.get("content-type") ?? "application/octet-stream"; + const ab = await resp.arrayBuffer(); + const bytes = new Uint8Array(ab); + if (spec.size > 0 && bytes.byteLength !== spec.size) { + throw new CactusIntegrityError({ + url, + filename: spec.filename, + expected: `${spec.size} bytes`, + actual: `${bytes.byteLength} bytes`, + }); + } + // Verify BEFORE storing — never persist unverified bytes to the cache. 
+ await verifySha256(bytes, spec.sha256, { url, filename: spec.filename }); + const headers = new Headers({ + "content-type": contentType, + "content-length": String(bytes.byteLength), + }); + await cache.put(url, new Response(bytes, { headers })); + return bytes; } export async function fetchAssetBytes( model: CactusModelConfig, - filename: string + specOrFilename: CactusAssetSpec | string ): Promise { const model_id = model.provider_config.model_id; + assertSafeModelId(model_id); const entry = getCactusCatalogEntry(model_id); if (!entry) throw new Error(`Unknown Cactus model_id: ${model_id}`); - const url = cactusAssetUrl(entry, filename); - return fetchAssetBytesBrowser(url); + const spec = resolveAssetSpec(entry, specOrFilename); + const url = cactusAssetUrl(entry, spec.filename); + return fetchAssetBytesBrowser(url, spec); +} + +function resolveAssetSpec( + entry: CactusCatalogEntry, + specOrFilename: CactusAssetSpec | string +): CactusAssetSpec { + if (typeof specOrFilename !== "string") return specOrFilename; + const found = assetSpecsOf(entry).find((s) => s.filename === specOrFilename); + if (!found) { + throw new Error( + `No asset spec for filename ${JSON.stringify(specOrFilename)} in catalog entry ${entry.model_id}` + ); + } + return found; } // ============================================================================ @@ -248,7 +358,7 @@ export async function deleteCactusSession(id: string): Promise { async function removeBrowserCacheEntries(entry: CactusCatalogEntry): Promise { const cachesApi = (globalThis as unknown as { caches: CacheStorage }).caches; const cache = await cachesApi.open(CACTUS_CACHE_NAME); - for (const filename of [entry.assets.weights, entry.assets.vocab, entry.assets.config]) { + for (const filename of assetFilenames(entry)) { const url = cactusAssetUrl(entry, filename); try { await cache.delete(url); diff --git a/providers/cactus/src/ai/common/Cactus_Runtime.ts b/providers/cactus/src/ai/common/Cactus_Runtime.ts index 
6e0d6b965..507b39f62 100644 --- a/providers/cactus/src/ai/common/Cactus_Runtime.ts +++ b/providers/cactus/src/ai/common/Cactus_Runtime.ts @@ -7,10 +7,13 @@ import fs from "node:fs/promises"; import path from "node:path"; import { CACTUS_CACHE_NAME, CACTUS_DEFAULT_MODELS_DIR } from "./Cactus_Constants"; +import { CactusIntegrityError, verifySha256 } from "./Cactus_Integrity"; import { + assetSpecsOf, cactusAssetUrl, - getCactusCatalogEntry, + type CactusAssetSpec, type CactusCatalogEntry, + getCactusCatalogEntry, } from "./Cactus_ModelCatalog"; import type { CactusModelConfig } from "./Cactus_ModelSchema"; @@ -24,6 +27,66 @@ export interface CactusModelCacheInfo { readonly file_sizes: Record | null; } +// ============================================================================ +// Path-safety (defense-in-depth + CodeQL-recognized inline sanitizer) +// +// `model_id` originates from user-supplied `provider_config.model_id` and +// `filename` originates from the (effectively trusted) catalog. The catalog +// lookup in `getCactusCatalogEntry` already restricts `model_id` to known +// values, but static analyzers cannot see through that lookup. +// +// Two layers of defense are applied at every filesystem entry point: +// +// 1. Character allowlists (`assertSafeModelId`, `assertSafeFilename`) +// reject separators, `..`, NUL, and any shell/path-special characters +// at the boundary. Fast-fail on malformed input. +// +// 2. An inline `path.resolve` + `path.relative` containment check is +// duplicated immediately before every `fs.*` call. CodeQL's +// `js/path-injection` query does NOT trace through user-defined +// helper functions, so the sanitizer pattern must appear in the +// same function scope as the filesystem call. 
The `path.relative` +// shape is the canonical form CodeQL recognizes and is root-safe +// (a `startsWith(root + path.sep)` check breaks when `root` is "/" +// on POSIX or a drive root like "C:\\" on Windows because the +// concatenated separator produces "//" / "C:\\\\" that no child +// can match). +// ============================================================================ + +const MODEL_ID_RE = /^[A-Za-z0-9_-]{1,64}$/; +const FILENAME_RE = /^[A-Za-z0-9_.-]+$/; +// Most filesystems (ext4, APFS, NTFS) cap a single path component at 255 +// bytes. The Node atomic-write path writes to `${filename}.tmp` (4 chars) +// before renaming, so the source filename must leave room for that suffix. +// Apply the same limit in the browser variant for cross-platform parity. +const MAX_FILENAME_LEN = 251; + +function assertSafeModelId(model_id: string): void { + if (typeof model_id !== "string" || !MODEL_ID_RE.test(model_id)) { + throw new Error( + `Invalid Cactus model_id ${JSON.stringify(model_id)}: ` + + `must match ${MODEL_ID_RE} (alphanumeric, underscore, hyphen; 1-64 chars).` + ); + } +} + +function assertSafeFilename(filename: string): void { + if ( + typeof filename !== "string" || + filename.length === 0 || + filename.length > MAX_FILENAME_LEN || + filename === "." || + filename === ".." || + !FILENAME_RE.test(filename) + ) { + throw new Error( + `Invalid Cactus asset filename ${JSON.stringify(filename)}: ` + + `must match ${FILENAME_RE} (no path separators, no '..'), 1-${MAX_FILENAME_LEN} chars ` + + `(reserves 4 chars for the '.tmp' suffix used by atomic writes).` + ); + } +} + let _sdk: NeedleSdkModule | undefined; let _sdkInitPromise: Promise | undefined; @@ -69,14 +132,8 @@ function modelsDirOf(model: CactusModelConfig): string { return model.provider_config.models_dir ?? CACTUS_DEFAULT_MODELS_DIR; } -function resolveModelDir(models_dir: string, model_id: string): string { - return models_dir.startsWith("~/") - ? path.join(process.env.HOME ?? 
process.env.USERPROFILE ?? ".", models_dir.slice(2), model_id) - : path.resolve(models_dir, model_id); -} - function assetFilenames(entry: CactusCatalogEntry): string[] { - return [entry.assets.weights, entry.assets.vocab, entry.assets.config]; + return assetSpecsOf(entry).map((s) => s.filename); } async function getRemoteAssetSize( @@ -102,11 +159,37 @@ async function getNodeAssetCacheInfo( signal: AbortSignal | undefined ): Promise { const filenames = assetFilenames(entry); - const resolvedDir = resolveModelDir(modelsDirOf(model), entry.model_id); + const models_dir = modelsDirOf(model); + const model_id = entry.model_id; + assertSafeModelId(model_id); + // Compute the resolved model dir inline so CodeQL's js/path-injection + // query can trace the sanitizer locally. + const safeRoot = models_dir.startsWith("~/") + ? path.resolve( + process.env.HOME ?? process.env.USERPROFILE ?? ".", + models_dir.slice(2) + ) + : path.resolve(models_dir); + const resolvedDir = path.resolve(safeRoot, model_id); + { + const rel = path.relative(safeRoot, resolvedDir); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(resolvedDir)} is not within ${JSON.stringify(safeRoot)}` + ); + } + } const stats = await Promise.all( filenames.map(async (filename) => { + assertSafeFilename(filename); + // Inline sanitizer at the fs call site. 
+ const target = path.resolve(resolvedDir, filename); + const rel = path.relative(resolvedDir, target); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + return { filename, size: undefined, cached: false }; + } try { - const stat = await fs.stat(path.join(resolvedDir, filename)); + const stat = await fs.stat(target); return { filename, size: stat.size, cached: true }; } catch { return { filename, size: undefined, cached: false }; @@ -146,56 +229,237 @@ async function getNodeAssetCacheInfo( }; } -async function fetchAssetBytesBrowser(url: string): Promise { +async function fetchAssetBytesBrowser( + url: string, + spec: CactusAssetSpec +): Promise { const cachesApi = (globalThis as unknown as { caches: CacheStorage }).caches; const cache = await cachesApi.open(CACTUS_CACHE_NAME); const hit = await cache.match(url); if (hit) { - return new Uint8Array(await hit.arrayBuffer()); + const bytes = new Uint8Array(await hit.arrayBuffer()); + try { + await verifySha256(bytes, spec.sha256, { url, filename: spec.filename }); + return bytes; + } catch (err) { + if (err instanceof CactusIntegrityError) { + // Cached bytes are corrupt / stale — evict and refetch. + try { + await cache.delete(url); + } catch { + /* best effort */ + } + } else { + throw err; + } + } } const resp = await fetch(url); if (!resp.ok) throw new Error(`Cactus asset fetch failed (${resp.status}) for ${url}`); - // Clone first — Response bodies can only be consumed once. - await cache.put(url, resp.clone()); - return new Uint8Array(await resp.arrayBuffer()); + const contentType = resp.headers.get("content-type") ?? 
"application/octet-stream"; + const ab = await resp.arrayBuffer(); + const bytes = new Uint8Array(ab); + if (spec.size > 0 && bytes.byteLength !== spec.size) { + throw new CactusIntegrityError({ + url, + filename: spec.filename, + expected: `${spec.size} bytes`, + actual: `${bytes.byteLength} bytes`, + }); + } + // Verify BEFORE storing — never persist unverified bytes to the cache. + await verifySha256(bytes, spec.sha256, { url, filename: spec.filename }); + const headers = new Headers({ + "content-type": contentType, + "content-length": String(bytes.byteLength), + }); + await cache.put(url, new Response(bytes, { headers })); + return bytes; } async function fetchAssetBytesNode( url: string, models_dir: string, model_id: string, - filename: string + spec: CactusAssetSpec ): Promise { - const resolvedDir = resolveModelDir(models_dir, model_id); - const filePath = path.join(resolvedDir, filename); + assertSafeModelId(model_id); + assertSafeFilename(spec.filename); + // Compute the resolved model dir inline so CodeQL's js/path-injection + // query can trace the sanitizer locally. + const safeRoot = models_dir.startsWith("~/") + ? path.resolve( + process.env.HOME ?? process.env.USERPROFILE ?? ".", + models_dir.slice(2) + ) + : path.resolve(models_dir); + const resolvedDir = path.resolve(safeRoot, model_id); + { + const rel = path.relative(safeRoot, resolvedDir); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(resolvedDir)} is not within ${JSON.stringify(safeRoot)}` + ); + } + } + // Used for the error-context URL only — not for any fs.* call (those + // recompute path.resolve locally so CodeQL sees the inline sanitizer). 
+ const filePath = path.resolve(resolvedDir, spec.filename); + { + const rel = path.relative(resolvedDir, filePath); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(filePath)} is not within ${JSON.stringify(resolvedDir)}` + ); + } + } try { - const buf = await fs.readFile(filePath); - return new Uint8Array(buf); - } catch { - // fall through to fetch + // Re-resolve at the call site so CodeQL sees the sanitizer locally. + const readPath = path.resolve(resolvedDir, spec.filename); + { + const rel = path.relative(resolvedDir, readPath); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(readPath)} is not within ${JSON.stringify(resolvedDir)}` + ); + } + } + const buf = await fs.readFile(readPath); + const bytes = new Uint8Array(buf); + try { + // Cheap pre-check: a wrong-size cached file cannot match the catalog. + // Throwing CactusIntegrityError here flows through the same catch + // branch that unlinks and falls through to the network refetch + // path — so size and hash mismatches are handled uniformly. + if (spec.size > 0 && bytes.byteLength !== spec.size) { + throw new CactusIntegrityError({ + url: `file:${filePath}`, + filename: spec.filename, + expected: `${spec.size} bytes`, + actual: `${bytes.byteLength} bytes`, + }); + } + await verifySha256(bytes, spec.sha256, { url: `file:${filePath}`, filename: spec.filename }); + return bytes; + } catch (err) { + if (err instanceof CactusIntegrityError) { + // On-disk asset is corrupt; evict and fall through to network. 
+ const unlinkPath = path.resolve(resolvedDir, spec.filename); + const rel = path.relative(resolvedDir, unlinkPath); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(unlinkPath)} is not within ${JSON.stringify(resolvedDir)}` + ); + } + await fs.unlink(unlinkPath).catch(() => {}); + } else { + throw err; + } + } + } catch (err) { + // ENOENT or sibling read errors fall through to fetch. + if (err instanceof CactusIntegrityError) { + throw err; // unreachable, handled above + } } const resp = await fetch(url); if (!resp.ok) throw new Error(`Cactus asset fetch failed (${resp.status}) for ${url}`); const bytes = new Uint8Array(await resp.arrayBuffer()); - await fs.mkdir(resolvedDir, { recursive: true }); - const tmpPath = `${filePath}.tmp`; - await fs.writeFile(tmpPath, bytes); - await fs.rename(tmpPath, filePath); + if (spec.size > 0 && bytes.byteLength !== spec.size) { + throw new CactusIntegrityError({ + url, + filename: spec.filename, + expected: `${spec.size} bytes`, + actual: `${bytes.byteLength} bytes`, + }); + } + // Verify BEFORE writing the tmp file — never atomically promote unverified bytes. + await verifySha256(bytes, spec.sha256, { url, filename: spec.filename }); + // Inline sanitizer for the mkdir target. + const mkdirTarget = path.resolve(safeRoot, model_id); + { + const rel = path.relative(safeRoot, mkdirTarget); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(mkdirTarget)} is not within ${JSON.stringify(safeRoot)}` + ); + } + } + await fs.mkdir(mkdirTarget, { recursive: true }); + // Atomic write: write to a sibling `.tmp` path, then rename. Each fs + // call below recomputes its path via path.resolve so CodeQL sees the + // inline sanitizer at every call site. 
+ try { + const writeTarget = path.resolve(resolvedDir, `${spec.filename}.tmp`); + { + const rel = path.relative(resolvedDir, writeTarget); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(writeTarget)} is not within ${JSON.stringify(resolvedDir)}` + ); + } + } + await fs.writeFile(writeTarget, bytes); + const renameFrom = path.resolve(resolvedDir, `${spec.filename}.tmp`); + { + const rel = path.relative(resolvedDir, renameFrom); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(renameFrom)} is not within ${JSON.stringify(resolvedDir)}` + ); + } + } + const renameTo = path.resolve(resolvedDir, spec.filename); + { + const rel = path.relative(resolvedDir, renameTo); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(renameTo)} is not within ${JSON.stringify(resolvedDir)}` + ); + } + } + await fs.rename(renameFrom, renameTo); + } catch (err) { + const cleanupTarget = path.resolve(resolvedDir, `${spec.filename}.tmp`); + { + const rel = path.relative(resolvedDir, cleanupTarget); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw err; + } + } + await fs.unlink(cleanupTarget).catch(() => {}); + throw err; + } return bytes; } export async function fetchAssetBytes( model: CactusModelConfig, - filename: string + specOrFilename: CactusAssetSpec | string ): Promise { const model_id = model.provider_config.model_id; const entry = getCactusCatalogEntry(model_id); if (!entry) throw new Error(`Unknown Cactus model_id: ${model_id}`); - const url = cactusAssetUrl(entry, filename); + const spec = resolveAssetSpec(entry, specOrFilename); + const url = cactusAssetUrl(entry, spec.filename); if (hasBrowserCacheStorage()) { - return fetchAssetBytesBrowser(url); + return fetchAssetBytesBrowser(url, spec); } - return 
fetchAssetBytesNode(url, modelsDirOf(model), model_id, filename); + return fetchAssetBytesNode(url, modelsDirOf(model), model_id, spec); +} + +function resolveAssetSpec( + entry: CactusCatalogEntry, + specOrFilename: CactusAssetSpec | string +): CactusAssetSpec { + if (typeof specOrFilename !== "string") return specOrFilename; + const found = assetSpecsOf(entry).find((s) => s.filename === specOrFilename); + if (!found) { + throw new Error( + `No asset spec for filename ${JSON.stringify(specOrFilename)} in catalog entry ${entry.model_id}` + ); + } + return found; } // ============================================================================ @@ -345,7 +609,7 @@ async function removeBrowserCacheEntries(entry: CactusCatalogEntry): Promise { if (hasBrowserCacheStorage()) return; + assertSafeModelId(model_id); const models_dir = modelsDirOf(model); - const resolvedDir = resolveModelDir(models_dir, model_id); + // Compute the resolved model dir inline so CodeQL's js/path-injection + // query can trace the sanitizer locally. + const safeRoot = models_dir.startsWith("~/") + ? path.resolve( + process.env.HOME ?? process.env.USERPROFILE ?? ".", + models_dir.slice(2) + ) + : path.resolve(models_dir); + const resolvedDir = path.resolve(safeRoot, model_id); + { + const rel = path.relative(safeRoot, resolvedDir); + if (rel !== "" && (rel.startsWith("..") || path.isAbsolute(rel))) { + throw new Error( + `Path escape detected: ${JSON.stringify(resolvedDir)} is not within ${JSON.stringify(safeRoot)}` + ); + } + } await fs.rm(resolvedDir, { recursive: true, force: true }); }