From ab55855e0ef72a1802fcd57d45b05d9c9cce7392 Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Wed, 24 Jun 2026 13:17:54 +0000 Subject: [PATCH 01/14] feat(bb-knowledge-base): add isReady()/waitUntilReady() ingestion readiness API --- .changeset/kb-readiness.md | 12 + package-lock.json | 197 ++++++------ packages/bb-knowledge-base/package.json | 1 + packages/bb-knowledge-base/src/errors.ts | 4 + .../bb-knowledge-base/src/index.aws.test.ts | 281 ++++++++++++++++++ packages/bb-knowledge-base/src/index.aws.ts | 186 +++++++++++- .../bb-knowledge-base/src/index.browser.ts | 12 +- packages/bb-knowledge-base/src/index.cdk.ts | 35 ++- .../bb-knowledge-base/src/index.mock.test.ts | 28 ++ packages/bb-knowledge-base/src/index.mock.ts | 30 +- packages/bb-knowledge-base/src/types.ts | 18 ++ 11 files changed, 700 insertions(+), 104 deletions(-) create mode 100644 .changeset/kb-readiness.md diff --git a/.changeset/kb-readiness.md b/.changeset/kb-readiness.md new file mode 100644 index 00000000..4ccd20d5 --- /dev/null +++ b/.changeset/kb-readiness.md @@ -0,0 +1,12 @@ +--- +"@aws-blocks/bb-knowledge-base": minor +--- + +Add `isReady()` / `waitUntilReady()` ingestion-readiness API to KnowledgeBase. + +Bedrock ingestion runs asynchronously after deploy, so during the warm-up window `retrieve()` returns an empty array even for queries that would later match — making "empty" ambiguous between "still warming up" and "ingested, no match". The new methods resolve that ambiguity: + +- `isReady(): Promise<boolean>` — `true` once the data source's most recent ingestion job is `COMPLETE` (or when there is no BB-managed data source to track, e.g. an imported `s3://` source); `false` while ingestion is pending. Throws a typed `IngestionFailedException` (including `failureReasons`) if the latest job failed.
+- `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number }): Promise<void>` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job. + +Purely additive — `retrieve()` and all existing signatures are unchanged. The local mock reports ready immediately (no warm-up window in local dev). diff --git a/package-lock.json b/package-lock.json index cac07660..4fda6737 100644 --- a/package-lock.json +++ b/package-lock.json @@ -21884,6 +21884,27 @@ "node": ">=20.0.0" } }, + "node_modules/@aws-sdk/client-bedrock-agent": { + "version": "3.1075.0", + "resolved": "https://registry.npmjs.org/@aws-sdk/client-bedrock-agent/-/client-bedrock-agent-3.1075.0.tgz", + "integrity": "sha512-CLYI9l9ub2FkNtsJNThQXoG/HJewgPCnhWI0QS9uLZOPEZKt7FoBNSOREmx9IEfI/goWDPC70sEZK/hjlmSyyg==", + "license": "Apache-2.0", + "dependencies": { + "@aws-crypto/sha256-browser": "5.2.0", + "@aws-crypto/sha256-js": "5.2.0", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/credential-provider-node": "^3.972.58", + "@aws-sdk/types": "^3.973.13", + "@smithy/core": "^3.24.6", + "@smithy/fetch-http-handler": "^5.4.6", + "@smithy/node-http-handler": "^4.7.6", + "@smithy/types": "^4.14.3", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, "node_modules/@aws-sdk/client-bedrock-agent-runtime": { "version": "3.1046.0", "resolved": "https://registry.npmjs.org/@aws-sdk/client-bedrock-agent-runtime/-/client-bedrock-agent-runtime-3.1046.0.tgz", @@ -22838,13 +22859,13 @@ } }, "node_modules/@aws-sdk/core": { - "version": "3.974.20", - "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.974.20.tgz", - "integrity": "sha512-7sDi2B2N3mc3nf1nz6FyEx/FCrJ1N1QnBmraHHQNabFaeAh2IaOOLml48/rHOD1bICHgTRkbBgNTvUzEr5Z35g==", + "version": "3.974.23", + "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.974.23.tgz", + "integrity":
"sha512-MiWR/uWjxjFXGzrE0Ghc5lWxUxzHsUWFhV+OX7M4cR9SrmrnZs6TXavnCWnzzdwJeFri34xQo81rvGNzK3c4BQ==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/types": "^3.973.12", - "@aws-sdk/xml-builder": "^3.972.29", + "@aws-sdk/types": "^3.973.13", + "@aws-sdk/xml-builder": "^3.972.31", "@aws/lambda-invoke-store": "^0.2.2", "@smithy/core": "^3.24.6", "@smithy/signature-v4": "^5.4.6", @@ -22887,13 +22908,13 @@ } }, "node_modules/@aws-sdk/credential-provider-env": { - "version": "3.972.46", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.46.tgz", - "integrity": "sha512-+GPXVS2srMOlH74S+SmC1gVuP2TvUZ0siuC0onKO93q+udP+M72dmY8wJfVQ5CX9z/9X5A1HHwz5yRIGBtskvQ==", + "version": "3.972.49", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.49.tgz", + "integrity": "sha512-liB3yQNHCM9k/gu/w36XHMKPluT7HTlnGUhRbBGSISDQkcr/Sy1zsZabiuvQj8WG5yW573u9RehrBvvnIQ9OEQ==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -22903,13 +22924,13 @@ } }, "node_modules/@aws-sdk/credential-provider-http": { - "version": "3.972.48", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.48.tgz", - "integrity": "sha512-fA5loSdlocacRxyUXtpoHSMuk5rsIKRDzQYVMnMxjcmFeZshaJlJ8lymy/hYKji6sne/UmNGj5pxuEs6kq/Qcg==", + "version": "3.972.51", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.51.tgz", + "integrity": "sha512-XET0H2oofciJ5lMRWNIvRjAP7Q3wv2XT+JtJJEdhPWUMwe3TvQ9qcxonpu7vXmNngncvFpi4E2It+Tamas/naA==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/types": "^3.973.13", 
"@smithy/core": "^3.24.6", "@smithy/fetch-http-handler": "^5.4.6", "@smithy/node-http-handler": "^4.7.6", @@ -22921,20 +22942,20 @@ } }, "node_modules/@aws-sdk/credential-provider-ini": { - "version": "3.972.53", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.53.tgz", - "integrity": "sha512-ZfdhIOR41q8TcWEnUac+gCOb+O2LBWdHLmjedXpXz4IEFW2ppNuFcm6p0sMTavpM+zD5TYfpH5Gp7guRyqSgsQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/credential-provider-env": "^3.972.46", - "@aws-sdk/credential-provider-http": "^3.972.48", - "@aws-sdk/credential-provider-login": "^3.972.52", - "@aws-sdk/credential-provider-process": "^3.972.46", - "@aws-sdk/credential-provider-sso": "^3.972.52", - "@aws-sdk/credential-provider-web-identity": "^3.972.52", - "@aws-sdk/nested-clients": "^3.997.20", - "@aws-sdk/types": "^3.973.12", + "version": "3.972.56", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.56.tgz", + "integrity": "sha512-IAmc61hbgQiHht9U3x0tnRwz0lzdwOwD/i9voRgdJrKamF+JtmrBOsW9GwB7mfFonNWOWL4qARWYrF8veEMe3w==", + "license": "Apache-2.0", + "dependencies": { + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/credential-provider-env": "^3.972.49", + "@aws-sdk/credential-provider-http": "^3.972.51", + "@aws-sdk/credential-provider-login": "^3.972.55", + "@aws-sdk/credential-provider-process": "^3.972.49", + "@aws-sdk/credential-provider-sso": "^3.972.55", + "@aws-sdk/credential-provider-web-identity": "^3.972.55", + "@aws-sdk/nested-clients": "^3.997.23", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/credential-provider-imds": "^4.3.7", "@smithy/types": "^4.14.3", @@ -22945,14 +22966,14 @@ } }, "node_modules/@aws-sdk/credential-provider-login": { - "version": "3.972.52", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.52.tgz", - "integrity": 
"sha512-9hu2oR0qH7Fst5Tzdx+UWxm+w5zCXtErTLtOOW5hwwQc170CLwOeniRxyFY6s9mHfGEfC5zFukNBdKBwJR8mhQ==", + "version": "3.972.55", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.55.tgz", + "integrity": "sha512-hBBkANo3cDn+h2qxxzER4a+J8JCO9o9Z/YYmU7iky6AcaarX5RRdRcHNC6SLdwY0vAXQygn6soUbDqPn3GghaA==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/nested-clients": "^3.997.20", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/nested-clients": "^3.997.23", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -22962,18 +22983,18 @@ } }, "node_modules/@aws-sdk/credential-provider-node": { - "version": "3.972.55", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.55.tgz", - "integrity": "sha512-zMGLa/dhESVqmCD7mmIFFKSwSFrJGScvCXcjvBZEVOOMauFS5JRQvLTMukFpMEFWiV6dTAlsen2ATDBulLPtbg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/credential-provider-env": "^3.972.46", - "@aws-sdk/credential-provider-http": "^3.972.48", - "@aws-sdk/credential-provider-ini": "^3.972.53", - "@aws-sdk/credential-provider-process": "^3.972.46", - "@aws-sdk/credential-provider-sso": "^3.972.52", - "@aws-sdk/credential-provider-web-identity": "^3.972.52", - "@aws-sdk/types": "^3.973.12", + "version": "3.972.58", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.58.tgz", + "integrity": "sha512-OyCLVmSI7pZO8hxwNVX6pXhTVlJqRBTp+ijdEfJSUj0RyjHnF602OfAarOzGq6wkGodeFkYBt8MmJ6A6ycRgWw==", + "license": "Apache-2.0", + "dependencies": { + "@aws-sdk/credential-provider-env": "^3.972.49", + "@aws-sdk/credential-provider-http": "^3.972.51", + "@aws-sdk/credential-provider-ini": "^3.972.56", + "@aws-sdk/credential-provider-process": "^3.972.49", + "@aws-sdk/credential-provider-sso": "^3.972.55", + 
"@aws-sdk/credential-provider-web-identity": "^3.972.55", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/credential-provider-imds": "^4.3.7", "@smithy/types": "^4.14.3", @@ -22984,13 +23005,13 @@ } }, "node_modules/@aws-sdk/credential-provider-process": { - "version": "3.972.46", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.46.tgz", - "integrity": "sha512-VUoNFBIjWrUN8NbFiQiuxQEgFjvziAlBRPK+ddh27aj65gk0BYu6bLZnrdrNZwpW6vAihtSUtEMQ1PUJ32QRPA==", + "version": "3.972.49", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.49.tgz", + "integrity": "sha512-C8h36lBuC/RnBSsjlO+dn6xZm3KbAl5vpJaVPAfQnMmz2/OISmKOc8XZcqMQgO2ADwBYNRMM6Kf3vz9G/TulMQ==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -23000,15 +23021,15 @@ } }, "node_modules/@aws-sdk/credential-provider-sso": { - "version": "3.972.52", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.52.tgz", - "integrity": "sha512-nb2/n4o/HQf+FVpVbZe9vCTFngmuDoIsltMgLAtjixaKzvzhB4J8WSDFyWgnErgLHk55ctWH+I4PU+LIHhyffg==", + "version": "3.972.55", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.55.tgz", + "integrity": "sha512-1FkOz74Ea5QGS9jtIoXp55T/IkSS3spv+nLTT07fRY/+T5xmEOqaYBVIaEmX4zTNvbV6g2lrtlaVKWEoNyJt3w==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/nested-clients": "^3.997.20", - "@aws-sdk/token-providers": "3.1066.0", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/nested-clients": "^3.997.23", + "@aws-sdk/token-providers": "3.1074.0", + "@aws-sdk/types": "^3.973.13", "@smithy/core": 
"^3.24.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -23018,14 +23039,14 @@ } }, "node_modules/@aws-sdk/credential-provider-sso/node_modules/@aws-sdk/token-providers": { - "version": "3.1066.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.1066.0.tgz", - "integrity": "sha512-UqEUJq7dqa44hneLDUcX7UJy95cg8YqEWyakRpvIPnrNS3Mq+UlQHgCDGu5pvwAPtlIW4qcYbvW6reG6++FyvA==", + "version": "3.1074.0", + "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.1074.0.tgz", + "integrity": "sha512-pv80IzgGW4RnXWtft692chZOM9i6PhebVsLCcnaM4dBEPZva2fE6FXAHs76G7Rc7s3yGyX/68G0nZMrUy+Vmpg==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/nested-clients": "^3.997.20", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/nested-clients": "^3.997.23", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -23035,14 +23056,14 @@ } }, "node_modules/@aws-sdk/credential-provider-web-identity": { - "version": "3.972.52", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.52.tgz", - "integrity": "sha512-lKj6aRSGbqLmpYmM24bY7a1Xmfcq2vkE3hv8CSPYfc1yCu0BPu/XEJ1L4Fm61MsU6ULLNSG8UGsffNoFUBjESA==", + "version": "3.972.55", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.55.tgz", + "integrity": "sha512-g2BoECD1q01kTPByi56+VLVvdWDzMkKIcr77qixpqH0okw2t0U5CoPv+6S8v/D1Y2Wa6QKKtn6XAtDzP+Kfpvg==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/nested-clients": "^3.997.20", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/nested-clients": "^3.997.23", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -23474,16 +23495,16 @@ } }, 
"node_modules/@aws-sdk/nested-clients": { - "version": "3.997.20", - "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.997.20.tgz", - "integrity": "sha512-IYJuLpXp2DEILVQpQOy0PMpkftv0AHEOCn52o0atyOaumA0CdWQ3klPyXdViGYLbNpESsVFMVybvHUeZAuiGxA==", + "version": "3.997.23", + "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.997.23.tgz", + "integrity": "sha512-gO93ZPsI2bxeFZD42f1/qjDw6FAZkNZcKRO94LIiT03fzOmcJ9e/tunxjVjA1Rl69ClmVJzz8H3G9CdKef10PA==", "license": "Apache-2.0", "dependencies": { "@aws-crypto/sha256-browser": "5.2.0", "@aws-crypto/sha256-js": "5.2.0", - "@aws-sdk/core": "^3.974.20", - "@aws-sdk/signature-v4-multi-region": "^3.996.34", - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/core": "^3.974.23", + "@aws-sdk/signature-v4-multi-region": "^3.996.35", + "@aws-sdk/types": "^3.973.13", "@smithy/core": "^3.24.6", "@smithy/fetch-http-handler": "^5.4.6", "@smithy/node-http-handler": "^4.7.6", @@ -23525,12 +23546,12 @@ } }, "node_modules/@aws-sdk/signature-v4-multi-region": { - "version": "3.996.34", - "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.34.tgz", - "integrity": "sha512-mx1L5qlumSOt/nKM3BFaHE2HVkWwz0i4Bw0pyYO42FfX/FeLlo8YI6csC0gSPprEk6fTIqI+CZN9RwUwKd5krQ==", + "version": "3.996.35", + "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.35.tgz", + "integrity": "sha512-6L/VWs+Wch2stHemCGTmUNqKLMzURxQDK5boNG3Jn3kAOp71meDUuS5sbObpEvFxHDq0uWeSLFDNSYsjNt+Dlg==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/types": "^3.973.12", + "@aws-sdk/types": "^3.973.13", "@smithy/signature-v4": "^5.4.6", "@smithy/types": "^4.14.3", "tslib": "^2.6.2" @@ -23557,9 +23578,9 @@ } }, "node_modules/@aws-sdk/types": { - "version": "3.973.12", - "resolved": "https://registry.npmjs.org/@aws-sdk/types/-/types-3.973.12.tgz", - "integrity": 
"sha512-43ajd1NF0RMgX5k0hxCNUyEdrtFUsb2aHT2QvpktSC/2Eyb2Jr/JPVqdp0XIoaHWikZJq5tNWSLO6kB5q2eMCA==", + "version": "3.973.13", + "resolved": "https://registry.npmjs.org/@aws-sdk/types/-/types-3.973.13.tgz", + "integrity": "sha512-pEHZqRkAlHfnfAU9tK+WpKv/gBNjGJrHMgA3A0iYRGyswBS2t0pfez+lWlwktb3Bqa0ovh7w/QJTFwp3fDxLNg==", "license": "Apache-2.0", "dependencies": { "@smithy/types": "^4.14.3", @@ -23648,13 +23669,12 @@ } }, "node_modules/@aws-sdk/xml-builder": { - "version": "3.972.29", - "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.29.tgz", - "integrity": "sha512-fk0niuGFxfi8yIJuMVM4mhwObkiQSuwZFj3tAPrLVx64Pk3BkrEIpqjzHKY4hKoEBUD6Jg/S74Zj9jy+5F3DnQ==", + "version": "3.972.31", + "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.31.tgz", + "integrity": "sha512-SzE4Pgyl+hDF+BuyuzxUSpwnuUu9lJuO1YGgteG89/4Qv0+2IQiVQqdbPV32IozLvXWQChPQcdkk/sKvb1QHiQ==", "license": "Apache-2.0", "dependencies": { "@smithy/types": "^4.14.3", - "fast-xml-parser": "5.7.3", "tslib": "^2.6.2" }, "engines": { @@ -55645,6 +55665,7 @@ "dependencies": { "@aws-blocks/bb-logger": "^0.1.2", "@aws-blocks/core": "^0.1.2", + "@aws-sdk/client-bedrock-agent": "^3.0.0", "@aws-sdk/client-bedrock-agent-runtime": "^3.0.0" }, "devDependencies": { diff --git a/packages/bb-knowledge-base/package.json b/packages/bb-knowledge-base/package.json index be9b3bbf..f55ea5e9 100644 --- a/packages/bb-knowledge-base/package.json +++ b/packages/bb-knowledge-base/package.json @@ -31,6 +31,7 @@ "dependencies": { "@aws-blocks/bb-logger": "^0.1.2", "@aws-blocks/core": "^0.1.2", + "@aws-sdk/client-bedrock-agent": "^3.0.0", "@aws-sdk/client-bedrock-agent-runtime": "^3.0.0" }, "devDependencies": { diff --git a/packages/bb-knowledge-base/src/errors.ts b/packages/bb-knowledge-base/src/errors.ts index 441de13a..8201e519 100644 --- a/packages/bb-knowledge-base/src/errors.ts +++ b/packages/bb-knowledge-base/src/errors.ts @@ -28,4 +28,8 @@ export const KnowledgeBaseErrors = { 
ValidationError: 'KnowledgeBaseValidationError', /** KnowledgeBase is server-side only and cannot be used in browser contexts. */ BrowserNotSupported: 'BrowserNotSupportedException', + /** The data source's most recent Bedrock ingestion job failed. The error message includes the reported `failureReasons`. */ + IngestionFailed: 'IngestionFailedException', + /** `waitUntilReady()` exceeded its timeout before the knowledge base finished ingesting. */ + Timeout: 'KnowledgeBaseTimeoutException', } as const; diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index 39310a0b..55b2e4f8 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -4,6 +4,7 @@ import { test, describe, mock, afterEach } from 'node:test'; import assert from 'node:assert'; import { BedrockAgentRuntimeClient } from '@aws-sdk/client-bedrock-agent-runtime'; +import { BedrockAgentClient } from '@aws-sdk/client-bedrock-agent'; import { KnowledgeBaseErrors, KnowledgeBase } from './index.aws.js'; // ── SDK mock helpers ─────────────────────────────────────────────────────── @@ -12,6 +13,11 @@ function mockRuntimeSend(fn: (cmd: unknown) => unknown) { return mock.method(BedrockAgentRuntimeClient.prototype, 'send', fn); } +// Control-plane client used by isReady()/waitUntilReady(). +function mockAgentSend(fn: (cmd: { constructor: { name: string }; input: any }) => unknown) { + return mock.method(BedrockAgentClient.prototype, 'send', fn as (cmd: unknown) => unknown); +} + afterEach(() => { try { mock.restoreAll(); } catch {} }); @@ -24,6 +30,23 @@ function setKbEnv(scopeId: string, instanceId: string, kbId = 'kb-test-123') { }; } +// Sets KB_ID and (unless dataSourceId is null) DATA_SOURCE_ID, mirroring the +// two config values the CDK layer registers. Used by readiness tests. 
+function setReadyEnv( + scopeId: string, + instanceId: string, + opts: { kbId?: string; dataSourceId?: string | null } = {}, +) { + const { kbId = 'kb-test-123', dataSourceId = 'ds-test-123' } = opts; + const prefix = `BLOCKS_${scopeId}_${instanceId}`.toUpperCase().replace(/[^A-Z0-9]/g, '_'); + process.env[`${prefix}_KB_ID`] = kbId; + if (dataSourceId !== null) process.env[`${prefix}_DATA_SOURCE_ID`] = dataSourceId; + return () => { + delete process.env[`${prefix}_KB_ID`]; + delete process.env[`${prefix}_DATA_SOURCE_ID`]; + }; +} + // ── Constructor validation ───────────────────────────────────────────────── describe('KnowledgeBase constructor validation', () => { @@ -520,3 +543,261 @@ describe('error classification — other SDK exceptions', () => { } }); }); + +// ── Readiness — isReady() ────────────────────────────────────────────────── +// +// Ingestion runs asynchronously after deploy, so isReady() inspects the data +// source's most recent ingestion job: COMPLETE → ready, FAILED → throws, +// anything else (or no jobs / no data source) → not-ready (or ready when there +// is nothing to track). 
+ +describe('isReady', () => { + test('returns true when the latest ingestion job is COMPLETE', async () => { + const cleanup = setReadyEnv('TEST', 'RDY1'); + mockAgentSend((cmd) => { + assert.strictEqual(cmd.constructor.name, 'ListIngestionJobsCommand'); + return { ingestionJobSummaries: [{ ingestionJobId: 'job-1', status: 'COMPLETE' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy1', { source: './knowledge' }); + assert.strictEqual(await kb.isReady(), true); + } finally { + cleanup(); + } + }); + + test('returns false when the latest ingestion job is IN_PROGRESS', async () => { + const cleanup = setReadyEnv('TEST', 'RDY2'); + mockAgentSend(() => ({ ingestionJobSummaries: [{ ingestionJobId: 'job-1', status: 'IN_PROGRESS' }] })); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy2', { source: './knowledge' }); + assert.strictEqual(await kb.isReady(), false); + } finally { + cleanup(); + } + }); + + test('returns false when no ingestion jobs exist yet (empty list)', async () => { + const cleanup = setReadyEnv('TEST', 'RDY3'); + mockAgentSend(() => ({ ingestionJobSummaries: [] })); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy3', { source: './knowledge' }); + assert.strictEqual(await kb.isReady(), false); + } finally { + cleanup(); + } + }); + + test('returns false when ingestionJobSummaries is undefined', async () => { + const cleanup = setReadyEnv('TEST', 'RDY3B'); + mockAgentSend(() => ({})); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy3b', { source: './knowledge' }); + assert.strictEqual(await kb.isReady(), false); + } finally { + cleanup(); + } + }); + + test('returns true (without calling the control plane) when no data source id is configured', async () => { + // Imported s3:// source / pre-feature deployment: KB_ID present, DATA_SOURCE_ID absent. 
+ const cleanup = setReadyEnv('TEST', 'RDY4', { dataSourceId: null }); + let sendCalled = false; + mockAgentSend(() => { + sendCalled = true; + return { ingestionJobSummaries: [] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy4', { source: 's3://my-docs-bucket' }); + assert.strictEqual(await kb.isReady(), true); + assert.strictEqual(sendCalled, false, 'should not query the control plane when there is no data source to track'); + } finally { + cleanup(); + } + }); + + test('throws NotReady when KB_ID env var is not set', async () => { + const prefix = 'BLOCKS_TEST_RDY5'; + const orig = process.env[`${prefix}_KB_ID`]; + delete process.env[`${prefix}_KB_ID`]; + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy5', { source: './knowledge' }); + await assert.rejects( + () => kb.isReady(), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); + return true; + }, + ); + } finally { + if (orig !== undefined) process.env[`${prefix}_KB_ID`] = orig; + } + }); + + test('throws IngestionFailed (with failureReasons) when the latest job FAILED', async () => { + const cleanup = setReadyEnv('TEST', 'RDY6'); + mockAgentSend((cmd) => { + if (cmd.constructor.name === 'ListIngestionJobsCommand') { + return { ingestionJobSummaries: [{ ingestionJobId: 'job-x', status: 'FAILED' }] }; + } + // GetIngestionJobCommand → failure detail + return { ingestionJob: { status: 'FAILED', failureReasons: ['boom one', 'boom two'] } }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy6', { source: './knowledge' }); + await assert.rejects( + () => kb.isReady(), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.IngestionFailed); + assert.ok(err.message.includes('boom one'), 'message should include failure reasons'); + assert.ok(err.message.includes('boom two')); + return true; + }, + ); + } finally { + cleanup(); + } + }); + + test('queries ListIngestionJobs with the configured ids, sorted by STARTED_AT 
desc, maxResults 1', async () => { + const cleanup = setReadyEnv('TEST', 'RDY7', { kbId: 'kb-aaa', dataSourceId: 'ds-bbb' }); + let captured: any; + mockAgentSend((cmd) => { + captured = cmd.input; + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'COMPLETE' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy7', { source: './knowledge' }); + await kb.isReady(); + assert.strictEqual(captured.knowledgeBaseId, 'kb-aaa'); + assert.strictEqual(captured.dataSourceId, 'ds-bbb'); + assert.strictEqual(captured.maxResults, 1); + assert.strictEqual(captured.sortBy.attribute, 'STARTED_AT'); + assert.strictEqual(captured.sortBy.order, 'DESCENDING'); + } finally { + cleanup(); + } + }); + + test('maps control-plane ResourceNotFoundException to NotReady', async () => { + const cleanup = setReadyEnv('TEST', 'RDY8'); + const err = new Error('No knowledge base with ID kb-test-123 exists'); + err.name = 'ResourceNotFoundException'; + mockAgentSend(() => { throw err; }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'rdy8', { source: './knowledge' }); + await assert.rejects( + () => kb.isReady(), + (e: Error) => { + assert.strictEqual(e.name, KnowledgeBaseErrors.NotReady); + return true; + }, + ); + } finally { + cleanup(); + } + }); +}); + +// ── Readiness — waitUntilReady() ─────────────────────────────────────────── + +describe('waitUntilReady', () => { + test('resolves immediately when ingestion is already COMPLETE', async () => { + const cleanup = setReadyEnv('TEST', 'WUR1'); + mockAgentSend(() => ({ ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'COMPLETE' }] })); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur1', { source: './knowledge' }); + await kb.waitUntilReady({ timeoutMs: 1000, pollIntervalMs: 10 }); + } finally { + cleanup(); + } + }); + + test('polls until the ingestion job becomes COMPLETE', async () => { + const cleanup = setReadyEnv('TEST', 'WUR2'); + let calls = 0; + mockAgentSend(() => { + 
calls += 1; + const status = calls < 3 ? 'IN_PROGRESS' : 'COMPLETE'; + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur2', { source: './knowledge' }); + await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 5 }); + assert.ok(calls >= 3, `expected at least 3 polls before COMPLETE, got ${calls}`); + } finally { + cleanup(); + } + }); + + test('throws IngestionFailed (with failureReasons) when ingestion FAILED', async () => { + const cleanup = setReadyEnv('TEST', 'WUR3'); + mockAgentSend((cmd) => { + if (cmd.constructor.name === 'ListIngestionJobsCommand') { + return { ingestionJobSummaries: [{ ingestionJobId: 'job-fail', status: 'FAILED' }] }; + } + return { ingestionJob: { status: 'FAILED', failureReasons: ['S3 access denied'] } }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur3', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 1000, pollIntervalMs: 10 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.IngestionFailed); + assert.ok(err.message.includes('S3 access denied'), 'should surface failure reasons'); + return true; + }, + ); + } finally { + cleanup(); + } + }); + + test('throws Timeout when the job never completes within the budget', async () => { + const cleanup = setReadyEnv('TEST', 'WUR4'); + mockAgentSend(() => ({ ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'IN_PROGRESS' }] })); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur4', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.Timeout); + assert.ok(err.message.includes('30ms'), 'timeout message should include the budget'); + return true; + }, + ); + } finally { + cleanup(); + } + }); + + test('resolves immediately when no data source id is configured', async () => { 
+ const cleanup = setReadyEnv('TEST', 'WUR5', { dataSourceId: null }); + let sendCalled = false; + mockAgentSend(() => { + sendCalled = true; + return { ingestionJobSummaries: [] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur5', { source: 's3://my-docs-bucket' }); + await kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5 }); + assert.strictEqual(sendCalled, false, 'should not poll the control plane when there is nothing to track'); + } finally { + cleanup(); + } + }); +}); diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index 341bc32c..f647f570 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -7,9 +7,21 @@ import { type RetrievalFilter, type KnowledgeBaseRetrievalResult, } from '@aws-sdk/client-bedrock-agent-runtime'; +import { + BedrockAgentClient, + ListIngestionJobsCommand, + GetIngestionJobCommand, + type IngestionJobSummary, +} from '@aws-sdk/client-bedrock-agent'; import { Scope, registerSdkIdentifiers, getSdkIdentifiers } from '@aws-blocks/core'; import type { ScopeParent } from '@aws-blocks/core'; -import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter } from './types.js'; +import type { + KnowledgeBaseOptions, + RetrieveOptions, + RetrieveResult, + MetadataFilter, + WaitUntilReadyOptions, +} from './types.js'; import { KnowledgeBaseErrors } from './errors.js'; import { BB_NAME, BB_VERSION } from './version.js'; import { Logger } from '@aws-blocks/bb-logger'; @@ -23,6 +35,7 @@ export type { RetrieveOptions, RetrieveResult, MetadataFilter, + WaitUntilReadyOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -43,6 +56,11 @@ function blocksError(name: string, message: string): Error { return err; } +/** Resolve after `ms` milliseconds. Used to space out readiness polls in `waitUntilReady()`. 
*/ +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + // Match only messages that clearly indicate a metadata filter issue. // Default unknown ValidationExceptions to ValidationError — false negatives // (filter error → generic) are less harmful than false positives (content @@ -138,11 +156,13 @@ function buildFilter(filter?: MetadataFilter): RetrievalFilter | undefined { * * **Environment variables (injected by CDK):** * - `BLOCKS_{FULLID}_KB_ID` — Bedrock Knowledge Base ID + * - `BLOCKS_{FULLID}_DATA_SOURCE_ID` — Bedrock data source ID (used by `isReady()` / `waitUntilReady()`) */ export class KnowledgeBase extends Scope { readonly bbName = BB_NAME; private readonly fullIdCached: string; private readonly runtimeClient: BedrockAgentRuntimeClient; + private readonly agentClient: BedrockAgentClient; /** @internal Logger for internal operations. Defaults to error-level when not provided. */ protected log: ChildLogger; @@ -156,8 +176,15 @@ export class KnowledgeBase extends Scope { retryMode: 'adaptive', customUserAgent: this.buildUserAgentChain(), }); + // Control-plane client for ingestion-job status (readiness checks). + this.agentClient = new BedrockAgentClient({ + maxAttempts: 3, + retryMode: 'adaptive', + customUserAgent: this.buildUserAgentChain(), + }); const kbId = process.env[envKey(this.fullIdCached, 'KB_ID')] ?? ''; - registerSdkIdentifiers(this.fullId, { kbId }); + const dataSourceId = process.env[envKey(this.fullIdCached, 'DATA_SOURCE_ID')] ?? ''; + registerSdkIdentifiers(this.fullId, { kbId, dataSourceId }); } private ensureKbId(): string { @@ -170,6 +197,17 @@ export class KnowledgeBase extends Scope { ); } + /** + * Resolve the configured Bedrock data source id, or `undefined` when none + * was registered. A missing data source id means there is no BB-managed + * ingestion job to track (e.g. 
an imported `s3://` source, or a deployment + * that predates the readiness API), so callers treat the KB as ready. + */ + private ensureDataSourceId(): string | undefined { + const dataSourceId = getSdkIdentifiers(this).dataSourceId; + return dataSourceId ? dataSourceId : undefined; + } + /** * Retrieve relevant document chunks for a natural language query. * @@ -227,6 +265,150 @@ export class KnowledgeBase extends Scope { throw mapped; } } + + /** + * Report whether the knowledge base has finished ingesting and is ready to + * serve `retrieve()` calls. + * + * Bedrock ingestion runs asynchronously after deploy (it is triggered + * fire-and-forget), so during the warm-up window `retrieve()` returns an + * empty array even for queries that will later match. Use `isReady()` to + * distinguish "still warming up" (`false`) from "ingested, genuinely no + * match" (`true` alongside an empty `retrieve()` result). + * + * Resolution strategy: lists the data source's ingestion jobs (most recent + * first) and inspects the latest job's status — `COMPLETE` → ready, + * `FAILED` → throws, anything else (`STARTING` / `IN_PROGRESS`, or no jobs + * yet) → not ready. When no BB-managed data source id is configured (e.g. + * an imported `s3://` source, or a deployment predating this API) there is + * no ingestion job to track, so the KB is reported ready. + * + * @returns `true` when the latest ingestion job is `COMPLETE` (or there is + * no managed data source to track); `false` while ingestion is pending. + * @throws {KnowledgeBaseNotReadyException} If the KB has not been created/deployed. + * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). + * @throws {RetrievalFailedException} For other Bedrock control-plane errors (network, auth, throttling). 
+ * + * @example + * ```typescript + * if (await kb.isReady()) { + * const results = await kb.retrieve('how do I reset my password'); + * } + * ``` + */ + async isReady(): Promise<boolean> { + const knowledgeBaseId = this.ensureKbId(); + const dataSourceId = this.ensureDataSourceId(); + // No BB-managed ingestion to track → nothing to wait for. + if (!dataSourceId) return true; + + const job = await this.fetchLatestIngestionJob(knowledgeBaseId, dataSourceId); + // No ingestion job recorded yet → ingestion has not started; still warming. + if (!job) return false; + + if (job.status === 'COMPLETE') return true; + if (job.status === 'FAILED') { + const reasons = await this.fetchFailureReasons(knowledgeBaseId, dataSourceId, job.ingestionJobId); + throw blocksError( + KnowledgeBaseErrors.IngestionFailed, + `Knowledge base ingestion failed.${reasons.length ? ` Reasons: ${reasons.join('; ')}` : ''}`, + ); + } + // STARTING | IN_PROGRESS | STOPPING | STOPPED → not ready. + return false; + } + + /** + * Wait until the knowledge base has finished ingesting, polling its + * ingestion-job status until ready or until the timeout elapses. + * + * Polls {@link isReady} every `pollIntervalMs` until it returns `true` + * (resolves) or the `timeoutMs` budget is exhausted (throws). If the most + * recent ingestion job has `FAILED`, the underlying `IngestionFailedException` + * propagates immediately rather than waiting out the timeout. + * + * @param {WaitUntilReadyOptions} options - Optional polling parameters. + * `timeoutMs` (default 300000) bounds the total wait; `pollIntervalMs` + * (default 5000, clamped to a minimum of 1ms) spaces out the polls. + * @throws {KnowledgeBaseTimeoutException} If the KB does not become ready within `timeoutMs`. + * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). + * @throws {KnowledgeBaseNotReadyException} If the KB has not been created/deployed.
+ * @throws {RetrievalFailedException} For other Bedrock control-plane errors (network, auth, throttling). + * + * @example + * ```typescript + * // Block until the KB is queryable (e.g. right after deploy) + * await kb.waitUntilReady({ timeoutMs: 600_000 }); + * const results = await kb.retrieve('getting started'); + * ``` + */ + async waitUntilReady(options?: WaitUntilReadyOptions): Promise<void> { + const timeoutMs = Math.max(options?.timeoutMs ?? 300_000, 0); + const pollIntervalMs = Math.max(options?.pollIntervalMs ?? 5_000, 1); + const deadline = Date.now() + timeoutMs; + + for (;;) { + // isReady() throws IngestionFailedException on a FAILED job — let it propagate. + if (await this.isReady()) return; + if (Date.now() >= deadline) { + throw blocksError( + KnowledgeBaseErrors.Timeout, + `Knowledge base did not become ready within ${timeoutMs}ms.`, + ); + } + // Never sleep past the deadline. + await sleep(Math.min(pollIntervalMs, Math.max(deadline - Date.now(), 0))); + } + } + + /** + * List the data source's ingestion jobs (most recent first) and return the + * latest summary, or `undefined` when none exist yet. SDK errors are mapped + * to Blocks error constants via {@link mapSdkError}. + */ + private async fetchLatestIngestionJob( + knowledgeBaseId: string, + dataSourceId: string, + ): Promise<IngestionJobSummary | undefined> { + try { + const response = await this.agentClient.send( + new ListIngestionJobsCommand({ + knowledgeBaseId, + dataSourceId, + sortBy: { attribute: 'STARTED_AT', order: 'DESCENDING' }, + maxResults: 1, + }), + ); + return response.ingestionJobSummaries?.[0]; + } catch (err) { + const mapped = mapSdkError(err); + this.log.error(mapped.message); + throw mapped; + } + } + + /** + * Fetch the `failureReasons` for a failed ingestion job. Best-effort: the + * `ListIngestionJobs` summary omits failure reasons, so this issues a + * `GetIngestionJob` for the detail. Returns an empty array if the id is + * missing or the lookup fails — the caller still reports the failure.
+ */ + private async fetchFailureReasons( + knowledgeBaseId: string, + dataSourceId: string, + ingestionJobId: string | undefined, + ): Promise { + if (!ingestionJobId) return []; + try { + const response = await this.agentClient.send( + new GetIngestionJobCommand({ knowledgeBaseId, dataSourceId, ingestionJobId }), + ); + return response.ingestionJob?.failureReasons ?? []; + } catch (err) { + this.log.error(mapSdkError(err).message); + return []; + } + } } // ── Result mapping ───────────────────────────────────────────────────────── diff --git a/packages/bb-knowledge-base/src/index.browser.ts b/packages/bb-knowledge-base/src/index.browser.ts index b4de0772..c31e1e3b 100644 --- a/packages/bb-knowledge-base/src/index.browser.ts +++ b/packages/bb-knowledge-base/src/index.browser.ts @@ -2,14 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 import type { ScopeParent } from '@aws-blocks/core'; -import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult } from './types.js'; +import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, WaitUntilReadyOptions } from './types.js'; import { KnowledgeBaseErrors } from './errors.js'; export type { KnowledgeBaseOptions, SourceConfig, ChunkingConfig, ChunkingStrategy, RetrieveOptions, RetrieveResult, - MetadataFilter, + MetadataFilter, WaitUntilReadyOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -40,4 +40,12 @@ export class KnowledgeBase { async retrieve(_query: string, _options?: RetrieveOptions): Promise { throw browserError(); } + + async isReady(): Promise { + throw browserError(); + } + + async waitUntilReady(_options?: WaitUntilReadyOptions): Promise { + throw browserError(); + } } diff --git a/packages/bb-knowledge-base/src/index.cdk.ts b/packages/bb-knowledge-base/src/index.cdk.ts index 765cb26b..43ebbe99 100644 --- a/packages/bb-knowledge-base/src/index.cdk.ts +++ b/packages/bb-knowledge-base/src/index.cdk.ts @@ -19,7 +19,7 @@ export type { KnowledgeBaseOptions, 
SourceConfig, ChunkingConfig, ChunkingStrategy, RetrieveOptions, RetrieveResult, - MetadataFilter, + MetadataFilter, WaitUntilReadyOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -176,9 +176,11 @@ function generateMetadataSidecars(sourceDir: string): string | undefined { * * **Environment variables injected into the handler:** * - `BLOCKS_{FULLID}_KB_ID` — Bedrock Knowledge Base ID (used by the AWS runtime) + * - `BLOCKS_{FULLID}_DATA_SOURCE_ID` — Bedrock data source ID (used by `isReady()` / `waitUntilReady()`) * * **IAM grants to the handler:** * - `bedrock:Retrieve` — query the knowledge base at runtime + * - `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` — check ingestion readiness * * @param scope - Parent scope. * @param id - Unique identifier within the scope. @@ -430,23 +432,34 @@ export class KnowledgeBase extends Scope { startIngestion.node.addDependency(deployment); } - // ── 8. Handler env vars ─────────────────────────────────────────── - // The AWS runtime reads these to locate the Bedrock resources. + // ── 8. Handler config (read by the AWS runtime) ─────────────────── + // Registered via registerConfig (not addEnvironment) so the runtime can + // locate the Bedrock resources. KB_ID drives retrieve(); DATA_SOURCE_ID + // drives the isReady()/waitUntilReady() ingestion-readiness checks. registerConfig(this, envKey(this.fullId, 'KB_ID'), knowledgeBase.attrKnowledgeBaseId); + registerConfig(this, envKey(this.fullId, 'DATA_SOURCE_ID'), dataSource.attrDataSourceId); // ── 9. 
Handler IAM grants ───────────────────────────────────────── + const knowledgeBaseArn = cdk.Stack.of(this).formatArn({ + service: 'bedrock', + resource: 'knowledge-base', + resourceName: knowledgeBase.attrKnowledgeBaseId, + arnFormat: cdk.ArnFormat.SLASH_RESOURCE_NAME, + }); + this.handler.addToRolePolicy(new iam.PolicyStatement({ actions: ['bedrock:Retrieve'], - resources: [ - cdk.Stack.of(this).formatArn({ - service: 'bedrock', - resource: 'knowledge-base', - resourceName: knowledgeBase.attrKnowledgeBaseId, - arnFormat: cdk.ArnFormat.SLASH_RESOURCE_NAME, - }), - ], + resources: [knowledgeBaseArn], + })); + + // Ingestion-job status for isReady()/waitUntilReady(). These actions are + // authorized at the knowledge-base resource level (the data source and + // ingestion jobs are sub-resources of the KB ARN). + this.handler.addToRolePolicy(new iam.PolicyStatement({ + actions: ['bedrock:GetIngestionJob', 'bedrock:ListIngestionJobs'], + resources: [knowledgeBaseArn], })); } } diff --git a/packages/bb-knowledge-base/src/index.mock.test.ts b/packages/bb-knowledge-base/src/index.mock.test.ts index 27d11d5b..f4ed132e 100644 --- a/packages/bb-knowledge-base/src/index.mock.test.ts +++ b/packages/bb-knowledge-base/src/index.mock.test.ts @@ -1032,5 +1032,33 @@ describe('unicode / multilingual retrieval', () => { }); }); +// ── Readiness (local dev: no warm-up window) ──────────────────────────────── +// +// The local corpus loads synchronously on first retrieve(), so there is no +// asynchronous ingestion warm-up — isReady() is always true and +// waitUntilReady() resolves immediately (options are ignored). 
+ +describe('readiness', () => { + test('isReady() resolves true immediately', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'ready', { source: 'test-knowledge-tmp' }); + assert.strictEqual(await kb.isReady(), true); + }); + + test('waitUntilReady() resolves immediately', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'waitready', { source: 'test-knowledge-tmp' }); + await kb.waitUntilReady(); + }); + + test('isReady() is true even for an S3 URI source (no local warm-up)', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'readys3', { source: 's3://my-docs-bucket' }); + assert.strictEqual(await kb.isReady(), true); + }); + + test('waitUntilReady() ignores options and resolves immediately', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'waitopts', { source: 'test-knowledge-tmp' }); + await kb.waitUntilReady({ timeoutMs: 1, pollIntervalMs: 1 }); + }); +}); + // ── Cleanup after all tests ──────────────────────────────────────────────── test('cleanup', () => { cleanup(); }); diff --git a/packages/bb-knowledge-base/src/index.mock.ts b/packages/bb-knowledge-base/src/index.mock.ts index d873ca9c..eab04857 100644 --- a/packages/bb-knowledge-base/src/index.mock.ts +++ b/packages/bb-knowledge-base/src/index.mock.ts @@ -8,7 +8,7 @@ import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync, statSy import { join, relative, dirname, extname, resolve, sep } from 'node:path'; import { createHash } from 'node:crypto'; import { buildIndex, search, type TfIdfIndex } from './tfidf.js'; -import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, ChunkingStrategy } from './types.js'; +import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, ChunkingStrategy, WaitUntilReadyOptions } from './types.js'; import { KnowledgeBaseErrors } from './errors.js'; import { Logger } from '@aws-blocks/bb-logger'; import type { ChildLogger } from '@aws-blocks/bb-logger'; @@ -22,6 
+22,7 @@ export type { RetrieveOptions, RetrieveResult, MetadataFilter, + WaitUntilReadyOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -251,6 +252,33 @@ export class KnowledgeBase extends Scope { return results; } + /** + * Report whether the knowledge base is ready to serve `retrieve()` calls. + * + * Local development has no asynchronous ingestion warm-up window — the + * corpus is read and indexed synchronously on the first `retrieve()` — so + * this always resolves `true`. (In production the AWS runtime polls the + * Bedrock ingestion-job status, which may briefly report `false`.) + * + * @returns Always `true` in local development. + */ + async isReady(): Promise { + return true; + } + + /** + * Resolve once the knowledge base has finished ingesting. + * + * Local development has no ingestion warm-up window (see {@link isReady}), + * so this resolves immediately. The options are accepted for API parity + * with the AWS runtime and are otherwise ignored locally. + * + * @param {WaitUntilReadyOptions} _options - Accepted for API parity; ignored in local development. + */ + async waitUntilReady(_options?: WaitUntilReadyOptions): Promise { + // No-op: the local corpus loads synchronously, so there is nothing to wait for. + } + // ── Lazy loading ────────────────────────────────────────────────────── private ensureLoaded(): Promise { diff --git a/packages/bb-knowledge-base/src/types.ts b/packages/bb-knowledge-base/src/types.ts index 6068bcdb..88fb5ae1 100644 --- a/packages/bb-knowledge-base/src/types.ts +++ b/packages/bb-knowledge-base/src/types.ts @@ -153,3 +153,21 @@ export interface RetrieveResult { /** Document metadata key-value pairs. Includes auto-populated `folder` key derived from subfolder structure. */ metadata: Record; } + +// ── Readiness Options ────────────────────────────────────────────────────── + +/** + * Options for the `waitUntilReady()` method. 
+ * + * @example + * ```typescript + * // Wait up to 10 minutes, polling every 10 seconds + * await kb.waitUntilReady({ timeoutMs: 600_000, pollIntervalMs: 10_000 }); + * ``` + */ +export interface WaitUntilReadyOptions { + /** Maximum time to wait for ingestion to complete, in milliseconds. Default: 300000 (5 minutes). */ + timeoutMs?: number; + /** Delay between readiness polls, in milliseconds. Clamped to a minimum of 1ms. Default: 5000 (5 seconds). */ + pollIntervalMs?: number; +} From 3615b6a58a14b3c8bd907dc8954a1012349a5cbd Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Wed, 24 Jun 2026 13:33:38 +0000 Subject: [PATCH 02/14] fix(bb-knowledge-base): apply removal policy to S3 Vectors resources + CDK teardown test + decisions note --- .changeset/kb-teardown.md | 9 ++ packages/bb-knowledge-base/DECISIONS.md | 81 ++++++++++++++++ .../bb-knowledge-base/src/index.cdk.test.ts | 92 +++++++++++++++++++ packages/bb-knowledge-base/src/index.cdk.ts | 27 +++++- 4 files changed, 204 insertions(+), 5 deletions(-) create mode 100644 .changeset/kb-teardown.md create mode 100644 packages/bb-knowledge-base/DECISIONS.md create mode 100644 packages/bb-knowledge-base/src/index.cdk.test.ts diff --git a/.changeset/kb-teardown.md b/.changeset/kb-teardown.md new file mode 100644 index 00000000..7fe5b506 --- /dev/null +++ b/.changeset/kb-teardown.md @@ -0,0 +1,9 @@ +--- +"@aws-blocks/bb-knowledge-base": patch +--- + +fix(bb-knowledge-base): apply the data bucket's removal policy to the S3 Vectors resources on teardown + +On a `removalPolicy: 'destroy'` (or sandbox) teardown, the data `s3.Bucket` was force-deleted and auto-emptied, but the S3 Vectors store — the `CfnVectorBucket` + `CfnIndex` L1 resources — relied solely on its default CloudFormation `DeletionPolicy` and leaked. 
Those resources now mirror the data bucket: `DeletionPolicy: Delete` (via `applyRemovalPolicy(RemovalPolicy.DESTROY)`) when `destroy` is requested, and `RemovalPolicy.RETAIN` otherwise, so the vector bucket and index are dropped alongside the data bucket on a clean teardown. + +Purely additive — no exported types, signatures, or error constants changed. diff --git a/packages/bb-knowledge-base/DECISIONS.md b/packages/bb-knowledge-base/DECISIONS.md new file mode 100644 index 00000000..90aacda2 --- /dev/null +++ b/packages/bb-knowledge-base/DECISIONS.md @@ -0,0 +1,81 @@ + + +# KnowledgeBase — Design Decisions + +Short rationale notes for non-obvious infrastructure choices in +`src/index.cdk.ts`. These exist to save the next maintainer a round of +archaeology. + +## D1 — Raw `s3.Bucket` for the data bucket (not the `FileBucket` Building Block) + +The data bucket is provisioned with a raw `aws-cdk-lib/aws-s3` `s3.Bucket` +rather than the `FileBucket` Building Block, even though `FileBucket` exists for +"an app needs an S3 bucket" use cases. + +Bedrock ingestion assumes an IAM role that must **read** the data bucket, and +the Knowledge Base / Data Source wiring needs low-level bucket primitives that +`FileBucket` intentionally does not expose: + +- **`bucketArn`** — fed verbatim into `CfnDataSource.s3Configuration.bucketArn`. +- **`grantRead(role)`** — grants the Bedrock service-principal role read access + with the exact resource scoping CDK generates. +- **`enforceSSL: true`** — required posture for the bucket policy. +- **`PhysicalName.GENERATE_IF_NEEDED`** — a CDK-generated name so the bucket can + be referenced cross-construct (and, for an imported `s3://` source, swapped + for `Bucket.fromBucketName`) without the caller having to name it. + +`FileBucket` is a higher-level, app-facing abstraction (presigned uploads, +client access patterns) and does not surface these primitives. 
Reaching for the +raw L2 here keeps the Bedrock IAM grant precise and avoids bending `FileBucket` +into an infrastructure role it was not designed for. + +## D2 — S3 Vectors resources mirror the data bucket's removal policy + +The vector store is a pair of S3 Vectors **L1** resources +(`s3vectors.CfnVectorBucket` + `s3vectors.CfnIndex`). Unlike the L2 `s3.Bucket` +— which defaults to `RETAIN` and supports `autoDeleteObjects` — these L1 +resources rely solely on their CloudFormation `DeletionPolicy`, whose default is +`Delete`. Left unmanaged they are inconsistent with the data bucket on teardown. + +We therefore apply a removal policy to both, computed from the **same** `destroy` +signal that drives the data bucket: + +- `removalPolicy: 'destroy'` (or sandbox mode with no explicit policy) → + `RemovalPolicy.DESTROY` → `DeletionPolicy: Delete`. The vector bucket + index + are dropped alongside the (auto-emptied) data bucket. +- otherwise → `RemovalPolicy.RETAIN` → `DeletionPolicy: Retain`, matching the + data bucket's `RETAIN`-by-default posture so customer data is never silently + destroyed. + +`applyRemovalPolicy()` sets both `DeletionPolicy` and `UpdateReplacePolicy`. +There is no `autoDeleteObjects` equivalent for S3 Vectors, but a vector bucket +deleted by CloudFormation is removed with its contents, so no manual emptying +step is needed for the vector store. + +## D3 — Teardown caveat: the stack-level `RemovalPolicies` aspect cannot auto-empty the data bucket + +Some templates force a whole stack to be destroyable with a CDK aspect: + +```ts +import { RemovalPolicies } from 'aws-cdk-lib'; +RemovalPolicies.of(stack).destroy(); +``` + +This aspect flips every resource's `DeletionPolicy` to `Delete`, **but it cannot +enable `autoDeleteObjects`** on a bucket — `autoDeleteObjects` is a constructor +behavior (it provisions a custom resource + Lambda that empties the bucket on +delete), not a CloudFormation attribute an aspect can toggle after the fact. 
+ +Consequence: if you rely solely on the stack-level aspect and do **not** pass +`removalPolicy: 'destroy'` to the KnowledgeBase, the data bucket's +`DeletionPolicy` becomes `Delete` but it still has objects in it, so +CloudFormation's `DELETE` fails with `BucketNotEmpty` and the teardown stalls. + +**Recommendation:** for a clean teardown, pass `removalPolicy: 'destroy'` to the +KnowledgeBase (or run in sandbox mode). That path pairs `RemovalPolicy.DESTROY` +with `autoDeleteObjects` on the data bucket and `DeletionPolicy: Delete` on the +S3 Vectors resources (see D2), so the bucket is emptied and every resource is +removed without manual intervention. diff --git a/packages/bb-knowledge-base/src/index.cdk.test.ts b/packages/bb-knowledge-base/src/index.cdk.test.ts new file mode 100644 index 00000000..16bec2ea --- /dev/null +++ b/packages/bb-knowledge-base/src/index.cdk.test.ts @@ -0,0 +1,92 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * CDK-side teardown tests for KnowledgeBase. + * + * History: the data `s3.Bucket` paired `RemovalPolicy.DESTROY` with + * `autoDeleteObjects` on a `destroy`/sandbox teardown, but the S3 Vectors L1 + * resources (`CfnVectorBucket` + `CfnIndex`) relied solely on their default + * CloudFormation `DeletionPolicy` and leaked. These tests pin the fix: the + * vector resources now mirror the data bucket's removal policy. + */ +import { test } from 'node:test'; +import { fileURLToPath } from 'node:url'; +import { dirname, resolve } from 'node:path'; +import * as cdk from 'aws-cdk-lib'; +import type { Construct } from 'constructs'; +import { Template } from 'aws-cdk-lib/assertions'; +import * as s3vectors from 'aws-cdk-lib/aws-s3vectors'; +import { Scope, DEFAULT_NODE_RUNTIME } from '@aws-blocks/core/cdk'; +import { KnowledgeBase } from './index.cdk.js'; + +// Real local-folder source so BucketDeployment + sidecar generation synth. 
+const FIXTURES = resolve(dirname(fileURLToPath(import.meta.url)), '..', 'test-fixtures', 'knowledge'); + +// Pull CFN type names off the L1 classes so the assertions don't drift if AWS +// renames the underlying resource types. +const VECTOR_BUCKET_TYPE = s3vectors.CfnVectorBucket.CFN_RESOURCE_TYPE_NAME; +const VECTOR_INDEX_TYPE = s3vectors.CfnIndex.CFN_RESOURCE_TYPE_NAME; + +// Minimal BlocksStack-shaped parent — KnowledgeBase calls +// `this.handler.addToRolePolicy(...)` and `cdk.Stack.of(this)`, both of which +// resolve through CURRENT_BLOCKS_STACK (mirrors the production BlocksStack). +class StubBlocksStack extends cdk.Stack { + public readonly handler: cdk.aws_lambda.Function; + public readonly id: string; + constructor(scope: Construct, id: string) { + super(scope, id); + this.id = id; + (globalThis as any).CURRENT_BLOCKS_STACK = this; + this.handler = new cdk.aws_lambda.Function(this, 'StubHandler', { + runtime: DEFAULT_NODE_RUNTIME, + handler: 'index.handler', + code: cdk.aws_lambda.Code.fromInline('exports.handler = async () => {};'), + }); + } +} + +function synth(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: boolean } = {}): Template { + const app = new cdk.App(options.sandbox ? { context: { sandboxMode: 'true' } } : undefined); + // S3 bucket names must be lowercase; the data bucket derives its name from + // the scope chain, so keep ids lowercase. + const stack = new StubBlocksStack(app, 'teststack'); + const parent = new Scope('app'); + new KnowledgeBase(parent, 'docs', { + source: FIXTURES, + ...(options.removalPolicy ? { removalPolicy: options.removalPolicy } : {}), + }); + return Template.fromStack(stack); +} + +test("CDK: removalPolicy 'destroy' makes the data bucket + vector store deletable and adds auto-delete", () => { + const template = synth({ removalPolicy: 'destroy' }); + + // Data bucket: force-deletable and auto-empties on teardown. 
+ template.hasResource('AWS::S3::Bucket', { DeletionPolicy: 'Delete' }); + template.resourceCountIs('Custom::S3AutoDeleteObjects', 1); + + // S3 Vectors resources mirror the data bucket — dropped on teardown. + template.hasResource(VECTOR_BUCKET_TYPE, { DeletionPolicy: 'Delete' }); + template.hasResource(VECTOR_INDEX_TYPE, { DeletionPolicy: 'Delete' }); +}); + +test("CDK: removalPolicy 'retain' keeps the data bucket + vector store and omits auto-delete", () => { + const template = synth({ removalPolicy: 'retain' }); + + template.hasResource('AWS::S3::Bucket', { DeletionPolicy: 'Retain' }); + template.resourceCountIs('Custom::S3AutoDeleteObjects', 0); + + template.hasResource(VECTOR_BUCKET_TYPE, { DeletionPolicy: 'Retain' }); + template.hasResource(VECTOR_INDEX_TYPE, { DeletionPolicy: 'Retain' }); +}); + +test('CDK: sandboxMode context defaults the data bucket + vector store to destroy', () => { + const template = synth({ sandbox: true }); + + template.hasResource('AWS::S3::Bucket', { DeletionPolicy: 'Delete' }); + template.resourceCountIs('Custom::S3AutoDeleteObjects', 1); + + template.hasResource(VECTOR_BUCKET_TYPE, { DeletionPolicy: 'Delete' }); + template.hasResource(VECTOR_INDEX_TYPE, { DeletionPolicy: 'Delete' }); +}); diff --git a/packages/bb-knowledge-base/src/index.cdk.ts b/packages/bb-knowledge-base/src/index.cdk.ts index 43ebbe99..c900538f 100644 --- a/packages/bb-knowledge-base/src/index.cdk.ts +++ b/packages/bb-knowledge-base/src/index.cdk.ts @@ -194,6 +194,15 @@ export class KnowledgeBase extends Scope { // ── 1. S3 Data Bucket ────────────────────────────────────────────── + // In sandbox mode, default to DESTROY + autoDeleteObjects so a teardown + // can fully clean up without manual bucket emptying. An explicit + // `removalPolicy` from the customer always takes precedence. Computed + // up-front because it also drives the S3 Vectors resources' deletion + // policy (section 2) — keeping the data bucket and the vector store in + // sync on teardown. 
+ const isSandbox = cdk.Stack.of(this).node.tryGetContext('sandboxMode') === 'true'; + const destroy = options.removalPolicy === 'destroy' || (isSandbox && options.removalPolicy === undefined); + let dataBucket: s3.IBucket; let inclusionPrefixes: string[] | undefined; @@ -221,11 +230,6 @@ export class KnowledgeBase extends Scope { inclusionPrefixes = [prefix.endsWith('/') ? prefix : prefix + '/']; } } else { - // In sandbox mode, default to DESTROY + autoDeleteObjects so - // `cdk destroy` can fully clean up without manual bucket emptying. - // Explicit `removalPolicy` from the customer takes precedence. - const isSandbox = cdk.Stack.of(this).node.tryGetContext('sandboxMode') === 'true'; - const destroy = options.removalPolicy === 'destroy' || (isSandbox && options.removalPolicy === undefined); dataBucket = new s3.Bucket(this, 'Data', { bucketName: cdk.PhysicalName.GENERATE_IF_NEEDED, blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, @@ -258,6 +262,19 @@ export class KnowledgeBase extends Scope { }, }); + // Mirror the data bucket's teardown behavior on the S3 Vectors L1 + // resources. Unlike the L2 s3.Bucket — which defaults to RETAIN and can + // auto-empty via autoDeleteObjects — these CfnVectorBucket/CfnIndex + // resources rely solely on their CloudFormation DeletionPolicy (default: + // Delete). Without an explicit policy they'd be inconsistent with the + // data bucket on teardown. applyRemovalPolicy sets both DeletionPolicy + // and UpdateReplacePolicy. RETAIN by default (parity with the data + // bucket); DELETE only when a `removalPolicy:'destroy'` / sandbox + // teardown is requested, so the vector store is dropped alongside it. + const vectorRemovalPolicy = destroy ? cdk.RemovalPolicy.DESTROY : cdk.RemovalPolicy.RETAIN; + vectorBucket.applyRemovalPolicy(vectorRemovalPolicy); + vectorIndex.applyRemovalPolicy(vectorRemovalPolicy); + // ── 3. 
IAM Role for Bedrock ──────────────────────────────────────── // Scoped to this account via aws:SourceAccount to prevent confused-deputy. // Ideally we'd also add aws:SourceArn scoped to the KB ARN, but that From 726fc4f60ac1b1fd49c7953d5a6bcd3a84e22c5d Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Wed, 24 Jun 2026 13:40:15 +0000 Subject: [PATCH 03/14] chore(comprehensive): gate KB e2e on waitUntilReady + friendly warming output --- test-apps/comprehensive/aws-blocks/index.ts | 14 ++++- .../comprehensive/test/knowledge-base.test.ts | 53 ++++++++++++------- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/test-apps/comprehensive/aws-blocks/index.ts b/test-apps/comprehensive/aws-blocks/index.ts index 0fd92b6c..f35c3dee 100644 --- a/test-apps/comprehensive/aws-blocks/index.ts +++ b/test-apps/comprehensive/aws-blocks/index.ts @@ -13,7 +13,7 @@ import { DistributedTableErrors } from '@aws-blocks/bb-distributed-table'; import { isBlocksError } from '@aws-blocks/core'; import { AsyncJob } from '@aws-blocks/bb-async-job'; import { AppSetting } from '@aws-blocks/bb-app-setting'; -import type { RetrieveOptions } from '@aws-blocks/bb-knowledge-base'; +import type { RetrieveOptions, WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; import { Tracer } from '@aws-blocks/bb-tracer'; import { Logger } from '@aws-blocks/bb-logger'; import { createKyselyAdapter, DatabaseErrors } from '@aws-blocks/bb-data'; @@ -1838,6 +1838,18 @@ export const api = new ApiNamespace(scope, 'api', (context) => ({ return await kb.retrieve(query, options); }, + // Ingestion readiness — Bedrock ingests asynchronously after deploy, so e2e + // tests gate retrieval on these instead of polling retrieve() for results. + // The local mock reports ready immediately. 
+ async kbReady() { + return await kb.isReady(); + }, + + async kbWaitUntilReady(options?: WaitUntilReadyOptions) { + await kb.waitUntilReady(options); + return { success: true }; + }, + // ------------------------------------------------------------------------ // EmailClient Tests // ------------------------------------------------------------------------ diff --git a/test-apps/comprehensive/test/knowledge-base.test.ts b/test-apps/comprehensive/test/knowledge-base.test.ts index 1aea12af..ebecd6ea 100644 --- a/test-apps/comprehensive/test/knowledge-base.test.ts +++ b/test-apps/comprehensive/test/knowledge-base.test.ts @@ -12,35 +12,48 @@ const ENV = process.env.BLOCKS_TEST_ENV || 'local'; const isLocal = ENV === 'local'; /** - * Poll kbRetrieve until at least one result is returned, indicating that - * the knowledge-base ingestion job has completed. In local mode the mock - * returns results immediately; in sandbox/production Bedrock ingestion is - * async and may take a couple of minutes after deploy. + * Gate retrieval tests on knowledge-base ingestion readiness using the + * `isReady()` API (exposed here as `kbReady`). Bedrock ingests asynchronously + * after deploy, so during the warm-up window we poll readiness rather than + * probing `kbRetrieve` for results. + * + * - `kbReady() === false` is the expected transient "still ingesting" state — + * we print a friendly one-liner and keep polling. + * - A *thrown* error is a real failure (a failed ingestion job surfaced as + * `IngestionFailedException`, a `KnowledgeBaseValidationError`, or anything + * unexpected) and is surfaced immediately rather than masked as warm-up. + * + * In local mode the mock reports ready immediately, so this returns on the + * first poll. 
+ */ -async function waitForIngestion( +async function gateOnReadiness( api: typeof apiType, - query: string, - { timeoutMs = 180_000, intervalMs = 10_000 } = {}, + { timeoutMs = 180_000, pollIntervalMs = 10_000 } = {}, ): Promise<void> { const start = Date.now(); const deadline = start + timeoutMs; let attempt = 0; - while (Date.now() < deadline) { + while (true) { attempt++; const elapsed = Math.round((Date.now() - start) / 1000); + let ready: boolean; try { - const results = await api.kbRetrieve(query); - console.log(`[KB ingestion poll #${attempt}] ${results.length} results (${elapsed}s elapsed)`); - if (results.length > 0) return; + ready = await api.kbReady(); } catch (err: any) { - console.log(`[KB ingestion poll #${attempt}] error: ${err.name || err.message} (${elapsed}s elapsed)`); - // ValidationError = real bug, throw immediately - if (isBlocksError(err, ValidationError)) throw err; - // NotReady, RetrievalFailed, etc. = KB not ready yet, keep polling + // Real failure (failed ingestion / validation / unexpected) — surface it.
+ console.error(`❌ KB readiness check failed: ${err.name || err.message}`); + throw err; } - await setTimeout(intervalMs); + if (ready) { + console.log(`✅ KB ready (ingestion complete) — ${elapsed}s elapsed`); + return; + } + if (Date.now() >= deadline) { + throw new Error(`KB did not become ready within ${timeoutMs / 1000}s`); + } + console.log(`⏳ KB still warming up (ingestion in progress) — attempt #${attempt}, ${elapsed}s elapsed`); + await setTimeout(pollIntervalMs); } - throw new Error(`KB ingestion did not complete within ${timeoutMs / 1000}s`); } export function knowledgeBaseTests(getApi: () => typeof apiType) { @@ -77,7 +90,7 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { before(async () => { const api = getApi(); - await waitForIngestion(api, 'getting started'); + await gateOnReadiness(api); }); test('returns results for a matching query', async () => { @@ -134,7 +147,7 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { before(async () => { const api = getApi(); - await waitForIngestion(api, 'getting started'); + await gateOnReadiness(api); }); test('maxResults limits results', async () => { @@ -172,7 +185,7 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { before(async () => { const api = getApi(); - await waitForIngestion(api, 'deployment'); + await gateOnReadiness(api); }); test('customer metadata category is present on tutorial doc', async () => { From 3edefb865d49eff42049cc0d56f1c00703adae2b Mon Sep 17 00:00:00 2001 From: "H. 
Furkan Bozkurt" Date: Thu, 25 Jun 2026 00:19:17 +0000 Subject: [PATCH 04/14] docs(bb-knowledge-base): document readiness API + errors in README/DESIGN, consolidate decisions --- packages/bb-knowledge-base/DECISIONS.md | 81 --------------------- packages/bb-knowledge-base/DESIGN.md | 40 +++++++++- packages/bb-knowledge-base/README.md | 23 +++++- packages/bb-knowledge-base/src/index.aws.ts | 15 +++- 4 files changed, 70 insertions(+), 89 deletions(-) delete mode 100644 packages/bb-knowledge-base/DECISIONS.md diff --git a/packages/bb-knowledge-base/DECISIONS.md b/packages/bb-knowledge-base/DECISIONS.md deleted file mode 100644 index 90aacda2..00000000 --- a/packages/bb-knowledge-base/DECISIONS.md +++ /dev/null @@ -1,81 +0,0 @@ - - -# KnowledgeBase — Design Decisions - -Short rationale notes for non-obvious infrastructure choices in -`src/index.cdk.ts`. These exist to save the next maintainer a round of -archaeology. - -## D1 — Raw `s3.Bucket` for the data bucket (not the `FileBucket` Building Block) - -The data bucket is provisioned with a raw `aws-cdk-lib/aws-s3` `s3.Bucket` -rather than the `FileBucket` Building Block, even though `FileBucket` exists for -"an app needs an S3 bucket" use cases. - -Bedrock ingestion assumes an IAM role that must **read** the data bucket, and -the Knowledge Base / Data Source wiring needs low-level bucket primitives that -`FileBucket` intentionally does not expose: - -- **`bucketArn`** — fed verbatim into `CfnDataSource.s3Configuration.bucketArn`. -- **`grantRead(role)`** — grants the Bedrock service-principal role read access - with the exact resource scoping CDK generates. -- **`enforceSSL: true`** — required posture for the bucket policy. -- **`PhysicalName.GENERATE_IF_NEEDED`** — a CDK-generated name so the bucket can - be referenced cross-construct (and, for an imported `s3://` source, swapped - for `Bucket.fromBucketName`) without the caller having to name it. 
- -`FileBucket` is a higher-level, app-facing abstraction (presigned uploads, -client access patterns) and does not surface these primitives. Reaching for the -raw L2 here keeps the Bedrock IAM grant precise and avoids bending `FileBucket` -into an infrastructure role it was not designed for. - -## D2 — S3 Vectors resources mirror the data bucket's removal policy - -The vector store is a pair of S3 Vectors **L1** resources -(`s3vectors.CfnVectorBucket` + `s3vectors.CfnIndex`). Unlike the L2 `s3.Bucket` -— which defaults to `RETAIN` and supports `autoDeleteObjects` — these L1 -resources rely solely on their CloudFormation `DeletionPolicy`, whose default is -`Delete`. Left unmanaged they are inconsistent with the data bucket on teardown. - -We therefore apply a removal policy to both, computed from the **same** `destroy` -signal that drives the data bucket: - -- `removalPolicy: 'destroy'` (or sandbox mode with no explicit policy) → - `RemovalPolicy.DESTROY` → `DeletionPolicy: Delete`. The vector bucket + index - are dropped alongside the (auto-emptied) data bucket. -- otherwise → `RemovalPolicy.RETAIN` → `DeletionPolicy: Retain`, matching the - data bucket's `RETAIN`-by-default posture so customer data is never silently - destroyed. - -`applyRemovalPolicy()` sets both `DeletionPolicy` and `UpdateReplacePolicy`. -There is no `autoDeleteObjects` equivalent for S3 Vectors, but a vector bucket -deleted by CloudFormation is removed with its contents, so no manual emptying -step is needed for the vector store. 
- -## D3 — Teardown caveat: the stack-level `RemovalPolicies` aspect cannot auto-empty the data bucket - -Some templates force a whole stack to be destroyable with a CDK aspect: - -```ts -import { RemovalPolicies } from 'aws-cdk-lib'; -RemovalPolicies.of(stack).destroy(); -``` - -This aspect flips every resource's `DeletionPolicy` to `Delete`, **but it cannot -enable `autoDeleteObjects`** on a bucket — `autoDeleteObjects` is a constructor -behavior (it provisions a custom resource + Lambda that empties the bucket on -delete), not a CloudFormation attribute an aspect can toggle after the fact. - -Consequence: if you rely solely on the stack-level aspect and do **not** pass -`removalPolicy: 'destroy'` to the KnowledgeBase, the data bucket's -`DeletionPolicy` becomes `Delete` but it still has objects in it, so -CloudFormation's `DELETE` fails with `BucketNotEmpty` and the teardown stalls. - -**Recommendation:** for a clean teardown, pass `removalPolicy: 'destroy'` to the -KnowledgeBase (or run in sandbox mode). That path pairs `RemovalPolicy.DESTROY` -with `autoDeleteObjects` on the data bucket and `DeletionPolicy: Delete` on the -S3 Vectors resources (see D2), so the bucket is emptied and every resource is -removed without manual intervention. diff --git a/packages/bb-knowledge-base/DESIGN.md b/packages/bb-knowledge-base/DESIGN.md index f44b5655..151ba4d0 100644 --- a/packages/bb-knowledge-base/DESIGN.md +++ b/packages/bb-knowledge-base/DESIGN.md @@ -20,6 +20,10 @@ Design document for KnowledgeBase. For usage, see [README.md](./README.md). **Rationale:** Ingestion can take minutes to hours depending on corpus size. Blocking `cdk deploy` until ingestion completes would make iterative development painful. Fire-and-forget means the deploy finishes quickly and ingestion happens in the background. The trade-off is that the knowledge base may return stale or empty results for a brief window after deploy. 
This is acceptable because the alternative (using a CDK `Provider` with `isComplete` polling) adds significant complexity and Lambda cold-start cost for a one-time operation. +**Resolution of the warm-up window:** The `isReady()` / `waitUntilReady()` readiness API (see [README.md](./README.md#readiness)) closes the gap left by fire-and-forget ingestion. Rather than blocking the deploy, callers poll the data source's ingestion-job status at runtime (`ListIngestionJobs` / `GetIngestionJob`) and gate `retrieve()` on completion — keeping deploys fast while giving application code a reliable "is the KB queryable yet?" signal. `COMPLETE` → ready, `FAILED` → throws `IngestionFailedException`, anything else (or no jobs yet) → not ready. + +**Known asymmetry (imported `s3://` sources):** When the source is an imported `s3://` URI, no BB-managed `DATA_SOURCE_ID` is registered, so there is no managed ingestion job for the readiness API to track. `isReady()` therefore returns `true` immediately even though Bedrock may still be ingesting against the imported bucket. Readiness gating only covers BB-created data sources; for imported buckets the caller owns ingestion timing. + ### D-KB-3: Semantic chunking as default strategy **Decision:** Default chunking strategy is `'semantic'` (breakpoint-based topic detection), not fixed-size. @@ -56,13 +60,43 @@ Design document for KnowledgeBase. For usage, see [README.md](./README.md). **Rationale:** KnowledgeBase requires Bedrock API access (AWS runtime) or filesystem reads (mock). Neither is available in the browser. Throwing at construction — not at `retrieve()` time — gives developers an immediate, clear error message guiding them to use server actions, API routes, or Lambda handlers. This follows the same pattern as other server-only Building Blocks. 
+### D-KB-9: Raw `s3.Bucket` for the data bucket (not the `FileBucket` Building Block) + +**Decision:** Provision the data bucket with a raw `aws-cdk-lib/aws-s3` `s3.Bucket` rather than the `FileBucket` Building Block, even though `FileBucket` exists for "an app needs an S3 bucket" use cases. + +**Rationale:** Bedrock ingestion assumes an IAM role that must **read** the data bucket, and the Knowledge Base / Data Source wiring needs low-level bucket primitives that `FileBucket` intentionally does not expose: + +- **`bucketArn`** — fed verbatim into `CfnDataSource.s3Configuration.bucketArn`. +- **`grantRead(role)`** — grants the Bedrock service-principal role read access with the exact resource scoping CDK generates. +- **`enforceSSL: true`** — required posture for the bucket policy. +- **`PhysicalName.GENERATE_IF_NEEDED`** — a CDK-generated name so the bucket can be referenced cross-construct (and, for an imported `s3://` source, swapped for `Bucket.fromBucketName`) without the caller having to name it. + +`FileBucket` is a higher-level, app-facing abstraction (presigned uploads, client access patterns) and does not surface these primitives. Reaching for the raw L2 here keeps the Bedrock IAM grant precise and avoids bending `FileBucket` into an infrastructure role it was not designed for. + +### D-KB-10: S3 Vectors resources mirror the data bucket's removal policy + +**Decision:** Apply a removal policy to the S3 Vectors L1 resources (`s3vectors.CfnVectorBucket` + `s3vectors.CfnIndex`), computed from the **same** `destroy` signal that drives the data bucket. + +**Rationale:** Unlike the L2 `s3.Bucket` — which defaults to `RETAIN` and supports `autoDeleteObjects` — these L1 resources rely solely on their CloudFormation `DeletionPolicy`, whose default is `Delete`. Left unmanaged they are inconsistent with the data bucket on teardown. So: + +- `removalPolicy: 'destroy'` (or sandbox mode with no explicit policy) → `RemovalPolicy.DESTROY` → `DeletionPolicy: Delete`. 
The vector bucket + index are dropped alongside the (auto-emptied) data bucket. +- otherwise → `RemovalPolicy.RETAIN` → `DeletionPolicy: Retain`, matching the data bucket's `RETAIN`-by-default posture so customer data is never silently destroyed. + +`applyRemovalPolicy()` sets both `DeletionPolicy` and `UpdateReplacePolicy`. There is no `autoDeleteObjects` equivalent for S3 Vectors, but a vector bucket deleted by CloudFormation is removed with its contents, so no manual emptying step is needed for the vector store. + +### D-KB-11: Teardown caveat — the stack-level `RemovalPolicies` aspect cannot auto-empty the data bucket + +**Decision:** For a clean teardown, pass `removalPolicy: 'destroy'` to the KnowledgeBase (or run in sandbox mode) rather than relying solely on a stack-level `RemovalPolicies.of(stack).destroy()` aspect. + +**Rationale:** The stack-level aspect flips every resource's `DeletionPolicy` to `Delete`, **but it cannot enable `autoDeleteObjects`** on a bucket — `autoDeleteObjects` is a constructor behavior (it provisions a custom resource + Lambda that empties the bucket on delete), not a CloudFormation attribute an aspect can toggle after the fact. Consequence: if you rely solely on the stack-level aspect and do **not** pass `removalPolicy: 'destroy'` to the KnowledgeBase, the data bucket's `DeletionPolicy` becomes `Delete` but it still has objects in it, so CloudFormation's `DELETE` fails with `BucketNotEmpty` and the teardown stalls. Passing `removalPolicy: 'destroy'` (or running in sandbox mode) pairs `RemovalPolicy.DESTROY` with `autoDeleteObjects` on the data bucket and `DeletionPolicy: Delete` on the S3 Vectors resources (see D-KB-10), so the bucket is emptied and every resource is removed without manual intervention. + ## Infrastructure (CDK) Creates the following resources: 1. **S3 Data Bucket** — Stores source documents. Created new for local folder sources; imported via `Bucket.fromBucketName` for `s3://` URI sources. 
Block public access enabled, SSE-S3 encryption. Removal policy defaults to CDK's default (`RETAIN`) — the bucket and its documents are preserved on `cdk destroy` — unless `removalPolicy: 'destroy'` is set or the stack is in sandbox mode (`sandboxMode` context), in which case it becomes `DESTROY` with `autoDeleteObjects` enabled. -2. **S3 Vectors VectorBucket + Index** — Serverless vector store for embeddings. Index configured with `float32` data type, cosine distance metric, and configurable dimensions (default 1024). `AMAZON_BEDROCK_TEXT` and `AMAZON_BEDROCK_METADATA` marked as non-filterable metadata keys. +2. **S3 Vectors VectorBucket + Index** — Serverless vector store for embeddings. Index configured with `float32` data type, cosine distance metric, and configurable dimensions (default 1024). `AMAZON_BEDROCK_TEXT` and `AMAZON_BEDROCK_METADATA` marked as non-filterable metadata keys. On teardown these two L1 resources now mirror the data bucket's removal policy — `DeletionPolicy: Delete` when `removalPolicy: 'destroy'` (or sandbox mode), `Retain` otherwise — so they are dropped alongside the data bucket instead of being left behind (see D-KB-10). 3. **IAM Role** — Assumed by `bedrock.amazonaws.com` (scoped via `aws:SourceAccount`). Grants: S3 read on data bucket, S3 Vectors CRUD on vector bucket/index, `bedrock:InvokeModel` on Titan V2 (both inference profile and foundation model ARNs). @@ -74,8 +108,8 @@ Creates the following resources: 7. **AwsCustomResource (StartIngestionJob)** — Fires `bedrock:StartIngestionJob` on Create/Update. Ingestion runs asynchronously. Depends on both the data source and bucket deployment (when present) so documents are in S3 before ingestion starts. 
-**Environment variables injected:** `BLOCKS_{FULLID}_KB_ID` -**IAM grants to handler:** `bedrock:Retrieve` on the knowledge base ARN +**Environment variables injected:** `BLOCKS_{FULLID}_KB_ID`, `BLOCKS_{FULLID}_DATA_SOURCE_ID` (the data source id drives the `isReady()` / `waitUntilReady()` readiness checks) +**IAM grants to handler:** `bedrock:Retrieve`, `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` on the knowledge base ARN (the ingestion-job actions back the readiness checks; the data source and its ingestion jobs are sub-resources of the KB ARN) ## Mock Implementation diff --git a/packages/bb-knowledge-base/README.md b/packages/bb-knowledge-base/README.md index 139e910e..e826904a 100644 --- a/packages/bb-knowledge-base/README.md +++ b/packages/bb-knowledge-base/README.md @@ -38,6 +38,8 @@ const kb = new KnowledgeBase(scope, id, options) | Method | Returns | Description | |--------|---------|-------------| | `retrieve(query, options?)` | `Promise` | Search for relevant document chunks. Returns results ranked by relevance score. | +| `isReady()` | `Promise<boolean>` | Whether async ingestion has finished and the KB can serve `retrieve()`. `true` once the latest ingestion job is `COMPLETE` (or there is no BB-managed data source to track). Throws `IngestionFailed` if the latest job failed. | +| `waitUntilReady(options?)` | `Promise<void>` | Poll `isReady()` until the KB is ready or the timeout elapses. Throws `Timeout` if it does not become ready in time. | ### Options @@ -103,6 +105,23 @@ chunking: { strategy: 'fixed', chunkSize: 500, chunkOverlap: 10 } | `source` | `string` | Source document path or URL. | | `metadata` | `Record` | Document metadata. Includes auto-populated `folder` from subfolders. | +### Readiness + +Bedrock ingestion runs asynchronously after deploy, so immediately after `cdk deploy` the knowledge base may not yet be queryable — `retrieve()` returns an empty array even for queries that will later match. 
Use `isReady()` / `waitUntilReady()` to gate on ingestion completion: + +```typescript +// Block until the KB is queryable (e.g. right after deploy), then query +await kb.waitUntilReady({ timeoutMs: 600_000 }); +const results = await kb.retrieve('getting started'); + +// Or check without blocking +if (await kb.isReady()) { + const results = await kb.retrieve('getting started'); +} +``` + +`waitUntilReady(options?)` accepts `timeoutMs` (default `300_000`) and `pollIntervalMs` (default `5_000`). For an imported `s3://` source there is no BB-managed data source to track, so `isReady()` returns `true` immediately. In local development the mock is always ready. + ## Metadata Filtering Filter results by document metadata. All conditions use AND semantics: @@ -147,6 +166,8 @@ try { |---|---|---| | `KnowledgeBaseErrors.RetrievalFailed` | `RetrievalFailedException` | Bedrock retrieval call failed | | `KnowledgeBaseErrors.NotReady` | `KnowledgeBaseNotReadyException` | KB not deployed or env vars missing | +| `KnowledgeBaseErrors.IngestionFailed` | `IngestionFailedException` | The most recent ingestion job failed (message includes `failureReasons`) — thrown by `isReady()` / `waitUntilReady()` | +| `KnowledgeBaseErrors.Timeout` | `KnowledgeBaseTimeoutException` | `waitUntilReady()` exceeded its timeout before ingestion completed | | `KnowledgeBaseErrors.InvalidSource` | `InvalidSourceConfigException` | Source folder not found or invalid config | | `KnowledgeBaseErrors.InvalidFilter` | `InvalidFilterException` | Invalid filter keys in Bedrock query | | `KnowledgeBaseErrors.ValidationError` | `KnowledgeBaseValidationError` | Empty or invalid query | @@ -154,7 +175,7 @@ try { ## Deploy Behavior -`cdk deploy` automatically triggers document ingestion (fire-and-forget). Ingestion runs asynchronously after the deploy completes. Check the AWS console to monitor ingestion progress. +`cdk deploy` automatically triggers document ingestion (fire-and-forget). 
Ingestion runs asynchronously after the deploy completes. Check the AWS console to monitor ingestion progress, or call [`isReady()` / `waitUntilReady()`](#readiness) from your code to gate queries on ingestion completion. ## Scaling & Cost (AWS) diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index f647f570..9ba2fdf1 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -285,9 +285,13 @@ export class KnowledgeBase extends Scope { * * @returns `true` when the latest ingestion job is `COMPLETE` (or there is * no managed data source to track); `false` while ingestion is pending. - * @throws {KnowledgeBaseNotReadyException} If the KB has not been created/deployed. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). - * @throws {RetrievalFailedException} For other Bedrock control-plane errors (network, auth, throttling). + * @throws {KnowledgeBaseNotReadyException | KnowledgeBaseValidationError | InvalidFilterException | RetrievalFailedException} + * For mapped Bedrock control-plane errors. The `KB_ID` env var being unset, and a + * control-plane `ResourceNotFoundException`, both map to `NotReady`; a control-plane + * `ValidationException` maps to `KnowledgeBaseValidationError` (or `InvalidFilterException`); + * any other SDK error (network, auth, throttling) maps to `RetrievalFailedException`. + * This is the same mapping {@link mapSdkError} applies to `retrieve()`. * * @example * ```typescript @@ -332,8 +336,11 @@ export class KnowledgeBase extends Scope { * (default 5000, clamped to a minimum of 1ms) spaces out the polls. * @throws {KnowledgeBaseTimeoutException} If the KB does not become ready within `timeoutMs`. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). 
- * @throws {KnowledgeBaseNotReadyException} If the KB has not been created/deployed. - * @throws {RetrievalFailedException} For other Bedrock control-plane errors (network, auth, throttling). + * @throws {KnowledgeBaseNotReadyException | KnowledgeBaseValidationError | InvalidFilterException | RetrievalFailedException} + * Propagated from {@link isReady} for mapped Bedrock control-plane errors — see its docs + * for the full mapping (`ResourceNotFoundException`/unset `KB_ID` → `NotReady`, + * `ValidationException` → `KnowledgeBaseValidationError`/`InvalidFilterException`, + * other SDK errors → `RetrievalFailedException`). * * @example * ```typescript From d9ead3c7b13cfe4e394750ae378c38b8032acebc Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Thu, 25 Jun 2026 00:27:46 +0000 Subject: [PATCH 05/14] fix(bb-knowledge-base): tolerate transient control-plane errors in waitUntilReady --- .../bb-knowledge-base/src/index.aws.test.ts | 126 ++++++++++++++++++ packages/bb-knowledge-base/src/index.aws.ts | 62 ++++++++- packages/bb-knowledge-base/src/types.ts | 15 +++ 3 files changed, 198 insertions(+), 5 deletions(-) diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index 55b2e4f8..e1190459 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -800,4 +800,130 @@ describe('waitUntilReady', () => { cleanup(); } }); + + test('tolerates a transient control-plane error, then resolves once COMPLETE', async () => { + const cleanup = setReadyEnv('TEST', 'WUR6'); + let calls = 0; + mockAgentSend(() => { + calls += 1; + if (calls === 1) { + // Unrecognized SDK error → mapSdkError classifies it as RetrievalFailed (transient). 
+ const e = new Error('Rate exceeded'); + e.name = 'ThrottlingException'; + throw e; + } + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'COMPLETE' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur6', { source: './knowledge' }); + await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1 }); + assert.ok(calls >= 2, `expected a retry after the transient blip, got ${calls} call(s)`); + } finally { + cleanup(); + } + }); + + test('throws once consecutive transient errors exceed the tolerance', async () => { + const cleanup = setReadyEnv('TEST', 'WUR7'); + let calls = 0; + mockAgentSend(() => { + calls += 1; + const e = new Error('Rate exceeded'); + e.name = 'ThrottlingException'; // → RetrievalFailed (transient) on every poll + throw e; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur7', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 2 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.RetrievalFailed); + return true; + }, + ); + // tolerance 2 → polls 1 & 2 absorbed, poll 3 exceeds the limit and rethrows. 
+ assert.strictEqual(calls, 3, `expected 3 polls (2 tolerated + 1 over the limit), got ${calls}`); + } finally { + cleanup(); + } + }); + + test('short-circuits immediately on IngestionFailed (never retried as transient)', async () => { + const cleanup = setReadyEnv('TEST', 'WUR8'); + let listCalls = 0; + mockAgentSend((cmd) => { + if (cmd.constructor.name === 'ListIngestionJobsCommand') { + listCalls += 1; + return { ingestionJobSummaries: [{ ingestionJobId: 'job-fail', status: 'FAILED' }] }; + } + return { ingestionJob: { status: 'FAILED', failureReasons: ['boom'] } }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur8', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.IngestionFailed); + return true; + }, + ); + assert.strictEqual(listCalls, 1, 'a FAILED job is terminal — it must short-circuit, not poll again'); + } finally { + cleanup(); + } + }); + + test('short-circuits immediately on NotReady (unset KB_ID is not retried forever)', async () => { + const prefix = 'BLOCKS_TEST_WUR9'; + const orig = process.env[`${prefix}_KB_ID`]; + delete process.env[`${prefix}_KB_ID`]; + let sendCalled = false; + mockAgentSend(() => { + sendCalled = true; + return { ingestionJobSummaries: [] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur9', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); + return true; + }, + ); + assert.strictEqual(sendCalled, false, 'a missing-KB config error fails before any poll and must not be retried'); + } finally { + if (orig !== undefined) process.env[`${prefix}_KB_ID`] = orig; + } + }); + + test('resets the transient-error counter after a clean poll', 
async () => { + const cleanup = setReadyEnv('TEST', 'WUR10'); + // transient → clean (IN_PROGRESS) → transient → COMPLETE. With tolerance 1 this + // only succeeds if the counter resets after the clean poll — otherwise the second + // transient error would be the 2nd consecutive failure and exceed the limit. + const seq = ['throw', 'IN_PROGRESS', 'throw', 'COMPLETE']; + let i = 0; + mockAgentSend(() => { + const step = seq[i++] ?? 'COMPLETE'; + if (step === 'throw') { + const e = new Error('Rate exceeded'); + e.name = 'ThrottlingException'; + throw e; + } + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: step }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur10', { source: './knowledge' }); + await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 1 }); + assert.strictEqual(i, 4, 'should consume the full transient/clean/transient/complete sequence'); + } finally { + cleanup(); + } + }); }); diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index 9ba2fdf1..3acb534d 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -113,6 +113,19 @@ function mapSdkError(err: unknown): Error { return mapped; } +/** + * Whether a mapped readiness error is a *transient* control-plane failure worth + * a bounded retry in {@link KnowledgeBase.waitUntilReady}. True only for + * `RetrievalFailedException` — the bucket {@link mapSdkError} uses for network + * errors, throttling, and other unrecognized SDK failures. Terminal errors + * (`IngestionFailedException`, `KnowledgeBaseNotReadyException`, and the + * validation errors) map to other names and are intentionally excluded so they + * short-circuit the wait immediately. 
+ */ +function isTransientControlPlaneError(err: unknown): boolean { + return err instanceof Error && err.name === KnowledgeBaseErrors.RetrievalFailed; +} + // ── Filter builder ───────────────────────────────────────────────────────── function buildFilter(filter?: MetadataFilter): RetrievalFilter | undefined { @@ -331,16 +344,28 @@ export class KnowledgeBase extends Scope { * recent ingestion job has `FAILED`, the underlying `IngestionFailedException` * propagates immediately rather than waiting out the timeout. * + * Because this method exists for the noisy post-deploy window, it tolerates a + * bounded run of *transient* control-plane errors (throttling / transient + * network failures, mapped to `RetrievalFailedException`) rather than aborting + * on the first blip: up to `maxConsecutiveTransientErrors` consecutive transient + * failures are absorbed and retried, and any clean poll resets that counter. + * Terminal errors still short-circuit immediately — a `FAILED` job + * (`IngestionFailedException`) and a missing-KB config error + * (`KnowledgeBaseNotReadyException`, e.g. `KB_ID` unset) are never retried. + * * @param {WaitUntilReadyOptions} options - Optional polling parameters. * `timeoutMs` (default 300000) bounds the total wait; `pollIntervalMs` - * (default 5000, clamped to a minimum of 1ms) spaces out the polls. + * (default 5000, clamped to a minimum of 1ms) spaces out the polls; + * `maxConsecutiveTransientErrors` (default 3, minimum 0) bounds how many + * consecutive transient control-plane errors are tolerated before giving up. * @throws {KnowledgeBaseTimeoutException} If the KB does not become ready within `timeoutMs`. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). 
* @throws {KnowledgeBaseNotReadyException | KnowledgeBaseValidationError | InvalidFilterException | RetrievalFailedException} * Propagated from {@link isReady} for mapped Bedrock control-plane errors — see its docs * for the full mapping (`ResourceNotFoundException`/unset `KB_ID` → `NotReady`, * `ValidationException` → `KnowledgeBaseValidationError`/`InvalidFilterException`, - * other SDK errors → `RetrievalFailedException`). + * other SDK errors → `RetrievalFailedException`). Transient `RetrievalFailedException`s + * are retried up to `maxConsecutiveTransientErrors` times before being rethrown. * * @example * ```typescript @@ -352,11 +377,29 @@ export class KnowledgeBase extends Scope { async waitUntilReady(options?: WaitUntilReadyOptions): Promise { const timeoutMs = Math.max(options?.timeoutMs ?? 300_000, 0); const pollIntervalMs = Math.max(options?.pollIntervalMs ?? 5_000, 1); + const maxConsecutiveTransientErrors = Math.max(options?.maxConsecutiveTransientErrors ?? 3, 0); const deadline = Date.now() + timeoutMs; + let consecutiveTransientErrors = 0; for (;;) { - // isReady() throws IngestionFailedException on a FAILED job — let it propagate. - if (await this.isReady()) return; + try { + // isReady() resolves true (ready) / false (still warming), throws + // IngestionFailedException on a FAILED job, NotReady when the KB is + // not deployed, or RetrievalFailedException for transient blips. + if (await this.isReady()) return; + // A clean poll clears any transient-error streak. + consecutiveTransientErrors = 0; + } catch (err) { + // Terminal errors (FAILED job, missing-KB config, validation) short-circuit. + if (!isTransientControlPlaneError(err)) throw err; + // Transient control-plane blip: absorb a bounded run, then give up. 
+ consecutiveTransientErrors += 1; + if (consecutiveTransientErrors > maxConsecutiveTransientErrors) throw err; + this.log.warn( + `waitUntilReady: tolerating transient control-plane error ` + + `(${consecutiveTransientErrors}/${maxConsecutiveTransientErrors}), retrying: ${(err as Error).message}`, + ); + } if (Date.now() >= deadline) { throw blocksError( KnowledgeBaseErrors.Timeout, @@ -410,7 +453,16 @@ export class KnowledgeBase extends Scope { const response = await this.agentClient.send( new GetIngestionJobCommand({ knowledgeBaseId, dataSourceId, ingestionJobId }), ); - return response.ingestionJob?.failureReasons ?? []; + const reasons = response.ingestionJob?.failureReasons ?? []; + if (reasons.length === 0) { + // A FAILED job with no reported reasons is unusual — surface a hint so + // the otherwise reason-less IngestionFailedException is easier to diagnose. + this.log.warn( + `Ingestion job ${ingestionJobId} is FAILED but reported no failureReasons; ` + + `the surfaced error will not include a cause.`, + ); + } + return reasons; } catch (err) { this.log.error(mapSdkError(err).message); return []; diff --git a/packages/bb-knowledge-base/src/types.ts b/packages/bb-knowledge-base/src/types.ts index 88fb5ae1..9afe35d0 100644 --- a/packages/bb-knowledge-base/src/types.ts +++ b/packages/bb-knowledge-base/src/types.ts @@ -170,4 +170,19 @@ export interface WaitUntilReadyOptions { timeoutMs?: number; /** Delay between readiness polls, in milliseconds. Clamped to a minimum of 1ms. Default: 5000 (5 seconds). */ pollIntervalMs?: number; + /** + * Maximum number of *consecutive* transient control-plane errors to tolerate + * before giving up, instead of aborting the wait on the first blip. A transient + * error is one mapped to `RetrievalFailedException` — the catch-all for network + * failures, throttling, and other unrecognized SDK errors during a readiness + * poll. 
Each clean poll (ingestion still in progress, or ready) resets the + * counter, so only an unbroken run of failures counts toward the limit. + * + * Terminal errors always short-circuit immediately regardless of this value: + * `IngestionFailedException` (the job failed), `KnowledgeBaseNotReadyException` + * (the KB is not deployed / `KB_ID` is unset), and validation errors. Set to `0` + * to fail fast on the first transient error. Clamped to a minimum of 0. + * Default: 3. + */ + maxConsecutiveTransientErrors?: number; } From 67e2356be634691eedb5e5f22a5124efc9048366 Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Thu, 25 Jun 2026 09:41:14 +0000 Subject: [PATCH 06/14] docs(bb-knowledge-base): correct s3:// readiness docs; export WaitUntilReadyOptions from umbrella; document maxConsecutiveTransientErrors; e2e for waitUntilReady - Correct the s3:// readiness claim in DESIGN.md, README.md, .changeset/kb-readiness.md and the index.aws.ts isReady()/ensureDataSourceId() JSDoc. index.cdk.ts creates a CfnDataSource and registers DATA_SOURCE_ID unconditionally for BOTH folder and imported s3:// sources, so both are tracked by isReady()/waitUntilReady(). The "no managed data source -> ready" shortcut applies only to deployments predating this readiness API (no DATA_SOURCE_ID injected). index.cdk.ts behaviour unchanged. - Re-export the WaitUntilReadyOptions type from the @aws-blocks/blocks umbrella (index.ts + index.cdk.ts) so consumers can import it like RetrieveOptions/MetadataFilter. - Document maxConsecutiveTransientErrors (default 3, min 0; counts consecutive transient control-plane errors, resets on a clean poll, terminal errors short-circuit) in the README readiness section and the changeset signature. - Add a minimal local-mock e2e in test-apps/comprehensive covering the wired kbWaitUntilReady endpoint end-to-end. 
- Regenerate API reports (npm run update:api): blocks/API.md picks up WaitUntilReadyOptions; bb-knowledge-base/API.md was stale (missing the already-merged isReady/waitUntilReady/ WaitUntilReadyOptions) and is brought in sync so the blocking check:api gate passes. --- .changeset/kb-readiness.md | 4 ++-- packages/bb-knowledge-base/API.md | 11 +++++++++++ packages/bb-knowledge-base/DESIGN.md | 2 +- packages/bb-knowledge-base/README.md | 2 +- packages/bb-knowledge-base/src/index.aws.ts | 15 +++++++++------ packages/blocks/API.md | 3 +++ packages/blocks/src/index.cdk.ts | 2 +- packages/blocks/src/index.ts | 2 +- .../comprehensive/test/knowledge-base.test.ts | 17 +++++++++++++++++ 9 files changed, 46 insertions(+), 12 deletions(-) diff --git a/.changeset/kb-readiness.md b/.changeset/kb-readiness.md index 4ccd20d5..e2e896d2 100644 --- a/.changeset/kb-readiness.md +++ b/.changeset/kb-readiness.md @@ -6,7 +6,7 @@ Add `isReady()` / `waitUntilReady()` ingestion-readiness API to KnowledgeBase. Bedrock ingestion runs asynchronously after deploy, so during the warm-up window `retrieve()` returns an empty array even for queries that would later match — making "empty" ambiguous between "still warming up" and "ingested, no match". The new methods resolve that ambiguity: -- `isReady(): Promise` — `true` once the data source's most recent ingestion job is `COMPLETE` (or when there is no BB-managed data source to track, e.g. an imported `s3://` source); `false` while ingestion is pending. Throws a typed `IngestionFailedException` (including `failureReasons`) if the latest job failed. -- `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number }): Promise` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job. 
+- `isReady(): Promise` — `true` once the data source's most recent ingestion job is `COMPLETE`; `false` while ingestion is pending. Both local-folder and imported `s3://` sources register a BB-managed data source, so both are tracked (the "no managed data source → ready" shortcut applies only to deployments predating this API, which have no data source id injected). Throws a typed `IngestionFailedException` (including `failureReasons`) if the latest job failed. +- `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number; maxConsecutiveTransientErrors?: number }): Promise` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000, `maxConsecutiveTransientErrors` 3), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job. Up to `maxConsecutiveTransientErrors` *consecutive* transient control-plane errors are tolerated (the counter resets on a clean poll); terminal errors short-circuit immediately. Purely additive — `retrieve()` and all existing signatures are unchanged. The local mock reports ready immediately (no warm-up window in local dev). 
diff --git a/packages/bb-knowledge-base/API.md b/packages/bb-knowledge-base/API.md index a86e204a..d5217421 100644 --- a/packages/bb-knowledge-base/API.md +++ b/packages/bb-knowledge-base/API.md @@ -24,9 +24,11 @@ export class KnowledgeBase extends Scope { constructor(scope: ScopeParent, id: string, _options: KnowledgeBaseOptions); // (undocumented) readonly bbName = "KnowledgeBase"; + isReady(): Promise; // @internal protected log: ChildLogger; retrieve(query: string, options?: RetrieveOptions): Promise; + waitUntilReady(options?: WaitUntilReadyOptions): Promise; } // @public @@ -37,6 +39,8 @@ export const KnowledgeBaseErrors: { readonly InvalidFilter: "InvalidFilterException"; readonly ValidationError: "KnowledgeBaseValidationError"; readonly BrowserNotSupported: "BrowserNotSupportedException"; + readonly IngestionFailed: "IngestionFailedException"; + readonly Timeout: "KnowledgeBaseTimeoutException"; }; // @public @@ -71,6 +75,13 @@ export interface RetrieveResult { // @public export type SourceConfig = string; +// @public +export interface WaitUntilReadyOptions { + maxConsecutiveTransientErrors?: number; + pollIntervalMs?: number; + timeoutMs?: number; +} + // (No @packageDocumentation comment for this package) ``` diff --git a/packages/bb-knowledge-base/DESIGN.md b/packages/bb-knowledge-base/DESIGN.md index 151ba4d0..560b0d46 100644 --- a/packages/bb-knowledge-base/DESIGN.md +++ b/packages/bb-knowledge-base/DESIGN.md @@ -22,7 +22,7 @@ Design document for KnowledgeBase. For usage, see [README.md](./README.md). **Resolution of the warm-up window:** The `isReady()` / `waitUntilReady()` readiness API (see [README.md](./README.md#readiness)) closes the gap left by fire-and-forget ingestion. Rather than blocking the deploy, callers poll the data source's ingestion-job status at runtime (`ListIngestionJobs` / `GetIngestionJob`) and gate `retrieve()` on completion — keeping deploys fast while giving application code a reliable "is the KB queryable yet?" signal. 
`COMPLETE` → ready, `FAILED` → throws `IngestionFailedException`, anything else (or no jobs yet) → not ready. -**Known asymmetry (imported `s3://` sources):** When the source is an imported `s3://` URI, no BB-managed `DATA_SOURCE_ID` is registered, so there is no managed ingestion job for the readiness API to track. `isReady()` therefore returns `true` immediately even though Bedrock may still be ingesting against the imported bucket. Readiness gating only covers BB-created data sources; for imported buckets the caller owns ingestion timing. +**Source coverage (folder and imported `s3://`):** Both a local-folder source and an imported `s3://` URI create a BB-managed `CfnDataSource` and register its `DATA_SOURCE_ID` unconditionally, so the readiness API tracks the ingestion job for either source type — `isReady()` / `waitUntilReady()` reflect that data source's most recent ingestion job in both cases. (For an `s3://` source the construct skips the `BucketDeployment` step, since the documents are expected to already be in the bucket, but it still creates the data source and fires the ingestion job — so readiness is tracked the same way.) The only case with nothing to track is a deployment that predates this readiness API: such a handler has no `DATA_SOURCE_ID` injected, so `isReady()` returns `true` immediately (treating "no managed data source" as ready). Re-deploying injects the id and enables readiness tracking. ### D-KB-3: Semantic chunking as default strategy diff --git a/packages/bb-knowledge-base/README.md b/packages/bb-knowledge-base/README.md index e826904a..dc9f5ec3 100644 --- a/packages/bb-knowledge-base/README.md +++ b/packages/bb-knowledge-base/README.md @@ -120,7 +120,7 @@ if (await kb.isReady()) { } ``` -`waitUntilReady(options?)` accepts `timeoutMs` (default `300_000`) and `pollIntervalMs` (default `5_000`). For an imported `s3://` source there is no BB-managed data source to track, so `isReady()` returns `true` immediately. 
In local development the mock is always ready. +`waitUntilReady(options?)` accepts `timeoutMs` (default `300_000`), `pollIntervalMs` (default `5_000`, clamped to a 1ms minimum), and `maxConsecutiveTransientErrors` (default `3`, minimum `0`) — the number of *consecutive* transient control-plane errors (throttling / transient network failures) tolerated before giving up. The counter resets on any clean poll, and terminal errors (a `FAILED` ingestion job, or a missing-KB config) always short-circuit immediately regardless of that limit. Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) In local development the mock is always ready. ## Metadata Filtering diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index 3acb534d..66111cbc 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -212,9 +212,11 @@ export class KnowledgeBase extends Scope { /** * Resolve the configured Bedrock data source id, or `undefined` when none - * was registered. A missing data source id means there is no BB-managed - * ingestion job to track (e.g. an imported `s3://` source, or a deployment - * that predates the readiness API), so callers treat the KB as ready. + * was registered. Both folder and imported `s3://` sources register a + * BB-managed data source id at deploy time, so this normally returns a value + * for either source type. It is `undefined` only for deployments that predate + * the readiness API (no `DATA_SOURCE_ID` injected) — in which case there is no + * ingestion job to track and callers treat the KB as ready. 
*/ private ensureDataSourceId(): string | undefined { const dataSourceId = getSdkIdentifiers(this).dataSourceId; @@ -292,9 +294,10 @@ export class KnowledgeBase extends Scope { * Resolution strategy: lists the data source's ingestion jobs (most recent * first) and inspects the latest job's status — `COMPLETE` → ready, * `FAILED` → throws, anything else (`STARTING` / `IN_PROGRESS`, or no jobs - * yet) → not ready. When no BB-managed data source id is configured (e.g. - * an imported `s3://` source, or a deployment predating this API) there is - * no ingestion job to track, so the KB is reported ready. + * yet) → not ready. Both folder and imported `s3://` sources register a + * BB-managed data source id, so both are tracked here; the "no data source + * id configured → reported ready" shortcut applies only to deployments that + * predate this API (no `DATA_SOURCE_ID` injected — nothing to track). * * @returns `true` when the latest ingestion job is `COMPLETE` (or there is * no managed data source to track); `false` while ingestion is pending. 
diff --git a/packages/blocks/API.md b/packages/blocks/API.md index 4dfc7453..6526c60f 100644 --- a/packages/blocks/API.md +++ b/packages/blocks/API.md @@ -163,6 +163,7 @@ import { Transaction } from '@aws-blocks/bb-data'; import { TransactionOptions } from '@aws-blocks/bb-distributed-data'; import { UpdateAttributeOutcome } from '@aws-blocks/bb-auth-cognito'; import { UserAttribute } from '@aws-blocks/bb-auth-cognito'; +import { WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; export { Agent } @@ -571,6 +572,8 @@ export { UpdateAttributeOutcome } export { UserAttribute } +export { WaitUntilReadyOptions } + export * from "@aws-blocks/core"; diff --git a/packages/blocks/src/index.cdk.ts b/packages/blocks/src/index.cdk.ts index 771cef0c..192425b9 100644 --- a/packages/blocks/src/index.cdk.ts +++ b/packages/blocks/src/index.cdk.ts @@ -53,7 +53,7 @@ export type { FileBucketOptions, PutOptions as FBPutOptions, GetUrlOptions, PutU export { AppSetting, AppSettingErrors } from '@aws-blocks/bb-app-setting'; export type { AppSettingOptions } from '@aws-blocks/bb-app-setting'; export { KnowledgeBase, KnowledgeBaseErrors } from '@aws-blocks/bb-knowledge-base'; -export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy } from '@aws-blocks/bb-knowledge-base'; +export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy, WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; export { Tracer } from '@aws-blocks/bb-tracer'; export type { TracerOptions, Segment, AnnotationValue } from '@aws-blocks/bb-tracer'; export { Logger, LoggingErrors } from '@aws-blocks/bb-logger'; diff --git a/packages/blocks/src/index.ts b/packages/blocks/src/index.ts index 28a5727e..ea6ea8a0 100644 --- a/packages/blocks/src/index.ts +++ b/packages/blocks/src/index.ts @@ -286,7 +286,7 @@ export type { AppSettingOptions } from 
'@aws-blocks/bb-app-setting'; * Full docs: `README.md` in the package directory above. */ export { KnowledgeBase, KnowledgeBaseErrors } from '@aws-blocks/bb-knowledge-base'; -export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy } from '@aws-blocks/bb-knowledge-base'; +export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy, WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; /** * **Distributed tracing backed by AWS X-Ray.** diff --git a/test-apps/comprehensive/test/knowledge-base.test.ts b/test-apps/comprehensive/test/knowledge-base.test.ts index ebecd6ea..8a4fc170 100644 --- a/test-apps/comprehensive/test/knowledge-base.test.ts +++ b/test-apps/comprehensive/test/knowledge-base.test.ts @@ -85,6 +85,23 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { }); }); + // --- Readiness: cover the wired waitUntilReady() endpoint end-to-end --- + // The retrieval suites gate on isReady() (via kbReady); this exercises the + // separate waitUntilReady() polling path. Locally the mock resolves on the + // first poll; on AWS we give it the same budget as gateOnReadiness so a + // still-ingesting KB is waited out rather than surfaced as a failure. + describe('waitUntilReady', () => { + test('resolves once the KB is ready', async () => { + const api = getApi(); + const result = await api.kbWaitUntilReady( + isLocal + ? { timeoutMs: 5_000, pollIntervalMs: 50 } + : { timeoutMs: 180_000, pollIntervalMs: 10_000 }, + ); + assert.deepStrictEqual(result, { success: true }); + }); + }); + // --- Retrieval tests: wait for ingestion before running --- describe('retrieve', () => { From 4b20037e8a55c4866e17db17b874e7abf8fe31ce Mon Sep 17 00:00:00 2001 From: "H. 
Furkan Bozkurt" Date: Thu, 25 Jun 2026 11:45:22 +0000 Subject: [PATCH 07/14] fix(bb-knowledge-base): ride out transient control-plane ResourceNotFound in waitUntilReady; add AbortSignal+jitter; rename getDataSourceId; doc/comment fixes --- .changeset/kb-readiness.md | 2 +- packages/bb-knowledge-base/API.md | 1 + packages/bb-knowledge-base/README.md | 11 +- .../bb-knowledge-base/src/index.aws.test.ts | 156 ++++++++++++++++- packages/bb-knowledge-base/src/index.aws.ts | 157 ++++++++++++++---- packages/bb-knowledge-base/src/types.ts | 41 +++-- 6 files changed, 316 insertions(+), 52 deletions(-) diff --git a/.changeset/kb-readiness.md b/.changeset/kb-readiness.md index e2e896d2..338a4bbe 100644 --- a/.changeset/kb-readiness.md +++ b/.changeset/kb-readiness.md @@ -7,6 +7,6 @@ Add `isReady()` / `waitUntilReady()` ingestion-readiness API to KnowledgeBase. Bedrock ingestion runs asynchronously after deploy, so during the warm-up window `retrieve()` returns an empty array even for queries that would later match — making "empty" ambiguous between "still warming up" and "ingested, no match". The new methods resolve that ambiguity: - `isReady(): Promise` — `true` once the data source's most recent ingestion job is `COMPLETE`; `false` while ingestion is pending. Both local-folder and imported `s3://` sources register a BB-managed data source, so both are tracked (the "no managed data source → ready" shortcut applies only to deployments predating this API, which have no data source id injected). Throws a typed `IngestionFailedException` (including `failureReasons`) if the latest job failed. -- `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number; maxConsecutiveTransientErrors?: number }): Promise` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000, `maxConsecutiveTransientErrors` 3), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job. 
Up to `maxConsecutiveTransientErrors` *consecutive* transient control-plane errors are tolerated (the counter resets on a clean poll); terminal errors short-circuit immediately. +- `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number; maxConsecutiveTransientErrors?: number; signal?: AbortSignal }): Promise` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000, `maxConsecutiveTransientErrors` 3), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job. Up to `maxConsecutiveTransientErrors` *consecutive* transient control-plane errors are tolerated (the counter resets on a clean poll); terminal errors short-circuit immediately. Transient covers both throttling / transient network failures **and** a *not-yet-visible* knowledge base — during the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB/data source hasn't propagated yet), which is ridden out rather than treated as terminal; a *missing-KB config* error (`KB_ID` unset) stays terminal. The poll interval carries ±20% jitter (only the delay between polls varies, never the poll count or the deadline) so many KBs don't poll in lockstep. Pass an optional `signal` (`AbortSignal`) to cancel the wait — checked before each poll and during the inter-poll delay — which rejects with the signal's abort reason (default: a `DOMException` named `'AbortError'`). Purely additive — `retrieve()` and all existing signatures are unchanged. The local mock reports ready immediately (no warm-up window in local dev). 
diff --git a/packages/bb-knowledge-base/API.md b/packages/bb-knowledge-base/API.md index d5217421..34aa5cd9 100644 --- a/packages/bb-knowledge-base/API.md +++ b/packages/bb-knowledge-base/API.md @@ -79,6 +79,7 @@ export type SourceConfig = string; export interface WaitUntilReadyOptions { maxConsecutiveTransientErrors?: number; pollIntervalMs?: number; + signal?: AbortSignal; timeoutMs?: number; } diff --git a/packages/bb-knowledge-base/README.md b/packages/bb-knowledge-base/README.md index dc9f5ec3..5a5888d1 100644 --- a/packages/bb-knowledge-base/README.md +++ b/packages/bb-knowledge-base/README.md @@ -39,7 +39,7 @@ const kb = new KnowledgeBase(scope, id, options) |--------|---------|-------------| | `retrieve(query, options?)` | `Promise` | Search for relevant document chunks. Returns results ranked by relevance score. | | `isReady()` | `Promise` | Whether async ingestion has finished and the KB can serve `retrieve()`. `true` once the latest ingestion job is `COMPLETE` (or there is no BB-managed data source to track). Throws `IngestionFailed` if the latest job failed. | -| `waitUntilReady(options?)` | `Promise` | Poll `isReady()` until the KB is ready or the timeout elapses. Throws `Timeout` if it does not become ready in time. | +| `waitUntilReady(options?)` | `Promise` | Poll `isReady()` until the KB is ready or the timeout elapses. Throws `Timeout` if it does not become ready in time. Accepts an optional `AbortSignal` to cancel the wait. | ### Options @@ -118,9 +118,16 @@ const results = await kb.retrieve('getting started'); if (await kb.isReady()) { const results = await kb.retrieve('getting started'); } + +// Cancel the wait with an AbortSignal (e.g. 
an overall request deadline) +await kb.waitUntilReady({ signal: AbortSignal.timeout(120_000) }); ``` -`waitUntilReady(options?)` accepts `timeoutMs` (default `300_000`), `pollIntervalMs` (default `5_000`, clamped to a 1ms minimum), and `maxConsecutiveTransientErrors` (default `3`, minimum `0`) — the number of *consecutive* transient control-plane errors (throttling / transient network failures) tolerated before giving up. The counter resets on any clean poll, and terminal errors (a `FAILED` ingestion job, or a missing-KB config) always short-circuit immediately regardless of that limit. Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) In local development the mock is always ready. +`waitUntilReady(options?)` accepts `timeoutMs` (default `300_000`), `pollIntervalMs` (default `5_000`, clamped to a 1ms minimum), `maxConsecutiveTransientErrors` (default `3`, minimum `0`), and an optional `signal` (`AbortSignal`). The poll interval carries a small amount of random jitter (±20%) so that many knowledge bases polling after a shared deploy don't fall into lockstep — the jitter only varies the delay *between* polls and never pushes a sleep past `timeoutMs`. + +`maxConsecutiveTransientErrors` is the number of *consecutive* transient control-plane errors tolerated before giving up; the counter resets on any clean poll. Two conditions are treated as transient and ridden out: throttling / transient network failures, **and** a *not-yet-visible* knowledge base — in the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB or data source hasn't propagated yet), which `waitUntilReady()` absorbs rather than giving up on. 
Terminal errors always short-circuit immediately regardless of the limit: a `FAILED` ingestion job, and a *missing-KB config* error (the `KB_ID` env var is unset — distinct from the transient not-yet-visible case). When `signal` is provided, the wait is cancelled promptly (checked before each poll and during the inter-poll delay), rejecting with the signal's abort reason (by default a `DOMException` named `'AbortError'`). + +Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) In local development the mock is always ready. ## Metadata Filtering diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index e1190459..13ade7da 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -604,7 +604,11 @@ describe('isReady', () => { }); test('returns true (without calling the control plane) when no data source id is configured', async () => { - // Imported s3:// source / pre-feature deployment: KB_ID present, DATA_SOURCE_ID absent. + // A deployment that predates the readiness API: KB_ID present, but no + // DATA_SOURCE_ID was injected, so there is no ingestion job to track. (The + // CDK layer now always registers a DATA_SOURCE_ID for both folder and + // imported s3:// sources — see DESIGN.md D-KB-2 — so this is purely the + // pre-feature case, not a source-type distinction.) 
const cleanup = setReadyEnv('TEST', 'RDY4', { dataSourceId: null }); let sendCalled = false; mockAgentSend(() => { @@ -613,7 +617,7 @@ describe('isReady', () => { }); try { - const kb = new KnowledgeBase({ id: 'test' }, 'rdy4', { source: 's3://my-docs-bucket' }); + const kb = new KnowledgeBase({ id: 'test' }, 'rdy4', { source: './knowledge' }); assert.strictEqual(await kb.isReady(), true); assert.strictEqual(sendCalled, false, 'should not query the control plane when there is no data source to track'); } finally { @@ -785,6 +789,9 @@ describe('waitUntilReady', () => { }); test('resolves immediately when no data source id is configured', async () => { + // Pre-readiness-API deployment: no DATA_SOURCE_ID injected, so there is + // nothing to poll. (Not a source-type distinction — the CDK layer registers + // DATA_SOURCE_ID for folder and imported s3:// sources alike; see DESIGN.md D-KB-2.) const cleanup = setReadyEnv('TEST', 'WUR5', { dataSourceId: null }); let sendCalled = false; mockAgentSend(() => { @@ -793,7 +800,7 @@ describe('waitUntilReady', () => { }); try { - const kb = new KnowledgeBase({ id: 'test' }, 'wur5', { source: 's3://my-docs-bucket' }); + const kb = new KnowledgeBase({ id: 'test' }, 'wur5', { source: './knowledge' }); await kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5 }); assert.strictEqual(sendCalled, false, 'should not poll the control plane when there is nothing to track'); } finally { @@ -926,4 +933,147 @@ describe('waitUntilReady', () => { cleanup(); } }); + + // Cause-based transient classification: a control-plane ResourceNotFoundException + // (the KB/data source not yet visible in the post-deploy window) maps to NotReady + // WITH a `cause`, and is tolerated as transient — whereas an unset-KB_ID NotReady + // (thrown directly by ensureKbId, no `cause`) stays terminal. 
+ + test('tolerates a transient control-plane ResourceNotFoundException (KB not yet visible), then resolves once COMPLETE', async () => { + const cleanup = setReadyEnv('TEST', 'WUR11'); + let calls = 0; + mockAgentSend(() => { + calls += 1; + if (calls === 1) { + // Post-deploy window: the freshly-created KB/data source has not + // propagated yet, so the control plane 404s. mapSdkError maps this to + // NotReady with cause.name === 'ResourceNotFoundException' → transient. + const e = new Error('No knowledge base with ID kb-test-123 exists'); + e.name = 'ResourceNotFoundException'; + throw e; + } + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'COMPLETE' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur11', { source: './knowledge' }); + await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1 }); + assert.ok(calls >= 2, `expected a retry after the not-yet-visible blip, got ${calls} call(s)`); + } finally { + cleanup(); + } + }); + + test('throws once consecutive control-plane ResourceNotFound errors exceed the tolerance', async () => { + const cleanup = setReadyEnv('TEST', 'WUR12'); + let calls = 0; + mockAgentSend(() => { + calls += 1; + const e = new Error('No knowledge base with ID kb-test-123 exists'); + e.name = 'ResourceNotFoundException'; // → NotReady (transient, carries cause) on every poll + throw e; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur12', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 2 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); + assert.strictEqual( + (err.cause as Error | undefined)?.name, + 'ResourceNotFoundException', + 'the rethrown NotReady should still carry the originating SDK error as cause', + ); + return true; + }, + ); + // tolerance 2 → polls 1 & 2 absorbed, poll 3 exceeds the limit and rethrows. 
+ assert.strictEqual(calls, 3, `expected 3 polls (2 tolerated + 1 over the limit), got ${calls}`); + } finally { + cleanup(); + } + }); + + test('does NOT retry an unset-KB_ID NotReady (config error has no cause → terminal)', async () => { + const prefix = 'BLOCKS_TEST_WUR13'; + const orig = process.env[`${prefix}_KB_ID`]; + delete process.env[`${prefix}_KB_ID`]; + let sendCalled = false; + mockAgentSend(() => { + sendCalled = true; + return { ingestionJobSummaries: [] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur13', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); + // The cause-based classification hinges on this: ensureKbId() throws + // NotReady directly, so there is no `cause` (unlike a not-yet-visible + // ResourceNotFoundException) — which keeps the config error terminal. + assert.strictEqual(err.cause, undefined, 'a config NotReady must carry no cause'); + return true; + }, + ); + assert.strictEqual(sendCalled, false, 'a missing-KB config error fails before any poll and must not be retried'); + } finally { + if (orig !== undefined) process.env[`${prefix}_KB_ID`] = orig; + } + }); + + // Cancellation via AbortSignal — checked before each poll and during the inter-poll sleep. 
+ + test('rejects immediately when the signal is already aborted (no polling)', async () => { + const cleanup = setReadyEnv('TEST', 'WUR14'); + let sendCalled = false; + mockAgentSend(() => { + sendCalled = true; + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'IN_PROGRESS' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur14', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 5, signal: AbortSignal.abort() }), + (err: Error) => { + assert.strictEqual(err.name, 'AbortError', 'default abort reason is a DOMException named AbortError'); + return true; + }, + ); + assert.strictEqual(sendCalled, false, 'an already-aborted signal must reject before any poll'); + } finally { + cleanup(); + } + }); + + test('aborts during the inter-poll delay and rejects with the supplied abort reason', async () => { + const cleanup = setReadyEnv('TEST', 'WUR15'); + const controller = new AbortController(); + let calls = 0; + mockAgentSend(() => { + calls += 1; + // Always "still warming" so the wait reaches the inter-poll sleep, where + // the abort fired below interrupts it. 
+ return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'IN_PROGRESS' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur15', { source: './knowledge' }); + const reason = new Error('caller cancelled'); + setTimeout(() => controller.abort(reason), 20).unref?.(); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 60_000, pollIntervalMs: 50, signal: controller.signal }), + (err: Error) => { + assert.strictEqual(err, reason, 'should reject with the exact reason passed to abort()'); + return true; + }, + ); + assert.ok(calls >= 1, 'should have polled at least once before being aborted'); + } finally { + cleanup(); + } + }); }); diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index 66111cbc..414e4c25 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -56,9 +56,42 @@ function blocksError(name: string, message: string): Error { return err; } -/** Resolve after `ms` milliseconds. Used to space out readiness polls in `waitUntilReady()`. */ -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); +/** + * Resolve after `ms` milliseconds. Used to space out readiness polls in + * `waitUntilReady()`. If an {@link AbortSignal} is supplied and fires (or is + * already aborted), the returned promise rejects promptly with the signal's + * abort reason instead of waiting out the full delay. 
+ */ +function sleep(ms: number, signal?: AbortSignal): Promise { + return new Promise((resolve, reject) => { + if (signal?.aborted) { + reject(signal.reason); + return; + } + let onAbort: (() => void) | undefined; + const timer = setTimeout(() => { + if (signal && onAbort) signal.removeEventListener('abort', onAbort); + resolve(); + }, ms); + if (signal) { + onAbort = () => { + clearTimeout(timer); + reject(signal.reason); + }; + signal.addEventListener('abort', onAbort, { once: true }); + } + }); +} + +/** + * Apply ±20% random jitter to a poll interval so that many knowledge bases + * polling after a shared deploy do not synchronize into lockstep. Only the + * delay *between* polls varies — never the number of polls — and the caller + * still clamps the result to the remaining time budget. + */ +function jitterInterval(ms: number): number { + const factor = 1 + (Math.random() * 2 - 1) * 0.2; // 0.8–1.2 + return Math.round(ms * factor); } // Match only messages that clearly indicate a metadata filter issue. @@ -115,15 +148,35 @@ function mapSdkError(err: unknown): Error { /** * Whether a mapped readiness error is a *transient* control-plane failure worth - * a bounded retry in {@link KnowledgeBase.waitUntilReady}. True only for - * `RetrievalFailedException` — the bucket {@link mapSdkError} uses for network - * errors, throttling, and other unrecognized SDK failures. Terminal errors - * (`IngestionFailedException`, `KnowledgeBaseNotReadyException`, and the - * validation errors) map to other names and are intentionally excluded so they - * short-circuit the wait immediately. + * a bounded retry in {@link KnowledgeBase.waitUntilReady}, rather than a terminal + * one that should short-circuit the wait. + * + * Two cases are transient: + * - `RetrievalFailedException` — the bucket {@link mapSdkError} uses for network + * errors, throttling, and other unrecognized SDK failures. + * - A *not-yet-visible* knowledge base. 
During the post-deploy window the control + * plane can briefly return `ResourceNotFoundException` (the KB or data source + * isn't visible yet); {@link mapSdkError} maps that to `KnowledgeBaseNotReadyException` + * **with the original SDK error attached as the non-enumerable `cause`**. Detect + * it via `cause.name === 'ResourceNotFoundException'` and ride it out — that is + * the entire purpose of `waitUntilReady()`. + * + * Everything else is terminal and short-circuits immediately: the `NotReady` + * raised for an unset `KB_ID` config is thrown directly by `ensureKbId()` (so it + * carries **no** `cause`, which is exactly how we tell it apart from the transient + * not-yet-visible case above); `IngestionFailedException` (the job failed); and + * the validation errors all map to other names. */ function isTransientControlPlaneError(err: unknown): boolean { - return err instanceof Error && err.name === KnowledgeBaseErrors.RetrievalFailed; + if (!(err instanceof Error)) return false; + if (err.name === KnowledgeBaseErrors.RetrievalFailed) return true; + // A control-plane ResourceNotFoundException is mapped to NotReady WITH the SDK + // error attached as `cause`; the unset-KB_ID NotReady is thrown directly and has + // none. Only the former — a KB not yet visible post-deploy — is transient. + return ( + err.name === KnowledgeBaseErrors.NotReady && + (err.cause as Error | undefined)?.name === 'ResourceNotFoundException' + ); } // ── Filter builder ───────────────────────────────────────────────────────── @@ -211,14 +264,17 @@ export class KnowledgeBase extends Scope { } /** - * Resolve the configured Bedrock data source id, or `undefined` when none - * was registered. Both folder and imported `s3://` sources register a - * BB-managed data source id at deploy time, so this normally returns a value - * for either source type. 
It is `undefined` only for deployments that predate - * the readiness API (no `DATA_SOURCE_ID` injected) — in which case there is no - * ingestion job to track and callers treat the KB as ready. + * Resolve the configured Bedrock data source id, or `undefined` when none was + * registered. Named `get*` rather than `ensure*` because — unlike + * {@link KnowledgeBase.ensureKbId}, which throws when the id is missing — a + * missing data source id is a valid state that this simply reports as + * `undefined`. Both folder and imported `s3://` sources register a BB-managed + * data source id at deploy time, so this normally returns a value for either + * source type. It is `undefined` only for deployments that predate the readiness + * API (no `DATA_SOURCE_ID` injected) — in which case there is no ingestion job + * to track and callers treat the KB as ready. */ - private ensureDataSourceId(): string | undefined { + private getDataSourceId(): string | undefined { const dataSourceId = getSdkIdentifiers(this).dataSourceId; return dataSourceId ? dataSourceId : undefined; } @@ -303,11 +359,15 @@ export class KnowledgeBase extends Scope { * no managed data source to track); `false` while ingestion is pending. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). * @throws {KnowledgeBaseNotReadyException | KnowledgeBaseValidationError | InvalidFilterException | RetrievalFailedException} - * For mapped Bedrock control-plane errors. The `KB_ID` env var being unset, and a - * control-plane `ResourceNotFoundException`, both map to `NotReady`; a control-plane - * `ValidationException` maps to `KnowledgeBaseValidationError` (or `InvalidFilterException`); - * any other SDK error (network, auth, throttling) maps to `RetrievalFailedException`. - * This is the same mapping {@link mapSdkError} applies to `retrieve()`. + * For mapped Bedrock control-plane errors. 
Two distinct conditions map to `NotReady`: + * the `KB_ID` env var being unset (a *config* error thrown directly, so it carries no + * `cause`), and a control-plane `ResourceNotFoundException` (a *not-yet-visible* KB, + * mapped with the SDK error as `cause`). {@link waitUntilReady} relies on that + * distinction: it rides out the not-yet-visible case as transient but treats the + * unset-`KB_ID` config error as terminal. A control-plane `ValidationException` maps to + * `KnowledgeBaseValidationError` (or `InvalidFilterException`); any other SDK error + * (network, auth, throttling) maps to `RetrievalFailedException`. This is the same + * mapping {@link mapSdkError} applies to `retrieve()`. * * @example * ```typescript @@ -318,7 +378,7 @@ export class KnowledgeBase extends Scope { */ async isReady(): Promise { const knowledgeBaseId = this.ensureKbId(); - const dataSourceId = this.ensureDataSourceId(); + const dataSourceId = this.getDataSourceId(); // No BB-managed ingestion to track → nothing to wait for. if (!dataSourceId) return true; @@ -348,47 +408,71 @@ export class KnowledgeBase extends Scope { * propagates immediately rather than waiting out the timeout. * * Because this method exists for the noisy post-deploy window, it tolerates a - * bounded run of *transient* control-plane errors (throttling / transient - * network failures, mapped to `RetrievalFailedException`) rather than aborting - * on the first blip: up to `maxConsecutiveTransientErrors` consecutive transient + * bounded run of *transient* control-plane errors rather than aborting on the + * first blip: up to `maxConsecutiveTransientErrors` consecutive transient * failures are absorbed and retried, and any clean poll resets that counter. 
+ * Two kinds of error are transient — `RetrievalFailedException` (throttling / + * transient network failures) and a *not-yet-visible* KB, where the control + * plane briefly returns `ResourceNotFoundException` while the freshly-deployed + * KB or data source has not propagated yet (mapped to `KnowledgeBaseNotReadyException`). + * Riding out that window is the whole point of this method. + * * Terminal errors still short-circuit immediately — a `FAILED` job - * (`IngestionFailedException`) and a missing-KB config error - * (`KnowledgeBaseNotReadyException`, e.g. `KB_ID` unset) are never retried. + * (`IngestionFailedException`) and a *config* missing-KB error (`KB_ID` unset, + * which `ensureKbId()` throws directly with no `cause`, distinct from the + * transient not-yet-visible case above) are never retried. + * + * The delay between polls carries ±20% jitter so that many knowledge bases + * polling after a shared deploy do not synchronize; jitter only varies the + * sleep duration, never the number of polls, and is clamped to the deadline. + * Pass `options.signal` to cancel the wait — it is checked before each poll + * and during the inter-poll delay, rejecting promptly with the signal's abort + * reason (default: a `DOMException` named `'AbortError'`). * * @param {WaitUntilReadyOptions} options - Optional polling parameters. * `timeoutMs` (default 300000) bounds the total wait; `pollIntervalMs` - * (default 5000, clamped to a minimum of 1ms) spaces out the polls; - * `maxConsecutiveTransientErrors` (default 3, minimum 0) bounds how many - * consecutive transient control-plane errors are tolerated before giving up. + * (default 5000, clamped to a minimum of 1ms, ±20% jitter) spaces out the + * polls; `maxConsecutiveTransientErrors` (default 3, minimum 0) bounds how + * many consecutive transient control-plane errors are tolerated before + * giving up; `signal` (optional `AbortSignal`) cancels the wait. 
* @throws {KnowledgeBaseTimeoutException} If the KB does not become ready within `timeoutMs`. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). * @throws {KnowledgeBaseNotReadyException | KnowledgeBaseValidationError | InvalidFilterException | RetrievalFailedException} * Propagated from {@link isReady} for mapped Bedrock control-plane errors — see its docs * for the full mapping (`ResourceNotFoundException`/unset `KB_ID` → `NotReady`, * `ValidationException` → `KnowledgeBaseValidationError`/`InvalidFilterException`, - * other SDK errors → `RetrievalFailedException`). Transient `RetrievalFailedException`s - * are retried up to `maxConsecutiveTransientErrors` times before being rethrown. + * other SDK errors → `RetrievalFailedException`). Transient errors (a `RetrievalFailedException`, + * or a `NotReady` caused by a not-yet-visible `ResourceNotFoundException`) are retried up + * to `maxConsecutiveTransientErrors` times before being rethrown. + * @throws The `signal`'s abort reason (default `DOMException` `'AbortError'`) if `options.signal` fires. * * @example * ```typescript * // Block until the KB is queryable (e.g. right after deploy) * await kb.waitUntilReady({ timeoutMs: 600_000 }); * const results = await kb.retrieve('getting started'); + * + * // With cancellation (e.g. an overall request deadline) + * await kb.waitUntilReady({ signal: AbortSignal.timeout(120_000) }); * ``` */ async waitUntilReady(options?: WaitUntilReadyOptions): Promise { const timeoutMs = Math.max(options?.timeoutMs ?? 300_000, 0); const pollIntervalMs = Math.max(options?.pollIntervalMs ?? 5_000, 1); const maxConsecutiveTransientErrors = Math.max(options?.maxConsecutiveTransientErrors ?? 3, 0); + const signal = options?.signal; const deadline = Date.now() + timeoutMs; let consecutiveTransientErrors = 0; for (;;) { + // Cancellation: bail out before doing any work on each iteration. 
An + // already-aborted signal throws here on the very first pass (no poll). + signal?.throwIfAborted(); try { // isReady() resolves true (ready) / false (still warming), throws // IngestionFailedException on a FAILED job, NotReady when the KB is - // not deployed, or RetrievalFailedException for transient blips. + // not deployed (or briefly not-yet-visible), or RetrievalFailedException + // for transient blips. if (await this.isReady()) return; // A clean poll clears any transient-error streak. consecutiveTransientErrors = 0; @@ -409,8 +493,9 @@ export class KnowledgeBase extends Scope { `Knowledge base did not become ready within ${timeoutMs}ms.`, ); } - // Never sleep past the deadline. - await sleep(Math.min(pollIntervalMs, Math.max(deadline - Date.now(), 0))); + // Jitter the interval to avoid lockstep, but never sleep past the + // deadline; the sleep is abortable via the signal. + await sleep(Math.min(jitterInterval(pollIntervalMs), Math.max(deadline - Date.now(), 0)), signal); } } diff --git a/packages/bb-knowledge-base/src/types.ts b/packages/bb-knowledge-base/src/types.ts index 9afe35d0..c2b4ef96 100644 --- a/packages/bb-knowledge-base/src/types.ts +++ b/packages/bb-knowledge-base/src/types.ts @@ -168,21 +168,42 @@ export interface RetrieveResult { export interface WaitUntilReadyOptions { /** Maximum time to wait for ingestion to complete, in milliseconds. Default: 300000 (5 minutes). */ timeoutMs?: number; - /** Delay between readiness polls, in milliseconds. Clamped to a minimum of 1ms. Default: 5000 (5 seconds). */ + /** + * Delay between readiness polls, in milliseconds. Clamped to a minimum of 1ms. + * A small amount of random jitter (±20%) is applied to each delay so that many + * knowledge bases polling after a shared deploy do not fall into lockstep — the + * jitter only varies the wait *between* polls (never the number of polls) and is + * still clamped so a sleep never overruns `timeoutMs`. Default: 5000 (5 seconds). 
+ */ pollIntervalMs?: number; /** * Maximum number of *consecutive* transient control-plane errors to tolerate - * before giving up, instead of aborting the wait on the first blip. A transient - * error is one mapped to `RetrievalFailedException` — the catch-all for network - * failures, throttling, and other unrecognized SDK errors during a readiness - * poll. Each clean poll (ingestion still in progress, or ready) resets the - * counter, so only an unbroken run of failures counts toward the limit. + * before giving up, instead of aborting the wait on the first blip. Two kinds + * of error are treated as transient during a readiness poll: + * - `RetrievalFailedException` — the catch-all for network failures, throttling, + * and other unrecognized SDK errors. + * - A *not-yet-visible* knowledge base: in the post-deploy window the control + * plane can briefly return `ResourceNotFoundException` (the KB or data source + * isn't visible yet), which surfaces as `KnowledgeBaseNotReadyException`. Only + * this control-plane variant is transient — riding it out is the whole point + * of `waitUntilReady()`. + * + * Each clean poll (ingestion still in progress, or ready) resets the counter, + * so only an unbroken run of failures counts toward the limit. * * Terminal errors always short-circuit immediately regardless of this value: - * `IngestionFailedException` (the job failed), `KnowledgeBaseNotReadyException` - * (the KB is not deployed / `KB_ID` is unset), and validation errors. Set to `0` - * to fail fast on the first transient error. Clamped to a minimum of 0. - * Default: 3. + * `IngestionFailedException` (the job failed), a *config* `KnowledgeBaseNotReadyException` + * (the `KB_ID` env var is unset — distinct from the transient not-yet-visible case + * above), and validation errors. Set to `0` to fail fast on the first transient + * error. Clamped to a minimum of 0. Default: 3. 
*/ maxConsecutiveTransientErrors?: number; + /** + * Optional {@link AbortSignal} to cancel the wait. When the signal is aborted, + * `waitUntilReady()` rejects promptly — checked before each poll and during the + * inter-poll delay — with the signal's abort reason (by default a `DOMException` + * named `'AbortError'`, or whatever value was passed to `AbortController.abort(reason)`). + * An already-aborted signal rejects immediately, before any polling. + */ + signal?: AbortSignal; } From b72c6628313f93ba1ae2c27cabff482c1a71a3d9 Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Fri, 26 Jun 2026 00:03:24 +0000 Subject: [PATCH 08/14] chore: add changeset for @aws-blocks/blocks umbrella re-export of WaitUntilReadyOptions --- .changeset/kb-readiness.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.changeset/kb-readiness.md b/.changeset/kb-readiness.md index 338a4bbe..84f9ecc9 100644 --- a/.changeset/kb-readiness.md +++ b/.changeset/kb-readiness.md @@ -1,5 +1,6 @@ --- "@aws-blocks/bb-knowledge-base": minor +"@aws-blocks/blocks": minor --- Add `isReady()` / `waitUntilReady()` ingestion-readiness API to KnowledgeBase. @@ -10,3 +11,5 @@ Bedrock ingestion runs asynchronously after deploy, so during the warm-up window - `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number; maxConsecutiveTransientErrors?: number; signal?: AbortSignal }): Promise` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000, `maxConsecutiveTransientErrors` 3), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job. Up to `maxConsecutiveTransientErrors` *consecutive* transient control-plane errors are tolerated (the counter resets on a clean poll); terminal errors short-circuit immediately. 
Transient covers both throttling / transient network failures **and** a *not-yet-visible* knowledge base — during the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB/data source hasn't propagated yet), which is ridden out rather than treated as terminal; a *missing-KB config* error (`KB_ID` unset) stays terminal. The poll interval carries ±20% jitter (only the delay between polls varies, never the poll count or the deadline) so many KBs don't poll in lockstep. Pass an optional `signal` (`AbortSignal`) to cancel the wait — checked before each poll and during the inter-poll delay — which rejects with the signal's abort reason (default: a `DOMException` named `'AbortError'`). Purely additive — `retrieve()` and all existing signatures are unchanged. The local mock reports ready immediately (no warm-up window in local dev). + +The umbrella `@aws-blocks/blocks` package now also re-exports the new `WaitUntilReadyOptions` type (alongside the existing `KnowledgeBase` re-exports) from both its runtime and CDK entry points, so consumers importing from `@aws-blocks/blocks` can reference it directly. From 2125410fe9ed401687d0b7190dfcf05fd9c1fbe1 Mon Sep 17 00:00:00 2001 From: "H. 
Furkan Bozkurt" Date: Mon, 29 Jun 2026 08:33:28 +0000 Subject: [PATCH 09/14] chore(bb-knowledge-base): lazy-init agent client + doc/comment nits --- .../bb-knowledge-base/src/index.aws.test.ts | 8 +++-- packages/bb-knowledge-base/src/index.aws.ts | 33 ++++++++++++++----- .../comprehensive/test/knowledge-base.test.ts | 4 +++ 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index 13ade7da..b37e3dcd 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -607,8 +607,9 @@ describe('isReady', () => { // A deployment that predates the readiness API: KB_ID present, but no // DATA_SOURCE_ID was injected, so there is no ingestion job to track. (The // CDK layer now always registers a DATA_SOURCE_ID for both folder and - // imported s3:// sources — see DESIGN.md D-KB-2 — so this is purely the - // pre-feature case, not a source-type distinction.) + // imported s3:// sources — see DESIGN.md, the "Source coverage (folder and + // imported s3://)" note — so this is purely the pre-feature case, not a + // source-type distinction.) const cleanup = setReadyEnv('TEST', 'RDY4', { dataSourceId: null }); let sendCalled = false; mockAgentSend(() => { @@ -791,7 +792,8 @@ describe('waitUntilReady', () => { test('resolves immediately when no data source id is configured', async () => { // Pre-readiness-API deployment: no DATA_SOURCE_ID injected, so there is // nothing to poll. (Not a source-type distinction — the CDK layer registers - // DATA_SOURCE_ID for folder and imported s3:// sources alike; see DESIGN.md D-KB-2.) + // DATA_SOURCE_ID for folder and imported s3:// sources alike; see DESIGN.md, + // the "Source coverage (folder and imported s3://)" note.) 
const cleanup = setReadyEnv('TEST', 'WUR5', { dataSourceId: null }); let sendCalled = false; mockAgentSend(() => { diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index 414e4c25..ef4df961 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -228,7 +228,10 @@ export class KnowledgeBase extends Scope { readonly bbName = BB_NAME; private readonly fullIdCached: string; private readonly runtimeClient: BedrockAgentRuntimeClient; - private readonly agentClient: BedrockAgentClient; + // Control-plane client for ingestion-job status (readiness checks). Created + // lazily on first readiness call via getAgentClient() so instances that only + // ever retrieve() (or never check readiness) don't allocate it. + private agentClient?: BedrockAgentClient; /** @internal Logger for internal operations. Defaults to error-level when not provided. */ protected log: ChildLogger; @@ -242,12 +245,6 @@ export class KnowledgeBase extends Scope { retryMode: 'adaptive', customUserAgent: this.buildUserAgentChain(), }); - // Control-plane client for ingestion-job status (readiness checks). - this.agentClient = new BedrockAgentClient({ - maxAttempts: 3, - retryMode: 'adaptive', - customUserAgent: this.buildUserAgentChain(), - }); const kbId = process.env[envKey(this.fullIdCached, 'KB_ID')] ?? ''; const dataSourceId = process.env[envKey(this.fullIdCached, 'DATA_SOURCE_ID')] ?? ''; registerSdkIdentifiers(this.fullId, { kbId, dataSourceId }); @@ -499,6 +496,24 @@ export class KnowledgeBase extends Scope { } } + /** + * Lazily construct (and memoize) the Bedrock control-plane client used for + * ingestion-job status during readiness checks. Built on first use rather + * than in the constructor so instances that only ever call {@link retrieve} + * — or never check readiness at all — don't allocate a client they won't use. + * Subsequent calls return the cached instance. 
+ */ + private getAgentClient(): BedrockAgentClient { + if (!this.agentClient) { + this.agentClient = new BedrockAgentClient({ + maxAttempts: 3, + retryMode: 'adaptive', + customUserAgent: this.buildUserAgentChain(), + }); + } + return this.agentClient; + } + /** * List the data source's ingestion jobs (most recent first) and return the * latest summary, or `undefined` when none exist yet. SDK errors are mapped @@ -509,7 +524,7 @@ export class KnowledgeBase extends Scope { dataSourceId: string, ): Promise { try { - const response = await this.agentClient.send( + const response = await this.getAgentClient().send( new ListIngestionJobsCommand({ knowledgeBaseId, dataSourceId, @@ -538,7 +553,7 @@ export class KnowledgeBase extends Scope { ): Promise { if (!ingestionJobId) return []; try { - const response = await this.agentClient.send( + const response = await this.getAgentClient().send( new GetIngestionJobCommand({ knowledgeBaseId, dataSourceId, ingestionJobId }), ); const reasons = response.ingestionJob?.failureReasons ?? []; diff --git a/test-apps/comprehensive/test/knowledge-base.test.ts b/test-apps/comprehensive/test/knowledge-base.test.ts index 8a4fc170..60ee1574 100644 --- a/test-apps/comprehensive/test/knowledge-base.test.ts +++ b/test-apps/comprehensive/test/knowledge-base.test.ts @@ -33,6 +33,10 @@ async function gateOnReadiness( const start = Date.now(); const deadline = start + timeoutMs; let attempt = 0; + // Intentionally unbounded: the only exits are a ready KB (return), a thrown + // readiness error, or the mid-loop `Date.now() >= deadline` check below — that + // check is the sole timeout path (it must run after a poll, so the throw, not + // a loop-condition exit, is what bounds the wait). while (true) { attempt++; const elapsed = Math.round((Date.now() - start) / 1000); From 133c66ebe45a680c8f0266a9857fb9c40941484f Mon Sep 17 00:00:00 2001 From: "H. 
Furkan Bozkurt" Date: Mon, 29 Jun 2026 09:43:30 +0000 Subject: [PATCH 10/14] chore(bb-knowledge-base): enrich readiness timeout message + doc precision nits --- packages/bb-knowledge-base/DESIGN.md | 2 +- packages/bb-knowledge-base/README.md | 2 +- .../bb-knowledge-base/src/index.aws.test.ts | 41 +++++++++++++++++++ packages/bb-knowledge-base/src/index.aws.ts | 11 ++++- 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/packages/bb-knowledge-base/DESIGN.md b/packages/bb-knowledge-base/DESIGN.md index 560b0d46..e4f2fe71 100644 --- a/packages/bb-knowledge-base/DESIGN.md +++ b/packages/bb-knowledge-base/DESIGN.md @@ -108,7 +108,7 @@ Creates the following resources: 7. **AwsCustomResource (StartIngestionJob)** — Fires `bedrock:StartIngestionJob` on Create/Update. Ingestion runs asynchronously. Depends on both the data source and bucket deployment (when present) so documents are in S3 before ingestion starts. -**Environment variables injected:** `BLOCKS_{FULLID}_KB_ID`, `BLOCKS_{FULLID}_DATA_SOURCE_ID` (the data source id drives the `isReady()` / `waitUntilReady()` readiness checks) +**Handler config** (registered via `registerConfig`, surfaced to the runtime as env vars): `BLOCKS_{FULLID}_KB_ID`, `BLOCKS_{FULLID}_DATA_SOURCE_ID` (the data source id drives the `isReady()` / `waitUntilReady()` readiness checks) **IAM grants to handler:** `bedrock:Retrieve`, `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` on the knowledge base ARN (the ingestion-job actions back the readiness checks; the data source and its ingestion jobs are sub-resources of the KB ARN) ## Mock Implementation diff --git a/packages/bb-knowledge-base/README.md b/packages/bb-knowledge-base/README.md index 5a5888d1..effa6fdb 100644 --- a/packages/bb-knowledge-base/README.md +++ b/packages/bb-knowledge-base/README.md @@ -127,7 +127,7 @@ await kb.waitUntilReady({ signal: AbortSignal.timeout(120_000) }); `maxConsecutiveTransientErrors` is the number of *consecutive* transient control-plane 
errors tolerated before giving up; the counter resets on any clean poll. Two conditions are treated as transient and ridden out: throttling / transient network failures, **and** a *not-yet-visible* knowledge base — in the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB or data source hasn't propagated yet), which `waitUntilReady()` absorbs rather than giving up on. Terminal errors always short-circuit immediately regardless of the limit: a `FAILED` ingestion job, and a *missing-KB config* error (the `KB_ID` env var is unset — distinct from the transient not-yet-visible case). When `signal` is provided, the wait is cancelled promptly (checked before each poll and during the inter-poll delay), rejecting with the signal's abort reason (by default a `DOMException` named `'AbortError'`). -Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) In local development the mock is always ready. +Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) This pre-feature deployment is the **only** case where `isReady()` returns `true` without consulting an actual ingestion job — re-deploying injects `DATA_SOURCE_ID` and restores real tracking, so a freshly deployed KB always reflects a real job status (don't mistake the "nothing to track" shortcut for "ingestion confirmed complete" when gating live traffic). In local development the mock is always ready. 
## Metadata Filtering diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index b37e3dcd..3135930f 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -781,6 +781,47 @@ describe('waitUntilReady', () => { (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.Timeout); assert.ok(err.message.includes('30ms'), 'timeout message should include the budget'); + // Every poll was a clean IN_PROGRESS (no transient errors), so the + // message stays the plain form — no transient detail appended. + assert.ok( + !err.message.includes('last transient error'), + 'a clean (non-transient) timeout must not claim a transient error', + ); + return true; + }, + ); + } finally { + cleanup(); + } + }); + + test('Timeout message surfaces the last transient error when the budget runs out mid-streak', async () => { + const cleanup = setReadyEnv('TEST', 'WUR16'); + // Every poll throws a transient (throttling → RetrievalFailed) error, but the + // tolerance is high enough that the deadline — not the transient budget — ends + // the wait. The Timeout message must then surface the last transient error so a + // caller can't mistake it for a healthy KB that merely never finished ingesting. 
+ mockAgentSend(() => { + const e = new Error('Rate exceeded'); + e.name = 'ThrottlingException'; // → RetrievalFailed (transient) on every poll + throw e; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur16', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5, maxConsecutiveTransientErrors: 1000 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.Timeout); + assert.ok(err.message.includes('30ms'), 'timeout message should include the budget'); + assert.ok( + err.message.includes('last transient error'), + 'timeout message should flag that the final polls were failing transiently', + ); + assert.ok( + err.message.includes('Rate exceeded'), + 'timeout message should include the underlying transient detail', + ); return true; }, ); diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index ef4df961..d732f0d5 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -461,6 +461,7 @@ export class KnowledgeBase extends Scope { const deadline = Date.now() + timeoutMs; let consecutiveTransientErrors = 0; + let lastTransient: Error | undefined; for (;;) { // Cancellation: bail out before doing any work on each iteration. An // already-aborted signal throws here on the very first pass (no poll). @@ -478,6 +479,7 @@ export class KnowledgeBase extends Scope { if (!isTransientControlPlaneError(err)) throw err; // Transient control-plane blip: absorb a bounded run, then give up. 
consecutiveTransientErrors += 1; + lastTransient = err as Error; if (consecutiveTransientErrors > maxConsecutiveTransientErrors) throw err; this.log.warn( `waitUntilReady: tolerating transient control-plane error ` + @@ -485,9 +487,16 @@ export class KnowledgeBase extends Scope { ); } if (Date.now() >= deadline) { + // If the budget ran out while we were still absorbing transient + // control-plane errors, fold the most recent one into the message. + // Otherwise a timeout reads like a healthy KB that just never finished + // ingesting, hiding that the final polls were actually failing transiently. + const base = `Knowledge base did not become ready within ${timeoutMs}ms`; throw blocksError( KnowledgeBaseErrors.Timeout, - `Knowledge base did not become ready within ${timeoutMs}ms.`, + consecutiveTransientErrors > 0 && lastTransient + ? `${base} (last transient error: ${lastTransient.message})` + : `${base}.`, ); } // Jitter the interval to avoid lockstep, but never sleep past the From 655e754a381b4921841bb52a3dd56c9d6bd095ae Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Mon, 29 Jun 2026 10:51:18 +0000 Subject: [PATCH 11/14] chore(bb-knowledge-base): gate e2e on waitUntilReady (transient-tolerant), mock parity doc, narrow catch --- packages/bb-knowledge-base/DESIGN.md | 1 + packages/bb-knowledge-base/src/index.aws.ts | 9 ++- .../comprehensive/test/knowledge-base.test.ts | 62 +++++++------------ 3 files changed, 31 insertions(+), 41 deletions(-) diff --git a/packages/bb-knowledge-base/DESIGN.md b/packages/bb-knowledge-base/DESIGN.md index e4f2fe71..dffe9a6b 100644 --- a/packages/bb-knowledge-base/DESIGN.md +++ b/packages/bb-knowledge-base/DESIGN.md @@ -132,3 +132,4 @@ Creates the following resources: | No ingestion pipeline | Documents are indexed synchronously on first `retrieve()` | No mitigation — the mock doesn't need async ingestion. 
First call may be slower due to indexing | | No IAM enforcement | Permission errors only surface in AWS | No mitigation — IAM is handled by CDK grants automatically | | Immediate consistency | New documents appear instantly vs async ingestion in AWS | No mitigation — eventual consistency in AWS is inherent to the Bedrock ingestion pipeline | +| Unconditional mock readiness | `isReady()` always returns `true` (and `waitUntilReady()` resolves immediately) — even for an `s3://` source that `retrieve()` rejects with `InvalidSourceConfigException`. Local readiness is therefore NOT a proxy for a working local `retrieve()` on `s3://` sources — the inverse of the production contract, where `isReady() === true` implies `retrieve()` is queryable | No mitigation — local has no async ingestion to wait on, so readiness is a no-op. `s3://` sources require AWS infrastructure; validate them in sandbox/production where readiness genuinely reflects queryability | diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index d732f0d5..78a1bbb1 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -476,14 +476,17 @@ export class KnowledgeBase extends Scope { consecutiveTransientErrors = 0; } catch (err) { // Terminal errors (FAILED job, missing-KB config, validation) short-circuit. - if (!isTransientControlPlaneError(err)) throw err; + // isTransientControlPlaneError() only returns true after its own + // `instanceof Error` guard, so pairing it with one here narrows `err` to + // `Error` for the rest of the catch — no per-use `as Error` casts needed. + if (!(err instanceof Error) || !isTransientControlPlaneError(err)) throw err; // Transient control-plane blip: absorb a bounded run, then give up. 
consecutiveTransientErrors += 1; - lastTransient = err as Error; + lastTransient = err; if (consecutiveTransientErrors > maxConsecutiveTransientErrors) throw err; this.log.warn( `waitUntilReady: tolerating transient control-plane error ` + - `(${consecutiveTransientErrors}/${maxConsecutiveTransientErrors}), retrying: ${(err as Error).message}`, + `(${consecutiveTransientErrors}/${maxConsecutiveTransientErrors}), retrying: ${err.message}`, ); } if (Date.now() >= deadline) { diff --git a/test-apps/comprehensive/test/knowledge-base.test.ts b/test-apps/comprehensive/test/knowledge-base.test.ts index 60ee1574..24d89d13 100644 --- a/test-apps/comprehensive/test/knowledge-base.test.ts +++ b/test-apps/comprehensive/test/knowledge-base.test.ts @@ -3,7 +3,6 @@ import { describe, test, before } from 'node:test'; import assert from 'node:assert'; -import { setTimeout } from 'node:timers/promises'; import { isBlocksError } from '@aws-blocks/core'; import type { api as apiType } from 'aws-blocks'; @@ -12,52 +11,39 @@ const ENV = process.env.BLOCKS_TEST_ENV || 'local'; const isLocal = ENV === 'local'; /** - * Gate retrieval tests on knowledge-base ingestion readiness using the - * `isReady()` API (exposed here as `kbReady`). Bedrock ingests asynchronously - * after deploy, so during the warm-up window we poll readiness rather than - * probing `kbRetrieve` for results. + * Gate retrieval tests on knowledge-base ingestion readiness. * - * - `kbReady() === false` is the expected transient "still ingesting" state — - * we print a friendly one-liner and keep polling. - * - A *thrown* error is a real failure (a failed ingestion job surfaced as - * `IngestionFailedException`, a `KnowledgeBaseValidationError`, or anything - * unexpected) and is surfaced immediately rather than masked as warm-up. + * Bedrock ingests asynchronously after deploy, so during the warm-up window we + * wait for the KB to become queryable before probing `kbRetrieve`. 
We delegate + * to the wired `waitUntilReady()` endpoint (exposed here as `kbWaitUntilReady`) + * rather than hand-rolling a poll loop over `isReady()` (`kbReady`): + * `waitUntilReady()` owns the deadline AND rides out transient control-plane + * blips — throttling, a `RetrievalFailed`, or a brief not-yet-visible + * `ResourceNotFoundException` during the post-deploy poll — that a per-poll + * `isReady()` would otherwise surface as a hard suite failure. * - * In local mode the mock reports ready immediately, so this returns on the - * first poll. + * A *thrown* error here is therefore a real failure (a failed ingestion job + * surfaced as `IngestionFailedException`, a `KnowledgeBaseValidationError`, the + * readiness timeout, or anything unexpected) and is surfaced immediately rather + * than masked as warm-up. + * + * In local mode the mock resolves immediately, so this returns on the first poll. */ async function gateOnReadiness( api: typeof apiType, { timeoutMs = 180_000, pollIntervalMs = 10_000 } = {}, ): Promise { const start = Date.now(); - const deadline = start + timeoutMs; - let attempt = 0; - // Intentionally unbounded: the only exits are a ready KB (return), a thrown - // readiness error, or the mid-loop `Date.now() >= deadline` check below — that - // check is the sole timeout path (it must run after a poll, so the throw, not - // a loop-condition exit, is what bounds the wait). - while (true) { - attempt++; - const elapsed = Math.round((Date.now() - start) / 1000); - let ready: boolean; - try { - ready = await api.kbReady(); - } catch (err: any) { - // Real failure (failed ingestion / validation / unexpected) — surface it. 
- console.error(`❌ KB readiness check failed: ${err.name || err.message}`); - throw err; - } - if (ready) { - console.log(`✅ KB ready (ingestion complete) — ${elapsed}s elapsed`); - return; - } - if (Date.now() >= deadline) { - throw new Error(`KB did not become ready within ${timeoutMs / 1000}s`); - } - console.log(`⏳ KB still warming up (ingestion in progress) — attempt #${attempt}, ${elapsed}s elapsed`); - await setTimeout(pollIntervalMs); + console.log('⏳ Waiting for KB ingestion readiness (warming up if needed)…'); + try { + await api.kbWaitUntilReady({ timeoutMs, pollIntervalMs }); + } catch (err: any) { + // Real failure (failed ingestion / validation / timeout / unexpected) — surface it. + console.error(`❌ KB readiness check failed: ${err.name || err.message}`); + throw err; } + const elapsed = Math.round((Date.now() - start) / 1000); + console.log(`✅ KB ready (ingestion complete) — ${elapsed}s elapsed`); } export function knowledgeBaseTests(getApi: () => typeof apiType) { From ab9abe69a7df5a760e80100a45c28484765356c6 Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Mon, 29 Jun 2026 12:19:25 +0000 Subject: [PATCH 12/14] feat(bb-knowledge-base): add synthGuard stubs for runtime methods + CDK test for ingestion IAM grants & DATA_SOURCE_ID config --- .../bb-knowledge-base/src/index.cdk.test.ts | 132 ++++++++++++++++-- packages/bb-knowledge-base/src/index.cdk.ts | 12 +- 2 files changed, 135 insertions(+), 9 deletions(-) diff --git a/packages/bb-knowledge-base/src/index.cdk.test.ts b/packages/bb-knowledge-base/src/index.cdk.test.ts index 16bec2ea..7f5838a6 100644 --- a/packages/bb-knowledge-base/src/index.cdk.test.ts +++ b/packages/bb-knowledge-base/src/index.cdk.test.ts @@ -2,22 +2,33 @@ // SPDX-License-Identifier: Apache-2.0 /** - * CDK-side teardown tests for KnowledgeBase. + * CDK-side tests for KnowledgeBase. 
* - * History: the data `s3.Bucket` paired `RemovalPolicy.DESTROY` with + * Teardown: the data `s3.Bucket` paired `RemovalPolicy.DESTROY` with * `autoDeleteObjects` on a `destroy`/sandbox teardown, but the S3 Vectors L1 * resources (`CfnVectorBucket` + `CfnIndex`) relied solely on their default - * CloudFormation `DeletionPolicy` and leaked. These tests pin the fix: the + * CloudFormation `DeletionPolicy` and leaked. Those tests pin the fix: the * vector resources now mirror the data bucket's removal policy. + * + * Ingestion readiness: the handler role must be able to read ingestion-job + * status (`bedrock:GetIngestionJob` / `bedrock:ListIngestionJobs`) — scoped to + * the KB ARN like the existing `bedrock:Retrieve` grant — and the + * `DATA_SOURCE_ID` config the runtime readiness checks rely on must be + * registered and surface in the synthesized template. + * + * Synth guards: the runtime methods (`retrieve` / `isReady` / `waitUntilReady`) + * are stubbed on the CDK construct so an accidental synth-time call throws an + * actionable error instead of a cryptic `TypeError: not a function`. */ import { test } from 'node:test'; +import assert from 'node:assert'; import { fileURLToPath } from 'node:url'; import { dirname, resolve } from 'node:path'; import * as cdk from 'aws-cdk-lib'; import type { Construct } from 'constructs'; -import { Template } from 'aws-cdk-lib/assertions'; +import { Template, Match } from 'aws-cdk-lib/assertions'; import * as s3vectors from 'aws-cdk-lib/aws-s3vectors'; -import { Scope, DEFAULT_NODE_RUNTIME } from '@aws-blocks/core/cdk'; +import { Scope, DEFAULT_NODE_RUNTIME, finalizeConfigRegistry } from '@aws-blocks/core/cdk'; import { KnowledgeBase } from './index.cdk.js'; // Real local-folder source so BucketDeployment + sidecar generation synth. 
@@ -46,17 +57,24 @@ class StubBlocksStack extends cdk.Stack { } } -function synth(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: boolean } = {}): Template { +function buildStack(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: boolean } = {}): { + stack: StubBlocksStack; + kb: KnowledgeBase; +} { const app = new cdk.App(options.sandbox ? { context: { sandboxMode: 'true' } } : undefined); // S3 bucket names must be lowercase; the data bucket derives its name from // the scope chain, so keep ids lowercase. const stack = new StubBlocksStack(app, 'teststack'); const parent = new Scope('app'); - new KnowledgeBase(parent, 'docs', { + const kb = new KnowledgeBase(parent, 'docs', { source: FIXTURES, ...(options.removalPolicy ? { removalPolicy: options.removalPolicy } : {}), }); - return Template.fromStack(stack); + return { stack, kb }; +} + +function synth(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: boolean } = {}): Template { + return Template.fromStack(buildStack(options).stack); } test("CDK: removalPolicy 'destroy' makes the data bucket + vector store deletable and adds auto-delete", () => { @@ -90,3 +108,101 @@ test('CDK: sandboxMode context defaults the data bucket + vector store to destro template.hasResource(VECTOR_BUCKET_TYPE, { DeletionPolicy: 'Delete' }); template.hasResource(VECTOR_INDEX_TYPE, { DeletionPolicy: 'Delete' }); }); + +test('CDK: handler role can read ingestion-job status (GetIngestionJob/ListIngestionJobs), scoped to the KB ARN like bedrock:Retrieve', () => { + const template = synth(); + + // isReady()/waitUntilReady() poll ingestion-job status — the handler role + // needs both actions, granted as Allow. 
+ template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: Match.objectLike({ + Statement: Match.arrayWith([ + Match.objectLike({ + Action: ['bedrock:GetIngestionJob', 'bedrock:ListIngestionJobs'], + Effect: 'Allow', + }), + ]), + }), + }); + + // ...and that grant is scoped to the SAME knowledge-base ARN as the existing + // bedrock:Retrieve grant (not a wildcard) — ingestion jobs are sub-resources + // of the KB ARN. + const statements = Object.values(template.findResources('AWS::IAM::Policy')).flatMap( + (policy) => policy.Properties.PolicyDocument.Statement as Array>, + ); + const retrieveStmt = statements.find((s) => s.Action === 'bedrock:Retrieve'); + const ingestionStmt = statements.find( + (s) => Array.isArray(s.Action) && (s.Action as string[]).includes('bedrock:GetIngestionJob'), + ); + assert.ok(retrieveStmt, 'bedrock:Retrieve grant is present'); + assert.ok(ingestionStmt, 'ingestion-status grant is present'); + assert.deepStrictEqual( + ingestionStmt.Resource, + retrieveStmt.Resource, + 'ingestion-status grant is scoped to the same KB ARN as bedrock:Retrieve', + ); +}); + +test('CDK: registers the DATA_SOURCE_ID config (wired to the data source) and surfaces it in the synthesized template', () => { + const { stack } = buildStack(); + + // registerConfig records BLOCKS_{FULLID}_DATA_SOURCE_ID on the stack's config + // registry, bound to the Bedrock data source's id — the runtime readiness + // checks read it back at cold start. (Mirrors bb-app-setting's CDK test.) 
+ const registry = (stack as any)[Symbol.for('BLOCKS_CONFIG_REGISTRY')] as + | { entries: Map } + | undefined; + assert.ok(registry, 'config registry exists on the stack'); + + const dataSourceKey = [...registry.entries.keys()].find((k) => k.endsWith('_DATA_SOURCE_ID')); + assert.ok(dataSourceKey, 'a *_DATA_SOURCE_ID config key is registered'); + assert.match(dataSourceKey, /^BLOCKS_.+_DATA_SOURCE_ID$/); + + const resolvedValue = stack.resolve(registry.entries.get(dataSourceKey)) as { + 'Fn::GetAtt'?: [string, string]; + }; + assert.ok(resolvedValue['Fn::GetAtt'], 'config value is a CDK token (Fn::GetAtt)'); + assert.strictEqual( + resolvedValue['Fn::GetAtt'][1], + 'DataSourceId', + 'config value is wired to the data source id', + ); + + // finalizeConfigRegistry serializes the registry into blocks-config.json via a + // BucketDeployment; the rendered config blob in the synthesized template + // carries the DATA_SOURCE_ID key bound to the data source's DataSourceId, and + // the handler is wired to read it from S3. (Mirrors bb-auth-cognito's CDK test.) 
+ finalizeConfigRegistry(stack, stack.handler); + const template = Template.fromStack(stack); + + const configBlob = JSON.stringify( + Object.values(template.findResources('Custom::CDKBucketDeployment')), + ); + assert.match(configBlob, /BLOCKS_[A-Z0-9_]+_DATA_SOURCE_ID/); + assert.ok( + configBlob.includes('DataSourceId'), + 'config blob binds the DATA_SOURCE_ID key to the data source id', + ); + + template.hasResourceProperties('AWS::Lambda::Function', { + Environment: Match.objectLike({ + Variables: Match.objectLike({ + BLOCKS_CONFIG_BUCKET: Match.anyValue(), + BLOCKS_CONFIG_KEY: 'blocks-config.json', + }), + }), + }); +}); + +test('CDK: calling a runtime method throws an actionable synth-time error (not a cryptic TypeError)', () => { + const { kb } = buildStack(); + const construct = kb as unknown as Record unknown>; + for (const method of ['retrieve', 'isReady', 'waitUntilReady']) { + assert.throws( + () => construct[method]('x'), + /cannot be called during CDK synth/, + `${method}() should throw the actionable synth-time error`, + ); + } +}); diff --git a/packages/bb-knowledge-base/src/index.cdk.ts b/packages/bb-knowledge-base/src/index.cdk.ts index c900538f..6aeb5add 100644 --- a/packages/bb-knowledge-base/src/index.cdk.ts +++ b/packages/bb-knowledge-base/src/index.cdk.ts @@ -8,7 +8,7 @@ import * as bedrock from 'aws-cdk-lib/aws-bedrock'; import * as s3vectors from 'aws-cdk-lib/aws-s3vectors'; import * as s3deploy from 'aws-cdk-lib/aws-s3-deployment'; import * as cr from 'aws-cdk-lib/custom-resources'; -import { Scope, registerConfig } from '@aws-blocks/core/cdk'; +import { Scope, registerConfig, synthGuard } from '@aws-blocks/core/cdk'; import type { ScopeParent } from '@aws-blocks/core'; import type { KnowledgeBaseOptions, ChunkingConfig } from './types.js'; import * as path from 'node:path'; @@ -479,4 +479,14 @@ export class KnowledgeBase extends Scope { resources: [knowledgeBaseArn], })); } + + // ── Runtime methods are not available during CDK synth 
──────────────── + // Under `--conditions=cdk` a KnowledgeBase resolves to this construct, which + // only provisions infrastructure. The data/readiness methods (retrieve/ + // isReady/waitUntilReady) live in the runtime build. Calling them at module + // top-level (which runs during synth) would otherwise fail with a cryptic + // `X is not a function`; these stubs turn that into an actionable message. + retrieve(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'retrieve'); } + isReady(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'isReady'); } + waitUntilReady(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'waitUntilReady'); } } From 6be00432b6b9712a557a0d360a296ca835c9799a Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Mon, 29 Jun 2026 20:21:14 +0000 Subject: [PATCH 13/14] docs(bb-knowledge-base): add s3:// readiness caveat to README; clamp jitter floor; fix e2e gate comment; soft-pass e2e waitUntilReady on Bedrock warm-up timeout; reset lastTransient on clean poll; document timeoutMs:0 one-shot semantics --- packages/bb-knowledge-base/README.md | 2 +- .../bb-knowledge-base/src/index.aws.test.ts | 41 +++++++++++++++++++ packages/bb-knowledge-base/src/index.aws.ts | 8 +++- packages/bb-knowledge-base/src/types.ts | 9 +++- .../comprehensive/test/knowledge-base.test.ts | 40 +++++++++++++----- 5 files changed, 85 insertions(+), 15 deletions(-) diff --git a/packages/bb-knowledge-base/README.md b/packages/bb-knowledge-base/README.md index effa6fdb..81d4b702 100644 --- a/packages/bb-knowledge-base/README.md +++ b/packages/bb-knowledge-base/README.md @@ -127,7 +127,7 @@ await kb.waitUntilReady({ signal: AbortSignal.timeout(120_000) }); `maxConsecutiveTransientErrors` is the number of *consecutive* transient control-plane errors tolerated before giving up; the counter resets on any clean poll. 
Two conditions are treated as transient and ridden out: throttling / transient network failures, **and** a *not-yet-visible* knowledge base — in the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB or data source hasn't propagated yet), which `waitUntilReady()` absorbs rather than giving up on. Terminal errors always short-circuit immediately regardless of the limit: a `FAILED` ingestion job, and a *missing-KB config* error (the `KB_ID` env var is unset — distinct from the transient not-yet-visible case). When `signal` is provided, the wait is cancelled promptly (checked before each poll and during the inter-poll delay), rejecting with the signal's abort reason (by default a `DOMException` named `'AbortError'`). -Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) This pre-feature deployment is the **only** case where `isReady()` returns `true` without consulting an actual ingestion job — re-deploying injects `DATA_SOURCE_ID` and restores real tracking, so a freshly deployed KB always reflects a real job status (don't mistake the "nothing to track" shortcut for "ingestion confirmed complete" when gating live traffic). In local development the mock is always ready. +Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) 
This pre-feature deployment is the **only** case where `isReady()` returns `true` without consulting an actual ingestion job — re-deploying injects `DATA_SOURCE_ID` and restores real tracking, so a freshly deployed KB always reflects a real job status (don't mistake the "nothing to track" shortcut for "ingestion confirmed complete" when gating live traffic). In local development the mock is always ready. Note that a local `isReady()` of `true` does **not** imply `retrieve()` works for an `s3://` source — the mock rejects `s3://` with `InvalidSourceConfigException` (the inverse of the production contract), so validate `s3://` sources in sandbox/production where readiness genuinely reflects queryability. ## Metadata Filtering diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index 3135930f..95a60648 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -830,6 +830,47 @@ describe('waitUntilReady', () => { } }); + test('Timeout message stays plain when an early transient blip was cleared by later clean polls', async () => { + const cleanup = setReadyEnv('TEST', 'WUR17'); + // Inverse of the mid-streak case (WUR16): the transient blip happens on the + // FIRST poll, but every later poll is a clean IN_PROGRESS, so the streak (and + // the remembered error) reset well before the deadline. The Timeout that + // eventually fires must read as a plain "still ingesting" timeout — the stale + // transient from the already-cleared streak must never be folded into it. 
+ let calls = 0; + mockAgentSend(() => { + calls += 1; + if (calls === 1) { + const e = new Error('Rate exceeded'); + e.name = 'ThrottlingException'; // → RetrievalFailed (transient) on the first poll only + throw e; + } + return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'IN_PROGRESS' }] }; + }); + + try { + const kb = new KnowledgeBase({ id: 'test' }, 'wur17', { source: './knowledge' }); + await assert.rejects( + () => kb.waitUntilReady({ timeoutMs: 40, pollIntervalMs: 5 }), + (err: Error) => { + assert.strictEqual(err.name, KnowledgeBaseErrors.Timeout); + assert.ok(calls >= 2, `expected clean polls after the initial blip, got ${calls} call(s)`); + assert.ok( + !err.message.includes('last transient error'), + 'a transient blip cleared by later clean polls must not leak into the timeout message', + ); + assert.ok( + !err.message.includes('Rate exceeded'), + 'the stale transient detail from the cleared streak must not appear', + ); + return true; + }, + ); + } finally { + cleanup(); + } + }); + test('resolves immediately when no data source id is configured', async () => { // Pre-readiness-API deployment: no DATA_SOURCE_ID injected, so there is // nothing to poll. (Not a source-type distinction — the CDK layer registers diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index 78a1bbb1..b7caf099 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -91,7 +91,7 @@ function sleep(ms: number, signal?: AbortSignal): Promise { */ function jitterInterval(ms: number): number { const factor = 1 + (Math.random() * 2 - 1) * 0.2; // 0.8–1.2 - return Math.round(ms * factor); + return Math.max(1, Math.round(ms * factor)); } // Match only messages that clearly indicate a metadata filter issue. @@ -472,8 +472,12 @@ export class KnowledgeBase extends Scope { // not deployed (or briefly not-yet-visible), or RetrievalFailedException // for transient blips. 
if (await this.isReady()) return; - // A clean poll clears any transient-error streak. + // A clean poll clears any transient-error streak — reset the remembered + // error alongside the counter so a later Timeout can only ever fold in a + // transient from the streak still in flight at the deadline, never a stale + // one from an earlier streak that clean polls already rode out. consecutiveTransientErrors = 0; + lastTransient = undefined; } catch (err) { // Terminal errors (FAILED job, missing-KB config, validation) short-circuit. // isTransientControlPlaneError() only returns true after its own diff --git a/packages/bb-knowledge-base/src/types.ts b/packages/bb-knowledge-base/src/types.ts index c2b4ef96..84192769 100644 --- a/packages/bb-knowledge-base/src/types.ts +++ b/packages/bb-knowledge-base/src/types.ts @@ -166,7 +166,14 @@ export interface RetrieveResult { * ``` */ export interface WaitUntilReadyOptions { - /** Maximum time to wait for ingestion to complete, in milliseconds. Default: 300000 (5 minutes). */ + /** + * Maximum time to wait for ingestion to complete, in milliseconds. Default: 300000 (5 minutes). + * + * `timeoutMs: 0` is a one-shot check, not a no-poll fast-fail: the deadline is + * evaluated *after* the first readiness poll, so exactly one `isReady()` poll + * always runs (and can resolve the wait) before a `KnowledgeBaseTimeoutException` + * is thrown. Clamped to a minimum of 0. + */ timeoutMs?: number; /** * Delay between readiness polls, in milliseconds. Clamped to a minimum of 1ms. 
diff --git a/test-apps/comprehensive/test/knowledge-base.test.ts b/test-apps/comprehensive/test/knowledge-base.test.ts index 24d89d13..a1a7a623 100644 --- a/test-apps/comprehensive/test/knowledge-base.test.ts +++ b/test-apps/comprehensive/test/knowledge-base.test.ts @@ -7,6 +7,7 @@ import { isBlocksError } from '@aws-blocks/core'; import type { api as apiType } from 'aws-blocks'; const ValidationError = 'KnowledgeBaseValidationError'; +const Timeout = 'KnowledgeBaseTimeoutException'; const ENV = process.env.BLOCKS_TEST_ENV || 'local'; const isLocal = ENV === 'local'; @@ -76,19 +77,36 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { }); // --- Readiness: cover the wired waitUntilReady() endpoint end-to-end --- - // The retrieval suites gate on isReady() (via kbReady); this exercises the - // separate waitUntilReady() polling path. Locally the mock resolves on the - // first poll; on AWS we give it the same budget as gateOnReadiness so a - // still-ingesting KB is waited out rather than surfaced as a failure. + // The retrieval suites gate via gateOnReadiness() → kbWaitUntilReady(); this + // exercises that same waitUntilReady() polling path directly. (kbReady / + // isReady() is wired but not exercised by this suite — it's covered by unit + // tests.) Locally the mock resolves on the first poll; on AWS we give it the + // same budget as gateOnReadiness so a still-ingesting KB is waited out rather + // than surfaced as a failure. describe('waitUntilReady', () => { - test('resolves once the KB is ready', async () => { + test('resolves once the KB is ready', async (t) => { const api = getApi(); - const result = await api.kbWaitUntilReady( - isLocal - ? { timeoutMs: 5_000, pollIntervalMs: 50 } - : { timeoutMs: 180_000, pollIntervalMs: 10_000 }, - ); - assert.deepStrictEqual(result, { success: true }); + try { + const result = await api.kbWaitUntilReady( + isLocal + ? 
{ timeoutMs: 5_000, pollIntervalMs: 50 } + : { timeoutMs: 180_000, pollIntervalMs: 10_000 }, + ); + assert.deepStrictEqual(result, { success: true }); + } catch (err: unknown) { + // Warm-up tolerance: unlike the retrieval suites, this standalone test is NOT + // behind the gateOnReadiness() gate, so on AWS a slow-but-healthy KB whose + // ingestion outruns the 180s budget makes kbWaitUntilReady throw a Timeout. + // That's the post-deploy warm-up window, not a defect — soft-skip it here. A + // genuine IngestionFailed (or any other error) still fails the test. The local + // mock resolves on the first poll, so this branch never trips and the success + // assertion above always runs. + if (isBlocksError(err, Timeout)) { + t.skip(`KB still warming up — ingestion exceeded budget: ${(err as Error).message}`); + return; + } + throw err; + } }); }); From 26c2122257411a6fbb42c7b246b6890c0f952e15 Mon Sep 17 00:00:00 2001 From: "H. Furkan Bozkurt" Date: Wed, 1 Jul 2026 11:20:39 +0000 Subject: [PATCH 14/14] refactor(bb-knowledge-base): rename readiness API to isSynced/waitUntilSynced + address review feedback --- .changeset/kb-readiness.md | 12 +- packages/bb-knowledge-base/API.md | 6 +- packages/bb-knowledge-base/DESIGN.md | 14 +- packages/bb-knowledge-base/README.md | 28 ++-- packages/bb-knowledge-base/src/errors.ts | 2 +- .../bb-knowledge-base/src/index.aws.test.ts | 124 +++++++++--------- packages/bb-knowledge-base/src/index.aws.ts | 124 +++++++++++------- .../bb-knowledge-base/src/index.browser.ts | 8 +- .../bb-knowledge-base/src/index.cdk.test.ts | 89 +++++++++++-- packages/bb-knowledge-base/src/index.cdk.ts | 18 +-- .../bb-knowledge-base/src/index.mock.test.ts | 30 ++--- packages/bb-knowledge-base/src/index.mock.ts | 25 ++-- packages/bb-knowledge-base/src/types.ts | 26 ++-- packages/blocks/API.md | 4 +- packages/blocks/src/index.cdk.ts | 2 +- packages/blocks/src/index.ts | 2 +- test-apps/comprehensive/aws-blocks/index.ts | 14 +- 
.../comprehensive/test/knowledge-base.test.ts | 64 ++++----- 18 files changed, 354 insertions(+), 238 deletions(-) diff --git a/.changeset/kb-readiness.md b/.changeset/kb-readiness.md index 84f9ecc9..86e5c32c 100644 --- a/.changeset/kb-readiness.md +++ b/.changeset/kb-readiness.md @@ -3,13 +3,13 @@ "@aws-blocks/blocks": minor --- -Add `isReady()` / `waitUntilReady()` ingestion-readiness API to KnowledgeBase. +Add `isSynced()` / `waitUntilSynced()` ingestion-sync API to KnowledgeBase. -Bedrock ingestion runs asynchronously after deploy, so during the warm-up window `retrieve()` returns an empty array even for queries that would later match — making "empty" ambiguous between "still warming up" and "ingested, no match". The new methods resolve that ambiguity: +Bedrock ingestion runs asynchronously after deploy, so during the initial pre-sync window `retrieve()` returns an empty array even for queries that would later match — making "empty" ambiguous between "not yet synced with your latest data" and "synced, no match". The new methods resolve that ambiguity (mirroring Bedrock's own "Sync" / "sync with your latest data" terminology): -- `isReady(): Promise<boolean>` — `true` once the data source's most recent ingestion job is `COMPLETE`; `false` while ingestion is pending. Both local-folder and imported `s3://` sources register a BB-managed data source, so both are tracked (the "no managed data source → ready" shortcut applies only to deployments predating this API, which have no data source id injected). Throws a typed `IngestionFailedException` (including `failureReasons`) if the latest job failed. -- `waitUntilReady(options?: { timeoutMs?: number; pollIntervalMs?: number; maxConsecutiveTransientErrors?: number; signal?: AbortSignal }): Promise<void>` — polls until ready (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000, `maxConsecutiveTransientErrors` 3), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job.
Up to `maxConsecutiveTransientErrors` *consecutive* transient control-plane errors are tolerated (the counter resets on a clean poll); terminal errors short-circuit immediately. Transient covers both throttling / transient network failures **and** a *not-yet-visible* knowledge base — during the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB/data source hasn't propagated yet), which is ridden out rather than treated as terminal; a *missing-KB config* error (`KB_ID` unset) stays terminal. The poll interval carries ±20% jitter (only the delay between polls varies, never the poll count or the deadline) so many KBs don't poll in lockstep. Pass an optional `signal` (`AbortSignal`) to cancel the wait — checked before each poll and during the inter-poll delay — which rejects with the signal's abort reason (default: a `DOMException` named `'AbortError'`). +- `isSynced(): Promise<boolean>` — `true` once the data source's most recent ingestion job is `COMPLETE`; `false` while it is not yet synced with your latest data. This reports data *freshness*, not availability — `retrieve()` is always callable and serves the prior synced snapshot during a re-ingestion. Both local-folder and imported `s3://` sources register a BB-managed data source, so both are tracked (the "no managed data source → synced" shortcut applies only to deployments predating this API, which have no data source id injected). Throws a typed `IngestionFailedException` (including `failureReasons`) if the latest job failed. +- `waitUntilSynced(options?: { timeoutMs?: number; pollIntervalMs?: number; maxConsecutiveTransientErrors?: number; signal?: AbortSignal }): Promise<void>` — polls until synced (defaults: `timeoutMs` 300000, `pollIntervalMs` 5000, `maxConsecutiveTransientErrors` 3), throwing a typed `KnowledgeBaseTimeoutException` on timeout or propagating `IngestionFailedException` on a failed job.
Up to `maxConsecutiveTransientErrors` *consecutive* transient control-plane errors are tolerated (the counter resets on a clean poll); terminal errors short-circuit immediately. Transient covers both throttling / transient network failures **and** a *not-yet-visible* knowledge base — during the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB/data source hasn't propagated yet), which is ridden out rather than treated as terminal; a *missing-KB config* error (`KB_ID` unset) stays terminal. The poll interval carries ±20% jitter (only the delay between polls varies, never the poll count or the deadline) so many KBs don't poll in lockstep. Pass an optional `signal` (`AbortSignal`) to cancel the wait — checked before each poll and during the inter-poll delay — which rejects with the signal's abort reason (default: a `DOMException` named `'AbortError'`). -Purely additive — `retrieve()` and all existing signatures are unchanged. The local mock reports ready immediately (no warm-up window in local dev). +Purely additive — `retrieve()` and all existing signatures are unchanged. The local mock reports synced immediately (no async ingestion window in local dev). -The umbrella `@aws-blocks/blocks` package now also re-exports the new `WaitUntilReadyOptions` type (alongside the existing `KnowledgeBase` re-exports) from both its runtime and CDK entry points, so consumers importing from `@aws-blocks/blocks` can reference it directly. +The umbrella `@aws-blocks/blocks` package now also re-exports the new `WaitUntilSyncedOptions` type (alongside the existing `KnowledgeBase` re-exports) from both its runtime and CDK entry points, so consumers importing from `@aws-blocks/blocks` can reference it directly. 
diff --git a/packages/bb-knowledge-base/API.md b/packages/bb-knowledge-base/API.md index 34aa5cd9..80405b75 100644 --- a/packages/bb-knowledge-base/API.md +++ b/packages/bb-knowledge-base/API.md @@ -24,11 +24,11 @@ export class KnowledgeBase extends Scope { constructor(scope: ScopeParent, id: string, _options: KnowledgeBaseOptions); // (undocumented) readonly bbName = "KnowledgeBase"; - isReady(): Promise; + isSynced(): Promise; // @internal protected log: ChildLogger; retrieve(query: string, options?: RetrieveOptions): Promise; - waitUntilReady(options?: WaitUntilReadyOptions): Promise; + waitUntilSynced(options?: WaitUntilSyncedOptions): Promise; } // @public @@ -76,7 +76,7 @@ export interface RetrieveResult { export type SourceConfig = string; // @public -export interface WaitUntilReadyOptions { +export interface WaitUntilSyncedOptions { maxConsecutiveTransientErrors?: number; pollIntervalMs?: number; signal?: AbortSignal; diff --git a/packages/bb-knowledge-base/DESIGN.md b/packages/bb-knowledge-base/DESIGN.md index dffe9a6b..53dd5d44 100644 --- a/packages/bb-knowledge-base/DESIGN.md +++ b/packages/bb-knowledge-base/DESIGN.md @@ -20,9 +20,11 @@ Design document for KnowledgeBase. For usage, see [README.md](./README.md). **Rationale:** Ingestion can take minutes to hours depending on corpus size. Blocking `cdk deploy` until ingestion completes would make iterative development painful. Fire-and-forget means the deploy finishes quickly and ingestion happens in the background. The trade-off is that the knowledge base may return stale or empty results for a brief window after deploy. This is acceptable because the alternative (using a CDK `Provider` with `isComplete` polling) adds significant complexity and Lambda cold-start cost for a one-time operation. -**Resolution of the warm-up window:** The `isReady()` / `waitUntilReady()` readiness API (see [README.md](./README.md#readiness)) closes the gap left by fire-and-forget ingestion. 
Rather than blocking the deploy, callers poll the data source's ingestion-job status at runtime (`ListIngestionJobs` / `GetIngestionJob`) and gate `retrieve()` on completion — keeping deploys fast while giving application code a reliable "is the KB queryable yet?" signal. `COMPLETE` → ready, `FAILED` → throws `IngestionFailedException`, anything else (or no jobs yet) → not ready. +**Resolution of the warm-up window:** The `isSynced()` / `waitUntilSynced()` sync API (see [README.md](./README.md#sync)) closes the gap left by fire-and-forget ingestion. Rather than blocking the deploy, callers poll the data source's ingestion-job status at runtime (`ListIngestionJobs` / `GetIngestionJob`) and gate on completion — keeping deploys fast while giving application code a reliable "is the KB synced with my latest data yet?" signal. `COMPLETE` → synced, `FAILED` → throws `IngestionFailedException`, anything else (or no jobs yet) → not synced. This tracks *freshness*, not availability: `retrieve()` is always callable and serves the prior snapshot during a re-ingestion (it returns empty only during the initial pre-sync window, before the first ingestion completes). So `isSynced() === false` means "not yet synced with your latest data," never "unavailable." -**Source coverage (folder and imported `s3://`):** Both a local-folder source and an imported `s3://` URI create a BB-managed `CfnDataSource` and register its `DATA_SOURCE_ID` unconditionally, so the readiness API tracks the ingestion job for either source type — `isReady()` / `waitUntilReady()` reflect that data source's most recent ingestion job in both cases. (For an `s3://` source the construct skips the `BucketDeployment` step, since the documents are expected to already be in the bucket, but it still creates the data source and fires the ingestion job — so readiness is tracked the same way.) 
The only case with nothing to track is a deployment that predates this readiness API: such a handler has no `DATA_SOURCE_ID` injected, so `isReady()` returns `true` immediately (treating "no managed data source" as ready). Re-deploying injects the id and enables readiness tracking. +**Embedding-propagation lag after `COMPLETE`:** `isSynced() === true` means the ingestion *job* reached `COMPLETE`. Per the Bedrock docs, for non-Aurora vector stores — and this BB uses S3 Vectors — newly-written embeddings can take a few minutes after `COMPLETE` before they are fully queryable. So `isSynced()` signals "the ingestion job finished," with a possible short embedding-propagation lag before the freshest chunks surface in `retrieve()` results. + +**Source coverage (folder and imported `s3://`):** Both a local-folder source and an imported `s3://` URI create a BB-managed `CfnDataSource` and register its `DATA_SOURCE_ID` unconditionally, so the sync API tracks the ingestion job for either source type — `isSynced()` / `waitUntilSynced()` reflect that data source's most recent ingestion job in both cases. (For an `s3://` source the construct skips the `BucketDeployment` step, since the documents are expected to already be in the bucket, but it still creates the data source and fires the ingestion job — so sync is tracked the same way.) The only case with nothing to track is a deployment that predates this sync API: such a handler has no `DATA_SOURCE_ID` injected, so `isSynced()` returns `true` immediately (treating "no managed data source" as synced). Re-deploying injects the id and enables sync tracking. ### D-KB-3: Semantic chunking as default strategy @@ -60,6 +62,8 @@ Design document for KnowledgeBase. For usage, see [README.md](./README.md). **Rationale:** KnowledgeBase requires Bedrock API access (AWS runtime) or filesystem reads (mock). Neither is available in the browser. 
Throwing at construction — not at `retrieve()` time — gives developers an immediate, clear error message guiding them to use server actions, API routes, or Lambda handlers. This follows the same pattern as other server-only Building Blocks. +The method stubs are consistent with construction: `retrieve()`, `isSynced()`, and `waitUntilSynced()` **all** throw `BrowserNotSupportedException` as well (none silently no-ops or returns a fake "synced"). So the browser layer's sync contract matches `retrieve()` — completing the picture across all three layers: mock is always synced (`isSynced()` returns `true`, `waitUntilSynced()` resolves immediately; see the table below), AWS polls the real ingestion job, and browser throws for the data *and* sync methods alike. + ### D-KB-9: Raw `s3.Bucket` for the data bucket (not the `FileBucket` Building Block) **Decision:** Provision the data bucket with a raw `aws-cdk-lib/aws-s3` `s3.Bucket` rather than the `FileBucket` Building Block, even though `FileBucket` exists for "an app needs an S3 bucket" use cases. @@ -108,8 +112,8 @@ Creates the following resources: 7. **AwsCustomResource (StartIngestionJob)** — Fires `bedrock:StartIngestionJob` on Create/Update. Ingestion runs asynchronously. Depends on both the data source and bucket deployment (when present) so documents are in S3 before ingestion starts. 
-**Handler config** (registered via `registerConfig`, surfaced to the runtime as env vars): `BLOCKS_{FULLID}_KB_ID`, `BLOCKS_{FULLID}_DATA_SOURCE_ID` (the data source id drives the `isReady()` / `waitUntilReady()` readiness checks) -**IAM grants to handler:** `bedrock:Retrieve`, `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` on the knowledge base ARN (the ingestion-job actions back the readiness checks; the data source and its ingestion jobs are sub-resources of the KB ARN) +**Handler config** (registered via `registerConfig`, surfaced to the runtime as env vars): `BLOCKS_{FULLID}_KB_ID`, `BLOCKS_{FULLID}_DATA_SOURCE_ID` (the data source id drives the `isSynced()` / `waitUntilSynced()` sync checks) +**IAM grants to handler:** `bedrock:Retrieve`, `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` on the knowledge base ARN (the ingestion-job actions back the sync checks; the data source and its ingestion jobs are sub-resources of the KB ARN) ## Mock Implementation @@ -132,4 +136,4 @@ Creates the following resources: | No ingestion pipeline | Documents are indexed synchronously on first `retrieve()` | No mitigation — the mock doesn't need async ingestion. First call may be slower due to indexing | | No IAM enforcement | Permission errors only surface in AWS | No mitigation — IAM is handled by CDK grants automatically | | Immediate consistency | New documents appear instantly vs async ingestion in AWS | No mitigation — eventual consistency in AWS is inherent to the Bedrock ingestion pipeline | -| Unconditional mock readiness | `isReady()` always returns `true` (and `waitUntilReady()` resolves immediately) — even for an `s3://` source that `retrieve()` rejects with `InvalidSourceConfigException`. 
Local readiness is therefore NOT a proxy for a working local `retrieve()` on `s3://` sources — the inverse of the production contract, where `isReady() === true` implies `retrieve()` is queryable | No mitigation — local has no async ingestion to wait on, so readiness is a no-op. `s3://` sources require AWS infrastructure; validate them in sandbox/production where readiness genuinely reflects queryability | +| Unconditional mock sync | `isSynced()` always returns `true` (and `waitUntilSynced()` resolves immediately) — even for an `s3://` source that `retrieve()` rejects with `InvalidSourceConfigException`. Local sync state is therefore NOT a proxy for a working local `retrieve()` on `s3://` sources — the inverse of the production contract, where `isSynced() === true` means the latest ingestion job completed and `retrieve()` serves that data (after any brief embedding-propagation lag) | No mitigation — local has no async ingestion to wait on, so sync is a no-op. `s3://` sources require AWS infrastructure; validate them in sandbox/production where sync state genuinely reflects what `retrieve()` serves | diff --git a/packages/bb-knowledge-base/README.md b/packages/bb-knowledge-base/README.md index 81d4b702..9f102d46 100644 --- a/packages/bb-knowledge-base/README.md +++ b/packages/bb-knowledge-base/README.md @@ -38,8 +38,8 @@ const kb = new KnowledgeBase(scope, id, options) | Method | Returns | Description | |--------|---------|-------------| | `retrieve(query, options?)` | `Promise` | Search for relevant document chunks. Returns results ranked by relevance score. | -| `isReady()` | `Promise` | Whether async ingestion has finished and the KB can serve `retrieve()`. `true` once the latest ingestion job is `COMPLETE` (or there is no BB-managed data source to track). Throws `IngestionFailed` if the latest job failed. | -| `waitUntilReady(options?)` | `Promise` | Poll `isReady()` until the KB is ready or the timeout elapses. Throws `Timeout` if it does not become ready in time. Accepts an optional `AbortSignal` to cancel the wait.
| +| `isSynced()` | `Promise<boolean>` | Whether the KB is synced with your latest data. `true` once the latest ingestion job is `COMPLETE` (or there is no BB-managed data source to track). Reports data *freshness*, not availability — `retrieve()` is always callable and serves the prior snapshot while a re-ingestion is in flight. Throws `IngestionFailed` if the latest job failed. | +| `waitUntilSynced(options?)` | `Promise<void>` | Poll `isSynced()` until the KB is synced with your latest data or the timeout elapses. Throws `Timeout` if it does not sync in time. Accepts an optional `AbortSignal` to cancel the wait. | ### Options @@ -105,29 +105,29 @@ chunking: { strategy: 'fixed', chunkSize: 500, chunkOverlap: 10 } | `source` | `string` | Source document path or URL. | | `metadata` | `Record` | Document metadata. Includes auto-populated `folder` from subfolders. | -### Readiness +### Sync -Bedrock ingestion runs asynchronously after deploy, so immediately after `cdk deploy` the knowledge base may not yet be queryable — `retrieve()` returns an empty array even for queries that will later match. Use `isReady()` / `waitUntilReady()` to gate on ingestion completion: +Bedrock ingestion runs asynchronously after deploy, so immediately after `cdk deploy` the knowledge base is not yet synced with your latest data — during that initial pre-sync window `retrieve()` returns an empty array even for queries that will later match. (Once at least one ingestion job has completed, `retrieve()` always serves the most recent synced snapshot, even while a later re-ingestion is in flight.) Use `isSynced()` / `waitUntilSynced()` to gate on ingestion completion: ```typescript -// Block until the KB is queryable (e.g. -await kb.waitUntilReady({ timeoutMs: 600_000 }); +// Block until the KB is synced with your latest data (e.g.
right after deploy), then query +await kb.waitUntilSynced({ timeoutMs: 600_000 }); const results = await kb.retrieve('getting started'); // Or check without blocking -if (await kb.isReady()) { +if (await kb.isSynced()) { const results = await kb.retrieve('getting started'); } // Cancel the wait with an AbortSignal (e.g. an overall request deadline) -await kb.waitUntilReady({ signal: AbortSignal.timeout(120_000) }); +await kb.waitUntilSynced({ signal: AbortSignal.timeout(120_000) }); ``` -`waitUntilReady(options?)` accepts `timeoutMs` (default `300_000`), `pollIntervalMs` (default `5_000`, clamped to a 1ms minimum), `maxConsecutiveTransientErrors` (default `3`, minimum `0`), and an optional `signal` (`AbortSignal`). The poll interval carries a small amount of random jitter (±20%) so that many knowledge bases polling after a shared deploy don't fall into lockstep — the jitter only varies the delay *between* polls and never pushes a sleep past `timeoutMs`. +`waitUntilSynced(options?)` accepts `timeoutMs` (default `300_000`), `pollIntervalMs` (default `5_000`, clamped to a 1ms minimum), `maxConsecutiveTransientErrors` (default `3`, minimum `0`), and an optional `signal` (`AbortSignal`). The poll interval carries a small amount of random jitter (±20%) so that many knowledge bases polling after a shared deploy don't fall into lockstep — the jitter only varies the delay *between* polls and never pushes a sleep past `timeoutMs`. -`maxConsecutiveTransientErrors` is the number of *consecutive* transient control-plane errors tolerated before giving up; the counter resets on any clean poll. Two conditions are treated as transient and ridden out: throttling / transient network failures, **and** a *not-yet-visible* knowledge base — in the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB or data source hasn't propagated yet), which `waitUntilReady()` absorbs rather than giving up on. 
Terminal errors always short-circuit immediately regardless of the limit: a `FAILED` ingestion job, and a *missing-KB config* error (the `KB_ID` env var is unset — distinct from the transient not-yet-visible case). When `signal` is provided, the wait is cancelled promptly (checked before each poll and during the inter-poll delay), rejecting with the signal's abort reason (by default a `DOMException` named `'AbortError'`). +`maxConsecutiveTransientErrors` is the number of *consecutive* transient control-plane errors tolerated before giving up; the counter resets on any clean poll. Two conditions are treated as transient and ridden out: throttling / transient network failures, **and** a *not-yet-visible* knowledge base — in the post-deploy window the control plane can briefly return `ResourceNotFoundException` (the freshly-created KB or data source hasn't propagated yet), which `waitUntilSynced()` absorbs rather than giving up on. Terminal errors always short-circuit immediately regardless of the limit: a `FAILED` ingestion job, and a *missing-KB config* error (the `KB_ID` env var is unset — distinct from the transient not-yet-visible case). When `signal` is provided, the wait is cancelled promptly (checked before each poll and during the inter-poll delay), rejecting with the signal's abort reason (by default a `DOMException` named `'AbortError'`). -Both local-folder and imported `s3://` sources register a BB-managed data source, so readiness reflects that data source's ingestion job in either case. (A deployment predating this readiness API has no data source id injected, so `isReady()` returns `true` immediately — there is nothing to track.) 
This pre-feature deployment is the **only** case where `isReady()` returns `true` without consulting an actual ingestion job — re-deploying injects `DATA_SOURCE_ID` and restores real tracking, so a freshly deployed KB always reflects a real job status (don't mistake the "nothing to track" shortcut for "ingestion confirmed complete" when gating live traffic). In local development the mock is always ready. Note that a local `isReady()` of `true` does **not** imply `retrieve()` works for an `s3://` source — the mock rejects `s3://` with `InvalidSourceConfigException` (the inverse of the production contract), so validate `s3://` sources in sandbox/production where readiness genuinely reflects queryability. +Both local-folder and imported `s3://` sources register a BB-managed data source, so sync state reflects that data source's ingestion job in either case. (A deployment predating this sync API has no data source id injected, so `isSynced()` returns `true` immediately — there is nothing to track.) This pre-feature deployment is the **only** case where `isSynced()` returns `true` without consulting an actual ingestion job — re-deploying injects `DATA_SOURCE_ID` and restores real tracking, so a freshly deployed KB always reflects a real job status (don't mistake the "nothing to track" shortcut for "ingestion confirmed complete" when gating live traffic). In local development the mock is always synced. Note that a local `isSynced()` of `true` does **not** imply `retrieve()` works for an `s3://` source — the mock rejects `s3://` with `InvalidSourceConfigException` (the inverse of the production contract), so validate `s3://` sources in sandbox/production where sync state genuinely reflects queryability. 
## Metadata Filtering @@ -173,8 +173,8 @@ try { |---|---|---| | `KnowledgeBaseErrors.RetrievalFailed` | `RetrievalFailedException` | Bedrock retrieval call failed | | `KnowledgeBaseErrors.NotReady` | `KnowledgeBaseNotReadyException` | KB not deployed or env vars missing | -| `KnowledgeBaseErrors.IngestionFailed` | `IngestionFailedException` | The most recent ingestion job failed (message includes `failureReasons`) — thrown by `isReady()` / `waitUntilReady()` | -| `KnowledgeBaseErrors.Timeout` | `KnowledgeBaseTimeoutException` | `waitUntilReady()` exceeded its timeout before ingestion completed | +| `KnowledgeBaseErrors.IngestionFailed` | `IngestionFailedException` | The most recent ingestion job failed (message includes `failureReasons`) — thrown by `isSynced()` / `waitUntilSynced()` | +| `KnowledgeBaseErrors.Timeout` | `KnowledgeBaseTimeoutException` | `waitUntilSynced()` exceeded its timeout before ingestion completed | | `KnowledgeBaseErrors.InvalidSource` | `InvalidSourceConfigException` | Source folder not found or invalid config | | `KnowledgeBaseErrors.InvalidFilter` | `InvalidFilterException` | Invalid filter keys in Bedrock query | | `KnowledgeBaseErrors.ValidationError` | `KnowledgeBaseValidationError` | Empty or invalid query | @@ -182,7 +182,7 @@ try { ## Deploy Behavior -`cdk deploy` automatically triggers document ingestion (fire-and-forget). Ingestion runs asynchronously after the deploy completes. Check the AWS console to monitor ingestion progress, or call [`isReady()` / `waitUntilReady()`](#readiness) from your code to gate queries on ingestion completion. +`cdk deploy` automatically triggers document ingestion (fire-and-forget). Ingestion runs asynchronously after the deploy completes. Check the AWS console to monitor ingestion progress, or call [`isSynced()` / `waitUntilSynced()`](#sync) from your code to gate queries on ingestion completion. 
## Scaling & Cost (AWS) diff --git a/packages/bb-knowledge-base/src/errors.ts b/packages/bb-knowledge-base/src/errors.ts index 8201e519..7859dfb6 100644 --- a/packages/bb-knowledge-base/src/errors.ts +++ b/packages/bb-knowledge-base/src/errors.ts @@ -30,6 +30,6 @@ export const KnowledgeBaseErrors = { BrowserNotSupported: 'BrowserNotSupportedException', /** The data source's most recent Bedrock ingestion job failed. The error message includes the reported `failureReasons`. */ IngestionFailed: 'IngestionFailedException', - /** `waitUntilReady()` exceeded its timeout before the knowledge base finished ingesting. */ + /** `waitUntilSynced()` exceeded its timeout before the knowledge base finished ingesting. */ Timeout: 'KnowledgeBaseTimeoutException', } as const; diff --git a/packages/bb-knowledge-base/src/index.aws.test.ts b/packages/bb-knowledge-base/src/index.aws.test.ts index 95a60648..63a76640 100644 --- a/packages/bb-knowledge-base/src/index.aws.test.ts +++ b/packages/bb-knowledge-base/src/index.aws.test.ts @@ -13,7 +13,7 @@ function mockRuntimeSend(fn: (cmd: unknown) => unknown) { return mock.method(BedrockAgentRuntimeClient.prototype, 'send', fn); } -// Control-plane client used by isReady()/waitUntilReady(). +// Control-plane client used by isSynced()/waitUntilSynced(). function mockAgentSend(fn: (cmd: { constructor: { name: string }; input: any }) => unknown) { return mock.method(BedrockAgentClient.prototype, 'send', fn as (cmd: unknown) => unknown); } @@ -31,8 +31,8 @@ function setKbEnv(scopeId: string, instanceId: string, kbId = 'kb-test-123') { } // Sets KB_ID and (unless dataSourceId is null) DATA_SOURCE_ID, mirroring the -// two config values the CDK layer registers. Used by readiness tests. -function setReadyEnv( +// two config values the CDK layer registers. Used by the sync tests. 
+function setSyncEnv( scopeId: string, instanceId: string, opts: { kbId?: string; dataSourceId?: string | null } = {}, @@ -544,16 +544,16 @@ describe('error classification — other SDK exceptions', () => { }); }); -// ── Readiness — isReady() ────────────────────────────────────────────────── +// ── Sync — isSynced() ──────────────────────────────────────────────────────── // -// Ingestion runs asynchronously after deploy, so isReady() inspects the data -// source's most recent ingestion job: COMPLETE → ready, FAILED → throws, -// anything else (or no jobs / no data source) → not-ready (or ready when there +// Ingestion runs asynchronously after deploy, so isSynced() inspects the data +// source's most recent ingestion job: COMPLETE → synced, FAILED → throws, +// anything else (or no jobs / no data source) → not-synced (or synced when there // is nothing to track). -describe('isReady', () => { +describe('isSynced', () => { test('returns true when the latest ingestion job is COMPLETE', async () => { - const cleanup = setReadyEnv('TEST', 'RDY1'); + const cleanup = setSyncEnv('TEST', 'RDY1'); mockAgentSend((cmd) => { assert.strictEqual(cmd.constructor.name, 'ListIngestionJobsCommand'); return { ingestionJobSummaries: [{ ingestionJobId: 'job-1', status: 'COMPLETE' }] }; @@ -561,56 +561,56 @@ describe('isReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy1', { source: './knowledge' }); - assert.strictEqual(await kb.isReady(), true); + assert.strictEqual(await kb.isSynced(), true); } finally { cleanup(); } }); test('returns false when the latest ingestion job is IN_PROGRESS', async () => { - const cleanup = setReadyEnv('TEST', 'RDY2'); + const cleanup = setSyncEnv('TEST', 'RDY2'); mockAgentSend(() => ({ ingestionJobSummaries: [{ ingestionJobId: 'job-1', status: 'IN_PROGRESS' }] })); try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy2', { source: './knowledge' }); - assert.strictEqual(await kb.isReady(), false); + assert.strictEqual(await 
kb.isSynced(), false); } finally { cleanup(); } }); test('returns false when no ingestion jobs exist yet (empty list)', async () => { - const cleanup = setReadyEnv('TEST', 'RDY3'); + const cleanup = setSyncEnv('TEST', 'RDY3'); mockAgentSend(() => ({ ingestionJobSummaries: [] })); try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy3', { source: './knowledge' }); - assert.strictEqual(await kb.isReady(), false); + assert.strictEqual(await kb.isSynced(), false); } finally { cleanup(); } }); test('returns false when ingestionJobSummaries is undefined', async () => { - const cleanup = setReadyEnv('TEST', 'RDY3B'); + const cleanup = setSyncEnv('TEST', 'RDY3B'); mockAgentSend(() => ({})); try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy3b', { source: './knowledge' }); - assert.strictEqual(await kb.isReady(), false); + assert.strictEqual(await kb.isSynced(), false); } finally { cleanup(); } }); test('returns true (without calling the control plane) when no data source id is configured', async () => { - // A deployment that predates the readiness API: KB_ID present, but no + // A deployment that predates the sync API: KB_ID present, but no // DATA_SOURCE_ID was injected, so there is no ingestion job to track. (The // CDK layer now always registers a DATA_SOURCE_ID for both folder and // imported s3:// sources — see DESIGN.md, the "Source coverage (folder and // imported s3://)" note — so this is purely the pre-feature case, not a // source-type distinction.) 
- const cleanup = setReadyEnv('TEST', 'RDY4', { dataSourceId: null }); + const cleanup = setSyncEnv('TEST', 'RDY4', { dataSourceId: null }); let sendCalled = false; mockAgentSend(() => { sendCalled = true; @@ -619,7 +619,7 @@ describe('isReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy4', { source: './knowledge' }); - assert.strictEqual(await kb.isReady(), true); + assert.strictEqual(await kb.isSynced(), true); assert.strictEqual(sendCalled, false, 'should not query the control plane when there is no data source to track'); } finally { cleanup(); @@ -634,7 +634,7 @@ describe('isReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy5', { source: './knowledge' }); await assert.rejects( - () => kb.isReady(), + () => kb.isSynced(), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); return true; @@ -646,7 +646,7 @@ describe('isReady', () => { }); test('throws IngestionFailed (with failureReasons) when the latest job FAILED', async () => { - const cleanup = setReadyEnv('TEST', 'RDY6'); + const cleanup = setSyncEnv('TEST', 'RDY6'); mockAgentSend((cmd) => { if (cmd.constructor.name === 'ListIngestionJobsCommand') { return { ingestionJobSummaries: [{ ingestionJobId: 'job-x', status: 'FAILED' }] }; @@ -658,7 +658,7 @@ describe('isReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy6', { source: './knowledge' }); await assert.rejects( - () => kb.isReady(), + () => kb.isSynced(), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.IngestionFailed); assert.ok(err.message.includes('boom one'), 'message should include failure reasons'); @@ -672,7 +672,7 @@ describe('isReady', () => { }); test('queries ListIngestionJobs with the configured ids, sorted by STARTED_AT desc, maxResults 1', async () => { - const cleanup = setReadyEnv('TEST', 'RDY7', { kbId: 'kb-aaa', dataSourceId: 'ds-bbb' }); + const cleanup = setSyncEnv('TEST', 'RDY7', { kbId: 'kb-aaa', dataSourceId: 'ds-bbb' }); let 
captured: any; mockAgentSend((cmd) => { captured = cmd.input; @@ -681,7 +681,7 @@ describe('isReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy7', { source: './knowledge' }); - await kb.isReady(); + await kb.isSynced(); assert.strictEqual(captured.knowledgeBaseId, 'kb-aaa'); assert.strictEqual(captured.dataSourceId, 'ds-bbb'); assert.strictEqual(captured.maxResults, 1); @@ -693,7 +693,7 @@ describe('isReady', () => { }); test('maps control-plane ResourceNotFoundException to NotReady', async () => { - const cleanup = setReadyEnv('TEST', 'RDY8'); + const cleanup = setSyncEnv('TEST', 'RDY8'); const err = new Error('No knowledge base with ID kb-test-123 exists'); err.name = 'ResourceNotFoundException'; mockAgentSend(() => { throw err; }); @@ -701,7 +701,7 @@ describe('isReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'rdy8', { source: './knowledge' }); await assert.rejects( - () => kb.isReady(), + () => kb.isSynced(), (e: Error) => { assert.strictEqual(e.name, KnowledgeBaseErrors.NotReady); return true; @@ -713,23 +713,23 @@ describe('isReady', () => { }); }); -// ── Readiness — waitUntilReady() ─────────────────────────────────────────── +// ── Sync — waitUntilSynced() ───────────────────────────────────────────────── -describe('waitUntilReady', () => { +describe('waitUntilSynced', () => { test('resolves immediately when ingestion is already COMPLETE', async () => { - const cleanup = setReadyEnv('TEST', 'WUR1'); + const cleanup = setSyncEnv('TEST', 'WUR1'); mockAgentSend(() => ({ ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'COMPLETE' }] })); try { const kb = new KnowledgeBase({ id: 'test' }, 'wur1', { source: './knowledge' }); - await kb.waitUntilReady({ timeoutMs: 1000, pollIntervalMs: 10 }); + await kb.waitUntilSynced({ timeoutMs: 1000, pollIntervalMs: 10 }); } finally { cleanup(); } }); test('polls until the ingestion job becomes COMPLETE', async () => { - const cleanup = setReadyEnv('TEST', 'WUR2'); + const 
cleanup = setSyncEnv('TEST', 'WUR2'); let calls = 0; mockAgentSend(() => { calls += 1; @@ -739,7 +739,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur2', { source: './knowledge' }); - await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 5 }); + await kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 5 }); assert.ok(calls >= 3, `expected at least 3 polls before COMPLETE, got ${calls}`); } finally { cleanup(); @@ -747,7 +747,7 @@ describe('waitUntilReady', () => { }); test('throws IngestionFailed (with failureReasons) when ingestion FAILED', async () => { - const cleanup = setReadyEnv('TEST', 'WUR3'); + const cleanup = setSyncEnv('TEST', 'WUR3'); mockAgentSend((cmd) => { if (cmd.constructor.name === 'ListIngestionJobsCommand') { return { ingestionJobSummaries: [{ ingestionJobId: 'job-fail', status: 'FAILED' }] }; @@ -758,7 +758,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur3', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 1000, pollIntervalMs: 10 }), + () => kb.waitUntilSynced({ timeoutMs: 1000, pollIntervalMs: 10 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.IngestionFailed); assert.ok(err.message.includes('S3 access denied'), 'should surface failure reasons'); @@ -771,13 +771,13 @@ describe('waitUntilReady', () => { }); test('throws Timeout when the job never completes within the budget', async () => { - const cleanup = setReadyEnv('TEST', 'WUR4'); + const cleanup = setSyncEnv('TEST', 'WUR4'); mockAgentSend(() => ({ ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'IN_PROGRESS' }] })); try { const kb = new KnowledgeBase({ id: 'test' }, 'wur4', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5 }), + () => kb.waitUntilSynced({ timeoutMs: 30, pollIntervalMs: 5 }), (err: Error) => { assert.strictEqual(err.name, 
KnowledgeBaseErrors.Timeout); assert.ok(err.message.includes('30ms'), 'timeout message should include the budget'); @@ -796,7 +796,7 @@ describe('waitUntilReady', () => { }); test('Timeout message surfaces the last transient error when the budget runs out mid-streak', async () => { - const cleanup = setReadyEnv('TEST', 'WUR16'); + const cleanup = setSyncEnv('TEST', 'WUR16'); // Every poll throws a transient (throttling → RetrievalFailed) error, but the // tolerance is high enough that the deadline — not the transient budget — ends // the wait. The Timeout message must then surface the last transient error so a @@ -810,7 +810,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur16', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5, maxConsecutiveTransientErrors: 1000 }), + () => kb.waitUntilSynced({ timeoutMs: 30, pollIntervalMs: 5, maxConsecutiveTransientErrors: 1000 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.Timeout); assert.ok(err.message.includes('30ms'), 'timeout message should include the budget'); @@ -831,7 +831,7 @@ describe('waitUntilReady', () => { }); test('Timeout message stays plain when an early transient blip was cleared by later clean polls', async () => { - const cleanup = setReadyEnv('TEST', 'WUR17'); + const cleanup = setSyncEnv('TEST', 'WUR17'); // Inverse of the mid-streak case (WUR16): the transient blip happens on the // FIRST poll, but every later poll is a clean IN_PROGRESS, so the streak (and // the remembered error) reset well before the deadline. 
The Timeout that @@ -851,7 +851,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur17', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 40, pollIntervalMs: 5 }), + () => kb.waitUntilSynced({ timeoutMs: 40, pollIntervalMs: 5 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.Timeout); assert.ok(calls >= 2, `expected clean polls after the initial blip, got ${calls} call(s)`); @@ -872,11 +872,11 @@ describe('waitUntilReady', () => { }); test('resolves immediately when no data source id is configured', async () => { - // Pre-readiness-API deployment: no DATA_SOURCE_ID injected, so there is + // Pre-sync-API deployment: no DATA_SOURCE_ID injected, so there is // nothing to poll. (Not a source-type distinction — the CDK layer registers // DATA_SOURCE_ID for folder and imported s3:// sources alike; see DESIGN.md, // the "Source coverage (folder and imported s3://)" note.) - const cleanup = setReadyEnv('TEST', 'WUR5', { dataSourceId: null }); + const cleanup = setSyncEnv('TEST', 'WUR5', { dataSourceId: null }); let sendCalled = false; mockAgentSend(() => { sendCalled = true; @@ -885,7 +885,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur5', { source: './knowledge' }); - await kb.waitUntilReady({ timeoutMs: 30, pollIntervalMs: 5 }); + await kb.waitUntilSynced({ timeoutMs: 30, pollIntervalMs: 5 }); assert.strictEqual(sendCalled, false, 'should not poll the control plane when there is nothing to track'); } finally { cleanup(); @@ -893,7 +893,7 @@ describe('waitUntilReady', () => { }); test('tolerates a transient control-plane error, then resolves once COMPLETE', async () => { - const cleanup = setReadyEnv('TEST', 'WUR6'); + const cleanup = setSyncEnv('TEST', 'WUR6'); let calls = 0; mockAgentSend(() => { calls += 1; @@ -908,7 +908,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur6', 
{ source: './knowledge' }); - await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1 }); + await kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1 }); assert.ok(calls >= 2, `expected a retry after the transient blip, got ${calls} call(s)`); } finally { cleanup(); @@ -916,7 +916,7 @@ describe('waitUntilReady', () => { }); test('throws once consecutive transient errors exceed the tolerance', async () => { - const cleanup = setReadyEnv('TEST', 'WUR7'); + const cleanup = setSyncEnv('TEST', 'WUR7'); let calls = 0; mockAgentSend(() => { calls += 1; @@ -928,7 +928,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur7', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 2 }), + () => kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 2 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.RetrievalFailed); return true; @@ -942,7 +942,7 @@ describe('waitUntilReady', () => { }); test('short-circuits immediately on IngestionFailed (never retried as transient)', async () => { - const cleanup = setReadyEnv('TEST', 'WUR8'); + const cleanup = setSyncEnv('TEST', 'WUR8'); let listCalls = 0; mockAgentSend((cmd) => { if (cmd.constructor.name === 'ListIngestionJobsCommand') { @@ -955,7 +955,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur8', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), + () => kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.IngestionFailed); return true; @@ -980,7 +980,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur9', { source: './knowledge' }); await 
assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), + () => kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); return true; @@ -993,7 +993,7 @@ describe('waitUntilReady', () => { }); test('resets the transient-error counter after a clean poll', async () => { - const cleanup = setReadyEnv('TEST', 'WUR10'); + const cleanup = setSyncEnv('TEST', 'WUR10'); // transient → clean (IN_PROGRESS) → transient → COMPLETE. With tolerance 1 this // only succeeds if the counter resets after the clean poll — otherwise the second // transient error would be the 2nd consecutive failure and exceed the limit. @@ -1011,7 +1011,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur10', { source: './knowledge' }); - await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 1 }); + await kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 1 }); assert.strictEqual(i, 4, 'should consume the full transient/clean/transient/complete sequence'); } finally { cleanup(); @@ -1024,7 +1024,7 @@ describe('waitUntilReady', () => { // (thrown directly by ensureKbId, no `cause`) stays terminal. 
test('tolerates a transient control-plane ResourceNotFoundException (KB not yet visible), then resolves once COMPLETE', async () => { - const cleanup = setReadyEnv('TEST', 'WUR11'); + const cleanup = setSyncEnv('TEST', 'WUR11'); let calls = 0; mockAgentSend(() => { calls += 1; @@ -1041,7 +1041,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur11', { source: './knowledge' }); - await kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1 }); + await kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1 }); assert.ok(calls >= 2, `expected a retry after the not-yet-visible blip, got ${calls} call(s)`); } finally { cleanup(); @@ -1049,7 +1049,7 @@ describe('waitUntilReady', () => { }); test('throws once consecutive control-plane ResourceNotFound errors exceed the tolerance', async () => { - const cleanup = setReadyEnv('TEST', 'WUR12'); + const cleanup = setSyncEnv('TEST', 'WUR12'); let calls = 0; mockAgentSend(() => { calls += 1; @@ -1061,7 +1061,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur12', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 2 }), + () => kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 2 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); assert.strictEqual( @@ -1092,7 +1092,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur13', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), + () => kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 1, maxConsecutiveTransientErrors: 5 }), (err: Error) => { assert.strictEqual(err.name, KnowledgeBaseErrors.NotReady); // The cause-based classification hinges on this: ensureKbId() throws @@ -1111,7 +1111,7 @@ 
describe('waitUntilReady', () => { // Cancellation via AbortSignal — checked before each poll and during the inter-poll sleep. test('rejects immediately when the signal is already aborted (no polling)', async () => { - const cleanup = setReadyEnv('TEST', 'WUR14'); + const cleanup = setSyncEnv('TEST', 'WUR14'); let sendCalled = false; mockAgentSend(() => { sendCalled = true; @@ -1121,7 +1121,7 @@ describe('waitUntilReady', () => { try { const kb = new KnowledgeBase({ id: 'test' }, 'wur14', { source: './knowledge' }); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 5000, pollIntervalMs: 5, signal: AbortSignal.abort() }), + () => kb.waitUntilSynced({ timeoutMs: 5000, pollIntervalMs: 5, signal: AbortSignal.abort() }), (err: Error) => { assert.strictEqual(err.name, 'AbortError', 'default abort reason is a DOMException named AbortError'); return true; @@ -1134,12 +1134,12 @@ describe('waitUntilReady', () => { }); test('aborts during the inter-poll delay and rejects with the supplied abort reason', async () => { - const cleanup = setReadyEnv('TEST', 'WUR15'); + const cleanup = setSyncEnv('TEST', 'WUR15'); const controller = new AbortController(); let calls = 0; mockAgentSend(() => { calls += 1; - // Always "still warming" so the wait reaches the inter-poll sleep, where + // Always "not synced yet" so the wait reaches the inter-poll sleep, where // the abort fired below interrupts it. 
return { ingestionJobSummaries: [{ ingestionJobId: 'j', status: 'IN_PROGRESS' }] }; }); @@ -1149,7 +1149,7 @@ describe('waitUntilReady', () => { const reason = new Error('caller cancelled'); setTimeout(() => controller.abort(reason), 20).unref?.(); await assert.rejects( - () => kb.waitUntilReady({ timeoutMs: 60_000, pollIntervalMs: 50, signal: controller.signal }), + () => kb.waitUntilSynced({ timeoutMs: 60_000, pollIntervalMs: 50, signal: controller.signal }), (err: Error) => { assert.strictEqual(err, reason, 'should reject with the exact reason passed to abort()'); return true; diff --git a/packages/bb-knowledge-base/src/index.aws.ts b/packages/bb-knowledge-base/src/index.aws.ts index b7caf099..521b996a 100644 --- a/packages/bb-knowledge-base/src/index.aws.ts +++ b/packages/bb-knowledge-base/src/index.aws.ts @@ -20,7 +20,7 @@ import type { RetrieveOptions, RetrieveResult, MetadataFilter, - WaitUntilReadyOptions, + WaitUntilSyncedOptions, } from './types.js'; import { KnowledgeBaseErrors } from './errors.js'; import { BB_NAME, BB_VERSION } from './version.js'; @@ -35,7 +35,7 @@ export type { RetrieveOptions, RetrieveResult, MetadataFilter, - WaitUntilReadyOptions, + WaitUntilSyncedOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -57,8 +57,8 @@ function blocksError(name: string, message: string): Error { } /** - * Resolve after `ms` milliseconds. Used to space out readiness polls in - * `waitUntilReady()`. If an {@link AbortSignal} is supplied and fires (or is + * Resolve after `ms` milliseconds. Used to space out the sync polls in + * `waitUntilSynced()`. If an {@link AbortSignal} is supplied and fires (or is * already aborted), the returned promise rejects promptly with the signal's * abort reason instead of waiting out the full delay. 
*/ @@ -147,8 +147,8 @@ function mapSdkError(err: unknown): Error { } /** - * Whether a mapped readiness error is a *transient* control-plane failure worth - * a bounded retry in {@link KnowledgeBase.waitUntilReady}, rather than a terminal + * Whether a mapped sync-poll error is a *transient* control-plane failure worth + * a bounded retry in {@link KnowledgeBase.waitUntilSynced}, rather than a terminal * one that should short-circuit the wait. * * Two cases are transient: @@ -159,7 +159,7 @@ function mapSdkError(err: unknown): Error { * isn't visible yet); {@link mapSdkError} maps that to `KnowledgeBaseNotReadyException` * **with the original SDK error attached as the non-enumerable `cause`**. Detect * it via `cause.name === 'ResourceNotFoundException'` and ride it out — that is - * the entire purpose of `waitUntilReady()`. + * the entire purpose of `waitUntilSynced()`. * * Everything else is terminal and short-circuits immediately: the `NotReady` * raised for an unset `KB_ID` config is thrown directly by `ensureKbId()` (so it @@ -222,15 +222,15 @@ function buildFilter(filter?: MetadataFilter): RetrievalFilter | undefined { * * **Environment variables (injected by CDK):** * - `BLOCKS_{FULLID}_KB_ID` — Bedrock Knowledge Base ID - * - `BLOCKS_{FULLID}_DATA_SOURCE_ID` — Bedrock data source ID (used by `isReady()` / `waitUntilReady()`) + * - `BLOCKS_{FULLID}_DATA_SOURCE_ID` — Bedrock data source ID (used by `isSynced()` / `waitUntilSynced()`) */ export class KnowledgeBase extends Scope { readonly bbName = BB_NAME; private readonly fullIdCached: string; private readonly runtimeClient: BedrockAgentRuntimeClient; - // Control-plane client for ingestion-job status (readiness checks). Created - // lazily on first readiness call via getAgentClient() so instances that only - // ever retrieve() (or never check readiness) don't allocate it. + // Control-plane client for ingestion-job status (sync checks). 
Created + // lazily on first sync call via getAgentClient() so instances that only + // ever retrieve() (or never check sync state) don't allocate it. private agentClient?: BedrockAgentClient; /** @internal Logger for internal operations. Defaults to error-level when not provided. */ @@ -267,9 +267,9 @@ export class KnowledgeBase extends Scope { * missing data source id is a valid state that this simply reports as * `undefined`. Both folder and imported `s3://` sources register a BB-managed * data source id at deploy time, so this normally returns a value for either - * source type. It is `undefined` only for deployments that predate the readiness + * source type. It is `undefined` only for deployments that predate the sync * API (no `DATA_SOURCE_ID` injected) — in which case there is no ingestion job - * to track and callers treat the KB as ready. + * to track and callers treat the KB as synced. */ private getDataSourceId(): string | undefined { const dataSourceId = getSdkIdentifiers(this).dataSourceId; @@ -335,23 +335,41 @@ export class KnowledgeBase extends Scope { } /** - * Report whether the knowledge base has finished ingesting and is ready to - * serve `retrieve()` calls. + * Report whether the knowledge base is **synced with your latest data** — + * i.e. its most recent Bedrock ingestion job has reached `COMPLETE`. Mirrors + * the "Sync" state Bedrock surfaces in the console. * * Bedrock ingestion runs asynchronously after deploy (it is triggered - * fire-and-forget), so during the warm-up window `retrieve()` returns an - * empty array even for queries that will later match. Use `isReady()` to - * distinguish "still warming up" (`false`) from "ingested, genuinely no - * match" (`true` alongside an empty `retrieve()` result). + * fire-and-forget), so on a first deploy `retrieve()` returns an empty array + * during the initial pre-sync window even for queries that will later match. 
+ * Use `isSynced()` to distinguish "not synced with your latest data yet" + * (`false`) from "synced, genuinely no match" (`true` alongside an empty + * `retrieve()` result). + * + * **Freshness, not availability.** This reports freshness, not reachability. + * Once the first ingestion has completed, `retrieve()` stays queryable + * throughout any subsequent re-ingestion — Bedrock keeps serving the prior + * snapshot while it re-indexes, it does not go dark. So `false` during a + * re-sync means "your newest documents aren't indexed yet", **not** "the KB + * is unavailable"; a caller that gates every `retrieve()` on `isSynced()` + * would back off unnecessarily on each document-update cycle even though the + * previous snapshot is fully queryable. * * Resolution strategy: lists the data source's ingestion jobs (most recent - * first) and inspects the latest job's status — `COMPLETE` → ready, + * first) and inspects the latest job's status — `COMPLETE` → synced, * `FAILED` → throws, anything else (`STARTING` / `IN_PROGRESS`, or no jobs - * yet) → not ready. Both folder and imported `s3://` sources register a + * yet) → not synced. Both folder and imported `s3://` sources register a * BB-managed data source id, so both are tracked here; the "no data source - * id configured → reported ready" shortcut applies only to deployments that + * id configured → reported synced" shortcut applies only to deployments that * predate this API (no `DATA_SOURCE_ID` injected — nothing to track). * + * **Embedding-propagation lag.** `COMPLETE` reflects that the ingestion job + * finished. For non-Aurora vector stores — this Building Block uses S3 + * Vectors — AWS notes embeddings can take a few more minutes to become + * queryable after the job completes, so `isSynced() === true` means the job + * completed, with a possible short propagation lag before the newest chunks + * surface in `retrieve()`. 
+ * * @returns `true` when the latest ingestion job is `COMPLETE` (or there is * no managed data source to track); `false` while ingestion is pending. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). @@ -359,7 +377,7 @@ export class KnowledgeBase extends Scope { * For mapped Bedrock control-plane errors. Two distinct conditions map to `NotReady`: * the `KB_ID` env var being unset (a *config* error thrown directly, so it carries no * `cause`), and a control-plane `ResourceNotFoundException` (a *not-yet-visible* KB, - * mapped with the SDK error as `cause`). {@link waitUntilReady} relies on that + * mapped with the SDK error as `cause`). {@link waitUntilSynced} relies on that * distinction: it rides out the not-yet-visible case as transient but treats the * unset-`KB_ID` config error as terminal. A control-plane `ValidationException` maps to * `KnowledgeBaseValidationError` (or `InvalidFilterException`); any other SDK error @@ -368,19 +386,19 @@ export class KnowledgeBase extends Scope { * * @example * ```typescript - * if (await kb.isReady()) { + * if (await kb.isSynced()) { * const results = await kb.retrieve('how do I reset my password'); * } * ``` */ - async isReady(): Promise { + async isSynced(): Promise { const knowledgeBaseId = this.ensureKbId(); const dataSourceId = this.getDataSourceId(); // No BB-managed ingestion to track → nothing to wait for. if (!dataSourceId) return true; const job = await this.fetchLatestIngestionJob(knowledgeBaseId, dataSourceId); - // No ingestion job recorded yet → ingestion has not started; still warming. + // No ingestion job recorded yet → ingestion has not started; not synced yet. if (!job) return false; if (job.status === 'COMPLETE') return true; @@ -391,15 +409,16 @@ export class KnowledgeBase extends Scope { `Knowledge base ingestion failed.${reasons.length ? 
` Reasons: ${reasons.join('; ')}` : ''}`, ); } - // STARTING | IN_PROGRESS | STOPPING | STOPPED → not ready. + // STARTING | IN_PROGRESS | STOPPING | STOPPED → not synced yet. return false; } /** - * Wait until the knowledge base has finished ingesting, polling its - * ingestion-job status until ready or until the timeout elapses. + * Wait until the knowledge base is **synced with your latest data** (its most + * recent ingestion job reaches `COMPLETE`), polling the ingestion-job status + * until synced or until the timeout elapses. * - * Polls {@link isReady} every `pollIntervalMs` until it returns `true` + * Polls {@link isSynced} every `pollIntervalMs` until it returns `true` * (resolves) or the `timeoutMs` budget is exhausted (throws). If the most * recent ingestion job has `FAILED`, the underlying `IngestionFailedException` * propagates immediately rather than waiting out the timeout. @@ -426,16 +445,16 @@ export class KnowledgeBase extends Scope { * and during the inter-poll delay, rejecting promptly with the signal's abort * reason (default: a `DOMException` named `'AbortError'`). * - * @param {WaitUntilReadyOptions} options - Optional polling parameters. + * @param {WaitUntilSyncedOptions} options - Optional polling parameters. * `timeoutMs` (default 300000) bounds the total wait; `pollIntervalMs` * (default 5000, clamped to a minimum of 1ms, ±20% jitter) spaces out the * polls; `maxConsecutiveTransientErrors` (default 3, minimum 0) bounds how * many consecutive transient control-plane errors are tolerated before * giving up; `signal` (optional `AbortSignal`) cancels the wait. - * @throws {KnowledgeBaseTimeoutException} If the KB does not become ready within `timeoutMs`. + * @throws {KnowledgeBaseTimeoutException} If the KB does not sync within `timeoutMs`. * @throws {IngestionFailedException} If the most recent ingestion job failed (message includes `failureReasons`). 
* @throws {KnowledgeBaseNotReadyException | KnowledgeBaseValidationError | InvalidFilterException | RetrievalFailedException} - * Propagated from {@link isReady} for mapped Bedrock control-plane errors — see its docs + * Propagated from {@link isSynced} for mapped Bedrock control-plane errors — see its docs * for the full mapping (`ResourceNotFoundException`/unset `KB_ID` → `NotReady`, * `ValidationException` → `KnowledgeBaseValidationError`/`InvalidFilterException`, * other SDK errors → `RetrievalFailedException`). Transient errors (a `RetrievalFailedException`, @@ -446,14 +465,14 @@ export class KnowledgeBase extends Scope { * @example * ```typescript * // Block until the KB is queryable (e.g. right after deploy) - * await kb.waitUntilReady({ timeoutMs: 600_000 }); + * await kb.waitUntilSynced({ timeoutMs: 600_000 }); * const results = await kb.retrieve('getting started'); * * // With cancellation (e.g. an overall request deadline) - * await kb.waitUntilReady({ signal: AbortSignal.timeout(120_000) }); + * await kb.waitUntilSynced({ signal: AbortSignal.timeout(120_000) }); * ``` */ - async waitUntilReady(options?: WaitUntilReadyOptions): Promise { + async waitUntilSynced(options?: WaitUntilSyncedOptions): Promise { const timeoutMs = Math.max(options?.timeoutMs ?? 300_000, 0); const pollIntervalMs = Math.max(options?.pollIntervalMs ?? 5_000, 1); const maxConsecutiveTransientErrors = Math.max(options?.maxConsecutiveTransientErrors ?? 3, 0); @@ -467,11 +486,11 @@ export class KnowledgeBase extends Scope { // already-aborted signal throws here on the very first pass (no poll). signal?.throwIfAborted(); try { - // isReady() resolves true (ready) / false (still warming), throws + // isSynced() resolves true (synced) / false (not synced yet), throws // IngestionFailedException on a FAILED job, NotReady when the KB is // not deployed (or briefly not-yet-visible), or RetrievalFailedException // for transient blips. 
- if (await this.isReady()) return; + if (await this.isSynced()) return; // A clean poll clears any transient-error streak — reset the remembered // error alongside the counter so a later Timeout can only ever fold in a // transient from the streak still in flight at the deadline, never a stale @@ -487,9 +506,19 @@ export class KnowledgeBase extends Scope { // Transient control-plane blip: absorb a bounded run, then give up. consecutiveTransientErrors += 1; lastTransient = err; - if (consecutiveTransientErrors > maxConsecutiveTransientErrors) throw err; + if (consecutiveTransientErrors > maxConsecutiveTransientErrors) { + // Distinct from a Timeout on a healthy-but-still-ingesting KB: log that + // the transient tolerance was exhausted before rethrowing, so "gave up + // after N consecutive control-plane errors" is greppable in CloudWatch + // and not mistaken for a KB that simply never finished syncing. + this.log.warn( + `waitUntilSynced: giving up after ${consecutiveTransientErrors} consecutive transient ` + + `control-plane error(s) — tolerance (${maxConsecutiveTransientErrors}) exhausted: ${err.message}`, + ); + throw err; + } this.log.warn( - `waitUntilReady: tolerating transient control-plane error ` + + `waitUntilSynced: tolerating transient control-plane error ` + `(${consecutiveTransientErrors}/${maxConsecutiveTransientErrors}), retrying: ${err.message}`, ); } @@ -498,7 +527,7 @@ export class KnowledgeBase extends Scope { // control-plane errors, fold the most recent one into the message. // Otherwise a timeout reads like a healthy KB that just never finished // ingesting, hiding that the final polls were actually failing transiently. 
- const base = `Knowledge base did not become ready within ${timeoutMs}ms`; + const base = `Knowledge base did not sync within ${timeoutMs}ms`; throw blocksError( KnowledgeBaseErrors.Timeout, consecutiveTransientErrors > 0 && lastTransient @@ -514,9 +543,9 @@ export class KnowledgeBase extends Scope { /** * Lazily construct (and memoize) the Bedrock control-plane client used for - * ingestion-job status during readiness checks. Built on first use rather + * ingestion-job status during sync checks. Built on first use rather * than in the constructor so instances that only ever call {@link retrieve} - * — or never check readiness at all — don't allocate a client they won't use. + * — or never check sync state at all — don't allocate a client they won't use. * Subsequent calls return the cached instance. */ private getAgentClient(): BedrockAgentClient { @@ -551,7 +580,14 @@ export class KnowledgeBase extends Scope { return response.ingestionJobSummaries?.[0]; } catch (err) { const mapped = mapSdkError(err); - this.log.error(mapped.message); + // Logged at debug, not error: this path fires for the transient control-plane + // blips (throttling → RetrievalFailed, a not-yet-visible KB → NotReady) that + // waitUntilSynced() is designed to absorb and retry during the post-deploy + // warm-up window — emitting them at error produced spurious CloudWatch ERROR + // entries during expected behavior. waitUntilSynced() owns the operator signal + // (its own warn at the retry/give-up sites); a direct isSynced() caller receives + // the thrown mapped error and owns how to surface it. 
+ this.log.debug(mapped.message); throw mapped; } } diff --git a/packages/bb-knowledge-base/src/index.browser.ts b/packages/bb-knowledge-base/src/index.browser.ts index c31e1e3b..f66cb8e0 100644 --- a/packages/bb-knowledge-base/src/index.browser.ts +++ b/packages/bb-knowledge-base/src/index.browser.ts @@ -2,14 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 import type { ScopeParent } from '@aws-blocks/core'; -import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, WaitUntilReadyOptions } from './types.js'; +import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, WaitUntilSyncedOptions } from './types.js'; import { KnowledgeBaseErrors } from './errors.js'; export type { KnowledgeBaseOptions, SourceConfig, ChunkingConfig, ChunkingStrategy, RetrieveOptions, RetrieveResult, - MetadataFilter, WaitUntilReadyOptions, + MetadataFilter, WaitUntilSyncedOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -41,11 +41,11 @@ export class KnowledgeBase { throw browserError(); } - async isReady(): Promise { + async isSynced(): Promise { throw browserError(); } - async waitUntilReady(_options?: WaitUntilReadyOptions): Promise { + async waitUntilSynced(_options?: WaitUntilSyncedOptions): Promise { throw browserError(); } } diff --git a/packages/bb-knowledge-base/src/index.cdk.test.ts b/packages/bb-knowledge-base/src/index.cdk.test.ts index 7f5838a6..b0a12e41 100644 --- a/packages/bb-knowledge-base/src/index.cdk.test.ts +++ b/packages/bb-knowledge-base/src/index.cdk.test.ts @@ -10,13 +10,13 @@ * CloudFormation `DeletionPolicy` and leaked. Those tests pin the fix: the * vector resources now mirror the data bucket's removal policy. 
* - * Ingestion readiness: the handler role must be able to read ingestion-job + * Ingestion sync: the handler role must be able to read ingestion-job * status (`bedrock:GetIngestionJob` / `bedrock:ListIngestionJobs`) — scoped to * the KB ARN like the existing `bedrock:Retrieve` grant — and the - * `DATA_SOURCE_ID` config the runtime readiness checks rely on must be + * `DATA_SOURCE_ID` config the runtime sync checks rely on must be * registered and surface in the synthesized template. * - * Synth guards: the runtime methods (`retrieve` / `isReady` / `waitUntilReady`) + * Synth guards: the runtime methods (`retrieve` / `isSynced` / `waitUntilSynced`) * are stubbed on the CDK construct so an accidental synth-time call throws an * actionable error instead of a cryptic `TypeError: not a function`. */ @@ -57,7 +57,7 @@ class StubBlocksStack extends cdk.Stack { } } -function buildStack(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: boolean } = {}): { +function buildStack(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: boolean; source?: string } = {}): { stack: StubBlocksStack; kb: KnowledgeBase; } { @@ -67,7 +67,7 @@ function buildStack(options: { removalPolicy?: 'destroy' | 'retain'; sandbox?: b const stack = new StubBlocksStack(app, 'teststack'); const parent = new Scope('app'); const kb = new KnowledgeBase(parent, 'docs', { - source: FIXTURES, + source: options.source ?? FIXTURES, ...(options.removalPolicy ? { removalPolicy: options.removalPolicy } : {}), }); return { stack, kb }; @@ -112,7 +112,7 @@ test('CDK: sandboxMode context defaults the data bucket + vector store to destro test('CDK: handler role can read ingestion-job status (GetIngestionJob/ListIngestionJobs), scoped to the KB ARN like bedrock:Retrieve', () => { const template = synth(); - // isReady()/waitUntilReady() poll ingestion-job status — the handler role + // isSynced()/waitUntilSynced() poll ingestion-job status — the handler role // needs both actions, granted as Allow. 
template.hasResourceProperties('AWS::IAM::Policy', { PolicyDocument: Match.objectLike({ @@ -148,7 +148,7 @@ test('CDK: registers the DATA_SOURCE_ID config (wired to the data source) and su const { stack } = buildStack(); // registerConfig records BLOCKS_{FULLID}_DATA_SOURCE_ID on the stack's config - // registry, bound to the Bedrock data source's id — the runtime readiness + // registry, bound to the Bedrock data source's id — the runtime sync // checks read it back at cold start. (Mirrors bb-app-setting's CDK test.) const registry = (stack as any)[Symbol.for('BLOCKS_CONFIG_REGISTRY')] as | { entries: Map } @@ -195,10 +195,83 @@ test('CDK: registers the DATA_SOURCE_ID config (wired to the data source) and su }); }); +// ── S3 URI (imported bucket) source ───────────────────────────────────────── +// An imported s3:// source skips the documents BucketDeployment (the objects +// already live in the bucket) but still provisions a BB-managed CfnDataSource +// and fires the ingestion job — so the runtime sync grants and DATA_SOURCE_ID +// wiring must be present exactly as they are for a local-folder source (see +// DESIGN.md, "Source coverage (folder and imported s3://)"). +const S3_SOURCE = 's3://my-docs-bucket'; + +test('CDK (s3:// source): handler still gets bedrock:Retrieve + ingestion-status grants scoped to the KB ARN', () => { + const { stack } = buildStack({ source: S3_SOURCE }); + const template = Template.fromStack(stack); + + // Imported bucket → no documents BucketDeployment (proves the s3:// branch is + // taken, not the folder path; finalizeConfigRegistry isn't called here). + template.resourceCountIs('Custom::CDKBucketDeployment', 0); + + // Same ingestion-status grant as a folder source: both actions, granted Allow. 
+ template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: Match.objectLike({ + Statement: Match.arrayWith([ + Match.objectLike({ + Action: ['bedrock:GetIngestionJob', 'bedrock:ListIngestionJobs'], + Effect: 'Allow', + }), + ]), + }), + }); + + // ...scoped to the SAME knowledge-base ARN as the existing bedrock:Retrieve + // grant (not a wildcard) — ingestion jobs are sub-resources of the KB ARN. + const statements = Object.values(template.findResources('AWS::IAM::Policy')).flatMap( + (policy) => policy.Properties.PolicyDocument.Statement as Array>, + ); + const retrieveStmt = statements.find((s) => s.Action === 'bedrock:Retrieve'); + const ingestionStmt = statements.find( + (s) => Array.isArray(s.Action) && (s.Action as string[]).includes('bedrock:GetIngestionJob'), + ); + assert.ok(retrieveStmt, 'bedrock:Retrieve grant is present for an s3:// source'); + assert.ok(ingestionStmt, 'ingestion-status grant is present for an s3:// source'); + assert.deepStrictEqual( + ingestionStmt.Resource, + retrieveStmt.Resource, + 'ingestion-status grant is scoped to the same KB ARN as bedrock:Retrieve', + ); +}); + +test('CDK (s3:// source): DATA_SOURCE_ID config is wired to the data source id (same as a folder source)', () => { + const { stack } = buildStack({ source: S3_SOURCE }); + + // Even though the bucket is imported, the construct still registers + // BLOCKS_{FULLID}_DATA_SOURCE_ID bound to the Bedrock data source's id, so the + // runtime isSynced()/waitUntilSynced() checks track the imported source's + // ingestion job exactly as they do for a local folder. 
+ const registry = (stack as any)[Symbol.for('BLOCKS_CONFIG_REGISTRY')] as + | { entries: Map } + | undefined; + assert.ok(registry, 'config registry exists on the stack'); + + const dataSourceKey = [...registry.entries.keys()].find((k) => k.endsWith('_DATA_SOURCE_ID')); + assert.ok(dataSourceKey, 'a *_DATA_SOURCE_ID config key is registered for an s3:// source'); + assert.match(dataSourceKey, /^BLOCKS_.+_DATA_SOURCE_ID$/); + + const resolvedValue = stack.resolve(registry.entries.get(dataSourceKey)) as { + 'Fn::GetAtt'?: [string, string]; + }; + assert.ok(resolvedValue['Fn::GetAtt'], 'config value is a CDK token (Fn::GetAtt)'); + assert.strictEqual( + resolvedValue['Fn::GetAtt'][1], + 'DataSourceId', + 'config value is wired to the data source id even when the source is an S3 URI', + ); +}); + test('CDK: calling a runtime method throws an actionable synth-time error (not a cryptic TypeError)', () => { const { kb } = buildStack(); const construct = kb as unknown as Record unknown>; - for (const method of ['retrieve', 'isReady', 'waitUntilReady']) { + for (const method of ['retrieve', 'isSynced', 'waitUntilSynced']) { assert.throws( () => construct[method]('x'), /cannot be called during CDK synth/, diff --git a/packages/bb-knowledge-base/src/index.cdk.ts b/packages/bb-knowledge-base/src/index.cdk.ts index 6aeb5add..9d9b409c 100644 --- a/packages/bb-knowledge-base/src/index.cdk.ts +++ b/packages/bb-knowledge-base/src/index.cdk.ts @@ -19,7 +19,7 @@ export type { KnowledgeBaseOptions, SourceConfig, ChunkingConfig, ChunkingStrategy, RetrieveOptions, RetrieveResult, - MetadataFilter, WaitUntilReadyOptions, + MetadataFilter, WaitUntilSyncedOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -176,11 +176,11 @@ function generateMetadataSidecars(sourceDir: string): string | undefined { * * **Environment variables injected into the handler:** * - `BLOCKS_{FULLID}_KB_ID` — Bedrock Knowledge Base ID (used by the AWS runtime) - * - 
`BLOCKS_{FULLID}_DATA_SOURCE_ID` — Bedrock data source ID (used by `isReady()` / `waitUntilReady()`) + * - `BLOCKS_{FULLID}_DATA_SOURCE_ID` — Bedrock data source ID (used by `isSynced()` / `waitUntilSynced()`) * * **IAM grants to the handler:** * - `bedrock:Retrieve` — query the knowledge base at runtime - * - `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` — check ingestion readiness + * - `bedrock:GetIngestionJob`, `bedrock:ListIngestionJobs` — poll ingestion-job sync status * * @param scope - Parent scope. * @param id - Unique identifier within the scope. @@ -452,7 +452,7 @@ export class KnowledgeBase extends Scope { // ── 8. Handler config (read by the AWS runtime) ─────────────────── // Registered via registerConfig (not addEnvironment) so the runtime can // locate the Bedrock resources. KB_ID drives retrieve(); DATA_SOURCE_ID - // drives the isReady()/waitUntilReady() ingestion-readiness checks. + // drives the isSynced()/waitUntilSynced() ingestion-sync checks. registerConfig(this, envKey(this.fullId, 'KB_ID'), knowledgeBase.attrKnowledgeBaseId); registerConfig(this, envKey(this.fullId, 'DATA_SOURCE_ID'), dataSource.attrDataSourceId); @@ -471,7 +471,7 @@ export class KnowledgeBase extends Scope { resources: [knowledgeBaseArn], })); - // Ingestion-job status for isReady()/waitUntilReady(). These actions are + // Ingestion-job status for isSynced()/waitUntilSynced(). These actions are // authorized at the knowledge-base resource level (the data source and // ingestion jobs are sub-resources of the KB ARN). this.handler.addToRolePolicy(new iam.PolicyStatement({ @@ -482,11 +482,11 @@ export class KnowledgeBase extends Scope { // ── Runtime methods are not available during CDK synth ──────────────── // Under `--conditions=cdk` a KnowledgeBase resolves to this construct, which - // only provisions infrastructure. The data/readiness methods (retrieve/ - // isReady/waitUntilReady) live in the runtime build. 
Calling them at module + // only provisions infrastructure. The data/sync methods (retrieve/ + // isSynced/waitUntilSynced) live in the runtime build. Calling them at module // top-level (which runs during synth) would otherwise fail with a cryptic // `X is not a function`; these stubs turn that into an actionable message. retrieve(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'retrieve'); } - isReady(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'isReady'); } - waitUntilReady(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'waitUntilReady'); } + isSynced(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'isSynced'); } + waitUntilSynced(..._args: unknown[]): never { return synthGuard('KnowledgeBase', 'waitUntilSynced'); } } diff --git a/packages/bb-knowledge-base/src/index.mock.test.ts b/packages/bb-knowledge-base/src/index.mock.test.ts index f4ed132e..b51cee26 100644 --- a/packages/bb-knowledge-base/src/index.mock.test.ts +++ b/packages/bb-knowledge-base/src/index.mock.test.ts @@ -1032,31 +1032,31 @@ describe('unicode / multilingual retrieval', () => { }); }); -// ── Readiness (local dev: no warm-up window) ──────────────────────────────── +// ── Sync (local dev: no async ingestion window) ───────────────────────────── // // The local corpus loads synchronously on first retrieve(), so there is no -// asynchronous ingestion warm-up — isReady() is always true and -// waitUntilReady() resolves immediately (options are ignored). +// asynchronous ingestion window — isSynced() is always true and +// waitUntilSynced() resolves immediately (options are ignored). 
-describe('readiness', () => { - test('isReady() resolves true immediately', async () => { - const kb = new KnowledgeBase({ id: 'test' }, 'ready', { source: 'test-knowledge-tmp' }); - assert.strictEqual(await kb.isReady(), true); +describe('sync', () => { + test('isSynced() resolves true immediately', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'synced', { source: 'test-knowledge-tmp' }); + assert.strictEqual(await kb.isSynced(), true); }); - test('waitUntilReady() resolves immediately', async () => { - const kb = new KnowledgeBase({ id: 'test' }, 'waitready', { source: 'test-knowledge-tmp' }); - await kb.waitUntilReady(); + test('waitUntilSynced() resolves immediately', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'waitsynced', { source: 'test-knowledge-tmp' }); + await kb.waitUntilSynced(); }); - test('isReady() is true even for an S3 URI source (no local warm-up)', async () => { - const kb = new KnowledgeBase({ id: 'test' }, 'readys3', { source: 's3://my-docs-bucket' }); - assert.strictEqual(await kb.isReady(), true); + test('isSynced() is true even for an S3 URI source (no local ingestion window)', async () => { + const kb = new KnowledgeBase({ id: 'test' }, 'synceds3', { source: 's3://my-docs-bucket' }); + assert.strictEqual(await kb.isSynced(), true); }); - test('waitUntilReady() ignores options and resolves immediately', async () => { + test('waitUntilSynced() ignores options and resolves immediately', async () => { const kb = new KnowledgeBase({ id: 'test' }, 'waitopts', { source: 'test-knowledge-tmp' }); - await kb.waitUntilReady({ timeoutMs: 1, pollIntervalMs: 1 }); + await kb.waitUntilSynced({ timeoutMs: 1, pollIntervalMs: 1 }); }); }); diff --git a/packages/bb-knowledge-base/src/index.mock.ts b/packages/bb-knowledge-base/src/index.mock.ts index eab04857..44ebeb70 100644 --- a/packages/bb-knowledge-base/src/index.mock.ts +++ b/packages/bb-knowledge-base/src/index.mock.ts @@ -8,7 +8,7 @@ import { existsSync, readFileSync, 
writeFileSync, mkdirSync, readdirSync, statSy import { join, relative, dirname, extname, resolve, sep } from 'node:path'; import { createHash } from 'node:crypto'; import { buildIndex, search, type TfIdfIndex } from './tfidf.js'; -import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, ChunkingStrategy, WaitUntilReadyOptions } from './types.js'; +import type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, ChunkingStrategy, WaitUntilSyncedOptions } from './types.js'; import { KnowledgeBaseErrors } from './errors.js'; import { Logger } from '@aws-blocks/bb-logger'; import type { ChildLogger } from '@aws-blocks/bb-logger'; @@ -22,7 +22,7 @@ export type { RetrieveOptions, RetrieveResult, MetadataFilter, - WaitUntilReadyOptions, + WaitUntilSyncedOptions, } from './types.js'; export { KnowledgeBaseErrors } from './errors.js'; @@ -253,29 +253,30 @@ export class KnowledgeBase extends Scope { } /** - * Report whether the knowledge base is ready to serve `retrieve()` calls. + * Report whether the knowledge base is synced with your latest data. * - * Local development has no asynchronous ingestion warm-up window — the - * corpus is read and indexed synchronously on the first `retrieve()` — so - * this always resolves `true`. (In production the AWS runtime polls the - * Bedrock ingestion-job status, which may briefly report `false`.) + * Local development has no asynchronous ingestion window — the corpus is read + * and indexed synchronously on the first `retrieve()` — so it is always in + * sync and this resolves `true`. (In production the AWS runtime polls the + * Bedrock ingestion-job status, which reports `false` until the latest + * ingestion job reaches `COMPLETE`.) * * @returns Always `true` in local development. */ - async isReady(): Promise { + async isSynced(): Promise { return true; } /** - * Resolve once the knowledge base has finished ingesting. 
+ * Resolve once the knowledge base is synced with your latest data. * - * Local development has no ingestion warm-up window (see {@link isReady}), + * Local development has no asynchronous ingestion window (see {@link isSynced}), * so this resolves immediately. The options are accepted for API parity * with the AWS runtime and are otherwise ignored locally. * - * @param {WaitUntilReadyOptions} _options - Accepted for API parity; ignored in local development. + * @param {WaitUntilSyncedOptions} _options - Accepted for API parity; ignored in local development. */ - async waitUntilReady(_options?: WaitUntilReadyOptions): Promise { + async waitUntilSynced(_options?: WaitUntilSyncedOptions): Promise { // No-op: the local corpus loads synchronously, so there is nothing to wait for. } diff --git a/packages/bb-knowledge-base/src/types.ts b/packages/bb-knowledge-base/src/types.ts index 84192769..3ed3e017 100644 --- a/packages/bb-knowledge-base/src/types.ts +++ b/packages/bb-knowledge-base/src/types.ts @@ -154,29 +154,31 @@ export interface RetrieveResult { metadata: Record; } -// ── Readiness Options ────────────────────────────────────────────────────── +// ── Sync Options ─────────────────────────────────────────────────────────── /** - * Options for the `waitUntilReady()` method. + * Options for the `waitUntilSynced()` method. * * @example * ```typescript * // Wait up to 10 minutes, polling every 10 seconds - * await kb.waitUntilReady({ timeoutMs: 600_000, pollIntervalMs: 10_000 }); + * await kb.waitUntilSynced({ timeoutMs: 600_000, pollIntervalMs: 10_000 }); * ``` */ -export interface WaitUntilReadyOptions { +export interface WaitUntilSyncedOptions { /** * Maximum time to wait for ingestion to complete, in milliseconds. Default: 300000 (5 minutes). 
* * `timeoutMs: 0` is a one-shot check, not a no-poll fast-fail: the deadline is - * evaluated *after* the first readiness poll, so exactly one `isReady()` poll - * always runs (and can resolve the wait) before a `KnowledgeBaseTimeoutException` - * is thrown. Clamped to a minimum of 0. + * evaluated *after* the first `isSynced()` poll, so exactly one poll always + * runs (and can resolve the wait) before a `KnowledgeBaseTimeoutException` is + * thrown. No inter-poll sleep occurs in this case either — the sleep is + * `min(jitter, deadline − now)`, and the remaining-budget clamp drives it to 0 + * when the budget is already spent. Clamped to a minimum of 0. */ timeoutMs?: number; /** - * Delay between readiness polls, in milliseconds. Clamped to a minimum of 1ms. + * Delay between sync polls, in milliseconds. Clamped to a minimum of 1ms. * A small amount of random jitter (±20%) is applied to each delay so that many * knowledge bases polling after a shared deploy do not fall into lockstep — the * jitter only varies the wait *between* polls (never the number of polls) and is @@ -186,16 +188,16 @@ export interface WaitUntilReadyOptions { /** * Maximum number of *consecutive* transient control-plane errors to tolerate * before giving up, instead of aborting the wait on the first blip. Two kinds - * of error are treated as transient during a readiness poll: + * of error are treated as transient during a sync poll: * - `RetrievalFailedException` — the catch-all for network failures, throttling, * and other unrecognized SDK errors. * - A *not-yet-visible* knowledge base: in the post-deploy window the control * plane can briefly return `ResourceNotFoundException` (the KB or data source * isn't visible yet), which surfaces as `KnowledgeBaseNotReadyException`. Only * this control-plane variant is transient — riding it out is the whole point - * of `waitUntilReady()`. + * of `waitUntilSynced()`. 
* - * Each clean poll (ingestion still in progress, or ready) resets the counter, + * Each clean poll (ingestion still in progress, or synced) resets the counter, * so only an unbroken run of failures counts toward the limit. * * Terminal errors always short-circuit immediately regardless of this value: @@ -207,7 +209,7 @@ export interface WaitUntilReadyOptions { maxConsecutiveTransientErrors?: number; /** * Optional {@link AbortSignal} to cancel the wait. When the signal is aborted, - * `waitUntilReady()` rejects promptly — checked before each poll and during the + * `waitUntilSynced()` rejects promptly — checked before each poll and during the * inter-poll delay — with the signal's abort reason (by default a `DOMException` * named `'AbortError'`, or whatever value was passed to `AbortController.abort(reason)`). * An already-aborted signal rejects immediately, before any polling. diff --git a/packages/blocks/API.md b/packages/blocks/API.md index 6526c60f..62785807 100644 --- a/packages/blocks/API.md +++ b/packages/blocks/API.md @@ -163,7 +163,7 @@ import { Transaction } from '@aws-blocks/bb-data'; import { TransactionOptions } from '@aws-blocks/bb-distributed-data'; import { UpdateAttributeOutcome } from '@aws-blocks/bb-auth-cognito'; import { UserAttribute } from '@aws-blocks/bb-auth-cognito'; -import { WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; +import { WaitUntilSyncedOptions } from '@aws-blocks/bb-knowledge-base'; export { Agent } @@ -572,7 +572,7 @@ export { UpdateAttributeOutcome } export { UserAttribute } -export { WaitUntilReadyOptions } +export { WaitUntilSyncedOptions } export * from "@aws-blocks/core"; diff --git a/packages/blocks/src/index.cdk.ts b/packages/blocks/src/index.cdk.ts index 192425b9..b18940aa 100644 --- a/packages/blocks/src/index.cdk.ts +++ b/packages/blocks/src/index.cdk.ts @@ -53,7 +53,7 @@ export type { FileBucketOptions, PutOptions as FBPutOptions, GetUrlOptions, PutU export { AppSetting, AppSettingErrors } from 
'@aws-blocks/bb-app-setting'; export type { AppSettingOptions } from '@aws-blocks/bb-app-setting'; export { KnowledgeBase, KnowledgeBaseErrors } from '@aws-blocks/bb-knowledge-base'; -export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy, WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; +export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy, WaitUntilSyncedOptions } from '@aws-blocks/bb-knowledge-base'; export { Tracer } from '@aws-blocks/bb-tracer'; export type { TracerOptions, Segment, AnnotationValue } from '@aws-blocks/bb-tracer'; export { Logger, LoggingErrors } from '@aws-blocks/bb-logger'; diff --git a/packages/blocks/src/index.ts b/packages/blocks/src/index.ts index ea6ea8a0..48ad1215 100644 --- a/packages/blocks/src/index.ts +++ b/packages/blocks/src/index.ts @@ -286,7 +286,7 @@ export type { AppSettingOptions } from '@aws-blocks/bb-app-setting'; * Full docs: `README.md` in the package directory above. 
*/ export { KnowledgeBase, KnowledgeBaseErrors } from '@aws-blocks/bb-knowledge-base'; -export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy, WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; +export type { KnowledgeBaseOptions, RetrieveOptions, RetrieveResult, MetadataFilter, SourceConfig, ChunkingConfig, ChunkingStrategy, WaitUntilSyncedOptions } from '@aws-blocks/bb-knowledge-base'; /** * **Distributed tracing backed by AWS X-Ray.** diff --git a/test-apps/comprehensive/aws-blocks/index.ts b/test-apps/comprehensive/aws-blocks/index.ts index a6d9e9a6..01c5d909 100644 --- a/test-apps/comprehensive/aws-blocks/index.ts +++ b/test-apps/comprehensive/aws-blocks/index.ts @@ -13,7 +13,7 @@ import { DistributedTableErrors } from '@aws-blocks/bb-distributed-table'; import { isBlocksError } from '@aws-blocks/core'; import { AsyncJob } from '@aws-blocks/bb-async-job'; import { AppSetting } from '@aws-blocks/bb-app-setting'; -import type { RetrieveOptions, WaitUntilReadyOptions } from '@aws-blocks/bb-knowledge-base'; +import type { RetrieveOptions, WaitUntilSyncedOptions } from '@aws-blocks/bb-knowledge-base'; import { Tracer } from '@aws-blocks/bb-tracer'; import { Logger } from '@aws-blocks/bb-logger'; import { createKyselyAdapter, DatabaseErrors } from '@aws-blocks/bb-data'; @@ -1863,15 +1863,15 @@ export const api = new ApiNamespace(scope, 'api', (context) => ({ return await kb.retrieve(query, options); }, - // Ingestion readiness — Bedrock ingests asynchronously after deploy, so e2e + // Ingestion sync — Bedrock ingests asynchronously after deploy, so e2e // tests gate retrieval on these instead of polling retrieve() for results. - // The local mock reports ready immediately. - async kbReady() { - return await kb.isReady(); + // The local mock reports synced immediately. 
+ async kbSynced() { + return await kb.isSynced(); }, - async kbWaitUntilReady(options?: WaitUntilReadyOptions) { - await kb.waitUntilReady(options); + async kbWaitUntilSynced(options?: WaitUntilSyncedOptions) { + await kb.waitUntilSynced(options); return { success: true }; }, diff --git a/test-apps/comprehensive/test/knowledge-base.test.ts b/test-apps/comprehensive/test/knowledge-base.test.ts index a1a7a623..aea1a062 100644 --- a/test-apps/comprehensive/test/knowledge-base.test.ts +++ b/test-apps/comprehensive/test/knowledge-base.test.ts @@ -12,39 +12,39 @@ const ENV = process.env.BLOCKS_TEST_ENV || 'local'; const isLocal = ENV === 'local'; /** - * Gate retrieval tests on knowledge-base ingestion readiness. + * Gate retrieval tests on knowledge-base ingestion sync. * - * Bedrock ingests asynchronously after deploy, so during the warm-up window we - * wait for the KB to become queryable before probing `kbRetrieve`. We delegate - * to the wired `waitUntilReady()` endpoint (exposed here as `kbWaitUntilReady`) - * rather than hand-rolling a poll loop over `isReady()` (`kbReady`): - * `waitUntilReady()` owns the deadline AND rides out transient control-plane - * blips — throttling, a `RetrievalFailed`, or a brief not-yet-visible - * `ResourceNotFoundException` during the post-deploy poll — that a per-poll - * `isReady()` would otherwise surface as a hard suite failure. + * Bedrock ingests asynchronously after deploy, so during the initial pre-sync + * window we wait for the KB to sync with our latest data before probing + * `kbRetrieve`. 
We delegate to the wired `waitUntilSynced()` endpoint (exposed + * here as `kbWaitUntilSynced`) rather than hand-rolling a poll loop over + * `isSynced()` (`kbSynced`): `waitUntilSynced()` owns the deadline AND rides out + * transient control-plane blips — throttling, a `RetrievalFailed`, or a brief + * not-yet-visible `ResourceNotFoundException` during the post-deploy poll — that + * a per-poll `isSynced()` would otherwise surface as a hard suite failure. * * A *thrown* error here is therefore a real failure (a failed ingestion job * surfaced as `IngestionFailedException`, a `KnowledgeBaseValidationError`, the - * readiness timeout, or anything unexpected) and is surfaced immediately rather - * than masked as warm-up. + * sync timeout, or anything unexpected) and is surfaced immediately rather than + * masked as an in-progress sync. * * In local mode the mock resolves immediately, so this returns on the first poll. */ -async function gateOnReadiness( +async function gateOnSync( api: typeof apiType, { timeoutMs = 180_000, pollIntervalMs = 10_000 } = {}, ): Promise { const start = Date.now(); - console.log('⏳ Waiting for KB ingestion readiness (warming up if needed)…'); + console.log('⏳ Waiting for KB to sync with latest data (ingesting if needed)…'); try { - await api.kbWaitUntilReady({ timeoutMs, pollIntervalMs }); + await api.kbWaitUntilSynced({ timeoutMs, pollIntervalMs }); } catch (err: any) { // Real failure (failed ingestion / validation / timeout / unexpected) — surface it. 
- console.error(`❌ KB readiness check failed: ${err.name || err.message}`); + console.error(`❌ KB sync check failed: ${err.name || err.message}`); throw err; } const elapsed = Math.round((Date.now() - start) / 1000); - console.log(`✅ KB ready (ingestion complete) — ${elapsed}s elapsed`); + console.log(`✅ KB synced (ingestion complete) — ${elapsed}s elapsed`); } export function knowledgeBaseTests(getApi: () => typeof apiType) { @@ -76,33 +76,33 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { }); }); - // --- Readiness: cover the wired waitUntilReady() endpoint end-to-end --- - // The retrieval suites gate via gateOnReadiness() → kbWaitUntilReady(); this - // exercises that same waitUntilReady() polling path directly. (kbReady / - // isReady() is wired but not exercised by this suite — it's covered by unit + // --- Sync: cover the wired waitUntilSynced() endpoint end-to-end --- + // The retrieval suites gate via gateOnSync() → kbWaitUntilSynced(); this + // exercises that same waitUntilSynced() polling path directly. (kbSynced / + // isSynced() is wired but not exercised by this suite — it's covered by unit // tests.) Locally the mock resolves on the first poll; on AWS we give it the - // same budget as gateOnReadiness so a still-ingesting KB is waited out rather + // same budget as gateOnSync so a still-ingesting KB is waited out rather // than surfaced as a failure. - describe('waitUntilReady', () => { - test('resolves once the KB is ready', async (t) => { + describe('waitUntilSynced', () => { + test('resolves once the KB is synced', async (t) => { const api = getApi(); try { - const result = await api.kbWaitUntilReady( + const result = await api.kbWaitUntilSynced( isLocal ? 
{ timeoutMs: 5_000, pollIntervalMs: 50 } : { timeoutMs: 180_000, pollIntervalMs: 10_000 }, ); assert.deepStrictEqual(result, { success: true }); } catch (err: unknown) { - // Warm-up tolerance: unlike the retrieval suites, this standalone test is NOT - // behind the gateOnReadiness() gate, so on AWS a slow-but-healthy KB whose - // ingestion outruns the 180s budget makes kbWaitUntilReady throw a Timeout. - // That's the post-deploy warm-up window, not a defect — soft-skip it here. A + // Sync tolerance: unlike the retrieval suites, this standalone test is NOT + // behind the gateOnSync() gate, so on AWS a slow-but-healthy KB whose + // ingestion outruns the 180s budget makes kbWaitUntilSynced throw a Timeout. + // That's the post-deploy sync window, not a defect — soft-skip it here. A // genuine IngestionFailed (or any other error) still fails the test. The local // mock resolves on the first poll, so this branch never trips and the success // assertion above always runs. if (isBlocksError(err, Timeout)) { - t.skip(`KB still warming up — ingestion exceeded budget: ${(err as Error).message}`); + t.skip(`KB still syncing — ingestion exceeded budget: ${(err as Error).message}`); return; } throw err; @@ -115,7 +115,7 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { before(async () => { const api = getApi(); - await gateOnReadiness(api); + await gateOnSync(api); }); test('returns results for a matching query', async () => { @@ -172,7 +172,7 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { before(async () => { const api = getApi(); - await gateOnReadiness(api); + await gateOnSync(api); }); test('maxResults limits results', async () => { @@ -210,7 +210,7 @@ export function knowledgeBaseTests(getApi: () => typeof apiType) { before(async () => { const api = getApi(); - await gateOnReadiness(api); + await gateOnSync(api); }); test('customer metadata category is present on tutorial doc', async () => {