From 573617a5321d70ccf73c8e9907ea4777cddc11aa Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Thu, 14 May 2026 14:11:33 +0200 Subject: [PATCH 1/7] Absorb DevHub recipe patterns into agent skills Unify 21 DevHub recipes under agent skills as the single source of truth. New reference files: - pgvector.md: vector similarity search with pgvector extension - lakehouse-sync.md: CDC from Lakebase Postgres to Unity Catalog - off-platform.md: env management, token refresh, Drizzle ORM (@databricks/lakebase) - medallion-from-cdc.md: Silver/Gold layers from CDC history tables Enrichments to existing skills: - model-serving.md: AI SDK v6 streaming, embeddings, AI Gateway patterns - lakebase.md: chat persistence, naming conventions, project-owner exception - genie.md: multi-space deployment, troubleshooting - synced-tables.md: timeseries_key, cost guidance, FGAC workaround - files.md: user_api_scopes for OBO - databricks-apps/SKILL.md: State Storage Rule, post-deploy verification - databricks-lakebase/SKILL.md: reference links, JSON path table, pgvector cross-ref - databricks-model-serving/SKILL.md: scale_to_zero_enabled, model discovery - databricks-pipelines/SKILL.md: medallion-from-cdc reference link Co-authored-by: Isaac --- manifest.json | 22 +- skills/databricks-apps/SKILL.md | 25 +- .../references/appkit/files.md | 4 +- .../references/appkit/genie.md | 100 ++++++ .../references/appkit/lakebase.md | 76 +++++ .../references/appkit/model-serving.md | 175 +++++++++- skills/databricks-lakebase/SKILL.md | 15 + .../references/lakehouse-sync.md | 152 +++++++++ .../references/off-platform.md | 300 ++++++++++++++++++ .../references/pgvector.md | 149 +++++++++ .../references/synced-tables.md | 11 + skills/databricks-model-serving/SKILL.md | 5 +- skills/databricks-pipelines/SKILL.md | 1 + .../references/medallion-from-cdc.md | 144 +++++++++ 14 files changed, 1161 insertions(+), 18 deletions(-) create mode 100644 skills/databricks-lakebase/references/lakehouse-sync.md create mode 
100644 skills/databricks-lakebase/references/off-platform.md create mode 100644 skills/databricks-lakebase/references/pgvector.md create mode 100644 skills/databricks-pipelines/references/medallion-from-cdc.md diff --git a/manifest.json b/manifest.json index bc7836a..8a4ec3f 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-05-11T13:22:07Z", + "updated_at": "2026-05-13T16:06:21Z", "skills": { "databricks-apps": { "version": "0.1.1", "description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "experimental": false, - "updated_at": "2026-05-11T13:22:01Z", + "updated_at": "2026-05-13T16:05:34Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -33,7 +33,7 @@ "version": "0.1.0", "description": "Core Databricks skill for CLI, auth, and data exploration", "experimental": false, - "updated_at": "2026-05-11T10:22:59Z", + "updated_at": "2026-05-12T22:07:25Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -48,7 +48,7 @@ "version": "0.0.0", "description": "Declarative Automation Bundles (DABs) for deploying and managing Databricks resources", "experimental": false, - "updated_at": "2026-05-05T15:31:42Z", + "updated_at": "2026-05-12T20:04:29Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -66,7 +66,7 @@ "version": "0.1.0", "description": "Databricks Jobs orchestration and scheduling", "experimental": false, - "updated_at": "2026-05-07T15:19:50Z", + "updated_at": "2026-05-12T20:04:29Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-05-11T10:23:05Z", + "updated_at": "2026-05-13T16:02:53Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -86,6 +86,9 @@ "assets/databricks.svg", "references/computes-and-scaling.md", "references/connectivity.md", + "references/lakehouse-sync.md", + 
"references/off-platform.md", + "references/pgvector.md", "references/synced-tables.md" ] }, @@ -93,7 +96,7 @@ "version": "0.1.0", "description": "Databricks Model Serving endpoint management", "experimental": false, - "updated_at": "2026-05-07T15:19:45Z", + "updated_at": "2026-05-13T16:06:15Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -105,7 +108,7 @@ "version": "0.1.0", "description": "Databricks Pipelines (DLT) for ETL and streaming", "experimental": false, - "updated_at": "2026-05-07T15:19:55Z", + "updated_at": "2026-05-13T16:03:14Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -125,6 +128,7 @@ "references/materialized-view-python.md", "references/materialized-view-sql.md", "references/materialized-view.md", + "references/medallion-from-cdc.md", "references/options-avro.md", "references/options-csv.md", "references/options-json.md", @@ -152,7 +156,7 @@ "version": "0.1.0", "description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes", "experimental": false, - "updated_at": "2026-05-07T15:19:59Z", + "updated_at": "2026-05-12T20:04:29Z", "files": [ "SKILL.md", "agents/openai.yaml", diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index 009d751..e4ca621 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-apps -description: "Build apps on Databricks Apps platform. Use when asked to create dashboards, data apps, analytics tools, or visualizations. Evaluates data access patterns (analytics vs Lakebase synced tables) before scaffolding. Invoke BEFORE starting implementation." +description: "Build apps on Databricks Apps platform. Auto-detects need for Lakebase when app stores state; evaluates data access patterns (analytics vs Lakebase synced tables) before scaffolding. Invoke BEFORE starting implementation." 
compatibility: Requires databricks CLI (>= v0.294.0) metadata: version: "0.1.1" @@ -17,7 +17,7 @@ Build apps that deploy to Databricks Apps platform. | Phase | READ BEFORE proceeding | |-------|------------------------| -| Scaffolding | **⚠️ STOP — complete the Data Access Decision Gate below before scaffolding.** Parent `databricks-core` skill (auth, warehouse discovery); then run `databricks apps manifest` + `databricks apps init` with `--features` and `--set` (see AppKit section below) | +| Scaffolding | **⚠️ STOP — evaluate the State Storage Rule and Data Access Decision Gate below before scaffolding.** Parent `databricks-core` skill (auth, warehouse discovery); then run `databricks apps manifest` + `databricks apps init` with `--features` and `--set` (see AppKit section below) | | Writing SQL queries | [SQL Queries Guide](references/appkit/sql-queries.md) | | Writing UI components | [Frontend Guide](references/appkit/frontend.md) | | Using `useAnalyticsQuery` | [AppKit SDK](references/appkit/appkit-sdk.md) | @@ -61,6 +61,16 @@ Build apps that deploy to Databricks Apps platform. Before writing any SQL, use the parent `databricks-core` skill for data exploration — search `information_schema` by keyword, then batch `discover-schema` for the tables you need. Do NOT skip this step. +**State Storage Rule (evaluate BEFORE the Decision Gate):** + +If the user's app description implies storing or persisting data — forms, CRUD operations, user input, preferences, bookmarks, orders, todos, comments, votes, or any user-generated content — the app needs a Lakebase database. Do not wait for the user to ask for one. + +1. Use the **`databricks-lakebase`** skill to create a Lakebase project (if one doesn't already exist) and obtain the branch and database resource names. +2. Scaffold with `--features lakebase` and pass `--set lakebase.postgres.branch= --set lakebase.postgres.database=`. +3. 
If the app **also** reads from Unity Catalog tables, proceed to the Data Access Decision Gate below to determine whether to add `--features analytics` or use Lakebase synced tables. + +This is not optional — any app that writes user-generated data needs Lakebase. + ## Development Workflow (FOLLOW THIS ORDER) **Data Access Decision Gate (REQUIRED before scaffolding):** @@ -79,7 +89,7 @@ After the user chooses: - (A) Lakebase synced tables → scaffold with `--features lakebase`. See [Lakebase Guide](references/appkit/lakebase.md) for full workflow. - (B) Analytics → scaffold with `--features analytics`. - Both → scaffold with `--features analytics,lakebase` if the app needs both patterns. -- If the app does NOT read UC data (pure CRUD, Genie, Model Serving), skip this gate and scaffold with the appropriate `--features` flag. +- If the app does NOT read UC data (pure CRUD, Genie, Model Serving), skip this gate. For pure CRUD/state apps, the State Storage Rule above already applies — scaffold with `--features lakebase`. For Genie or Model Serving, scaffold with the corresponding `--features` flag. **Analytics apps** (`--features analytics`): @@ -194,3 +204,12 @@ App names must be lowercase with hyphens only (≤26 chars). Databricks Apps supports any framework that runs as an HTTP server. LLMs already know these frameworks — the challenge is Databricks platform integration. **READ [Other Frameworks Guide](references/other-frameworks.md) BEFORE building any non-AppKit app.** It covers port/host configuration, `app.yaml` and `databricks.yml` setup, dependency management, networking, and framework-specific gotchas. 
+ +### Post-Deploy Verification + +After deploying, verify the app is running: + +```bash +databricks apps get --profile -o json # Check app_status.state: RUNNING +databricks apps logs --profile # Stream BUILD/SYSTEM/APP logs +``` diff --git a/skills/databricks-apps/references/appkit/files.md b/skills/databricks-apps/references/appkit/files.md index 3432f09..af18571 100644 --- a/skills/databricks-apps/references/appkit/files.md +++ b/skills/databricks-apps/references/appkit/files.md @@ -231,12 +231,14 @@ const handleCreateDirectory = async (name: string) => { ## Resource Requirements -Each volume key requires a resource with `WRITE_VOLUME` permission. Declare in `databricks.yml`: +Each volume key requires a resource with `WRITE_VOLUME` permission and `user_api_scopes` for on-behalf-of (OBO) token access. Declare in `databricks.yml`: ```yaml resources: apps: my_app: + user_api_scopes: + - files.files # Required for OBO token access in production resources: - name: uploads-volume volume: diff --git a/skills/databricks-apps/references/appkit/genie.md b/skills/databricks-apps/references/appkit/genie.md index 8320e35..80d6c9f 100644 --- a/skills/databricks-apps/references/appkit/genie.md +++ b/skills/databricks-apps/references/appkit/genie.md @@ -147,6 +147,103 @@ Update smoke tests if headings or routes changed, then `databricks apps validate For advanced Genie plugin usage, see `npx @databricks/appkit docs ./docs/plugins/genie.md`. +## Multi-Space Deployment + +For the `spaces` map API, `GenieChat alias` prop, and `useGenieChat` hook, see `npx @databricks/appkit docs ./docs/plugins/genie.md`. + +This section covers the **deployment-specific patterns** for multi-space Genie apps (databricks.yml, app.yaml, stale conversation cleanup). 
+ +**databricks.yml** — add one variable + resource per space, plus target-level values: + +```yaml +variables: + genie_space_id: + description: Default Genie space ID (required by AppKit) + genie_space_sales_id: + description: Sales Genie space ID + genie_space_support_id: + description: Support Genie space ID + +resources: + apps: + app: + user_api_scopes: + - dashboards.genie + resources: + - name: genie-space + genie_space: + name: ${var.genie_space_name} + space_id: ${var.genie_space_id} + permission: CAN_RUN + - name: genie-space-sales + genie_space: + name: genie-space-sales + space_id: ${var.genie_space_sales_id} + permission: CAN_RUN + - name: genie-space-support + genie_space: + name: genie-space-support + space_id: ${var.genie_space_support_id} + permission: CAN_RUN + +targets: + default: + variables: + genie_space_id: + genie_space_sales_id: + genie_space_support_id: +``` + +**app.yaml** — keep `DATABRICKS_GENIE_SPACE_ID` (AppKit validates it on startup). Add one `valueFrom` per UI space: + +```yaml +env: + - name: DATABRICKS_GENIE_SPACE_ID + valueFrom: genie-space + - name: DATABRICKS_GENIE_SPACE_SALES + valueFrom: genie-space-sales + - name: DATABRICKS_GENIE_SPACE_SUPPORT + valueFrom: genie-space-support +``` + +**Critical gotcha**: `DATABRICKS_GENIE_SPACE_ID` must always be set — AppKit validates it on startup even when using a custom `spaces` map. + +**Build version stamp** — stamp every build so the page can detect a new deployment and clear stale conversation state: + +```typescript +// client/vite.config.ts +export default defineConfig({ + // ... existing config ... 
+ define: { + "import.meta.env.VITE_APP_VERSION": JSON.stringify(Date.now().toString()), + }, +}); +``` + +**Stale conversation cleanup** — `GenieChat` stores conversation IDs in URLs and localStorage that become stale across space switches or redeployments: + +```typescript +function clearConversationUrl() { + const url = new URL(window.location.href); + url.searchParams.delete("conversationId"); + window.history.replaceState({}, "", url.toString()); +} + +function initAlias(): string { + const buildVersion = import.meta.env.VITE_APP_VERSION ?? "dev"; + if (localStorage.getItem("appkit:genie:version") !== buildVersion) { + const savedAlias = localStorage.getItem("appkit:genie:alias"); + Object.keys(localStorage) + .filter((k) => k.startsWith("appkit:genie:")) + .forEach((k) => localStorage.removeItem(k)); + localStorage.setItem("appkit:genie:version", buildVersion); + if (savedAlias) localStorage.setItem("appkit:genie:alias", savedAlias); + clearConversationUrl(); + } + return localStorage.getItem("appkit:genie:alias") ?? SPACES[0]?.alias ?? ""; +} +``` + ## Frontend **For full component API**: run `npx @databricks/appkit docs "GenieChat"`. 
@@ -197,3 +294,6 @@ The plugin mounts SSE endpoints under `/api/genie`: | `plugin "genie" has no resource with key "..."` | Wrong `--set` flags during scaffold | Always derive resource keys from `databricks apps manifest` | | Chat collapses or renders poorly | No explicit height on container | Give the parent a fixed height | | Duplicate routes or import confusion | Old local Genie proxy file | Remove it — use `genie` from `@databricks/appkit` | +| `does not have required scopes: genie` | Missing API scope | Confirm `user_api_scopes` includes `dashboards.genie` in `databricks.yml` and redeploy | +| Genie space not found | Wrong space ID | Verify space ID matches the value on the Genie space **About** tab | +| `valueFrom` mismatch | `app.yaml` value doesn't match `databricks.yml` | `valueFrom` in `app.yaml` must exactly match the resource `name` in `databricks.yml` | diff --git a/skills/databricks-apps/references/appkit/lakebase.md b/skills/databricks-apps/references/appkit/lakebase.md index d75e888..328ace5 100644 --- a/skills/databricks-apps/references/appkit/lakebase.md +++ b/skills/databricks-apps/references/appkit/lakebase.md @@ -39,6 +39,8 @@ Use the `databricks-lakebase` skill to create a Lakebase project and discover br > For multi-environment deployments (dev/prod), use `variables:` and `targets:` blocks in `databricks.yml` — see the **`databricks-dabs`** skill for patterns. +**Naming conventions:** Use domain names for user-facing code (`ItemsPage.tsx`, `/api/items`, `item-routes.ts`). Keep `lakebase` naming only for infrastructure config (`lakebase()` plugin, `LAKEBASE_ENDPOINT`, `postgres` app resource). + **Get resource names** (if you have an existing project): ```bash # List branches → use the name field of a READY branch @@ -180,6 +182,78 @@ const prisma = new PrismaClient({ adapter }); For ORM-compatible config: `appkit.lakebase.getOrmConfig()`. 
+## Chat Persistence Pattern + +Save AI chat conversations to Lakebase so users can resume sessions and scroll full message history. + +**Schema** — create in a separate `chat` schema (not `app`) so the deploy-first ownership model stays clean: + +```sql +CREATE SCHEMA IF NOT EXISTS chat; + +CREATE TABLE IF NOT EXISTS chat.chats ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id TEXT NOT NULL, + title TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS chat.messages ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + chat_id UUID NOT NULL REFERENCES chat.chats(id) ON DELETE CASCADE, + role TEXT NOT NULL CHECK (role IN ('system', 'user', 'assistant', 'tool')), + content TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_messages_chat_id_created_at + ON chat.messages(chat_id, created_at); +``` + +**Bootstrap** — run setup in `onPluginsReady` so tables exist before the server accepts requests: + +```typescript +await createApp({ + plugins: [server(), lakebase()], + async onPluginsReady(appkit) { + await setupChatTables(appkit); + // then register routes via appkit.server.extend(...) 
+ }, +}); +``` + +**Persistence helpers** — use parameterized queries: + +```typescript +export async function createChat(appkit, input: { userId: string; title: string }) { + const result = await appkit.lakebase.query( + `INSERT INTO chat.chats (user_id, title) VALUES ($1, $2) + RETURNING id, user_id, title, created_at, updated_at`, + [input.userId, input.title], + ); + return result.rows[0]; +} + +export async function appendMessage(appkit, input: { chatId: string; role: string; content: string }) { + const result = await appkit.lakebase.query( + `INSERT INTO chat.messages (chat_id, role, content) VALUES ($1, $2, $3) + RETURNING id, chat_id, role, content, created_at`, + [input.chatId, input.role, input.content], + ); + return result.rows[0]; +} +``` + +**User identity**: In deployed apps, use `req.header("x-forwarded-email")`. For local dev, hardcode a test user ID. + +**History endpoints**: +- `GET /api/chats` — list chats for current user +- `GET /api/chats/:chatId/messages` — load ordered history +- `DELETE /api/chats/:chatId` — delete chat (messages cascade) + +**AI SDK v6 integration**: Use `setMessages()` from `useChat` return value for history loading (NOT `initialMessages`). To read response headers like `X-Chat-Id`, pass a custom `fetch` wrapper on the `TextStreamChatTransport` constructor. + ## Reading from Lakebase synced tables Lakebase synced tables materialize Delta/UC tables into Lakebase Postgres for low-latency app reads. The lakehouse remains the source of truth; Lakebase serves as a read-optimized index. @@ -262,6 +336,8 @@ If you skip this step, the Service Principal won't own the database schema. You' Lakebase project creators already have database access after the first deploy. Collaborators need `databricks_superuser` granted by the project creator via Branch Overview. 
+> **Project-owner note:** If you are the Lakebase project owner, `databricks_create_role` may fail with "role already exists" and `GRANT databricks_superuser` may fail with "permission denied to grant role" — both errors are safe to ignore; the project owner already has the necessary access. + The Lakebase env vars (`PGHOST`, `PGDATABASE`, etc.) are auto-set only when deployed. For local development, get the connection details from your endpoint and set them manually: ```bash diff --git a/skills/databricks-apps/references/appkit/model-serving.md b/skills/databricks-apps/references/appkit/model-serving.md index 967f8cb..acc5add 100644 --- a/skills/databricks-apps/references/appkit/model-serving.md +++ b/skills/databricks-apps/references/appkit/model-serving.md @@ -56,7 +56,7 @@ env: The injected value is the endpoint **name** (not a URL). Use it in server-side code to call the endpoint. -## tRPC Pattern +## Non-Streaming Query Pattern Always use tRPC for model serving calls — do NOT call endpoints directly from the client. @@ -94,9 +94,176 @@ const result = await trpc.queryModel.query({ prompt: userInput }); const answer = result.choices?.[0]?.message?.content; ``` -For streaming and advanced patterns, see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. +For AppKit's built-in serving plugin streaming (SSE via `stream()` and `useServingStream`), see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. The patterns below are for direct AI SDK v6 integration with Databricks AI Gateway. -AppKit integrates with **Model Serving endpoints**. AI Gateway (beta) endpoints are not directly supported — use the underlying Model Serving endpoint name instead. AI Gateway features (rate limits, usage tracking) can be configured on Model Serving endpoints via the `databricks-model-serving` skill. +## AI SDK v6 Streaming Pattern + +Use this pattern for streaming AI chat with Databricks AI Gateway and Vercel AI SDK v6. 
+ +**Dependencies:** `ai@6`, `@ai-sdk/react@3`, `@ai-sdk/openai`, `@databricks/sdk-experimental` + +**Auth helper** — works for both local dev (CLI profile) and deployed apps (service principal token): + +```typescript +import { Config } from "@databricks/sdk-experimental"; + +async function getDatabricksToken() { + if (process.env.DATABRICKS_TOKEN) { + return process.env.DATABRICKS_TOKEN; + } + const config = new Config({ + profile: process.env.DATABRICKS_CONFIG_PROFILE || "DEFAULT", + }); + await config.ensureResolved(); + const headers = new Headers(); + await config.authenticate(headers); + const authHeader = headers.get("Authorization"); + if (!authHeader) { + throw new Error( + "Failed to get Databricks token. Check your CLI profile or set DATABRICKS_TOKEN.", + ); + } + return authHeader.replace("Bearer ", ""); +} +``` + +**Server route** (`POST /api/chat`): + +```typescript +import { createOpenAI } from "@ai-sdk/openai"; +import { streamText, type UIMessage } from "ai"; + +app.post("/api/chat", async (req, res) => { + const { messages } = req.body; + + // AI SDK v6 client sends UIMessage objects with a parts array. + // Convert to CoreMessage format for streamText(). + const coreMessages = (messages as UIMessage[]).map((m) => ({ + role: m.role as "user" | "assistant" | "system", + content: + m.parts + ?.filter((p) => p.type === "text" && p.text) + .map((p) => p.text) + .join("") ?? + m.content ?? 
+ "", + })); + + try { + const token = await getDatabricksToken(); + const endpoint = process.env.DATABRICKS_ENDPOINT || ""; + + // AI Gateway URL uses /mlflow/v1 path, NOT /openai/v1 + const databricks = createOpenAI({ + baseURL: `https://${process.env.DATABRICKS_WORKSPACE_ID}.ai-gateway.cloud.databricks.com/mlflow/v1`, + apiKey: token, + }); + + const result = streamText({ + model: databricks.chat(endpoint), + messages: coreMessages, + maxOutputTokens: 1000, + }); + + result.pipeTextStreamToResponse(res); + } catch (err) { + const message = (err as Error).message; + console.error(`[chat] Streaming request failed:`, message); + res.status(502).json({ error: "Chat request failed", detail: message }); + } +}); +``` + +**Environment variables:** +- `DATABRICKS_WORKSPACE_ID` — auto-discovered by AppKit at runtime; for explicit setup: `databricks api get /api/2.1/unity-catalog/current-metastore-assignment --profile ` → `workspace_id` field +- `DATABRICKS_ENDPOINT` — model endpoint name (e.g. `databricks-meta-llama-3-3-70b-instruct`). Run `databricks serving-endpoints list --profile ` to see available models. + +## Streaming Client Pattern (AI SDK v6) + +```tsx +import { useChat } from "@ai-sdk/react"; +import { TextStreamChatTransport } from "ai"; +import { useState } from "react"; + +export function ChatPage() { + const [input, setInput] = useState(""); + + const { messages, sendMessage, status } = useChat({ + transport: new TextStreamChatTransport({ api: "/api/chat" }), + }); + + return ( +
<div> + <div> + {messages.map((m) => ( + <div key={m.id}> + <strong> + {m.role === "user" ? "You" : "Assistant"} + </strong> + {m.parts.map((part, i) => + part.type === "text" ? ( + <p key={i}> + {part.text} + </p> + ) : null, + )} + </div> + ))} + {status === "submitted" && <div>Loading...</div>} + </div> + <form + onSubmit={(e) => { + e.preventDefault(); + if (input.trim()) { + void sendMessage({ text: input }); + setInput(""); + } + }} + className="border-t p-4 flex gap-2" + > + <input + value={input} + onChange={(e) => setInput(e.target.value)} + placeholder="Ask a question..." + className="flex-1 border rounded px-3 py-2" + disabled={status !== "ready"} + /> + <button type="submit" disabled={status !== "ready"}> + Send + </button> + </form> + </div>
+ ); +} +``` + +Key differences from AI SDK v5: use `sendMessage({ text })` (NOT `append`), render `m.parts` array (NOT `m.content`), and `status` states are `ready`, `submitted`, `streaming`. + +## Embeddings Pattern + +Generate text embeddings using a Databricks AI Gateway endpoint. + +```typescript +import { getWorkspaceClient } from "@databricks/appkit"; + +const workspaceClient = getWorkspaceClient({}); + +export async function generateEmbedding(text: string): Promise { + const endpoint = + process.env.DATABRICKS_EMBEDDING_ENDPOINT || "databricks-gte-large-en"; + const result = await workspaceClient.servingEndpoints.query({ + name: endpoint, + input: text, + }); + return result.data![0].embedding!; +} +``` + +Common embedding endpoints: `databricks-gte-large-en` (1024d), `databricks-bge-large-en` (1024d). Set `DATABRICKS_EMBEDDING_ENDPOINT` in `.env` and `app.yaml`. + +For vector similarity search with these embeddings, see the `databricks-lakebase` skill's [pgvector.md](../../../databricks-lakebase/references/pgvector.md). ## Troubleshooting @@ -106,3 +273,5 @@ AppKit integrates with **Model Serving endpoints**. 
AI Gateway (beta) endpoints | `SERVING_ENDPOINT` env var empty | Missing env injection | Add `valueFrom: serving-endpoint` to `app.yaml` env section | | 504 Gateway Timeout | Inference exceeds 120s proxy limit | Reduce `max_tokens` or use WebSockets — see [Platform Guide](../platform-guide.md) | | `getExecutionContext` undefined | Called outside AppKit server context | Ensure call is inside a tRPC procedure on the server side | +| 502 from AI Gateway | Token expired or invalid endpoint | Refresh token via `getDatabricksToken()`; verify endpoint exists | +| `TextStreamChatTransport` not found | Wrong AI SDK version | Requires `ai@6` and `@ai-sdk/react@3` | diff --git a/skills/databricks-lakebase/SKILL.md b/skills/databricks-lakebase/SKILL.md index 44e5b34..459c88e 100644 --- a/skills/databricks-lakebase/SKILL.md +++ b/skills/databricks-lakebase/SKILL.md @@ -33,6 +33,9 @@ Lakebase is Databricks' serverless Postgres-compatible database, available on bo - [computes-and-scaling.md](references/computes-and-scaling.md) — Sizing, endpoint management, scale-to-zero, HA - [connectivity.md](references/connectivity.md) — Connection patterns, token refresh, Data API - [synced-tables.md](references/synced-tables.md) — Lakebase synced tables, data type mapping, capacity planning +- [lakehouse-sync.md](references/lakehouse-sync.md) — CDC from Lakebase Postgres to Unity Catalog Delta tables +- [pgvector.md](references/pgvector.md) — Vector similarity search with pgvector extension +- [off-platform.md](references/off-platform.md) — Off-platform Lakebase: env management, token refresh, Drizzle ORM ## Resource Hierarchy @@ -89,8 +92,18 @@ After creation, verify: ```bash databricks postgres list-branches projects/ --profile databricks postgres list-endpoints projects//branches/ --profile +databricks postgres list-databases projects//branches/ --profile ``` +**Extract connection values from JSON output:** + +| Value | JSON path | Used for | +|-------|-----------|----------| +| Endpoint 
host | `status.hosts.host` | `PGHOST`, `lakebase.postgres.host` | +| Endpoint resource path | `name` | `LAKEBASE_ENDPOINT`, `lakebase.postgres.endpointPath` | +| Database resource path | `name` | `lakebase.postgres.database` | +| PostgreSQL database name | `status.postgres_database` | `PGDATABASE`, `lakebase.postgres.databaseName` | + ### Updating a Project ```bash @@ -266,6 +279,8 @@ SELECT * FROM pg_available_extensions ORDER BY name; CREATE EXTENSION IF NOT EXISTS ; ``` +For vector embeddings with pgvector, see [pgvector.md](references/pgvector.md). + ## Troubleshooting | Error | Solution | diff --git a/skills/databricks-lakebase/references/lakehouse-sync.md b/skills/databricks-lakebase/references/lakehouse-sync.md new file mode 100644 index 0000000..fceb169 --- /dev/null +++ b/skills/databricks-lakebase/references/lakehouse-sync.md @@ -0,0 +1,152 @@ +# Lakehouse Sync: CDC from Lakebase to Unity Catalog + +Lakehouse Sync continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using Change Data Capture (CDC). Each synced table produces an SCD Type 2 history table in Unity Catalog, giving you a full audit trail queryable from the lakehouse. + +This is the reverse direction from synced tables (which go UC → Lakebase). No external compute, pipelines, or jobs are required — it is a native Lakebase feature. 
+ +## When to Use + +- Analyze operational data (orders, user activity, support tickets) in the lakehouse +- Need a historical record of every insert, update, and delete from Postgres tables +- Join operational data with analytics data in Spark, SQL, or BI tools +- Feed Lakebase data into downstream pipelines or ML models + +## History Tables + +For each synced table, a Delta history table is created in Unity Catalog: + +``` +lb_<table_name>_history +``` + +Each row includes CDC metadata columns: + +| Column | Description | +|--------|-------------| +| `_change_type` | `insert`, `update_preimage`, `update_postimage`, or `delete` | +| `_lsn` | Log Sequence Number for ordering changes | +| `_commit_timestamp` | When the change was captured | + +## Enablement + +**Lakehouse Sync is UI-only** — configured via the "Lakehouse sync" tab in the branch overview, not via CLI or API. It operates at the **schema level**: once enabled, all current and future tables in that schema sync to Unity Catalog. + +Navigate to: **Catalog** → your Autoscaling project → branch → **Lakehouse Sync** → **Start Sync**, then select the source database/schema, destination catalog/schema, and tables.
+ +## Prerequisites + +- Lakebase Autoscaling project running **Postgres 17** +- Tables must reside in the `databricks_postgres` database +- `REPLICA IDENTITY FULL` must be set on all source tables: + +```sql +ALTER TABLE <table_name> REPLICA IDENTITY FULL; +``` + +- Verify replica identity: + +```sql +SELECT n.nspname AS table_schema, + c.relname AS table_name, + CASE c.relreplident + WHEN 'd' THEN 'default' + WHEN 'n' THEN 'nothing' + WHEN 'f' THEN 'full' + WHEN 'i' THEN 'index' + END AS replica_identity +FROM pg_class c +JOIN pg_namespace n ON n.oid = c.relnamespace +WHERE c.relkind = 'r' + AND n.nspname = 'public' +ORDER BY n.nspname, c.relname; +``` + +- **Permissions:** CAN MANAGE on source project; USE CATALOG + USE SCHEMA + CREATE TABLE on destination +- Catalogs with default storage are **unsupported** + +## Supported Data Types + +`bool`, `int2`, `int4`, `int8`, `text`, `varchar`, `bpchar`, `jsonb`, `numeric`, `date`, `timestamp`, `timestamptz`, `real`, `float4`, `float8`, plus enum types (`typcategory = 'E'`).
+ +Check for unsupported types: + +```sql +SELECT c.table_schema, c.table_name, c.column_name, c.udt_name AS data_type +FROM information_schema.columns c +JOIN pg_catalog.pg_type t ON t.typname = c.udt_name +WHERE c.table_schema = 'public' + AND NOT ( + c.udt_name IN ( + 'bool', 'int2', 'int4', 'int8', 'text', 'varchar', 'bpchar', + 'jsonb', 'numeric', 'date', 'timestamp', 'timestamptz', + 'real', 'float4', 'float8' + ) + OR t.typcategory = 'E' + ) +ORDER BY c.table_schema, c.table_name, c.ordinal_position; +``` + +## Monitoring + +Check active syncs from Postgres (the `wal2delta` schema only exists after Lakehouse Sync has been enabled): + +```sql +SELECT * FROM wal2delta.tables; +``` + +## Querying History Tables + +**Latest state of each row** (deduplicated current state): + +```sql +SELECT * +FROM ( + SELECT *, + ROW_NUMBER() OVER (PARTITION BY ORDER BY _lsn DESC) AS rn + FROM ..lb__history + WHERE _change_type IN ('insert', 'update_postimage', 'delete') +) +WHERE rn = 1 + AND _change_type != 'delete'; +``` + +**Full change history for a record:** + +```sql +SELECT * +FROM ..lb__history +WHERE = +ORDER BY _lsn; +``` + +## Schema Changes + +If you need to change a synced table's schema in Postgres, use the rename-and-swap pattern: + +```sql +CREATE TABLE _v2 ( + id INT PRIMARY KEY, + name TEXT, + new_column TEXT +); + +ALTER TABLE
<table>_v2 REPLICA IDENTITY FULL; + +INSERT INTO <table>_v2 SELECT *, NULL FROM <table>; + +BEGIN; +ALTER TABLE <table> RENAME TO <table>_backup; +ALTER TABLE <table>_v2 RENAME TO <table>
; +COMMIT; +``` + +## Limitations + +- Partitioned tables are not supported +- Disabling and re-enabling sync does **not** re-snapshot — missing changes are lost permanently +- AWS only (Beta). Azure support is not yet available. + +## Cross-references + +- For building Silver/Gold layers from CDC history tables, see the `databricks-pipelines` skill's [medallion-from-cdc.md](../../databricks-pipelines/references/medallion-from-cdc.md) +- For syncing in the reverse direction (UC → Lakebase), see [synced-tables.md](synced-tables.md) diff --git a/skills/databricks-lakebase/references/off-platform.md b/skills/databricks-lakebase/references/off-platform.md new file mode 100644 index 0000000..52f1758 --- /dev/null +++ b/skills/databricks-lakebase/references/off-platform.md @@ -0,0 +1,300 @@ +# Off-Platform Lakebase: Connecting from External Apps + +Connect to Lakebase from apps deployed outside Databricks App Platform (e.g. Vercel, AWS, Netlify, or any Node.js server). + +## Recommended: `@databricks/lakebase` Package + +The simplest way to connect — a drop-in `pg.Pool` replacement with automatic OAuth token refresh. 
+ +```bash +npm install @databricks/lakebase +``` + +**Zero-config usage** (reads from environment variables): + +```typescript +import { createLakebasePool } from "@databricks/lakebase"; + +const pool = createLakebasePool(); +const result = await pool.query("SELECT * FROM users"); +``` + +**Explicit config:** + +```typescript +const pool = createLakebasePool({ + host: "your-lakebase-host.databricks.com", + database: "your_database_name", + endpoint: "projects//branches//endpoints/", + user: "user_id", + max: 10, +}); +``` + +**Key features:** +- Automatic OAuth token refresh (1-hour lifetime, 2-minute buffer) +- Token caching to reduce API calls +- Username resolution: explicit config → `PGUSER` → `DATABRICKS_CLIENT_ID` → API lookup via `getUsernameWithApiLookup()` +- `getLakebaseOrmConfig()` for ORM-compatible connection config +- OpenTelemetry metrics: `lakebase.token.refresh.duration`, `lakebase.query.duration`, pool connection gauges +- Logging: `{ debug, info, warn, error }` boolean flags or custom logger instance + +**ORM integration:** + +```typescript +// Drizzle +import { drizzle } from "drizzle-orm/node-postgres"; +const db = drizzle(pool); + +// Prisma +import { PrismaPg } from "@prisma/adapter-pg"; +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +// TypeORM / Sequelize +import { getLakebaseOrmConfig } from "@databricks/lakebase"; +// Pass getLakebaseOrmConfig() to your ORM's connection config +``` + +## Environment Management + +### Required Environment Variables + +| Variable | Description | How to find | +|----------|-------------|-------------| +| `PGHOST` | Lakebase endpoint host | `databricks postgres list-endpoints projects//branches/production --profile -o json` → `status.hosts.host` | +| `PGDATABASE` | Postgres database name | `databricks postgres list-databases projects//branches/production --profile -o json` → `status.postgres_database` | +| `LAKEBASE_ENDPOINT` | Endpoint resource path | Same 
`list-endpoints` command → `name` field | +| `PGUSER` | Username | Your Databricks email (local dev) or service principal application ID (M2M) | +| `PGSSLMODE` | SSL mode | `require` (default) | +| `PGPORT` | Port | `5432` (default) | + +### Authentication + +**Local dev** — use a short-lived workspace token: +```bash +export DATABRICKS_TOKEN=$(databricks auth token --profile -o json | jq -r '.access_token') +``` + +**Production** — use OAuth M2M credentials: +```bash +export DATABRICKS_CLIENT_ID= +export DATABRICKS_CLIENT_SECRET= +export DATABRICKS_HOST=https://.cloud.databricks.com +``` + +### `.env.example` Template + +```bash +DATABRICKS_HOST=https:// +LAKEBASE_ENDPOINT=projects//branches/production/endpoints/primary +PGHOST= +PGPORT=5432 +PGDATABASE= +PGUSER= +PGSSLMODE=require + +# Option A: local dev, token auth (expires ~1h) +DATABRICKS_TOKEN= + +# Option B: production, M2M auth (service principal) +DATABRICKS_CLIENT_ID= +DATABRICKS_CLIENT_SECRET= +``` + +### Optional: Zod Validation + +For strict fast-fail validation at startup: + +```typescript +import { z } from "zod"; + +const baseSchema = z.object({ + DATABRICKS_HOST: z.string().min(1), + LAKEBASE_ENDPOINT: z.string().min(1), + PGHOST: z.string().min(1), + PGPORT: z.coerce.number().default(5432), + PGDATABASE: z.string().min(1), + PGUSER: z.string().min(1), + PGSSLMODE: z.enum(["require", "prefer", "disable"]).default("require"), + DATABRICKS_TOKEN: z.string().optional(), + DATABRICKS_CLIENT_ID: z.string().optional(), + DATABRICKS_CLIENT_SECRET: z.string().optional(), +}); + +function validateAuth(env: z.infer) { + const hasToken = Boolean(env.DATABRICKS_TOKEN); + const hasM2M = Boolean(env.DATABRICKS_CLIENT_ID) && Boolean(env.DATABRICKS_CLIENT_SECRET); + if (!hasToken && !hasM2M) { + throw new Error("Set DATABRICKS_TOKEN or both DATABRICKS_CLIENT_ID and DATABRICKS_CLIENT_SECRET"); + } + return env; +} + +export const env = validateAuth(baseSchema.parse(process.env)); +``` + +Import `env` at the top of 
your server entry point for fast-fail on missing variables. + +## Manual Token Management + +> **Prefer `@databricks/lakebase`** for Node.js apps — it handles everything below automatically. Use this section only for non-Node.js apps or custom token flows. + +Lakebase requires a **two-token system**: a workspace token + a short-lived Lakebase Postgres credential. + +```typescript +const REFRESH_BUFFER_MS = 2 * 60 * 1000; // Refresh 2 minutes before expiry + +type CachedToken = { value: string; expiresAt: number }; + +let cachedWorkspaceToken: CachedToken | null = null; +let workspaceRefreshPromise: Promise | null = null; +let cachedLakebaseToken: CachedToken | null = null; +let lakebaseRefreshPromise: Promise | null = null; + +function isFresh(token: CachedToken | null): token is CachedToken { + return token !== null && Date.now() < token.expiresAt - REFRESH_BUFFER_MS; +} +``` + +**M2M OIDC flow** (production): + +```typescript +async function fetchWorkspaceTokenM2M(host: string, clientId: string, clientSecret: string): Promise { + const response = await fetch(`${host}/oidc/v1/token`, { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: new URLSearchParams({ + grant_type: "client_credentials", + client_id: clientId, + client_secret: clientSecret, + scope: "all-apis", + }), + }); + if (!response.ok) throw new Error(`M2M token request failed: ${response.status}`); + const data = await response.json() as { access_token: string; expires_in: number }; + return { value: data.access_token, expiresAt: Date.now() + data.expires_in * 1000 }; +} +``` + +**Lakebase credential** (exchange workspace token for Postgres password): + +```typescript +async function fetchLakebaseCredential(host: string, workspaceToken: string): Promise { + const response = await fetch(`${host}/api/2.0/postgres/credentials`, { + method: "POST", + headers: { + Authorization: `Bearer ${workspaceToken}`, + "Content-Type": "application/json", + }, + body: 
JSON.stringify({ endpoint: env.LAKEBASE_ENDPOINT }), + }); + if (!response.ok) throw new Error(`Lakebase credential request failed: ${response.status}`); + const data = await response.json() as { token: string; expire_time: string }; + return { value: data.token, expiresAt: new Date(data.expire_time).getTime() }; +} +``` + +**Concurrent deduplication** — use a singleton promise pattern to avoid duplicate refresh calls: + +```typescript +export async function getLakebasePostgresToken(): Promise { + if (isFresh(cachedLakebaseToken)) return cachedLakebaseToken.value; + if (!lakebaseRefreshPromise) { + lakebaseRefreshPromise = (async () => { + const auth = authStrategyFromEnv(); + const workspaceToken = await getWorkspaceToken(auth); + return fetchLakebaseCredential(env.DATABRICKS_HOST.replace(/\/$/, ""), workspaceToken); + })() + .then((token) => { cachedLakebaseToken = token; return token; }) + .finally(() => { lakebaseRefreshPromise = null; }); + } + return (await lakebaseRefreshPromise).value; +} +``` + +**Local dev refresh script** (`scripts/refresh-lakebase-token.ts`): + +```typescript +import { execSync } from "node:child_process"; +import { readFileSync, writeFileSync, existsSync } from "node:fs"; + +const envFile = process.argv[2] ?? ".env.local"; +const profile = process.env.DATABRICKS_CONFIG_PROFILE ?? "DEFAULT"; +const raw = execSync(`databricks auth token --profile "${profile}" -o json`, { encoding: "utf-8" }); +const parsed = JSON.parse(raw) as { access_token?: string }; +if (!parsed.access_token) throw new Error("Failed to get access token from Databricks CLI"); +if (!existsSync(envFile)) throw new Error(`Env file not found: ${envFile}`); + +const content = readFileSync(envFile, "utf-8"); +const tokenLine = `DATABRICKS_TOKEN="${parsed.access_token}"`; +const updated = content.includes("DATABRICKS_TOKEN=") + ? 
content.replace(/^DATABRICKS_TOKEN=.*/m, tokenLine) + : `${content.trimEnd()}\n${tokenLine}\n`; +writeFileSync(envFile, updated); +console.log(`Updated DATABRICKS_TOKEN in ${envFile}`); +``` + +## Drizzle ORM Integration + +**With `@databricks/lakebase`** (recommended): + +```typescript +import { drizzle } from "drizzle-orm/node-postgres"; +import { createLakebasePool } from "@databricks/lakebase"; +import * as itemsSchema from "@/lib/items/schema"; + +const pool = createLakebasePool(); +export const db = drizzle({ client: pool, schema: { ...itemsSchema } }); +``` + +**Schema per domain** — organize schemas under `src/lib//schema.ts`: + +```typescript +import { pgTable, serial, text, timestamp } from "drizzle-orm/pg-core"; + +export const items = pgTable("items", { + id: serial("id").primaryKey(), + name: text("name").notNull(), + createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(), +}); +``` + +**Migration with Lakebase credentials** — `drizzle-kit` cannot use `pg` password callbacks. 
Build a one-time URL: + +```typescript +// scripts/db-migrate.ts +import { execSync } from "node:child_process"; +import { getLakebasePostgresToken } from "@/lib/lakebase/tokens"; + +async function runMigrations() { + const token = await getLakebasePostgresToken(); + const databaseUrl = + `postgresql://${encodeURIComponent(env.PGUSER)}:${encodeURIComponent(token)}` + + `@${env.PGHOST}:${env.PGPORT}/${env.PGDATABASE}?sslmode=${env.PGSSLMODE}`; + execSync("npx drizzle-kit migrate", { + stdio: "inherit", + env: { ...process.env, DATABASE_URL: databaseUrl }, + }); +} +runMigrations().catch((error) => { console.error(error); process.exit(1); }); +``` + +**`drizzle.config.ts`** — conditional `dbCredentials` (only needed when `DATABASE_URL` is set by migration script): + +```typescript +import { defineConfig } from "drizzle-kit"; + +export default defineConfig({ + schema: "./src/lib/*/schema.ts", + out: "./src/lib/db/migrations", + dialect: "postgresql", + ...(process.env.DATABASE_URL && { + dbCredentials: { url: process.env.DATABASE_URL }, + }), +}); +``` + +**Commands:** +- Generate (local, no DB connection): `npx drizzle-kit generate` +- Migrate (needs credentials): `npx dotenv -e .env.local -- npx tsx scripts/db-migrate.ts` diff --git a/skills/databricks-lakebase/references/pgvector.md b/skills/databricks-lakebase/references/pgvector.md new file mode 100644 index 0000000..5530a63 --- /dev/null +++ b/skills/databricks-lakebase/references/pgvector.md @@ -0,0 +1,149 @@ +# Vector Similarity Search with pgvector + +Use the pgvector extension in Lakebase for embedding-based similarity search (RAG, semantic search, recommendations). 
+ +## Extension Setup + +```bash +databricks psql --project --profile -- -c " + CREATE EXTENSION IF NOT EXISTS vector; +" +``` + +If you get error code `42501` (insufficient privileges), the extension may already exist — this is safe to ignore in `setupVectorTables()`: + +```typescript +try { + await appkit.lakebase.query("CREATE EXTENSION IF NOT EXISTS vector"); +} catch (err: unknown) { + const code = (err as { code?: string }).code; + if (code === "42501") { + console.log("[vectors] Skipping extension creation — insufficient privileges (likely already exists)"); + } else { + throw err; + } +} +``` + +## Table Schema + +```sql +CREATE SCHEMA IF NOT EXISTS vectors; + +CREATE TABLE IF NOT EXISTS vectors.documents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + content TEXT NOT NULL, + embedding VECTOR(1024), + metadata JSONB NOT NULL DEFAULT '{}', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); +``` + +**Dimension matching**: `VECTOR(1024)` must match your embedding model's output dimension. Common Databricks endpoints: +- `databricks-gte-large-en` — 1024 dimensions +- `databricks-bge-large-en` — 1024 dimensions + +If using a different model (768d or 1536d), change `VECTOR(1024)` to match. 
+ +## Vector Store Module + +Create `server/lib/vector-store.ts`: + +```typescript +import type { Application } from "express"; + +interface AppKitWithLakebase { + lakebase: { + query(text: string, params?: unknown[]): Promise<{ rows: Record[] }>; + }; + server: { + extend(fn: (app: Application) => void): void; + }; +} + +export async function setupVectorTables(appkit: AppKitWithLakebase) { + try { + await appkit.lakebase.query("CREATE EXTENSION IF NOT EXISTS vector"); + } catch (err: unknown) { + const code = (err as { code?: string }).code; + if (code === "42501") { + console.log("[vectors] Skipping extension creation — insufficient privileges (likely already exists)"); + } else { + throw err; + } + } + const { rows } = await appkit.lakebase.query( + `SELECT 1 FROM information_schema.tables + WHERE table_schema = 'vectors' AND table_name = 'documents'`, + ); + if (rows.length > 0) return; + await appkit.lakebase.query(`CREATE SCHEMA IF NOT EXISTS vectors`); + await appkit.lakebase.query(` + CREATE TABLE IF NOT EXISTS vectors.documents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + content TEXT NOT NULL, + embedding VECTOR(1024), + metadata JSONB NOT NULL DEFAULT '{}', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); +} + +export async function insertDocument( + appkit: AppKitWithLakebase, + input: { content: string; embedding: number[]; metadata?: Record }, +) { + const result = await appkit.lakebase.query( + `INSERT INTO vectors.documents (content, embedding, metadata) + VALUES ($1, $2::vector, $3) + RETURNING id, content, metadata, created_at`, + [input.content, JSON.stringify(input.embedding), JSON.stringify(input.metadata ?? 
{})], + ); + return result.rows[0]; +} + +export async function retrieveSimilar( + appkit: AppKitWithLakebase, + queryEmbedding: number[], + limit = 5, +) { + const result = await appkit.lakebase.query( + `SELECT id, content, metadata, 1 - (embedding <=> $1::vector) AS similarity + FROM vectors.documents + WHERE embedding IS NOT NULL + ORDER BY embedding <=> $1::vector + LIMIT $2`, + [JSON.stringify(queryEmbedding), limit], + ); + return result.rows; +} +``` + +Call `setupVectorTables(appkit)` from `onPluginsReady` before starting the server. + +## Distance Operators + +| Operator | Distance | Use for | +|----------|----------|---------| +| `<=>` | Cosine | Text similarity (default) | +| `<->` | L2 (Euclidean) | Spatial data | +| `<#>` | Negative inner product | Normalized embeddings (negate the result to get the inner product) | + +Similarity score: `1 - (embedding <=> $1::vector) AS similarity` is the cosine similarity, ranging from -1 to 1 (1 = same direction, 0 = unrelated/orthogonal, negative = opposing). + +## Indexing + +Add an index **after** inserting initial data (IVFFlat needs representative data to build): + +```sql +CREATE INDEX IF NOT EXISTS idx_documents_embedding + ON vectors.documents USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); +ANALYZE vectors.documents; +``` + +For higher recall without tuning, use HNSW instead: `USING hnsw (embedding vector_cosine_ops)`. + +## Cross-references + +- For generating embeddings, see the `databricks-apps` skill's [model-serving.md](../../databricks-apps/references/appkit/model-serving.md) → Embeddings Pattern +- For Lakebase connection patterns, see [connectivity.md](connectivity.md) diff --git a/skills/databricks-lakebase/references/synced-tables.md b/skills/databricks-lakebase/references/synced-tables.md index 9e74882..a661d81 100644 --- a/skills/databricks-lakebase/references/synced-tables.md +++ b/skills/databricks-lakebase/references/synced-tables.md @@ -80,6 +80,9 @@ databricks postgres create-synced-table ..
\ | `create_database_objects_if_missing` | No | Auto-create Postgres schema/database if missing (default: `false`) | | `new_pipeline_spec.storage_catalog` | Yes | A **regular** UC catalog for DLT pipeline metadata (NOT the Lakebase catalog) | | `new_pipeline_spec.storage_schema` | Yes | Schema in the storage catalog for pipeline metadata (e.g. `default`) | +| `timeseries_key` | No | Column for deduplication when source has duplicate PKs (latest wins). Performance penalty. | + +> **Note:** Nulls in PK columns are excluded from sync. Long-running operation; CLI waits by default. Use `--no-wait` to return immediately. @@ -186,6 +189,11 @@ If a Databricks App reads synced tables, the app's Service Principal needs expli - **Naming:** Database, schema, and table names allow `[A-Za-z0-9_]+` only - **Schema evolution:** Only additive changes (adding columns) for Triggered/Continuous modes +**Cost guidance:** +- **Continuous mode:** Reuse pipelines for ~10 tables/pipeline (~$204/table/month) vs separate pipelines (~$2,044/table/month) +- **Cost formula:** `[Rows / (Speed × CUs × 3600)] × DLT Hourly Rate` +- **Snapshot vs incremental:** Snapshot is ~10x faster when >10% of data changes per cycle + ## Lakehouse Sync (Beta) Reverse direction: continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using CDC (SCD Type 2 history). Destination tables are named `lb__history`. Does not require external compute, pipelines, or jobs — it is a native Lakebase feature. Available on AWS and Azure. @@ -215,6 +223,8 @@ Reverse direction: continuously streams changes **from** Lakebase Postgres **int - Partitioned tables are not supported - Disabling and re-enabling sync does **not** re-snapshot — missing changes are lost permanently +For the full Lakehouse Sync reference, see [lakehouse-sync.md](lakehouse-sync.md). 
For building medallion pipelines from CDC history, see the `databricks-pipelines` skill's [medallion-from-cdc.md](../../databricks-pipelines/references/medallion-from-cdc.md). + ## Use Cases **Product catalog:** Sync gold-tier product data to Lakebase for low-latency web app reads. Use Triggered mode for hourly/daily updates. @@ -235,3 +245,4 @@ Reverse direction: continuously streams changes **from** Lakebase Postgres **int - **Read-only in Postgres:** Only SELECT queries, CREATE INDEX, and DROP TABLE are allowed on synced tables. Any data modifications (INSERT, UPDATE, DELETE) corrupt the sync pipeline. - **Null bytes:** Null bytes (0x00) in STRING, ARRAY, MAP, or STRUCT columns cause sync failures. Sanitize source data: `REPLACE(col, CAST(CHAR(0) AS STRING), '')`. - **Unsupported types:** GEOGRAPHY, GEOMETRY, VARIANT, OBJECT columns cannot be synced. +- **FGAC not propagated:** Fine-grained access control (row filters, column masks) from Unity Catalog is not propagated to synced tables. **Workaround:** Create a view on the source table with the desired filter (`SELECT * FROM table WHERE ...`), then sync the view in Snapshot mode. Caveat: the sync runs as the creator and only sees their visible rows. diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index eafedbf..628fd33 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -62,7 +62,8 @@ databricks serving-endpoints create \ "entity_version": "", "min_provisioned_throughput": 0, "max_provisioned_throughput": 0, - "workload_size": "Small" + "workload_size": "Small", + "scale_to_zero_enabled": true }], "traffic_config": { "routes": [{ @@ -73,7 +74,7 @@ databricks serving-endpoints create \ }' --profile ``` -- Discover available Foundation Models: check the `system.ai` catalog in Unity Catalog. 
+- Discover available Foundation Models: check the `system.ai` catalog in Unity Catalog, or use `databricks serving-endpoints list --profile ` to see available endpoints. Use `databricks serving-endpoints get-open-api --profile ` to inspect the endpoint's API schema. - Long-running operation; the CLI waits for completion by default. Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get --profile diff --git a/skills/databricks-pipelines/SKILL.md b/skills/databricks-pipelines/SKILL.md index d08d0b1..6c3d799 100644 --- a/skills/databricks-pipelines/SKILL.md +++ b/skills/databricks-pipelines/SKILL.md @@ -269,3 +269,4 @@ Detailed reference guides for each pipeline API. **Read the relevant guide befor - [Expectations](references/expectations.md) — Define and enforce data quality constraints ([Python](references/expectations-python.md), [SQL](references/expectations-sql.md)) - [Sinks](references/sink.md) — Write to Kafka, Event Hubs, external Delta tables ([Python](references/sink-python.md)) - [ForEachBatch Sinks](references/foreach-batch-sink.md) — Custom streaming sink with per-batch Python logic ([Python](references/foreach-batch-sink-python.md)) +- [Medallion from CDC](references/medallion-from-cdc.md) — Build Silver/Gold layers from Lakehouse Sync CDC history tables diff --git a/skills/databricks-pipelines/references/medallion-from-cdc.md b/skills/databricks-pipelines/references/medallion-from-cdc.md new file mode 100644 index 0000000..81204d7 --- /dev/null +++ b/skills/databricks-pipelines/references/medallion-from-cdc.md @@ -0,0 +1,144 @@ +# Medallion Architecture from CDC History Tables + +Build Silver and Gold analytics layers from Lakehouse Sync CDC history tables using Lakeflow Declarative Pipelines. + +## When to Use + +- You have Lakehouse Sync CDC history tables (`lb_
<table>_history`) in Unity Catalog +- You want Bronze → Silver → Gold layers on top of operational data +- You need clean current-state views, deduplication, and business aggregations for BI, ML, or Genie + +## Layer Mapping + +| Layer | Purpose | Source | Output | +|-------|---------|--------|--------| +| **Bronze** | Raw CDC records with full history | Lakehouse Sync `lb_<table>
_history` tables | No transformation needed; already exist | +| **Silver** | Current state, deduplicated and cleaned | Bronze history tables | One materialized view per entity | +| **Gold** | Business aggregations and KPIs | Silver tables | Materialized views with aggregations | + +## 1. Scaffold a Pipeline Project + +```bash +databricks bundle init lakeflow-pipelines \ + --config-file <(echo '{"project_name": "operational_analytics", "language": "sql", "serverless": "yes"}') \ + --profile < /dev/null +cd operational_analytics +``` + +## 2. Configure Pipeline Catalog and Schema + +Edit `resources/operational_analytics.pipeline.yml`: + +```yaml +resources: + pipelines: + operational_analytics: + name: operational_analytics + catalog: + schema: + development: true + serverless: true + libraries: + - file: + path: src/ +``` + +## 3. Silver Layer: Current State from CDC + +For each entity, create `src/silver_.sql`: + +```sql +CREATE OR REFRESH MATERIALIZED VIEW silver_ +COMMENT "Current state of records, deduplicated from CDC history" +AS +SELECT * EXCEPT (rn, _change_type, _lsn, _commit_timestamp) +FROM ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY + ORDER BY _lsn DESC + ) AS rn + FROM ..lb__history + WHERE _change_type IN ('insert', 'update_postimage', 'delete') +) +WHERE rn = 1 + AND _change_type != 'delete' +``` + +Replace ``, `.`, and `` with your values. + +## 4. Gold Layer: Business Aggregations + +Create `src/gold_.sql`: + +```sql +CREATE OR REFRESH MATERIALIZED VIEW gold_daily_order_summary +COMMENT "Daily order counts and revenue by status" +AS +SELECT + DATE_TRUNC('day', created_at) AS order_date, + status, + COUNT(*) AS order_count, + SUM(total_amount) AS total_revenue +FROM silver_orders +GROUP BY DATE_TRUNC('day', created_at), status +``` + +Gold tables read from silver tables within the same pipeline. + +## 5. 
Data Quality Expectations + +Add constraints to silver or gold tables: + +```sql +CREATE OR REFRESH MATERIALIZED VIEW silver_ ( + CONSTRAINT valid_primary_key EXPECT ( IS NOT NULL) ON VIOLATION DROP ROW, + CONSTRAINT valid_timestamp EXPECT (created_at IS NOT NULL) ON VIOLATION DROP ROW +) +COMMENT "Current state of records with quality enforcement" +AS +SELECT ... +``` + +## 6. Deploy and Run + +```bash +databricks bundle validate --profile +databricks bundle deploy -t dev --profile +databricks bundle run operational_analytics -t dev --profile +``` + +## 7. Schedule Ongoing Refreshes + +Create `resources/operational_analytics_job.job.yml`: + +```yaml +resources: + jobs: + operational_analytics_job: + trigger: + periodic: + interval: 1 + unit: HOURS + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.operational_analytics.id} +``` + +Deploy: `databricks bundle deploy -t dev --profile ` + +## Troubleshooting + +| Issue | Fix | +|-------|-----| +| Silver table returns no rows | Verify bronze history table has data: `SELECT COUNT(*) FROM lb__history` | +| `TABLE_OR_VIEW_NOT_FOUND` for bronze table | Use fully-qualified name: `..lb__history` | +| Gold aggregation includes deleted records | Confirm silver layer filters `_change_type != 'delete'` | +| Pipeline fails on deploy | Run `databricks bundle validate` first to catch config errors | +| Incremental refresh not picking up changes | Verify Lakehouse Sync is active and bronze table is updating | + +## Cross-references + +- For Lakehouse Sync setup, see the `databricks-lakebase` skill's [lakehouse-sync.md](../../databricks-lakebase/references/lakehouse-sync.md) +- For synced tables (UC → Lakebase direction), see [synced-tables.md](../../databricks-lakebase/references/synced-tables.md) From af958cda8895998125371b01bff5416a504ad8da Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Thu, 14 May 2026 19:45:50 +0200 Subject: [PATCH 2/7] Fix review findings across DevHub recipe 
absorption - model-serving: fix placeholder style (), add cloud-variant comment for AI Gateway URL, clarify tRPC heading, add off-platform note for AI SDK v6 pattern - lakehouse-sync + synced-tables: fix Lakehouse Sync availability to AWS/Azure/GCP per official docs - genie: add missing genie_space_name variable in Multi-Space snippet - off-platform: add verify-full/verify-ca to PGSSLMODE Zod enum, add cross-references section - pgvector: reorder schema creation before table existence check - SKILL.md: clarify State Storage Rule vs Decision Gate scope - lakebase: add platform proxy context for x-forwarded-email header Co-authored-by: Isaac --- skills/databricks-apps/SKILL.md | 2 +- skills/databricks-apps/references/appkit/genie.md | 3 +++ skills/databricks-apps/references/appkit/lakebase.md | 2 +- .../databricks-apps/references/appkit/model-serving.md | 9 +++++---- skills/databricks-lakebase/references/lakehouse-sync.md | 2 +- skills/databricks-lakebase/references/off-platform.md | 8 +++++++- skills/databricks-lakebase/references/pgvector.md | 2 +- skills/databricks-lakebase/references/synced-tables.md | 2 +- 8 files changed, 20 insertions(+), 10 deletions(-) diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index e4ca621..abf8780 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -69,7 +69,7 @@ If the user's app description implies storing or persisting data — forms, CRUD 2. Scaffold with `--features lakebase` and pass `--set lakebase.postgres.branch= --set lakebase.postgres.database=`. 3. If the app **also** reads from Unity Catalog tables, proceed to the Data Access Decision Gate below to determine whether to add `--features analytics` or use Lakebase synced tables. -This is not optional — any app that writes user-generated data needs Lakebase. +This rule governs **state storage** only. For how the app reads existing lakehouse data, proceed to the Decision Gate below. 
This is not optional — any app that writes user-generated data needs Lakebase. ## Development Workflow (FOLLOW THIS ORDER) diff --git a/skills/databricks-apps/references/appkit/genie.md b/skills/databricks-apps/references/appkit/genie.md index 80d6c9f..5975d78 100644 --- a/skills/databricks-apps/references/appkit/genie.md +++ b/skills/databricks-apps/references/appkit/genie.md @@ -159,6 +159,8 @@ This section covers the **deployment-specific patterns** for multi-space Genie a variables: genie_space_id: description: Default Genie space ID (required by AppKit) + genie_space_name: + description: Default Genie space name genie_space_sales_id: description: Sales Genie space ID genie_space_support_id: @@ -190,6 +192,7 @@ targets: default: variables: genie_space_id: + genie_space_name: genie_space_sales_id: genie_space_support_id: ``` diff --git a/skills/databricks-apps/references/appkit/lakebase.md b/skills/databricks-apps/references/appkit/lakebase.md index 328ace5..abfe862 100644 --- a/skills/databricks-apps/references/appkit/lakebase.md +++ b/skills/databricks-apps/references/appkit/lakebase.md @@ -245,7 +245,7 @@ export async function appendMessage(appkit, input: { chatId: string; role: strin } ``` -**User identity**: In deployed apps, use `req.header("x-forwarded-email")`. For local dev, hardcode a test user ID. +**User identity**: In deployed apps, use `req.header("x-forwarded-email")` (injected by the Databricks Apps platform proxy; for off-platform deployments, use your own auth middleware). For local dev, hardcode a test user ID. **History endpoints**: - `GET /api/chats` — list chats for current user diff --git a/skills/databricks-apps/references/appkit/model-serving.md b/skills/databricks-apps/references/appkit/model-serving.md index acc5add..f8ebcf7 100644 --- a/skills/databricks-apps/references/appkit/model-serving.md +++ b/skills/databricks-apps/references/appkit/model-serving.md @@ -56,7 +56,7 @@ env: The injected value is the endpoint **name** (not a URL). 
Use it in server-side code to call the endpoint. -## Non-Streaming Query Pattern +## Non-Streaming Query Pattern (tRPC) Always use tRPC for model serving calls — do NOT call endpoints directly from the client. @@ -94,11 +94,11 @@ const result = await trpc.queryModel.query({ prompt: userInput }); const answer = result.choices?.[0]?.message?.content; ``` -For AppKit's built-in serving plugin streaming (SSE via `stream()` and `useServingStream`), see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. The patterns below are for direct AI SDK v6 integration with Databricks AI Gateway. +For AppKit's built-in serving plugin streaming (SSE via `stream()` and `useServingStream`), see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. The patterns below are for apps deployed **outside** Databricks Apps (e.g., Vercel, AWS, standalone Node.js servers) using direct AI SDK v6 integration with Databricks AI Gateway. For AppKit-based apps, use the built-in serving plugin above. ## AI SDK v6 Streaming Pattern -Use this pattern for streaming AI chat with Databricks AI Gateway and Vercel AI SDK v6. +Use this pattern for streaming AI chat with Databricks AI Gateway and Vercel AI SDK v6 in off-platform apps. 
**Dependencies:** `ai@6`, `@ai-sdk/react@3`, `@ai-sdk/openai`, `@databricks/sdk-experimental` @@ -151,9 +151,10 @@ app.post("/api/chat", async (req, res) => { try { const token = await getDatabricksToken(); - const endpoint = process.env.DATABRICKS_ENDPOINT || ""; + const endpoint = process.env.DATABRICKS_ENDPOINT || ""; // AI Gateway URL uses /mlflow/v1 path, NOT /openai/v1 + // URL varies by cloud: .cloud.databricks.com (AWS), .azuredatabricks.net (Azure), .gcp.databricks.com (GCP) const databricks = createOpenAI({ baseURL: `https://${process.env.DATABRICKS_WORKSPACE_ID}.ai-gateway.cloud.databricks.com/mlflow/v1`, apiKey: token, diff --git a/skills/databricks-lakebase/references/lakehouse-sync.md b/skills/databricks-lakebase/references/lakehouse-sync.md index fceb169..6882403 100644 --- a/skills/databricks-lakebase/references/lakehouse-sync.md +++ b/skills/databricks-lakebase/references/lakehouse-sync.md @@ -144,7 +144,7 @@ COMMIT; - Partitioned tables are not supported - Disabling and re-enabling sync does **not** re-snapshot — missing changes are lost permanently -- AWS only (Beta). Azure support is not yet available. +- Available on AWS, Azure, and GCP. 
## Cross-references diff --git a/skills/databricks-lakebase/references/off-platform.md b/skills/databricks-lakebase/references/off-platform.md index 52f1758..a7ee354 100644 --- a/skills/databricks-lakebase/references/off-platform.md +++ b/skills/databricks-lakebase/references/off-platform.md @@ -116,7 +116,7 @@ const baseSchema = z.object({ PGPORT: z.coerce.number().default(5432), PGDATABASE: z.string().min(1), PGUSER: z.string().min(1), - PGSSLMODE: z.enum(["require", "prefer", "disable"]).default("require"), + PGSSLMODE: z.enum(["require", "verify-full", "verify-ca", "prefer", "disable"]).default("require"), DATABRICKS_TOKEN: z.string().optional(), DATABRICKS_CLIENT_ID: z.string().optional(), DATABRICKS_CLIENT_SECRET: z.string().optional(), @@ -298,3 +298,9 @@ export default defineConfig({ **Commands:** - Generate (local, no DB connection): `npx drizzle-kit generate` - Migrate (needs credentials): `npx dotenv -e .env.local -- npx tsx scripts/db-migrate.ts` + +## Cross-references + +- For on-platform connection patterns, see [connectivity.md](connectivity.md) +- For vector similarity search with pgvector, see [pgvector.md](pgvector.md) +- For AppKit-based Lakebase integration, see the `databricks-apps` skill's [lakebase.md](../../databricks-apps/references/appkit/lakebase.md) diff --git a/skills/databricks-lakebase/references/pgvector.md b/skills/databricks-lakebase/references/pgvector.md index 5530a63..80ddb9e 100644 --- a/skills/databricks-lakebase/references/pgvector.md +++ b/skills/databricks-lakebase/references/pgvector.md @@ -72,12 +72,12 @@ export async function setupVectorTables(appkit: AppKitWithLakebase) { throw err; } } + await appkit.lakebase.query(`CREATE SCHEMA IF NOT EXISTS vectors`); const { rows } = await appkit.lakebase.query( `SELECT 1 FROM information_schema.tables WHERE table_schema = 'vectors' AND table_name = 'documents'`, ); if (rows.length > 0) return; - await appkit.lakebase.query(`CREATE SCHEMA IF NOT EXISTS vectors`); await 
appkit.lakebase.query(` CREATE TABLE IF NOT EXISTS vectors.documents ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), diff --git a/skills/databricks-lakebase/references/synced-tables.md b/skills/databricks-lakebase/references/synced-tables.md index a661d81..0d139df 100644 --- a/skills/databricks-lakebase/references/synced-tables.md +++ b/skills/databricks-lakebase/references/synced-tables.md @@ -196,7 +196,7 @@ If a Databricks App reads synced tables, the app's Service Principal needs expli ## Lakehouse Sync (Beta) -Reverse direction: continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using CDC (SCD Type 2 history). Destination tables are named `lb__history`. Does not require external compute, pipelines, or jobs — it is a native Lakebase feature. Available on AWS and Azure. +Reverse direction: continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using CDC (SCD Type 2 history). Destination tables are named `lb__history`. Does not require external compute, pipelines, or jobs — it is a native Lakebase feature. Available on AWS, Azure, and GCP. > **Important:** Tables must reside in the `databricks_postgres` database for Lakehouse Sync to work. 
From ca465a866ddb8f99bf1cecee5774f53683ba7f13 Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Fri, 15 May 2026 11:18:16 +0200 Subject: [PATCH 3/7] Fix skill file issues from review - Move off-platform AI SDK v6 streaming/embeddings patterns from databricks-apps to databricks-model-serving (they don't use AppKit) - Replace deprecated server({ autoStart: false }) + .then() + .start() with onPluginsReady callback in lakebase.md - Clarify OBO scope requirement in files.md (only for .asUser(req) API) - Deduplicate scaffolding instructions in lakebase.md (reference SKILL.md) - Fix databricks apps logs to use --follow for streaming - Drop stale dollar costs in synced-tables.md, keep relative guidance - Move medallion-from-cdc.md from databricks-pipelines to databricks-lakebase (it's a Lakehouse Sync continuation, not a general pipeline pattern) Co-authored-by: Isaac --- skills/databricks-apps/SKILL.md | 2 +- .../references/appkit/files.md | 6 +- .../references/appkit/lakebase.md | 44 +---- .../references/appkit/model-serving.md | 174 +---------------- .../references/lakehouse-sync.md | 2 +- .../references/medallion-from-cdc.md | 0 .../references/synced-tables.md | 4 +- .../references/off-platform-streaming.md | 180 ++++++++++++++++++ 8 files changed, 199 insertions(+), 213 deletions(-) rename skills/{databricks-pipelines => databricks-lakebase}/references/medallion-from-cdc.md (100%) create mode 100644 skills/databricks-model-serving/references/off-platform-streaming.md diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index abf8780..b9b12db 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -211,5 +211,5 @@ After deploying, verify the app is running: ```bash databricks apps get --profile -o json # Check app_status.state: RUNNING -databricks apps logs --profile # Stream BUILD/SYSTEM/APP logs +databricks apps logs --follow --profile # Stream live logs (Ctrl+C to stop) ``` diff --git 
a/skills/databricks-apps/references/appkit/files.md b/skills/databricks-apps/references/appkit/files.md index af18571..108d9c0 100644 --- a/skills/databricks-apps/references/appkit/files.md +++ b/skills/databricks-apps/references/appkit/files.md @@ -231,14 +231,14 @@ const handleCreateDirectory = async (name: string) => { ## Resource Requirements -Each volume key requires a resource with `WRITE_VOLUME` permission and `user_api_scopes` for on-behalf-of (OBO) token access. Declare in `databricks.yml`: +Each volume key requires a resource with `WRITE_VOLUME` permission. Declare in `databricks.yml`: ```yaml resources: apps: my_app: user_api_scopes: - - files.files # Required for OBO token access in production + - files.files # Needed when using .asUser(req) programmatic API resources: - name: uploads-volume volume: @@ -246,6 +246,8 @@ resources: permission: WRITE_VOLUME ``` +> **Note:** The scaffolded HTTP routes (`/api/files/...`) execute as the service principal and do not require `user_api_scopes`. The scope is needed when using the programmatic `appkit.files("key").asUser(req)` API for per-user Volume access. + Wire the env var in `app.yaml`: ```yaml diff --git a/skills/databricks-apps/references/appkit/lakebase.md b/skills/databricks-apps/references/appkit/lakebase.md index abfe862..b4752bf 100644 --- a/skills/databricks-apps/references/appkit/lakebase.md +++ b/skills/databricks-apps/references/appkit/lakebase.md @@ -14,30 +14,7 @@ Use Lakebase when your app needs **persistent read/write storage** — forms, CR ## Scaffolding -**ALWAYS scaffold with the correct feature flags** — do not add Lakebase manually to an analytics-only scaffold. 
- -**Lakebase only** (no analytics SQL warehouse): -```bash -databricks apps init --name --features lakebase \ - --set "lakebase.postgres.branch=" \ - --set "lakebase.postgres.database=" \ - --run none --profile -``` - -**Both Lakebase and analytics**: -```bash -databricks apps init --name --features analytics,lakebase \ - --set "analytics.sql-warehouse.id=" \ - --set "lakebase.postgres.branch=" \ - --set "lakebase.postgres.database=" \ - --run none --profile -``` - -Where `` and `` are full resource names (e.g. `projects//branches/` and `projects//branches//databases/`). - -Use the `databricks-lakebase` skill to create a Lakebase project and discover branch/database resource names before running this command. - -> For multi-environment deployments (dev/prod), use `variables:` and `targets:` blocks in `databricks.yml` — see the **`databricks-dabs`** skill for patterns. +Scaffold with `--features lakebase` as described in the parent SKILL.md (State Storage Rule and Scaffolding section). Use the **`databricks-lakebase`** skill to create a Lakebase project and discover branch/database resource names first. **Naming conventions:** Use domain names for user-facing code (`ItemsPage.tsx`, `/api/items`, `item-routes.ts`). Keep `lakebase` naming only for infrastructure config (`lakebase()` plugin, `LAKEBASE_ENDPOINT`, `postgres` app resource). @@ -94,18 +71,17 @@ The `lakebase()` plugin auto-configures from platform-injected env vars at deplo ## CRUD Routes Pattern -Always use server-side routes for Lakebase operations — do NOT call `appkit.lakebase.query()` from the client. Use `server.extend()` to register Express routes: +Always use server-side routes for Lakebase operations — do NOT call `appkit.lakebase.query()` from the client. 
Use `onPluginsReady` to initialize the schema and register Express routes: ```typescript // server/server.ts import { createApp, server, lakebase } from "@databricks/appkit"; import { z } from 'zod'; -createApp({ - plugins: [server({ autoStart: false }), lakebase()], -}) - .then(async (appkit) => { - // Schema init (runs once at startup) +await createApp({ + plugins: [server(), lakebase()], + async onPluginsReady(appkit) { + // Schema init (runs once before server accepts requests) await appkit.lakebase.query(` CREATE SCHEMA IF NOT EXISTS app_data; CREATE TABLE IF NOT EXISTS app_data.items ( @@ -141,17 +117,15 @@ createApp({ res.status(204).send(); }); }); - - await appkit.server.start(); - }) - .catch(console.error); + }, +}); ``` > **Deploy first (App + Lakebase only)!** When your Databricks App uses Lakebase, the Service Principal must create and own the schema. Run `databricks apps deploy` before any local development. See **`databricks-lakebase`** skill's **Schema Permissions for Deployed Apps** for details. ## Schema Initialization -**Always create a custom schema** — the Service Principal cannot access any existing schemas (including `public`). It must create the schema itself to become its owner. See **`databricks-lakebase`** skill's **Schema Permissions for Deployed Apps** for the full permission model and deploy-first workflow. Initialize tables inside the `.then()` callback before registering routes (see CRUD pattern above): +**Always create a custom schema** — the Service Principal cannot access any existing schemas (including `public`). It must create the schema itself to become its owner. See **`databricks-lakebase`** skill's **Schema Permissions for Deployed Apps** for the full permission model and deploy-first workflow. 
Initialize tables inside the `onPluginsReady` callback before registering routes (see CRUD pattern above): ```typescript // Inside onPluginsReady — runs once at startup before handling requests diff --git a/skills/databricks-apps/references/appkit/model-serving.md b/skills/databricks-apps/references/appkit/model-serving.md index f8ebcf7..40a8772 100644 --- a/skills/databricks-apps/references/appkit/model-serving.md +++ b/skills/databricks-apps/references/appkit/model-serving.md @@ -94,177 +94,9 @@ const result = await trpc.queryModel.query({ prompt: userInput }); const answer = result.choices?.[0]?.message?.content; ``` -For AppKit's built-in serving plugin streaming (SSE via `stream()` and `useServingStream`), see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. The patterns below are for apps deployed **outside** Databricks Apps (e.g., Vercel, AWS, standalone Node.js servers) using direct AI SDK v6 integration with Databricks AI Gateway. For AppKit-based apps, use the built-in serving plugin above. +For AppKit's built-in serving plugin streaming (SSE via `stream()` and `useServingStream`), see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. -## AI SDK v6 Streaming Pattern - -Use this pattern for streaming AI chat with Databricks AI Gateway and Vercel AI SDK v6 in off-platform apps. 
- -**Dependencies:** `ai@6`, `@ai-sdk/react@3`, `@ai-sdk/openai`, `@databricks/sdk-experimental` - -**Auth helper** — works for both local dev (CLI profile) and deployed apps (service principal token): - -```typescript -import { Config } from "@databricks/sdk-experimental"; - -async function getDatabricksToken() { - if (process.env.DATABRICKS_TOKEN) { - return process.env.DATABRICKS_TOKEN; - } - const config = new Config({ - profile: process.env.DATABRICKS_CONFIG_PROFILE || "DEFAULT", - }); - await config.ensureResolved(); - const headers = new Headers(); - await config.authenticate(headers); - const authHeader = headers.get("Authorization"); - if (!authHeader) { - throw new Error( - "Failed to get Databricks token. Check your CLI profile or set DATABRICKS_TOKEN.", - ); - } - return authHeader.replace("Bearer ", ""); -} -``` - -**Server route** (`POST /api/chat`): - -```typescript -import { createOpenAI } from "@ai-sdk/openai"; -import { streamText, type UIMessage } from "ai"; - -app.post("/api/chat", async (req, res) => { - const { messages } = req.body; - - // AI SDK v6 client sends UIMessage objects with a parts array. - // Convert to CoreMessage format for streamText(). - const coreMessages = (messages as UIMessage[]).map((m) => ({ - role: m.role as "user" | "assistant" | "system", - content: - m.parts - ?.filter((p) => p.type === "text" && p.text) - .map((p) => p.text) - .join("") ?? - m.content ?? 
- "", - })); - - try { - const token = await getDatabricksToken(); - const endpoint = process.env.DATABRICKS_ENDPOINT || ""; - - // AI Gateway URL uses /mlflow/v1 path, NOT /openai/v1 - // URL varies by cloud: .cloud.databricks.com (AWS), .azuredatabricks.net (Azure), .gcp.databricks.com (GCP) - const databricks = createOpenAI({ - baseURL: `https://${process.env.DATABRICKS_WORKSPACE_ID}.ai-gateway.cloud.databricks.com/mlflow/v1`, - apiKey: token, - }); - - const result = streamText({ - model: databricks.chat(endpoint), - messages: coreMessages, - maxOutputTokens: 1000, - }); - - result.pipeTextStreamToResponse(res); - } catch (err) { - const message = (err as Error).message; - console.error(`[chat] Streaming request failed:`, message); - res.status(502).json({ error: "Chat request failed", detail: message }); - } -}); -``` - -**Environment variables:** -- `DATABRICKS_WORKSPACE_ID` — auto-discovered by AppKit at runtime; for explicit setup: `databricks api get /api/2.1/unity-catalog/current-metastore-assignment --profile ` → `workspace_id` field -- `DATABRICKS_ENDPOINT` — model endpoint name (e.g. `databricks-meta-llama-3-3-70b-instruct`). Run `databricks serving-endpoints list --profile ` to see available models. - -## Streaming Client Pattern (AI SDK v6) - -```tsx -import { useChat } from "@ai-sdk/react"; -import { TextStreamChatTransport } from "ai"; -import { useState } from "react"; - -export function ChatPage() { - const [input, setInput] = useState(""); - - const { messages, sendMessage, status } = useChat({ - transport: new TextStreamChatTransport({ api: "/api/chat" }), - }); - - return ( -
-
- {messages.map((m) => ( -
- - {m.role === "user" ? "You" : "Assistant"} - - {m.parts.map((part, i) => - part.type === "text" ? ( -

- {part.text} -

- ) : null, - )} -
- ))} - {status === "submitted" &&
Loading...
} -
-
{ - e.preventDefault(); - if (input.trim()) { - void sendMessage({ text: input }); - setInput(""); - } - }} - className="border-t p-4 flex gap-2" - > - setInput(e.target.value)} - placeholder="Ask a question..." - className="flex-1 border rounded px-3 py-2" - disabled={status !== "ready"} - /> - - -
- ); -} -``` - -Key differences from AI SDK v5: use `sendMessage({ text })` (NOT `append`), render `m.parts` array (NOT `m.content`), and `status` states are `ready`, `submitted`, `streaming`. - -## Embeddings Pattern - -Generate text embeddings using a Databricks AI Gateway endpoint. - -```typescript -import { getWorkspaceClient } from "@databricks/appkit"; - -const workspaceClient = getWorkspaceClient({}); - -export async function generateEmbedding(text: string): Promise { - const endpoint = - process.env.DATABRICKS_EMBEDDING_ENDPOINT || "databricks-gte-large-en"; - const result = await workspaceClient.servingEndpoints.query({ - name: endpoint, - input: text, - }); - return result.data![0].embedding!; -} -``` - -Common embedding endpoints: `databricks-gte-large-en` (1024d), `databricks-bge-large-en` (1024d). Set `DATABRICKS_EMBEDDING_ENDPOINT` in `.env` and `app.yaml`. - -For vector similarity search with these embeddings, see the `databricks-lakebase` skill's [pgvector.md](../../../databricks-lakebase/references/pgvector.md). +For off-platform streaming (AI SDK v6 with Databricks AI Gateway), see the **`databricks-model-serving`** skill. 
## Troubleshooting @@ -274,5 +106,3 @@ For vector similarity search with these embeddings, see the `databricks-lakebase | `SERVING_ENDPOINT` env var empty | Missing env injection | Add `valueFrom: serving-endpoint` to `app.yaml` env section | | 504 Gateway Timeout | Inference exceeds 120s proxy limit | Reduce `max_tokens` or use WebSockets — see [Platform Guide](../platform-guide.md) | | `getExecutionContext` undefined | Called outside AppKit server context | Ensure call is inside a tRPC procedure on the server side | -| 502 from AI Gateway | Token expired or invalid endpoint | Refresh token via `getDatabricksToken()`; verify endpoint exists | -| `TextStreamChatTransport` not found | Wrong AI SDK version | Requires `ai@6` and `@ai-sdk/react@3` | diff --git a/skills/databricks-lakebase/references/lakehouse-sync.md b/skills/databricks-lakebase/references/lakehouse-sync.md index 6882403..c780faf 100644 --- a/skills/databricks-lakebase/references/lakehouse-sync.md +++ b/skills/databricks-lakebase/references/lakehouse-sync.md @@ -148,5 +148,5 @@ COMMIT; ## Cross-references -- For building Silver/Gold layers from CDC history tables, see the `databricks-pipelines` skill's [medallion-from-cdc.md](../../databricks-pipelines/references/medallion-from-cdc.md) +- For building Silver/Gold layers from CDC history tables, see [medallion-from-cdc.md](medallion-from-cdc.md) - For syncing in the reverse direction (UC → Lakebase), see [synced-tables.md](synced-tables.md) diff --git a/skills/databricks-pipelines/references/medallion-from-cdc.md b/skills/databricks-lakebase/references/medallion-from-cdc.md similarity index 100% rename from skills/databricks-pipelines/references/medallion-from-cdc.md rename to skills/databricks-lakebase/references/medallion-from-cdc.md diff --git a/skills/databricks-lakebase/references/synced-tables.md b/skills/databricks-lakebase/references/synced-tables.md index 0d139df..a4b82da 100644 --- a/skills/databricks-lakebase/references/synced-tables.md +++ 
b/skills/databricks-lakebase/references/synced-tables.md @@ -190,8 +190,8 @@ If a Databricks App reads synced tables, the app's Service Principal needs expli - **Schema evolution:** Only additive changes (adding columns) for Triggered/Continuous modes **Cost guidance:** -- **Continuous mode:** Reuse pipelines for ~10 tables/pipeline (~$204/table/month) vs separate pipelines (~$2,044/table/month) -- **Cost formula:** `[Rows / (Speed × CUs × 3600)] × DLT Hourly Rate` +- **Continuous mode:** Reuse pipelines for ~10 tables/pipeline — roughly 10x cheaper per table than separate pipelines +- **Cost formula:** `[Rows / (Speed × CUs × 3600)] × DLT Hourly Rate` (check current DLT pricing for your cloud/region) - **Snapshot vs incremental:** Snapshot is ~10x faster when >10% of data changes per cycle ## Lakehouse Sync (Beta) diff --git a/skills/databricks-model-serving/references/off-platform-streaming.md b/skills/databricks-model-serving/references/off-platform-streaming.md new file mode 100644 index 0000000..5199baa --- /dev/null +++ b/skills/databricks-model-serving/references/off-platform-streaming.md @@ -0,0 +1,180 @@ +# Off-Platform Streaming with AI SDK v6 + +These patterns are for apps deployed **outside** Databricks Apps (e.g., Vercel, AWS, standalone Node.js servers) using direct AI SDK v6 integration with Databricks AI Gateway. For AppKit-based apps, use the **`databricks-apps`** skill's built-in serving plugin instead. + +## AI SDK v6 Streaming Pattern + +Use this pattern for streaming AI chat with Databricks AI Gateway and Vercel AI SDK v6 in off-platform apps. 
+ +**Dependencies:** `ai@6`, `@ai-sdk/react@3`, `@ai-sdk/openai`, `@databricks/sdk-experimental` + +**Auth helper** — works for both local dev (CLI profile) and deployed apps (service principal token): + +```typescript +import { Config } from "@databricks/sdk-experimental"; + +async function getDatabricksToken() { + if (process.env.DATABRICKS_TOKEN) { + return process.env.DATABRICKS_TOKEN; + } + const config = new Config({ + profile: process.env.DATABRICKS_CONFIG_PROFILE || "DEFAULT", + }); + await config.ensureResolved(); + const headers = new Headers(); + await config.authenticate(headers); + const authHeader = headers.get("Authorization"); + if (!authHeader) { + throw new Error( + "Failed to get Databricks token. Check your CLI profile or set DATABRICKS_TOKEN.", + ); + } + return authHeader.replace("Bearer ", ""); +} +``` + +**Server route** (`POST /api/chat`): + +```typescript +import { createOpenAI } from "@ai-sdk/openai"; +import { streamText, type UIMessage } from "ai"; + +app.post("/api/chat", async (req, res) => { + const { messages } = req.body; + + // AI SDK v6 client sends UIMessage objects with a parts array. + // Convert to CoreMessage format for streamText(). + const coreMessages = (messages as UIMessage[]).map((m) => ({ + role: m.role as "user" | "assistant" | "system", + content: + m.parts + ?.filter((p) => p.type === "text" && p.text) + .map((p) => p.text) + .join("") ?? + m.content ?? 
+ "", + })); + + try { + const token = await getDatabricksToken(); + const endpoint = process.env.DATABRICKS_ENDPOINT || ""; + + // AI Gateway URL uses /mlflow/v1 path, NOT /openai/v1 + // Host suffix varies by cloud; the AWS form (.cloud.databricks.com) is hard-coded below — use .azuredatabricks.net on Azure or .gcp.databricks.com on GCP + const databricks = createOpenAI({ + baseURL: `https://${process.env.DATABRICKS_WORKSPACE_ID}.ai-gateway.cloud.databricks.com/mlflow/v1`, + apiKey: token, + }); + + const result = streamText({ + model: databricks.chat(endpoint), + messages: coreMessages, + maxOutputTokens: 1000, + }); + + result.pipeTextStreamToResponse(res); + } catch (err) { + const message = (err as Error).message; + console.error(`[chat] Streaming request failed:`, message); + res.status(502).json({ error: "Chat request failed", detail: message }); + } +}); +``` + +**Environment variables:** +- `DATABRICKS_WORKSPACE_ID` — workspace ID interpolated into the AI Gateway base URL above. To look it up: `databricks api get /api/2.1/unity-catalog/current-metastore-assignment --profile ` → `workspace_id` field +- `DATABRICKS_ENDPOINT` — model endpoint name (e.g. `databricks-meta-llama-3-3-70b-instruct`). Run `databricks serving-endpoints list --profile ` to see available models. + +## Streaming Client Pattern (AI SDK v6) + +```tsx +import { useChat } from "@ai-sdk/react"; +import { TextStreamChatTransport } from "ai"; +import { useState } from "react"; + +export function ChatPage() { + const [input, setInput] = useState(""); + + const { messages, sendMessage, status } = useChat({ + transport: new TextStreamChatTransport({ api: "/api/chat" }), + }); + + return ( +
+
+ {messages.map((m) => ( +
+ + {m.role === "user" ? "You" : "Assistant"} + + {m.parts.map((part, i) => + part.type === "text" ? ( +

+ {part.text} +

+ ) : null, + )} +
+ ))} + {status === "submitted" &&
Loading...
} +
+
{ + e.preventDefault(); + if (input.trim()) { + void sendMessage({ text: input }); + setInput(""); + } + }} + className="border-t p-4 flex gap-2" + > + setInput(e.target.value)} + placeholder="Ask a question..." + className="flex-1 border rounded px-3 py-2" + disabled={status !== "ready"} + /> + + +
+ ); +} +``` + +Key differences from AI SDK v4: use `sendMessage({ text })` (NOT `append`), render `m.parts` array (NOT `m.content`), and `status` states are `ready`, `submitted`, `streaming`. + +## Embeddings Pattern + +Generate text embeddings using a Databricks AI Gateway endpoint. Note: unlike the streaming pattern above, this snippet imports `getWorkspaceClient` from `@databricks/appkit`, so it depends on AppKit. + +```typescript +import { getWorkspaceClient } from "@databricks/appkit"; + +const workspaceClient = getWorkspaceClient({}); + +export async function generateEmbedding(text: string): Promise<number[]> { + const endpoint = + process.env.DATABRICKS_EMBEDDING_ENDPOINT || "databricks-gte-large-en"; + const result = await workspaceClient.servingEndpoints.query({ + name: endpoint, + input: text, + }); + return result.data![0].embedding!; +} +``` + +Common embedding endpoints: `databricks-gte-large-en` (1024d), `databricks-bge-large-en` (1024d). Set `DATABRICKS_EMBEDDING_ENDPOINT` in `.env` (and in `app.yaml` when deploying to Databricks Apps). + +For vector similarity search with these embeddings, see the **`databricks-lakebase`** skill. + +## Troubleshooting + +| Error | Cause | Solution | +|-------|-------|---------| +| 502 from AI Gateway | Token expired or invalid endpoint | Refresh token via `getDatabricksToken()`; verify endpoint exists | +| `TextStreamChatTransport` not found | Wrong AI SDK version | Requires `ai@6` and `@ai-sdk/react@3` | From 02ab9d2209b1791d4aefe40ab6a4f205954b959e Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Fri, 15 May 2026 11:21:15 +0200 Subject: [PATCH 4/7] Restore scaffolding instructions in lakebase.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the full --features lakebase commands with --set flags inline rather than referencing SKILL.md — agents need copy-pasteable commands when working on a Lakebase app. 
Co-authored-by: Isaac --- .../references/appkit/lakebase.md | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/skills/databricks-apps/references/appkit/lakebase.md b/skills/databricks-apps/references/appkit/lakebase.md index b4752bf..1e2e22e 100644 --- a/skills/databricks-apps/references/appkit/lakebase.md +++ b/skills/databricks-apps/references/appkit/lakebase.md @@ -14,7 +14,30 @@ Use Lakebase when your app needs **persistent read/write storage** — forms, CR ## Scaffolding -Scaffold with `--features lakebase` as described in the parent SKILL.md (State Storage Rule and Scaffolding section). Use the **`databricks-lakebase`** skill to create a Lakebase project and discover branch/database resource names first. +**ALWAYS scaffold with the correct feature flags** — do not add Lakebase manually to an analytics-only scaffold. + +**Lakebase only** (no analytics SQL warehouse): +```bash +databricks apps init --name --features lakebase \ + --set "lakebase.postgres.branch=" \ + --set "lakebase.postgres.database=" \ + --run none --profile +``` + +**Both Lakebase and analytics**: +```bash +databricks apps init --name --features analytics,lakebase \ + --set "analytics.sql-warehouse.id=" \ + --set "lakebase.postgres.branch=" \ + --set "lakebase.postgres.database=" \ + --run none --profile +``` + +Where `` and `` are full resource names (e.g. `projects//branches/` and `projects//branches//databases/`). + +Use the `databricks-lakebase` skill to create a Lakebase project and discover branch/database resource names before running this command. + +> For multi-environment deployments (dev/prod), use `variables:` and `targets:` blocks in `databricks.yml` — see the **`databricks-dabs`** skill for patterns. **Naming conventions:** Use domain names for user-facing code (`ItemsPage.tsx`, `/api/items`, `item-routes.ts`). Keep `lakebase` naming only for infrastructure config (`lakebase()` plugin, `LAKEBASE_ENDPOINT`, `postgres` app resource). 
From 1d6c5af4db7aca9babf54221d5007c0ee196efe4 Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Fri, 15 May 2026 14:07:37 +0200 Subject: [PATCH 5/7] Fix verified inaccuracies across skill reference files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - lakehouse-sync.md: Fix CDC column names (_change_type → _pg_change_type, _lsn → _pg_lsn, _commit_timestamp → _timestamp), add missing columns (_pg_xid, _sort_by), label rename-and-swap as community guidance - medallion-from-cdc.md: Fix CDC column names in Silver layer SQL, remove invalid development: true from pipeline YAML, fix cross-reference paths - off-platform.md: Remove nonexistent list-databases CLI command, fix Drizzle API (drizzle(pool) → drizzle({ client: pool })), add Autoscaling-only limitation note and README link - pgvector.md: Clarify <#> is negative inner product - databricks-pipelines/SKILL.md: Remove broken medallion-from-cdc link (moved to databricks-lakebase, referenced from lakehouse-sync.md) - databricks-apps/SKILL.md: Update description (user change) - Regenerate manifest.json Co-authored-by: Isaac --- manifest.json | 15 +++++++------ skills/databricks-apps/SKILL.md | 2 +- .../references/lakehouse-sync.md | 22 ++++++++++--------- .../references/medallion-from-cdc.md | 15 ++++++------- .../references/off-platform.md | 6 +++-- .../references/pgvector.md | 2 +- skills/databricks-pipelines/SKILL.md | 1 - 7 files changed, 33 insertions(+), 30 deletions(-) diff --git a/manifest.json b/manifest.json index 8a4ec3f..86bb013 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-05-13T16:06:21Z", + "updated_at": "2026-05-15T12:14:35Z", "skills": { "databricks-apps": { "version": "0.1.1", "description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "experimental": false, - "updated_at": "2026-05-13T16:05:34Z", + "updated_at": "2026-05-15T09:46:18Z", "files": [ 
"SKILL.md", "agents/openai.yaml", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-05-13T16:02:53Z", + "updated_at": "2026-05-15T12:07:06Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -87,6 +87,7 @@ "references/computes-and-scaling.md", "references/connectivity.md", "references/lakehouse-sync.md", + "references/medallion-from-cdc.md", "references/off-platform.md", "references/pgvector.md", "references/synced-tables.md" @@ -96,19 +97,20 @@ "version": "0.1.0", "description": "Databricks Model Serving endpoint management", "experimental": false, - "updated_at": "2026-05-13T16:06:15Z", + "updated_at": "2026-05-15T09:16:06Z", "files": [ "SKILL.md", "agents/openai.yaml", "assets/databricks.png", - "assets/databricks.svg" + "assets/databricks.svg", + "references/off-platform-streaming.md" ] }, "databricks-pipelines": { "version": "0.1.0", "description": "Databricks Pipelines (DLT) for ETL and streaming", "experimental": false, - "updated_at": "2026-05-13T16:03:14Z", + "updated_at": "2026-05-15T12:14:27Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -128,7 +130,6 @@ "references/materialized-view-python.md", "references/materialized-view-sql.md", "references/materialized-view.md", - "references/medallion-from-cdc.md", "references/options-avro.md", "references/options-csv.md", "references/options-json.md", diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index b9b12db..aee8b55 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-apps -description: "Build apps on Databricks Apps platform. Auto-detects need for Lakebase when app stores state; evaluates data access patterns (analytics vs Lakebase synced tables) before scaffolding. Invoke BEFORE starting implementation." +description: "Build apps on Databricks Apps platform. 
Use when asked to create dashboards, data apps, analytics tools, or visualizations. Auto-detects need for Lakebase when app stores state; evaluates data access patterns (analytics vs Lakebase synced tables) before scaffolding. Invoke BEFORE starting implementation." compatibility: Requires databricks CLI (>= v0.294.0) metadata: version: "0.1.1" diff --git a/skills/databricks-lakebase/references/lakehouse-sync.md b/skills/databricks-lakebase/references/lakehouse-sync.md index c780faf..7e9af2b 100644 --- a/skills/databricks-lakebase/references/lakehouse-sync.md +++ b/skills/databricks-lakebase/references/lakehouse-sync.md @@ -21,11 +21,13 @@ lb__history Each row includes CDC metadata columns: -| Column | Description | -|--------|-------------| -| `_change_type` | `insert`, `update_preimage`, `update_postimage`, or `delete` | -| `_lsn` | Log Sequence Number for ordering changes | -| `_commit_timestamp` | When the change was captured | +| Column | Type | Description | +|--------|------|-------------| +| `_pg_change_type` | TEXT | `insert`, `update_preimage`, `update_postimage`, or `delete` | +| `_pg_lsn` | BIGINT | Postgres Log Sequence Number for ordering changes | +| `_pg_xid` | INTEGER | Postgres Transaction ID | +| `_timestamp` | TIMESTAMP | When the sync processed the change (without timezone) | +| `_sort_by` | BIGINT | Monotonic sort key for ordering all changes | ## Enablement @@ -102,12 +104,12 @@ SELECT * FROM wal2delta.tables; SELECT * FROM ( SELECT *, - ROW_NUMBER() OVER (PARTITION BY ORDER BY _lsn DESC) AS rn + ROW_NUMBER() OVER (PARTITION BY ORDER BY _pg_lsn DESC) AS rn FROM ..lb__history - WHERE _change_type IN ('insert', 'update_postimage', 'delete') + WHERE _pg_change_type IN ('insert', 'update_postimage', 'delete') ) WHERE rn = 1 - AND _change_type != 'delete'; + AND _pg_change_type != 'delete'; ``` **Full change history for a record:** @@ -116,12 +118,12 @@ WHERE rn = 1 SELECT * FROM ..lb__history WHERE = -ORDER BY _lsn; +ORDER BY _pg_lsn; ``` ## 
Schema Changes -If you need to change a synced table's schema in Postgres, use the rename-and-swap pattern: +If you need to change a synced table's schema in Postgres, you can use the rename-and-swap pattern. Note: this is community guidance — the official behavior is that column changes (add, drop, type change) trigger a full resnapshot of the affected table. ```sql CREATE TABLE
_v2 ( diff --git a/skills/databricks-lakebase/references/medallion-from-cdc.md b/skills/databricks-lakebase/references/medallion-from-cdc.md index 81204d7..69cc106 100644 --- a/skills/databricks-lakebase/references/medallion-from-cdc.md +++ b/skills/databricks-lakebase/references/medallion-from-cdc.md @@ -36,7 +36,6 @@ resources: name: operational_analytics catalog: schema: - development: true serverless: true libraries: - file: @@ -51,18 +50,18 @@ For each entity, create `src/silver_.sql`: CREATE OR REFRESH MATERIALIZED VIEW silver_ COMMENT "Current state of records, deduplicated from CDC history" AS -SELECT * EXCEPT (rn, _change_type, _lsn, _commit_timestamp) +SELECT * EXCEPT (rn, _pg_change_type, _pg_lsn, _pg_xid, _timestamp, _sort_by) FROM ( SELECT *, ROW_NUMBER() OVER ( PARTITION BY - ORDER BY _lsn DESC + ORDER BY _pg_lsn DESC ) AS rn FROM ..lb__history - WHERE _change_type IN ('insert', 'update_postimage', 'delete') + WHERE _pg_change_type IN ('insert', 'update_postimage', 'delete') ) WHERE rn = 1 - AND _change_type != 'delete' + AND _pg_change_type != 'delete' ``` Replace ``, `.`, and `` with your values. 
@@ -134,11 +133,11 @@ Deploy: `databricks bundle deploy -t dev --profile ` |-------|-----| | Silver table returns no rows | Verify bronze history table has data: `SELECT COUNT(*) FROM lb__history` | | `TABLE_OR_VIEW_NOT_FOUND` for bronze table | Use fully-qualified name: `..lb__history` | -| Gold aggregation includes deleted records | Confirm silver layer filters `_change_type != 'delete'` | +| Gold aggregation includes deleted records | Confirm silver layer filters `_pg_change_type != 'delete'` | | Pipeline fails on deploy | Run `databricks bundle validate` first to catch config errors | | Incremental refresh not picking up changes | Verify Lakehouse Sync is active and bronze table is updating | ## Cross-references -- For Lakehouse Sync setup, see the `databricks-lakebase` skill's [lakehouse-sync.md](../../databricks-lakebase/references/lakehouse-sync.md) -- For synced tables (UC → Lakebase direction), see [synced-tables.md](../../databricks-lakebase/references/synced-tables.md) +- For Lakehouse Sync setup, see [lakehouse-sync.md](lakehouse-sync.md) +- For synced tables (UC → Lakebase direction), see [synced-tables.md](synced-tables.md) diff --git a/skills/databricks-lakebase/references/off-platform.md b/skills/databricks-lakebase/references/off-platform.md index a7ee354..f94279e 100644 --- a/skills/databricks-lakebase/references/off-platform.md +++ b/skills/databricks-lakebase/references/off-platform.md @@ -39,12 +39,14 @@ const pool = createLakebasePool({ - OpenTelemetry metrics: `lakebase.token.refresh.duration`, `lakebase.query.duration`, pool connection gauges - Logging: `{ debug, info, warn, error }` boolean flags or custom logger instance +> **Lakebase Autoscaling only.** This package is not compatible with Lakebase Provisioned. For the full config reference, see the [`@databricks/lakebase` README](https://github.com/databricks/appkit/tree/main/packages/lakebase). 
+ **ORM integration:** ```typescript // Drizzle import { drizzle } from "drizzle-orm/node-postgres"; -const db = drizzle(pool); +const db = drizzle({ client: pool }); // Prisma import { PrismaPg } from "@prisma/adapter-pg"; @@ -63,7 +65,7 @@ import { getLakebaseOrmConfig } from "@databricks/lakebase"; | Variable | Description | How to find | |----------|-------------|-------------| | `PGHOST` | Lakebase endpoint host | `databricks postgres list-endpoints projects//branches/production --profile -o json` → `status.hosts.host` | -| `PGDATABASE` | Postgres database name | `databricks postgres list-databases projects//branches/production --profile -o json` → `status.postgres_database` | +| `PGDATABASE` | Postgres database name | Default is `databricks_postgres`. Verify via psql: `SELECT datname FROM pg_database WHERE datistemplate = false;` | | `LAKEBASE_ENDPOINT` | Endpoint resource path | Same `list-endpoints` command → `name` field | | `PGUSER` | Username | Your Databricks email (local dev) or service principal application ID (M2M) | | `PGSSLMODE` | SSL mode | `require` (default) | diff --git a/skills/databricks-lakebase/references/pgvector.md b/skills/databricks-lakebase/references/pgvector.md index 80ddb9e..f6f7314 100644 --- a/skills/databricks-lakebase/references/pgvector.md +++ b/skills/databricks-lakebase/references/pgvector.md @@ -127,7 +127,7 @@ Call `setupVectorTables(appkit)` from `onPluginsReady` before starting the serve |----------|----------|---------| | `<=>` | Cosine | Text similarity (default) | | `<->` | L2 (Euclidean) | Spatial data | -| `<#>` | Inner product | Normalized embeddings | +| `<#>` | Negative inner product | Normalized embeddings (smaller = more similar) | Similarity score: `1 - (embedding <=> $1::vector) AS similarity` (0 = unrelated, 1 = identical). 
diff --git a/skills/databricks-pipelines/SKILL.md b/skills/databricks-pipelines/SKILL.md index 6c3d799..d08d0b1 100644 --- a/skills/databricks-pipelines/SKILL.md +++ b/skills/databricks-pipelines/SKILL.md @@ -269,4 +269,3 @@ Detailed reference guides for each pipeline API. **Read the relevant guide befor - [Expectations](references/expectations.md) — Define and enforce data quality constraints ([Python](references/expectations-python.md), [SQL](references/expectations-sql.md)) - [Sinks](references/sink.md) — Write to Kafka, Event Hubs, external Delta tables ([Python](references/sink-python.md)) - [ForEachBatch Sinks](references/foreach-batch-sink.md) — Custom streaming sink with per-batch Python logic ([Python](references/foreach-batch-sink-python.md)) -- [Medallion from CDC](references/medallion-from-cdc.md) — Build Silver/Gold layers from Lakehouse Sync CDC history tables From d94ef0a55497179f689520ccda94ae6e3a1021e6 Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Fri, 15 May 2026 14:20:09 +0200 Subject: [PATCH 6/7] Fix review findings: broken cross-ref, AppKit import, missing context - Fix broken cross-reference in synced-tables.md pointing to pipelines skill instead of sibling medallion-from-cdc.md - Replace AppKit getWorkspaceClient with sdk-experimental WorkspaceClient in off-platform-streaming.md embeddings pattern - Add SPACES context comment in genie.md initAlias() snippet - Regenerate manifest Co-authored-by: Isaac --- manifest.json | 8 ++++---- skills/databricks-apps/references/appkit/genie.md | 1 + skills/databricks-lakebase/references/synced-tables.md | 2 +- .../references/off-platform-streaming.md | 6 ++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/manifest.json b/manifest.json index 86bb013..4383e24 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-05-15T12:14:35Z", + "updated_at": "2026-05-15T12:20:02Z", "skills": { "databricks-apps": { "version": "0.1.1", "description": 
"Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "experimental": false, - "updated_at": "2026-05-15T09:46:18Z", + "updated_at": "2026-05-15T12:19:02Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-05-15T12:07:06Z", + "updated_at": "2026-05-15T12:18:47Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -97,7 +97,7 @@ "version": "0.1.0", "description": "Databricks Model Serving endpoint management", "experimental": false, - "updated_at": "2026-05-15T09:16:06Z", + "updated_at": "2026-05-15T12:19:44Z", "files": [ "SKILL.md", "agents/openai.yaml", diff --git a/skills/databricks-apps/references/appkit/genie.md b/skills/databricks-apps/references/appkit/genie.md index 5975d78..21734d1 100644 --- a/skills/databricks-apps/references/appkit/genie.md +++ b/skills/databricks-apps/references/appkit/genie.md @@ -243,6 +243,7 @@ function initAlias(): string { if (savedAlias) localStorage.setItem("appkit:genie:alias", savedAlias); clearConversationUrl(); } + // SPACES: array of {alias, spaceId} defined in your component return localStorage.getItem("appkit:genie:alias") ?? SPACES[0]?.alias ?? ""; } ``` diff --git a/skills/databricks-lakebase/references/synced-tables.md b/skills/databricks-lakebase/references/synced-tables.md index a4b82da..ca1b99c 100644 --- a/skills/databricks-lakebase/references/synced-tables.md +++ b/skills/databricks-lakebase/references/synced-tables.md @@ -223,7 +223,7 @@ Reverse direction: continuously streams changes **from** Lakebase Postgres **int - Partitioned tables are not supported - Disabling and re-enabling sync does **not** re-snapshot — missing changes are lost permanently -For the full Lakehouse Sync reference, see [lakehouse-sync.md](lakehouse-sync.md). 
For building medallion pipelines from CDC history, see the `databricks-pipelines` skill's [medallion-from-cdc.md](../../databricks-pipelines/references/medallion-from-cdc.md). +For the full Lakehouse Sync reference, see [lakehouse-sync.md](lakehouse-sync.md). For building medallion pipelines from CDC history, see [medallion-from-cdc.md](medallion-from-cdc.md). ## Use Cases diff --git a/skills/databricks-model-serving/references/off-platform-streaming.md b/skills/databricks-model-serving/references/off-platform-streaming.md index 5199baa..000913d 100644 --- a/skills/databricks-model-serving/references/off-platform-streaming.md +++ b/skills/databricks-model-serving/references/off-platform-streaming.md @@ -153,9 +153,11 @@ Key differences from AI SDK v5: use `sendMessage({ text })` (NOT `append`), rend Generate text embeddings using a Databricks AI Gateway endpoint. ```typescript -import { getWorkspaceClient } from "@databricks/appkit"; +import { WorkspaceClient } from "@databricks/sdk-experimental"; -const workspaceClient = getWorkspaceClient({}); +const workspaceClient = new WorkspaceClient({ + host: process.env.DATABRICKS_HOST, +}); export async function generateEmbedding(text: string): Promise { const endpoint = From ee159bc6310f4bff7a59eef21768718fdbbee306 Mon Sep 17 00:00:00 2001 From: Pawel Kosiec Date: Fri, 15 May 2026 14:29:42 +0200 Subject: [PATCH 7/7] Remove manual token management, use @databricks/lakebase driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop ~100 lines of hand-rolled two-token refresh plumbing from off-platform.md — the @databricks/lakebase driver handles all of this automatically. Replace drizzle-kit shell migration with Drizzle's programmatic migrator using createLakebasePool() directly. 
Co-authored-by: Isaac --- manifest.json | 4 +- .../references/off-platform.md | 133 ++---------------- 2 files changed, 15 insertions(+), 122 deletions(-) diff --git a/manifest.json b/manifest.json index 4383e24..8d6cefe 100644 --- a/manifest.json +++ b/manifest.json @@ -1,6 +1,6 @@ { "version": "2", - "updated_at": "2026-05-15T12:20:02Z", + "updated_at": "2026-05-15T12:29:33Z", "skills": { "databricks-apps": { "version": "0.1.1", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-05-15T12:18:47Z", + "updated_at": "2026-05-15T12:29:29Z", "files": [ "SKILL.md", "agents/openai.yaml", diff --git a/skills/databricks-lakebase/references/off-platform.md b/skills/databricks-lakebase/references/off-platform.md index f94279e..911e1f4 100644 --- a/skills/databricks-lakebase/references/off-platform.md +++ b/skills/databricks-lakebase/references/off-platform.md @@ -138,105 +138,6 @@ export const env = validateAuth(baseSchema.parse(process.env)); Import `env` at the top of your server entry point for fast-fail on missing variables. -## Manual Token Management - -> **Prefer `@databricks/lakebase`** for Node.js apps — it handles everything below automatically. Use this section only for non-Node.js apps or custom token flows. - -Lakebase requires a **two-token system**: a workspace token + a short-lived Lakebase Postgres credential. 
- -```typescript -const REFRESH_BUFFER_MS = 2 * 60 * 1000; // Refresh 2 minutes before expiry - -type CachedToken = { value: string; expiresAt: number }; - -let cachedWorkspaceToken: CachedToken | null = null; -let workspaceRefreshPromise: Promise<CachedToken> | null = null; -let cachedLakebaseToken: CachedToken | null = null; -let lakebaseRefreshPromise: Promise<CachedToken> | null = null; - -function isFresh(token: CachedToken | null): token is CachedToken { - return token !== null && Date.now() < token.expiresAt - REFRESH_BUFFER_MS; -} -``` - -**M2M OIDC flow** (production): - -```typescript -async function fetchWorkspaceTokenM2M(host: string, clientId: string, clientSecret: string): Promise<CachedToken> { - const response = await fetch(`${host}/oidc/v1/token`, { - method: "POST", - headers: { "Content-Type": "application/x-www-form-urlencoded" }, - body: new URLSearchParams({ - grant_type: "client_credentials", - client_id: clientId, - client_secret: clientSecret, - scope: "all-apis", - }), - }); - if (!response.ok) throw new Error(`M2M token request failed: ${response.status}`); - const data = await response.json() as { access_token: string; expires_in: number }; - return { value: data.access_token, expiresAt: Date.now() + data.expires_in * 1000 }; -} -``` - -**Lakebase credential** (exchange workspace token for Postgres password): - -```typescript -async function fetchLakebaseCredential(host: string, workspaceToken: string): Promise<CachedToken> { - const response = await fetch(`${host}/api/2.0/postgres/credentials`, { - method: "POST", - headers: { - Authorization: `Bearer ${workspaceToken}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ endpoint: env.LAKEBASE_ENDPOINT }), - }); - if (!response.ok) throw new Error(`Lakebase credential request failed: ${response.status}`); - const data = await response.json() as { token: string; expire_time: string }; - return { value: data.token, expiresAt: new Date(data.expire_time).getTime() }; -} -``` - -**Concurrent deduplication** — use a singleton 
promise pattern to avoid duplicate refresh calls: - -```typescript -export async function getLakebasePostgresToken(): Promise<string> { - if (isFresh(cachedLakebaseToken)) return cachedLakebaseToken.value; - if (!lakebaseRefreshPromise) { - lakebaseRefreshPromise = (async () => { - const auth = authStrategyFromEnv(); - const workspaceToken = await getWorkspaceToken(auth); - return fetchLakebaseCredential(env.DATABRICKS_HOST.replace(/\/$/, ""), workspaceToken); - })() - .then((token) => { cachedLakebaseToken = token; return token; }) - .finally(() => { lakebaseRefreshPromise = null; }); - } - return (await lakebaseRefreshPromise).value; -} -``` - -**Local dev refresh script** (`scripts/refresh-lakebase-token.ts`): - -```typescript -import { execSync } from "node:child_process"; -import { readFileSync, writeFileSync, existsSync } from "node:fs"; - -const envFile = process.argv[2] ?? ".env.local"; -const profile = process.env.DATABRICKS_CONFIG_PROFILE ?? "DEFAULT"; -const raw = execSync(`databricks auth token --profile "${profile}" -o json`, { encoding: "utf-8" }); -const parsed = JSON.parse(raw) as { access_token?: string }; -if (!parsed.access_token) throw new Error("Failed to get access token from Databricks CLI"); -if (!existsSync(envFile)) throw new Error(`Env file not found: ${envFile}`); - -const content = readFileSync(envFile, "utf-8"); -const tokenLine = `DATABRICKS_TOKEN="${parsed.access_token}"`; -const updated = content.includes("DATABRICKS_TOKEN=") - ? content.replace(/^DATABRICKS_TOKEN=.*/m, tokenLine) - : `${content.trimEnd()}\n${tokenLine}\n`; -writeFileSync(envFile, updated); -console.log(`Updated DATABRICKS_TOKEN in ${envFile}`); -``` - ## Drizzle ORM Integration **With `@databricks/lakebase`** (recommended): @@ -262,27 +163,22 @@ export const items = pgTable("items", { }); ``` -**Migration with Lakebase credentials** — `drizzle-kit` cannot use `pg` password callbacks. 
Build a one-time URL: +**Running migrations** — use Drizzle's programmatic migrator with the Lakebase pool: ```typescript // scripts/db-migrate.ts -import { execSync } from "node:child_process"; -import { getLakebasePostgresToken } from "@/lib/lakebase/tokens"; - -async function runMigrations() { - const token = await getLakebasePostgresToken(); - const databaseUrl = - `postgresql://${encodeURIComponent(env.PGUSER)}:${encodeURIComponent(token)}` + - `@${env.PGHOST}:${env.PGPORT}/${env.PGDATABASE}?sslmode=${env.PGSSLMODE}`; - execSync("npx drizzle-kit migrate", { - stdio: "inherit", - env: { ...process.env, DATABASE_URL: databaseUrl }, - }); -} -runMigrations().catch((error) => { console.error(error); process.exit(1); }); +import { drizzle } from "drizzle-orm/node-postgres"; +import { migrate } from "drizzle-orm/node-postgres/migrator"; +import { createLakebasePool } from "@databricks/lakebase"; + +const pool = createLakebasePool(); +const db = drizzle({ client: pool }); +await migrate(db, { migrationsFolder: "./src/lib/db/migrations" }); +await pool.end(); +console.log("Migrations applied successfully"); ``` -**`drizzle.config.ts`** — conditional `dbCredentials` (only needed when `DATABASE_URL` is set by migration script): +**`drizzle.config.ts`** — used by `drizzle-kit generate` (no DB connection needed): ```typescript import { defineConfig } from "drizzle-kit"; @@ -291,15 +187,12 @@ export default defineConfig({ schema: "./src/lib/*/schema.ts", out: "./src/lib/db/migrations", dialect: "postgresql", - ...(process.env.DATABASE_URL && { - dbCredentials: { url: process.env.DATABASE_URL }, - }), }); ``` **Commands:** -- Generate (local, no DB connection): `npx drizzle-kit generate` -- Migrate (needs credentials): `npx dotenv -e .env.local -- npx tsx scripts/db-migrate.ts` +- Generate: `npx drizzle-kit generate` +- Migrate: `npx dotenv -e .env.local -- npx tsx scripts/db-migrate.ts` ## Cross-references