diff --git a/manifest.json b/manifest.json index bc7836a..8d6cefe 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-05-11T13:22:07Z", + "updated_at": "2026-05-15T12:29:33Z", "skills": { "databricks-apps": { "version": "0.1.1", "description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "experimental": false, - "updated_at": "2026-05-11T13:22:01Z", + "updated_at": "2026-05-15T12:19:02Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -33,7 +33,7 @@ "version": "0.1.0", "description": "Core Databricks skill for CLI, auth, and data exploration", "experimental": false, - "updated_at": "2026-05-11T10:22:59Z", + "updated_at": "2026-05-12T22:07:25Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -48,7 +48,7 @@ "version": "0.0.0", "description": "Declarative Automation Bundles (DABs) for deploying and managing Databricks resources", "experimental": false, - "updated_at": "2026-05-05T15:31:42Z", + "updated_at": "2026-05-12T20:04:29Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -66,7 +66,7 @@ "version": "0.1.0", "description": "Databricks Jobs orchestration and scheduling", "experimental": false, - "updated_at": "2026-05-07T15:19:50Z", + "updated_at": "2026-05-12T20:04:29Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-05-11T10:23:05Z", + "updated_at": "2026-05-15T12:29:29Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -86,6 +86,10 @@ "assets/databricks.svg", "references/computes-and-scaling.md", "references/connectivity.md", + "references/lakehouse-sync.md", + "references/medallion-from-cdc.md", + "references/off-platform.md", + "references/pgvector.md", "references/synced-tables.md" ] }, @@ -93,19 +97,20 @@ "version": "0.1.0", "description": "Databricks Model Serving 
endpoint management", "experimental": false, - "updated_at": "2026-05-07T15:19:45Z", + "updated_at": "2026-05-15T12:19:44Z", "files": [ "SKILL.md", "agents/openai.yaml", "assets/databricks.png", - "assets/databricks.svg" + "assets/databricks.svg", + "references/off-platform-streaming.md" ] }, "databricks-pipelines": { "version": "0.1.0", "description": "Databricks Pipelines (DLT) for ETL and streaming", "experimental": false, - "updated_at": "2026-05-07T15:19:55Z", + "updated_at": "2026-05-15T12:14:27Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -152,7 +157,7 @@ "version": "0.1.0", "description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes", "experimental": false, - "updated_at": "2026-05-07T15:19:59Z", + "updated_at": "2026-05-12T20:04:29Z", "files": [ "SKILL.md", "agents/openai.yaml", diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index 009d751..aee8b55 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-apps -description: "Build apps on Databricks Apps platform. Use when asked to create dashboards, data apps, analytics tools, or visualizations. Evaluates data access patterns (analytics vs Lakebase synced tables) before scaffolding. Invoke BEFORE starting implementation." +description: "Build apps on Databricks Apps platform. Use when asked to create dashboards, data apps, analytics tools, or visualizations. Auto-detects need for Lakebase when app stores state; evaluates data access patterns (analytics vs Lakebase synced tables) before scaffolding. Invoke BEFORE starting implementation." compatibility: Requires databricks CLI (>= v0.294.0) metadata: version: "0.1.1" @@ -17,7 +17,7 @@ Build apps that deploy to Databricks Apps platform. 
| Phase | READ BEFORE proceeding | |-------|------------------------| -| Scaffolding | **⚠️ STOP — complete the Data Access Decision Gate below before scaffolding.** Parent `databricks-core` skill (auth, warehouse discovery); then run `databricks apps manifest` + `databricks apps init` with `--features` and `--set` (see AppKit section below) | +| Scaffolding | **⚠️ STOP — evaluate the State Storage Rule and Data Access Decision Gate below before scaffolding.** Parent `databricks-core` skill (auth, warehouse discovery); then run `databricks apps manifest` + `databricks apps init` with `--features` and `--set` (see AppKit section below) | | Writing SQL queries | [SQL Queries Guide](references/appkit/sql-queries.md) | | Writing UI components | [Frontend Guide](references/appkit/frontend.md) | | Using `useAnalyticsQuery` | [AppKit SDK](references/appkit/appkit-sdk.md) | @@ -61,6 +61,16 @@ Build apps that deploy to Databricks Apps platform. Before writing any SQL, use the parent `databricks-core` skill for data exploration — search `information_schema` by keyword, then batch `discover-schema` for the tables you need. Do NOT skip this step. +**State Storage Rule (evaluate BEFORE the Decision Gate):** + +If the user's app description implies storing or persisting data — forms, CRUD operations, user input, preferences, bookmarks, orders, todos, comments, votes, or any user-generated content — the app needs a Lakebase database. Do not wait for the user to ask for one. + +1. Use the **`databricks-lakebase`** skill to create a Lakebase project (if one doesn't already exist) and obtain the branch and database resource names. +2. Scaffold with `--features lakebase` and pass `--set lakebase.postgres.branch= --set lakebase.postgres.database=`. +3. If the app **also** reads from Unity Catalog tables, proceed to the Data Access Decision Gate below to determine whether to add `--features analytics` or use Lakebase synced tables. + +This rule governs **state storage** only. 
For how the app reads existing lakehouse data, proceed to the Decision Gate below. This is not optional — any app that writes user-generated data needs Lakebase. + ## Development Workflow (FOLLOW THIS ORDER) **Data Access Decision Gate (REQUIRED before scaffolding):** @@ -79,7 +89,7 @@ After the user chooses: - (A) Lakebase synced tables → scaffold with `--features lakebase`. See [Lakebase Guide](references/appkit/lakebase.md) for full workflow. - (B) Analytics → scaffold with `--features analytics`. - Both → scaffold with `--features analytics,lakebase` if the app needs both patterns. -- If the app does NOT read UC data (pure CRUD, Genie, Model Serving), skip this gate and scaffold with the appropriate `--features` flag. +- If the app does NOT read UC data (pure CRUD, Genie, Model Serving), skip this gate. For pure CRUD/state apps, the State Storage Rule above already applies — scaffold with `--features lakebase`. For Genie or Model Serving, scaffold with the corresponding `--features` flag. **Analytics apps** (`--features analytics`): @@ -194,3 +204,12 @@ App names must be lowercase with hyphens only (≤26 chars). Databricks Apps supports any framework that runs as an HTTP server. LLMs already know these frameworks — the challenge is Databricks platform integration. **READ [Other Frameworks Guide](references/other-frameworks.md) BEFORE building any non-AppKit app.** It covers port/host configuration, `app.yaml` and `databricks.yml` setup, dependency management, networking, and framework-specific gotchas. 
+ +### Post-Deploy Verification + +After deploying, verify the app is running: + +```bash +databricks apps get --profile -o json # Check app_status.state: RUNNING +databricks apps logs --follow --profile # Stream live logs (Ctrl+C to stop) +``` diff --git a/skills/databricks-apps/references/appkit/files.md b/skills/databricks-apps/references/appkit/files.md index 3432f09..108d9c0 100644 --- a/skills/databricks-apps/references/appkit/files.md +++ b/skills/databricks-apps/references/appkit/files.md @@ -237,6 +237,8 @@ Each volume key requires a resource with `WRITE_VOLUME` permission. Declare in ` resources: apps: my_app: + user_api_scopes: + - files.files # Needed when using .asUser(req) programmatic API resources: - name: uploads-volume volume: @@ -244,6 +246,8 @@ resources: permission: WRITE_VOLUME ``` +> **Note:** The scaffolded HTTP routes (`/api/files/...`) execute as the service principal and do not require `user_api_scopes`. The scope is needed when using the programmatic `appkit.files("key").asUser(req)` API for per-user Volume access. + Wire the env var in `app.yaml`: ```yaml diff --git a/skills/databricks-apps/references/appkit/genie.md b/skills/databricks-apps/references/appkit/genie.md index 8320e35..21734d1 100644 --- a/skills/databricks-apps/references/appkit/genie.md +++ b/skills/databricks-apps/references/appkit/genie.md @@ -147,6 +147,107 @@ Update smoke tests if headings or routes changed, then `databricks apps validate For advanced Genie plugin usage, see `npx @databricks/appkit docs ./docs/plugins/genie.md`. +## Multi-Space Deployment + +For the `spaces` map API, `GenieChat alias` prop, and `useGenieChat` hook, see `npx @databricks/appkit docs ./docs/plugins/genie.md`. + +This section covers the **deployment-specific patterns** for multi-space Genie apps (databricks.yml, app.yaml, stale conversation cleanup). 
+ +**databricks.yml** — add one variable + resource per space, plus target-level values: + +```yaml +variables: + genie_space_id: + description: Default Genie space ID (required by AppKit) + genie_space_name: + description: Default Genie space name + genie_space_sales_id: + description: Sales Genie space ID + genie_space_support_id: + description: Support Genie space ID + +resources: + apps: + app: + user_api_scopes: + - dashboards.genie + resources: + - name: genie-space + genie_space: + name: ${var.genie_space_name} + space_id: ${var.genie_space_id} + permission: CAN_RUN + - name: genie-space-sales + genie_space: + name: genie-space-sales + space_id: ${var.genie_space_sales_id} + permission: CAN_RUN + - name: genie-space-support + genie_space: + name: genie-space-support + space_id: ${var.genie_space_support_id} + permission: CAN_RUN + +targets: + default: + variables: + genie_space_id: + genie_space_name: + genie_space_sales_id: + genie_space_support_id: +``` + +**app.yaml** — keep `DATABRICKS_GENIE_SPACE_ID` (AppKit validates it on startup). Add one `valueFrom` per UI space: + +```yaml +env: + - name: DATABRICKS_GENIE_SPACE_ID + valueFrom: genie-space + - name: DATABRICKS_GENIE_SPACE_SALES + valueFrom: genie-space-sales + - name: DATABRICKS_GENIE_SPACE_SUPPORT + valueFrom: genie-space-support +``` + +**Critical gotcha**: `DATABRICKS_GENIE_SPACE_ID` must always be set — AppKit validates it on startup even when using a custom `spaces` map. + +**Build version stamp** — stamp every build so the page can detect a new deployment and clear stale conversation state: + +```typescript +// client/vite.config.ts +export default defineConfig({ + // ... existing config ... 
+ define: { + "import.meta.env.VITE_APP_VERSION": JSON.stringify(Date.now().toString()), + }, +}); +``` + +**Stale conversation cleanup** — `GenieChat` stores conversation IDs in URLs and localStorage that become stale across space switches or redeployments: + +```typescript +function clearConversationUrl() { + const url = new URL(window.location.href); + url.searchParams.delete("conversationId"); + window.history.replaceState({}, "", url.toString()); +} + +function initAlias(): string { + const buildVersion = import.meta.env.VITE_APP_VERSION ?? "dev"; + if (localStorage.getItem("appkit:genie:version") !== buildVersion) { + const savedAlias = localStorage.getItem("appkit:genie:alias"); + Object.keys(localStorage) + .filter((k) => k.startsWith("appkit:genie:")) + .forEach((k) => localStorage.removeItem(k)); + localStorage.setItem("appkit:genie:version", buildVersion); + if (savedAlias) localStorage.setItem("appkit:genie:alias", savedAlias); + clearConversationUrl(); + } + // SPACES: array of {alias, spaceId} defined in your component + return localStorage.getItem("appkit:genie:alias") ?? SPACES[0]?.alias ?? ""; +} +``` + ## Frontend **For full component API**: run `npx @databricks/appkit docs "GenieChat"`. 
@@ -197,3 +298,6 @@ The plugin mounts SSE endpoints under `/api/genie`: | `plugin "genie" has no resource with key "..."` | Wrong `--set` flags during scaffold | Always derive resource keys from `databricks apps manifest` | | Chat collapses or renders poorly | No explicit height on container | Give the parent a fixed height | | Duplicate routes or import confusion | Old local Genie proxy file | Remove it — use `genie` from `@databricks/appkit` | +| `does not have required scopes: genie` | Missing API scope | Confirm `user_api_scopes` includes `dashboards.genie` in `databricks.yml` and redeploy | +| Genie space not found | Wrong space ID | Verify space ID matches the value on the Genie space **About** tab | +| `valueFrom` mismatch | `app.yaml` value doesn't match `databricks.yml` | `valueFrom` in `app.yaml` must exactly match the resource `name` in `databricks.yml` | diff --git a/skills/databricks-apps/references/appkit/lakebase.md b/skills/databricks-apps/references/appkit/lakebase.md index d75e888..1e2e22e 100644 --- a/skills/databricks-apps/references/appkit/lakebase.md +++ b/skills/databricks-apps/references/appkit/lakebase.md @@ -39,6 +39,8 @@ Use the `databricks-lakebase` skill to create a Lakebase project and discover br > For multi-environment deployments (dev/prod), use `variables:` and `targets:` blocks in `databricks.yml` — see the **`databricks-dabs`** skill for patterns. +**Naming conventions:** Use domain names for user-facing code (`ItemsPage.tsx`, `/api/items`, `item-routes.ts`). Keep `lakebase` naming only for infrastructure config (`lakebase()` plugin, `LAKEBASE_ENDPOINT`, `postgres` app resource). 
+ **Get resource names** (if you have an existing project): ```bash # List branches → use the name field of a READY branch @@ -92,18 +94,17 @@ The `lakebase()` plugin auto-configures from platform-injected env vars at deplo ## CRUD Routes Pattern -Always use server-side routes for Lakebase operations — do NOT call `appkit.lakebase.query()` from the client. Use `server.extend()` to register Express routes: +Always use server-side routes for Lakebase operations — do NOT call `appkit.lakebase.query()` from the client. Use `onPluginsReady` to initialize the schema and register Express routes: ```typescript // server/server.ts import { createApp, server, lakebase } from "@databricks/appkit"; import { z } from 'zod'; -createApp({ - plugins: [server({ autoStart: false }), lakebase()], -}) - .then(async (appkit) => { - // Schema init (runs once at startup) +await createApp({ + plugins: [server(), lakebase()], + async onPluginsReady(appkit) { + // Schema init (runs once before server accepts requests) await appkit.lakebase.query(` CREATE SCHEMA IF NOT EXISTS app_data; CREATE TABLE IF NOT EXISTS app_data.items ( @@ -139,17 +140,15 @@ createApp({ res.status(204).send(); }); }); - - await appkit.server.start(); - }) - .catch(console.error); + }, +}); ``` > **Deploy first (App + Lakebase only)!** When your Databricks App uses Lakebase, the Service Principal must create and own the schema. Run `databricks apps deploy` before any local development. See **`databricks-lakebase`** skill's **Schema Permissions for Deployed Apps** for details. ## Schema Initialization -**Always create a custom schema** — the Service Principal cannot access any existing schemas (including `public`). It must create the schema itself to become its owner. See **`databricks-lakebase`** skill's **Schema Permissions for Deployed Apps** for the full permission model and deploy-first workflow. 
Initialize tables inside the `.then()` callback before registering routes (see CRUD pattern above): +**Always create a custom schema** — the Service Principal cannot access any existing schemas (including `public`). It must create the schema itself to become its owner. See **`databricks-lakebase`** skill's **Schema Permissions for Deployed Apps** for the full permission model and deploy-first workflow. Initialize tables inside the `onPluginsReady` callback before registering routes (see CRUD pattern above): ```typescript // Inside onPluginsReady — runs once at startup before handling requests @@ -180,6 +179,78 @@ const prisma = new PrismaClient({ adapter }); For ORM-compatible config: `appkit.lakebase.getOrmConfig()`. +## Chat Persistence Pattern + +Save AI chat conversations to Lakebase so users can resume sessions and scroll full message history. + +**Schema** — create in a separate `chat` schema (not `app`) so the deploy-first ownership model stays clean: + +```sql +CREATE SCHEMA IF NOT EXISTS chat; + +CREATE TABLE IF NOT EXISTS chat.chats ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id TEXT NOT NULL, + title TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS chat.messages ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + chat_id UUID NOT NULL REFERENCES chat.chats(id) ON DELETE CASCADE, + role TEXT NOT NULL CHECK (role IN ('system', 'user', 'assistant', 'tool')), + content TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_messages_chat_id_created_at + ON chat.messages(chat_id, created_at); +``` + +**Bootstrap** — run setup in `onPluginsReady` so tables exist before the server accepts requests: + +```typescript +await createApp({ + plugins: [server(), lakebase()], + async onPluginsReady(appkit) { + await setupChatTables(appkit); + // then register routes via appkit.server.extend(...) 
+ }, +}); +``` + +**Persistence helpers** — use parameterized queries: + +```typescript +export async function createChat(appkit, input: { userId: string; title: string }) { + const result = await appkit.lakebase.query( + `INSERT INTO chat.chats (user_id, title) VALUES ($1, $2) + RETURNING id, user_id, title, created_at, updated_at`, + [input.userId, input.title], + ); + return result.rows[0]; +} + +export async function appendMessage(appkit, input: { chatId: string; role: string; content: string }) { + const result = await appkit.lakebase.query( + `INSERT INTO chat.messages (chat_id, role, content) VALUES ($1, $2, $3) + RETURNING id, chat_id, role, content, created_at`, + [input.chatId, input.role, input.content], + ); + return result.rows[0]; +} +``` + +**User identity**: In deployed apps, use `req.header("x-forwarded-email")` (injected by the Databricks Apps platform proxy; for off-platform deployments, use your own auth middleware). For local dev, hardcode a test user ID. + +**History endpoints**: +- `GET /api/chats` — list chats for current user +- `GET /api/chats/:chatId/messages` — load ordered history +- `DELETE /api/chats/:chatId` — delete chat (messages cascade) + +**AI SDK v6 integration**: Use `setMessages()` from `useChat` return value for history loading (NOT `initialMessages`). To read response headers like `X-Chat-Id`, pass a custom `fetch` wrapper on the `TextStreamChatTransport` constructor. + ## Reading from Lakebase synced tables Lakebase synced tables materialize Delta/UC tables into Lakebase Postgres for low-latency app reads. The lakehouse remains the source of truth; Lakebase serves as a read-optimized index. @@ -262,6 +333,8 @@ If you skip this step, the Service Principal won't own the database schema. You' Lakebase project creators already have database access after the first deploy. Collaborators need `databricks_superuser` granted by the project creator via Branch Overview. 
+> **Project-owner note:** If you are the Lakebase project owner, `databricks_create_role` may fail with "role already exists" and `GRANT databricks_superuser` may fail with "permission denied to grant role" — both errors are safe to ignore; the project owner already has the necessary access. + The Lakebase env vars (`PGHOST`, `PGDATABASE`, etc.) are auto-set only when deployed. For local development, get the connection details from your endpoint and set them manually: ```bash diff --git a/skills/databricks-apps/references/appkit/model-serving.md b/skills/databricks-apps/references/appkit/model-serving.md index 967f8cb..40a8772 100644 --- a/skills/databricks-apps/references/appkit/model-serving.md +++ b/skills/databricks-apps/references/appkit/model-serving.md @@ -56,7 +56,7 @@ env: The injected value is the endpoint **name** (not a URL). Use it in server-side code to call the endpoint. -## tRPC Pattern +## Non-Streaming Query Pattern (tRPC) Always use tRPC for model serving calls — do NOT call endpoints directly from the client. @@ -94,9 +94,9 @@ const result = await trpc.queryModel.query({ prompt: userInput }); const answer = result.choices?.[0]?.message?.content; ``` -For streaming and advanced patterns, see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. +For AppKit's built-in serving plugin streaming (SSE via `stream()` and `useServingStream`), see `npx @databricks/appkit docs ./docs/plugins/model-serving.md`. -AppKit integrates with **Model Serving endpoints**. AI Gateway (beta) endpoints are not directly supported — use the underlying Model Serving endpoint name instead. AI Gateway features (rate limits, usage tracking) can be configured on Model Serving endpoints via the `databricks-model-serving` skill. +For off-platform streaming (AI SDK v6 with Databricks AI Gateway), see the **`databricks-model-serving`** skill. 
## Troubleshooting diff --git a/skills/databricks-lakebase/SKILL.md b/skills/databricks-lakebase/SKILL.md index 44e5b34..459c88e 100644 --- a/skills/databricks-lakebase/SKILL.md +++ b/skills/databricks-lakebase/SKILL.md @@ -33,6 +33,9 @@ Lakebase is Databricks' serverless Postgres-compatible database, available on bo - [computes-and-scaling.md](references/computes-and-scaling.md) — Sizing, endpoint management, scale-to-zero, HA - [connectivity.md](references/connectivity.md) — Connection patterns, token refresh, Data API - [synced-tables.md](references/synced-tables.md) — Lakebase synced tables, data type mapping, capacity planning +- [lakehouse-sync.md](references/lakehouse-sync.md) — CDC from Lakebase Postgres to Unity Catalog Delta tables +- [pgvector.md](references/pgvector.md) — Vector similarity search with pgvector extension +- [off-platform.md](references/off-platform.md) — Off-platform Lakebase: env management, token refresh, Drizzle ORM ## Resource Hierarchy @@ -89,8 +92,18 @@ After creation, verify: ```bash databricks postgres list-branches projects/ --profile databricks postgres list-endpoints projects//branches/ --profile +databricks postgres list-databases projects//branches/ --profile ``` +**Extract connection values from JSON output:** + +| Value | JSON path | Used for | +|-------|-----------|----------| +| Endpoint host | `status.hosts.host` | `PGHOST`, `lakebase.postgres.host` | +| Endpoint resource path | `name` | `LAKEBASE_ENDPOINT`, `lakebase.postgres.endpointPath` | +| Database resource path | `name` | `lakebase.postgres.database` | +| PostgreSQL database name | `status.postgres_database` | `PGDATABASE`, `lakebase.postgres.databaseName` | + ### Updating a Project ```bash @@ -266,6 +279,8 @@ SELECT * FROM pg_available_extensions ORDER BY name; CREATE EXTENSION IF NOT EXISTS ; ``` +For vector embeddings with pgvector, see [pgvector.md](references/pgvector.md). 
+ ## Troubleshooting | Error | Solution | diff --git a/skills/databricks-lakebase/references/lakehouse-sync.md b/skills/databricks-lakebase/references/lakehouse-sync.md new file mode 100644 index 0000000..7e9af2b --- /dev/null +++ b/skills/databricks-lakebase/references/lakehouse-sync.md @@ -0,0 +1,154 @@ +# Lakehouse Sync: CDC from Lakebase to Unity Catalog + +Lakehouse Sync continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using Change Data Capture (CDC). Each synced table produces an SCD Type 2 history table in Unity Catalog, giving you a full audit trail queryable from the lakehouse. + +This is the reverse direction from synced tables (which go UC → Lakebase). No external compute, pipelines, or jobs are required — it is a native Lakebase feature. + +## When to Use + +- Analyze operational data (orders, user activity, support tickets) in the lakehouse +- Need a historical record of every insert, update, and delete from Postgres tables +- Join operational data with analytics data in Spark, SQL, or BI tools +- Feed Lakebase data into downstream pipelines or ML models + +## History Tables + +For each synced table, a Delta history table is created in Unity Catalog: + +``` +lb__history +``` + +Each row includes CDC metadata columns: + +| Column | Type | Description | +|--------|------|-------------| +| `_pg_change_type` | TEXT | `insert`, `update_preimage`, `update_postimage`, or `delete` | +| `_pg_lsn` | BIGINT | Postgres Log Sequence Number for ordering changes | +| `_pg_xid` | INTEGER | Postgres Transaction ID | +| `_timestamp` | TIMESTAMP | When the sync processed the change (without timezone) | +| `_sort_by` | BIGINT | Monotonic sort key for ordering all changes | + +## Enablement + +**Lakehouse Sync is UI-only** — configured via the "Lakehouse sync" tab in the branch overview, not via CLI or API. It operates at the **schema level**: once enabled, all current and future tables in that schema sync to Unity Catalog. 
+ +Navigate to: **Catalog** → your Autoscaling project → branch → **Lakehouse Sync** → **Start Sync**, then select the source database/schema, destination catalog/schema, and tables. + +## Prerequisites + +- Lakebase Autoscaling project running **Postgres 17** +- Tables must reside in the `databricks_postgres` database +- `REPLICA IDENTITY FULL` must be set on all source tables: + +```sql +ALTER TABLE REPLICA IDENTITY FULL; +``` + +- Verify replica identity: + +```sql +SELECT n.nspname AS table_schema, + c.relname AS table_name, + CASE c.relreplident + WHEN 'd' THEN 'default' + WHEN 'n' THEN 'nothing' + WHEN 'f' THEN 'full' + WHEN 'i' THEN 'index' + END AS replica_identity +FROM pg_class c +JOIN pg_namespace n ON n.oid = c.relnamespace +WHERE c.relkind = 'r' + AND n.nspname = 'public' +ORDER BY n.nspname, c.relname; +``` + +- **Permissions:** CAN MANAGE on source project; USE CATALOG + USE SCHEMA + CREATE TABLE on destination +- Catalogs with default storage are **unsupported** + +## Supported Data Types + +`bool`, `int2`, `int4`, `int8`, `text`, `varchar`, `bpchar`, `jsonb`, `numeric`, `date`, `timestamp`, `timestamptz`, `real`, `float4`, `float8`, plus enum types (`typcategory = 'E'`). 
+ +Check for unsupported types: + +```sql +SELECT c.table_schema, c.table_name, c.column_name, c.udt_name AS data_type +FROM information_schema.columns c +JOIN pg_catalog.pg_type t ON t.typname = c.udt_name +WHERE c.table_schema = 'public' + AND NOT ( + c.udt_name IN ( + 'bool', 'int2', 'int4', 'int8', 'text', 'varchar', 'bpchar', + 'jsonb', 'numeric', 'date', 'timestamp', 'timestamptz', + 'real', 'float4', 'float8' + ) + OR t.typcategory = 'E' + ) +ORDER BY c.table_schema, c.table_name, c.ordinal_position; +``` + +## Monitoring + +Check active syncs from Postgres (the `wal2delta` schema only exists after Lakehouse Sync has been enabled): + +```sql +SELECT * FROM wal2delta.tables; +``` + +## Querying History Tables + +**Latest state of each row** (deduplicated current state): + +```sql +SELECT * +FROM ( + SELECT *, + ROW_NUMBER() OVER (PARTITION BY ORDER BY _pg_lsn DESC) AS rn + FROM ..lb__history + WHERE _pg_change_type IN ('insert', 'update_postimage', 'delete') +) +WHERE rn = 1 + AND _pg_change_type != 'delete'; +``` + +**Full change history for a record:** + +```sql +SELECT * +FROM ..lb__history +WHERE = +ORDER BY _pg_lsn; +``` + +## Schema Changes + +If you need to change a synced table's schema in Postgres, you can use the rename-and-swap pattern. Note: this is community guidance — the official behavior is that column changes (add, drop, type change) trigger a full resnapshot of the affected table. + +```sql +CREATE TABLE _v2 ( + id INT PRIMARY KEY, + name TEXT, + new_column TEXT +); + +ALTER TABLE
<table>_v2 REPLICA IDENTITY FULL; + +INSERT INTO <table>_v2 SELECT *, NULL FROM <table>; + +BEGIN; +ALTER TABLE <table> RENAME TO <table>_backup; +ALTER TABLE <table>_v2 RENAME TO <table>
; +COMMIT; +``` + +## Limitations + +- Partitioned tables are not supported +- Disabling and re-enabling sync does **not** re-snapshot — missing changes are lost permanently +- Available on AWS, Azure, and GCP. + +## Cross-references + +- For building Silver/Gold layers from CDC history tables, see [medallion-from-cdc.md](medallion-from-cdc.md) +- For syncing in the reverse direction (UC → Lakebase), see [synced-tables.md](synced-tables.md) diff --git a/skills/databricks-lakebase/references/medallion-from-cdc.md b/skills/databricks-lakebase/references/medallion-from-cdc.md new file mode 100644 index 0000000..69cc106 --- /dev/null +++ b/skills/databricks-lakebase/references/medallion-from-cdc.md @@ -0,0 +1,143 @@ +# Medallion Architecture from CDC History Tables + +Build Silver and Gold analytics layers from Lakehouse Sync CDC history tables using Lakeflow Declarative Pipelines. + +## When to Use + +- You have Lakehouse Sync CDC history tables (`lb_
<table>_history`) in Unity Catalog +- You want Bronze → Silver → Gold layers on top of operational data +- You need clean current-state views, deduplication, and business aggregations for BI, ML, or Genie + +## Layer Mapping + +| Layer | Purpose | Source | Output | +|-------|---------|--------|--------| +| **Bronze** | Raw CDC records with full history | Lakehouse Sync `lb_<table>
_history` tables | No transformation needed; already exist | +| **Silver** | Current state, deduplicated and cleaned | Bronze history tables | One materialized view per entity | +| **Gold** | Business aggregations and KPIs | Silver tables | Materialized views with aggregations | + +## 1. Scaffold a Pipeline Project + +```bash +databricks bundle init lakeflow-pipelines \ + --config-file <(echo '{"project_name": "operational_analytics", "language": "sql", "serverless": "yes"}') \ + --profile < /dev/null +cd operational_analytics +``` + +## 2. Configure Pipeline Catalog and Schema + +Edit `resources/operational_analytics.pipeline.yml`: + +```yaml +resources: + pipelines: + operational_analytics: + name: operational_analytics + catalog: + schema: + serverless: true + libraries: + - file: + path: src/ +``` + +## 3. Silver Layer: Current State from CDC + +For each entity, create `src/silver_.sql`: + +```sql +CREATE OR REFRESH MATERIALIZED VIEW silver_ +COMMENT "Current state of records, deduplicated from CDC history" +AS +SELECT * EXCEPT (rn, _pg_change_type, _pg_lsn, _pg_xid, _timestamp, _sort_by) +FROM ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY + ORDER BY _pg_lsn DESC + ) AS rn + FROM ..lb__history + WHERE _pg_change_type IN ('insert', 'update_postimage', 'delete') +) +WHERE rn = 1 + AND _pg_change_type != 'delete' +``` + +Replace ``, `.`, and `` with your values. + +## 4. Gold Layer: Business Aggregations + +Create `src/gold_.sql`: + +```sql +CREATE OR REFRESH MATERIALIZED VIEW gold_daily_order_summary +COMMENT "Daily order counts and revenue by status" +AS +SELECT + DATE_TRUNC('day', created_at) AS order_date, + status, + COUNT(*) AS order_count, + SUM(total_amount) AS total_revenue +FROM silver_orders +GROUP BY DATE_TRUNC('day', created_at), status +``` + +Gold tables read from silver tables within the same pipeline. + +## 5. 
Data Quality Expectations + +Add constraints to silver or gold tables: + +```sql +CREATE OR REFRESH MATERIALIZED VIEW silver_ ( + CONSTRAINT valid_primary_key EXPECT ( IS NOT NULL) ON VIOLATION DROP ROW, + CONSTRAINT valid_timestamp EXPECT (created_at IS NOT NULL) ON VIOLATION DROP ROW +) +COMMENT "Current state of records with quality enforcement" +AS +SELECT ... +``` + +## 6. Deploy and Run + +```bash +databricks bundle validate --profile +databricks bundle deploy -t dev --profile +databricks bundle run operational_analytics -t dev --profile +``` + +## 7. Schedule Ongoing Refreshes + +Create `resources/operational_analytics_job.job.yml`: + +```yaml +resources: + jobs: + operational_analytics_job: + trigger: + periodic: + interval: 1 + unit: HOURS + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.operational_analytics.id} +``` + +Deploy: `databricks bundle deploy -t dev --profile ` + +## Troubleshooting + +| Issue | Fix | +|-------|-----| +| Silver table returns no rows | Verify bronze history table has data: `SELECT COUNT(*) FROM lb__history` | +| `TABLE_OR_VIEW_NOT_FOUND` for bronze table | Use fully-qualified name: `..lb__history` | +| Gold aggregation includes deleted records | Confirm silver layer filters `_pg_change_type != 'delete'` | +| Pipeline fails on deploy | Run `databricks bundle validate` first to catch config errors | +| Incremental refresh not picking up changes | Verify Lakehouse Sync is active and bronze table is updating | + +## Cross-references + +- For Lakehouse Sync setup, see [lakehouse-sync.md](lakehouse-sync.md) +- For synced tables (UC → Lakebase direction), see [synced-tables.md](synced-tables.md) diff --git a/skills/databricks-lakebase/references/off-platform.md b/skills/databricks-lakebase/references/off-platform.md new file mode 100644 index 0000000..911e1f4 --- /dev/null +++ b/skills/databricks-lakebase/references/off-platform.md @@ -0,0 +1,201 @@ +# Off-Platform Lakebase: Connecting from 
External Apps + +Connect to Lakebase from apps deployed outside Databricks App Platform (e.g. Vercel, AWS, Netlify, or any Node.js server). + +## Recommended: `@databricks/lakebase` Package + +The simplest way to connect — a drop-in `pg.Pool` replacement with automatic OAuth token refresh. + +```bash +npm install @databricks/lakebase +``` + +**Zero-config usage** (reads from environment variables): + +```typescript +import { createLakebasePool } from "@databricks/lakebase"; + +const pool = createLakebasePool(); +const result = await pool.query("SELECT * FROM users"); +``` + +**Explicit config:** + +```typescript +const pool = createLakebasePool({ + host: "your-lakebase-host.databricks.com", + database: "your_database_name", + endpoint: "projects//branches//endpoints/", + user: "user_id", + max: 10, +}); +``` + +**Key features:** +- Automatic OAuth token refresh (1-hour lifetime, 2-minute buffer) +- Token caching to reduce API calls +- Username resolution: explicit config → `PGUSER` → `DATABRICKS_CLIENT_ID` → API lookup via `getUsernameWithApiLookup()` +- `getLakebaseOrmConfig()` for ORM-compatible connection config +- OpenTelemetry metrics: `lakebase.token.refresh.duration`, `lakebase.query.duration`, pool connection gauges +- Logging: `{ debug, info, warn, error }` boolean flags or custom logger instance + +> **Lakebase Autoscaling only.** This package is not compatible with Lakebase Provisioned. For the full config reference, see the [`@databricks/lakebase` README](https://github.com/databricks/appkit/tree/main/packages/lakebase). 
+ +**ORM integration:** + +```typescript +// Drizzle +import { drizzle } from "drizzle-orm/node-postgres"; +const db = drizzle({ client: pool }); + +// Prisma +import { PrismaPg } from "@prisma/adapter-pg"; +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +// TypeORM / Sequelize +import { getLakebaseOrmConfig } from "@databricks/lakebase"; +// Pass getLakebaseOrmConfig() to your ORM's connection config +``` + +## Environment Management + +### Required Environment Variables + +| Variable | Description | How to find | +|----------|-------------|-------------| +| `PGHOST` | Lakebase endpoint host | `databricks postgres list-endpoints projects//branches/production --profile -o json` → `status.hosts.host` | +| `PGDATABASE` | Postgres database name | Default is `databricks_postgres`. Verify via psql: `SELECT datname FROM pg_database WHERE datistemplate = false;` | +| `LAKEBASE_ENDPOINT` | Endpoint resource path | Same `list-endpoints` command → `name` field | +| `PGUSER` | Username | Your Databricks email (local dev) or service principal application ID (M2M) | +| `PGSSLMODE` | SSL mode | `require` (default) | +| `PGPORT` | Port | `5432` (default) | + +### Authentication + +**Local dev** — use a short-lived workspace token: +```bash +export DATABRICKS_TOKEN=$(databricks auth token --profile -o json | jq -r '.access_token') +``` + +**Production** — use OAuth M2M credentials: +```bash +export DATABRICKS_CLIENT_ID= +export DATABRICKS_CLIENT_SECRET= +export DATABRICKS_HOST=https://.cloud.databricks.com +``` + +### `.env.example` Template + +```bash +DATABRICKS_HOST=https:// +LAKEBASE_ENDPOINT=projects//branches/production/endpoints/primary +PGHOST= +PGPORT=5432 +PGDATABASE= +PGUSER= +PGSSLMODE=require + +# Option A: local dev, token auth (expires ~1h) +DATABRICKS_TOKEN= + +# Option B: production, M2M auth (service principal) +DATABRICKS_CLIENT_ID= +DATABRICKS_CLIENT_SECRET= +``` + +### Optional: Zod Validation + +For strict fast-fail 
validation at startup: + +```typescript +import { z } from "zod"; + +const baseSchema = z.object({ + DATABRICKS_HOST: z.string().min(1), + LAKEBASE_ENDPOINT: z.string().min(1), + PGHOST: z.string().min(1), + PGPORT: z.coerce.number().default(5432), + PGDATABASE: z.string().min(1), + PGUSER: z.string().min(1), + PGSSLMODE: z.enum(["require", "verify-full", "verify-ca", "prefer", "disable"]).default("require"), + DATABRICKS_TOKEN: z.string().optional(), + DATABRICKS_CLIENT_ID: z.string().optional(), + DATABRICKS_CLIENT_SECRET: z.string().optional(), +}); + +function validateAuth(env: z.infer<typeof baseSchema>) { + const hasToken = Boolean(env.DATABRICKS_TOKEN); + const hasM2M = Boolean(env.DATABRICKS_CLIENT_ID) && Boolean(env.DATABRICKS_CLIENT_SECRET); + if (!hasToken && !hasM2M) { + throw new Error("Set DATABRICKS_TOKEN or both DATABRICKS_CLIENT_ID and DATABRICKS_CLIENT_SECRET"); + } + return env; +} + +export const env = validateAuth(baseSchema.parse(process.env)); +``` + +Import `env` at the top of your server entry point for fast-fail on missing variables. 
+ +## Drizzle ORM Integration + +**With `@databricks/lakebase`** (recommended): + +```typescript +import { drizzle } from "drizzle-orm/node-postgres"; +import { createLakebasePool } from "@databricks/lakebase"; +import * as itemsSchema from "@/lib/items/schema"; + +const pool = createLakebasePool(); +export const db = drizzle({ client: pool, schema: { ...itemsSchema } }); +``` + +**Schema per domain** — organize schemas under `src/lib/<domain>/schema.ts`: + +```typescript +import { pgTable, serial, text, timestamp } from "drizzle-orm/pg-core"; + +export const items = pgTable("items", { + id: serial("id").primaryKey(), + name: text("name").notNull(), + createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(), +}); +``` + +**Running migrations** — use Drizzle's programmatic migrator with the Lakebase pool: + +```typescript +// scripts/db-migrate.ts +import { drizzle } from "drizzle-orm/node-postgres"; +import { migrate } from "drizzle-orm/node-postgres/migrator"; +import { createLakebasePool } from "@databricks/lakebase"; + +const pool = createLakebasePool(); +const db = drizzle({ client: pool }); +await migrate(db, { migrationsFolder: "./src/lib/db/migrations" }); +await pool.end(); +console.log("Migrations applied successfully"); +``` + +**`drizzle.config.ts`** — used by `drizzle-kit generate` (no DB connection needed): + +```typescript +import { defineConfig } from "drizzle-kit"; + +export default defineConfig({ + schema: "./src/lib/*/schema.ts", + out: "./src/lib/db/migrations", + dialect: "postgresql", +}); +``` + +**Commands:** +- Generate: `npx drizzle-kit generate` +- Migrate: `npx dotenv -e .env.local -- npx tsx scripts/db-migrate.ts` + +## Cross-references + +- For on-platform connection patterns, see [connectivity.md](connectivity.md) +- For vector similarity search with pgvector, see [pgvector.md](pgvector.md) +- For AppKit-based Lakebase integration, see the `databricks-apps` skill's 
[lakebase.md](../../databricks-apps/references/appkit/lakebase.md) diff --git a/skills/databricks-lakebase/references/pgvector.md b/skills/databricks-lakebase/references/pgvector.md new file mode 100644 index 0000000..f6f7314 --- /dev/null +++ b/skills/databricks-lakebase/references/pgvector.md @@ -0,0 +1,149 @@ +# Vector Similarity Search with pgvector + +Use the pgvector extension in Lakebase for embedding-based similarity search (RAG, semantic search, recommendations). + +## Extension Setup + +```bash +databricks psql --project --profile -- -c " + CREATE EXTENSION IF NOT EXISTS vector; +" +``` + +If you get error code `42501` (insufficient privileges), the extension may already exist — this is safe to ignore in `setupVectorTables()`: + +```typescript +try { + await appkit.lakebase.query("CREATE EXTENSION IF NOT EXISTS vector"); +} catch (err: unknown) { + const code = (err as { code?: string }).code; + if (code === "42501") { + console.log("[vectors] Skipping extension creation — insufficient privileges (likely already exists)"); + } else { + throw err; + } +} +``` + +## Table Schema + +```sql +CREATE SCHEMA IF NOT EXISTS vectors; + +CREATE TABLE IF NOT EXISTS vectors.documents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + content TEXT NOT NULL, + embedding VECTOR(1024), + metadata JSONB NOT NULL DEFAULT '{}', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); +``` + +**Dimension matching**: `VECTOR(1024)` must match your embedding model's output dimension. Common Databricks endpoints: +- `databricks-gte-large-en` — 1024 dimensions +- `databricks-bge-large-en` — 1024 dimensions + +If using a different model (768d or 1536d), change `VECTOR(1024)` to match. 
+ +## Vector Store Module + +Create `server/lib/vector-store.ts`: + +```typescript +import type { Application } from "express"; + +interface AppKitWithLakebase { + lakebase: { + query(text: string, params?: unknown[]): Promise<{ rows: Record<string, unknown>[] }>; + }; + server: { + extend(fn: (app: Application) => void): void; + }; +} + +export async function setupVectorTables(appkit: AppKitWithLakebase) { + try { + await appkit.lakebase.query("CREATE EXTENSION IF NOT EXISTS vector"); + } catch (err: unknown) { + const code = (err as { code?: string }).code; + if (code === "42501") { + console.log("[vectors] Skipping extension creation — insufficient privileges (likely already exists)"); + } else { + throw err; + } + } + await appkit.lakebase.query(`CREATE SCHEMA IF NOT EXISTS vectors`); + const { rows } = await appkit.lakebase.query( + `SELECT 1 FROM information_schema.tables + WHERE table_schema = 'vectors' AND table_name = 'documents'`, + ); + if (rows.length > 0) return; + await appkit.lakebase.query(` + CREATE TABLE IF NOT EXISTS vectors.documents ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + content TEXT NOT NULL, + embedding VECTOR(1024), + metadata JSONB NOT NULL DEFAULT '{}', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); +} + +export async function insertDocument( + appkit: AppKitWithLakebase, + input: { content: string; embedding: number[]; metadata?: Record<string, unknown> }, +) { + const result = await appkit.lakebase.query( + `INSERT INTO vectors.documents (content, embedding, metadata) + VALUES ($1, $2::vector, $3) + RETURNING id, content, metadata, created_at`, + [input.content, JSON.stringify(input.embedding), JSON.stringify(input.metadata ?? 
{})], + ); + return result.rows[0]; +} + +export async function retrieveSimilar( + appkit: AppKitWithLakebase, + queryEmbedding: number[], + limit = 5, +) { + const result = await appkit.lakebase.query( + `SELECT id, content, metadata, 1 - (embedding <=> $1::vector) AS similarity + FROM vectors.documents + WHERE embedding IS NOT NULL + ORDER BY embedding <=> $1::vector + LIMIT $2`, + [JSON.stringify(queryEmbedding), limit], + ); + return result.rows; +} +``` + +Call `setupVectorTables(appkit)` from `onPluginsReady` before starting the server. + +## Distance Operators + +| Operator | Distance | Use for | +|----------|----------|---------| +| `<=>` | Cosine | Text similarity (default) | +| `<->` | L2 (Euclidean) | Spatial data | +| `<#>` | Negative inner product | Normalized embeddings (smaller = more similar) | + +Similarity score: `1 - (embedding <=> $1::vector) AS similarity` (0 = unrelated, 1 = identical). + +## Indexing + +Add an index **after** inserting initial data (IVFFlat needs representative data to build): + +```sql +CREATE INDEX IF NOT EXISTS idx_documents_embedding + ON vectors.documents USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); +ANALYZE vectors.documents; +``` + +For higher recall without tuning, use HNSW instead: `USING hnsw (embedding vector_cosine_ops)`. + +## Cross-references + +- For generating embeddings, see the `databricks-apps` skill's [model-serving.md](../../databricks-apps/references/appkit/model-serving.md) → Embeddings Pattern +- For Lakebase connection patterns, see [connectivity.md](connectivity.md) diff --git a/skills/databricks-lakebase/references/synced-tables.md b/skills/databricks-lakebase/references/synced-tables.md index 9e74882..ca1b99c 100644 --- a/skills/databricks-lakebase/references/synced-tables.md +++ b/skills/databricks-lakebase/references/synced-tables.md @@ -80,6 +80,9 @@ databricks postgres create-synced-table ..
\ | `create_database_objects_if_missing` | No | Auto-create Postgres schema/database if missing (default: `false`) | | `new_pipeline_spec.storage_catalog` | Yes | A **regular** UC catalog for DLT pipeline metadata (NOT the Lakebase catalog) | | `new_pipeline_spec.storage_schema` | Yes | Schema in the storage catalog for pipeline metadata (e.g. `default`) | +| `timeseries_key` | No | Column for deduplication when source has duplicate PKs (latest wins). Performance penalty. | + +> **Note:** Nulls in PK columns are excluded from sync. Long-running operation; CLI waits by default. Use `--no-wait` to return immediately. @@ -186,9 +189,14 @@ If a Databricks App reads synced tables, the app's Service Principal needs expli - **Naming:** Database, schema, and table names allow `[A-Za-z0-9_]+` only - **Schema evolution:** Only additive changes (adding columns) for Triggered/Continuous modes +**Cost guidance:** +- **Continuous mode:** Reuse pipelines for ~10 tables/pipeline — roughly 10x cheaper per table than separate pipelines +- **Cost formula:** `[Rows / (Speed × CUs × 3600)] × DLT Hourly Rate` (check current DLT pricing for your cloud/region) +- **Snapshot vs incremental:** Snapshot is ~10x faster when >10% of data changes per cycle + ## Lakehouse Sync (Beta) -Reverse direction: continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using CDC (SCD Type 2 history). Destination tables are named `lb__history`. Does not require external compute, pipelines, or jobs — it is a native Lakebase feature. Available on AWS and Azure. +Reverse direction: continuously streams changes **from** Lakebase Postgres **into** Unity Catalog Delta tables using CDC (SCD Type 2 history). Destination tables are named `lb__history`. Does not require external compute, pipelines, or jobs — it is a native Lakebase feature. Available on AWS, Azure, and GCP. > **Important:** Tables must reside in the `databricks_postgres` database for Lakehouse Sync to work. 
@@ -215,6 +223,8 @@ Reverse direction: continuously streams changes **from** Lakebase Postgres **int - Partitioned tables are not supported - Disabling and re-enabling sync does **not** re-snapshot — missing changes are lost permanently +For the full Lakehouse Sync reference, see [lakehouse-sync.md](lakehouse-sync.md). For building medallion pipelines from CDC history, see [medallion-from-cdc.md](medallion-from-cdc.md). + ## Use Cases **Product catalog:** Sync gold-tier product data to Lakebase for low-latency web app reads. Use Triggered mode for hourly/daily updates. @@ -235,3 +245,4 @@ Reverse direction: continuously streams changes **from** Lakebase Postgres **int - **Read-only in Postgres:** Only SELECT queries, CREATE INDEX, and DROP TABLE are allowed on synced tables. Any data modifications (INSERT, UPDATE, DELETE) corrupt the sync pipeline. - **Null bytes:** Null bytes (0x00) in STRING, ARRAY, MAP, or STRUCT columns cause sync failures. Sanitize source data: `REPLACE(col, CAST(CHAR(0) AS STRING), '')`. - **Unsupported types:** GEOGRAPHY, GEOMETRY, VARIANT, OBJECT columns cannot be synced. +- **FGAC not propagated:** Fine-grained access control (row filters, column masks) from Unity Catalog is not propagated to synced tables. **Workaround:** Create a view on the source table with the desired filter (`SELECT * FROM table WHERE ...`), then sync the view in Snapshot mode. Caveat: the sync runs as the creator and only sees their visible rows. 
diff --git a/skills/databricks-model-serving/SKILL.md b/skills/databricks-model-serving/SKILL.md index eafedbf..628fd33 100644 --- a/skills/databricks-model-serving/SKILL.md +++ b/skills/databricks-model-serving/SKILL.md @@ -62,7 +62,8 @@ databricks serving-endpoints create \ "entity_version": "", "min_provisioned_throughput": 0, "max_provisioned_throughput": 0, - "workload_size": "Small" + "workload_size": "Small", + "scale_to_zero_enabled": true }], "traffic_config": { "routes": [{ @@ -73,7 +74,7 @@ databricks serving-endpoints create \ }' --profile ``` -- Discover available Foundation Models: check the `system.ai` catalog in Unity Catalog. +- Discover available Foundation Models: check the `system.ai` catalog in Unity Catalog, or use `databricks serving-endpoints list --profile ` to see available endpoints. Use `databricks serving-endpoints get-open-api --profile ` to inspect the endpoint's API schema. - Long-running operation; the CLI waits for completion by default. Use `--no-wait` to return immediately, then poll: ```bash databricks serving-endpoints get --profile diff --git a/skills/databricks-model-serving/references/off-platform-streaming.md b/skills/databricks-model-serving/references/off-platform-streaming.md new file mode 100644 index 0000000..000913d --- /dev/null +++ b/skills/databricks-model-serving/references/off-platform-streaming.md @@ -0,0 +1,182 @@ +# Off-Platform Streaming with AI SDK v6 + +These patterns are for apps deployed **outside** Databricks Apps (e.g., Vercel, AWS, standalone Node.js servers) using direct AI SDK v6 integration with Databricks AI Gateway. For AppKit-based apps, use the **`databricks-apps`** skill's built-in serving plugin instead. + +## AI SDK v6 Streaming Pattern + +Use this pattern for streaming AI chat with Databricks AI Gateway and Vercel AI SDK v6 in off-platform apps. 
+ +**Dependencies:** `ai@6`, `@ai-sdk/react@3`, `@ai-sdk/openai`, `@databricks/sdk-experimental` + +**Auth helper** — works for both local dev (CLI profile) and deployed apps (service principal token): + +```typescript +import { Config } from "@databricks/sdk-experimental"; + +async function getDatabricksToken() { + if (process.env.DATABRICKS_TOKEN) { + return process.env.DATABRICKS_TOKEN; + } + const config = new Config({ + profile: process.env.DATABRICKS_CONFIG_PROFILE || "DEFAULT", + }); + await config.ensureResolved(); + const headers = new Headers(); + await config.authenticate(headers); + const authHeader = headers.get("Authorization"); + if (!authHeader) { + throw new Error( + "Failed to get Databricks token. Check your CLI profile or set DATABRICKS_TOKEN.", + ); + } + return authHeader.replace("Bearer ", ""); +} +``` + +**Server route** (`POST /api/chat`): + +```typescript +import { createOpenAI } from "@ai-sdk/openai"; +import { streamText, type UIMessage } from "ai"; + +app.post("/api/chat", async (req, res) => { + const { messages } = req.body; + + // AI SDK v6 client sends UIMessage objects with a parts array. + // Convert to CoreMessage format for streamText(). + const coreMessages = (messages as UIMessage[]).map((m) => ({ + role: m.role as "user" | "assistant" | "system", + content: + m.parts + ?.filter((p) => p.type === "text" && p.text) + .map((p) => p.text) + .join("") ?? + m.content ?? 
+ "", + })); + + try { + const token = await getDatabricksToken(); + const endpoint = process.env.DATABRICKS_ENDPOINT || ""; + + // AI Gateway URL uses /mlflow/v1 path, NOT /openai/v1 + // URL varies by cloud: .cloud.databricks.com (AWS), .azuredatabricks.net (Azure), .gcp.databricks.com (GCP) + const databricks = createOpenAI({ + baseURL: `https://${process.env.DATABRICKS_WORKSPACE_ID}.ai-gateway.cloud.databricks.com/mlflow/v1`, + apiKey: token, + }); + + const result = streamText({ + model: databricks.chat(endpoint), + messages: coreMessages, + maxOutputTokens: 1000, + }); + + result.pipeTextStreamToResponse(res); + } catch (err) { + const message = (err as Error).message; + console.error(`[chat] Streaming request failed:`, message); + res.status(502).json({ error: "Chat request failed", detail: message }); + } +}); +``` + +**Environment variables:** +- `DATABRICKS_WORKSPACE_ID` — for explicit setup: `databricks api get /api/2.1/unity-catalog/current-metastore-assignment --profile ` → `workspace_id` field +- `DATABRICKS_ENDPOINT` — model endpoint name (e.g. `databricks-meta-llama-3-3-70b-instruct`). Run `databricks serving-endpoints list --profile ` to see available models. + +## Streaming Client Pattern (AI SDK v6) + +```tsx +import { useChat } from "@ai-sdk/react"; +import { TextStreamChatTransport } from "ai"; +import { useState } from "react"; + +export function ChatPage() { + const [input, setInput] = useState(""); + + const { messages, sendMessage, status } = useChat({ + transport: new TextStreamChatTransport({ api: "/api/chat" }), + }); + + return ( +
+
+ {messages.map((m) => ( +
+ + {m.role === "user" ? "You" : "Assistant"} + + {m.parts.map((part, i) => + part.type === "text" ? ( +

+ {part.text} +

+ ) : null, + )} +
+ ))} + {status === "submitted" &&
Loading...
} +
+
{ + e.preventDefault(); + if (input.trim()) { + void sendMessage({ text: input }); + setInput(""); + } + }} + className="border-t p-4 flex gap-2" + > + setInput(e.target.value)} + placeholder="Ask a question..." + className="flex-1 border rounded px-3 py-2" + disabled={status !== "ready"} + /> + + +
+ ); +} +``` + +Key differences from AI SDK v5: use `sendMessage({ text })` (NOT `append`), render `m.parts` array (NOT `m.content`), and `status` states are `ready`, `submitted`, `streaming`, and `error`. + +## Embeddings Pattern + +Generate text embeddings using a Databricks AI Gateway endpoint. + +```typescript +import { WorkspaceClient } from "@databricks/sdk-experimental"; + +const workspaceClient = new WorkspaceClient({ + host: process.env.DATABRICKS_HOST, +}); + +export async function generateEmbedding(text: string): Promise<number[]> { + const endpoint = + process.env.DATABRICKS_EMBEDDING_ENDPOINT || "databricks-gte-large-en"; + const result = await workspaceClient.servingEndpoints.query({ + name: endpoint, + input: text, + }); + return result.data![0].embedding!; +} +``` + +Common embedding endpoints: `databricks-gte-large-en` (1024d), `databricks-bge-large-en` (1024d). Set `DATABRICKS_EMBEDDING_ENDPOINT` in `.env` and `app.yaml`. + +For vector similarity search with these embeddings, see the **`databricks-lakebase`** skill. + +## Troubleshooting + +| Error | Cause | Solution | +|-------|-------|---------| +| 502 from AI Gateway | Token expired or invalid endpoint | Refresh token via `getDatabricksToken()`; verify endpoint exists | +| `TextStreamChatTransport` not found | Wrong AI SDK version | Requires `ai@6` and `@ai-sdk/react@3` |