From 06821057acdedd9150e0ed15f578e83732ded9bc Mon Sep 17 00:00:00 2001
From: Shaowei <Shaowei@users.noreply.github.com>
Date: Sun, 17 May 2026 12:47:36 +0800
Subject: [PATCH] Add LangChain vector sample

---
 samples/README.md                             |   7 +
 samples/llm-vector-langchain/README.md        |  64 +++++++
 samples/llm-vector-langchain/app.py           | 168 ++++++++++++++++++
 .../credit_card_products.csv                  |   5 +
 .../customer_profiles/customer_profiles.csv   |   5 +
 .../data/transactions/transactions.csv        |  13 ++
 .../llm-vector-langchain/dozer-config.yaml    |  77 ++++++++
 samples/llm-vector-langchain/dozer_client.py  |  24 +++
 samples/llm-vector-langchain/recommender.py   |  79 ++++++++
 samples/llm-vector-langchain/requirements.txt |   7 +
 .../tests/test_recommender.py                 |  63 +++++++
 11 files changed, 512 insertions(+)
 create mode 100644 samples/README.md
 create mode 100644 samples/llm-vector-langchain/README.md
 create mode 100644 samples/llm-vector-langchain/app.py
 create mode 100644 samples/llm-vector-langchain/data/credit_card_products/credit_card_products.csv
 create mode 100644 samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv
 create mode 100644 samples/llm-vector-langchain/data/transactions/transactions.csv
 create mode 100644 samples/llm-vector-langchain/dozer-config.yaml
 create mode 100644 samples/llm-vector-langchain/dozer_client.py
 create mode 100644 samples/llm-vector-langchain/recommender.py
 create mode 100644 samples/llm-vector-langchain/requirements.txt
 create mode 100644 samples/llm-vector-langchain/tests/test_recommender.py

diff --git a/samples/README.md b/samples/README.md
new file mode 100644
index 0000000000..aed34896d4
--- /dev/null
+++ b/samples/README.md
@@ -0,0 +1,7 @@
+# Dozer samples
+
+## LLM vector LangChain sample
+
+The `llm-vector-langchain` sample shows how to expose customer, transaction,
+and credit-card product CSV data through Dozer REST endpoints, then use
+LangChain and Chroma to answer personalized recommendation questions.
diff --git a/samples/llm-vector-langchain/README.md b/samples/llm-vector-langchain/README.md
new file mode 100644
index 0000000000..9125fa515b
--- /dev/null
+++ b/samples/llm-vector-langchain/README.md
@@ -0,0 +1,64 @@
+# Dozer + LangChain customer profile sample
+
+This sample turns Dozer REST endpoints into retrieval context for a credit-card
+recommendation assistant. It follows the use case from the Dozer LLM article:
+combine profile, transaction, and product data, index it in a vector database,
+then answer with personalized product recommendations.
+
+## What is included
+
+- Three small CSV datasets under `data/`.
+- `dozer-config.yaml`, which reads the CSV files through a Dozer LocalStorage
+  connection and exposes them as REST endpoints.
+- A LangChain application that reads the Dozer REST API, writes documents to
+  Chroma, and answers a recommendation question.
+- A small offline test for the recommendation/context-building logic.
+
+## Run the sample
+
+From this directory:
+
+```bash
+dozer --config-path dozer-config.yaml
+```
+
+In another terminal:
+
+```bash
+python -m venv .venv
+. .venv/bin/activate
+pip install -r requirements.txt
+python app.py --customer-id cust_001
+```
+
+The application defaults to `http://localhost:8080`. Set `DOZER_BASE_URL` if
+Dozer is running elsewhere.
+
+```bash
+DOZER_BASE_URL=http://localhost:8080 python app.py --customer-id cust_003
+```
+
+If `OPENAI_API_KEY` is set, the sample uses OpenAI through LangChain. Without an
+API key it uses a deterministic local response, which keeps the sample runnable
+for local development and CI.
+
+## Dozer endpoints
+
+The config exposes these endpoints:
+
+- `/customer_profiles`: customer segment, income band, and declared goals.
+- `/transactions`: one row per card transaction, including its customer ID.
+- `/credit_card_products`: available card products and eligibility metadata.
+
+The Python app queries each endpoint, builds one customer profile document plus
+one product document per card, and stores them in Chroma. A rule-based scorer
+ranks the cards by customer fit and recent spend; the retriever adds context.
+
+## Offline test
+
+The core recommendation helpers do not require network access or LangChain, so
+they can be checked with:
+
+```bash
+python -m unittest discover -s tests
+```
diff --git a/samples/llm-vector-langchain/app.py b/samples/llm-vector-langchain/app.py
new file mode 100644
index 0000000000..1691ab3d3c
--- /dev/null
+++ b/samples/llm-vector-langchain/app.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import argparse
+import hashlib
+import os
+from dataclasses import dataclass
+from typing import Any
+
+from dotenv import load_dotenv
+from langchain_chroma import Chroma
+from langchain_core.documents import Document
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from dozer_client import DozerClient
+from recommender import build_customer_context, build_product_context, rank_products
+
+
+DEFAULT_BASE_URL = "http://localhost:8080"
+
+
+@dataclass(frozen=True)
+class RetrievalBundle:  # immutable hand-off from load_bundle() to answer()
+    documents: list[Document]  # customer document first, then one per product
+    customer_context: str  # same text as the customer document's page_content
+    ranked_products: list[dict[str, Any]]  # rank_products() output, best score first
+
+
+class HashEmbedding:
+    """Deterministic bag-of-tokens embedding for runs without an OpenAI key."""
+
+    def __init__(self, dimensions: int = 64) -> None:
+        self.dimensions = dimensions  # length of every vector produced
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        return list(map(self._embed, texts))
+
+    def embed_query(self, text: str) -> list[float]:
+        return self._embed(text)
+
+    def _embed(self, text: str) -> list[float]:
+        """Count SHA-256-bucketed lowercase tokens, then L2-normalize."""
+        buckets = [0.0] * self.dimensions
+        for word in text.lower().split():
+            prefix = hashlib.sha256(word.encode("utf-8")).digest()[:4]
+            buckets[int.from_bytes(prefix, byteorder="big") % self.dimensions] += 1.0
+        norm = sum(count * count for count in buckets) ** 0.5 or 1.0  # 1.0 if empty
+        return [count / norm for count in buckets]
+
+
+def load_bundle(client: DozerClient, customer_id: str) -> RetrievalBundle:
+    """Fetch the three Dozer endpoints and assemble one customer's bundle.
+
+    Raises ValueError when no customer_profiles row has ``customer_id``.
+    """
+    profile_rows = client.fetch_endpoint("customer_profiles")
+    transaction_rows = client.fetch_endpoint("transactions")
+    products = client.fetch_endpoint("credit_card_products")
+
+    for candidate in profile_rows:
+        if candidate.get("customer_id") == customer_id:
+            customer = candidate
+            break
+    else:
+        raise ValueError(f"Customer {customer_id!r} was not returned by Dozer")
+
+    # fetch_endpoint is called without a filter, so narrow to this customer here.
+    own_transactions = [
+        txn for txn in transaction_rows if txn.get("customer_id") == customer_id
+    ]
+    ranking = rank_products(customer, own_transactions, products)
+    summary = build_customer_context(customer, own_transactions)
+    product_texts = [build_product_context(item) for item in products]
+
+    # The customer document always comes first, followed by one per product.
+    customer_doc = Document(
+        page_content=summary,
+        metadata={"kind": "customer", "customer_id": customer_id},
+    )
+    product_docs = [
+        Document(
+            page_content=text,
+            metadata={"kind": "product", "product_id": item["product_id"]},
+        )
+        for text, item in zip(product_texts, products)
+    ]
+    documents = [customer_doc, *product_docs]
+    return RetrievalBundle(documents, summary, ranking)
+
+
+def build_vector_store(documents: list[Document]) -> Chroma:
+    """Split ``documents`` into chunks and index them with Chroma."""
+    chunker = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=80)
+    pieces = chunker.split_documents(documents)
+    if os.getenv("OPENAI_API_KEY"):
+        embedder = OpenAIEmbeddings()
+    else:
+        embedder = HashEmbedding()  # offline fallback when no key is configured
+    return Chroma.from_documents(pieces, embedding=embedder)
+
+
+def answer(bundle: RetrievalBundle, question: str) -> str:
+    """Answer ``question`` via gpt-4o-mini, or offline if OPENAI_API_KEY is unset."""
+    retriever = build_vector_store(bundle.documents).as_retriever(
+        search_kwargs={"k": 4}
+    )
+    retrieved_context = "\n\n".join(
+        hit.page_content for hit in retriever.invoke(question)
+    )
+    ranked_context = "\n".join(
+        f"- {card['name']}: score={card['score']}, reason={card['reason']}"
+        for card in bundle.ranked_products[:3]
+    )
+
+    # Offline path: report the top-ranked card without calling a chat model.
+    if not os.getenv("OPENAI_API_KEY"):
+        best = bundle.ranked_products[0]
+        return (
+            f"Recommended card: {best['name']}.\n"
+            f"Reason: {best['reason']}.\n\n"
+            f"Retrieved context:\n{retrieved_context}"
+        )
+
+    system_text = (
+        "You recommend credit-card products using only the supplied "
+        "Dozer context. Mention the card, why it fits, and one caveat."
+    )
+    human_text = (
+        "Customer context:\n{customer_context}\n\n"
+        "Ranked products:\n{ranked_context}\n\n"
+        "Retrieved context:\n{retrieved_context}\n\n"
+        "Question: {question}"
+    )
+    prompt = ChatPromptTemplate.from_messages(
+        [("system", system_text), ("human", human_text)]
+    )
+    chain = prompt | ChatOpenAI(model="gpt-4o-mini", temperature=0)
+    inputs = {
+        "customer_context": bundle.customer_context,
+        "ranked_context": ranked_context,
+        "retrieved_context": retrieved_context,
+        "question": question,
+    }
+    return str(chain.invoke(inputs).content)
+
+
+def main() -> None:
+    """Parse CLI arguments, load one customer's bundle from Dozer, print the answer."""
+    load_dotenv()  # load .env first so DOZER_BASE_URL from it can feed the default
+    base_url_default = os.getenv("DOZER_BASE_URL", DEFAULT_BASE_URL)
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--customer-id", default="cust_001")
+    cli.add_argument(
+        "--question",
+        default="Which credit card should this customer be offered?",
+    )
+    cli.add_argument("--base-url", default=base_url_default)
+    options = cli.parse_args()
+
+    bundle = load_bundle(DozerClient(options.base_url), options.customer_id)
+    reply = answer(bundle, options.question)
+    print(reply)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/samples/llm-vector-langchain/data/credit_card_products/credit_card_products.csv b/samples/llm-vector-langchain/data/credit_card_products/credit_card_products.csv
new file mode 100644
index 0000000000..0eee3c310b
--- /dev/null
+++ b/samples/llm-vector-langchain/data/credit_card_products/credit_card_products.csv
@@ -0,0 +1,5 @@
+product_id,name,annual_fee_usd,rewards_focus,intro_offer,eligibility,caveat
+card_001,Aurora Travel Signature,395,"travel dining lounge","75000 bonus points after qualifying spend","premium income and excellent credit","High annual fee only works for frequent travelers"
+card_002,Everyday Cashback Plus,0,"groceries streaming cashback","$200 statement credit after qualifying spend","starter or mass market income","Lower earn rate on travel"
+card_003,Family Balance Saver,95,"groceries utilities balance transfer","0 percent intro APR for 15 months","mass affluent income and good credit","Balance transfer fee applies"
+card_004,Nomad Rewards Flex,150,"travel dining subscriptions","50000 points after qualifying spend","premium income and good credit","Subscription credits require activation"
diff --git a/samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv b/samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv
new file mode 100644
index 0000000000..3f53ffaf49
--- /dev/null
+++ b/samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv
@@ -0,0 +1,5 @@
+customer_id,age,income_band,segment,goals,risk_tolerance
+cust_001,34,premium,frequent_traveler,"travel rewards, airport lounge access, lower foreign transaction costs",medium
+cust_002,27,starter,young_professional,"cashback on groceries, no annual fee, credit building",low
+cust_003,42,mass_affluent,family_planner,"groceries, school expenses, balance transfer",low
+cust_004,31,premium,digital_nomad,"travel rewards, dining perks, subscription credits",medium
diff --git a/samples/llm-vector-langchain/data/transactions/transactions.csv b/samples/llm-vector-langchain/data/transactions/transactions.csv
new file mode 100644
index 0000000000..a658860192
--- /dev/null
+++ b/samples/llm-vector-langchain/data/transactions/transactions.csv
@@ -0,0 +1,13 @@
+transaction_id,customer_id,merchant_category,amount_usd,txn_date
+txn_001,cust_001,travel,820.50,2026-01-12
+txn_002,cust_001,dining,188.10,2026-01-15
+txn_003,cust_001,travel,1320.00,2026-02-03
+txn_004,cust_002,groceries,122.23,2026-01-17
+txn_005,cust_002,streaming,18.99,2026-01-19
+txn_006,cust_002,groceries,98.11,2026-02-02
+txn_007,cust_003,groceries,240.42,2026-01-07
+txn_008,cust_003,utilities,210.00,2026-01-10
+txn_009,cust_003,education,510.00,2026-02-05
+txn_010,cust_004,dining,305.45,2026-01-21
+txn_011,cust_004,travel,940.00,2026-02-08
+txn_012,cust_004,subscriptions,64.99,2026-02-11
diff --git a/samples/llm-vector-langchain/dozer-config.yaml b/samples/llm-vector-langchain/dozer-config.yaml
new file mode 100644
index 0000000000..5d2e450f08
--- /dev/null
+++ b/samples/llm-vector-langchain/dozer-config.yaml
@@ -0,0 +1,77 @@
+app_name: llm-vector-langchain-sample
+version: 1
+
+api:
+  rest:
+    port: 8080
+    url: "[::0]"
+    cors: true
+  grpc:
+    port: 50051
+    url: "[::0]"
+    cors: true
+    web: true
+  auth: false
+  internal:
+    port: 50052
+    host: "[::1]"
+
+connections:
+  - db_type: ObjectStore
+    name: local_customer_data
+    authentication: !LocalStorage
+      details:
+        path: ./data
+      tables:
+        - !Table
+          name: customer_profiles
+          config: !CSV
+            path: customer_profiles
+            extension: .csv
+        - !Table
+          name: transactions
+          config: !CSV
+            path: transactions
+            extension: .csv
+        - !Table
+          name: credit_card_products
+          config: !CSV
+            path: credit_card_products
+            extension: .csv
+
+sources:
+  - name: customer_profiles
+    table_name: customer_profiles
+    connection: local_customer_data
+    # NOTE(review): assumes Dozer ingests only listed columns; list all the SQL selects.
+    columns: [customer_id, age, income_band, segment, goals, risk_tolerance]
+  - name: transactions
+    table_name: transactions
+    connection: local_customer_data
+    # NOTE(review): assumes Dozer ingests only listed columns; list all the SQL selects.
+    columns: [transaction_id, customer_id, merchant_category, amount_usd, txn_date]
+  - name: credit_card_products
+    table_name: credit_card_products
+    connection: local_customer_data
+    # NOTE(review): assumes Dozer ingests only listed columns; list all the SQL selects.
+    columns: [product_id, name, annual_fee_usd, rewards_focus, intro_offer, eligibility, caveat]
+
+endpoints:
+  - name: customer_profiles
+    path: /customer_profiles
+    sql: SELECT customer_id, age, income_band, segment, goals, risk_tolerance FROM customer_profiles;
+    index:
+      primary_key:
+        - customer_id
+  - name: transactions
+    path: /transactions
+    sql: SELECT transaction_id, customer_id, merchant_category, amount_usd, txn_date FROM transactions;
+    index:
+      primary_key:
+        - transaction_id
+  - name: credit_card_products
+    path: /credit_card_products
+    sql: SELECT product_id, name, annual_fee_usd, rewards_focus, intro_offer, eligibility, caveat FROM credit_card_products;
+    index:
+      primary_key:
+        - product_id
diff --git a/samples/llm-vector-langchain/dozer_client.py b/samples/llm-vector-langchain/dozer_client.py
new file mode 100644
index 0000000000..6aa94d9fbb
--- /dev/null
+++ b/samples/llm-vector-langchain/dozer_client.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from typing import Any
+
+import requests
+
+
+class DozerClient:
+    """Minimal HTTP client that lists rows from Dozer REST endpoints."""
+
+    def __init__(self, base_url: str) -> None:
+        self.base_url = base_url.rstrip("/")  # avoid "//" when joining paths
+
+    def fetch_endpoint(self, endpoint: str) -> list[dict[str, Any]]:
+        url = f"{self.base_url}/{endpoint.lstrip('/')}"
+        response = requests.get(url, timeout=15)
+        response.raise_for_status()
+        body = response.json()
+        envelope = body if isinstance(body, dict) else {}
+        options = [body, *map(envelope.get, ("data", "records", "items"))]
+        for rows in options:  # bare list first, then the known wrapper keys
+            if isinstance(rows, list):
+                return [dict(row) for row in rows]
+        raise ValueError(f"Unexpected Dozer response shape from {url}: {body!r}")
diff --git a/samples/llm-vector-langchain/recommender.py b/samples/llm-vector-langchain/recommender.py
new file mode 100644
index 0000000000..3221b0f93c
--- /dev/null
+++ b/samples/llm-vector-langchain/recommender.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from collections import Counter
+from typing import Any
+
+
+def build_customer_context(
+    customer: dict[str, Any],
+    transactions: list[dict[str, Any]],
+) -> str:
+    """Summarize one customer's profile, total spend and top three categories."""
+    category_counts = Counter(txn["merchant_category"] for txn in transactions)
+    total_spend = sum(float(txn["amount_usd"]) for txn in transactions)
+    ranked = [f"{name} ({seen})" for name, seen in category_counts.most_common(3)]
+    top_categories = ", ".join(ranked) or "none"  # "none" when no transactions
+    return (
+        f"Customer {customer['customer_id']} is in the {customer['segment']} "
+        f"segment with income band {customer['income_band']}. Goals: "
+        f"{customer['goals']}. Risk tolerance: {customer['risk_tolerance']}. "
+        f"Recent spend: ${total_spend:.2f}. Top categories: {top_categories}."
+    )
+
+
+def build_product_context(product: dict[str, Any]) -> str:
+    """Describe one card product row as a short retrieval document."""
+    card, fee = product["name"], float(product["annual_fee_usd"])
+    focus, offer = product["rewards_focus"], product["intro_offer"]
+    terms, caveat = product["eligibility"], product["caveat"]
+    intro = f"Card {card} has an annual fee of ${fee:.0f}. Rewards focus: {focus}. "
+    return f"{intro}Intro offer: {offer}. Eligibility: {terms}. Caveat: {caveat}."
+
+
+def rank_products(
+    customer: dict[str, Any],
+    transactions: list[dict[str, Any]],
+    products: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Score each product for the customer and return the rows best-first."""
+    # Blank categories are dropped: "" is a substring of every rewards focus
+    # and would otherwise add points to every card.
+    cleaned = (str(row["merchant_category"] or "").strip() for row in transactions)
+    category_counts = Counter(name.lower() for name in cleaned if name)
+    goals = str(customer.get("goals", "")).lower()
+    income_band = str(customer.get("income_band", "")).lower()
+    ranked: list[dict[str, Any]] = []
+
+    for product in products:
+        focus = str(product.get("rewards_focus", "")).lower()
+        eligibility = str(product.get("eligibility", "")).lower()
+        fee = float(product.get("annual_fee_usd", 0) or 0)
+        score = 0
+        reasons: list[str] = []
+
+        for category, count in category_counts.items():
+            if category in focus:
+                score += count * 3
+                reasons.append(f"matches {category} spend")
+
+        for goal in ("travel", "cashback", "balance transfer", "groceries"):
+            if goal in goals and goal in focus:
+                score += 4
+                reasons.append(f"supports {goal} goal")
+
+        if "premium" in income_band and fee > 0:
+            score += 1
+            reasons.append("premium segment can absorb an annual fee")
+        elif "starter" in income_band and fee == 0:
+            score += 2
+            reasons.append("no annual fee suits starter income band")
+
+        if "excellent credit" in eligibility and "starter" in income_band:
+            score -= 2
+            reasons.append("eligibility may be tight")
+
+        reason = "; ".join(reasons) or "general fit"
+        ranked.append({**product, "score": score, "reason": reason})
+
+    # sorted() is stable, so products with equal scores keep their input order.
+    return sorted(ranked, key=lambda row: row["score"], reverse=True)
diff --git a/samples/llm-vector-langchain/requirements.txt b/samples/llm-vector-langchain/requirements.txt
new file mode 100644
index 0000000000..28f6bc55eb
--- /dev/null
+++ b/samples/llm-vector-langchain/requirements.txt
@@ -0,0 +1,7 @@
+chromadb>=0.5.0
+langchain>=0.3.0
+langchain-chroma>=0.1.4
+langchain-openai>=0.2.0
+langchain-text-splitters>=0.3.0
+python-dotenv>=1.0.0
+requests>=2.32.0
diff --git a/samples/llm-vector-langchain/tests/test_recommender.py b/samples/llm-vector-langchain/tests/test_recommender.py
new file mode 100644
index 0000000000..d4cb3b83f7
--- /dev/null
+++ b/samples/llm-vector-langchain/tests/test_recommender.py
@@ -0,0 +1,63 @@
+import unittest
+
+from recommender import build_customer_context, rank_products
+
+
+class RecommenderTest(unittest.TestCase):  # offline checks of the recommender helpers
+    def test_travel_customer_gets_travel_card(self) -> None:  # travel spend + goal wins
+        customer = {
+            "customer_id": "cust_001",
+            "income_band": "premium",
+            "segment": "frequent_traveler",
+            "goals": "travel rewards and lounge access",
+            "risk_tolerance": "medium",
+        }
+        transactions = [
+            {"merchant_category": "travel", "amount_usd": "500"},
+            {"merchant_category": "travel", "amount_usd": "300"},
+            {"merchant_category": "dining", "amount_usd": "120"},
+        ]
+        products = [
+            {
+                "product_id": "card_001",
+                "name": "Aurora Travel Signature",
+                "annual_fee_usd": "395",
+                "rewards_focus": "travel dining lounge",
+                "eligibility": "premium income and excellent credit",
+            },
+            {
+                "product_id": "card_002",
+                "name": "Everyday Cashback Plus",
+                "annual_fee_usd": "0",
+                "rewards_focus": "groceries streaming cashback",
+                "eligibility": "starter or mass market income",
+            },
+        ]
+
+        ranked = rank_products(customer, transactions, products)  # best score first
+
+        self.assertEqual(ranked[0]["product_id"], "card_001")  # travel card outranks cashback
+        self.assertIn("matches travel spend", ranked[0]["reason"])
+
+    def test_customer_context_summarizes_spend(self) -> None:  # total and category counts
+        customer = {
+            "customer_id": "cust_002",
+            "income_band": "starter",
+            "segment": "young_professional",
+            "goals": "cashback",
+            "risk_tolerance": "low",
+        }
+        transactions = [
+            {"merchant_category": "groceries", "amount_usd": "10.50"},
+            {"merchant_category": "groceries", "amount_usd": "12.25"},
+            {"merchant_category": "streaming", "amount_usd": "8.00"},
+        ]
+
+        context = build_customer_context(customer, transactions)
+
+        self.assertIn("Recent spend: $30.75", context)  # 10.50 + 12.25 + 8.00
+        self.assertIn("groceries (2)", context)  # category label followed by its count
+
+
+if __name__ == "__main__":
+    unittest.main()