From 06821057acdedd9150e0ed15f578e83732ded9bc Mon Sep 17 00:00:00 2001 From: Shaowei Date: Sun, 17 May 2026 12:47:36 +0800 Subject: [PATCH] Add LangChain vector sample --- samples/README.md | 7 + samples/llm-vector-langchain/README.md | 64 +++++++ samples/llm-vector-langchain/app.py | 168 ++++++++++++++++++ .../credit_card_products.csv | 5 + .../customer_profiles/customer_profiles.csv | 5 + .../data/transactions/transactions.csv | 13 ++ .../llm-vector-langchain/dozer-config.yaml | 77 ++++++++ samples/llm-vector-langchain/dozer_client.py | 24 +++ samples/llm-vector-langchain/recommender.py | 79 ++++++++ samples/llm-vector-langchain/requirements.txt | 7 + .../tests/test_recommender.py | 63 +++++++ 11 files changed, 512 insertions(+) create mode 100644 samples/README.md create mode 100644 samples/llm-vector-langchain/README.md create mode 100644 samples/llm-vector-langchain/app.py create mode 100644 samples/llm-vector-langchain/data/credit_card_products/credit_card_products.csv create mode 100644 samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv create mode 100644 samples/llm-vector-langchain/data/transactions/transactions.csv create mode 100644 samples/llm-vector-langchain/dozer-config.yaml create mode 100644 samples/llm-vector-langchain/dozer_client.py create mode 100644 samples/llm-vector-langchain/recommender.py create mode 100644 samples/llm-vector-langchain/requirements.txt create mode 100644 samples/llm-vector-langchain/tests/test_recommender.py diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 0000000000..aed34896d4 --- /dev/null +++ b/samples/README.md @@ -0,0 +1,7 @@ +# Dozer samples + +## LLM vector LangChain sample + +The `llm-vector-langchain` sample shows how to expose customer, transaction, +and credit-card product CSV data through Dozer REST endpoints, then use +LangChain and Chroma to answer personalized recommendation questions. 
diff --git a/samples/llm-vector-langchain/README.md b/samples/llm-vector-langchain/README.md new file mode 100644 index 0000000000..9125fa515b --- /dev/null +++ b/samples/llm-vector-langchain/README.md @@ -0,0 +1,64 @@ +# Dozer + LangChain customer profile sample + +This sample turns Dozer REST endpoints into retrieval context for a credit-card +recommendation assistant. It follows the use case from the Dozer LLM article: +combine profile, transaction, and product data, index it in a vector database, +then answer with personalized product recommendations. + +## What is included + +- Three small CSV datasets under `data/`. +- `dozer-config.yaml`, which exposes the CSV files through Dozer LocalStorage + endpoints. +- A LangChain application that reads the Dozer REST API, writes documents to + Chroma, and answers a recommendation question. +- A small offline test for the recommendation/context-building logic. + +## Run the sample + +From this directory: + +```bash +dozer --config-path dozer-config.yaml +``` + +In another terminal: + +```bash +python -m venv .venv +. .venv/bin/activate +pip install -r requirements.txt +python app.py --customer-id cust_001 +``` + +The application defaults to `http://localhost:8080`. Set `DOZER_BASE_URL` if +Dozer is running elsewhere. + +```bash +DOZER_BASE_URL=http://localhost:8080 python app.py --customer-id cust_003 +``` + +If `OPENAI_API_KEY` is set, the sample uses OpenAI through LangChain. Without an +API key it uses a deterministic local response, which keeps the sample runnable +for local development and CI. + +## Dozer endpoints + +The config exposes these endpoints: + +- `/customer_profiles`: customer segment, income band, and declared goals. +- `/transactions`: individual transaction rows; the app filters them by customer. +- `/credit_card_products`: available card products and eligibility metadata. + +The Python app queries each endpoint, builds one customer profile document plus +one product document per card, and stores them in Chroma. 
class HashEmbedding:
    """Small deterministic embedding used when no OpenAI key is configured.

    Each token of the input text is hashed with SHA-256 into one of
    ``dimensions`` buckets, the bucket counts form the vector, and the vector
    is scaled to unit length. The class exposes the ``embed_documents`` /
    ``embed_query`` pair that the vector store calls, so it can stand in for
    a real embedding model during local development.

    Tokens are runs of alphanumeric characters, lower-cased. Punctuation and
    underscores act as separators so that ``"travel,"`` in a document and
    ``"travel"`` in a question land in the same bucket.
    """

    def __init__(self, dimensions: int = 64) -> None:
        """Create an embedder producing vectors of length ``dimensions``.

        Raises:
            ValueError: if ``dimensions`` is not a positive integer. Without
                this check the failure would only surface later as a
                ``ZeroDivisionError`` inside the bucket computation.
        """
        if dimensions <= 0:
            raise ValueError(
                f"dimensions must be a positive integer, got {dimensions!r}"
            )
        self.dimensions = dimensions

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding per entry of ``texts``, in the same order."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> list[float]:
        """Return the embedding of a single query string."""
        return self._embed(text)

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Split ``text`` into lower-cased alphanumeric tokens."""
        # Replace every non-alphanumeric character with a space first so that
        # trailing commas, full stops, brackets, "$" and "_" do not become
        # part of a token and silently change its hash bucket.
        normalized = "".join(
            char if char.isalnum() else " " for char in text.lower()
        )
        return normalized.split()

    def _embed(self, text: str) -> list[float]:
        """Hash the tokens of ``text`` into a unit-length count vector.

        Text without any token yields the all-zero vector.
        """
        vector = [0.0] * self.dimensions
        for token in self._tokenize(text):
            digest = hashlib.sha256(token.encode("utf-8")).digest()
            # The first four digest bytes are plenty to pick a bucket.
            index = int.from_bytes(digest[:4], byteorder="big") % self.dimensions
            vector[index] += 1.0
        # ``or 1.0`` avoids dividing by zero for the all-zero vector.
        magnitude = sum(value * value for value in vector) ** 0.5 or 1.0
        return [value / magnitude for value in vector]
def main() -> None:
    """Command-line entry point for the recommendation sample.

    Reads ``--customer-id``, ``--question`` and ``--base-url`` from the
    command line, loads that customer's data from Dozer, and prints the
    recommendation text.
    """
    # Call load_dotenv() before reading the environment so that a
    # DOZER_BASE_URL loaded from a .env file can feed the --base-url default.
    load_dotenv()
    fallback_base_url = os.getenv("DOZER_BASE_URL", DEFAULT_BASE_URL)

    cli = argparse.ArgumentParser()
    cli.add_argument("--customer-id", default="cust_001")
    cli.add_argument(
        "--question",
        default="Which credit card should this customer be offered?",
    )
    cli.add_argument("--base-url", default=fallback_base_url)
    options = cli.parse_args()

    dozer = DozerClient(options.base_url)
    customer_bundle = load_bundle(dozer, options.customer_id)
    recommendation = answer(customer_bundle, options.question)
    print(recommendation)
points after qualifying spend","premium income and excellent credit","High annual fee only works for frequent travelers" +card_002,Everyday Cashback Plus,0,"groceries streaming cashback","$200 statement credit after qualifying spend","starter or mass market income","Lower earn rate on travel" +card_003,Family Balance Saver,95,"groceries utilities balance transfer","0 percent intro APR for 15 months","mass affluent income and good credit","Balance transfer fee applies" +card_004,Nomad Rewards Flex,150,"travel dining subscriptions","50000 points after qualifying spend","premium income and good credit","Subscription credits require activation" diff --git a/samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv b/samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv new file mode 100644 index 0000000000..3f53ffaf49 --- /dev/null +++ b/samples/llm-vector-langchain/data/customer_profiles/customer_profiles.csv @@ -0,0 +1,5 @@ +customer_id,age,income_band,segment,goals,risk_tolerance +cust_001,34,premium,frequent_traveler,"travel rewards, airport lounge access, lower foreign transaction costs",medium +cust_002,27,starter,young_professional,"cashback on groceries, no annual fee, credit building",low +cust_003,42,mass_affluent,family_planner,"groceries, school expenses, balance transfer",low +cust_004,31,premium,digital_nomad,"travel rewards, dining perks, subscription credits",medium diff --git a/samples/llm-vector-langchain/data/transactions/transactions.csv b/samples/llm-vector-langchain/data/transactions/transactions.csv new file mode 100644 index 0000000000..a658860192 --- /dev/null +++ b/samples/llm-vector-langchain/data/transactions/transactions.csv @@ -0,0 +1,13 @@ +transaction_id,customer_id,merchant_category,amount_usd,txn_date +txn_001,cust_001,travel,820.50,2026-01-12 +txn_002,cust_001,dining,188.10,2026-01-15 +txn_003,cust_001,travel,1320.00,2026-02-03 +txn_004,cust_002,groceries,122.23,2026-01-17 
+txn_005,cust_002,streaming,18.99,2026-01-19 +txn_006,cust_002,groceries,98.11,2026-02-02 +txn_007,cust_003,groceries,240.42,2026-01-07 +txn_008,cust_003,utilities,210.00,2026-01-10 +txn_009,cust_003,education,510.00,2026-02-05 +txn_010,cust_004,dining,305.45,2026-01-21 +txn_011,cust_004,travel,940.00,2026-02-08 +txn_012,cust_004,subscriptions,64.99,2026-02-11 diff --git a/samples/llm-vector-langchain/dozer-config.yaml b/samples/llm-vector-langchain/dozer-config.yaml new file mode 100644 index 0000000000..5d2e450f08 --- /dev/null +++ b/samples/llm-vector-langchain/dozer-config.yaml @@ -0,0 +1,77 @@ +app_name: llm-vector-langchain-sample +version: 1 + +api: + rest: + port: 8080 + url: "[::0]" + cors: true + grpc: + port: 50051 + url: "[::0]" + cors: true + web: true + auth: false + internal: + port: 50052 + host: "[::1]" + +connections: + - db_type: ObjectStore + name: local_customer_data + authentication: !LocalStorage + details: + path: ./data + tables: + - !Table + name: customer_profiles + config: !CSV + path: customer_profiles + extension: .csv + - !Table + name: transactions + config: !CSV + path: transactions + extension: .csv + - !Table + name: credit_card_products + config: !CSV + path: credit_card_products + extension: .csv + +sources: + - name: customer_profiles + table_name: customer_profiles + connection: local_customer_data + columns: + - customer_id + - name: transactions + table_name: transactions + connection: local_customer_data + columns: + - transaction_id + - name: credit_card_products + table_name: credit_card_products + connection: local_customer_data + columns: + - product_id + +endpoints: + - name: customer_profiles + path: /customer_profiles + sql: SELECT customer_id, age, income_band, segment, goals, risk_tolerance FROM customer_profiles; + index: + primary_key: + - customer_id + - name: transactions + path: /transactions + sql: SELECT transaction_id, customer_id, merchant_category, amount_usd, txn_date FROM transactions; + index: + 
class DozerClient:
    """Minimal HTTP client that reads rows from Dozer REST endpoints."""

    def __init__(self, base_url: str) -> None:
        """Remember ``base_url`` without any trailing slash."""
        self.base_url = base_url.rstrip("/")

    def fetch_endpoint(self, endpoint: str) -> list[dict[str, Any]]:
        """GET ``<base_url>/<endpoint>`` and return its rows as dictionaries.

        Args:
            endpoint: endpoint path, with or without a leading slash.

        Returns:
            One ``dict`` per row of the JSON response.

        Raises:
            requests.HTTPError: if the server answers with an error status
                (raised by ``raise_for_status``).
            ValueError: if the JSON body is not a list of objects, or an
                object wrapping such a list.
        """
        url = f"{self.base_url}/{endpoint.lstrip('/')}"
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        return self._extract_rows(response.json(), url)

    @staticmethod
    def _extract_rows(payload: Any, url: str) -> list[dict[str, Any]]:
        """Pull the list of row objects out of a decoded JSON ``payload``.

        Accepts either a bare JSON array, or a JSON object that carries the
        array under one of the keys ``data``, ``records`` or ``items``
        (checked in that order). ``url`` is only used in the error message.

        Raises:
            ValueError: if no such array is found or any row is not a JSON
                object.
        """
        rows = None
        if isinstance(payload, list):
            rows = payload
        elif isinstance(payload, dict):
            for key in ("data", "records", "items"):
                value = payload.get(key)
                if isinstance(value, list):
                    rows = value
                    break

        # Validate the rows up front so a malformed response is reported with
        # the URL and payload, not as an opaque error from ``dict(row)``.
        if rows is None or not all(isinstance(row, dict) for row in rows):
            raise ValueError(
                f"Unexpected Dozer response shape from {url}: {payload!r}"
            )
        # Copy each row so callers can mutate results without touching the
        # decoded payload.
        return [dict(row) for row in rows]
def rank_products(
    customer: dict[str, Any],
    transactions: list[dict[str, Any]],
    products: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Score every product for one customer and return them best-first.

    Scoring rules, applied per product:

    * +3 for every transaction whose merchant category appears in the
      product's ``rewards_focus``;
    * +4 for each of the keywords ``travel``, ``cashback``,
      ``balance transfer`` and ``groceries`` found in both the customer's
      ``goals`` and the product's ``rewards_focus``;
    * +1 when the income band contains ``premium`` and the card has an
      annual fee, otherwise +2 when it contains ``starter`` and the card has
      no annual fee;
    * -2 when the card's eligibility mentions ``excellent credit`` and the
      income band contains ``starter``.

    Args:
        customer: customer row; ``goals`` and ``income_band`` are read and
            treated as empty when absent.
        transactions: that customer's transaction rows; only
            ``merchant_category`` is read. Rows with a missing, null or
            blank category are ignored.
        products: product rows; ``rewards_focus``, ``eligibility`` and
            ``annual_fee_usd`` are read.

    Returns:
        A new list with a copy of every product plus ``score`` (int) and
        ``reason`` (human-readable explanation, ``"general fit"`` when no
        rule fired), sorted by descending score. Products with equal scores
        keep their input order. The input rows are not modified.
    """
    category_counts: Counter = Counter()
    for row in transactions:
        category = str(row.get("merchant_category") or "").strip().lower()
        # Skip blank categories: the empty string is a substring of every
        # rewards focus and would otherwise boost every product.
        if category:
            category_counts[category] += 1

    goals = str(customer.get("goals", "")).lower()
    income_band = str(customer.get("income_band", "")).lower()
    ranked: list[dict[str, Any]] = []

    for product in products:
        focus = str(product.get("rewards_focus", "")).lower()
        eligibility = str(product.get("eligibility", "")).lower()
        # ``or 0`` covers a null or empty fee field.
        fee = float(product.get("annual_fee_usd", 0) or 0)
        score = 0
        reasons: list[str] = []

        for category, count in category_counts.items():
            if category in focus:
                score += count * 3
                reasons.append(f"matches {category} spend")

        for goal in ("travel", "cashback", "balance transfer", "groceries"):
            if goal in goals and goal in focus:
                score += 4
                reasons.append(f"supports {goal} goal")

        if "premium" in income_band and fee > 0:
            score += 1
            reasons.append("premium segment can absorb an annual fee")
        elif "starter" in income_band and fee == 0:
            score += 2
            reasons.append("no annual fee suits starter income band")

        if "excellent credit" in eligibility and "starter" in income_band:
            score -= 2
            reasons.append("eligibility may be tight")

        ranked.append(
            {
                **product,
                "score": score,
                "reason": "; ".join(reasons) or "general fit",
            }
        )

    # ``sorted`` is stable, so ties stay in the order the products arrived.
    return sorted(ranked, key=lambda row: row["score"], reverse=True)
"segment": "young_professional", + "goals": "cashback", + "risk_tolerance": "low", + } + transactions = [ + {"merchant_category": "groceries", "amount_usd": "10.50"}, + {"merchant_category": "groceries", "amount_usd": "12.25"}, + {"merchant_category": "streaming", "amount_usd": "8.00"}, + ] + + context = build_customer_context(customer, transactions) + + self.assertIn("Recent spend: $30.75", context) + self.assertIn("groceries (2)", context) + + +if __name__ == "__main__": + unittest.main()