This guide will get you up and running with PullData in under 5 minutes.
- Python 3.9 or higher
- 8GB RAM minimum (16GB recommended)
- Optional: NVIDIA GPU with CUDA support (Tesla P4 or better)
# Navigate to project directory
cd PullData
# Create virtual environment
python -m venv venv
# Activate virtual environment
# Windows:
venv\Scripts\activate
# Linux/Mac:
source venv/bin/activate
# Install PullData
pip install -e .
# Copy environment template
cp .env.example .env
# Edit .env with your settings (optional)
# Default settings work out-of-box for local mode
from pulldata import PullData
# Initialize with default local storage
pd = PullData(project="quickstart")
# Ingest documents
stats = pd.ingest("path/to/your/documents/*.pdf")
print(f"Processed {stats['new_chunks']} chunks")
# Query with automatic Excel generation
result = pd.query(
query="What are the key financial metrics?",
output_format="excel" # File automatically saved to ./output/
)
print(f"Answer: {result.llm_response.text}")
print(f"Report saved to: {result.output_path}")
print(f"Sources: {len(result.retrieved_chunks)}")
# Cleanup
pd.close()
# Initialize a project
pulldata init --project quickstart
# Ingest documents
pulldata ingest --project quickstart --path ./documents/
# Query with Excel output
pulldata query \
--project quickstart \
--query "What are the key financial metrics?" \
--output excel \
--save financial_metrics.xlsx
# Query with Markdown output
pulldata query \
--project quickstart \
--query "Summarize the main findings" \
--output markdown \
--save summary.md
Choose between local models or API endpoints:
# configs/default.yaml
models:
llm:
provider: local
local:
name: Qwen/Qwen2.5-3B-Instruct
quantization: int8
device: cuda
models:
llm:
provider: api
api:
base_url: http://localhost:1234/v1
api_key: sk-dummy
model: local-model
models:
llm:
provider: api
api:
base_url: https://api.openai.com/v1
api_key: ${OPENAI_API_KEY} # Set in .env
model: gpt-3.5-turbo
See API Configuration Guide for complete setup instructions.
PullData supports three storage backends:
# configs/default.yaml
storage:
backend: local
local:
sqlite_path: ./data/pulldata.db
faiss_index_path: ./data/faiss_indexes
storage:
backend: postgres
postgres:
host: localhost
port: 5432
database: pulldata
user: pulldata_user
password: ${POSTGRES_PASSWORD}
storage:
backend: chromadb
chromadb:
persist_directory: ./data/chroma_db
Choose models based on your hardware:
models:
embedder:
name: BAAI/bge-small-en-v1.5
llm:
name: Qwen/Qwen2.5-3B-Instruct
quantization: int8
models:
embedder:
name: BAAI/bge-small-en-v1.5
device: cpu
llm:
name: Qwen/Qwen2.5-3B-Instruct
quantization: int8
device: cpu
models:
embedder:
name: BAAI/bge-large-en-v1.5
llm:
name: Qwen/Qwen2.5-7B-Instruct
quantization: fp16
PullData automatically generates deliverable files in multiple formats!
result = pd.query(
query="Extract revenue by region",
output_format="excel" # Automatically saved to ./output/
)
print(f"Excel report: {result.output_path}")
result = pd.query(
query="Summarize key findings",
output_format="markdown"
)
print(f"Markdown summary: {result.output_path}")
result = pd.query(
query="Extract structured data",
output_format="json"
)
print(f"JSON data: {result.output_path}")
result = pd.query(
query="Create presentation slides",
output_format="powerpoint"
)
print(f"PowerPoint: {result.output_path}")
result = pd.query(
query="Generate report",
output_format="pdf"
)
print(f"PDF report: {result.output_path}")
Note: Files are automatically saved to ./output/{project}_query_{timestamp}.{format} with the full path available in result.output_path.
result = pd.query(
query="Revenue trends",
filters={
"doc_type": "financial_report",
"date_range": ("2024-01-01", "2024-12-31"),
"tags": ["quarterly", "audited"],
"page_number": 5
}
)
# First ingest (full processing)
pd.ingest("report.pdf")
# Update document (only changed content re-processed)
stats = pd.ingest("report.pdf")
print(f"Skipped {stats['skipped_chunks']} unchanged chunks") # Much faster!
# Separate projects for different document collections
finance = PullData(project="finance")
legal = PullData(project="legal")
finance.ingest("financial_docs/*.pdf")
legal.ingest("legal_docs/*.pdf")
# Each project has isolated storage
finance_result = finance.query("What's Q3 revenue?", output_format="excel")
legal_result = legal.query("What are the terms?", output_format="pdf")
finance.close()
legal.close()
# Reduce batch size in configs/default.yaml
performance:
batch_size: 16 # Default: 32
max_memory_gb: 6 # For P4: 6
# Enable GPU if available
models:
embedder:
device: cuda
llm:
device: cuda
# Adjust table detection settings
parsing:
pdf:
table_settings:
min_words_vertical: 2 # Lower threshold
intersection_tolerance: 5 # Higher tolerance
# Format code
make format
# Run linting
make lint
# Run tests
make test
# Run tests with coverage
make test-cov
# Type checking
make type-check
# Clean build artifacts
make clean
| Task | Performance | Hardware |
|---|---|---|
| Document ingestion | <5s/page | Tesla P4 |
| Query latency | <2s | Tesla P4 |
| Cache hit latency | <0.05s | Any |
| Table extraction | >90% accuracy | - |
- Read the full documentation: README.md
- Explore configuration options: configs/default.yaml
- Check model presets: configs/models.yaml
- Review contributing guide: CONTRIBUTING.md
- Issues: https://github.com/pulldata/pulldata/issues
- Discussions: https://github.com/pulldata/pulldata/discussions
- Documentation: https://pulldata.readthedocs.io
See examples/ directory for complete working examples:
- basic_usage.py - Simple RAG query workflow
- lm_studio_api_embeddings.py - Using LM Studio for embeddings and LLM
- query_with_output_formats.py - End-to-end with all output formats
- output_formats_example.py - Standalone formatter usage
- pdf_ingestion_example.py - PDF processing and ingestion
from pathlib import Path
from pulldata import PullData
# Initialize
pd = PullData(
project="financial_analysis",
config_path="configs/lm_studio_api_embeddings.yaml"
)
# Ingest documents
stats = pd.ingest("financial_reports/*.pdf")
print(f"Ingested {stats['new_chunks']} chunks from {stats['processed_files']} files")
# Query with Excel output
result = pd.query(
query="What was the total revenue in Q3 2024?",
output_format="excel"
)
print(f"\nAnswer: {result.llm_response.text}")
print(f"Sources: {len(result.retrieved_chunks)}")
print(f"Excel report saved to: {result.output_path}")
# Query with PowerPoint output
presentation = pd.query(
query="Create a summary of key financial metrics",
output_format="powerpoint"
)
print(f"PowerPoint saved to: {presentation.output_path}")
pd.close()
Version: 0.1.0 Status: Alpha Last Updated: 2024-12-18