-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest.py
More file actions
52 lines (45 loc) · 1.65 KB
/
ingest.py
File metadata and controls
52 lines (45 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import csv
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from dotenv import load_dotenv
import os
load_dotenv()
openai_api_key: str = os.getenv("OPENAI_API_KEY")
def read_from_csv_adapted(csv_path: str) -> list[Document]:
"""
Reads the adapted CSV file and converts its content into a list of Document objects.
:param csv_path: The path to the adapted CSV file.
:return: A list of Document objects.
"""
documents = []
with open(csv_path, 'r', encoding='utf-8') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
doc = Document(
page_content=row['text_chunk'],
metadata={
"page_number": 1, # Placeholder
"chunk": 1, # Placeholder
"source": "processed_embeddings.csv", # Placeholder
# Add other metadata fields if needed
}
)
documents.append(doc)
return documents
if __name__ == "__main__":
# Step 1: Read from CSV
csv_path_adapted = "processed_embeddings.csv"
document_chunks = read_from_csv_adapted(csv_path_adapted)
# Step 3 + 4: Generate embeddings and store them in DB
embeddings = OpenAIEmbeddings()
vector_store = Chroma.from_documents(
document_chunks,
embeddings,
collection_name="html",
persist_directory="chroma",
)
# Save DB locally
vector_store.persist()
# Number of documents in the vector store and DB written
print(f"Number of Document Chunks: {len(document_chunks)}")