-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.py
More file actions
41 lines (31 loc) · 1.24 KB
/
load_data.py
File metadata and controls
41 lines (31 loc) · 1.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Import necessary modules
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
def create_vector_db():
"""Create a vector database from PDF documents."""
# Load documents from the specified directory
loader = PyPDFDirectoryLoader(os.getenv('DATA_PATH'))
documents = loader.load()
print(f"Processed {len(documents)} pdf files")
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
# Create a vector store from the document chunks
vectorStore = Chroma.from_documents(
documents=texts,
embedding=OllamaEmbeddings(model="nomic-embed-text",show_progress=True),
persist_directory=os.getenv('DB_PATH')
)
# Persist the vector store to disk
vectorStore.persist()
def main():
"""Main function to create the vector database."""
create_vector_db()
if __name__ == "__main__":
main()