-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocuments.py
More file actions
192 lines (152 loc) · 7.18 KB
/
documents.py
File metadata and controls
192 lines (152 loc) · 7.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import datetime
from unstructured.cleaners.core import clean
from unstructured.chunking.title import chunk_by_title
from dbmodel import Document, DocumentChunk, DocumentTypes
import dbconnect
from fastembed import TextEmbedding
# Module-level embedding model: constructed once when this module is imported
# and shared by the chunk-embedding helper below.
# NOTE(review): no model name is passed, so this relies on fastembed's default;
# the original author's note says that is BAAI/bge-small-en-v1.5 — confirm
# against the installed fastembed version.
model = TextEmbedding() # using BAAI/bge-small-en-v1.5
def add_document(elements, title, doc_type, url):
    """Create a new document from parsed elements, embed it, and store it.

    A ``Document`` is built from ``title``, ``url`` and ``doc_type``. The
    helper ``_create_documentchunks`` then chunks ``elements``, embeds each
    chunk with the module-level ``model`` and appends the resulting
    ``DocumentChunk`` objects to the document. The finished document is
    handed to ``dbconnect.add_document``.

    Args:
        elements: Elements to be chunked (presumably the output of
            unstructured's partitioning — confirm with callers).
        title: Title stored on the new document.
        doc_type: Document type stored on the new document (presumably a
            ``DocumentTypes`` value — confirm with callers).
        url: Source URL stored on the document and passed on to the
            chunking helper.

    Returns:
        Document: The newly created document with its chunks attached.

    Raises:
        Exception: Nothing is caught here, so errors from chunking,
            embedding, or the database layer propagate to the caller.
    """
    new_document = Document(title=title, url=url, doc_type=doc_type)

    # Attach embedded chunks before the document is handed to the DB layer.
    _create_documentchunks(new_document, elements, url, title, doc_type)

    dbconnect.add_document(new_document)
    return new_document
def update_document(document: Document, elements, title, doc_type, url):
    """Re-chunk ``elements`` onto an existing document and save the update.

    Args:
        document: The existing document to update in place.
        elements: Elements to be chunked and embedded.
        title: Forwarded to the chunk helper only; not written to ``document``
            here.
        doc_type: Forwarded to the chunk helper only; not written to
            ``document`` here.
        url: Source URL forwarded to the chunk helper.

    Returns:
        Document: The same ``document`` instance that was passed in.
    """
    # NOTE(review): new chunks are appended to ``document.chunks``; nothing in
    # this function removes chunks that were already attached, and the
    # document's own title/doc_type/url are left untouched. Confirm that the
    # caller or ``dbconnect.update_document`` takes care of that.
    _create_documentchunks(document, elements, url, title, doc_type)

    # Naive local timestamp, exactly as before.
    touched_at = datetime.datetime.now()
    document.modified_at = touched_at

    dbconnect.update_document(document)
    return document
def _create_documentchunks(document, elements, url, title, doc_type):
    """Chunk ``elements``, embed every chunk, and append them to ``document``.

    The chunk payloads come from ``_calculate_chunks``. All chunk texts are
    embedded with a single batched ``model.embed`` call (instead of one model
    invocation per chunk), and one ``DocumentChunk`` per payload is appended
    to ``document.chunks`` in order.

    Args:
        document: Object exposing a ``chunks`` list to append to.
        elements: Elements to be chunked.
        url: Source URL recorded in each chunk's metadata.
        title: Unused here; kept so existing callers keep working.
        doc_type: Unused here; kept so existing callers keep working.

    Raises:
        ValueError: If the model does not return exactly one embedding per
            chunk. Raised before ``document`` is modified.
        Exception: Errors from chunking or embedding propagate unchanged.
    """
    chunks = _calculate_chunks(elements, url)
    if not chunks:
        # Nothing to embed; avoid calling the model with an empty batch.
        return

    # One batched call: fastembed yields one vector per input text, in order.
    vectors = list(model.embed([chunk["text"] for chunk in chunks]))
    if len(vectors) != len(chunks):
        raise ValueError(
            f"expected {len(chunks)} embeddings, got {len(vectors)}"
        )

    for order_index, (chunk, vector) in enumerate(zip(chunks, vectors)):
        document.chunks.append(
            DocumentChunk(
                order_index=order_index,
                content=chunk["text"],
                content_vector=vector,
                metadata_json=chunk["metadata"],
            )
        )
def _calculate_chunks(elements, url):
    """Turn elements into cleaned, title-based chunks ready for embedding.

    Steps performed:

    - Chunk ``elements`` with ``chunk_by_title`` (max 2000 characters, a new
      chunk after 1500 characters, text under 500 characters is combined).
    - Replace literal ``\\r`` / ``\\t`` / ``\\n`` escape sequences and null
      characters with spaces, then run ``clean`` with ``extra_whitespace``
      and ``dashes`` enabled.
    - Drop chunks whose cleaned text is shorter than 20 characters.

    Args:
        elements (list): Elements to chunk (presumably produced by
            unstructured's partitioning — confirm with callers).
        url (str): Source URL, stored in each chunk's metadata as ``source``.

    Returns:
        list[dict]: One dictionary per kept chunk, shaped like::

            {
                "id": "file-<n>",
                "text": "<cleaned chunk text>",
                "metadata": {
                    "source": url,
                    "type": chunk.category,
                    "page_number": <page number, or 1 when missing>,
                },
            }

        ``<n>`` is the chunk's position in the ``chunk_by_title`` output, so
        ids can have gaps where short chunks were skipped.

    Raises:
        Exception: Errors from chunking or cleaning are not caught here.
    """
    # Two-character escape sequences (a backslash followed by a letter) that
    # survive in the text; per the original author they are not stripped out
    # by the cleaning step.
    escaped_sequences = ("\\r", "\\t", "\\n")
    min_text_length = 20

    title_chunks = chunk_by_title(
        elements,
        max_characters=2000,
        new_after_n_chars=1500,
        combine_text_under_n_chars=500,  # merges tiny snippets into the bigger chunk
    )

    payloads = []
    for position, section in enumerate(title_chunks):
        raw_text = section.text
        for sequence in escaped_sequences:
            raw_text = raw_text.replace(sequence, " ")

        # Null characters cause issues with the vector DB.
        raw_text = raw_text.replace("\x00", " ")
        cleaned_text = clean(raw_text, extra_whitespace=True, dashes=True)

        # Keep only chunks long enough to be meaningful for search.
        if len(cleaned_text) >= min_text_length:
            payloads.append(
                {
                    "id": f"file-{position}",
                    "text": cleaned_text,
                    "metadata": {
                        "source": url,
                        "type": section.category,  # e.g. 'Title', 'NarrativeText', 'ListItem'
                        "page_number": section.metadata.page_number or 1,
                    },
                }
            )

    return payloads
def _document_to_dict(doc, return_chunks):
    """Serialize one document record into a plain dictionary."""
    return {
        "id": doc.id,
        "title": doc.title,
        "url": doc.url,
        "doc_type": doc.doc_type,
        "created_at": doc.created_at.isoformat() if doc.created_at else "",
        "modified_at": doc.modified_at.isoformat() if doc.modified_at else "",
        # ``doc.chunks`` is only touched when chunks were requested.
        "chunks": [
            {"order_index": c.order_index, "content": c.content}
            for c in doc.chunks
        ] if return_chunks else [],
    }


def get_documents(page: int = 1, page_size: int = 10, return_chunks: bool = True) -> tuple[int, list[dict]]:
    """Retrieve one page of documents as plain dictionaries.

    The offset is derived from ``page`` and ``page_size``, that slice is
    fetched through ``dbconnect.get_documents``, and the total number of
    documents is fetched through ``dbconnect.get_documents_count``.

    Args:
        page (int): 1-based page number; values below 1 are treated as 1.
        page_size (int): Number of documents per page. Passed to the database
            layer as-is (not validated here).
        return_chunks (bool): When False, every document's ``chunks`` entry is
            an empty list and the document's chunks are not read.

    Returns:
        tuple: ``(total_count, documents)`` where ``total_count`` is the value
        returned by ``dbconnect.get_documents_count`` and ``documents`` is a
        list of dictionaries, each containing:

        - id: The document ID.
        - title: The document title.
        - url: The document URL.
        - doc_type: The document type.
        - created_at: ISO-8601 string, or ``""`` when unset.
        - modified_at: ISO-8601 string, or ``""`` when unset.
        - chunks: A list of ``{"order_index", "content"}`` dictionaries
          (empty when ``return_chunks`` is False).
    """
    # Ensure page is at least 1 so the offset is never negative for a
    # positive page size.
    page = max(1, page)
    offset_value = (page - 1) * page_size

    documents = dbconnect.get_documents(offset_value, page_size)
    count = dbconnect.get_documents_count()

    return (count, [_document_to_dict(doc, return_chunks) for doc in documents])
def get_document_by_url(url: str) -> Document | None:
    """Look up a stored document by its URL.

    Thin wrapper: delegates to ``dbconnect.get_document_by_url`` and returns
    its result unchanged.
    """
    found = dbconnect.get_document_by_url(url)
    return found
def delete_document(document_id: int) -> bool:
    """Delete a document and all its chunks by ID.

    Thin wrapper: delegates to ``dbconnect.delete_document`` and returns its
    result unchanged.
    """
    outcome = dbconnect.delete_document(document_id)
    return outcome