forked from pratikm778/HackAI
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_processor.py
More file actions
219 lines (181 loc) · 9.31 KB
/
pdf_processor.py
File metadata and controls
219 lines (181 loc) · 9.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import io
import logging
import os
import shutil
from pathlib import Path
from typing import Dict, List, Tuple

import fitz  # PyMuPDF
from PIL import Image

from embeddings_processor import ImageAnalyzer
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PDFProcessor:
    """
    Handles all PDF processing operations including text extraction and
    image extraction.

    Page text is split into roughly ``chunk_size``-character chunks; images
    (both embedded images and full-page renders in which tables/diagrams are
    detected) are analyzed through :class:`ImageAnalyzer`.
    """

    def __init__(self, output_dir: str = "data", image_dir: str = "pic_data"):
        """
        Args:
            output_dir: Directory where text chunks are written by
                :meth:`save_text_chunks`.
            image_dir: Directory where extracted images are saved.
        """
        self.output_dir = Path(output_dir)
        self.image_dir = Path(image_dir)
        # Only the leaf directory is created (no parents=True), matching the
        # original behavior.
        self.output_dir.mkdir(exist_ok=True)
        self.image_dir.mkdir(exist_ok=True)
        self.image_analyzer = ImageAnalyzer()

    def process_pdf(self, pdf_path: str, chunk_size: int = 1000) -> Tuple[List[Dict], List[Dict]]:
        """
        Process a PDF file to extract both text and images, with special
        handling for tables, diagrams, and graphs.

        Args:
            pdf_path: Path to the PDF file.
            chunk_size: Approximate number of characters per text chunk.

        Returns:
            Tuple of (text_chunks, image_info); each element is a list of
            dicts describing one text chunk / one extracted image.

        Raises:
            Exception: Any error from the PDF library is logged and
                re-raised.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        text_chunks: List[Dict] = []
        image_info: List[Dict] = []
        doc = None
        try:
            doc = fitz.open(pdf_path)
            for page_num, page in enumerate(doc, 1):
                text_chunks.extend(
                    self._extract_page_text(page, page_num, pdf_path, chunk_size))
                image_info.extend(
                    self._extract_page_images(doc, page, page_num, pdf_path))
            return text_chunks, image_info
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            raise
        finally:
            # Fix: previously the document handle leaked whenever an
            # exception was raised mid-processing.
            if doc is not None:
                doc.close()

    def _extract_page_text(self, page, page_num: int, pdf_path: str,
                           chunk_size: int) -> List[Dict]:
        """Extract and chunk the plain text of a single page."""
        text = page.get_text()
        if not text.strip():
            return []
        return [
            {
                'text': chunk,
                'page_number': page_num,
                'chunk_number': i,
                'file_path': pdf_path,
            }
            for i, chunk in enumerate(self._split_text(text, chunk_size), 1)
        ]

    def _extract_page_images(self, doc, page, page_num: int,
                             pdf_path: str) -> List[Dict]:
        """
        Extract images from one page using two methods: a full-page render
        analyzed for tables/diagrams, and the page's embedded images.
        """
        results: List[Dict] = []
        image_list = page.get_images()

        # Method 1: render the full page at 2x resolution so tables and
        # diagrams drawn with vector graphics (not embedded images) can be
        # detected by the analyzer.
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        temp_page_image = f"temp_page_{page_num}.png"
        try:
            with open(temp_page_image, "wb") as f:
                f.write(pix.tobytes("png"))
            page_analysis = self.image_analyzer.analyze_image(temp_page_image)
            if page_analysis['type'] in ("a table or spreadsheet", "a diagram or flowchart"):
                # Number the page capture after all embedded images so the
                # filename cannot collide with them.
                img_index = len(image_list) + 1
                image_path = self.image_dir / f"page_{page_num}_img_{img_index}.png"
                shutil.copy2(temp_page_image, image_path)
                extracted_text = self.image_analyzer.extract_text(str(image_path))
                results.append({
                    'page_number': page_num,
                    'image_number': img_index,
                    'path': str(image_path),
                    'width': pix.width,
                    'height': pix.height,
                    'type': page_analysis['type'],
                    'description': f"{page_analysis['type'].capitalize()} containing: {extracted_text[:200]}...",
                    'confidence': page_analysis['confidence'],
                    'extracted_text': extracted_text,
                    'file_path': pdf_path,
                })
        finally:
            # Fix: the temp file was previously left behind when analysis
            # raised, and cleanup used a bare `except:` that swallowed
            # everything (including KeyboardInterrupt).
            try:
                os.remove(temp_page_image)
            except OSError:
                pass

        # Method 2: the page's embedded raster images.
        for img_index, img in enumerate(image_list, 1):
            try:
                xref = img[0]  # first entry of the image tuple is the xref
                base_image = doc.extract_image(xref)
                image = Image.open(io.BytesIO(base_image["image"]))
                image_path = self.image_dir / f"page_{page_num}_img_{img_index}.png"
                image.save(image_path)

                analysis = self.image_analyzer.analyze_image(str(image_path))
                extracted_text = ""
                if analysis['type'] in ("a table or spreadsheet", "a graph or chart", "a diagram or flowchart"):
                    extracted_text = self.image_analyzer.extract_text(str(image_path))
                # Generate descriptive text based on the detected type.
                if analysis['type'] == "a table or spreadsheet":
                    analysis['description'] = f"Table containing: {extracted_text[:200]}..."
                elif analysis['type'] == "a graph or chart":
                    analysis['description'] = f"Graph/Chart with labels: {extracted_text[:200]}..."
                elif analysis['type'] == "a diagram or flowchart":
                    analysis['description'] = f"Diagram/Flowchart showing: {extracted_text[:200]}..."

                results.append({
                    'page_number': page_num,
                    'image_number': img_index,
                    'path': str(image_path),
                    'width': image.width,
                    'height': image.height,
                    'type': analysis['type'],
                    'description': analysis['description'],
                    'confidence': analysis['confidence'],
                    'extracted_text': extracted_text,
                    'file_path': pdf_path,
                })
            except Exception as e:
                # Best-effort per image: a single bad image must not abort
                # the whole page, so log and continue.
                logger.warning(f"Failed to process image {img_index} on page {page_num}: {e}")
        return results

    def _split_text(self, text: str, chunk_size: int) -> List[str]:
        """Split text into whitespace-delimited chunks of approximately
        ``chunk_size`` characters (never splitting inside a word)."""
        words = text.split()
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0
        for word in words:
            word_size = len(word)
            # Flush the current chunk before it would exceed chunk_size,
            # but never emit an empty chunk (a single oversized word is
            # kept whole).
            if current_size + word_size > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = word_size
            else:
                current_chunk.append(word)
                current_size += word_size + 1  # +1 for the joining space
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def save_text_chunks(self, chunks: List[Dict]) -> None:
        """Save each text chunk to ``output_dir`` as
        ``text_<page>_<chunk>.txt`` (UTF-8)."""
        for chunk in chunks:
            filename = f"text_{chunk['page_number']}_{chunk['chunk_number']}.txt"
            filepath = self.output_dir / filename
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(chunk['text'])
def main():
    """CLI entry point: process one PDF, save its text chunks, and print a
    summary of the extracted content."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description='Process a PDF file to extract text and images.')
    arg_parser.add_argument(
        '--input', required=True, help='Path to the PDF file to process')
    options = arg_parser.parse_args()

    pdf_processor = PDFProcessor()
    chunks, images = pdf_processor.process_pdf(options.input)
    pdf_processor.save_text_chunks(chunks)

    # Summary of what was extracted.
    print(f"\nProcessed {len(chunks)} text chunks")
    print(f"Extracted {len(images)} images")

    # Per-image analysis report; entries with empty descriptions are skipped.
    print("\nImage Analysis Results:")
    for entry in images:
        if not entry['description']:
            continue
        print(f"\nPage {entry['page_number']}, Image {entry['image_number']}:")
        print(f"Type: {entry['type']}")
        print(f"Description: {entry['description']}")
        print(f"Confidence: {entry['confidence']:.2f}")


if __name__ == "__main__":
    main()