Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,18 @@ COPY ./app/requirements.txt /app/
# Install Python dependencies
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy the rest of the application files
COPY ./app /app

# Install Playwright and required dependencies
RUN apt-get update && apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxcomposite1 libxrandr2 libgbm-dev libasound2 libpangocairo-1.0-0 libxdamage1 libpango-1.0-0 libgtk-3-0 libx11-xcb1
RUN pip install playwright && playwright install

# Copy the rest of the application files
COPY ./app .

# Ensure outputs directory exists and is writable
RUN mkdir -p /app/outputs && chmod -R 777 /app/outputs
RUN mkdir -p ./outputs && chmod -R 777 ./outputs

# Expose port 8000 for the FastAPI service
EXPOSE 8000

# Run the FastAPI server using uvicorn
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
29 changes: 29 additions & 0 deletions Dockerfile .old
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy requirements.txt first so dependency install is cached independently
# of application-code changes
COPY ./app/requirements.txt /app/

# Install Python dependencies
RUN pip install --no-cache-dir -r /app/requirements.txt

# Install Playwright's OS-level dependencies; skip recommended packages and
# clean the apt lists in the same layer to keep the image small
RUN apt-get update && apt-get install -y --no-install-recommends \
        libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
        libxcomposite1 libxrandr2 libgbm-dev libasound2 \
        libpangocairo-1.0-0 libxdamage1 libpango-1.0-0 libgtk-3-0 \
        libx11-xcb1 \
    && rm -rf /var/lib/apt/lists/*
RUN pip install playwright && playwright install

# Copy the rest of the application files
COPY ./app ./app

# Ensure outputs directory exists and is writable
RUN mkdir -p /app/app/outputs && chmod -R 777 /app/app/outputs

# Expose port 8000 for the FastAPI service
EXPOSE 8000

# Run the FastAPI server using uvicorn
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
16 changes: 15 additions & 1 deletion app/gptcrawlercore.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(self, start_url: str, max_pages: int = 100, concurrency: int = 5, j
self.visited: Set[str] = set()
self.to_visit: List[str] = [self.start_url]
self.results: List[dict] = []
self.contains_preboot_text: List[str] = []
self.retry_limit = 3 # Number of retries for failed pages
self.retry_count = {} # Track retries per URL
self.sem = asyncio.Semaphore(self.concurrency) # Semaphore for concurrency control
Expand Down Expand Up @@ -259,6 +260,12 @@ async def crawl_page(self, context, url: str):
title = await page.title()
html_content = await self.get_page_html(page, "body")
text_content = self.extract_text_from_html(html_content)

# Store if contains preboot word
pattern = r'\bpre[-\s]?boot\w*\b'
if re.search(pattern, text_content, re.IGNORECASE):
self.contains_preboot_text.append(url)

self.results.append({
"title": title,
"url": url,
Expand Down Expand Up @@ -313,7 +320,7 @@ async def crawl(self):
await browser.close()
print("[INFO] Browser closed.")

def write_output(self, output_file: str):
def write_output(self, output_file: str, output_file_meta : str = None):
"""
Writes the crawl results to a JSON file.

Expand All @@ -327,5 +334,12 @@ def write_output(self, output_file: str):
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(self.results, f, ensure_ascii=False, indent=2)
print(f"[INFO] Output written to {output_file}")

if output_file_meta:
output_dir = os.path.dirname(output_file_meta)
os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists
with open(output_file_meta, 'w', encoding='utf-8') as f:
json.dump(self.contains_preboot_text, f, ensure_ascii=False, indent=2)
print(f"[INFO] Meta Output written to {output_file_meta}")
except Exception as e:
self.record_error(self.start_url, f"Failed to write output file: {e}")
21 changes: 19 additions & 2 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rq import Queue
import os
import time
from app.worker import run_crawler
from worker import run_crawler
import json
import uuid # For generating unique job IDs
from typing import List # Import List for type hinting
Expand Down Expand Up @@ -169,7 +169,24 @@ def get_output(job_id: str):
Returns:
FileResponse: The JSON file containing crawl results.
"""
output_file = f"app/outputs/{job_id}.json"
output_file = os.path.join("app", "outputs", f"{job_id}.json")
if not os.path.exists(output_file):
raise HTTPException(status_code=404, detail="Output file not found")
return FileResponse(output_file, media_type='application/json', filename=f"{job_id}.json")


@app.get("/get-meta-output/{job_id}")
def get_meta_output(job_id: str):
    """
    Allows downloading the crawl meta output (the list of URLs whose page
    text matched the pre-boot pattern) as a JSON file.

    Args:
        job_id (str): The unique identifier of the crawl job.

    Returns:
        FileResponse: The JSON file containing the matched-URL list.

    Raises:
        HTTPException: 404 if the meta output file does not exist.
    """
    # Renamed from `get_output`: the previous name shadowed the /get-output
    # handler defined earlier in this module. The route path is unchanged,
    # so HTTP callers are unaffected.
    output_file = os.path.join("app", "outputs", f"{job_id}_meta.json")
    if not os.path.exists(output_file):
        raise HTTPException(status_code=404, detail="Output file not found")
    return FileResponse(output_file, media_type='application/json', filename=f"{job_id}_meta.json")
5 changes: 3 additions & 2 deletions app/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import asyncio
from redis import Redis
import os
from app.gptcrawlercore import GPTCrawlerCore
from gptcrawlercore import GPTCrawlerCore
import time

# Initialize Redis connection
Expand Down Expand Up @@ -47,9 +47,10 @@ def run_crawler(job_id: str, start_url: str, max_pages: int = 10):

# Define output file path using unique job_id
output_file = os.path.join("app", "outputs", f"{job_id}.json")
output_meta_file = os.path.join("app", "outputs", f"{job_id}_meta.json")

# Write the crawling results to output file
crawler.write_output(output_file=output_file)
crawler.write_output(output_file=output_file, output_file_meta=output_meta_file)

# Update job status to completed and set end_time
redis_conn.hset(f"job:{job_id}", mapping={
Expand Down
16 changes: 11 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,24 @@ services:
container_name: rediss
networks:
- app-network
expose:
- "6379"
ports:
- "6380:6379"
volumes:
- redis-data:/data
command: ["redis-server", "--requirepass", "your_secure_password"]
command: ["redis-server"]
restart: unless-stopped
healthcheck:
test: ["CMD", "redis-cli", "ping"]
start_period: 59s
start_interval: 30s
interval: 30s
timeout: 10s
retries: 3

fastapi:
build:
context: .
dockerfile: Dockerfile.prod # Use a production-specific Dockerfile
dockerfile: Dockerfile # Shared Dockerfile used for both the fastapi and worker services
container_name: fastapi
environment:
- ENV=production
Expand All @@ -46,11 +48,13 @@ services:
interval: 30s
timeout: 10s
retries: 3
start_period: 59s
start_interval: 30s

worker:
build:
context: .
dockerfile: Dockerfile.prod
dockerfile: Dockerfile
container_name: worker
environment:
- REDIS_URL=redis://rediss:6379/0
Expand All @@ -60,6 +64,8 @@ services:
networks:
- app-network
restart: unless-stopped
volumes:
- ./app/outputs:/app/app/outputs/
deploy:
resources:
limits:
Expand Down