From a495ff53e1cdeb400b7e937ba0dfc2d46b755ed5 Mon Sep 17 00:00:00 2001 From: raaj-love-to-code Date: Fri, 17 Jan 2025 15:55:03 +0530 Subject: [PATCH] Added an endpoint to download the file containing links for the page that contains the word preboot --- Dockerfile | 10 +++++----- Dockerfile .old | 29 +++++++++++++++++++++++++++++ app/gptcrawlercore.py | 16 +++++++++++++++- app/main.py | 21 +++++++++++++++++++-- app/worker.py | 5 +++-- docker-compose.yml | 16 +++++++++++----- 6 files changed, 82 insertions(+), 15 deletions(-) create mode 100644 Dockerfile .old diff --git a/Dockerfile b/Dockerfile index 6ea1142..f3b4a5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,18 +10,18 @@ COPY ./app/requirements.txt /app/ # Install Python dependencies RUN pip install --no-cache-dir -r /app/requirements.txt -# Copy the rest of the application files -COPY ./app /app - # Install Playwright and required dependencies RUN apt-get update && apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxcomposite1 libxrandr2 libgbm-dev libasound2 libpangocairo-1.0-0 libxdamage1 libpango-1.0-0 libgtk-3-0 libx11-xcb1 RUN pip install playwright && playwright install +# Copy the rest of the application files +COPY ./app . 
+ # Ensure outputs directory exists and is writable -RUN mkdir -p /app/outputs && chmod -R 777 /app/outputs +RUN mkdir -p ./outputs && chmod -R 777 ./outputs # Expose port 8000 for the FastAPI service EXPOSE 8000 # Run the FastAPI server using uvicorn -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile .old b/Dockerfile .old new file mode 100644 index 0000000..bd7938c --- /dev/null +++ b/Dockerfile .old @@ -0,0 +1,29 @@ +# Use an official Python runtime as a parent image +FROM python:3.9-slim + +# Set the working directory in the container +WORKDIR /app + +# Copy requirements.txt from the app directory +COPY ./app/requirements.txt /app/ + +# Install Python dependencies +RUN pip install --no-cache-dir -r /app/requirements.txt + +# Install Playwright and required dependencies +RUN apt-get update && apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxcomposite1 libxrandr2 libgbm-dev libasound2 libpangocairo-1.0-0 libxdamage1 libpango-1.0-0 libgtk-3-0 libx11-xcb1 +RUN pip install playwright && playwright install + +# Copy the rest of the application files +COPY ./app ./app + +# Ensure outputs directory exists and is writable +RUN mkdir -p /app/app/outputs && chmod -R 777 /app/app/outputs + +# Expose port 8000 for the FastAPI service +EXPOSE 8000 + +# CMD ["python3", "test.py"] + +# Run the FastAPI server using uvicorn +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/app/gptcrawlercore.py b/app/gptcrawlercore.py index 6208770..8960410 100644 --- a/app/gptcrawlercore.py +++ b/app/gptcrawlercore.py @@ -33,6 +33,7 @@ def __init__(self, start_url: str, max_pages: int = 100, concurrency: int = 5, j self.visited: Set[str] = set() self.to_visit: List[str] = [self.start_url] self.results: List[dict] = [] + self.contains_preboot_text: List[str] = [] self.retry_limit = 3 # Number of retries for failed pages 
self.retry_count = {} # Track retries per URL self.sem = asyncio.Semaphore(self.concurrency) # Semaphore for concurrency control @@ -259,6 +260,12 @@ async def crawl_page(self, context, url: str): title = await page.title() html_content = await self.get_page_html(page, "body") text_content = self.extract_text_from_html(html_content) + + # Store if contains preboot word + pattern = r'\bpre[-\s]?boot\w*\b' + if re.search(pattern, text_content, re.IGNORECASE): + self.contains_preboot_text.append(url) + self.results.append({ "title": title, "url": url, @@ -313,7 +320,7 @@ async def crawl(self): await browser.close() print("[INFO] Browser closed.") - def write_output(self, output_file: str): + def write_output(self, output_file: str, output_file_meta : str = None): """ Writes the crawl results to a JSON file. @@ -327,5 +334,12 @@ def write_output(self, output_file: str): with open(output_file, 'w', encoding='utf-8') as f: json.dump(self.results, f, ensure_ascii=False, indent=2) print(f"[INFO] Output written to {output_file}") + + if output_file_meta: + output_dir = os.path.dirname(output_file_meta) + os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists + with open(output_file_meta, 'w', encoding='utf-8') as f: + json.dump(self.contains_preboot_text, f, ensure_ascii=False, indent=2) + print(f"[INFO] Meta Output written to {output_file_meta}") except Exception as e: self.record_error(self.start_url, f"Failed to write output file: {e}") diff --git a/app/main.py b/app/main.py index 758c401..761e277 100644 --- a/app/main.py +++ b/app/main.py @@ -7,7 +7,7 @@ from rq import Queue import os import time -from app.worker import run_crawler +from worker import run_crawler import json import uuid # For generating unique job IDs from typing import List # Import List for type hinting @@ -169,7 +169,24 @@ def get_output(job_id: str): Returns: FileResponse: The JSON file containing crawl results. 
""" - output_file = f"app/outputs/{job_id}.json" + output_file = os.path.join("app", "outputs", f"{job_id}.json") if not os.path.exists(output_file): raise HTTPException(status_code=404, detail="Output file not found") return FileResponse(output_file, media_type='application/json', filename=f"{job_id}.json") + + +@app.get("/get-meta-output/{job_id}") +def get_output(job_id: str): + """ + Allows downloading the crawl results as a JSON file. + + Args: + job_id (str): The unique identifier of the crawl job. + + Returns: + FileResponse: The JSON file containing crawl results. + """ + output_file = os.path.join("app", "outputs", f"{job_id}_meta.json") + if not os.path.exists(output_file): + raise HTTPException(status_code=404, detail="Output file not found") + return FileResponse(output_file, media_type='application/json', filename=f"{job_id}_meta.json") diff --git a/app/worker.py b/app/worker.py index 8dc3dcd..f186009 100644 --- a/app/worker.py +++ b/app/worker.py @@ -3,7 +3,7 @@ import asyncio from redis import Redis import os -from app.gptcrawlercore import GPTCrawlerCore +from gptcrawlercore import GPTCrawlerCore import time # Initialize Redis connection @@ -47,9 +47,10 @@ def run_crawler(job_id: str, start_url: str, max_pages: int = 10): # Define output file path using unique job_id output_file = os.path.join("app", "outputs", f"{job_id}.json") + output_meta_file = os.path.join("app", "outputs", f"{job_id}_meta.json") # Write the crawling results to output file - crawler.write_output(output_file=output_file) + crawler.write_output(output_file=output_file, output_file_meta=output_meta_file) # Update job status to completed and set end_time redis_conn.hset(f"job:{job_id}", mapping={ diff --git a/docker-compose.yml b/docker-compose.yml index c560e4c..dd1a37c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,14 +6,16 @@ services: container_name: rediss networks: - app-network - expose: - - "6379" + ports: + - "6380:6379" volumes: - redis-data:/data - 
command: ["redis-server", "--requirepass", "your_secure_password"] + command: ["redis-server"] restart: unless-stopped healthcheck: test: ["CMD", "redis-cli", "ping"] + start_period: 59s + start_interval: 30s interval: 30s timeout: 10s retries: 3 @@ -21,7 +23,7 @@ services: fastapi: build: context: . - dockerfile: Dockerfile.prod # Use a production-specific Dockerfile + dockerfile: Dockerfile # Use a production-specific Dockerfile container_name: fastapi environment: - ENV=production @@ -46,11 +48,13 @@ services: interval: 30s timeout: 10s retries: 3 + start_period: 59s + start_interval: 30s worker: build: context: . - dockerfile: Dockerfile.prod + dockerfile: Dockerfile container_name: worker environment: - REDIS_URL=redis://rediss:6379/0 @@ -60,6 +64,8 @@ services: networks: - app-network restart: unless-stopped + volumes: + - ./app/outputs:/app/app/outputs/ deploy: resources: limits: