From a495ff53e1cdeb400b7e937ba0dfc2d46b755ed5 Mon Sep 17 00:00:00 2001 From: raaj-love-to-code Date: Fri, 17 Jan 2025 15:55:03 +0530 Subject: [PATCH] Added an endpoint to download the file containing links for the page that contains the word preboot --- Dockerfile | 10 +++++----- Dockerfile .old | 29 +++++++++++++++++++++++++++++ app/gptcrawlercore.py | 16 +++++++++++++++- app/main.py | 21 +++++++++++++++++++-- app/worker.py | 5 +++-- docker-compose.yml | 16 +++++++++++----- 6 files changed, 82 insertions(+), 15 deletions(-) create mode 100644 Dockerfile .old diff --git a/Dockerfile b/Dockerfile index 6ea1142..f3b4a5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,18 +10,18 @@ COPY ./app/requirements.txt /app/ # Install Python dependencies RUN pip install --no-cache-dir -r /app/requirements.txt -# Copy the rest of the application files -COPY ./app /app - # Install Playwright and required dependencies RUN apt-get update && apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxcomposite1 libxrandr2 libgbm-dev libasound2 libpangocairo-1.0-0 libxdamage1 libpango-1.0-0 libgtk-3-0 libx11-xcb1 RUN pip install playwright && playwright install +# Copy the rest of the application files +COPY ./app . 
+ # Ensure outputs directory exists and is writable -RUN mkdir -p /app/outputs && chmod -R 777 /app/outputs +RUN mkdir -p ./outputs && chmod -R 777 ./outputs # Expose port 8000 for the FastAPI service EXPOSE 8000 # Run the FastAPI server using uvicorn -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile .old b/Dockerfile .old new file mode 100644 index 0000000..bd7938c --- /dev/null +++ b/Dockerfile .old @@ -0,0 +1,29 @@ +# Use an official Python runtime as a parent image +FROM python:3.9-slim + +# Set the working directory in the container +WORKDIR /app + +# Copy requirements.txt from the app directory +COPY ./app/requirements.txt /app/ + +# Install Python dependencies +RUN pip install --no-cache-dir -r /app/requirements.txt + +# Install Playwright and required dependencies +RUN apt-get update && apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxcomposite1 libxrandr2 libgbm-dev libasound2 libpangocairo-1.0-0 libxdamage1 libpango-1.0-0 libgtk-3-0 libx11-xcb1 +RUN pip install playwright && playwright install + +# Copy the rest of the application files +COPY ./app ./app + +# Ensure outputs directory exists and is writable +RUN mkdir -p /app/app/outputs && chmod -R 777 /app/app/outputs + +# Expose port 8000 for the FastAPI service +EXPOSE 8000 + +# CMD ["python3", "test.py"] + +# Run the FastAPI server using uvicorn +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/app/gptcrawlercore.py b/app/gptcrawlercore.py index 6208770..8960410 100644 --- a/app/gptcrawlercore.py +++ b/app/gptcrawlercore.py @@ -33,6 +33,7 @@ def __init__(self, start_url: str, max_pages: int = 100, concurrency: int = 5, j self.visited: Set[str] = set() self.to_visit: List[str] = [self.start_url] self.results: List[dict] = [] + self.contains_preboot_text: List[str] = [] self.retry_limit = 3 # Number of retries for failed pages 
self.retry_count = {} # Track retries per URL self.sem = asyncio.Semaphore(self.concurrency) # Semaphore for concurrency control @@ -259,6 +260,12 @@ async def crawl_page(self, context, url: str): title = await page.title() html_content = await self.get_page_html(page, "body") text_content = self.extract_text_from_html(html_content) + + # Store if contains preboot word + pattern = r'\bpre[-\s]?boot\w*\b' + if re.search(pattern, text_content, re.IGNORECASE): + self.contains_preboot_text.append(url) + self.results.append({ "title": title, "url": url, @@ -313,7 +320,7 @@ async def crawl(self): await browser.close() print("[INFO] Browser closed.") - def write_output(self, output_file: str): + def write_output(self, output_file: str, output_file_meta : str = None): """ Writes the crawl results to a JSON file. @@ -327,5 +334,12 @@ def write_output(self, output_file: str): with open(output_file, 'w', encoding='utf-8') as f: json.dump(self.results, f, ensure_ascii=False, indent=2) print(f"[INFO] Output written to {output_file}") + + if output_file_meta: + output_dir = os.path.dirname(output_file_meta) + os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists + with open(output_file_meta, 'w', encoding='utf-8') as f: + json.dump(self.contains_preboot_text, f, ensure_ascii=False, indent=2) + print(f"[INFO] Meta Output written to {output_file_meta}") except Exception as e: self.record_error(self.start_url, f"Failed to write output file: {e}") diff --git a/app/main.py b/app/main.py index 758c401..761e277 100644 --- a/app/main.py +++ b/app/main.py @@ -7,7 +7,7 @@ from rq import Queue import os import time -from app.worker import run_crawler +from worker import run_crawler import json import uuid # For generating unique job IDs from typing import List # Import List for type hinting @@ -169,7 +169,24 @@ def get_output(job_id: str): Returns: FileResponse: The JSON file containing crawl results. 
""" - output_file = f"app/outputs/{job_id}.json" + output_file = os.path.join("app", "outputs", f"{job_id}.json") if not os.path.exists(output_file): raise HTTPException(status_code=404, detail="Output file not found") return FileResponse(output_file, media_type='application/json', filename=f"{job_id}.json") + + +@app.get("/get-meta-output/{job_id}") +def get_output(job_id: str): + """ + Allows downloading the crawl results as a JSON file. + + Args: + job_id (str): The unique identifier of the crawl job. + + Returns: + FileResponse: The JSON file containing crawl results. + """ + output_file = os.path.join("app", "outputs", f"{job_id}_meta.json") + if not os.path.exists(output_file): + raise HTTPException(status_code=404, detail="Output file not found") + return FileResponse(output_file, media_type='application/json', filename=f"{job_id}_meta.json") diff --git a/app/worker.py b/app/worker.py index 8dc3dcd..f186009 100644 --- a/app/worker.py +++ b/app/worker.py @@ -3,7 +3,7 @@ import asyncio from redis import Redis import os -from app.gptcrawlercore import GPTCrawlerCore +from gptcrawlercore import GPTCrawlerCore import time # Initialize Redis connection @@ -47,9 +47,10 @@ def run_crawler(job_id: str, start_url: str, max_pages: int = 10): # Define output file path using unique job_id output_file = os.path.join("app", "outputs", f"{job_id}.json") + output_meta_file = os.path.join("app", "outputs", f"{job_id}_meta.json") # Write the crawling results to output file - crawler.write_output(output_file=output_file) + crawler.write_output(output_file=output_file, output_file_meta=output_meta_file) # Update job status to completed and set end_time redis_conn.hset(f"job:{job_id}", mapping={ diff --git a/docker-compose.yml b/docker-compose.yml index c560e4c..dd1a37c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,14 +6,16 @@ services: container_name: rediss networks: - app-network - expose: - - "6379" + ports: + - "6380:6379" volumes: - redis-data:/data - 
command: ["redis-server", "--requirepass", "your_secure_password"] + command: ["redis-server"] restart: unless-stopped healthcheck: test: ["CMD", "redis-cli", "ping"] + start_period: 59s + start_interval: 30s interval: 30s timeout: 10s retries: 3 @@ -21,7 +23,7 @@ services: fastapi: build: context: . - dockerfile: Dockerfile.prod # Use a production-specific Dockerfile + dockerfile: Dockerfile # Use a production-specific Dockerfile container_name: fastapi environment: - ENV=production @@ -46,11 +48,13 @@ services: interval: 30s timeout: 10s retries: 3 + start_period: 59s + start_interval: 30s worker: build: context: . - dockerfile: Dockerfile.prod + dockerfile: Dockerfile container_name: worker environment: - REDIS_URL=redis://rediss:6379/0 @@ -60,6 +64,8 @@ services: networks: - app-network restart: unless-stopped + volumes: + - ./app/outputs:/app/app/outputs/ deploy: resources: limits: