From 10ea09fc31c7b29dc0076fdcc5be8432e69347ea Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Tue, 25 Mar 2025 22:54:02 -0400
Subject: [PATCH 1/7] switched to gemini-flash model. in set_current_index
 moved the checking for the index first to avoid caching from causing
 unauthorized error on pinecone

---
 backend/mainService/src/llm/Pinecone.py                  | 4 ++--
 backend/mainService/src/llm/chat_llm/Gemini_llm.py       | 2 +-
 .../src/scraper/site_specific/async_frontier_scraper.py  | 2 +-
 backend/mainService/src/services/citation_service.py     | 9 +++++----
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/backend/mainService/src/llm/Pinecone.py b/backend/mainService/src/llm/Pinecone.py
index 2ca06d6..7b7b50b 100644
--- a/backend/mainService/src/llm/Pinecone.py
+++ b/backend/mainService/src/llm/Pinecone.py
@@ -186,14 +186,14 @@ async def set_current_index(
 
         :param index_name: Name of the index to set as current
         """
+        if not await self._pc.has_index(index_name):
+                return False
         if not self._current_index_name == index_name and self._current_index:
             await self._current_index.close()
         elif self._current_index_name == index_name:
             return True
 
         if not index_host:
-            if not await self._pc.has_index(index_name):
-                return False
             index_model = await self._pc.describe_index(index_name)
             self._current_index_host = index_model.host
         else:
diff --git a/backend/mainService/src/llm/chat_llm/Gemini_llm.py b/backend/mainService/src/llm/chat_llm/Gemini_llm.py
index 0ce0453..9fabc7a 100644
--- a/backend/mainService/src/llm/chat_llm/Gemini_llm.py
+++ b/backend/mainService/src/llm/chat_llm/Gemini_llm.py
@@ -11,7 +11,7 @@
 
 
 class Genai_cite:
-    model = "gemini-2.0-pro-exp-02-05"
+    model = "gemini-2.0-flash"
 
     def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"),
                  llm_model: str = f'models/{model}'):
diff --git a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
index 8ede8ad..06a88a9 100644
--- a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
+++ b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py
@@ -38,7 +38,7 @@ async def _get_download_link(self, url: str) -> Optional[str]:
         try:
             page = await self.context.new_page()
             if not url.endswith("pdf"):
-                await page.goto(url, wait_until='networkidle')
+                await page.goto(url, wait_until='networkidle', timeout=self.element_timeout)
                 await self._interact_with_dropdown(page)
                 download_link = await self._extract_download_link(page)
             else:
diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
index 324734d..77ed236 100644
--- a/backend/mainService/src/services/citation_service.py
+++ b/backend/mainService/src/services/citation_service.py
@@ -230,10 +230,11 @@ async def _process_documents(
 
         try:
             cleaned_result = search_results["cleaned_result"]
-            download_results = await self.scraper.get_pdfs(
-                target_urls=cleaned_result.get("links"),
-                storage_path=search_results["search_key"]
-            )
+            async with asyncio.timeout(15):  # 15 second timeout
+                download_results = await self.scraper.get_pdfs(
+                    target_urls=cleaned_result.get("links"),
+                    storage_path=search_results["search_key"]
+                )
 
             return await self._prepare_document_batches(
                 download_results,

From c98080ba0222ec5ae661084a9b75a615b260f954 Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Tue, 25 Mar 2025 23:20:17 -0400
Subject: [PATCH 2/7] pinecone script to delete indexes every 30 minutes , to
 be adjusted later

---
 backend/mainService/Dockerfile | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile
index 08ab309..4d6e9a9 100644
--- a/backend/mainService/Dockerfile
+++ b/backend/mainService/Dockerfile
@@ -7,6 +7,7 @@ WORKDIR /app
 # Removes the package lists downloaded during the update to reduce the image size.
 RUN apt-get update && apt-get install -y \
     build-essential \
+    cron \
     && rm -rf /var/lib/apt/lists/*
 
 # Set the PATH environment variable to include /app
@@ -30,9 +31,19 @@ RUN mkdir -p /app/config
 # Install playwright
 RUN playwright install && playwright install-deps 
 
+# Make the cleanup script executable
+RUN chmod +x /app/scripts/run_cleanup.sh
+
+# Add crontab file
+COPY ./scripts/cleanup-crontab /etc/cron.d/cleanup-cron
+RUN chmod 0644 /etc/cron.d/cleanup-cron
+RUN crontab /etc/cron.d/cleanup-cron
+
+# Create log directory
+RUN mkdir -p /var/log
+
 # Expose the port the app runs on
 EXPOSE 8000
 
-
-# Command to run the application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] 
\ No newline at end of file
+# Start both cron and the FastAPI application
+CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"] 
\ No newline at end of file

From f9a60b989930ea0d9fe2c1877ca1283d1f5d87ba Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Tue, 25 Mar 2025 23:20:33 -0400
Subject: [PATCH 3/7] pinecone script to delete indexes every 30 minutes , to
 be adjusted later

---
 backend/mainService/scripts/cleanup-crontab | 1 +
 backend/mainService/scripts/run_cleanup.sh  | 7 +++++++
 2 files changed, 8 insertions(+)
 create mode 100644 backend/mainService/scripts/cleanup-crontab
 create mode 100644 backend/mainService/scripts/run_cleanup.sh

diff --git a/backend/mainService/scripts/cleanup-crontab b/backend/mainService/scripts/cleanup-crontab
new file mode 100644
index 0000000..6e9955f
--- /dev/null
+++ b/backend/mainService/scripts/cleanup-crontab
@@ -0,0 +1 @@
+*/30 * * * * /app/scripts/run_cleanup.sh >> /var/log/cleanup.log 2>&1 
\ No newline at end of file
diff --git a/backend/mainService/scripts/run_cleanup.sh b/backend/mainService/scripts/run_cleanup.sh
new file mode 100644
index 0000000..5daea3a
--- /dev/null
+++ b/backend/mainService/scripts/run_cleanup.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Navigate to the script directory
+cd "$(dirname "$0")/.."
+
+# Run the cleanup script
+python scripts/delete_stale_data.py 
\ No newline at end of file

From 4a82f672cf81c873f3c1d9f8a85b848ee48e073b Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Wed, 26 Mar 2025 08:31:44 -0400
Subject: [PATCH 4/7] updated docker file to include cron task

---
 backend/mainService/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile
index 4d6e9a9..d0d63e0 100644
--- a/backend/mainService/Dockerfile
+++ b/backend/mainService/Dockerfile
@@ -37,7 +37,7 @@ RUN chmod +x /app/scripts/run_cleanup.sh
 # Add crontab file
 COPY ./scripts/cleanup-crontab /etc/cron.d/cleanup-cron
 RUN chmod 0644 /etc/cron.d/cleanup-cron
-RUN crontab /etc/cron.d/cleanup-cron
+RUN echo "" >> /etc/cron.d/cleanup-cron && crontab /etc/cron.d/cleanup-cron
 
 # Create log directory
 RUN mkdir -p /var/log

From 462146db822ff5f33403c767c4c7cb2a73f32ce7 Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Wed, 26 Mar 2025 14:11:17 -0400
Subject: [PATCH 5/7] always generate a good search key no longer conditional

---
 backend/mainService/src/llm/chat_llm/Groq_llm.py     | 9 +++++++--
 backend/mainService/src/services/citation_service.py | 3 +--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py
index 4d8c967..a2a6370 100644
--- a/backend/mainService/src/llm/chat_llm/Groq_llm.py
+++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py
@@ -19,7 +19,7 @@ def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"),
         self.client = Groq(api_key=self.api_key)
         self.llm_model = llm_model
 
-    def getKeywordSearchTerm(self, document: str) -> Optional[str]:
+    def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str:
         """
         Generate a search term from the provided document using LLM.
 
@@ -46,12 +46,17 @@ def getKeywordSearchTerm(self, document: str) -> Optional[str]:
 
             # Make API call with error handling
 
+            if proposed_title:
+                document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}"
+            else:
+                document = f"Here is the content: {document}"
+
             completion = self.client.chat.completions.create(
                 model=self.llm_model,
                 messages=[
                     {
                         "role": "user",
-                        "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'. Here is the content: {document}"
+                        "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}"
                     },
                 ],
                 temperature=0.9,
diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
index 77ed236..f9a3aa3 100644
--- a/backend/mainService/src/services/citation_service.py
+++ b/backend/mainService/src/services/citation_service.py
@@ -130,8 +130,7 @@ async def process_citation(self,
         """
         try:
             # Step 0: Generate index name
-            title = (self.summarize_llm.getKeywordSearchTerm(content)
-                     if title.lower() == "untitled" else title)
+            title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title)
             index_name = self._generate_index_name(title)
             logger.info(f"index_name = {index_name}")
             if await self.PC.set_current_index(index_name):

From 223214dd3f1432b647fda835b49ed10065ecc4cc Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Wed, 26 Mar 2025 14:25:12 -0400
Subject: [PATCH 6/7] clean up

---
 .gitignore                                    |   1 +
 backend/mainService/Dockerfile                |  11 --
 backend/mainService/scripts/cleanup-crontab   |   1 -
 .../mainService/scripts/delete_stale_data.py  | 125 ------------------
 backend/mainService/scripts/run_cleanup.sh    |   7 -
 backend/mainService/src/config/config.py      |   2 +-
 .../src/services/author_reputation.py         |   9 +-
 .../metricsService/src/utils/api_config.py    |   1 +
 8 files changed, 8 insertions(+), 149 deletions(-)
 delete mode 100644 backend/mainService/scripts/cleanup-crontab
 delete mode 100644 backend/mainService/scripts/delete_stale_data.py
 delete mode 100644 backend/mainService/scripts/run_cleanup.sh

diff --git a/.gitignore b/.gitignore
index 75c1545..1fbafed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,4 @@ unit_test.py
 testing_workflow.py
 *.yaml
 
+scripts/
diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile
index d0d63e0..084364c 100644
--- a/backend/mainService/Dockerfile
+++ b/backend/mainService/Dockerfile
@@ -31,17 +31,6 @@ RUN mkdir -p /app/config
 # Install playwright
 RUN playwright install && playwright install-deps 
 
-# Make the cleanup script executable
-RUN chmod +x /app/scripts/run_cleanup.sh
-
-# Add crontab file
-COPY ./scripts/cleanup-crontab /etc/cron.d/cleanup-cron
-RUN chmod 0644 /etc/cron.d/cleanup-cron
-RUN echo "" >> /etc/cron.d/cleanup-cron && crontab /etc/cron.d/cleanup-cron
-
-# Create log directory
-RUN mkdir -p /var/log
-
 # Expose the port the app runs on
 EXPOSE 8000
 
diff --git a/backend/mainService/scripts/cleanup-crontab b/backend/mainService/scripts/cleanup-crontab
deleted file mode 100644
index 6e9955f..0000000
--- a/backend/mainService/scripts/cleanup-crontab
+++ /dev/null
@@ -1 +0,0 @@
-*/30 * * * * /app/scripts/run_cleanup.sh >> /var/log/cleanup.log 2>&1 
\ No newline at end of file
diff --git a/backend/mainService/scripts/delete_stale_data.py b/backend/mainService/scripts/delete_stale_data.py
deleted file mode 100644
index fb4786e..0000000
--- a/backend/mainService/scripts/delete_stale_data.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import os
-import json
-import asyncio
-from datetime import datetime, timezone, timedelta
-from typing import Dict, List, Tuple
-from pinecone import PineconeAsyncio as Pinecone
-from collections import defaultdict
-from src.config.log_config import setup_logging
-
-log_filename = os.path.basename(__file__)
-logger = setup_logging(filename=log_filename)
-
-# Initialize Pinecone with your API key and environment
-
-
-INDEX_DICT_FILE = 'index_dict.json'
-THRESHOLD_HOURS = 2  # Delete indexes older than 2 hours
-
-
-def load_index_dict() -> Dict[str, List[str]]:
-    """Load the index dictionary from the JSON file.    
-    Returns:
-        Dict[str, List[str]]: Dictionary mapping hourly timestamps to lists of index names.
-        Returns defaultdict with empty list as default if file doesn't exist.
-    """
-    if os.path.exists(INDEX_DICT_FILE):
-        with open(INDEX_DICT_FILE, 'r') as f:
-            return defaultdict(list, json.load(f))
-    return defaultdict(list)
-
-
-def save_index_dict(index_dict: Dict[str, List[str]]) -> None:
-    """Save the index dictionary to the JSON file.
-    
-    Args:
-        index_dict (Dict[str, List[str]]): Dictionary mapping timestamps to lists of index names.
-    """
-    with open(INDEX_DICT_FILE, 'w') as f:
-        json.dump(dict(index_dict), f)  # Convert defaultdict to regular dict for JSON serialization
-
-
-async def delete_index(index_name: str, pc:Pinecone) -> Tuple[str, bool, str]:
-    """Delete a single Pinecone index.
-    
-    Args:
-        index_name (str): Name of the index to delete
-        
-    Returns:
-        Tuple[str, bool, str]: Tuple containing:
-            - Index name
-            - Boolean indicating success/failure
-            - Error message if failure, empty string if success
-    """
-    
-    try:
-        await pc.delete_index(index_name)
-        return index_name, True, ""
-    except Exception as e:
-        return index_name, False, str(e)
-
-
-async def delete_old_indexes(threshold_hours: int = THRESHOLD_HOURS) -> None:
-    """Asynchronously delete Pinecone indexes older than the threshold.
-    
-    This function loads the index dictionary, identifies indexes from timestamps older
-    than the threshold, and deletes them concurrently using asyncio.gather.
-    
-    Args:
-        threshold_hours (int, optional): Age threshold in hours. Defaults to THRESHOLD_HOURS.
-    """
-    API_KEY = os.getenv("PINECONE_API_KEY")
-    pc = Pinecone(api_key=API_KEY)
-    
-    index_dict = load_index_dict()
-    now = datetime.now(timezone.utc)
-    updated_dict = defaultdict(list)
-    indexes_to_delete = []
-    
-    # Process each timestamp and its indexes
-    for timestamp_str, index_list in index_dict.items():
-        try:
-            creation_time = datetime.strptime(timestamp_str, "%Y-%m-%d %H").replace(tzinfo=timezone.utc)
-            print("creeation_time:", creation_time)
-            if now - creation_time >= timedelta(minutes=threshold_hours):
-                # Add all indexes from this timestamp to deletion list
-                indexes_to_delete.extend(index_list)
-            else:
-                # Keep indexes from recent timestamps
-                updated_dict[timestamp_str] = index_list
-        except ValueError as e:
-            logger.exception(f"Error parsing timestamp '{timestamp_str}': {e}")
-            # Keep entries with invalid timestamps for manual review
-            updated_dict[timestamp_str] = index_list
-    
-    if indexes_to_delete:
-        # Execute deletions concurrently
-        results = await asyncio.gather(
-            *[delete_index(index_name, pc=pc) for index_name in indexes_to_delete],
-            return_exceptions=True
-        )        
-        # Process results
-        for result in results:
-            if isinstance(result, Exception):
-                logger.error(f"Unexpected error during deletion: {result}")
-                continue
-                
-            index_name, success, error = result
-            if success:
-                logger.info(f"Successfully deleted index '{index_name}'")
-            else:
-                print(f"Failed to delete index '{index_name}': {error}")
-                # For failed deletions, keep them in their original timestamp bucket
-                # This requires finding the original timestamp
-                for timestamp_str, indexes in index_dict.items():
-                    if index_name in indexes:
-                        updated_dict[timestamp_str].append(index_name)
-                        break
-    
-    save_index_dict(updated_dict)
-    print("Deletion job complete. Remaining indexes by timestamp:", 
-          {ts: indexes for ts, indexes in updated_dict.items() if indexes})
-    await pc.close()
-
-if __name__ == "__main__":
-    asyncio.run(delete_old_indexes())
\ No newline at end of file
diff --git a/backend/mainService/scripts/run_cleanup.sh b/backend/mainService/scripts/run_cleanup.sh
deleted file mode 100644
index 5daea3a..0000000
--- a/backend/mainService/scripts/run_cleanup.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-# Navigate to the script directory
-cd "$(dirname "$0")/.."
-
-# Run the cleanup script
-python scripts/delete_stale_data.py 
\ No newline at end of file
diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py
index c70b91d..7ca0a07 100644
--- a/backend/mainService/src/config/config.py
+++ b/backend/mainService/src/config/config.py
@@ -32,7 +32,7 @@ class ScraperConfig:
     """
     This is the timeout duration for the requests made to the web scraper
     """
-    TIMEOUT_DURATION: int = 8000
+    TIMEOUT_DURATION: int = 10000
 
     def __post_init__(self):
         if self.MAX_FILE_SIZE <= 0:
diff --git a/backend/metricsService/src/services/author_reputation.py b/backend/metricsService/src/services/author_reputation.py
index fea9b71..5ca821a 100644
--- a/backend/metricsService/src/services/author_reputation.py
+++ b/backend/metricsService/src/services/author_reputation.py
@@ -37,7 +37,8 @@
 from ..utils.api_config import (
     ORCID_API,
     SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API,
-    OPEN_ALEX_AUTHOR_API
+    OPEN_ALEX_AUTHOR_API,
+    DEFAULT_TIMEOUT
 )
 from ..utils.api_utils import rate_limit
 from ..utils.logging_config import get_logger
@@ -64,7 +65,7 @@ async def get_authorship_reputation(author_id: Optional[str] = None, author_name
             orcid_response = requests.get(
                 f"{ORCID_API}{author_id}/works",
                 headers={"Accept": "application/json"},
-                timeout=15
+                timeout=DEFAULT_TIMEOUT
             )
             if orcid_response.status_code == 200:
                 orcid_data = orcid_response.json()
@@ -119,7 +120,7 @@ async def get_openalex_author_reputation(author_name: str):
     """Fetch author reputation from OpenAlex using the authors endpoint."""
     await rate_limit()
     try:
-        response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=10)
+        response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=DEFAULT_TIMEOUT)
         if response.status_code == 200:
             data = response.json()
             if data.get("results"):
@@ -138,7 +139,7 @@ async def get_semantic_scholar_author_reputation(author_name: str):
     await rate_limit()
     try:
         params = {"query": author_name, "fields": "hIndex,paperCount", "limit": 1}
-        response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=10)
+        response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=DEFAULT_TIMEOUT)
         if response.status_code == 200:
             data = response.json()
             if data.get("data") and len(data["data"]) > 0:
diff --git a/backend/metricsService/src/utils/api_config.py b/backend/metricsService/src/utils/api_config.py
index 1267b11..8b6c9c6 100644
--- a/backend/metricsService/src/utils/api_config.py
+++ b/backend/metricsService/src/utils/api_config.py
@@ -34,3 +34,4 @@
 OPEN_CITATIONS_API = "https://opencitations.net/index/api/v1/"
 MAX_CONCURRENT_WORKERS = 20
 DEFAULT_CONCURRENT_WORKERS = 10
+DEFAULT_TIMEOUT = 10

From 88cb97be208f2d910080a1f874a343730a3c9f29 Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele <oladelesamson156@gmail.com>
Date: Wed, 26 Mar 2025 14:27:51 -0400
Subject: [PATCH 7/7] add pypdf as part of requirements

---
 backend/mainService/requirements.txt    | 2 ++
 backend/metricsService/requirements.txt | 1 +
 2 files changed, 3 insertions(+)

diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt
index 3ea9a61..a5189a8 100644
--- a/backend/mainService/requirements.txt
+++ b/backend/mainService/requirements.txt
@@ -25,4 +25,6 @@ google-genai
 redis>=4.2.0
 uvicorn
 httpx>=0.28.1
+pypdf
+pypdf2
 
diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt
index 183bb12..3d56cd0 100644
--- a/backend/metricsService/requirements.txt
+++ b/backend/metricsService/requirements.txt
@@ -7,3 +7,4 @@ python-dotenv==1.0.1
 Requests==2.32.3
 scholarly==1.7.11
 uvicorn
+