From 10ea09fc31c7b29dc0076fdcc5be8432e69347ea Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Tue, 25 Mar 2025 22:54:02 -0400 Subject: [PATCH 1/7] switched to gemini-flash model. in set_current_index moved the checking for the index first to avoid caching from causing unauthorized error on pinecone --- backend/mainService/src/llm/Pinecone.py | 4 ++-- backend/mainService/src/llm/chat_llm/Gemini_llm.py | 2 +- .../src/scraper/site_specific/async_frontier_scraper.py | 2 +- backend/mainService/src/services/citation_service.py | 9 +++++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/backend/mainService/src/llm/Pinecone.py b/backend/mainService/src/llm/Pinecone.py index 2ca06d6..7b7b50b 100644 --- a/backend/mainService/src/llm/Pinecone.py +++ b/backend/mainService/src/llm/Pinecone.py @@ -186,14 +186,14 @@ async def set_current_index( :param index_name: Name of the index to set as current """ + if not await self._pc.has_index(index_name): + return False if not self._current_index_name == index_name and self._current_index: await self._current_index.close() elif self._current_index_name == index_name: return True if not index_host: - if not await self._pc.has_index(index_name): - return False index_model = await self._pc.describe_index(index_name) self._current_index_host = index_model.host else: diff --git a/backend/mainService/src/llm/chat_llm/Gemini_llm.py b/backend/mainService/src/llm/chat_llm/Gemini_llm.py index 0ce0453..9fabc7a 100644 --- a/backend/mainService/src/llm/chat_llm/Gemini_llm.py +++ b/backend/mainService/src/llm/chat_llm/Gemini_llm.py @@ -11,7 +11,7 @@ class Genai_cite: - model = "gemini-2.0-pro-exp-02-05" + model = "gemini-2.0-flash" def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"), llm_model: str = f'models/{model}'): diff --git a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py index 8ede8ad..06a88a9 100644 --- a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py +++ b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py @@ -38,7 +38,7 @@ async def _get_download_link(self, url: str) -> Optional[str]: try: page = await self.context.new_page() if not url.endswith("pdf"): - await page.goto(url, wait_until='networkidle') + await page.goto(url, wait_until='networkidle', timeout=self.element_timeout) await self._interact_with_dropdown(page) download_link = await self._extract_download_link(page) else: diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py index 324734d..77ed236 100644 --- a/backend/mainService/src/services/citation_service.py +++ b/backend/mainService/src/services/citation_service.py @@ -230,10 +230,11 @@ async def _process_documents( try: cleaned_result = search_results["cleaned_result"] - download_results = await self.scraper.get_pdfs( - target_urls=cleaned_result.get("links"), - storage_path=search_results["search_key"] - ) + async with asyncio.timeout(15): # 15 second timeout + download_results = await self.scraper.get_pdfs( + target_urls=cleaned_result.get("links"), + storage_path=search_results["search_key"] + ) return await self._prepare_document_batches( download_results, From c98080ba0222ec5ae661084a9b75a615b260f954 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Tue, 25 Mar 2025 23:20:17 -0400 Subject: [PATCH 2/7] pinecone script to delete indexes every 30 minutes , to be adjusted later --- backend/mainService/Dockerfile | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile index 08ab309..4d6e9a9 100644 --- a/backend/mainService/Dockerfile +++ b/backend/mainService/Dockerfile @@ -7,6 +7,7 @@ WORKDIR /app # Removes the package lists downloaded during the update to reduce the image size. RUN apt-get update && apt-get install -y \ build-essential \ + cron \ && rm -rf /var/lib/apt/lists/* # Set the PATH environment variable to include /app @@ -30,9 +31,19 @@ RUN mkdir -p /app/config # Install playwright RUN playwright install && playwright install-deps +# Make the cleanup script executable +RUN chmod +x /app/scripts/run_cleanup.sh + +# Add crontab file +COPY ./scripts/cleanup-crontab /etc/cron.d/cleanup-cron +RUN chmod 0644 /etc/cron.d/cleanup-cron +RUN crontab /etc/cron.d/cleanup-cron + +# Create log directory +RUN mkdir -p /var/log + # Expose the port the app runs on EXPOSE 8000 - -# Command to run the application -CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +# Start both cron and the FastAPI application +CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"] \ No newline at end of file From f9a60b989930ea0d9fe2c1877ca1283d1f5d87ba Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Tue, 25 Mar 2025 23:20:33 -0400 Subject: [PATCH 3/7] pinecone script to delete indexes every 30 minutes , to be adjusted later --- backend/mainService/scripts/cleanup-crontab | 1 + backend/mainService/scripts/run_cleanup.sh | 7 +++++++ 2 files changed, 8 insertions(+) create mode 100644 backend/mainService/scripts/cleanup-crontab create mode 100644 backend/mainService/scripts/run_cleanup.sh diff --git a/backend/mainService/scripts/cleanup-crontab b/backend/mainService/scripts/cleanup-crontab new file mode 100644 index 0000000..6e9955f --- /dev/null +++ b/backend/mainService/scripts/cleanup-crontab @@ -0,0 +1 @@ +*/30 * * * * /app/scripts/run_cleanup.sh >> /var/log/cleanup.log 2>&1 \ No newline at end of file diff --git a/backend/mainService/scripts/run_cleanup.sh b/backend/mainService/scripts/run_cleanup.sh new file mode 100644 index 0000000..5daea3a --- /dev/null +++ b/backend/mainService/scripts/run_cleanup.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Navigate to the script directory +cd "$(dirname "$0")/.." + +# Run the cleanup script +python scripts/delete_stale_data.py \ No newline at end of file From 4a82f672cf81c873f3c1d9f8a85b848ee48e073b Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Wed, 26 Mar 2025 08:31:44 -0400 Subject: [PATCH 4/7] updated docker file to include cron task --- backend/mainService/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile index 4d6e9a9..d0d63e0 100644 --- a/backend/mainService/Dockerfile +++ b/backend/mainService/Dockerfile @@ -37,7 +37,7 @@ RUN chmod +x /app/scripts/run_cleanup.sh # Add crontab file COPY ./scripts/cleanup-crontab /etc/cron.d/cleanup-cron RUN chmod 0644 /etc/cron.d/cleanup-cron -RUN crontab /etc/cron.d/cleanup-cron +RUN echo "" >> /etc/cron.d/cleanup-cron && crontab /etc/cron.d/cleanup-cron # Create log directory RUN mkdir -p /var/log From 462146db822ff5f33403c767c4c7cb2a73f32ce7 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Wed, 26 Mar 2025 14:11:17 -0400 Subject: [PATCH 5/7] always generate a good search key no longer conditional --- backend/mainService/src/llm/chat_llm/Groq_llm.py | 9 +++++++-- backend/mainService/src/services/citation_service.py | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py index 4d8c967..a2a6370 100644 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py @@ -19,7 +19,7 @@ def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"), self.client = Groq(api_key=self.api_key) self.llm_model = llm_model - def getKeywordSearchTerm(self, document: str) -> Optional[str]: + def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str: """ Generate a search term from the provided document using LLM. @@ -46,12 +46,17 @@ def getKeywordSearchTerm(self, document: str) -> Optional[str]: # Make API call with error handling + if proposed_title: + document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}" + else: + document = f"Here is the content: {document}" + completion = self.client.chat.completions.create( model=self.llm_model, messages=[ { "role": "user", - "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'. Here is the content: {document}" + "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}" }, ], temperature=0.9, diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py index 77ed236..f9a3aa3 100644 --- a/backend/mainService/src/services/citation_service.py +++ b/backend/mainService/src/services/citation_service.py @@ -130,8 +130,7 @@ async def process_citation(self, """ try: # Step 0: Generate index name - title = (self.summarize_llm.getKeywordSearchTerm(content) - if title.lower() == "untitled" else title) + title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title) index_name = self._generate_index_name(title) logger.info(f"index_name = {index_name}") if await self.PC.set_current_index(index_name): From 223214dd3f1432b647fda835b49ed10065ecc4cc Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Wed, 26 Mar 2025 14:25:12 -0400 Subject: [PATCH 6/7] clean up --- .gitignore | 1 + backend/mainService/Dockerfile | 11 -- backend/mainService/scripts/cleanup-crontab | 1 - .../mainService/scripts/delete_stale_data.py | 125 ------------------ backend/mainService/scripts/run_cleanup.sh | 7 - backend/mainService/src/config/config.py | 2 +- .../src/services/author_reputation.py | 9 +- .../metricsService/src/utils/api_config.py | 1 + 8 files changed, 8 insertions(+), 149 deletions(-) delete mode 100644 backend/mainService/scripts/cleanup-crontab delete mode 100644 backend/mainService/scripts/delete_stale_data.py delete mode 100644 backend/mainService/scripts/run_cleanup.sh diff --git a/.gitignore b/.gitignore index 75c1545..1fbafed 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,4 @@ unit_test.py testing_workflow.py *.yaml +scripts/ diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile index d0d63e0..084364c 100644 --- a/backend/mainService/Dockerfile +++ b/backend/mainService/Dockerfile @@ -31,17 +31,6 @@ RUN mkdir -p /app/config # Install playwright RUN playwright install && playwright install-deps -# Make the cleanup script executable -RUN chmod +x /app/scripts/run_cleanup.sh - -# Add crontab file -COPY ./scripts/cleanup-crontab /etc/cron.d/cleanup-cron -RUN chmod 0644 /etc/cron.d/cleanup-cron -RUN echo "" >> /etc/cron.d/cleanup-cron && crontab /etc/cron.d/cleanup-cron - -# Create log directory -RUN mkdir -p /var/log - # Expose the port the app runs on EXPOSE 8000 diff --git a/backend/mainService/scripts/cleanup-crontab b/backend/mainService/scripts/cleanup-crontab deleted file mode 100644 index 6e9955f..0000000 --- a/backend/mainService/scripts/cleanup-crontab +++ /dev/null @@ -1 +0,0 @@ -*/30 * * * * /app/scripts/run_cleanup.sh >> /var/log/cleanup.log 2>&1 \ No newline at end of file diff --git a/backend/mainService/scripts/delete_stale_data.py b/backend/mainService/scripts/delete_stale_data.py deleted file mode 100644 index fb4786e..0000000 --- a/backend/mainService/scripts/delete_stale_data.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import json -import asyncio -from datetime import datetime, timezone, timedelta -from typing import Dict, List, Tuple -from pinecone import PineconeAsyncio as Pinecone -from collections import defaultdict -from src.config.log_config import setup_logging - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - -# Initialize Pinecone with your API key and environment - - -INDEX_DICT_FILE = 'index_dict.json' -THRESHOLD_HOURS = 2 # Delete indexes older than 2 hours - - -def load_index_dict() -> Dict[str, List[str]]: - """Load the index dictionary from the JSON file. - Returns: - Dict[str, List[str]]: Dictionary mapping hourly timestamps to lists of index names. - Returns defaultdict with empty list as default if file doesn't exist. - """ - if os.path.exists(INDEX_DICT_FILE): - with open(INDEX_DICT_FILE, 'r') as f: - return defaultdict(list, json.load(f)) - return defaultdict(list) - - -def save_index_dict(index_dict: Dict[str, List[str]]) -> None: - """Save the index dictionary to the JSON file. - - Args: - index_dict (Dict[str, List[str]]): Dictionary mapping timestamps to lists of index names. - """ - with open(INDEX_DICT_FILE, 'w') as f: - json.dump(dict(index_dict), f) # Convert defaultdict to regular dict for JSON serialization - - -async def delete_index(index_name: str, pc:Pinecone) -> Tuple[str, bool, str]: - """Delete a single Pinecone index. - - Args: - index_name (str): Name of the index to delete - - Returns: - Tuple[str, bool, str]: Tuple containing: - - Index name - - Boolean indicating success/failure - - Error message if failure, empty string if success - """ - - try: - await pc.delete_index(index_name) - return index_name, True, "" - except Exception as e: - return index_name, False, str(e) - - -async def delete_old_indexes(threshold_hours: int = THRESHOLD_HOURS) -> None: - """Asynchronously delete Pinecone indexes older than the threshold. - - This function loads the index dictionary, identifies indexes from timestamps older - than the threshold, and deletes them concurrently using asyncio.gather. - - Args: - threshold_hours (int, optional): Age threshold in hours. Defaults to THRESHOLD_HOURS. - """ - API_KEY = os.getenv("PINECONE_API_KEY") - pc = Pinecone(api_key=API_KEY) - - index_dict = load_index_dict() - now = datetime.now(timezone.utc) - updated_dict = defaultdict(list) - indexes_to_delete = [] - - # Process each timestamp and its indexes - for timestamp_str, index_list in index_dict.items(): - try: - creation_time = datetime.strptime(timestamp_str, "%Y-%m-%d %H").replace(tzinfo=timezone.utc) - print("creeation_time:", creation_time) - if now - creation_time >= timedelta(minutes=threshold_hours): - # Add all indexes from this timestamp to deletion list - indexes_to_delete.extend(index_list) - else: - # Keep indexes from recent timestamps - updated_dict[timestamp_str] = index_list - except ValueError as e: - logger.exception(f"Error parsing timestamp '{timestamp_str}': {e}") - # Keep entries with invalid timestamps for manual review - updated_dict[timestamp_str] = index_list - - if indexes_to_delete: - # Execute deletions concurrently - results = await asyncio.gather( - *[delete_index(index_name, pc=pc) for index_name in indexes_to_delete], - return_exceptions=True - ) - # Process results - for result in results: - if isinstance(result, Exception): - logger.error(f"Unexpected error during deletion: {result}") - continue - - index_name, success, error = result - if success: - logger.info(f"Successfully deleted index '{index_name}'") - else: - print(f"Failed to delete index '{index_name}': {error}") - # For failed deletions, keep them in their original timestamp bucket - # This requires finding the original timestamp - for timestamp_str, indexes in index_dict.items(): - if index_name in indexes: - updated_dict[timestamp_str].append(index_name) - break - - save_index_dict(updated_dict) - print("Deletion job complete. Remaining indexes by timestamp:", - {ts: indexes for ts, indexes in updated_dict.items() if indexes}) - await pc.close() - -if __name__ == "__main__": - asyncio.run(delete_old_indexes()) \ No newline at end of file diff --git a/backend/mainService/scripts/run_cleanup.sh b/backend/mainService/scripts/run_cleanup.sh deleted file mode 100644 index 5daea3a..0000000 --- a/backend/mainService/scripts/run_cleanup.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# Navigate to the script directory -cd "$(dirname "$0")/.." - -# Run the cleanup script -python scripts/delete_stale_data.py \ No newline at end of file diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py index c70b91d..7ca0a07 100644 --- a/backend/mainService/src/config/config.py +++ b/backend/mainService/src/config/config.py @@ -32,7 +32,7 @@ class ScraperConfig: """ This is the timeout duration for the requests made to the web scraper """ - TIMEOUT_DURATION: int = 8000 + TIMEOUT_DURATION: int = 10000 def __post_init__(self): if self.MAX_FILE_SIZE <= 0: diff --git a/backend/metricsService/src/services/author_reputation.py b/backend/metricsService/src/services/author_reputation.py index fea9b71..5ca821a 100644 --- a/backend/metricsService/src/services/author_reputation.py +++ b/backend/metricsService/src/services/author_reputation.py @@ -37,7 +37,8 @@ from ..utils.api_config import ( ORCID_API, SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, - OPEN_ALEX_AUTHOR_API + OPEN_ALEX_AUTHOR_API, + DEFAULT_TIMEOUT ) from ..utils.api_utils import rate_limit from ..utils.logging_config import get_logger @@ -64,7 +65,7 @@ async def get_authorship_reputation(author_id: Optional[str] = None, author_name orcid_response = requests.get( f"{ORCID_API}{author_id}/works", headers={"Accept": "application/json"}, - timeout=15 + timeout=DEFAULT_TIMEOUT ) if orcid_response.status_code == 200: orcid_data = orcid_response.json() @@ -119,7 +120,7 @@ async def get_openalex_author_reputation(author_name: str): """Fetch author reputation from OpenAlex using the authors endpoint.""" await rate_limit() try: - response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=10) + response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=DEFAULT_TIMEOUT) if response.status_code == 200: data = response.json() if data.get("results"): @@ -138,7 +139,7 @@ async def get_semantic_scholar_author_reputation(author_name: str): await rate_limit() try: params = {"query": author_name, "fields": "hIndex,paperCount", "limit": 1} - response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=10) + response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=DEFAULT_TIMEOUT) if response.status_code == 200: data = response.json() if data.get("data") and len(data["data"]) > 0: diff --git a/backend/metricsService/src/utils/api_config.py b/backend/metricsService/src/utils/api_config.py index 1267b11..8b6c9c6 100644 --- a/backend/metricsService/src/utils/api_config.py +++ b/backend/metricsService/src/utils/api_config.py @@ -34,3 +34,4 @@ OPEN_CITATIONS_API = "https://opencitations.net/index/api/v1/" MAX_CONCURRENT_WORKERS = 20 DEFAULT_CONCURRENT_WORKERS = 10 +DEFAULT_TIMEOUT = 10 From 88cb97be208f2d910080a1f874a343730a3c9f29 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Wed, 26 Mar 2025 14:27:51 -0400 Subject: [PATCH 7/7] add pypdf as part of requirements --- backend/mainService/requirements.txt | 2 ++ backend/metricsService/requirements.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt index 3ea9a61..a5189a8 100644 --- a/backend/mainService/requirements.txt +++ b/backend/mainService/requirements.txt @@ -25,4 +25,6 @@ google-genai redis>=4.2.0 uvicorn httpx>=0.28.1 +pypdf +pypdf2 diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt index 183bb12..3d56cd0 100644 --- a/backend/metricsService/requirements.txt +++ b/backend/metricsService/requirements.txt @@ -7,3 +7,4 @@ python-dotenv==1.0.1 Requests==2.32.3 scholarly==1.7.11 uvicorn +