Merge pull request #44 from JustInternetAI/Grace

graceannmad · web-flow · commit fb01aea23a55 · 2025-08-11T09:43:48.000-07:00
Grace
diff --git a/src/ingest/ap_ingestor.py b/src/ingest/ap_ingestor.py
@@ -32,12 +32,17 @@ def fetch_full_text(self, article_url):
         try:
             with sync_playwright() as p:
                 browser = p.chromium.launch(headless=True)
-                page = browser.new_page()
+                context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
+                          viewport={"width": 1280, "height": 800})
+                page = context.new_page()
+
+                # browser = p.chromium.launch(headless=True)
+                # page = browser.new_page()
 
                 #change the article_url to the redirected one (or just the same)
                 article_url = self.resolve_google_news_redirect(article_url)
                 #print(article_url)
-                page.goto(article_url, wait_until="domcontentloaded", timeout=15000)
+                page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
 
                 # Wait for the main article body to load
                 page.wait_for_selector('div.RichTextStoryBody', timeout=3000)
@@ -50,6 +55,6 @@ def fetch_full_text(self, article_url):
                 return full_text.strip()
 
         except Exception as e:
-            #print(f"Playwright error fetching {article_url}: {e}")
+            print(f"Playwright error fetching {article_url}: {e}")
             return ""
     
diff --git a/src/ingest/base_ingestor.py b/src/ingest/base_ingestor.py
@@ -46,14 +46,9 @@ def check_and_save_new_entries(self, using_celery=False):
         )
 
         for entry in feed.entries:
+            # print("\n--- ENTRY ---")
+            # for k, v in entry.items():
+            #     print(f"{k}: {v}")
+            # print(entry.link)
             formattedEntry = self.format_entry(entry)
             save_entry(formattedEntry, using_celery)
-
-    def check_no_save_new_entries(self):
-        feed = feedparser.parse(self.RSS_URL)
-        all_entries = []
-        
-        for entry in feed.entries:
-            all_entries.append(self.format_entry(entry))
-        
-        return all_entries
diff --git a/src/ingest/bbc_ingestor.py b/src/ingest/bbc_ingestor.py
@@ -10,6 +10,16 @@ def fetch_full_text(self, article_url):
         soup = BeautifulSoup(response.content, 'html.parser')
 
         article = soup.find('article')
+        
+        if not article:
+            return None
 
-        if article:
-            return(article.get_text())
+        # # Remove unwanted sections like "related content", "media", or "byline"
+        # for unwanted in article.select('[data-component="byline"], [data-component="media-block"], .bbc-1msyfg1, .bbc-1fxtbkn'):  # classes may vary
+        #     unwanted.decompose()
+
+        # Gather all paragraphs that are part of the article body
+        paragraphs = article.find_all('p')
+
+        cleaned_text = '\n\n'.join(p.get_text(strip=True) for p in paragraphs)
+        return cleaned_text
diff --git a/src/ingest/cnn_ingestor.py b/src/ingest/cnn_ingestor.py
@@ -4,6 +4,7 @@
 
 class CNNIngestor(BaseIngestor):
     RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss"
+    # NOTE: this RSS feed appears to be stale -- its entries date from 2023.
         
     def fetch_full_text(self, url):
         try:
diff --git a/src/ingest/save_to_database.py b/src/ingest/save_to_database.py
@@ -16,6 +16,7 @@ def save_entry(entry, using_celery):
 
     #dont save entries without body text
     if entry["full_text"] == "" or entry["full_text"] == None:
+        print("Unable to fetch full text.")
         return
 
     #check if the entry has already been saved and if it has not then save it
@@ -34,10 +35,13 @@ def save_entry(entry, using_celery):
                 current_app.tasks[ner_task.name]
                 #TODO: The following line can be used when connected to EC2 to actually use a GPU
                 #ner_task.apply_async(args=[str(inserted_id)], queue='gpu')
+                print("Checkpoint 1")
                 ner_task.apply_async(args=[str(inserted_id)])
             except NotRegistered:
                 # fallback to inline
+                print("Checkpoint 2")
                 ner_task(str(inserted_id))
         else:
             # Inline execution
+            print("Checkpoint 3")
             ner_task(str(inserted_id))
diff --git a/src/justinsight/celery.py b/src/justinsight/celery.py
@@ -16,31 +16,35 @@
     #     "args": (),
     # },
 
-    # NOT SAVING ANYTHING? WHAT HAPPENED
+    # AP feed disabled: ingestion is still not working; consider dropping AP if it cannot be fixed.
     # "check-APfeed-every-5-minutes": {
     #     "task": "justinsight.tasks.apLogger_task",
     #     "schedule": 5.0,
     #     "args": (),
     # },
 
-    # "check-BBCfeed-every-5-minutes": {
-    #     "task": "justinsight.tasks.bbcLogger_task",
-    #     "schedule": 5.0,
-    #     "args": (),
-    # },
+    # Checked and good
+    "check-BBCfeed-every-5-minutes": {
+        "task": "justinsight.tasks.bbcLogger_task",
+        "schedule": 5.0,
+        "args": (),
+    },
 
+    # Checked and good
     # "check-CBSfeed-every-5-minutes": {
     #     "task": "justinsight.tasks.cbsLogger_task",
     #     "schedule": 5.0,
     #     "args": (),
     # },
 
+    # Disabled: could not find a current RSS feed for CNN.
     # "check-CNNfeed-every-5-minutes": {
     #     "task": "justinsight.tasks.cnnLogger_task",
     #     "schedule": 5.0,
     #     "args": (),
     # },
 
+    # TODO: review progress marker -- resume checking the feeds from here.
     # "check-LATIMESfeed-every-5-minutes": {
     #     "task": "justinsight.tasks.latimesLogger_task",
     #     "schedule": 5.0,
diff --git a/src/justinsight/nlpthings.py b/src/justinsight/nlpthings.py
diff --git a/src/justinsight/streamlitapp.py b/src/justinsight/streamlitapp.py
@@ -28,9 +28,9 @@
 columns_to_show = st.multiselect("Columns to display", options=df.columns.tolist(), default=df.columns.tolist())
 st.dataframe(df[columns_to_show])#, use_container_width=True)
 
-# for i, row in df[columns_to_show].iterrows():
-#     st.markdown(f"### Entry {i+1}")
-#     st.write(row.to_dict())
+for i, row in df[columns_to_show].iterrows():
+    st.markdown(f"### Entry {i+1}")
+    st.write(row.to_dict())
 
 
 
diff --git a/src/justinsight/tasks.py b/src/justinsight/tasks.py
@@ -8,10 +8,7 @@
 from ingest.npr_ingestor import NPRIngestor
 from ingest.nyt_ingestor import NYTIngestor
 from ingest.usnews_ingestor import USNEWSIngestor
-from .nlpthings import dummy_addToEntryInDB
-from ingest.save_to_database import collection
 from nlp.ner_core import NERCore
-from bson import ObjectId
 
 @shared_task
 def sample_task():
@@ -80,13 +77,6 @@ def usnewsLogger_task():
     ingestor.check_and_save_new_entries(using_celery=True)  # this will invoke the inherited logic
     return "USNEWS RSS Feed checked."
 
-
-@shared_task
-def runNER_task(entry_id):
-    print(f"New worker so we can use GPU on this entry id: {entry_id}")
-    dummy_addToEntryInDB(entry_id)
-    #Do GPU-dependent processing here
-
 @shared_task
 def ner_task(article_id):
     # Process article with NER results
diff --git a/src/nlp/ner_core.py b/src/nlp/ner_core.py
@@ -4,7 +4,7 @@
 
 class NERCore(BaseCore):
     def __init__(self):
-        print("constructing NER Core instance")
+        print(" NER Core instance constructing")
         super().__init__(
             task="ner",
             model_name="dslim/bert-base-NER",