Skip to content

Commit fb01aea

Browse files
authored
Merge pull request #44 from JustInternetAI/Grace
Grace
2 parents f472ea8 + 1779dd9 commit fb01aea

10 files changed

Lines changed: 43 additions & 58 deletions

File tree

src/ingest/ap_ingestor.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,17 @@ def fetch_full_text(self, article_url):
3232
try:
3333
with sync_playwright() as p:
3434
browser = p.chromium.launch(headless=True)
35-
page = browser.new_page()
35+
context = browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
36+
viewport={"width": 1280, "height": 800})
37+
page = context.new_page()
38+
39+
# browser = p.chromium.launch(headless=True)
40+
# page = browser.new_page()
3641

3742
# Replace article_url with its redirect target (unchanged if there is no redirect)
3843
article_url = self.resolve_google_news_redirect(article_url)
3944
#print(article_url)
40-
page.goto(article_url, wait_until="domcontentloaded", timeout=15000)
45+
page.goto(article_url, wait_until="domcontentloaded", timeout=30000)
4146

4247
# Wait for the main article body to load
4348
page.wait_for_selector('div.RichTextStoryBody', timeout=3000)
@@ -50,6 +55,6 @@ def fetch_full_text(self, article_url):
5055
return full_text.strip()
5156

5257
except Exception as e:
53-
#print(f"Playwright error fetching {article_url}: {e}")
58+
print(f"Playwright error fetching {article_url}: {e}")
5459
return ""
5560

src/ingest/base_ingestor.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,9 @@ def check_and_save_new_entries(self, using_celery=False):
4646
)
4747

4848
for entry in feed.entries:
49+
# print("\n--- ENTRY ---")
50+
# for k, v in entry.items():
51+
# print(f"{k}: {v}")
52+
# print(entry.link)
4953
formattedEntry = self.format_entry(entry)
5054
save_entry(formattedEntry, using_celery)
51-
52-
def check_no_save_new_entries(self):
53-
feed = feedparser.parse(self.RSS_URL)
54-
all_entries = []
55-
56-
for entry in feed.entries:
57-
all_entries.append(self.format_entry(entry))
58-
59-
return all_entries

src/ingest/bbc_ingestor.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@ def fetch_full_text(self, article_url):
1010
soup = BeautifulSoup(response.content, 'html.parser')
1111

1212
article = soup.find('article')
13+
14+
if not article:
15+
return None
1316

14-
if article:
15-
return(article.get_text())
17+
# # Remove unwanted sections like "related content", "media", or "byline"
18+
# for unwanted in article.select('[data-component="byline"], [data-component="media-block"], .bbc-1msyfg1, .bbc-1fxtbkn'): # classes may vary
19+
# unwanted.decompose()
20+
21+
# Gather all paragraphs that are part of the article body
22+
paragraphs = article.find_all('p')
23+
24+
cleaned_text = '\n\n'.join(p.get_text(strip=True) for p in paragraphs)
25+
return cleaned_text

src/ingest/cnn_ingestor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
class CNNIngestor(BaseIngestor):
66
RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss"
7+
# NOTE: this RSS feed appears to be stale — its entries are from 2023.
78

89
def fetch_full_text(self, url):
910
try:

src/ingest/save_to_database.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def save_entry(entry, using_celery):
1616

1717
# Don't save entries without body text
1818
if entry["full_text"] == "" or entry["full_text"] == None:
19+
print("Unable to fetch full text.")
1920
return
2021

2122
# Save the entry only if it has not already been saved
@@ -34,10 +35,13 @@ def save_entry(entry, using_celery):
3435
current_app.tasks[ner_task.name]
3536
#TODO: The following line can be used when connected to EC2 to actually use a GPU
3637
#ner_task.apply_async(args=[str(inserted_id)], queue='gpu')
38+
print("Checkpoint 1")
3739
ner_task.apply_async(args=[str(inserted_id)])
3840
except NotRegistered:
3941
# fallback to inline
42+
print("Checkpoint 2")
4043
ner_task(str(inserted_id))
4144
else:
4245
# Inline execution
46+
print("Checkpoint 3")
4347
ner_task(str(inserted_id))

src/justinsight/celery.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,31 +16,35 @@
1616
# "args": (),
1717
# },
1818

19-
# NOT SAVING ANYTHING? WHAT HAPPENED
19+
# Maybe we should just give up on getting AP to work...
2020
# "check-APfeed-every-5-minutes": {
2121
# "task": "justinsight.tasks.apLogger_task",
2222
# "schedule": 5.0,
2323
# "args": (),
2424
# },
2525

26-
# "check-BBCfeed-every-5-minutes": {
27-
# "task": "justinsight.tasks.bbcLogger_task",
28-
# "schedule": 5.0,
29-
# "args": (),
30-
# },
26+
# Checked and good
27+
"check-BBCfeed-every-5-minutes": {
28+
"task": "justinsight.tasks.bbcLogger_task",
29+
"schedule": 5.0,
30+
"args": (),
31+
},
3132

33+
# Checked and good
3234
# "check-CBSfeed-every-5-minutes": {
3335
# "task": "justinsight.tasks.cbsLogger_task",
3436
# "schedule": 5.0,
3537
# "args": (),
3638
# },
3739

40+
# Could not find a current RSS feed for CNN :(
3841
# "check-CNNfeed-every-5-minutes": {
3942
# "task": "justinsight.tasks.cnnLogger_task",
4043
# "schedule": 5.0,
4144
# "args": (),
4245
# },
4346

47+
# NOTE: review progress marker — feeds below this point have presumably not been checked yet (confirm)
4448
# "check-LATIMESfeed-every-5-minutes": {
4549
# "task": "justinsight.tasks.latimesLogger_task",
4650
# "schedule": 5.0,

src/justinsight/nlpthings.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

src/justinsight/streamlitapp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@
2828
columns_to_show = st.multiselect("Columns to display", options=df.columns.tolist(), default=df.columns.tolist())
2929
st.dataframe(df[columns_to_show])#, use_container_width=True)
3030

31-
# for i, row in df[columns_to_show].iterrows():
32-
# st.markdown(f"### Entry {i+1}")
33-
# st.write(row.to_dict())
31+
for i, row in df[columns_to_show].iterrows():
32+
st.markdown(f"### Entry {i+1}")
33+
st.write(row.to_dict())
3434

3535

3636

src/justinsight/tasks.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@
88
from ingest.npr_ingestor import NPRIngestor
99
from ingest.nyt_ingestor import NYTIngestor
1010
from ingest.usnews_ingestor import USNEWSIngestor
11-
from .nlpthings import dummy_addToEntryInDB
12-
from ingest.save_to_database import collection
1311
from nlp.ner_core import NERCore
14-
from bson import ObjectId
1512

1613
@shared_task
1714
def sample_task():
@@ -80,13 +77,6 @@ def usnewsLogger_task():
8077
ingestor.check_and_save_new_entries(using_celery=True) # this will invoke the inherited logic
8178
return "USNEWS RSS Feed checked."
8279

83-
84-
@shared_task
85-
def runNER_task(entry_id):
86-
print(f"New worker so we can use GPU on this entry id: {entry_id}")
87-
dummy_addToEntryInDB(entry_id)
88-
#Do GPU-dependent processing here
89-
9080
@shared_task
9181
def ner_task(article_id):
9282
# Process article with NER results

src/nlp/ner_core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
class NERCore(BaseCore):
66
def __init__(self):
7-
print("constructing NER Core instance")
7+
print(" NER Core instance constructing")
88
super().__init__(
99
task="ner",
1010
model_name="dslim/bert-base-NER",

0 commit comments

Comments
 (0)