File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -2,6 +2,7 @@ celerybeat-schedule.*
22src /celerybeat-schedule. *
33feed * .json
44.DS_Store
5+ .venv
56
67# Ignore all __pycache__ directories at any level
78** /__pycache__ /
@@ -12,4 +13,10 @@ feed*.json
1213* .pyd
1314
1415# Ignore saved data
15- data /*
16+ data /*
17+
18+ # Packaging
19+ * .egg
20+ * .egg-info /
21+ dist /
22+ build /
Original file line number Diff line number Diff line change @@ -5,7 +5,8 @@ services:
55 build :
66 context : .
77 dockerfile : .docker/Dockerfile
8-
8+ environment :
9+ - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
910 # bind-mount your repo and the shared EBS volume
1011 volumes :
1112 - ./:/workspace:cached
@@ -26,6 +27,8 @@ services:
2627 build :
2728 context : .
2829 dockerfile : .docker/Dockerfile
30+ environment :
31+ - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
2932 volumes :
3033 - .:/workspace:cached
3134 - mongo_data:/data/db
@@ -39,6 +42,8 @@ services:
3942 build :
4043 context : .
4144 dockerfile : .docker/Dockerfile
45+ environment :
46+ - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
4247 volumes :
4348 - .:/workspace:cached
4449 depends_on :
Original file line number Diff line number Diff line change 1+ [build-system ]
2+ requires = [" setuptools" , " wheel" ]
3+ build-backend = " setuptools.build_meta"
4+
5+ [project ]
6+ name = " justinsight"
7+ version = " 0.1.0"
8+ description = " Your project description"
9+ authors = [{ name = " Grace Madison" }]
10+ dependencies = []
11+
12+ [tool .setuptools .packages .find ]
13+ where = [" src" ]
14+
15+ [tool .pytest .ini_options ]
16+ minversion = " 6.0"
17+ addopts = " -ra -q"
18+ testpaths = [" tests" ]
19+ pythonpath = [" src" ]
Original file line number Diff line number Diff line change @@ -45,4 +45,12 @@ Note: When running in background you can use "docker logs <container_name_or_id>
4545 " task" : " justinsight.tasks.your_task_name" ,
4646 " schedule" : x, # where x is the number of seconds between when the task should happen
4747 " args" : (), # potential arguments for your task
48- },
48+ },
49+
50+
51+ ## How to check what's in the database
52+ Please run: docker compose up -d
53+ Then: docker exec -it mongo mongosh -u myuser -p mypassword
54+ Then: use justinsightdb
55+ Then: db.articles.find().pretty()
56+ Note: you may need to install mongosh for this to work. To exit the mongosh environment, just run 'exit'. Remember to run 'docker compose down' afterwards, as the containers will otherwise keep running in the background.
Original file line number Diff line number Diff line change 22import feedparser
33import hashlib
44import re
5- from src . ingest .save_to_database import save_entry
5+ from ingest .save_to_database import save_entry
66
77class BaseIngestor :
88 RSS_URL = None #will be set by the subclasses
99
10- def slugify (self , text ):
11- # Convert title to a filesystem-friendly slug
12- text = text .lower ()
13- text = re .sub (r'[^a-z0-9]+' , '-' , text )
14- return text .strip ('-' )
15-
1610 def format_date (self , entry ):
1711 # Extract and format the published date
1812 try :
@@ -31,8 +25,6 @@ def generate_entry_hash(self, entry):
3125 return hashlib .sha256 (hash_input .encode ('utf-8' )).hexdigest ()
3226
3327 def format_entry (self , entry ):
34- title_slug = self .slugify (entry .title )
35- date_str = self .format_date (entry )
3628 full_text = self .fetch_full_text (entry .link )
3729
3830 data = {
Original file line number Diff line number Diff line change 11from bs4 import BeautifulSoup
22import requests
3- from src . ingest .base_ingestor import BaseIngestor
3+ from ingest .base_ingestor import BaseIngestor
44
55class BBCIngestor (BaseIngestor ):
66 RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml"
77
88 def fetch_full_text (self , article_url ):
9- response = requests .get (self . RSS_URL )
9+ response = requests .get (article_url )
1010 soup = BeautifulSoup (response .content , 'html.parser' )
1111
1212 article = soup .find ('article' )
Original file line number Diff line number Diff line change 1+ from bs4 import BeautifulSoup
2+ import requests
3+ from ingest .base_ingestor import BaseIngestor
4+
class CNNIngestor(BaseIngestor):
    """Ingestor for the CNN World RSS feed.

    Sets the feed URL and provides the CNN-specific scraper that turns an
    article page into plain text; everything else comes from BaseIngestor.
    """

    RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss"

    def fetch_full_text(self, url):
        """Download a CNN article page and return its body text.

        Args:
            url: Address of the article page to scrape.

        Returns:
            The article's paragraph texts joined by newlines, or an empty
            string when the request fails, the response status is an error,
            or no article body container is found in the page.
        """
        try:
            # Browser-like User-Agent; presumably CNN rejects the default
            # client string -- TODO confirm.
            headers = {
                "User-Agent": "Mozilla/5.0"
            }
            # The timeout keeps a stalled connection from blocking the caller.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # CNN article content is usually within <section id="body-text">
            # or <div class="article__content">; try them in that order.
            article_section = (
                soup.find('section', id='body-text')
                or soup.find('div', class_='article__content')
            )

            if not article_section:
                print("No CNN article body found.")
                return ""

            # Prefer CNN's dedicated paragraph containers; fall back to
            # plain <p> tags when none are present.
            paragraphs = (
                article_section.find_all('div', class_='paragraph')
                or article_section.find_all('p')
            )

            # Join with a bare newline so later paragraphs do not start
            # with a stray space.
            full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)

            return full_text.strip()

        except Exception as e:
            # Best effort: a single unreachable or malformed article must
            # not abort the whole feed run, so report it and return no text.
            print(f"Error fetching CNN article: {e}")
            return ""
Original file line number Diff line number Diff line change 1+ import os
12from pymongo import MongoClient
23
4+ # Default to local MongoDB when not using docker
5+ mongodb_uri = os .getenv ("MONGODB_URI" , "mongodb://localhost:27017" )
6+
7+ #mongodb_uri = os.getenv("mongodb://localhost:27017", "mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin")
8+
39# Include username, password, and authentication database
4- client = MongoClient ("mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin" )
10+ client = MongoClient (mongodb_uri )
511
612# Get (or create) a database
713db = client ["justinsightdb" ]
@@ -14,5 +20,4 @@ def save_entry(entry):
1420 entry_hash = entry ["id" ]
1521 if collection .count_documents ({"id" : entry_hash }) == 0 :
1622 collection .insert_one (entry )
17- else :
18- print (f"{ entry ['title' ]} already in use!" )
23+ print (f"I have now saved: { entry ['title' ]} " )
Original file line number Diff line number Diff line change 2222 "args" : (),
2323 },
2424
25- # "check-NYTfeed -every-5-minutes": {
26- # "task": "justinsight.tasks.nytLogger_task ",
27- # "schedule": 300 .0,
28- # "args": (),
29- # },
25+ "check-CNNfeed -every-5-minutes" : {
26+ "task" : "justinsight.tasks.cnnLogger_task " ,
27+ "schedule" : 5 .0 ,
28+ "args" : (),
29+ },
3030
3131 #schedule more tasks here
3232}
Original file line number Diff line number Diff line change 1- # tasks.py
21from celery import shared_task
32from ingest .bbc_ingestor import BBCIngestor
3+ from ingest .cnn_ingestor import CNNIngestor
44
55
66@shared_task
@@ -16,8 +16,9 @@ def bbcLogger_task():
1616 return "BBC RSS Feed checked."
1717
1818@shared_task
19- def nytLogger_task ():
20- check_and_save_nyt ()
21- return "NYT RSS Feed checked."
19+ def cnnLogger_task ():
20+ ingestor = CNNIngestor ()
21+ ingestor .check_and_save_new_entries () # this will invoke the inherited logic
22+ return "CNN RSS Feed checked."
2223
2324#Add more tasks here in the format of the one above
You can’t perform that action at this time.
0 commit comments