Skip to content

Commit ad4a94f

Browse files
authored
Merge pull request #29 from JustInternetAI/Development
Development
2 parents 1b35cc3 + a708dca commit ad4a94f

11 files changed

Lines changed: 117 additions & 28 deletions

File tree

.gitignore

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ celerybeat-schedule.*
22
src/celerybeat-schedule.*
33
feed*.json
44
.DS_Store
5+
.venv
56

67
# Ignore all __pycache__ directories at any level
78
**/__pycache__/
@@ -12,4 +13,10 @@ feed*.json
1213
*.pyd
1314

1415
#Ignore saved data
15-
data/*
16+
data/*
17+
18+
# Packaging
19+
*.egg
20+
*.egg-info/
21+
dist/
22+
build/

docker-compose.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ services:
55
build:
66
context: .
77
dockerfile: .docker/Dockerfile
8-
8+
environment:
9+
- MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
910
# bind-mount your repo and the shared EBS volume
1011
volumes:
1112
- ./:/workspace:cached
@@ -26,6 +27,8 @@ services:
2627
build:
2728
context: .
2829
dockerfile: .docker/Dockerfile
30+
environment:
31+
- MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
2932
volumes:
3033
- .:/workspace:cached
3134
- mongo_data:/data/db
@@ -39,6 +42,8 @@ services:
3942
build:
4043
context: .
4144
dockerfile: .docker/Dockerfile
45+
environment:
46+
- MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
4247
volumes:
4348
- .:/workspace:cached
4449
depends_on:

pyproject.toml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[build-system]
2+
requires = ["setuptools", "wheel"]
3+
build-backend = "setuptools.build_meta"
4+
5+
[project]
6+
name = "justinsight"
7+
version = "0.1.0"
8+
description = "Your project description"
9+
authors = [{ name = "Grace Madison" }]
10+
dependencies = []
11+
12+
[tool.setuptools.packages.find]
13+
where = ["src"]
14+
15+
[tool.pytest.ini_options]
16+
minversion = "6.0"
17+
addopts = "-ra -q"
18+
testpaths = ["tests"]
19+
pythonpath = ["src"]

readme.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,12 @@ Note: When running in background you can use "docker logs <container_name_or_id>
4545
"task": "justinsight.tasks.your_task_name",
4646
"schedule": x, #where x is the number of seconds between consecutive runs of the task
4747
"args": (), #potential arguments for your task
48-
},
48+
},
49+
50+
51+
## How to check what's in the database
52+
Please run: docker compose up -d
53+
Then: docker exec -it mongo mongosh -u myuser -p mypassword
54+
Then: use justinsightdb
55+
Then: db.articles.find().pretty()
56+
Note: you may need to install mongosh for this to work. To exit the mongosh environment, run 'exit'. Remember to run 'docker compose down' when you are done, as the containers will otherwise keep running in the background.

src/ingest/base_ingestor.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,11 @@
22
import feedparser
33
import hashlib
44
import re
5-
from src.ingest.save_to_database import save_entry
5+
from ingest.save_to_database import save_entry
66

77
class BaseIngestor:
88
RSS_URL = None #will be set by the subclasses
99

10-
def slugify(self, text):
11-
# Convert title to a filesystem-friendly slug
12-
text = text.lower()
13-
text = re.sub(r'[^a-z0-9]+', '-', text)
14-
return text.strip('-')
15-
1610
def format_date(self, entry):
1711
# Extract and format the published date
1812
try:
@@ -31,8 +25,6 @@ def generate_entry_hash(self, entry):
3125
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
3226

3327
def format_entry(self, entry):
34-
title_slug = self.slugify(entry.title)
35-
date_str = self.format_date(entry)
3628
full_text = self.fetch_full_text(entry.link)
3729

3830
data = {

src/ingest/bbc_ingestor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from bs4 import BeautifulSoup
22
import requests
3-
from src.ingest.base_ingestor import BaseIngestor
3+
from ingest.base_ingestor import BaseIngestor
44

55
class BBCIngestor(BaseIngestor):
66
RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml"
77

88
def fetch_full_text(self, article_url):
9-
response = requests.get(self.RSS_URL)
9+
response = requests.get(article_url)
1010
soup = BeautifulSoup(response.content, 'html.parser')
1111

1212
article = soup.find('article')

src/ingest/cnn_ingestor.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
from ingest.base_ingestor import BaseIngestor
4+
5+
class CNNIngestor(BaseIngestor):
6+
RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss"
7+
8+
def fetch_full_text(self, url):
9+
try:
10+
headers = {
11+
"User-Agent": "Mozilla/5.0"
12+
}
13+
response = requests.get(url, headers=headers, timeout=10)
14+
response.raise_for_status()
15+
16+
soup = BeautifulSoup(response.content, 'html.parser')
17+
18+
# CNN article content is usually within <div class="article__content"> or <section id="body-text">
19+
article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content')
20+
21+
if not article_section:
22+
print("No CNN article body found.")
23+
return ""
24+
25+
paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p')
26+
27+
full_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
28+
29+
return full_text.strip()
30+
31+
except Exception as e:
32+
print(f"Error fetching CNN article: {e}")
33+
return ""

src/ingest/save_to_database.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
1+
import os
12
from pymongo import MongoClient
23

4+
# Read the connection string from MONGODB_URI; fall back to a local MongoDB when it is unset (e.g. running outside Docker)
5+
mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017")
6+
7+
#mongodb_uri = os.getenv("mongodb://localhost:27017", "mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin")
8+
39
# Connect with the resolved URI; any username, password, and authSource are carried inside MONGODB_URI
4-
client = MongoClient("mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin")
10+
client = MongoClient(mongodb_uri)
511

612
# Get (or create) a database
713
db = client["justinsightdb"]
@@ -14,5 +20,4 @@ def save_entry(entry):
1420
entry_hash = entry["id"]
1521
if collection.count_documents({"id": entry_hash}) == 0:
1622
collection.insert_one(entry)
17-
else:
18-
print(f"{entry['title']} already in use!")
23+
print(f"I have now saved: {entry['title']}")

src/justinsight/celery.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
"args": (),
2323
},
2424

25-
# "check-NYTfeed-every-5-minutes": {
26-
# "task": "justinsight.tasks.nytLogger_task",
27-
# "schedule": 300.0,
28-
# "args": (),
29-
# },
25+
"check-CNNfeed-every-5-minutes": {
26+
"task": "justinsight.tasks.cnnLogger_task",
27+
"schedule": 5.0,
28+
"args": (),
29+
},
3030

3131
#schedule more tasks here
3232
}

src/justinsight/tasks.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# tasks.py
21
from celery import shared_task
32
from ingest.bbc_ingestor import BBCIngestor
3+
from ingest.cnn_ingestor import CNNIngestor
44

55

66
@shared_task
@@ -16,8 +16,9 @@ def bbcLogger_task():
1616
return "BBC RSS Feed checked."
1717

1818
@shared_task
19-
def nytLogger_task():
20-
check_and_save_nyt()
21-
return "NYT RSS Feed checked."
19+
def cnnLogger_task():
20+
ingestor = CNNIngestor()
21+
ingestor.check_and_save_new_entries() # this will invoke the inherited logic
22+
return "CNN RSS Feed checked."
2223

2324
#Add more tasks here in the format of the one above

0 commit comments

Comments
 (0)