diff --git a/.env_template b/.env_template
index 49ea0bd..56b5add 100644
--- a/.env_template
+++ b/.env_template
@@ -2,3 +2,4 @@ YOUTUBE_API_KEY=
 MONGO_URI=
 MONGO_DB=
 STAGE=
+DAILY_SUN_URL=
\ No newline at end of file
diff --git a/README.md b/README.md
index 5df5eb3..839e973 100644
--- a/README.md
+++ b/README.md
@@ -22,4 +22,6 @@ To start the project, run the following command in the terminal
 
 ## Setting up the database
 
-Add /graphql to the url to access the interactive GraphQL platform
\ No newline at end of file
+Create a Mongo database named `score_db` and another named `daily_sun_db`. Through a partnership with the Daily Sun we have access to their articles, which we copy and paginate for the frontend.
+
+Add /graphql to the URL to access the interactive GraphQL platform
diff --git a/app.py b/app.py
index 5a01798..67aec02 100644
--- a/app.py
+++ b/app.py
@@ -5,6 +5,11 @@
 from flask_graphql import GraphQLView
 from graphene import Schema
 from src.schema import Query, Mutation
+from src.scrapers.games_scraper import fetch_game_schedule
+from src.scrapers.youtube_stats import fetch_videos
+from src.scrapers.daily_sun_scrape import fetch_news
+from src.services.article_service import ArticleService
 from src.utils.team_loader import TeamLoader
+import argparse
 import signal
 import sys
@@ -83,6 +87,30 @@ def create_context():
         ),
     )
 
+# Set up command line arguments
+def parse_args():
+    parser = argparse.ArgumentParser(description="Skip scraping tasks, for dev purposes.")
+    parser.add_argument(
+        "--no-scrape",
+        action="store_true",
+        help="Skips scraping tasks if set, useful for frontend development.",
+    )
+    parser.add_argument(
+        "--no-daily-sun",
+        action="store_true",
+        help="Skips using the Daily Sun page for alerts",
+    )
+    return parser.parse_args()
+
+# Only parse arguments when running directly (not when imported by gunicorn)
+if __name__ == "__main__":
+    args = parse_args()
+else:
+    # Default args when imported by gunicorn
+    class DefaultArgs:
+        no_scrape = False
+        no_daily_sun = False
+    args = DefaultArgs()
 
 def signal_handler(sig, frame):
     sys.exit(0)
@@ -91,5 +119,40 @@
 signal.signal(signal.SIGINT, signal_handler)
 signal.signal(signal.SIGTERM, signal_handler)
 
+# Only run scraping tasks if not disabled
+if not args.no_scrape:
+    from flask_apscheduler import APScheduler
+    scheduler = APScheduler()
+    scheduler.init_app(app)
+    scheduler.start()
+
+    @scheduler.task("interval", id="scrape_schedules", seconds=43200)  # 12 hours
+    def scrape_schedules():
+        logging.info("Scraping game schedules...")
+        fetch_game_schedule()
+
+    @scheduler.task("interval", id="scrape_videos", seconds=43200)  # 12 hours
+    def scrape_videos():
+        logging.info("Scraping YouTube videos...")
+        fetch_videos()
+
+    scrape_schedules()
+    scrape_videos()
+
+if not args.no_daily_sun and not args.no_scrape:
+    @scheduler.task("interval", id="scrape_daily_sun", seconds=3600)  # 1 hour
+    def scrape_daily_sun():
+        logging.info("Getting Daily Sun sports news...")
+        fetch_news()
+
+    @scheduler.task("interval", id="cleanse_daily_sun_db", seconds=604800)  # 1 week
+    def cleanse_daily_sun_db():
+        logging.info("Cleaning old articles out of the Daily Sun database...")
+        ArticleService.cleanse_old_articles()
+
+    scrape_daily_sun()
+    cleanse_daily_sun_db()
+
 if __name__ == "__main__":
     app.run(debug=True, host="0.0.0.0", port=8000)
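As a design note on the gunicorn fallback in app.py: the one-off `DefaultArgs` class could equally be a plain `argparse.Namespace`; a minimal sketch of that alternative (not part of the diff):

```python
import argparse

# Equivalent to the DefaultArgs fallback: a plain namespace carrying the
# same attributes parse_args() would produce when running under gunicorn.
args = argparse.Namespace(no_scrape=False, no_daily_sun=False)
assert args.no_scrape is False and args.no_daily_sun is False
```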
== "__main__": app.run(debug=True, host="0.0.0.0", port=8000) diff --git a/src/database.py b/src/database.py index d475437..834808d 100644 --- a/src/database.py +++ b/src/database.py @@ -48,6 +48,7 @@ def keep_connection_alive(): # Access the database db = client[os.getenv("MONGO_DB", "score_db")] +daily_sun_db = client[os.getenv("DAILY_SUN_DB", "daily_sun_db")] def setup_database_indexes(): @@ -65,6 +66,31 @@ def setup_database_indexes(): # Index for sorting operations game_collection.create_index([("date", -1)], background=True) + + # Index to have unique games so we won't add duplicates + game_collection.create_index( + [ + ("sport", 1), + ("gender", 1), + ("date", 1), + ("opponent_id", 1), + ("state", 1), + ], + unique=True, + background=True + ) + + # Additional index for tournament games (without opponent_id) + game_collection.create_index( + [ + ("sport", 1), + ("gender", 1), + ("date", 1), + ("city", 1), + ("state", 1), + ], + background=True + ) print("✅ MongoDB indexes created successfully") except Exception as e: diff --git a/src/models/__init__.py b/src/models/__init__.py index ab83d25..efbf4e5 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -1,3 +1,4 @@ from .game import Game from .team import Team -from .youtube_video import YoutubeVideo \ No newline at end of file +from .youtube_video import YoutubeVideo +from .article import Article \ No newline at end of file diff --git a/src/models/article.py b/src/models/article.py new file mode 100644 index 0000000..bfcb8e1 --- /dev/null +++ b/src/models/article.py @@ -0,0 +1,56 @@ +from bson.objectid import ObjectId +from datetime import datetime + +class Article: + """ + A model representing a news article. + + Attributes: + - title: The title of the article + - image: The filename of the article's main image + - sports_type: The specific sport category + - published_at: The publication date + - url: The URL to the full article + - slug: Unique identifier from the source + - created_at: When the article was added to our DB + """ + def __init__(self, title, sports_type, published_at, url, slug, image=None, id=None, created_at=None): + self.id = id if id else str(ObjectId()) + self.title = title + self.image = image + self.sports_type = sports_type + self.published_at = published_at + self.url = url + self.slug = slug + self.created_at = created_at if created_at else datetime.now() + + def to_dict(self): + """ + Converts the Article object to a dictionary format for MongoDB storage. + """ + return { + "_id": self.id, + "title": self.title, + "image": self.image, + "sports_type": self.sports_type, + "published_at": self.published_at, + "url": self.url, + "slug": self.slug, + "created_at": self.created_at + } + + @staticmethod + def from_dict(data): + """ + Converts a MongoDB document to an Article object. + """ + return Article( + id=data.get("_id"), + title=data.get("title"), + image=data.get("image"), + sports_type=data.get("sports_type"), + published_at=data.get("published_at"), + url=data.get("url"), + slug=data.get("slug"), + created_at=data.get("created_at") + ) \ No newline at end of file diff --git a/src/models/game.py b/src/models/game.py index f3ebcf6..73a7968 100644 --- a/src/models/game.py +++ b/src/models/game.py @@ -17,6 +17,7 @@ class Game: - `time` The time of the game. 
diff --git a/src/models/game.py b/src/models/game.py
index f3ebcf6..73a7968 100644
--- a/src/models/game.py
+++ b/src/models/game.py
@@ -17,6 +17,7 @@ class Game:
     - `time` The time of the game. (optional)
     - `box_score` The scoring summary of the game (optional)
     - `score_breakdown` The scoring breakdown of the game (optional)
+    - `ticket_link` The ticket link for the game (optional)
     """
 
     def __init__(
@@ -35,6 +36,7 @@
         score_breakdown=None,
         team=None,
         utc_date=None,
+        ticket_link=None,
     ):
         self.id = id if id else str(ObjectId())
         self.city = city
@@ -50,6 +52,7 @@
         self.score_breakdown = score_breakdown
         self.team = team
         self.utc_date = utc_date
+        self.ticket_link = ticket_link
 
     def to_dict(self):
         """
@@ -70,6 +73,7 @@
             "score_breakdown": self.score_breakdown,
             "team": self.team,
             "utc_date": self.utc_date,
+            "ticket_link": self.ticket_link,
         }
 
     @staticmethod
@@ -92,4 +96,5 @@ def from_dict(data) -> None:
             score_breakdown=data.get("score_breakdown"),
             team=data.get("team"),
             utc_date=data.get("utc_date"),
+            ticket_link=data.get("ticket_link"),
         )
diff --git a/src/models/youtube_video.py b/src/models/youtube_video.py
index e45a965..cdd11af 100644
--- a/src/models/youtube_video.py
+++ b/src/models/youtube_video.py
@@ -12,10 +12,11 @@ class YoutubeVideo:
     - `thumbnail` The thumbnail of the video, as a URL string pointing to a `.jpg` file.
     - `url` The URL of the video.
     - `published_at` The date and time the video was published.
+    - `duration` The duration of the video.
     """
 
     def __init__(
-        self, title, description, thumbnail, b64_thumbnail, url, published_at, id=None
+        self, title, description, thumbnail, b64_thumbnail, url, published_at, duration=None, id=None
     ):
         self.id = id if id else str(ObjectId())
         self.title = title
@@ -24,6 +25,7 @@
         self.b64_thumbnail = b64_thumbnail
         self.url = url
         self.published_at = published_at
+        self.duration = duration
 
     def to_dict(self):
         """
@@ -37,6 +39,7 @@
             "b64_thumbnail": self.b64_thumbnail,
             "url": self.url,
             "published_at": self.published_at,
+            "duration": self.duration,
         }
 
     @staticmethod
@@ -52,4 +55,5 @@ def from_dict(data):
             b64_thumbnail=data.get("b64_thumbnail"),
             url=data.get("url"),
             published_at=data.get("published_at"),
+            duration=data.get("duration"),
         )
diff --git a/src/mutations/__init__.py b/src/mutations/__init__.py
index 3fd3a8a..3df8e4d 100644
--- a/src/mutations/__init__.py
+++ b/src/mutations/__init__.py
@@ -1,3 +1,4 @@
 from .create_game import CreateGame
 from .create_team import CreateTeam
-from .create_youtube_video import CreateYoutubeVideo
\ No newline at end of file
+from .create_youtube_video import CreateYoutubeVideo
+from .create_article import CreateArticle
\ No newline at end of file
diff --git a/src/mutations/create_article.py b/src/mutations/create_article.py
new file mode 100644
index 0000000..19b8920
--- /dev/null
+++ b/src/mutations/create_article.py
@@ -0,0 +1,26 @@
+from graphene import Mutation, String, Field
+from src.types import ArticleType
+from src.services.article_service import ArticleService
+
+class CreateArticle(Mutation):
+    class Arguments:
+        title = String(required=True)
+        sports_type = String(required=True)
+        published_at = String(required=True)
+        url = String(required=True)
+        slug = String(required=True)
+        image = String(required=False)
+
+    article = Field(lambda: ArticleType)
+
+    def mutate(self, info, title, sports_type, published_at, url, slug, image=None):
+        article_data = {
+            "title": title,
+            "sports_type": sports_type,
+            "published_at": published_at,  # Already in ISO 8601 format
+            "url": url,
+            "slug": slug,
+            "image": image
+        }
+        new_article = ArticleService.create_article(article_data)
+        return CreateArticle(article=new_article)
\ No newline at end of file
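For manual testing, the new createArticle mutation can be exercised against the /graphql endpoint; a sketch assuming a local server on port 8000 and made-up field values (graphene exposes the snake_case arguments in camelCase):

```python
import requests

# Hypothetical local endpoint; adjust host/port to your setup
GRAPHQL_URL = "http://localhost:8000/graphql"

mutation = """
mutation {
  createArticle(
    title: "Example headline"
    sportsType: "Ice Hockey"
    publishedAt: "2025-01-15T18:30:00Z"
    url: "https://cornellsun.com/article/example-slug"
    slug: "example-slug"
  ) {
    article { title url }
  }
}
"""

response = requests.post(GRAPHQL_URL, json={"query": mutation})
print(response.json())
```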
diff --git a/src/mutations/create_game.py b/src/mutations/create_game.py
index 205a153..3a52345 100644
--- a/src/mutations/create_game.py
+++ b/src/mutations/create_game.py
@@ -17,6 +17,7 @@ class Arguments:
         box_score = String(required=False)
         score_breakdown = String(required=False)
         utc_date = String(required=False)
+        ticket_link = String(required=False)
 
     game = Field(lambda: GameType)
 
@@ -34,7 +35,8 @@ def mutate(
         time=None,
         box_score=None,
         score_breakdown=None,
-        utc_date=None
+        utc_date=None,
+        ticket_link=None
     ):
         game_data = {
             "city": city,
@@ -48,7 +50,8 @@
             "time": time,
             "box_score": box_score,
             "score_breakdown": score_breakdown,
-            "utc_date": utc_date
+            "utc_date": utc_date,
+            "ticket_link": ticket_link
         }
         new_game = GameService.create_game(game_data)
         return CreateGame(game=new_game)
\ No newline at end of file
diff --git a/src/mutations/create_youtube_video.py b/src/mutations/create_youtube_video.py
index 9f39bf7..ed2cb40 100644
--- a/src/mutations/create_youtube_video.py
+++ b/src/mutations/create_youtube_video.py
@@ -8,13 +8,14 @@ class Arguments:
         title = String(required=True)
         description = String(required=True)
         thumbnail = String(required=True)
-        b64_thumbnail = String(required=True)
+        b64_thumbnail = String(required=False)
         url = String(required=True)
         published_at = String(required=True)
+        duration = String(required=True)
 
     youtube_video = Field(lambda: YoutubeVideoType)
 
-    def mutate(self, info, id, title, description, thumbnail, url, published_at):
+    def mutate(self, info, id, title, description, thumbnail, url, published_at, duration, b64_thumbnail=None):
         video_data = {
             "id": id,
             "title": title,
@@ -23,6 +24,7 @@
             "b64_thumbnail": b64_thumbnail,
             "url": url,
             "published_at": published_at,
+            "duration": duration,
         }
         new_video = YoutubeVideoService.create_video(video_data)
         return CreateYoutubeVideo(youtube_video=new_video)
\ No newline at end of file
diff --git a/src/queries/__init__.py b/src/queries/__init__.py
index f345409..fdf2f41 100644
--- a/src/queries/__init__.py
+++ b/src/queries/__init__.py
@@ -1,3 +1,4 @@
 from .game_query import GameQuery
 from .team_query import TeamQuery
 from .youtube_video_query import YoutubeVideoQuery
+from .article_query import ArticleQuery
\ No newline at end of file
diff --git a/src/queries/article_query.py b/src/queries/article_query.py
new file mode 100644
index 0000000..52e6cbc
--- /dev/null
+++ b/src/queries/article_query.py
@@ -0,0 +1,12 @@
+from graphene import ObjectType, List, String
+from src.services.article_service import ArticleService
+from src.types import ArticleType
+
+class ArticleQuery(ObjectType):
+    articles = List(ArticleType, sports_type=String())
+
+    def resolve_articles(self, info, sports_type=None):
+        """
+        Resolver for retrieving news articles, optionally filtered by sports_type.
+        """
+        return ArticleService.get_articles(sports_type)
\ No newline at end of file
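The matching query side can be smoke-tested the same way; a sketch assuming the same local endpoint:

```python
import requests

query = """
query {
  articles(sportsType: "Ice Hockey") {
    title
    url
    publishedAt
  }
}
"""

# Assumes the server from app.py is running locally on port 8000
resp = requests.post("http://localhost:8000/graphql", json={"query": query})
for article in resp.json()["data"]["articles"]:
    print(article["title"], "->", article["url"])
```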
+ """ + return ArticleService.get_articles(sports_type) \ No newline at end of file diff --git a/src/queries/game_query.py b/src/queries/game_query.py index 4aa8a55..3c04116 100644 --- a/src/queries/game_query.py +++ b/src/queries/game_query.py @@ -1,4 +1,4 @@ -from graphene import ObjectType, String, Field, List, Int +from graphene import ObjectType, String, Field, List, Int, DateTime from src.services.game_service import GameService from src.types import GameType @@ -20,12 +20,14 @@ class GameQuery(ObjectType): sport=String(required=True), state=String(required=True), time=String(required=True), + ticket_link=String(required=False), ) games_by_sport = List(GameType, sport=String(required=True)) games_by_gender = List(GameType, gender=String(required=True)) games_by_sport_gender = List( GameType, sport=String(required=True), gender=String(required=True) ) + games_by_date = List(GameType, startDate=DateTime(required=True), endDate=DateTime(required=True)) def resolve_games(self, info, limit=100, offset=0): """ @@ -40,13 +42,13 @@ def resolve_game(self, info, id): return GameService.get_game_by_id(id) def resolve_game_by_data( - self, info, city, date, gender, opponent_id, sport, state, time, location=None + self, info, city, date, gender, opponent_id, sport, state, time, location=None, ticket_link=None ): """ Resolver for retrieving a game by its data. """ return GameService.get_game_by_data( - city, date, gender, location, opponent_id, sport, state, time + city, date, gender, location, opponent_id, sport, state, time, ticket_link ) def resolve_games_by_sport(self, info, sport): @@ -66,3 +68,9 @@ def resolve_games_by_sport_gender(self, info, sport, gender): Resolver for retrieving all games by its sport and gender. """ return GameService.get_games_by_sport_gender(sport, gender) + + def resolve_games_by_date(self, info, startDate, endDate): + """ + Resolver for retrieving games by date. + """ + return GameService.get_games_by_date(startDate, endDate) diff --git a/src/repositories/__init__.py b/src/repositories/__init__.py index 1c18bb7..f9c6252 100644 --- a/src/repositories/__init__.py +++ b/src/repositories/__init__.py @@ -1,3 +1,4 @@ from .game_repository import GameRepository from .team_repository import TeamRepository from .youtube_video_repository import YoutubeVideoRepository +from .article_repository import ArticleRepository \ No newline at end of file diff --git a/src/repositories/article_repository.py b/src/repositories/article_repository.py new file mode 100644 index 0000000..440f856 --- /dev/null +++ b/src/repositories/article_repository.py @@ -0,0 +1,85 @@ +from src.database import daily_sun_db +from src.models.article import Article +from pymongo import UpdateOne +from datetime import datetime, timedelta, timezone + +class ArticleRepository: + @staticmethod + def upsert(article): + """ + Upsert an article into the 'news_articles' collection in MongoDB. + """ + article_collection = daily_sun_db["news_articles"] + article_dict = article.to_dict() + # Remove _id from the update to avoid MongoDB error + article_dict.pop("_id", None) + + article_collection.update_one( + {"slug": article.slug}, + {"$set": article_dict}, + upsert=True + ) + + @staticmethod + def bulk_upsert(articles): + """ + Bulk upsert articles into the 'news_articles' collection based on slug. 
+ """ + if not articles: + return + + article_collection = daily_sun_db["news_articles"] + operations = [] + for article in articles: + article_dict = article.to_dict() + # Remove _id from the update to avoid MongoDB error + article_dict.pop("_id", None) + + operations.append( + UpdateOne( + {"slug": article.slug}, + {"$set": article_dict}, + upsert=True + ) + ) + + if operations: + article_collection.bulk_write(operations) + + @staticmethod + def find_recent(limit_days=3): + """ + Retrieve articles from the last N days, sorted by published_at descending. + """ + article_collection = daily_sun_db["news_articles"] + # Calculate threshold as ISO 8601 string + threshold = (datetime.now(timezone.utc) - timedelta(days=limit_days)).isoformat().replace('+00:00', 'Z') + query = {"published_at": {"$gte": threshold}} + articles = article_collection.find(query).sort("published_at", -1) + return [Article.from_dict(article) for article in articles] + + @staticmethod + def find_by_sports_type(sports_type, limit_days=3): + """ + Retrieve articles by sports_type from the last N days, sorted by published_at descending. + """ + article_collection = daily_sun_db["news_articles"] + # Calculate threshold as ISO 8601 string + threshold = (datetime.now(timezone.utc) - timedelta(days=limit_days)).isoformat().replace('+00:00', 'Z') + query = { + "sports_type": sports_type, + "published_at": {"$gte": threshold} + } + articles = article_collection.find(query).sort("published_at", -1) + return [Article.from_dict(article) for article in articles] + + @staticmethod + def delete_not_recent(limit_days=3): + """ + Delete articles older than N days, sorted by published_at descending. + """ + article_collection = daily_sun_db["news_articles"] + # Calculate threshold as ISO 8601 string + threshold = (datetime.now(timezone.utc) - timedelta(days=limit_days)).isoformat().replace('+00:00', 'Z') + query = {"published_at": {"$lt": threshold}} + article_collection.delete_many(query) \ No newline at end of file diff --git a/src/repositories/game_repository.py b/src/repositories/game_repository.py index bfe5d08..95e679b 100644 --- a/src/repositories/game_repository.py +++ b/src/repositories/game_repository.py @@ -130,6 +130,56 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state): return [Game.from_dict(game) for game in games] + @staticmethod + def find_by_tournament_key_fields(city, date, gender, location, sport, state): + """ + Find tournament games by location and date (excluding opponent_id). + This is used when we need to find a tournament game that might have a placeholder team. + Uses flexible matching to handle TBD/TBA values. 
+ """ + game_collection = db["game"] + + # Build flexible query that can handle TBD/TBA values + query = { + "date": date, + "gender": gender, + "sport": sport, + } + + # For city, state, and location, use flexible matching + # This allows finding games even when TBD/TBA values change to real values + city_conditions = [] + if city: + city_conditions.append(city) + else: + city_conditions = [None] + + state_conditions = [] + if state: + state_conditions.append(state) + else: + state_conditions = [None] + + location_conditions = [] + if location: + location_conditions.append(location) + else: + location_conditions = [None] + + query["city"] = {"$in": city_conditions} + query["state"] = {"$in": state_conditions} + query["location"] = {"$in": location_conditions} + + games = list(game_collection.find(query)) + + if not games: + return None + + if len(games) == 1: + return Game.from_dict(games[0]) + + return [Game.from_dict(game) for game in games] + @staticmethod def find_by_sport(sport): """ @@ -156,3 +206,52 @@ def find_by_sport_gender(sport, gender): game_collection = db["game"] games = game_collection.find({"sport": sport, "gender": gender}) return [Game.from_dict(game) for game in games] + + @staticmethod + def find_games_by_sport_gender_after_date(sport, gender, after_date=None): + """ + Find games for a specific sport and gender, optionally after a specific date. + This method returns raw game data without team information. + """ + game_collection = db["game"] + + query = { + "sport": sport, + "gender": gender + } + + if after_date: + query["utc_date"] = {"$gt": after_date} + + games = game_collection.find(query) + return [Game.from_dict(game) for game in games] + + @staticmethod + def find_by_date(startDate, endDate): + """ + Retrieve all games from the 'game' collection in MongoDB for games + between certain dates. + """ + game_collection = db["game"] + + start_str = startDate.isoformat() + endDate = endDate.isoformat() + + query = { + "utc_date": { + "$gte": start_str, + "$lte": endDate + } + } + + games = game_collection.find(query) + return [Game.from_dict(game) for game in games] + + @staticmethod + def delete_games_by_ids(game_ids): + """ + Delete games by their IDs. 
+ """ + game_collection = db["game"] + result = game_collection.delete_many({"_id": {"$in": game_ids}}) + return result.deleted_count diff --git a/src/schema.py b/src/schema.py index 2cbbe69..0f3ae99 100644 --- a/src/schema.py +++ b/src/schema.py @@ -1,9 +1,9 @@ from graphene import ObjectType, Schema, Mutation -from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo -from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery +from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo, CreateArticle +from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery, ArticleQuery -class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ObjectType): +class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ArticleQuery, ObjectType): pass @@ -11,6 +11,7 @@ class Mutation(ObjectType): create_game = CreateGame.Field(description="Creates a new game.") create_team = CreateTeam.Field(description="Creates a new team.") create_youtube_video = CreateYoutubeVideo.Field(description="Creates a new youtube video.") + create_article = CreateArticle.Field(description="Creates a new article.") schema = Schema(query=Query, mutation=Mutation) diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py new file mode 100644 index 0000000..4b5cd24 --- /dev/null +++ b/src/scrapers/daily_sun_scrape.py @@ -0,0 +1,81 @@ +import os +import requests +from datetime import datetime, timedelta, timezone +from dotenv import load_dotenv +from ..services import ArticleService +from ..utils.constants import ARTICLE_IMG_TAG +from ..utils.helpers import extract_sport_type_from_title +import logging +from bs4 import BeautifulSoup +import base64 + +load_dotenv() + + +def fetch_news(): + try: + url = os.getenv("DAILY_SUN_URL") + response = requests.get( + url, + headers={ + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + } + ) + response.raise_for_status() + data = response.json() + + # Current date and 3-day threshold (in UTC) + current_date = datetime.now(timezone.utc) + three_days_ago = current_date - timedelta(days=3) + + # Process articles + articles_to_store = [] + for article in data.get("articles", []): + published_at_dt = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S") + # Assume the timezone is UTC and convert to ISO 8601 format string + published_at_dt = published_at_dt.replace(tzinfo=timezone.utc) + published_at = published_at_dt.isoformat().replace('+00:00', 'Z') + + if published_at_dt >= three_days_ago: + # Extract sport type from title + title = article["headline"] + sports_type = extract_sport_type_from_title(title) + article_url = f"https://cornellsun.com/article/{article['slug']}" + + article_image = None + try: + response = requests.get( + article_url, + headers={ + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + } + ) + response.raise_for_status() + soup = BeautifulSoup(response.content, 'html.parser') + img_tag = soup.select_one(ARTICLE_IMG_TAG) + if img_tag and img_tag.get('src'): + article_image=img_tag.get('src') + except Exception as e: + logging.error(f"Error fetching news: {str(e)}") + article_doc = { + "title": article["headline"], + "image": article_image, + "sports_type": sports_type, + "published_at": published_at, + "url": article_url, + "slug": article["slug"], + "created_at": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z') + } + 
diff --git a/src/scrapers/game_details_scrape.py b/src/scrapers/game_details_scrape.py
index 8fce04a..5b6fc6d 100644
--- a/src/scrapers/game_details_scrape.py
+++ b/src/scrapers/game_details_scrape.py
@@ -31,16 +31,24 @@ def extract_teams_and_scores(box_score_section, sport):
     period_scores = []
 
     for row in score_table.find(TAG_TBODY).find_all(TAG_TR):
-        team_name_cell = row.find(TAG_TH) if sport == 'ice hockey' else row.find(TAG_TD)
+        # Check if the team name is in a <th> (some sports) or the first <td> (other sports)
+        team_name_cell = row.find(TAG_TH)
         if team_name_cell:
+            # Team name is in a <th>, so all <td> elements are period scores
             team_name = team_name_cell.text.strip().replace("Winner", "").strip()
-            team_name = ' '.join(team_name.split())
+            scores = [td.text.strip() for td in row.find_all(TAG_TD)]
         else:
-            team_name = "Unknown"
+            # Team name is in the first <td>, so the remaining <td> elements are period scores
+            team_name_cell = row.find(TAG_TD)
+            team_name = team_name_cell.text.strip().replace("Winner", "").strip() if team_name_cell else "Unknown"
+            scores = [td.text.strip() for td in row.find_all(TAG_TD)[1:]]
+
+        # Basketball box scores include a "Records" column at the end - exclude it
+        if sport == 'basketball' and scores:
+            scores = scores[:-1]
+
+        team_name = ' '.join(team_name.split())
         team_names.append(team_name)
-        scores = [td.text.strip() for td in row.find_all(TAG_TD)[1:]]
-        scores = scores[:-1] if sport == 'basketball' else scores
         period_scores.append(scores)
 
     return team_names, period_scores
@@ -59,7 +67,7 @@ def soccer_summary(box_score_section):
             event = row.find_all(TAG_TD)[2]
             desc = event.find_all(TAG_SPAN)[-1].text.strip()
 
-            if team == "COR" or team == "CU":
+            if team == "COR" or team == "CU" or team == "CRNL":
                 cornell_score += 1
             else:
                 opp_score += 1
@@ -220,6 +228,36 @@ def baseball_summary(box_score_section):
         summary = [{"message": "No scoring events in this game."}]
     return summary
 
+# def basketball_summary(box_score_section):
+#     summary = []
+#     scoring_section = box_score_section.find(TAG_SECTION, {ATTR_ARIA_LABEL: LABEL_SCORING_SUMMARY})
+#     if scoring_section:
+#         scoring_rows = scoring_section.find(TAG_TBODY)
+#         if scoring_rows:
+#             cornell_score = 0
+#             opp_score = 0
+#             for row in scoring_rows.find_all(TAG_TR):
+#                 time = row.find_all(TAG_TD)[0].text.strip()
+#                 team = row.find_all(TAG_TD)[1].find(TAG_IMG)[ATTR_ALT]
+#                 event = row.find_all(TAG_TD)[2]
+#                 desc = event.find_all(TAG_SPAN)[-1].text.strip()
+
+#                 if team == "COR" or team == "CU" or team == "CRNL":
+#                     cornell_score += 1
+#                 else:
+#                     opp_score += 1
+
+#                 summary.append({
+#                     'time': time,
+#                     'team': team,
+#                     'description': desc,
+#                     'cor_score': cornell_score,
+#                     'opp_score': opp_score
+#                 })
+#     if not summary:
+#         summary = [{"message": "No scoring events in this game."}]
+#     return summary
+
 def scrape_game(url, sport):
     soup = fetch_page(url)
     box_score_section = soup.find(class_=CLASS_BOX_SCORE) if sport in ['baseball', 'softball'] else soup.find(id=ID_BOX_SCORE)
@@ -233,6 +271,7 @@ def scrape_game(url, sport):
         'field hockey': (lambda: extract_teams_and_scores(box_score_section, 'field hockey'), field_hockey_summary),
         'lacrosse': (lambda: extract_teams_and_scores(box_score_section, 'lacrosse'), lacrosse_summary),
         'baseball': (lambda: extract_teams_and_scores(box_score_section, 'baseball'), baseball_summary),
+        'basketball': (lambda: extract_teams_and_scores(box_score_section, 'basketball'), lambda _: []),
     }
 
     extract_teams_func, summary_func = sport_parsers.get(sport, (None, None))
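The two row shapes handled by extract_teams_and_scores can be illustrated with toy markup; a sketch (simplified, hypothetical HTML):

```python
from bs4 import BeautifulSoup

# Two row shapes the parser above handles
row_th = BeautifulSoup("<tr><th>Cornell</th><td>1</td><td>2</td></tr>", "html.parser").tr
row_td = BeautifulSoup("<tr><td>Cornell</td><td>1</td><td>2</td></tr>", "html.parser").tr

# <th> present: every <td> is a period score
assert [td.text for td in row_th.find_all("td")] == ["1", "2"]
# no <th>: the first <td> is the team, the rest are period scores
assert [td.text for td in row_td.find_all("td")[1:]] == ["2"]
```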
diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py
index e174a65..818760c 100644
--- a/src/scrapers/games_scraper.py
+++ b/src/scrapers/games_scraper.py
@@ -4,10 +4,10 @@
 from src.utils.convert_to_utc import convert_to_utc
 from src.utils.constants import *
 from src.scrapers.game_details_scrape import scrape_game
-from src.utils.helpers import get_dominant_color
+from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss
 import base64
 import re
-import html
+from src.database import db
 import threading
 
 
@@ -153,7 +153,11 @@ def parse_schedule_page(url, sport, gender):
         else:
             game_data["box_score"] = None
             game_data["score_breakdown"] = None
-
+
+        ticket_link_tag = game_item.select_one(GAME_TICKET_LINK)
+        # select_one returns None when the game has no ticket link
+        game_data["ticket_link"] = ticket_link_tag["href"] if ticket_link_tag else None
 
         process_game_data(game_data)
 
@@ -164,6 +168,8 @@ def process_game_data(game_data):
     Args:
         game_data (dict): A dictionary containing the game data.
     """
+
+    game_data = normalize_game_data(game_data)
     location_data = game_data["location"].split("\n")
     geo_location = location_data[0]
     if (",") not in geo_location:
@@ -232,16 +238,28 @@ def process_game_data(game_data):
         if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final):
             game_data["score_breakdown"] = game_data["score_breakdown"][::-1]
 
-    # finds any existing game with the same key fields regardless of time
-    curr_game = GameService.get_game_by_key_fields(
+    # Try to find the game by tournament key fields first, to handle placeholder teams
+    curr_game = GameService.get_game_by_tournament_key_fields(
         city,
         game_data["date"],
         game_data["gender"],
         location,
-        team.id,
         game_data["sport"],
         state
     )
+
+    # If no tournament game was found, fall back to the regular lookup with opponent_id
+    if not curr_game:
+        curr_game = GameService.get_game_by_key_fields(
+            city,
+            game_data["date"],
+            game_data["gender"],
+            location,
+            team.id,
+            game_data["sport"],
+            state
+        )
+
     if isinstance(curr_game, list):
         if curr_game:
             curr_game = curr_game[0]
@@ -253,8 +271,20 @@ def process_game_data(game_data):
             "result": game_data["result"],
             "box_score": game_data["box_score"],
             "score_breakdown": game_data["score_breakdown"],
-            "utc_date": utc_date_str
+            "utc_date": utc_date_str,
+            "city": city,
+            "location": location,
+            "state": state,
+            "ticket_link": game_data["ticket_link"]
         }
+
+        # If the stored opponent is a tournament placeholder, swap in the real team
+        current_team = TeamService.get_team_by_id(curr_game.opponent_id)
+        if current_team and is_tournament_placeholder_team(current_team.name):
+            updates["opponent_id"] = team.id
+
+        # A Cornell loss eliminates the team, so drop any future tournament games
+        if is_cornell_loss(game_data["result"]) and game_data["utc_date"]:
+            GameService.handle_tournament_loss(game_data["sport"], game_data["gender"], game_data["utc_date"])
+
         GameService.update_game(curr_game.id, updates)
         return
 
@@ -270,7 +300,8 @@ def process_game_data(game_data):
         "time": game_time,
         "box_score": game_data["box_score"],
         "score_breakdown": game_data["score_breakdown"],
-        "utc_date": utc_date_str
+        "utc_date": utc_date_str,
+        "ticket_link": game_data["ticket_link"]
     }
 
     GameService.create_game(game_data)
\ No newline at end of file
diff --git a/src/scrapers/youtube_stats.py b/src/scrapers/youtube_stats.py
index ee8a5a7..94eac38 100644
--- a/src/scrapers/youtube_stats.py
+++ b/src/scrapers/youtube_stats.py
@@ -6,6 +6,7 @@
 import base64
 import os
 import html
+from bs4 import BeautifulSoup
 
 load_dotenv()
 YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
@@ -25,6 +26,54 @@ def fetch_videos():
         process_video_item(item)
 
 
+def get_video_duration(video_id):
+    """
+    Gets the video duration using the YouTube API.
+    """
+    try:
+        url = f"https://www.googleapis.com/youtube/v3/videos?key={YOUTUBE_API_KEY}&id={video_id}&part=contentDetails"
+        response = requests.get(url)
+        response.raise_for_status()
+        data = response.json()
+
+        if data.get("items"):
+            duration_iso = data["items"][0]["contentDetails"]["duration"]
+            return convert_iso_duration(duration_iso)
+        return None
+    except Exception as e:
+        print(f"Error getting video duration: {e}")
+        return None
+
+
+def convert_iso_duration(iso_duration):
+    """
+    Converts an ISO 8601 duration (PT2M5S) to a readable format (2:05).
+    Examples:
+        - PT2M5S -> 2:05
+        - PT1H23M45S -> 1:23:45
+        - PT30S -> 0:30
+    """
+    import re
+
+    # Remove the PT prefix
+    duration = iso_duration.replace('PT', '')
+
+    # Extract hours, minutes, seconds
+    hours = re.search(r'(\d+)H', duration)
+    minutes = re.search(r'(\d+)M', duration)
+    seconds = re.search(r'(\d+)S', duration)
+
+    h = int(hours.group(1)) if hours else 0
+    m = int(minutes.group(1)) if minutes else 0
+    s = int(seconds.group(1)) if seconds else 0
+
+    # Format as MM:SS or HH:MM:SS
+    if h > 0:
+        return f"{h}:{m:02d}:{s:02d}"
+    else:
+        return f"{m}:{s:02d}"
+
+
 def process_video_item(item):
     """
     Extracts the required data from a video item and
@@ -55,14 +104,17 @@ def process_video_item(item):
     published_at = snippet.get("publishedAt")
     video_url = f"https://www.youtube.com/watch?v={video_id}"
 
+    duration = get_video_duration(video_id)
+
     video_data = {
-        "id": video_id,  # use video id for easy retrieval
+        "id": video_id,
         "title": title,
         "description": description,
         "thumbnail": thumbnail,
         "b64_thumbnail": encoded_thumbnail,
         "url": video_url,
         "published_at": published_at,
+        "duration": duration,
     }
 
     process_video_data(video_data)
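convert_iso_duration is pure string handling, so it can be spot-checked directly against the docstring examples:

```python
from src.scrapers.youtube_stats import convert_iso_duration

assert convert_iso_duration("PT2M5S") == "2:05"
assert convert_iso_duration("PT1H23M45S") == "1:23:45"
assert convert_iso_duration("PT30S") == "0:30"
```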
+ """ + try: + article = Article( + title=article_data["title"], + sports_type=article_data["sports_type"], + published_at=article_data["published_at"], + url=article_data["url"], + slug=article_data["slug"], + image=article_data.get("image") + ) + return ArticleRepository.upsert(article) + except Exception as e: + logging.error(f"Error creating article: {str(e)}") + return None + + @staticmethod + def create_articles_bulk(articles_data): + """ + Create or update multiple articles in bulk and store them in MongoDB. + """ + try: + if not articles_data: + return + articles = [ + Article( + title=data["title"], + sports_type=data["sports_type"], + published_at=data["published_at"], + url=data["url"], + slug=data["slug"], + image=data.get("image") + ) + for data in articles_data + ] + ArticleRepository.bulk_upsert(articles) + except Exception as e: + logging.error(f"Error creating articles in bulk: {str(e)}") + raise + + @staticmethod + def cleanse_old_articles(): + """ + Remove articles older than 3 days from the database. + """ + try: + ArticleRepository.delete_not_recent(limit_days=5) # provide a buffer from the 3-day threshold + except Exception as e: + logging.error(f"Error cleansing old articles: {str(e)}") + raise \ No newline at end of file diff --git a/src/services/game_service.py b/src/services/game_service.py index 5463835..c0ae3db 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -1,6 +1,8 @@ from src.repositories.game_repository import GameRepository from src.models.game import Game from src.services.team_service import TeamService +from src.utils.helpers import is_tournament_placeholder_team +from pymongo.errors import DuplicateKeyError class GameService: @@ -33,9 +35,13 @@ def create_game(data): opponent_id = data.get("opponent_id") if not TeamService.get_team_by_id(opponent_id): raise ValueError(f"Opponent team with id {opponent_id} does not exist.") + game = Game(**data) - GameRepository.insert(game) - return game + try: + GameRepository.insert(game) + return game + except DuplicateKeyError: + return None @staticmethod def delete_game(game_id): @@ -69,6 +75,16 @@ def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, sta city, date, gender, location, opponent_id, sport, state ) + @staticmethod + def get_game_by_tournament_key_fields(city, date, gender, location, sport, state): + """ + Retrieve a tournament game by location and date (excluding opponent_id). + This is used when we need to find a tournament game that might have a placeholder team. + """ + return GameRepository.find_by_tournament_key_fields( + city, date, gender, location, sport, state + ) + @staticmethod def get_games_by_sport(sport): """ @@ -89,3 +105,57 @@ def get_games_by_sport_gender(sport, gender): Retrieves all game by its sport and gender. """ return GameRepository.find_by_sport_gender(sport, gender) + + @staticmethod + def get_games_by_date(startDate, endDate): + """ + Retrieves all games between these two dates. + """ + return GameRepository.find_by_date(startDate, endDate) + + @staticmethod + def get_tournament_games_by_sport_gender(sport, gender, after_date=None): + """ + Find tournament games (with placeholder team names) for a specific sport and gender. + Optionally filter by games after a specific date. 
+ """ + games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) + tournament_games = [] + + for game in games: + team = TeamService.get_team_by_id(game.opponent_id) + if team and is_tournament_placeholder_team(team.name): + tournament_games.append(game) + + return tournament_games + + @staticmethod + def delete_tournament_games_by_sport_gender(sport, gender, after_date=None): + """ + Delete tournament games (with placeholder team names) for a specific sport and gender. + Optionally filter by games after a specific date. + """ + games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) + tournament_game_ids = [] + + for game in games: + team = TeamService.get_team_by_id(game.opponent_id) + if team and is_tournament_placeholder_team(team.name): + tournament_game_ids.append(game.id) + + if tournament_game_ids: + return GameRepository.delete_games_by_ids(tournament_game_ids) + return 0 + + @staticmethod + def handle_tournament_loss(sport, gender, loss_date): + """ + Handle when a Cornell team loses in a tournament by deleting future tournament games. + + Args: + sport (str): The sport of the team that lost + gender (str): The gender of the team that lost + loss_date (datetime): The date when the team lost + """ + deleted_count = GameService.delete_tournament_games_by_sport_gender(sport, gender, loss_date) + return deleted_count diff --git a/src/services/team_service.py b/src/services/team_service.py index 57598f8..c961534 100644 --- a/src/services/team_service.py +++ b/src/services/team_service.py @@ -1,7 +1,6 @@ from src.repositories import TeamRepository from src.models.team import Team - class TeamService: @staticmethod def get_all_teams(): @@ -13,14 +12,25 @@ def get_all_teams(): @staticmethod def create_team(team_data): """ - Create a new team. - + Create a new team, or update it if it already exists. + Args: team_data (dict): The data for the new team. - Returns: Team: The created team. """ + name = team_data.get("name") + if not name: + raise ValueError("Team name is required to create a team.") + + existing = TeamService.get_team_by_name(name) + if existing: + if isinstance(existing, list) and existing: + existing = existing[0] + + TeamService.update_team(existing.id, team_data) + return existing + team = Team(**team_data) TeamRepository.insert(team) return team diff --git a/src/services/youtube_video_service.py b/src/services/youtube_video_service.py index 5052975..0d34c33 100644 --- a/src/services/youtube_video_service.py +++ b/src/services/youtube_video_service.py @@ -30,6 +30,7 @@ def create_video(data): b64_thumbnail=data.get("b64_thumbnail"), url=data.get("url"), published_at=data.get("published_at"), + duration=data.get("duration"), ) YoutubeVideoRepository.insert(video) return video diff --git a/src/types.py b/src/types.py index b792b46..7eb8fbe 100644 --- a/src/types.py +++ b/src/types.py @@ -1,5 +1,5 @@ from graphene import ObjectType, Field, String, List, Int -from src.services import TeamService +from datetime import datetime class TeamType(ObjectType): """ @@ -88,6 +88,7 @@ class GameType(ObjectType): - `time`: The time of the game. (optional) - `box_score`: The box score of the game. - `score_breakdown`: The score breakdown of the game. + - `ticket_link`: The ticket link of the game. 
diff --git a/src/services/team_service.py b/src/services/team_service.py
index 57598f8..c961534 100644
--- a/src/services/team_service.py
+++ b/src/services/team_service.py
@@ -1,7 +1,6 @@
 from src.repositories import TeamRepository
 from src.models.team import Team
 
-
 class TeamService:
     @staticmethod
     def get_all_teams():
@@ -13,14 +12,25 @@ def get_all_teams():
     @staticmethod
     def create_team(team_data):
         """
-        Create a new team.
-
+        Create a new team, or update it if it already exists.
+
         Args:
             team_data (dict): The data for the new team.
-
         Returns:
             Team: The created team.
         """
+        name = team_data.get("name")
+        if not name:
+            raise ValueError("Team name is required to create a team.")
+
+        existing = TeamService.get_team_by_name(name)
+        if existing:
+            # get_team_by_name may return a list; take the first match
+            if isinstance(existing, list) and existing:
+                existing = existing[0]
+
+            TeamService.update_team(existing.id, team_data)
+            return existing
+
         team = Team(**team_data)
         TeamRepository.insert(team)
         return team
diff --git a/src/services/youtube_video_service.py b/src/services/youtube_video_service.py
index 5052975..0d34c33 100644
--- a/src/services/youtube_video_service.py
+++ b/src/services/youtube_video_service.py
@@ -30,6 +30,7 @@ def create_video(data):
             b64_thumbnail=data.get("b64_thumbnail"),
             url=data.get("url"),
             published_at=data.get("published_at"),
+            duration=data.get("duration"),
         )
         YoutubeVideoRepository.insert(video)
         return video
diff --git a/src/types.py b/src/types.py
index b792b46..7eb8fbe 100644
--- a/src/types.py
+++ b/src/types.py
@@ -1,5 +1,5 @@
 from graphene import ObjectType, Field, String, List, Int
-from src.services import TeamService
+from datetime import datetime
 
 
 class TeamType(ObjectType):
     """
@@ -88,6 +88,7 @@ class GameType(ObjectType):
     - `time`: The time of the game. (optional)
     - `box_score`: The box score of the game.
     - `score_breakdown`: The score breakdown of the game.
+    - `ticket_link`: The ticket link of the game. (optional)
     """
 
     id = String(required=False)
@@ -104,11 +105,11 @@ class GameType(ObjectType):
     score_breakdown = List(List(String), required=False)
     team = Field(TeamType, required=False)
     utc_date = String(required=False)
-
+    ticket_link = String(required=False)
 
     def __init__(
-        self, id, city, date, gender, location, opponent_id, result, sport, state, time, box_score=None, score_breakdown=None, utc_date=None
+        self, id, city, date, gender, location, opponent_id, result, sport, state, time, box_score=None, score_breakdown=None, utc_date=None, ticket_link=None
     ):
-        self.id = id 
+        self.id = id
         self.city = city
         self.date = date
         self.gender = gender
@@ -121,7 +122,7 @@
         self.box_score = box_score
         self.score_breakdown = score_breakdown
         self.utc_date = utc_date
-
+        self.ticket_link = ticket_link
 
     @staticmethod
     def team_to_team_type(team_obj):
         if team_obj is None:
@@ -138,7 +139,7 @@ def resolve_team(parent, info):
         # getting team id - team could be None in older data
         team_id = parent.team if parent.team is not None else parent.opponent_id
         if team_id and isinstance(team_id, str):
-            # promise to get team object once the dataloader is ready 
+            # promise to get team object once the dataloader is ready
             promise = info.context["team_loader"].load(team_id)
             return promise.then(GameType.team_to_team_type)
         return None
@@ -151,17 +152,50 @@ class YoutubeVideoType(ObjectType):
     - id: The YouTube video ID (optional).
     - title: The title of the video.
     - description: The description of the video.
-    - thumbnail: The URL of the video's thumbnail.
+    - thumbnail: The URL of the video's thumbnail. (optional)
     - url: The URL to the video.
     - published_at: The date and time the video was published.
+    - duration: The duration of the video (optional).
+    - sportsType: The sport type extracted from the video title.
     """
 
     id = String(required=False)
     title = String(required=True)
     description = String(required=True)
     thumbnail = String(required=True)
-    b64_thumbnail = String(required=True)
+    b64_thumbnail = String(required=False)
     url = String(required=True)
     published_at = String(required=True)
+    duration = String(required=False)
+    sportsType = String(required=False)
+
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def resolve_sportsType(video, info):
+        """
+        Resolver to extract the sport type from the video title.
+        """
+        from src.utils.helpers import extract_sport_from_title
+        return extract_sport_from_title(video.title)
+
+
+class ArticleType(ObjectType):
+    """
+    A GraphQL type representing a news article.
+
+    Attributes:
+    - title: The title of the article
+    - image: The filename of the article's main image
+    - sports_type: The specific sport category
+    - published_at: The publication date
+    - url: The URL to the full article
+    """
+    id = String()
+    title = String(required=True)
+    image = String()
+    sports_type = String(required=True)
+    published_at = String(required=True)
+    url = String(required=True)
+
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            setattr(self, key, value)
diff --git a/src/utils/constants.py b/src/utils/constants.py
index c65b20f..81f0414 100644
--- a/src/utils/constants.py
+++ b/src/utils/constants.py
@@ -40,6 +40,9 @@
 # The tag for the box score
 BOX_SCORE_TAG = ".sidearm-schedule-game-links-boxscore a"
 
+# The tag for the game ticket link
+GAME_TICKET_LINK = ".sidearm-schedule-game-links-tickets a"
+
 # HTML Tags
 TAG_TABLE = 'table'
 TAG_SECTION = 'section'
@@ -125,4 +128,7 @@
 CHANNEL_ID = "UClSQOi2gnn9bi7mcgQrAVKA"
 
 # The maximum number of videos to retrieve
-VIDEO_LIMIT = 20
\ No newline at end of file
+VIDEO_LIMIT = 20
+
+# The tag for an article's main image
+ARTICLE_IMG_TAG = ".dom-art-container img"
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
index 0866f79..ba1ea4c 100644
--- a/src/utils/helpers.py
+++ b/src/utils/helpers.py
@@ -3,6 +3,7 @@
 from PIL import Image
 from io import BytesIO
 from collections import Counter
+import re
 
 
 def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
@@ -54,4 +55,139 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
         return hex_color
     except Exception as e:
         logging.error(f"Error in get_dominant_color for {image_url}: {e}")
-        return default_color
\ No newline at end of file
+        return default_color
+
+def normalize_game_data(data: dict):
+    """
+    Normalize placeholder values like TBA/TBD into None.
+    """
+    placeholders = {"TBA", "TBD", "tba", "tbd"}
+
+    for field in ["time", "city", "state"]:
+        if data.get(field) in placeholders:
+            data[field] = None
+
+    return data
+
+def is_tournament_placeholder_team(team_name: str):
+    """
+    Check if a team name is a tournament placeholder.
+    """
+    placeholder_team_names = [
+        "First Round", "Second Round", "Third Round", "Quarterfinals",
+        "College Cup Semifinals", "College Cup Championship Game",
+        "ECAC Hockey First Round", "ECAC Hockey Quarterfinals",
+        "ECAC Hockey Semifinals", "ECAC Hockey Championship Game",
+        "Regional Semifinals", "Regional Championship", "National Semifinals",
+        "TBD", "National Championship", "NCAA Wrestling Championships",
+        "NCAA Northeast Regional Championships",
+        "NCAA Cross Country Championships",
+    ]
+    return team_name in placeholder_team_names
+
+def is_cornell_loss(result: str):
+    """
+    Check if the result indicates a Cornell loss.
+    """
+    if not result:
+        return False
+
+    # "L" must match as a standalone token (e.g., "L 2-3"); a plain substring
+    # check would misread results containing words like "FINAL" as losses
+    if re.search(r"\bL\b", result):
+        return True
+
+    # Other common loss indicators in result strings
+    loss_indicators = ["Loss", "loss", "Defeated", "defeated"]
+    return any(indicator in result for indicator in loss_indicators)
+
+def extract_sport_from_title(title):
+    """
+    Extracts the sport type from a YouTube video title.
+
+    Args:
+        title (str): The title of the YouTube video
+
+    Returns:
+        str: The sport type if found, None otherwise
+    """
+    if not title:
+        return None
+
+    title_lower = title.lower()
+
+    sport_patterns = [
+        # Ice Hockey
+        (r"ice\s+hockey", "Ice Hockey"),
+        (r"women'?s\s+ice\s+hockey", "Ice Hockey"),
+        (r"men'?s\s+ice\s+hockey", "Ice Hockey"),
+        # Field Hockey (checked before plain Hockey so the longer match wins)
+        (r"field\s+hockey", "Field Hockey"),
+        # Hockey
+        (r"\bhockey\b", "Ice Hockey"),
+        # Basketball
+        (r"basketball", "Basketball"),
+        # Sprint Football (checked before plain Football so the longer match wins)
+        (r"sprint\s+football", "Sprint Football"),
+        # Football
+        (r"\bfootball\b", "Football"),
+        # Soccer
+        (r"\bsoccer\b", "Soccer"),
+        # Volleyball
+        (r"volleyball", "Volleyball"),
+        # Wrestling
+        (r"wrestling", "Wrestling"),
+    ]
+
+    for pattern, sport_name in sport_patterns:
+        if re.search(pattern, title_lower):
+            return sport_name
+
+    if "ice" in title_lower and ("hockey" in title_lower or "cornell" in title_lower):
+        return "Ice Hockey"
+
+    return None
+
+def extract_sport_type_from_title(title: str):
+    """
+    Extract the sport type from an article title by matching against known sports.
+
+    Args:
+        title (str): The article title to analyze
+
+    Returns:
+        str: The sport name if found, otherwise "sports" as a default
+    """
+    from .constants import SPORT_URLS
+
+    if not title:
+        return "sports"
+
+    # Get all unique sport names from SPORT_URLS
+    sport_names = set()
+    for sport_data in SPORT_URLS.values():
+        sport_name = sport_data["sport"].strip()
+        if sport_name:
+            sport_names.add(sport_name)
+
+    # Sort by length (longest first) to match "Swimming & Diving" before "Swimming"
+    sport_names_sorted = sorted(sport_names, key=len, reverse=True)
+
+    title_lower = title.lower()
+
+    for sport_name in sport_names_sorted:
+        if sport_name.lower() in title_lower:
+            return sport_name
+
+    # Special mappings for common variations in titles,
+    # only checked if no exact match was found above
+    # e.g., "Hockey" in a title should match "Ice Hockey" in the sport names
+    special_mappings = {
+        "hockey": "Ice Hockey",  # "Men's Hockey" or "Women's Hockey" -> "Ice Hockey"
+    }
+
+    for keyword, sport_name in special_mappings.items():
+        if keyword in title_lower and sport_name in sport_names:
+            return sport_name
+
+    return "sports"
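To round off, a hedged usage sketch of the two title helpers (the titles are made up, and the article helper's exact output depends on the contents of SPORT_URLS):

```python
from src.utils.helpers import extract_sport_from_title, extract_sport_type_from_title

# YouTube titles: regex patterns, returns None when nothing matches
assert extract_sport_from_title("Cornell Men's Ice Hockey vs. Harvard") == "Ice Hockey"
assert extract_sport_from_title("Campus tour 2025") is None

# Article titles: matched against SPORT_URLS names, falls back to "sports"
print(extract_sport_type_from_title("Women's Hockey Sweeps Weekend"))  # likely "Ice Hockey"
print(extract_sport_type_from_title("Alumni Weekend Recap"))           # "sports"
```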