From 3d88699951eade0abe1cc9bf7adaafd196cde3da Mon Sep 17 00:00:00 2001
From: Mateo
Date: Mon, 24 Mar 2025 20:23:11 -0400
Subject: [PATCH 01/30] implement articles from daily sun

---
 README.md                              |  4 +-
 app.py                                 | 24 ++++++++-
 src/database.py                        |  3 +-
 src/models/__init__.py                 |  3 +-
 src/models/article.py                  | 56 ++++++++++++++++++++
 src/mutations/__init__.py              |  3 +-
 src/mutations/create_article.py        | 27 ++++++++++
 src/queries/__init__.py                |  1 +
 src/queries/article_query.py           | 12 +++++
 src/repositories/__init__.py           |  1 +
 src/repositories/article_repository.py | 69 ++++++++++++++++++++++++
 src/schema.py                          |  7 +--
 src/scrapers/daily_sun_scrape.py       | 59 +++++++++++++++++++++
 src/services/__init__.py               |  3 +-
 src/services/article_service.py        | 73 ++++++++++++++++++++++++
 src/types.py                           | 29 +++++++++-
 16 files changed, 363 insertions(+), 11 deletions(-)
 create mode 100644 src/models/article.py
 create mode 100644 src/mutations/create_article.py
 create mode 100644 src/queries/article_query.py
 create mode 100644 src/repositories/article_repository.py
 create mode 100644 src/scrapers/daily_sun_scrape.py
 create mode 100644 src/services/article_service.py

diff --git a/README.md b/README.md
index 5df5eb3..839e973 100644
--- a/README.md
+++ b/README.md
@@ -22,4 +22,6 @@ To start the project, run the following command in the terminal
 
 ## Setting up the database
 
-Add /graphql to the url to access the interactive GraphQL platform
\ No newline at end of file
+Create a Mongo database named `score_db` and another named `daily_sun_db`. A partnership with the Daily Sun gives us access to their articles, which we copy and paginate for the frontend.
+
+Add /graphql to the url to access the interactive GraphQL platform

diff --git a/app.py b/app.py
index 5debd7e..860dc89 100644
--- a/app.py
+++ b/app.py
@@ -7,6 +7,8 @@
 from src.schema import Query, Mutation
 from src.scrapers.games_scraper import fetch_game_schedule
 from src.scrapers.youtube_stats import fetch_videos
+from src.scrapers.daily_sun_scrape import fetch_news
+from src.services.article_service import ArticleService
 from src.utils.team_loader import TeamLoader
 
 app = Flask(__name__)
@@ -42,6 +44,11 @@ def parse_args():
         action="store_true",
         help="Skips scraping tasks if set, useful for frontend development.",
     )
+    parser.add_argument(
+        "--no-daily-sun",
+        action="store_true",
+        help="Skips the Daily Sun article scraping tasks if set.",
+    )
     return parser.parse_args()
 
 args = parse_args()
@@ -52,7 +59,7 @@ def scrape_schedules():
         logging.info("Scraping game schedules...")
         fetch_game_schedule()
 
-    @scheduler.task("interval", id="scrape_schedules", seconds=43200)
+    @scheduler.task("interval", id="scrape_videos", seconds=43200)  # 12 hours
     def scrape_videos():
         logging.info("Scraping YouTube videos...")
         fetch_videos()
@@ -60,5 +67,20 @@ def scrape_videos():
     scrape_schedules()
     scrape_videos()
 
+if not args.no_daily_sun:
+    @scheduler.task("interval", id="scrape_daily_sun", seconds=3600)
+    def scrape_daily_sun():
+        logging.info("Getting Daily Sun Sports News...")
+        fetch_news()
+
+    @scheduler.task("interval", id="cleanse_daily_sun_db", seconds=604800)  # 1 week
+    def cleanse_daily_sun_db():
+        logging.info("Cleaning the Daily Sun database of old articles...")
+        ArticleService.cleanse_old_articles()
+
+    scrape_daily_sun()
+    cleanse_daily_sun_db()
+
+
 if __name__ == "__main__":
     app.run(debug=True, host="0.0.0.0", port=8000)
\ No newline at end of file

diff --git a/src/database.py b/src/database.py
index 19801e0..b9c25e5 100644
--- a/src/database.py
+++ b/src/database.py
@@ -6,7 +6,7 @@ if
os.getenv("STAGE") == "local": file_name = "ca-certificate.crt" - use_tls = os.getenv("MONGO_URI") != "mongodb://localhost:27017/" + use_tls = "localhost" not in os.getenv("MONGO_URI") else: file_name = "/etc/ssl/ca-certificate.crt" use_tls = True @@ -17,3 +17,4 @@ client = MongoClient(os.getenv("MONGO_URI")) db = client[os.getenv("MONGO_DB", "score_db")] +daily_sun_db = client[os.getenv("DAILY_SUN_DB", "daily_sun_db")] diff --git a/src/models/__init__.py b/src/models/__init__.py index ab83d25..efbf4e5 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -1,3 +1,4 @@ from .game import Game from .team import Team -from .youtube_video import YoutubeVideo \ No newline at end of file +from .youtube_video import YoutubeVideo +from .article import Article \ No newline at end of file diff --git a/src/models/article.py b/src/models/article.py new file mode 100644 index 0000000..bfcb8e1 --- /dev/null +++ b/src/models/article.py @@ -0,0 +1,56 @@ +from bson.objectid import ObjectId +from datetime import datetime + +class Article: + """ + A model representing a news article. + + Attributes: + - title: The title of the article + - image: The filename of the article's main image + - sports_type: The specific sport category + - published_at: The publication date + - url: The URL to the full article + - slug: Unique identifier from the source + - created_at: When the article was added to our DB + """ + def __init__(self, title, sports_type, published_at, url, slug, image=None, id=None, created_at=None): + self.id = id if id else str(ObjectId()) + self.title = title + self.image = image + self.sports_type = sports_type + self.published_at = published_at + self.url = url + self.slug = slug + self.created_at = created_at if created_at else datetime.now() + + def to_dict(self): + """ + Converts the Article object to a dictionary format for MongoDB storage. + """ + return { + "_id": self.id, + "title": self.title, + "image": self.image, + "sports_type": self.sports_type, + "published_at": self.published_at, + "url": self.url, + "slug": self.slug, + "created_at": self.created_at + } + + @staticmethod + def from_dict(data): + """ + Converts a MongoDB document to an Article object. 
+ """ + return Article( + id=data.get("_id"), + title=data.get("title"), + image=data.get("image"), + sports_type=data.get("sports_type"), + published_at=data.get("published_at"), + url=data.get("url"), + slug=data.get("slug"), + created_at=data.get("created_at") + ) \ No newline at end of file diff --git a/src/mutations/__init__.py b/src/mutations/__init__.py index 3fd3a8a..3df8e4d 100644 --- a/src/mutations/__init__.py +++ b/src/mutations/__init__.py @@ -1,3 +1,4 @@ from .create_game import CreateGame from .create_team import CreateTeam -from .create_youtube_video import CreateYoutubeVideo \ No newline at end of file +from .create_youtube_video import CreateYoutubeVideo +from .create_article import CreateArticle \ No newline at end of file diff --git a/src/mutations/create_article.py b/src/mutations/create_article.py new file mode 100644 index 0000000..1e0a03b --- /dev/null +++ b/src/mutations/create_article.py @@ -0,0 +1,27 @@ +from graphene import Mutation, String, Field +from src.types import ArticleType +from src.services.article_service import ArticleService + +class CreateArticle(Mutation): + class Arguments: + title = String(required=True) + sports_type = String(required=True) + published_at = String(required=True) + url = String(required=True) + slug = String(required=True) + image = String(required=False) + + article = Field(lambda: ArticleType) + + def mutate(self, info, title, sports_type, published_at, url, slug, image=None): + from datetime import datetime + article_data = { + "title": title, + "sports_type": sports_type, + "published_at": datetime.fromisoformat(published_at), + "url": url, + "slug": slug, + "image": image + } + new_article = ArticleService.create_article(article_data) + return CreateArticle(article=new_article) \ No newline at end of file diff --git a/src/queries/__init__.py b/src/queries/__init__.py index f345409..fdf2f41 100644 --- a/src/queries/__init__.py +++ b/src/queries/__init__.py @@ -1,3 +1,4 @@ from .game_query import GameQuery from .team_query import TeamQuery from .youtube_video_query import YoutubeVideoQuery +from .article_query import ArticleQuery \ No newline at end of file diff --git a/src/queries/article_query.py b/src/queries/article_query.py new file mode 100644 index 0000000..52e6cbc --- /dev/null +++ b/src/queries/article_query.py @@ -0,0 +1,12 @@ +from graphene import ObjectType, List, String +from src.services.article_service import ArticleService +from src.types import ArticleType + +class ArticleQuery(ObjectType): + articles = List(ArticleType, sports_type=String()) + + def resolve_articles(self, info, sports_type=None): + """ + Resolver for retrieving news articles, optionally filtered by sports_type. 
+ """ + return ArticleService.get_articles(sports_type) \ No newline at end of file diff --git a/src/repositories/__init__.py b/src/repositories/__init__.py index 1c18bb7..f9c6252 100644 --- a/src/repositories/__init__.py +++ b/src/repositories/__init__.py @@ -1,3 +1,4 @@ from .game_repository import GameRepository from .team_repository import TeamRepository from .youtube_video_repository import YoutubeVideoRepository +from .article_repository import ArticleRepository \ No newline at end of file diff --git a/src/repositories/article_repository.py b/src/repositories/article_repository.py new file mode 100644 index 0000000..1a30dc5 --- /dev/null +++ b/src/repositories/article_repository.py @@ -0,0 +1,69 @@ +from src.database import daily_sun_db +from src.models.article import Article +from pymongo import UpdateOne +from datetime import datetime, timedelta + +class ArticleRepository: + @staticmethod + def upsert(article): + """ + Upsert an article into the 'news_articles' collection in MongoDB. + """ + article_collection = daily_sun_db["news_articles"] + article_collection.update_one( + {"slug": article.slug}, + {"$set": article.to_dict()}, + upsert=True + ) + + @staticmethod + def bulk_upsert(articles): + """ + Bulk upsert articles into the 'news_articles' collection based on slug. + """ + if not articles: + return + + article_collection = daily_sun_db["news_articles"] + operations = [ + UpdateOne( + {"slug": article.slug}, + {"$set": article.to_dict()}, + upsert=True + ) + for article in articles + ] + if operations: + article_collection.bulk_write(operations) + + @staticmethod + def find_recent(limit_days=3): + """ + Retrieve articles from the last N days, sorted by published_at descending. + """ + article_collection = daily_sun_db["news_articles"] + query = {"published_at": {"$gte": datetime.now() - timedelta(days=limit_days)}} + articles = article_collection.find(query).sort("published_at", -1) + return [Article.from_dict(article) for article in articles] + + @staticmethod + def find_by_sports_type(sports_type, limit_days=3): + """ + Retrieve articles by sports_type from the last N days, sorted by published_at descending. + """ + article_collection = daily_sun_db["news_articles"] + query = { + "sports_type": sports_type, + "published_at": {"$gte": datetime.now() - timedelta(days=limit_days)} + } + articles = article_collection.find(query).sort("published_at", -1) + return [Article.from_dict(article) for article in articles] + + @staticmethod + def delete_not_recent(limit_days=3): + """ + Delete articles older than N days, sorted by published_at descending. 
+ """ + article_collection = daily_sun_db["news_articles"] + query = {"published_at": {"$lt": datetime.now() - timedelta(days=limit_days)}} + article_collection.delete_many(query) \ No newline at end of file diff --git a/src/schema.py b/src/schema.py index 2cbbe69..0f3ae99 100644 --- a/src/schema.py +++ b/src/schema.py @@ -1,9 +1,9 @@ from graphene import ObjectType, Schema, Mutation -from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo -from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery +from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo, CreateArticle +from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery, ArticleQuery -class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ObjectType): +class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ArticleQuery, ObjectType): pass @@ -11,6 +11,7 @@ class Mutation(ObjectType): create_game = CreateGame.Field(description="Creates a new game.") create_team = CreateTeam.Field(description="Creates a new team.") create_youtube_video = CreateYoutubeVideo.Field(description="Creates a new youtube video.") + create_article = CreateArticle.Field(description="Creates a new article.") schema = Schema(query=Query, mutation=Mutation) diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py new file mode 100644 index 0000000..cd11c5a --- /dev/null +++ b/src/scrapers/daily_sun_scrape.py @@ -0,0 +1,59 @@ +import os +import requests +from datetime import datetime, timedelta +from dotenv import load_dotenv +from ..services import ArticleService +import logging + +load_dotenv() + + +def fetch_news(): + try: + url = os.getenv("DAILY_SUN_URL") + response = requests.get( + url, + headers={ + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + } + ) + response.raise_for_status() + data = response.json() + + # Current date and 3-day threshold + current_date = datetime.now() + three_days_ago = current_date - timedelta(days=3) + + # Process articles + articles_to_store = [] + for article in data.get("articles", []): + published_at = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S") + + if published_at >= three_days_ago: + sports_type = next( + (tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]), + "General" + ) + article_url = f"https://cornellsun.com/article/{article['slug']}" + + article_doc = { + "title": article["headline"], + "image": article["dominantMedia"]["title"] if article["dominantMedia"] else None, + "sports_type": sports_type, + "published_at": published_at, + "url": article_url, + "slug": article["slug"], + "created_at": datetime.now() + } + articles_to_store.append(article_doc) + + if articles_to_store: + ArticleService.create_articles_bulk(articles_to_store) + logging.info(f"Stored/Updated {len(articles_to_store)} recent articles") + else: + logging.info("No recent articles to store") + return True + + except Exception as e: + logging.error(f"Error fetching news: {str(e)}") + return False diff --git a/src/services/__init__.py b/src/services/__init__.py index 2ed3e7a..29b5c31 100644 --- a/src/services/__init__.py +++ b/src/services/__init__.py @@ -1,3 +1,4 @@ from .game_service import GameService from .team_service import TeamService -from .youtube_video_service import YoutubeVideoService \ No newline at end of file +from .youtube_video_service import YoutubeVideoService +from .article_service import ArticleService \ No newline at end of file diff 
--git a/src/services/article_service.py b/src/services/article_service.py
new file mode 100644
index 0000000..77da243
--- /dev/null
+++ b/src/services/article_service.py
@@ -0,0 +1,73 @@
+from src.database import daily_sun_db
+from src.models.article import Article
+from src.repositories.article_repository import ArticleRepository
+from datetime import datetime, timedelta
+import logging
+
+class ArticleService:
+    @staticmethod
+    def get_articles(sports_type=None):
+        """
+        Retrieve all articles from the last 3 days, optionally filtered by sports_type, sorted by published_at descending.
+        """
+        try:
+            if sports_type:
+                return ArticleRepository.find_by_sports_type(sports_type)
+            return ArticleRepository.find_recent()
+        except Exception as e:
+            logging.error(f"Error retrieving articles: {str(e)}")
+            return []
+
+    @staticmethod
+    def create_article(article_data):
+        """
+        Create a single article, store it in MongoDB, and return it.
+        """
+        try:
+            article = Article(
+                title=article_data["title"],
+                sports_type=article_data["sports_type"],
+                published_at=article_data["published_at"],
+                url=article_data["url"],
+                slug=article_data["slug"],
+                image=article_data.get("image")
+            )
+            # upsert does not return the document, so hand back the model instance
+            ArticleRepository.upsert(article)
+            return article
+        except Exception as e:
+            logging.error(f"Error creating article: {str(e)}")
+            return None
+
+    @staticmethod
+    def create_articles_bulk(articles_data):
+        """
+        Create or update multiple articles in bulk and store them in MongoDB.
+        """
+        try:
+            if not articles_data:
+                return
+            articles = [
+                Article(
+                    title=data["title"],
+                    sports_type=data["sports_type"],
+                    published_at=data["published_at"],
+                    url=data["url"],
+                    slug=data["slug"],
+                    image=data.get("image")
+                )
+                for data in articles_data
+            ]
+            ArticleRepository.bulk_upsert(articles)
+        except Exception as e:
+            logging.error(f"Error creating articles in bulk: {str(e)}")
+            raise
+
+    @staticmethod
+    def cleanse_old_articles():
+        """
+        Remove articles older than 5 days (the 3-day display window plus a buffer) from the database.
+        """
+        try:
+            ArticleRepository.delete_not_recent(limit_days=5)  # provide a buffer from the 3-day threshold
+        except Exception as e:
+            logging.error(f"Error cleansing old articles: {str(e)}")
+            raise
\ No newline at end of file

diff --git a/src/types.py b/src/types.py
index 830e1e2..88ceb36 100644
--- a/src/types.py
+++ b/src/types.py
@@ -1,5 +1,5 @@
 from graphene import ObjectType, Field, String, List, Int
-from src.services import TeamService
+from datetime import datetime
 
 class TeamType(ObjectType):
     """
@@ -159,4 +159,29 @@ class YoutubeVideoType(ObjectType):
 
     def __init__(self, **kwargs):
         for key, value in kwargs.items():
-            setattr(self, key, value)
\ No newline at end of file
+            setattr(self, key, value)
+
+class ArticleType(ObjectType):
+    """
+    A GraphQL type representing a news article.
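+
+    published_at is stored as a datetime but exposed as a String; the
+    constructor below converts it with datetime.isoformat(), so, for
+    example, datetime(2025, 3, 24, 20, 23) serializes as
+    "2025-03-24T20:23:00".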
+ + Attributes: + - title: The title of the article + - image: The filename of the article's main image + - sports_type: The specific sport category + - published_at: The publication date + - url: The URL to the full article + """ + id = String() + title = String(required=True) + image = String() + sports_type = String(required=True) + published_at = String(required=True) + url = String(required=True) + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + if key == "published_at" and isinstance(value, datetime): + setattr(self, key, value.isoformat()) + else: + setattr(self, key, value) \ No newline at end of file From cc9ebd1b3bf9ef36b7fc78e1423afbea26c7644a Mon Sep 17 00:00:00 2001 From: Mateo Date: Thu, 27 Mar 2025 00:48:11 -0400 Subject: [PATCH 02/30] update env template --- .env_template | 1 + 1 file changed, 1 insertion(+) diff --git a/.env_template b/.env_template index 49ea0bd..56b5add 100644 --- a/.env_template +++ b/.env_template @@ -2,3 +2,4 @@ YOUTUBE_API_KEY= MONGO_URI= MONGO_DB= STAGE= +DAILY_SUN_URL= \ No newline at end of file From 9d1792c7144cd841087a7df5d1662ae47da58291 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Fri, 12 Sep 2025 21:19:18 -0400 Subject: [PATCH 03/30] Added logic to prevent adding duplicate games when scraping --- src/database.py | 17 ++++++++++++++++- src/scrapers/games_scraper.py | 17 ++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/database.py b/src/database.py index d475437..1ec4096 100644 --- a/src/database.py +++ b/src/database.py @@ -48,7 +48,7 @@ def keep_connection_alive(): # Access the database db = client[os.getenv("MONGO_DB", "score_db")] - +print("Total games in DB:", db["game"].count_documents({})) def setup_database_indexes(): """Set up MongoDB indexes for optimal query performance""" @@ -65,6 +65,21 @@ def setup_database_indexes(): # Index for sorting operations game_collection.create_index([("date", -1)], background=True) + + # Index to have unique games so we won't add duplicates + game_collection.create_index( + [ + ("sport", 1), + ("gender", 1), + ("date", 1), + ("opponent_id", 1), + ("city", 1), + ("state", 1), + ("location", 1), + ], + unique=True, + background=True + ) print("✅ MongoDB indexes created successfully") except Exception as e: diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index e174a65..a92e87c 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -7,7 +7,7 @@ from src.utils.helpers import get_dominant_color import base64 import re -import html +from src.database import db import threading @@ -272,5 +272,20 @@ def process_game_data(game_data): "score_breakdown": game_data["score_breakdown"], "utc_date": utc_date_str } + + # update the game if it exists, otherwise insert it as a new game. 
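+    # The filter below mirrors the unique-index key fields, which is what
+    # makes the write idempotent: the first scrape inserts the game, and
+    # every later scrape updates the same document in place instead of
+    # raising a DuplicateKeyError.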
+ db.game.update_one( + { + "sport": game_data["sport"], + "gender": game_data["gender"], + "date": game_data["date"], + "opponent_id": game_data["opponent_id"], + "city": game_data["city"], + "state": game_data["state"], + "location": game_data["location"], + }, + {"$set": game_data}, + upsert=True + ) GameService.create_game(game_data) \ No newline at end of file From d205803c58c124ea8074ab2075189a6bdb634846 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Fri, 12 Sep 2025 21:49:40 -0400 Subject: [PATCH 04/30] Reworked service methods to check for duplicates and fixed game scraping issue --- src/scrapers/games_scraper.py | 15 --------------- src/services/game_service.py | 28 ++++++++++++++++++++++++++++ src/services/team_service.py | 21 +++++++++++++-------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index a92e87c..cd71dd0 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -273,19 +273,4 @@ def process_game_data(game_data): "utc_date": utc_date_str } - # update the game if it exists, otherwise insert it as a new game. - db.game.update_one( - { - "sport": game_data["sport"], - "gender": game_data["gender"], - "date": game_data["date"], - "opponent_id": game_data["opponent_id"], - "city": game_data["city"], - "state": game_data["state"], - "location": game_data["location"], - }, - {"$set": game_data}, - upsert=True - ) - GameService.create_game(game_data) \ No newline at end of file diff --git a/src/services/game_service.py b/src/services/game_service.py index 5463835..ae8381e 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -33,6 +33,34 @@ def create_game(data): opponent_id = data.get("opponent_id") if not TeamService.get_team_by_id(opponent_id): raise ValueError(f"Opponent team with id {opponent_id} does not exist.") + + existing = GameService.get_game_by_key_fields( + data["city"], + data["date"], + data["gender"], + data["location"], + data["opponent_id"], + data["sport"], + data["state"], + ) + + #check if game already exists + if existing: + if isinstance(existing, list) and existing: + existing = existing[0] + + # update existing game + updates = { + "time": data.get("time"), + "result": data.get("result"), + "box_score": data.get("box_score"), + "score_breakdown": data.get("score_breakdown"), + "utc_date": data.get("utc_date"), + } + GameService.update_game(existing.id, updates) + return existing + + # create new game if it doesn't exist game = Game(**data) GameRepository.insert(game) return game diff --git a/src/services/team_service.py b/src/services/team_service.py index 57598f8..7127d21 100644 --- a/src/services/team_service.py +++ b/src/services/team_service.py @@ -1,7 +1,6 @@ from src.repositories import TeamRepository from src.models.team import Team - class TeamService: @staticmethod def get_all_teams(): @@ -13,14 +12,20 @@ def get_all_teams(): @staticmethod def create_team(team_data): """ - Create a new team. - - Args: - team_data (dict): The data for the new team. - - Returns: - Team: The created team. + Create a new team, or update it if it already exists. 
""" + name = team_data.get("name") + if not name: + raise ValueError("Team name is required to create a team.") + + existing = TeamService.get_team_by_name(name) + if existing: + if isinstance(existing, list) and existing: + existing = existing[0] + + TeamService.update_team(existing.id, team_data) + return existing + team = Team(**team_data) TeamRepository.insert(team) return team From e1f080c42d66a3c62645570fb4db0adce2aa5a6e Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Sat, 20 Sep 2025 19:04:43 -0400 Subject: [PATCH 05/30] Added tournament handling and TBD/TBA updates for games --- src/database.py | 14 ++++- src/repositories/game_repository.py | 78 ++++++++++++++++++++++++++ src/scrapers/games_scraper.py | 35 ++++++++++-- src/services/game_service.py | 86 ++++++++++++++++++++--------- src/utils/helpers.py | 42 +++++++++++++- 5 files changed, 220 insertions(+), 35 deletions(-) diff --git a/src/database.py b/src/database.py index 1ec4096..5ea74a4 100644 --- a/src/database.py +++ b/src/database.py @@ -73,13 +73,23 @@ def setup_database_indexes(): ("gender", 1), ("date", 1), ("opponent_id", 1), - ("city", 1), ("state", 1), - ("location", 1), ], unique=True, background=True ) + + # Additional index for tournament games (without opponent_id) + game_collection.create_index( + [ + ("sport", 1), + ("gender", 1), + ("date", 1), + ("city", 1), + ("state", 1), + ], + background=True + ) print("✅ MongoDB indexes created successfully") except Exception as e: diff --git a/src/repositories/game_repository.py b/src/repositories/game_repository.py index bfe5d08..4f05b2f 100644 --- a/src/repositories/game_repository.py +++ b/src/repositories/game_repository.py @@ -130,6 +130,56 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state): return [Game.from_dict(game) for game in games] + @staticmethod + def find_by_tournament_key_fields(city, date, gender, location, sport, state): + """ + Find tournament games by location and date (excluding opponent_id). + This is used when we need to find a tournament game that might have a placeholder team. + Uses flexible matching to handle TBD/TBA values. + """ + game_collection = db["game"] + + # Build flexible query that can handle TBD/TBA values + query = { + "date": date, + "gender": gender, + "sport": sport, + } + + # For city, state, and location, use flexible matching + # This allows finding games even when TBD/TBA values change to real values + city_conditions = [] + if city: + city_conditions.append(city) + else: + city_conditions = [None] + + state_conditions = [] + if state: + state_conditions.append(state) + else: + state_conditions = [None] + + location_conditions = [] + if location: + location_conditions.append(location) + else: + location_conditions = [None] + + query["city"] = {"$in": city_conditions} + query["state"] = {"$in": state_conditions} + query["location"] = {"$in": location_conditions} + + games = list(game_collection.find(query)) + + if not games: + return None + + if len(games) == 1: + return Game.from_dict(games[0]) + + return [Game.from_dict(game) for game in games] + @staticmethod def find_by_sport(sport): """ @@ -156,3 +206,31 @@ def find_by_sport_gender(sport, gender): game_collection = db["game"] games = game_collection.find({"sport": sport, "gender": gender}) return [Game.from_dict(game) for game in games] + + @staticmethod + def find_games_by_sport_gender_after_date(sport, gender, after_date=None): + """ + Find games for a specific sport and gender, optionally after a specific date. 
+ This method returns raw game data without team information. + """ + game_collection = db["game"] + + query = { + "sport": sport, + "gender": gender + } + + if after_date: + query["utc_date"] = {"$gt": after_date} + + games = game_collection.find(query) + return [Game.from_dict(game) for game in games] + + @staticmethod + def delete_games_by_ids(game_ids): + """ + Delete games by their IDs. + """ + game_collection = db["game"] + result = game_collection.delete_many({"_id": {"$in": game_ids}}) + return result.deleted_count diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index cd71dd0..da43692 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -4,7 +4,7 @@ from src.utils.convert_to_utc import convert_to_utc from src.utils.constants import * from src.scrapers.game_details_scrape import scrape_game -from src.utils.helpers import get_dominant_color +from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss import base64 import re from src.database import db @@ -164,6 +164,8 @@ def process_game_data(game_data): Args: game_data (dict): A dictionary containing the game data. """ + + game_data = normalize_game_data(game_data) location_data = game_data["location"].split("\n") geo_location = location_data[0] if (",") not in geo_location: @@ -232,16 +234,28 @@ def process_game_data(game_data): if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final): game_data["score_breakdown"] = game_data["score_breakdown"][::-1] - # finds any existing game with the same key fields regardless of time - curr_game = GameService.get_game_by_key_fields( + # Try to find by tournament key fields to handle placeholder teams + curr_game = GameService.get_game_by_tournament_key_fields( city, game_data["date"], game_data["gender"], location, - team.id, game_data["sport"], state ) + + # If no tournament game found, try the regular lookup with opponent_id + if not curr_game: + curr_game = GameService.get_game_by_key_fields( + city, + game_data["date"], + game_data["gender"], + location, + team.id, + game_data["sport"], + state + ) + if isinstance(curr_game, list): if curr_game: curr_game = curr_game[0] @@ -253,8 +267,19 @@ def process_game_data(game_data): "result": game_data["result"], "box_score": game_data["box_score"], "score_breakdown": game_data["score_breakdown"], - "utc_date": utc_date_str + "utc_date": utc_date_str, + "city": city, + "location": location, + "state": state } + + current_team = TeamService.get_team_by_id(curr_game.opponent_id) + if current_team and is_tournament_placeholder_team(current_team.name): + updates["opponent_id"] = team.id + + if is_cornell_loss(game_data["result"]) and game_data["utc_date"]: + GameService.handle_tournament_loss(game_data["sport"], game_data["gender"], game_data["utc_date"]) + GameService.update_game(curr_game.id, updates) return diff --git a/src/services/game_service.py b/src/services/game_service.py index ae8381e..6352dd0 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -1,6 +1,7 @@ from src.repositories.game_repository import GameRepository from src.models.game import Game from src.services.team_service import TeamService +from src.utils.helpers import is_tournament_placeholder_team class GameService: @@ -33,34 +34,7 @@ def create_game(data): opponent_id = data.get("opponent_id") if not TeamService.get_team_by_id(opponent_id): raise ValueError(f"Opponent team with id {opponent_id} does not 
exist.") - - existing = GameService.get_game_by_key_fields( - data["city"], - data["date"], - data["gender"], - data["location"], - data["opponent_id"], - data["sport"], - data["state"], - ) - #check if game already exists - if existing: - if isinstance(existing, list) and existing: - existing = existing[0] - - # update existing game - updates = { - "time": data.get("time"), - "result": data.get("result"), - "box_score": data.get("box_score"), - "score_breakdown": data.get("score_breakdown"), - "utc_date": data.get("utc_date"), - } - GameService.update_game(existing.id, updates) - return existing - - # create new game if it doesn't exist game = Game(**data) GameRepository.insert(game) return game @@ -97,6 +71,16 @@ def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, sta city, date, gender, location, opponent_id, sport, state ) + @staticmethod + def get_game_by_tournament_key_fields(city, date, gender, location, sport, state): + """ + Retrieve a tournament game by location and date (excluding opponent_id). + This is used when we need to find a tournament game that might have a placeholder team. + """ + return GameRepository.find_by_tournament_key_fields( + city, date, gender, location, sport, state + ) + @staticmethod def get_games_by_sport(sport): """ @@ -117,3 +101,51 @@ def get_games_by_sport_gender(sport, gender): Retrieves all game by its sport and gender. """ return GameRepository.find_by_sport_gender(sport, gender) + + @staticmethod + def get_tournament_games_by_sport_gender(sport, gender, after_date=None): + """ + Find tournament games (with placeholder team names) for a specific sport and gender. + Optionally filter by games after a specific date. + """ + games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) + tournament_games = [] + + for game in games: + team = TeamService.get_team_by_id(game.opponent_id) + if team and is_tournament_placeholder_team(team.name): + tournament_games.append(game) + + return tournament_games + + @staticmethod + def delete_tournament_games_by_sport_gender(sport, gender, after_date=None): + """ + Delete tournament games (with placeholder team names) for a specific sport and gender. + Optionally filter by games after a specific date. + """ + games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) + tournament_game_ids = [] + + for game in games: + team = TeamService.get_team_by_id(game.opponent_id) + if team and is_tournament_placeholder_team(team.name): + tournament_game_ids.append(game.id) + + if tournament_game_ids: + return GameRepository.delete_games_by_ids(tournament_game_ids) + return 0 + + @staticmethod + def handle_tournament_loss(sport, gender, loss_date): + """ + Handle when a Cornell team loses in a tournament by deleting future tournament games. 
+ + Args: + sport (str): The sport of the team that lost + gender (str): The gender of the team that lost + loss_date (datetime): The date when the team lost + """ + deleted_count = GameService.delete_tournament_games_by_sport_gender(sport, gender, loss_date) + print(f"Deleted {deleted_count} future tournament games for {gender} {sport} after loss on {loss_date}") + return deleted_count diff --git a/src/utils/helpers.py b/src/utils/helpers.py index 0866f79..cb3d759 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -54,4 +54,44 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50): return hex_color except Exception as e: logging.error(f"Error in get_dominant_color for {image_url}: {e}") - return default_color \ No newline at end of file + return default_color + +def normalize_game_data(data: dict): + """ + Normalize placeholder values like TBA/TBD into None. + """ + placeholders = {"TBA", "TBD", "tba", "tbd"} + + for field in ["time", "city", "state"]: + if data.get(field) in placeholders: + data[field] = None + + return data + +def is_tournament_placeholder_team(team_name: str): + """ + Check if a team name is a tournament placeholder. + """ + + placeholder_team_names = [ + "First Round", "Second Round", "Third Round", "Quarterfinals", + "College Cup Semifinals", "College Cup Championship Game", + "ECAC Hockey First Round", "ECAC Hockey Quarterfinals", + "ECAC Hockey Semifinals", "ECAC Hockey Championship Game", + "Regional Semifinals", "Regional Championship", "National Semifinals", + "TBD", "National Championship", "NCAA Wrestling Championships", "NCAA Northeast Regional CHampionships", + "NCAA Cross Country Championships", + ] + return team_name in placeholder_team_names + +def is_cornell_loss(result: str): + """ + Check if the result indicates a Cornell loss. 
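+
+    Illustrative behavior: is_cornell_loss("L, 2-3") returns True and
+    is_cornell_loss("W, 4-1") returns False. The matching is substring-based,
+    so a result phrased like "Cornell defeated Yale" also registers as a
+    loss and may need review upstream.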
+ """ + + if not result: + return False + + # Common loss indicators in result strings + loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"] + return any(indicator in result for indicator in loss_indicators) \ No newline at end of file From e2ac05f1c076dde746f864127ca880cac6114ef0 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Sat, 20 Sep 2025 19:10:14 -0400 Subject: [PATCH 06/30] remove comments that trigger alerts --- src/services/game_service.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/services/game_service.py b/src/services/game_service.py index 6352dd0..95d31ed 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -147,5 +147,4 @@ def handle_tournament_loss(sport, gender, loss_date): loss_date (datetime): The date when the team lost """ deleted_count = GameService.delete_tournament_games_by_sport_gender(sport, gender, loss_date) - print(f"Deleted {deleted_count} future tournament games for {gender} {sport} after loss on {loss_date}") return deleted_count From 756ec32d0ee21db6caa8a261e8072c2654491ecd Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Sat, 20 Sep 2025 19:12:32 -0400 Subject: [PATCH 07/30] remove db game count logging --- src/database.py | 1 - src/services/team_service.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/database.py b/src/database.py index 5ea74a4..177a1ee 100644 --- a/src/database.py +++ b/src/database.py @@ -48,7 +48,6 @@ def keep_connection_alive(): # Access the database db = client[os.getenv("MONGO_DB", "score_db")] -print("Total games in DB:", db["game"].count_documents({})) def setup_database_indexes(): """Set up MongoDB indexes for optimal query performance""" diff --git a/src/services/team_service.py b/src/services/team_service.py index 7127d21..c961534 100644 --- a/src/services/team_service.py +++ b/src/services/team_service.py @@ -13,6 +13,11 @@ def get_all_teams(): def create_team(team_data): """ Create a new team, or update it if it already exists. + + Args: + team_data (dict): The data for the new team. + Returns: + Team: The created team. """ name = team_data.get("name") if not name: From 907fe84f41b2b296c67e2456659794384fe91ef9 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Wed, 24 Sep 2025 17:44:36 -0400 Subject: [PATCH 08/30] Revert "Merge pull request #36 from cuappdev/fix-duplicate-games" This reverts commit 721f0f8a06eb0e3dc8ada6db08d8d4aad685f716, reversing changes made to bd146590bf0e0690cbfb39ab34adbdb88ed7af68. 
--- src/database.py | 26 +--------- src/repositories/game_repository.py | 78 ----------------------------- src/scrapers/games_scraper.py | 39 +++------------ src/services/game_service.py | 59 ---------------------- src/services/team_service.py | 18 ++----- src/utils/helpers.py | 42 +--------------- 6 files changed, 13 insertions(+), 249 deletions(-) diff --git a/src/database.py b/src/database.py index 177a1ee..d475437 100644 --- a/src/database.py +++ b/src/database.py @@ -49,6 +49,7 @@ def keep_connection_alive(): # Access the database db = client[os.getenv("MONGO_DB", "score_db")] + def setup_database_indexes(): """Set up MongoDB indexes for optimal query performance""" try: @@ -64,31 +65,6 @@ def setup_database_indexes(): # Index for sorting operations game_collection.create_index([("date", -1)], background=True) - - # Index to have unique games so we won't add duplicates - game_collection.create_index( - [ - ("sport", 1), - ("gender", 1), - ("date", 1), - ("opponent_id", 1), - ("state", 1), - ], - unique=True, - background=True - ) - - # Additional index for tournament games (without opponent_id) - game_collection.create_index( - [ - ("sport", 1), - ("gender", 1), - ("date", 1), - ("city", 1), - ("state", 1), - ], - background=True - ) print("✅ MongoDB indexes created successfully") except Exception as e: diff --git a/src/repositories/game_repository.py b/src/repositories/game_repository.py index 4f05b2f..bfe5d08 100644 --- a/src/repositories/game_repository.py +++ b/src/repositories/game_repository.py @@ -130,56 +130,6 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state): return [Game.from_dict(game) for game in games] - @staticmethod - def find_by_tournament_key_fields(city, date, gender, location, sport, state): - """ - Find tournament games by location and date (excluding opponent_id). - This is used when we need to find a tournament game that might have a placeholder team. - Uses flexible matching to handle TBD/TBA values. - """ - game_collection = db["game"] - - # Build flexible query that can handle TBD/TBA values - query = { - "date": date, - "gender": gender, - "sport": sport, - } - - # For city, state, and location, use flexible matching - # This allows finding games even when TBD/TBA values change to real values - city_conditions = [] - if city: - city_conditions.append(city) - else: - city_conditions = [None] - - state_conditions = [] - if state: - state_conditions.append(state) - else: - state_conditions = [None] - - location_conditions = [] - if location: - location_conditions.append(location) - else: - location_conditions = [None] - - query["city"] = {"$in": city_conditions} - query["state"] = {"$in": state_conditions} - query["location"] = {"$in": location_conditions} - - games = list(game_collection.find(query)) - - if not games: - return None - - if len(games) == 1: - return Game.from_dict(games[0]) - - return [Game.from_dict(game) for game in games] - @staticmethod def find_by_sport(sport): """ @@ -206,31 +156,3 @@ def find_by_sport_gender(sport, gender): game_collection = db["game"] games = game_collection.find({"sport": sport, "gender": gender}) return [Game.from_dict(game) for game in games] - - @staticmethod - def find_games_by_sport_gender_after_date(sport, gender, after_date=None): - """ - Find games for a specific sport and gender, optionally after a specific date. - This method returns raw game data without team information. 
- """ - game_collection = db["game"] - - query = { - "sport": sport, - "gender": gender - } - - if after_date: - query["utc_date"] = {"$gt": after_date} - - games = game_collection.find(query) - return [Game.from_dict(game) for game in games] - - @staticmethod - def delete_games_by_ids(game_ids): - """ - Delete games by their IDs. - """ - game_collection = db["game"] - result = game_collection.delete_many({"_id": {"$in": game_ids}}) - return result.deleted_count diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index da43692..e174a65 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -4,10 +4,10 @@ from src.utils.convert_to_utc import convert_to_utc from src.utils.constants import * from src.scrapers.game_details_scrape import scrape_game -from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss +from src.utils.helpers import get_dominant_color import base64 import re -from src.database import db +import html import threading @@ -164,8 +164,6 @@ def process_game_data(game_data): Args: game_data (dict): A dictionary containing the game data. """ - - game_data = normalize_game_data(game_data) location_data = game_data["location"].split("\n") geo_location = location_data[0] if (",") not in geo_location: @@ -234,28 +232,16 @@ def process_game_data(game_data): if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final): game_data["score_breakdown"] = game_data["score_breakdown"][::-1] - # Try to find by tournament key fields to handle placeholder teams - curr_game = GameService.get_game_by_tournament_key_fields( + # finds any existing game with the same key fields regardless of time + curr_game = GameService.get_game_by_key_fields( city, game_data["date"], game_data["gender"], location, + team.id, game_data["sport"], state ) - - # If no tournament game found, try the regular lookup with opponent_id - if not curr_game: - curr_game = GameService.get_game_by_key_fields( - city, - game_data["date"], - game_data["gender"], - location, - team.id, - game_data["sport"], - state - ) - if isinstance(curr_game, list): if curr_game: curr_game = curr_game[0] @@ -267,19 +253,8 @@ def process_game_data(game_data): "result": game_data["result"], "box_score": game_data["box_score"], "score_breakdown": game_data["score_breakdown"], - "utc_date": utc_date_str, - "city": city, - "location": location, - "state": state + "utc_date": utc_date_str } - - current_team = TeamService.get_team_by_id(curr_game.opponent_id) - if current_team and is_tournament_placeholder_team(current_team.name): - updates["opponent_id"] = team.id - - if is_cornell_loss(game_data["result"]) and game_data["utc_date"]: - GameService.handle_tournament_loss(game_data["sport"], game_data["gender"], game_data["utc_date"]) - GameService.update_game(curr_game.id, updates) return @@ -297,5 +272,5 @@ def process_game_data(game_data): "score_breakdown": game_data["score_breakdown"], "utc_date": utc_date_str } - + GameService.create_game(game_data) \ No newline at end of file diff --git a/src/services/game_service.py b/src/services/game_service.py index 95d31ed..5463835 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -1,7 +1,6 @@ from src.repositories.game_repository import GameRepository from src.models.game import Game from src.services.team_service import TeamService -from src.utils.helpers import is_tournament_placeholder_team class GameService: @@ -34,7 +33,6 @@ def 
create_game(data): opponent_id = data.get("opponent_id") if not TeamService.get_team_by_id(opponent_id): raise ValueError(f"Opponent team with id {opponent_id} does not exist.") - game = Game(**data) GameRepository.insert(game) return game @@ -71,16 +69,6 @@ def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, sta city, date, gender, location, opponent_id, sport, state ) - @staticmethod - def get_game_by_tournament_key_fields(city, date, gender, location, sport, state): - """ - Retrieve a tournament game by location and date (excluding opponent_id). - This is used when we need to find a tournament game that might have a placeholder team. - """ - return GameRepository.find_by_tournament_key_fields( - city, date, gender, location, sport, state - ) - @staticmethod def get_games_by_sport(sport): """ @@ -101,50 +89,3 @@ def get_games_by_sport_gender(sport, gender): Retrieves all game by its sport and gender. """ return GameRepository.find_by_sport_gender(sport, gender) - - @staticmethod - def get_tournament_games_by_sport_gender(sport, gender, after_date=None): - """ - Find tournament games (with placeholder team names) for a specific sport and gender. - Optionally filter by games after a specific date. - """ - games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) - tournament_games = [] - - for game in games: - team = TeamService.get_team_by_id(game.opponent_id) - if team and is_tournament_placeholder_team(team.name): - tournament_games.append(game) - - return tournament_games - - @staticmethod - def delete_tournament_games_by_sport_gender(sport, gender, after_date=None): - """ - Delete tournament games (with placeholder team names) for a specific sport and gender. - Optionally filter by games after a specific date. - """ - games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) - tournament_game_ids = [] - - for game in games: - team = TeamService.get_team_by_id(game.opponent_id) - if team and is_tournament_placeholder_team(team.name): - tournament_game_ids.append(game.id) - - if tournament_game_ids: - return GameRepository.delete_games_by_ids(tournament_game_ids) - return 0 - - @staticmethod - def handle_tournament_loss(sport, gender, loss_date): - """ - Handle when a Cornell team loses in a tournament by deleting future tournament games. - - Args: - sport (str): The sport of the team that lost - gender (str): The gender of the team that lost - loss_date (datetime): The date when the team lost - """ - deleted_count = GameService.delete_tournament_games_by_sport_gender(sport, gender, loss_date) - return deleted_count diff --git a/src/services/team_service.py b/src/services/team_service.py index c961534..57598f8 100644 --- a/src/services/team_service.py +++ b/src/services/team_service.py @@ -1,6 +1,7 @@ from src.repositories import TeamRepository from src.models.team import Team + class TeamService: @staticmethod def get_all_teams(): @@ -12,25 +13,14 @@ def get_all_teams(): @staticmethod def create_team(team_data): """ - Create a new team, or update it if it already exists. - + Create a new team. + Args: team_data (dict): The data for the new team. + Returns: Team: The created team. 
""" - name = team_data.get("name") - if not name: - raise ValueError("Team name is required to create a team.") - - existing = TeamService.get_team_by_name(name) - if existing: - if isinstance(existing, list) and existing: - existing = existing[0] - - TeamService.update_team(existing.id, team_data) - return existing - team = Team(**team_data) TeamRepository.insert(team) return team diff --git a/src/utils/helpers.py b/src/utils/helpers.py index cb3d759..0866f79 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -54,44 +54,4 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50): return hex_color except Exception as e: logging.error(f"Error in get_dominant_color for {image_url}: {e}") - return default_color - -def normalize_game_data(data: dict): - """ - Normalize placeholder values like TBA/TBD into None. - """ - placeholders = {"TBA", "TBD", "tba", "tbd"} - - for field in ["time", "city", "state"]: - if data.get(field) in placeholders: - data[field] = None - - return data - -def is_tournament_placeholder_team(team_name: str): - """ - Check if a team name is a tournament placeholder. - """ - - placeholder_team_names = [ - "First Round", "Second Round", "Third Round", "Quarterfinals", - "College Cup Semifinals", "College Cup Championship Game", - "ECAC Hockey First Round", "ECAC Hockey Quarterfinals", - "ECAC Hockey Semifinals", "ECAC Hockey Championship Game", - "Regional Semifinals", "Regional Championship", "National Semifinals", - "TBD", "National Championship", "NCAA Wrestling Championships", "NCAA Northeast Regional CHampionships", - "NCAA Cross Country Championships", - ] - return team_name in placeholder_team_names - -def is_cornell_loss(result: str): - """ - Check if the result indicates a Cornell loss. - """ - - if not result: - return False - - # Common loss indicators in result strings - loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"] - return any(indicator in result for indicator in loss_indicators) \ No newline at end of file + return default_color \ No newline at end of file From 1de4e00579970911e650dddc319d9d85ae472a1f Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Wed, 24 Sep 2025 17:44:54 -0400 Subject: [PATCH 09/30] revert pr --- binary_segregation.py | 94 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 binary_segregation.py diff --git a/binary_segregation.py b/binary_segregation.py new file mode 100644 index 0000000..fbc538d --- /dev/null +++ b/binary_segregation.py @@ -0,0 +1,94 @@ +def getMaxCost(s): + """ + Calculate the maximum possible cost to segregate a binary string. + + Rules: + - A "1" can be moved to the right until it reaches the end or another "1" + - Cost = 1 + number of places moved + - Each "1" must be moved to its maximum possible position + + The strategy is to maximize the total cost by ensuring each '1' moves as far as possible. + We can do this by moving '1's one step at a time to maximize the number of operations. 
+ + Args: + s (str): Binary string containing only '0' and '1' + + Returns: + int: Maximum possible cost to segregate the string + """ + n = len(s) + ones_count = s.count('1') + + if ones_count == 0: + return 0 + + # Convert to list for easier manipulation + arr = list(s) + total_cost = 0 + + # Move each '1' to its maximum possible position + # We'll process from left to right and move each '1' as far right as possible + for i in range(n): + if arr[i] == '1': + # Find the rightmost position this '1' can move to + # It can move until it hits another '1' or the end + j = i + while j < n - 1 and arr[j + 1] != '1': + # Move this '1' one position to the right + arr[j], arr[j + 1] = arr[j + 1], arr[j] + j += 1 + total_cost += 1 + 1 # Cost = 1 + distance (distance = 1 for each step) + + return total_cost + + +def getMaxCostOptimized(s): + """ + Optimized version that calculates cost without explicitly tracking positions. + """ + n = len(s) + total_cost = 0 + ones_count = 0 + + # Process from right to left + for i in range(n - 1, -1, -1): + if s[i] == '1': + # This '1' can move to position (n - 1 - ones_count) + # Distance = (n - 1 - ones_count) - i + distance = (n - 1 - ones_count) - i + if distance > 0: + total_cost += 1 + distance + ones_count += 1 + + return total_cost + + +# Test with the provided example +if __name__ == "__main__": + # Test case from the problem + s = "110100" + result = getMaxCost(s) + print(f"Input: {s}") + print(f"Maximum cost: {result}") + + # Test with optimized version + result_opt = getMaxCostOptimized(s) + print(f"Optimized result: {result_opt}") + + # Additional test cases + test_cases = [ + "110100", # Expected: 13 + "111000", # All 1s at start + "000111", # All 1s at end + "101010", # Alternating + "100000", # Single 1 at start + "000001", # Single 1 at end + "111111", # All 1s + "000000", # All 0s + ] + + print("\nTesting additional cases:") + for test in test_cases: + cost = getMaxCost(test) + cost_opt = getMaxCostOptimized(test) + print(f"{test}: {cost} (both methods: {cost == cost_opt})") From bbe2408952872197c75dff80561166cef2180a98 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Wed, 24 Sep 2025 17:45:17 -0400 Subject: [PATCH 10/30] revert pr --- binary_segregation.py | 94 ------------------------------------------- 1 file changed, 94 deletions(-) delete mode 100644 binary_segregation.py diff --git a/binary_segregation.py b/binary_segregation.py deleted file mode 100644 index fbc538d..0000000 --- a/binary_segregation.py +++ /dev/null @@ -1,94 +0,0 @@ -def getMaxCost(s): - """ - Calculate the maximum possible cost to segregate a binary string. - - Rules: - - A "1" can be moved to the right until it reaches the end or another "1" - - Cost = 1 + number of places moved - - Each "1" must be moved to its maximum possible position - - The strategy is to maximize the total cost by ensuring each '1' moves as far as possible. - We can do this by moving '1's one step at a time to maximize the number of operations. 
- - Args: - s (str): Binary string containing only '0' and '1' - - Returns: - int: Maximum possible cost to segregate the string - """ - n = len(s) - ones_count = s.count('1') - - if ones_count == 0: - return 0 - - # Convert to list for easier manipulation - arr = list(s) - total_cost = 0 - - # Move each '1' to its maximum possible position - # We'll process from left to right and move each '1' as far right as possible - for i in range(n): - if arr[i] == '1': - # Find the rightmost position this '1' can move to - # It can move until it hits another '1' or the end - j = i - while j < n - 1 and arr[j + 1] != '1': - # Move this '1' one position to the right - arr[j], arr[j + 1] = arr[j + 1], arr[j] - j += 1 - total_cost += 1 + 1 # Cost = 1 + distance (distance = 1 for each step) - - return total_cost - - -def getMaxCostOptimized(s): - """ - Optimized version that calculates cost without explicitly tracking positions. - """ - n = len(s) - total_cost = 0 - ones_count = 0 - - # Process from right to left - for i in range(n - 1, -1, -1): - if s[i] == '1': - # This '1' can move to position (n - 1 - ones_count) - # Distance = (n - 1 - ones_count) - i - distance = (n - 1 - ones_count) - i - if distance > 0: - total_cost += 1 + distance - ones_count += 1 - - return total_cost - - -# Test with the provided example -if __name__ == "__main__": - # Test case from the problem - s = "110100" - result = getMaxCost(s) - print(f"Input: {s}") - print(f"Maximum cost: {result}") - - # Test with optimized version - result_opt = getMaxCostOptimized(s) - print(f"Optimized result: {result_opt}") - - # Additional test cases - test_cases = [ - "110100", # Expected: 13 - "111000", # All 1s at start - "000111", # All 1s at end - "101010", # Alternating - "100000", # Single 1 at start - "000001", # Single 1 at end - "111111", # All 1s - "000000", # All 0s - ] - - print("\nTesting additional cases:") - for test in test_cases: - cost = getMaxCost(test) - cost_opt = getMaxCostOptimized(test) - print(f"{test}: {cost} (both methods: {cost == cost_opt})") From a5f0e7e698cc3b3c5365c3bcae36a331fddb5df7 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Tue, 30 Sep 2025 17:39:08 -0400 Subject: [PATCH 11/30] Fix gunicorn argument parsing and MongoDB _id field error - Fix argument parsing to only run when script is executed directly, not when imported by gunicorn - Fix MongoDB _id field modification error in Daily Sun scraper by removing _id from upsert operations - Maintain all functionality while eliminating error messages --- app.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 69a1992..8534134 100644 --- a/app.py +++ b/app.py @@ -102,8 +102,6 @@ def parse_args(): ) return parser.parse_args() -args = parse_args() - def signal_handler(sig, frame): sys.exit(0) @@ -111,6 +109,16 @@ def signal_handler(sig, frame): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) +# Only parse arguments when running directly (not when imported by gunicorn) +if __name__ == "__main__": + args = parse_args() +else: + # Default args when imported by gunicorn + class DefaultArgs: + no_scrape = False + no_daily_sun = False + args = DefaultArgs() + # Only run scraping tasks if not disabled if not args.no_scrape: from flask_apscheduler import APScheduler From c2d8df252f397d180ea97457905f2e6d0080aac6 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Tue, 30 Sep 2025 17:41:52 -0400 Subject: [PATCH 12/30] Move signal handlers after argument parsing to fix 
initialization order --- app.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/app.py b/app.py index 69a1992..8fed869 100644 --- a/app.py +++ b/app.py @@ -102,7 +102,15 @@ def parse_args(): ) return parser.parse_args() -args = parse_args() +# Only parse arguments when running directly (not when imported by gunicorn) +if __name__ == "__main__": + args = parse_args() +else: + # Default args when imported by gunicorn + class DefaultArgs: + no_scrape = False + no_daily_sun = False + args = DefaultArgs() def signal_handler(sig, frame): sys.exit(0) From 7d35699b449185bdc82737d9e13aa9f7a481aa33 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Fri, 12 Sep 2025 21:19:18 -0400 Subject: [PATCH 13/30] Added logic to prevent adding duplicate games when scraping --- src/database.py | 15 +++++++++++++++ src/scrapers/games_scraper.py | 17 ++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/database.py b/src/database.py index 85d0dee..8b7c682 100644 --- a/src/database.py +++ b/src/database.py @@ -66,6 +66,21 @@ def setup_database_indexes(): # Index for sorting operations game_collection.create_index([("date", -1)], background=True) + + # Index to have unique games so we won't add duplicates + game_collection.create_index( + [ + ("sport", 1), + ("gender", 1), + ("date", 1), + ("opponent_id", 1), + ("city", 1), + ("state", 1), + ("location", 1), + ], + unique=True, + background=True + ) print("✅ MongoDB indexes created successfully") except Exception as e: diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index e174a65..a92e87c 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -7,7 +7,7 @@ from src.utils.helpers import get_dominant_color import base64 import re -import html +from src.database import db import threading @@ -272,5 +272,20 @@ def process_game_data(game_data): "score_breakdown": game_data["score_breakdown"], "utc_date": utc_date_str } + + # update the game if it exists, otherwise insert it as a new game. + db.game.update_one( + { + "sport": game_data["sport"], + "gender": game_data["gender"], + "date": game_data["date"], + "opponent_id": game_data["opponent_id"], + "city": game_data["city"], + "state": game_data["state"], + "location": game_data["location"], + }, + {"$set": game_data}, + upsert=True + ) GameService.create_game(game_data) \ No newline at end of file From b66d59f91badc5431616dbb1581ee8a6fd0f7b1a Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Fri, 12 Sep 2025 21:49:40 -0400 Subject: [PATCH 14/30] Reworked service methods to check for duplicates and fixed game scraping issue --- src/scrapers/games_scraper.py | 15 --------------- src/services/game_service.py | 28 ++++++++++++++++++++++++++++ src/services/team_service.py | 21 +++++++++++++-------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index a92e87c..cd71dd0 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -273,19 +273,4 @@ def process_game_data(game_data): "utc_date": utc_date_str } - # update the game if it exists, otherwise insert it as a new game. 
- db.game.update_one( - { - "sport": game_data["sport"], - "gender": game_data["gender"], - "date": game_data["date"], - "opponent_id": game_data["opponent_id"], - "city": game_data["city"], - "state": game_data["state"], - "location": game_data["location"], - }, - {"$set": game_data}, - upsert=True - ) - GameService.create_game(game_data) \ No newline at end of file diff --git a/src/services/game_service.py b/src/services/game_service.py index 5463835..ae8381e 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -33,6 +33,34 @@ def create_game(data): opponent_id = data.get("opponent_id") if not TeamService.get_team_by_id(opponent_id): raise ValueError(f"Opponent team with id {opponent_id} does not exist.") + + existing = GameService.get_game_by_key_fields( + data["city"], + data["date"], + data["gender"], + data["location"], + data["opponent_id"], + data["sport"], + data["state"], + ) + + #check if game already exists + if existing: + if isinstance(existing, list) and existing: + existing = existing[0] + + # update existing game + updates = { + "time": data.get("time"), + "result": data.get("result"), + "box_score": data.get("box_score"), + "score_breakdown": data.get("score_breakdown"), + "utc_date": data.get("utc_date"), + } + GameService.update_game(existing.id, updates) + return existing + + # create new game if it doesn't exist game = Game(**data) GameRepository.insert(game) return game diff --git a/src/services/team_service.py b/src/services/team_service.py index 57598f8..7127d21 100644 --- a/src/services/team_service.py +++ b/src/services/team_service.py @@ -1,7 +1,6 @@ from src.repositories import TeamRepository from src.models.team import Team - class TeamService: @staticmethod def get_all_teams(): @@ -13,14 +12,20 @@ def get_all_teams(): @staticmethod def create_team(team_data): """ - Create a new team. - - Args: - team_data (dict): The data for the new team. - - Returns: - Team: The created team. + Create a new team, or update it if it already exists. 
""" + name = team_data.get("name") + if not name: + raise ValueError("Team name is required to create a team.") + + existing = TeamService.get_team_by_name(name) + if existing: + if isinstance(existing, list) and existing: + existing = existing[0] + + TeamService.update_team(existing.id, team_data) + return existing + team = Team(**team_data) TeamRepository.insert(team) return team From 42d139437ce1bfdd4230834e0154347486ea104b Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Sat, 20 Sep 2025 19:04:43 -0400 Subject: [PATCH 15/30] Added tournament handling and TBD/TBA updates for games --- src/database.py | 14 ++++- src/repositories/game_repository.py | 78 ++++++++++++++++++++++++++ src/scrapers/games_scraper.py | 35 ++++++++++-- src/services/game_service.py | 86 ++++++++++++++++++++--------- src/utils/helpers.py | 42 +++++++++++++- 5 files changed, 220 insertions(+), 35 deletions(-) diff --git a/src/database.py b/src/database.py index 8b7c682..834808d 100644 --- a/src/database.py +++ b/src/database.py @@ -74,13 +74,23 @@ def setup_database_indexes(): ("gender", 1), ("date", 1), ("opponent_id", 1), - ("city", 1), ("state", 1), - ("location", 1), ], unique=True, background=True ) + + # Additional index for tournament games (without opponent_id) + game_collection.create_index( + [ + ("sport", 1), + ("gender", 1), + ("date", 1), + ("city", 1), + ("state", 1), + ], + background=True + ) print("✅ MongoDB indexes created successfully") except Exception as e: diff --git a/src/repositories/game_repository.py b/src/repositories/game_repository.py index bfe5d08..4f05b2f 100644 --- a/src/repositories/game_repository.py +++ b/src/repositories/game_repository.py @@ -130,6 +130,56 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state): return [Game.from_dict(game) for game in games] + @staticmethod + def find_by_tournament_key_fields(city, date, gender, location, sport, state): + """ + Find tournament games by location and date (excluding opponent_id). + This is used when we need to find a tournament game that might have a placeholder team. + Uses flexible matching to handle TBD/TBA values. + """ + game_collection = db["game"] + + # Build flexible query that can handle TBD/TBA values + query = { + "date": date, + "gender": gender, + "sport": sport, + } + + # For city, state, and location, use flexible matching + # This allows finding games even when TBD/TBA values change to real values + city_conditions = [] + if city: + city_conditions.append(city) + else: + city_conditions = [None] + + state_conditions = [] + if state: + state_conditions.append(state) + else: + state_conditions = [None] + + location_conditions = [] + if location: + location_conditions.append(location) + else: + location_conditions = [None] + + query["city"] = {"$in": city_conditions} + query["state"] = {"$in": state_conditions} + query["location"] = {"$in": location_conditions} + + games = list(game_collection.find(query)) + + if not games: + return None + + if len(games) == 1: + return Game.from_dict(games[0]) + + return [Game.from_dict(game) for game in games] + @staticmethod def find_by_sport(sport): """ @@ -156,3 +206,31 @@ def find_by_sport_gender(sport, gender): game_collection = db["game"] games = game_collection.find({"sport": sport, "gender": gender}) return [Game.from_dict(game) for game in games] + + @staticmethod + def find_games_by_sport_gender_after_date(sport, gender, after_date=None): + """ + Find games for a specific sport and gender, optionally after a specific date. 
+ This method returns raw game data without team information. + """ + game_collection = db["game"] + + query = { + "sport": sport, + "gender": gender + } + + if after_date: + query["utc_date"] = {"$gt": after_date} + + games = game_collection.find(query) + return [Game.from_dict(game) for game in games] + + @staticmethod + def delete_games_by_ids(game_ids): + """ + Delete games by their IDs. + """ + game_collection = db["game"] + result = game_collection.delete_many({"_id": {"$in": game_ids}}) + return result.deleted_count diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index cd71dd0..da43692 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -4,7 +4,7 @@ from src.utils.convert_to_utc import convert_to_utc from src.utils.constants import * from src.scrapers.game_details_scrape import scrape_game -from src.utils.helpers import get_dominant_color +from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss import base64 import re from src.database import db @@ -164,6 +164,8 @@ def process_game_data(game_data): Args: game_data (dict): A dictionary containing the game data. """ + + game_data = normalize_game_data(game_data) location_data = game_data["location"].split("\n") geo_location = location_data[0] if (",") not in geo_location: @@ -232,16 +234,28 @@ def process_game_data(game_data): if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final): game_data["score_breakdown"] = game_data["score_breakdown"][::-1] - # finds any existing game with the same key fields regardless of time - curr_game = GameService.get_game_by_key_fields( + # Try to find by tournament key fields to handle placeholder teams + curr_game = GameService.get_game_by_tournament_key_fields( city, game_data["date"], game_data["gender"], location, - team.id, game_data["sport"], state ) + + # If no tournament game found, try the regular lookup with opponent_id + if not curr_game: + curr_game = GameService.get_game_by_key_fields( + city, + game_data["date"], + game_data["gender"], + location, + team.id, + game_data["sport"], + state + ) + if isinstance(curr_game, list): if curr_game: curr_game = curr_game[0] @@ -253,8 +267,19 @@ def process_game_data(game_data): "result": game_data["result"], "box_score": game_data["box_score"], "score_breakdown": game_data["score_breakdown"], - "utc_date": utc_date_str + "utc_date": utc_date_str, + "city": city, + "location": location, + "state": state } + + current_team = TeamService.get_team_by_id(curr_game.opponent_id) + if current_team and is_tournament_placeholder_team(current_team.name): + updates["opponent_id"] = team.id + + if is_cornell_loss(game_data["result"]) and game_data["utc_date"]: + GameService.handle_tournament_loss(game_data["sport"], game_data["gender"], game_data["utc_date"]) + GameService.update_game(curr_game.id, updates) return diff --git a/src/services/game_service.py b/src/services/game_service.py index ae8381e..6352dd0 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -1,6 +1,7 @@ from src.repositories.game_repository import GameRepository from src.models.game import Game from src.services.team_service import TeamService +from src.utils.helpers import is_tournament_placeholder_team class GameService: @@ -33,34 +34,7 @@ def create_game(data): opponent_id = data.get("opponent_id") if not TeamService.get_team_by_id(opponent_id): raise ValueError(f"Opponent team with id {opponent_id} does not 
exist.") - - existing = GameService.get_game_by_key_fields( - data["city"], - data["date"], - data["gender"], - data["location"], - data["opponent_id"], - data["sport"], - data["state"], - ) - #check if game already exists - if existing: - if isinstance(existing, list) and existing: - existing = existing[0] - - # update existing game - updates = { - "time": data.get("time"), - "result": data.get("result"), - "box_score": data.get("box_score"), - "score_breakdown": data.get("score_breakdown"), - "utc_date": data.get("utc_date"), - } - GameService.update_game(existing.id, updates) - return existing - - # create new game if it doesn't exist game = Game(**data) GameRepository.insert(game) return game @@ -97,6 +71,16 @@ def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, sta city, date, gender, location, opponent_id, sport, state ) + @staticmethod + def get_game_by_tournament_key_fields(city, date, gender, location, sport, state): + """ + Retrieve a tournament game by location and date (excluding opponent_id). + This is used when we need to find a tournament game that might have a placeholder team. + """ + return GameRepository.find_by_tournament_key_fields( + city, date, gender, location, sport, state + ) + @staticmethod def get_games_by_sport(sport): """ @@ -117,3 +101,51 @@ def get_games_by_sport_gender(sport, gender): Retrieves all game by its sport and gender. """ return GameRepository.find_by_sport_gender(sport, gender) + + @staticmethod + def get_tournament_games_by_sport_gender(sport, gender, after_date=None): + """ + Find tournament games (with placeholder team names) for a specific sport and gender. + Optionally filter by games after a specific date. + """ + games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) + tournament_games = [] + + for game in games: + team = TeamService.get_team_by_id(game.opponent_id) + if team and is_tournament_placeholder_team(team.name): + tournament_games.append(game) + + return tournament_games + + @staticmethod + def delete_tournament_games_by_sport_gender(sport, gender, after_date=None): + """ + Delete tournament games (with placeholder team names) for a specific sport and gender. + Optionally filter by games after a specific date. + """ + games = GameRepository.find_games_by_sport_gender_after_date(sport, gender, after_date) + tournament_game_ids = [] + + for game in games: + team = TeamService.get_team_by_id(game.opponent_id) + if team and is_tournament_placeholder_team(team.name): + tournament_game_ids.append(game.id) + + if tournament_game_ids: + return GameRepository.delete_games_by_ids(tournament_game_ids) + return 0 + + @staticmethod + def handle_tournament_loss(sport, gender, loss_date): + """ + Handle when a Cornell team loses in a tournament by deleting future tournament games. 
+
+        Args:
+            sport (str): The sport of the team that lost
+            gender (str): The gender of the team that lost
+            loss_date (datetime): The date when the team lost
+        """
+        deleted_count = GameService.delete_tournament_games_by_sport_gender(sport, gender, loss_date)
+        print(f"Deleted {deleted_count} future tournament games for {gender} {sport} after loss on {loss_date}")
+        return deleted_count
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
index 0866f79..cb3d759 100644
--- a/src/utils/helpers.py
+++ b/src/utils/helpers.py
@@ -54,4 +54,44 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
         return hex_color
     except Exception as e:
         logging.error(f"Error in get_dominant_color for {image_url}: {e}")
-        return default_color
\ No newline at end of file
+        return default_color
+
+def normalize_game_data(data: dict):
+    """
+    Normalize placeholder values like TBA/TBD into None.
+    """
+    placeholders = {"TBA", "TBD", "tba", "tbd"}
+
+    for field in ["time", "city", "state"]:
+        if data.get(field) in placeholders:
+            data[field] = None
+
+    return data
+
+def is_tournament_placeholder_team(team_name: str):
+    """
+    Check if a team name is a tournament placeholder.
+    """
+
+    placeholder_team_names = [
+        "First Round", "Second Round", "Third Round", "Quarterfinals",
+        "College Cup Semifinals", "College Cup Championship Game",
+        "ECAC Hockey First Round", "ECAC Hockey Quarterfinals",
+        "ECAC Hockey Semifinals", "ECAC Hockey Championship Game",
+        "Regional Semifinals", "Regional Championship", "National Semifinals",
+        "TBD", "National Championship", "NCAA Wrestling Championships", "NCAA Northeast Regional Championships",
+        "NCAA Cross Country Championships",
+    ]
+    return team_name in placeholder_team_names
+
+def is_cornell_loss(result: str):
+    """
+    Check if the result indicates a Cornell loss.
+    """
+
+    if not result:
+        return False
+
+    # Common loss indicators in result strings
+    loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"]
+    return any(indicator in result for indicator in loss_indicators)
\ No newline at end of file

From 00f8c7d047f9c5403a0ed0466284eb4636f93c46 Mon Sep 17 00:00:00 2001
From: Kevin Biliguun
Date: Sat, 20 Sep 2025 19:10:14 -0400
Subject: [PATCH 16/30] remove comments that trigger alerts

---
 src/services/game_service.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/services/game_service.py b/src/services/game_service.py
index 6352dd0..95d31ed 100644
--- a/src/services/game_service.py
+++ b/src/services/game_service.py
@@ -147,5 +147,4 @@ def handle_tournament_loss(sport, gender, loss_date):
         loss_date (datetime): The date when the team lost
         """
         deleted_count = GameService.delete_tournament_games_by_sport_gender(sport, gender, loss_date)
-        print(f"Deleted {deleted_count} future tournament games for {gender} {sport} after loss on {loss_date}")
         return deleted_count

From 310de060776d61896abb1117681ba6fec81ee6b0 Mon Sep 17 00:00:00 2001
From: Kevin Biliguun
Date: Sat, 20 Sep 2025 19:12:32 -0400
Subject: [PATCH 17/30] remove db game count logging

---
 src/services/team_service.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/services/team_service.py b/src/services/team_service.py
index 7127d21..c961534 100644
--- a/src/services/team_service.py
+++ b/src/services/team_service.py
@@ -13,6 +13,11 @@ def get_all_teams():
     def create_team(team_data):
         """
         Create a new team, or update it if it already exists.
+
+        Args:
+            team_data (dict): The data for the new team.
+        Returns:
+            Team: The created team.
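+                If a team with the same name already exists, it is updated and returned instead.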
""" name = team_data.get("name") if not name: From d94130b65df346a03c6faf504c14fb318c384962 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Wed, 1 Oct 2025 17:31:52 -0400 Subject: [PATCH 18/30] Added ticket links to game objects --- app.py | 2 +- src/models/game.py | 5 +++++ src/mutations/create_game.py | 7 +++++-- src/queries/game_query.py | 5 +++-- src/scrapers/games_scraper.py | 18 +++++++++++++++--- src/types.py | 11 ++++++----- src/utils/constants.py | 7 ++++++- 7 files changed, 41 insertions(+), 14 deletions(-) diff --git a/app.py b/app.py index 8fed869..eec4429 100644 --- a/app.py +++ b/app.py @@ -155,4 +155,4 @@ def cleanse_daily_sun_db(): if __name__ == "__main__": - app.run(debug=True, host="0.0.0.0", port=8000) + app.run(debug=True, host="0.0.0.0", port=8001) diff --git a/src/models/game.py b/src/models/game.py index f3ebcf6..73a7968 100644 --- a/src/models/game.py +++ b/src/models/game.py @@ -17,6 +17,7 @@ class Game: - `time` The time of the game. (optional) - `box_score` The scoring summary of the game (optional) - `score_breakdown` The scoring breakdown of the game (optional) + - 'ticket_link' The ticket link for the game (optional) """ def __init__( @@ -35,6 +36,7 @@ def __init__( score_breakdown=None, team=None, utc_date=None, + ticket_link=None, ): self.id = id if id else str(ObjectId()) self.city = city @@ -50,6 +52,7 @@ def __init__( self.score_breakdown = score_breakdown self.team = team self.utc_date = utc_date + self.ticket_link = ticket_link def to_dict(self): """ @@ -70,6 +73,7 @@ def to_dict(self): "score_breakdown": self.score_breakdown, "team": self.team, "utc_date": self.utc_date, + "ticket_link": self.ticket_link, } @staticmethod @@ -92,4 +96,5 @@ def from_dict(data) -> None: score_breakdown=data.get("score_breakdown"), team=data.get("team"), utc_date=data.get("utc_date"), + ticket_link=data.get("ticket_link"), ) diff --git a/src/mutations/create_game.py b/src/mutations/create_game.py index 205a153..3a52345 100644 --- a/src/mutations/create_game.py +++ b/src/mutations/create_game.py @@ -17,6 +17,7 @@ class Arguments: box_score = String(required=False) score_breakdown = String(required=False) utc_date = String(required=False) + ticket_link = String(required=False) game = Field(lambda: GameType) @@ -34,7 +35,8 @@ def mutate( time=None, box_score=None, score_breakdown=None, - utc_date=None + utc_date=None, + ticket_link=None ): game_data = { "city": city, @@ -48,7 +50,8 @@ def mutate( "time": time, "box_score": box_score, "score_breakdown": score_breakdown, - "utc_date": utc_date + "utc_date": utc_date, + "ticket_link": ticket_link } new_game = GameService.create_game(game_data) return CreateGame(game=new_game) \ No newline at end of file diff --git a/src/queries/game_query.py b/src/queries/game_query.py index 4aa8a55..631aba8 100644 --- a/src/queries/game_query.py +++ b/src/queries/game_query.py @@ -20,6 +20,7 @@ class GameQuery(ObjectType): sport=String(required=True), state=String(required=True), time=String(required=True), + ticket_link=String(required=False), ) games_by_sport = List(GameType, sport=String(required=True)) games_by_gender = List(GameType, gender=String(required=True)) @@ -40,13 +41,13 @@ def resolve_game(self, info, id): return GameService.get_game_by_id(id) def resolve_game_by_data( - self, info, city, date, gender, opponent_id, sport, state, time, location=None + self, info, city, date, gender, opponent_id, sport, state, time, location=None, ticket_link=None ): """ Resolver for retrieving a game by its data. 
""" return GameService.get_game_by_data( - city, date, gender, location, opponent_id, sport, state, time + city, date, gender, location, opponent_id, sport, state, time, ticket_link ) def resolve_games_by_sport(self, info, sport): diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index e174a65..2b3dd00 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -153,7 +153,14 @@ def parse_schedule_page(url, sport, gender): else: game_data["box_score"] = None game_data["score_breakdown"] = None - + + ticket_link_tag = game_item.select_one(GAME_TICKET_LINK) + ticket_link = ( + ticket_link_tag["href"] if ticket_link_tag else None + ) + game_data["ticket_link"] = ( + ticket_link if ticket_link else None + ) process_game_data(game_data) @@ -253,7 +260,11 @@ def process_game_data(game_data): "result": game_data["result"], "box_score": game_data["box_score"], "score_breakdown": game_data["score_breakdown"], - "utc_date": utc_date_str + "utc_date": utc_date_str, + "city": city, + "location": location, + "state": state, + "ticket_link": game_data["ticket_link"] } GameService.update_game(curr_game.id, updates) return @@ -270,7 +281,8 @@ def process_game_data(game_data): "time": game_time, "box_score": game_data["box_score"], "score_breakdown": game_data["score_breakdown"], - "utc_date": utc_date_str + "utc_date": utc_date_str, + "ticket_link": game_data["ticket_link"] } GameService.create_game(game_data) \ No newline at end of file diff --git a/src/types.py b/src/types.py index 548e190..284f127 100644 --- a/src/types.py +++ b/src/types.py @@ -88,6 +88,7 @@ class GameType(ObjectType): - `time`: The time of the game. (optional) - `box_score`: The box score of the game. - `score_breakdown`: The score breakdown of the game. + - `ticket_link`: The ticket link of the game. 
(optional) """ id = String(required=False) @@ -104,11 +105,11 @@ class GameType(ObjectType): score_breakdown = List(List(String), required=False) team = Field(TeamType, required=False) utc_date = String(required=False) - + ticket_link = String(required=False) def __init__( - self, id, city, date, gender, location, opponent_id, result, sport, state, time, box_score=None, score_breakdown=None, utc_date=None + self, id, city, date, gender, location, opponent_id, result, sport, state, time, box_score=None, score_breakdown=None, utc_date=None, ticket_link=None ): - self.id = id + self.id = id self.city = city self.date = date self.gender = gender @@ -121,7 +122,7 @@ def __init__( self.box_score = box_score self.score_breakdown = score_breakdown self.utc_date = utc_date - + self.ticket_link = ticket_link @staticmethod def team_to_team_type(team_obj): if team_obj is None: @@ -138,7 +139,7 @@ def resolve_team(parent, info): # getting team id - team could be None in older data team_id = parent.team if parent.team is not None else parent.opponent_id if team_id and isinstance(team_id, str): - # promise to get team object once the dataloader is ready + # promise to get team object once the dataloader is ready promise = info.context["team_loader"].load(team_id) return promise.then(GameType.team_to_team_type) return None diff --git a/src/utils/constants.py b/src/utils/constants.py index c65b20f..e6f6295 100644 --- a/src/utils/constants.py +++ b/src/utils/constants.py @@ -40,6 +40,9 @@ # The tag for the box score BOX_SCORE_TAG = ".sidearm-schedule-game-links-boxscore a" +# The tag for the game ticket link +GAME_TICKET_LINK = ".sidearm-schedule-game-links-tickets a" + # HTML Tags TAG_TABLE = 'table' TAG_SECTION = 'section' @@ -125,4 +128,6 @@ CHANNEL_ID = "UClSQOi2gnn9bi7mcgQrAVKA" # The maximum number of videos to retrieve -VIDEO_LIMIT = 20 \ No newline at end of file +VIDEO_LIMIT = 20 + + From 1309c3f5e989cfa9d0b78cbd533997a32f0c0ada Mon Sep 17 00:00:00 2001 From: claiireyu Date: Wed, 1 Oct 2025 17:46:37 -0400 Subject: [PATCH 19/30] Fixed port from 8001 to 8000 --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index eec4429..8fed869 100644 --- a/app.py +++ b/app.py @@ -155,4 +155,4 @@ def cleanse_daily_sun_db(): if __name__ == "__main__": - app.run(debug=True, host="0.0.0.0", port=8001) + app.run(debug=True, host="0.0.0.0", port=8000) From 580a3f37222200a2db7841187e38b0663bf12a23 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Wed, 8 Oct 2025 18:00:05 -0400 Subject: [PATCH 20/30] Added Daily Sun Images --- src/scrapers/daily_sun_scrape.py | 21 ++++++++++++++++++++- src/utils/constants.py | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py index cd11c5a..8e377c4 100644 --- a/src/scrapers/daily_sun_scrape.py +++ b/src/scrapers/daily_sun_scrape.py @@ -3,7 +3,10 @@ from datetime import datetime, timedelta from dotenv import load_dotenv from ..services import ArticleService +from ..utils.constants import ARTICLE_IMG_TAG import logging +from bs4 import BeautifulSoup +import base64 load_dotenv() @@ -36,9 +39,24 @@ def fetch_news(): ) article_url = f"https://cornellsun.com/article/{article['slug']}" + article_image = None + try: + response = requests.get( + article_url, + headers={ + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + } + ) + response.raise_for_status() + soup = 
BeautifulSoup(response.content, 'html.parser') + img_tag = soup.select_one(ARTICLE_IMG_TAG) + if img_tag and img_tag.get('src'): + article_image=img_tag.get('src') + except Exception as e: + logging.error(f"Error fetching news: {str(e)}") article_doc = { "title": article["headline"], - "image": article["dominantMedia"]["title"] if article["dominantMedia"] else None, + "image": article_image, "sports_type": sports_type, "published_at": published_at, "url": article_url, @@ -46,6 +64,7 @@ def fetch_news(): "created_at": datetime.now() } articles_to_store.append(article_doc) + if articles_to_store: ArticleService.create_articles_bulk(articles_to_store) diff --git a/src/utils/constants.py b/src/utils/constants.py index e6f6295..81f0414 100644 --- a/src/utils/constants.py +++ b/src/utils/constants.py @@ -130,4 +130,4 @@ # The maximum number of videos to retrieve VIDEO_LIMIT = 20 - +ARTICLE_IMG_TAG = ".dom-art-container img" From 7cfaaf96bb82a0614f8a9d442d3f30c2c30c9051 Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Wed, 22 Oct 2025 18:17:26 -0400 Subject: [PATCH 21/30] Added find by date query to support advanced filters for frontend --- src/queries/game_query.py | 9 ++++++++- src/repositories/game_repository.py | 21 +++++++++++++++++++++ src/services/game_service.py | 7 +++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/queries/game_query.py b/src/queries/game_query.py index 631aba8..3c04116 100644 --- a/src/queries/game_query.py +++ b/src/queries/game_query.py @@ -1,4 +1,4 @@ -from graphene import ObjectType, String, Field, List, Int +from graphene import ObjectType, String, Field, List, Int, DateTime from src.services.game_service import GameService from src.types import GameType @@ -27,6 +27,7 @@ class GameQuery(ObjectType): games_by_sport_gender = List( GameType, sport=String(required=True), gender=String(required=True) ) + games_by_date = List(GameType, startDate=DateTime(required=True), endDate=DateTime(required=True)) def resolve_games(self, info, limit=100, offset=0): """ @@ -67,3 +68,9 @@ def resolve_games_by_sport_gender(self, info, sport, gender): Resolver for retrieving all games by its sport and gender. """ return GameService.get_games_by_sport_gender(sport, gender) + + def resolve_games_by_date(self, info, startDate, endDate): + """ + Resolver for retrieving games by date. + """ + return GameService.get_games_by_date(startDate, endDate) diff --git a/src/repositories/game_repository.py b/src/repositories/game_repository.py index 4f05b2f..95e679b 100644 --- a/src/repositories/game_repository.py +++ b/src/repositories/game_repository.py @@ -225,6 +225,27 @@ def find_games_by_sport_gender_after_date(sport, gender, after_date=None): games = game_collection.find(query) return [Game.from_dict(game) for game in games] + + @staticmethod + def find_by_date(startDate, endDate): + """ + Retrieve all games from the 'game' collection in MongoDB for games + between certain dates. 
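+        Both bounds are serialized with isoformat() and compared against stored utc_date strings.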
+        """
+        game_collection = db["game"]
+
+        start_str = startDate.isoformat()
+        end_str = endDate.isoformat()
+
+        query = {
+            "utc_date": {
+                "$gte": start_str,
+                "$lte": end_str
+            }
+        }
+
+        games = game_collection.find(query)
+        return [Game.from_dict(game) for game in games]
 
     @staticmethod
     def delete_games_by_ids(game_ids):
diff --git a/src/services/game_service.py b/src/services/game_service.py
index 95d31ed..2351543 100644
--- a/src/services/game_service.py
+++ b/src/services/game_service.py
@@ -101,6 +101,13 @@ def get_games_by_sport_gender(sport, gender):
         Retrieves all game by its sport and gender.
         """
         return GameRepository.find_by_sport_gender(sport, gender)
+
+    @staticmethod
+    def get_games_by_date(startDate, endDate):
+        """
+        Retrieves all games between these two dates.
+        """
+        return GameRepository.find_by_date(startDate, endDate)
 
     @staticmethod
     def get_tournament_games_by_sport_gender(sport, gender, after_date=None):

From 0869246932062772691ba6ccc95f09fd9865a41e Mon Sep 17 00:00:00 2001
From: claiireyu
Date: Wed, 22 Oct 2025 23:24:52 -0400
Subject: [PATCH 22/30] Add duration field to YoutubeVideo model and related mutations

- Updated YoutubeVideo model to include duration attribute.
- Modified CreateYoutubeVideo mutation to accept duration as an argument.
- Implemented get_video_duration function to fetch video duration from YouTube API.
- Updated process_video_item to include video duration in the processed data.
- Adjusted YoutubeVideoType to reflect the new duration field.
---
 src/models/youtube_video.py           |  6 ++-
 src/mutations/create_youtube_video.py |  4 +-
 src/scrapers/youtube_stats.py         | 54 ++++++++++++++++++++++++++-
 src/services/youtube_video_service.py |  1 +
 src/types.py                          |  2 +
 5 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/src/models/youtube_video.py b/src/models/youtube_video.py
index e45a965..cdd11af 100644
--- a/src/models/youtube_video.py
+++ b/src/models/youtube_video.py
@@ -12,10 +12,11 @@ class YoutubeVideo:
     - `thumbnail` The thumbnail of the video, as a URL string pointing to a `.jpg` file.
     - `url` The URL of the video.
     - `published_at` The date and time the video was published.
+    - `duration` The duration of the video.
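+      Stored as a human-readable string such as "2:05" or "1:23:45".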
""" def __init__( - self, title, description, thumbnail, b64_thumbnail, url, published_at, id=None + self, title, description, thumbnail, b64_thumbnail, url, published_at, duration=None, id=None ): self.id = id if id else str(ObjectId()) self.title = title @@ -24,6 +25,7 @@ def __init__( self.b64_thumbnail = b64_thumbnail self.url = url self.published_at = published_at + self.duration = duration def to_dict(self): """ @@ -37,6 +39,7 @@ def to_dict(self): "b64_thumbnail": self.b64_thumbnail, "url": self.url, "published_at": self.published_at, + "duration": self.duration, } @staticmethod @@ -52,4 +55,5 @@ def from_dict(data): b64_thumbnail=data.get("b64_thumbnail"), url=data.get("url"), published_at=data.get("published_at"), + duration=data.get("duration"), ) diff --git a/src/mutations/create_youtube_video.py b/src/mutations/create_youtube_video.py index 9f39bf7..156df6d 100644 --- a/src/mutations/create_youtube_video.py +++ b/src/mutations/create_youtube_video.py @@ -11,10 +11,11 @@ class Arguments: b64_thumbnail = String(required=True) url = String(required=True) published_at = String(required=True) + duration = String(required=True) youtube_video = Field(lambda: YoutubeVideoType) - def mutate(self, info, id, title, description, thumbnail, url, published_at): + def mutate(self, info, id, title, description, thumbnail, b64_thumbnail, url, published_at, duration): video_data = { "id": id, "title": title, @@ -23,6 +24,7 @@ def mutate(self, info, id, title, description, thumbnail, url, published_at): "b64_thumbnail": b64_thumbnail, "url": url, "published_at": published_at, + "duration": duration, } new_video = YoutubeVideoService.create_video(video_data) return CreateYoutubeVideo(youtube_video=new_video) \ No newline at end of file diff --git a/src/scrapers/youtube_stats.py b/src/scrapers/youtube_stats.py index ee8a5a7..a7dff13 100644 --- a/src/scrapers/youtube_stats.py +++ b/src/scrapers/youtube_stats.py @@ -6,6 +6,7 @@ import base64 import os import html +from bs4 import BeautifulSoup load_dotenv() YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY") @@ -25,6 +26,54 @@ def fetch_videos(): process_video_item(item) +def get_video_duration(video_id): + """ + Gets video duration using YouTube API + """ + try: + url = f"https://www.googleapis.com/youtube/v3/videos?key={YOUTUBE_API_KEY}&id={video_id}&part=contentDetails" + response = requests.get(url) + response.raise_for_status() + data = response.json() + + if data.get("items"): + duration_iso = data["items"][0]["contentDetails"]["duration"] + return convert_iso_duration(duration_iso) + return None + except Exception as e: + print(f"Error getting duration for video {video_id}: {e}") + return None + + +def convert_iso_duration(iso_duration): + """ + Converts ISO 8601 duration (PT2M5S) to readable format (2:05) + Examples: + - PT2M5S -> 2:05 + - PT1H23M45S -> 1:23:45 + - PT30S -> 0:30 + """ + import re + + # Remove PT prefix + duration = iso_duration.replace('PT', '') + + # Extract hours, minutes, seconds + hours = re.search(r'(\d+)H', duration) + minutes = re.search(r'(\d+)M', duration) + seconds = re.search(r'(\d+)S', duration) + + h = int(hours.group(1)) if hours else 0 + m = int(minutes.group(1)) if minutes else 0 + s = int(seconds.group(1)) if seconds else 0 + + # Format as MM:SS or HH:MM:SS + if h > 0: + return f"{h}:{m:02d}:{s:02d}" + else: + return f"{m}:{s:02d}" + + def process_video_item(item): """ Extracts the required data from a video item and @@ -55,14 +104,17 @@ def process_video_item(item): published_at = snippet.get("publishedAt") 
video_url = f"https://www.youtube.com/watch?v={video_id}" + duration = get_video_duration(video_id) + video_data = { - "id": video_id, # use video id for easy retrieval + "id": video_id, "title": title, "description": description, "thumbnail": thumbnail, "b64_thumbnail": encoded_thumbnail, "url": video_url, "published_at": published_at, + "duration": duration, } process_video_data(video_data) diff --git a/src/services/youtube_video_service.py b/src/services/youtube_video_service.py index 5052975..0d34c33 100644 --- a/src/services/youtube_video_service.py +++ b/src/services/youtube_video_service.py @@ -30,6 +30,7 @@ def create_video(data): b64_thumbnail=data.get("b64_thumbnail"), url=data.get("url"), published_at=data.get("published_at"), + duration=data.get("duration"), ) YoutubeVideoRepository.insert(video) return video diff --git a/src/types.py b/src/types.py index 284f127..871e57d 100644 --- a/src/types.py +++ b/src/types.py @@ -155,6 +155,7 @@ class YoutubeVideoType(ObjectType): - thumbnail: The URL of the video's thumbnail. - url: The URL to the video. - published_at: The date and time the video was published. + - duration: The duration of the video (optional). """ id = String(required=False) title = String(required=True) @@ -163,6 +164,7 @@ class YoutubeVideoType(ObjectType): b64_thumbnail = String(required=True) url = String(required=True) published_at = String(required=True) + duration = String(required=False) def __init__(self, **kwargs): for key, value in kwargs.items(): From 5348a2883025325d6f67169753d0ce603c4f183f Mon Sep 17 00:00:00 2001 From: claiireyu Date: Mon, 27 Oct 2025 18:00:52 -0400 Subject: [PATCH 23/30] Update youtube_stats.py --- src/scrapers/youtube_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scrapers/youtube_stats.py b/src/scrapers/youtube_stats.py index a7dff13..94eac38 100644 --- a/src/scrapers/youtube_stats.py +++ b/src/scrapers/youtube_stats.py @@ -41,7 +41,7 @@ def get_video_duration(video_id): return convert_iso_duration(duration_iso) return None except Exception as e: - print(f"Error getting duration for video {video_id}: {e}") + print(f"Error getting video duration: {e}") return None From c89babefa5ff76e69f7b6697afb34ef6ae5b8fa9 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Sun, 16 Nov 2025 22:05:45 -0500 Subject: [PATCH 24/30] Refactor article date handling to use ISO 8601 format --- src/mutations/create_article.py | 3 +-- src/repositories/article_repository.py | 14 ++++++++++---- src/scrapers/daily_sun_scrape.py | 15 +++++++++------ src/types.py | 5 +---- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/mutations/create_article.py b/src/mutations/create_article.py index 1e0a03b..19b8920 100644 --- a/src/mutations/create_article.py +++ b/src/mutations/create_article.py @@ -14,11 +14,10 @@ class Arguments: article = Field(lambda: ArticleType) def mutate(self, info, title, sports_type, published_at, url, slug, image=None): - from datetime import datetime article_data = { "title": title, "sports_type": sports_type, - "published_at": datetime.fromisoformat(published_at), + "published_at": published_at, # Already in ISO 8601 format "url": url, "slug": slug, "image": image diff --git a/src/repositories/article_repository.py b/src/repositories/article_repository.py index 0e324e9..440f856 100644 --- a/src/repositories/article_repository.py +++ b/src/repositories/article_repository.py @@ -1,7 +1,7 @@ from src.database import daily_sun_db from src.models.article import Article from pymongo import UpdateOne 
-from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone class ArticleRepository: @staticmethod @@ -52,7 +52,9 @@ def find_recent(limit_days=3): Retrieve articles from the last N days, sorted by published_at descending. """ article_collection = daily_sun_db["news_articles"] - query = {"published_at": {"$gte": datetime.now() - timedelta(days=limit_days)}} + # Calculate threshold as ISO 8601 string + threshold = (datetime.now(timezone.utc) - timedelta(days=limit_days)).isoformat().replace('+00:00', 'Z') + query = {"published_at": {"$gte": threshold}} articles = article_collection.find(query).sort("published_at", -1) return [Article.from_dict(article) for article in articles] @@ -62,9 +64,11 @@ def find_by_sports_type(sports_type, limit_days=3): Retrieve articles by sports_type from the last N days, sorted by published_at descending. """ article_collection = daily_sun_db["news_articles"] + # Calculate threshold as ISO 8601 string + threshold = (datetime.now(timezone.utc) - timedelta(days=limit_days)).isoformat().replace('+00:00', 'Z') query = { "sports_type": sports_type, - "published_at": {"$gte": datetime.now() - timedelta(days=limit_days)} + "published_at": {"$gte": threshold} } articles = article_collection.find(query).sort("published_at", -1) return [Article.from_dict(article) for article in articles] @@ -75,5 +79,7 @@ def delete_not_recent(limit_days=3): Delete articles older than N days, sorted by published_at descending. """ article_collection = daily_sun_db["news_articles"] - query = {"published_at": {"$lt": datetime.now() - timedelta(days=limit_days)}} + # Calculate threshold as ISO 8601 string + threshold = (datetime.now(timezone.utc) - timedelta(days=limit_days)).isoformat().replace('+00:00', 'Z') + query = {"published_at": {"$lt": threshold}} article_collection.delete_many(query) \ No newline at end of file diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py index 8e377c4..7750f4a 100644 --- a/src/scrapers/daily_sun_scrape.py +++ b/src/scrapers/daily_sun_scrape.py @@ -1,6 +1,6 @@ import os import requests -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from dotenv import load_dotenv from ..services import ArticleService from ..utils.constants import ARTICLE_IMG_TAG @@ -23,16 +23,19 @@ def fetch_news(): response.raise_for_status() data = response.json() - # Current date and 3-day threshold - current_date = datetime.now() + # Current date and 3-day threshold (in UTC) + current_date = datetime.now(timezone.utc) three_days_ago = current_date - timedelta(days=3) # Process articles articles_to_store = [] for article in data.get("articles", []): - published_at = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S") + published_at_dt = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S") + # Assume the timezone is UTC and convert to ISO 8601 format string + published_at_dt = published_at_dt.replace(tzinfo=timezone.utc) + published_at = published_at_dt.isoformat().replace('+00:00', 'Z') - if published_at >= three_days_ago: + if published_at_dt >= three_days_ago: sports_type = next( (tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]), "General" @@ -61,7 +64,7 @@ def fetch_news(): "published_at": published_at, "url": article_url, "slug": article["slug"], - "created_at": datetime.now() + "created_at": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z') } articles_to_store.append(article_doc) diff --git 
a/src/types.py b/src/types.py index 871e57d..96f36e2 100644 --- a/src/types.py +++ b/src/types.py @@ -190,7 +190,4 @@ class ArticleType(ObjectType): def __init__(self, **kwargs): for key, value in kwargs.items(): - if key == "published_at" and isinstance(value, datetime): - setattr(self, key, value.isoformat()) - else: - setattr(self, key, value) \ No newline at end of file + setattr(self, key, value) \ No newline at end of file From a36601424a9cc993a37683538e22576d3ef0a305 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Sun, 16 Nov 2025 22:29:41 -0500 Subject: [PATCH 25/30] Implement sport type extraction from article titles --- src/scrapers/daily_sun_scrape.py | 8 +++--- src/utils/helpers.py | 46 +++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py index 8e377c4..d64dd76 100644 --- a/src/scrapers/daily_sun_scrape.py +++ b/src/scrapers/daily_sun_scrape.py @@ -4,6 +4,7 @@ from dotenv import load_dotenv from ..services import ArticleService from ..utils.constants import ARTICLE_IMG_TAG +from ..utils.helpers import extract_sport_type_from_title import logging from bs4 import BeautifulSoup import base64 @@ -33,10 +34,9 @@ def fetch_news(): published_at = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S") if published_at >= three_days_ago: - sports_type = next( - (tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]), - "General" - ) + # Extract sport type from title + title = article["headline"] + sports_type = extract_sport_type_from_title(title) article_url = f"https://cornellsun.com/article/{article['slug']}" article_image = None diff --git a/src/utils/helpers.py b/src/utils/helpers.py index cb3d759..362bb2e 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -94,4 +94,48 @@ def is_cornell_loss(result: str): # Common loss indicators in result strings loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"] - return any(indicator in result for indicator in loss_indicators) \ No newline at end of file + return any(indicator in result for indicator in loss_indicators) + +def extract_sport_type_from_title(title: str): + """ + Extract the sport type from an article title by matching against known sports. 
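+    Matching is case-insensitive, and longer sport names are checked first.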
+ + Args: + title (str): The article title to analyze + + Returns: + str: The sport name if found, otherwise "sports" as default + """ + from .constants import SPORT_URLS + + if not title: + return "sports" + + # Get all unique sport names from SPORT_URLS + sport_names = set() + for sport_data in SPORT_URLS.values(): + sport_name = sport_data["sport"].strip() + if sport_name: + sport_names.add(sport_name) + + # Sort by length (longest first) to match "Swimming & Diving" before "Swimming" + sport_names_sorted = sorted(sport_names, key=len, reverse=True) + + title_lower = title.lower() + + for sport_name in sport_names_sorted: + if sport_name.lower() in title_lower: + return sport_name + + # Special mappings for common variations in titles + # Only checked if no exact match found above + # e.g., "Hockey" in title should match "Ice Hockey" in sport names + special_mappings = { + "hockey": "Ice Hockey", # "Men's Hockey" or "Women's Hockey" → "Ice Hockey" + } + + for keyword, sport_name in special_mappings.items(): + if keyword in title_lower and sport_name in sport_names: + return sport_name + + return "sports" From 0a892fad0125dcd5e059ac7b6ed8e56ea0d5345b Mon Sep 17 00:00:00 2001 From: Kevin Biliguun Date: Wed, 19 Nov 2025 02:00:35 -0500 Subject: [PATCH 26/30] Added sports type to youtube videos --- src/services/game_service.py | 8 ++++-- src/types.py | 9 +++++++ src/utils/helpers.py | 51 +++++++++++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/src/services/game_service.py b/src/services/game_service.py index 2351543..c0ae3db 100644 --- a/src/services/game_service.py +++ b/src/services/game_service.py @@ -2,6 +2,7 @@ from src.models.game import Game from src.services.team_service import TeamService from src.utils.helpers import is_tournament_placeholder_team +from pymongo.errors import DuplicateKeyError class GameService: @@ -36,8 +37,11 @@ def create_game(data): raise ValueError(f"Opponent team with id {opponent_id} does not exist.") game = Game(**data) - GameRepository.insert(game) - return game + try: + GameRepository.insert(game) + return game + except DuplicateKeyError: + return None @staticmethod def delete_game(game_id): diff --git a/src/types.py b/src/types.py index 871e57d..e27eace 100644 --- a/src/types.py +++ b/src/types.py @@ -156,6 +156,7 @@ class YoutubeVideoType(ObjectType): - url: The URL to the video. - published_at: The date and time the video was published. - duration: The duration of the video (optional). + - sportsType: The sport type extracted from the video title. """ id = String(required=False) title = String(required=True) @@ -165,11 +166,19 @@ class YoutubeVideoType(ObjectType): url = String(required=True) published_at = String(required=True) duration = String(required=False) + sportsType = String(required=False) def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) + def resolve_sportsType(video, info): + """ + Resolver to extract sport type from the video title. + """ + from src.utils.helpers import extract_sport_from_title + return extract_sport_from_title(video.title) + class ArticleType(ObjectType): """ A GraphQL type representing a news article. 
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
index cb3d759..339ee7e 100644
--- a/src/utils/helpers.py
+++ b/src/utils/helpers.py
@@ -3,6 +3,7 @@
 from PIL import Image
 from io import BytesIO
 from collections import Counter
+import re
 
 
 def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
@@ -94,4 +95,52 @@ def is_cornell_loss(result: str):
 
     # Common loss indicators in result strings
     loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"]
-    return any(indicator in result for indicator in loss_indicators)
\ No newline at end of file
+    return any(indicator in result for indicator in loss_indicators)
+
+def extract_sport_from_title(title):
+    """
+    Extracts the sport type from a YouTube video title.
+
+    Args:
+        title (str): The title of the YouTube video
+
+    Returns:
+        str: The sport type if found, None otherwise
+    """
+    if not title:
+        return None
+
+    title_lower = title.lower()
+
+    sport_patterns = [
+        # Ice Hockey
+        (r"ice\s+hockey", "Ice Hockey"),
+        (r"women'?s\s+ice\s+hockey", "Ice Hockey"),
+        (r"men'?s\s+ice\s+hockey", "Ice Hockey"),
+        # Field Hockey
+        (r"field\s+hockey", "Field Hockey"),
+        # Hockey
+        (r"\bhockey\b", "Ice Hockey"),
+        # Basketball
+        (r"basketball", "Basketball"),
+        # Football
+        (r"\bfootball\b", "Football"),
+        # Soccer
+        (r"\bsoccer\b", "Soccer"),
+        # Volleyball
+        (r"volleyball", "Volleyball"),
+        # Wrestling
+        (r"wrestling", "Wrestling"),
+        # Sprint Football
+        (r"sprint\s+football", "Sprint Football"),
+    ]
+
+    for pattern, sport_name in sport_patterns:
+        if re.search(pattern, title_lower):
+            return sport_name
+
+    if "ice" in title_lower and ("hockey" in title_lower or "cornell" in title_lower):
+        return "Ice Hockey"
+
+    return None
+

From 076ecaf65b9d449960d0821a4c9b6d39239ce072 Mon Sep 17 00:00:00 2001
From: claiireyu
Date: Fri, 23 Jan 2026 13:18:08 -0500
Subject: [PATCH 27/30] Fixes #50

---
 src/scrapers/game_details_scrape.py | 14 +++++++++----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/scrapers/game_details_scrape.py b/src/scrapers/game_details_scrape.py
index 8fce04a..e1c6a9e 100644
--- a/src/scrapers/game_details_scrape.py
+++ b/src/scrapers/game_details_scrape.py
@@ -31,16 +31,20 @@ def extract_teams_and_scores(box_score_section, sport):
     period_scores = []
 
     for row in score_table.find(TAG_TBODY).find_all(TAG_TR):
-        team_name_cell = row.find(TAG_TH) if sport == 'ice hockey' else row.find(TAG_TD)
+        # Check if team name is in <th> (some sports) or first <td> (other sports)
+        team_name_cell = row.find(TAG_TH)
         if team_name_cell:
+            # Team name is in <th>, all <td> elements are period scores
             team_name = team_name_cell.text.strip().replace("Winner", "").strip()
-            team_name = ' '.join(team_name.split())
+            scores = [td.text.strip() for td in row.find_all(TAG_TD)]
         else:
-            team_name = "Unknown"
+            # Team name is in first <td>, remaining <td> elements are period scores
+            team_name_cell = row.find(TAG_TD)
+            team_name = team_name_cell.text.strip().replace("Winner", "").strip() if team_name_cell else "Unknown"
+            scores = [td.text.strip() for td in row.find_all(TAG_TD)[1:]]
 
+        team_name = ' '.join(team_name.split())
         team_names.append(team_name)
-        scores = [td.text.strip() for td in row.find_all(TAG_TD)[1:]]
-        scores = scores[:-1] if sport == 'basketball' else scores
         period_scores.append(scores)
 
     return team_names, period_scores

From e848140558abce4175f1787fab28cefaecf66ad1 Mon Sep 17 00:00:00 2001
From: claiireyu
Date: Thu, 29 Jan 2026 19:35:57 -0500
Subject: [PATCH 28/30] Fix #49 for basketball score breakdowns

---
 src/scrapers/game_details_scrape.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/scrapers/game_details_scrape.py b/src/scrapers/game_details_scrape.py
index 8fce04a..5b6fc6d 100644
--- a/src/scrapers/game_details_scrape.py
+++ b/src/scrapers/game_details_scrape.py
@@ -31,16 +31,24 @@ def extract_teams_and_scores(box_score_section, sport):
     period_scores = []
 
     for row in score_table.find(TAG_TBODY).find_all(TAG_TR):
-        team_name_cell = row.find(TAG_TH) if sport == 'ice hockey' else row.find(TAG_TD)
+        # Check if team name is in <th> (some sports) or first <td> (other sports)
+        team_name_cell = row.find(TAG_TH)
         if team_name_cell:
+            # Team name is in <th>, all <td> elements are period scores
             team_name = team_name_cell.text.strip().replace("Winner", "").strip()
-            team_name = ' '.join(team_name.split())
+            scores = [td.text.strip() for td in row.find_all(TAG_TD)]
         else:
-            team_name = "Unknown"
+            # Team name is in first <td>, remaining <td> elements are period scores
+            team_name_cell = row.find(TAG_TD)
+            team_name = team_name_cell.text.strip().replace("Winner", "").strip() if team_name_cell else "Unknown"
+            scores = [td.text.strip() for td in row.find_all(TAG_TD)[1:]]
+
+        # Basketball box score includes a "Records" column at the end - exclude it
+        if sport == 'basketball' and scores:
+            scores = scores[:-1]
+
+        team_name = ' '.join(team_name.split())
         team_names.append(team_name)
-        scores = [td.text.strip() for td in row.find_all(TAG_TD)[1:]]
-        scores = scores[:-1] if sport == 'basketball' else scores
         period_scores.append(scores)
 
     return team_names, period_scores
@@ -59,7 +67,7 @@ def soccer_summary(box_score_section):
             event = row.find_all(TAG_TD)[2]
             desc = event.find_all(TAG_SPAN)[-1].text.strip()
 
-            if team == "COR" or team == "CU":
+            if team == "COR" or team == "CU" or team == "CRNL":
                 cornell_score += 1
             else:
                 opp_score += 1
@@ -220,6 +228,36 @@ def baseball_summary(box_score_section):
         summary = [{"message": "No scoring events in this game."}]
     return summary
 
+# def basketball_summary(box_score_section):
+#     summary = []
+#     scoring_section = box_score_section.find(TAG_SECTION, {ATTR_ARIA_LABEL: LABEL_SCORING_SUMMARY})
+#     if scoring_section:
+#         scoring_rows = scoring_section.find(TAG_TBODY)
+#         if scoring_rows:
+#             cornell_score = 0
+#             opp_score = 0
+#             for row in scoring_rows.find_all(TAG_TR):
+#                 time = row.find_all(TAG_TD)[0].text.strip()
+#                 team = row.find_all(TAG_TD)[1].find(TAG_IMG)[ATTR_ALT]
+#                 event = row.find_all(TAG_TD)[2]
+#                 desc = event.find_all(TAG_SPAN)[-1].text.strip()
+
+#                 if team == "COR" or team == "CU" or team == "CRNL":
+#                     cornell_score += 1
+#                 else:
+#                     opp_score += 1
+
+#                 summary.append({
+#                     'time': time,
+#                     'team': team,
+#                     'description': desc,
+#                     'cor_score': cornell_score,
+#                     'opp_score': opp_score
+#                 })
+#     if not summary:
+#         summary = [{"message": "No scoring events in this game."}]
+#     return summary
+
 def scrape_game(url, sport):
     soup = fetch_page(url)
     box_score_section = soup.find(class_=CLASS_BOX_SCORE) if sport in ['baseball', 'softball'] else soup.find(id=ID_BOX_SCORE)
@@ -233,6 +271,7 @@ def scrape_game(url, sport):
         'field hockey': (lambda: extract_teams_and_scores(box_score_section, 'field hockey'), field_hockey_summary),
         'lacrosse': (lambda: extract_teams_and_scores(box_score_section, 'lacrosse'), lacrosse_summary),
         'baseball': (lambda: extract_teams_and_scores(box_score_section, 'baseball'), baseball_summary),
+        'basketball': (lambda: extract_teams_and_scores(box_score_section, 'basketball'), lambda _: []),
     }
 
     extract_teams_func, summary_func = sport_parsers.get(sport, (None, None))

From 85eac9db691c1fe0ff8f8896f7145bc2b5857b5f Mon Sep 17 00:00:00 2001
From: claiireyu
Date: Fri, 30 Jan 2026 14:51:13 -0500
Subject: [PATCH 29/30] Fix date comparison in news fetching logic to use datetime object instead of ISO 8601 string

---
 src/scrapers/daily_sun_scrape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py
index 42a2e63..4b5cd24 100644
--- a/src/scrapers/daily_sun_scrape.py
+++ b/src/scrapers/daily_sun_scrape.py
@@ -36,7 +36,7 @@ def fetch_news():
         published_at_dt = published_at_dt.replace(tzinfo=timezone.utc)
         published_at = published_at_dt.isoformat().replace('+00:00', 'Z')
 
-        if published_at >= three_days_ago:
+        if published_at_dt >= three_days_ago:
             # Extract sport type from title
             title = article["headline"]
             sports_type = extract_sport_type_from_title(title)

From 4b66034e58b321cdb59946a20b6e713a2ef27d29 Mon Sep 17 00:00:00 2001
From: Joshua Dirga
Date: Fri, 30 Jan 2026 15:30:42 -0500
Subject: [PATCH 30/30] fixed youtube b64 thumbnail null issue

---
 src/mutations/create_youtube_video.py | 4 ++--
 src/types.py                          | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mutations/create_youtube_video.py b/src/mutations/create_youtube_video.py
index 156df6d..ed2cb40 100644
--- a/src/mutations/create_youtube_video.py
+++ b/src/mutations/create_youtube_video.py
@@ -8,14 +8,14 @@ class Arguments:
         title = String(required=True)
         description = String(required=True)
         thumbnail = String(required=True)
-        b64_thumbnail = String(required=True)
+        b64_thumbnail = String(required=False)
         url = String(required=True)
         published_at = String(required=True)
         duration = String(required=True)
 
     youtube_video = Field(lambda: YoutubeVideoType)
 
-    def mutate(self, info, id, title, description, thumbnail, b64_thumbnail, url, published_at, duration):
+    def mutate(self, info, id, title, description, thumbnail, url, published_at, duration, b64_thumbnail=None):
         video_data = {
             "id": id,
             "title": title,
diff --git a/src/types.py b/src/types.py
index 85024f2..7eb8fbe 100644
--- a/src/types.py
+++ b/src/types.py
@@ -152,7 +152,8 @@ class YoutubeVideoType(ObjectType):
     - id: The YouTube video ID (optional).
     - title: The title of the video.
     - description: The description of the video.
     - thumbnail: The URL of the video's thumbnail.
+    - b64_thumbnail: A base64-encoded copy of the thumbnail (optional).
     - url: The URL to the video.
     - published_at: The date and time the video was published.
     - duration: The duration of the video (optional).
@@ -162,7 +163,7 @@ class YoutubeVideoType(ObjectType):
     title = String(required=True)
     description = String(required=True)
     thumbnail = String(required=True)
-    b64_thumbnail = String(required=True)
+    b64_thumbnail = String(required=False)
     url = String(required=True)
     published_at = String(required=True)
     duration = String(required=False)
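A quick way to exercise the end state of this series is the GraphQL endpoint itself. The sketch below is a minimal smoke test, not part of the patches: it assumes the server is running locally on port 8000, that schema.py (not shown here) exposes the mutation as createYoutubeVideo, and that graphene's default snake_case-to-camelCase conversion applies to argument and field names. After PATCH 30, the b64Thumbnail argument should be omittable.

    import requests

    # Hypothetical smoke test for the CreateYoutubeVideo mutation.
    # The endpoint, mutation name, and camelCased argument names are
    # assumptions based on the diffs above, not confirmed by schema.py.
    MUTATION = '''
    mutation {
      createYoutubeVideo(
        id: "dQw4w9WgXcQ",
        title: "Cornell Men's Ice Hockey Highlights",
        description: "Weekend highlights",
        thumbnail: "https://i.ytimg.com/vi/dQw4w9WgXcQ/hqdefault.jpg",
        url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        publishedAt: "2026-01-30T00:00:00Z",
        duration: "2:05"
      ) {
        youtubeVideo { title duration b64Thumbnail }
      }
    }
    '''

    resp = requests.post("http://localhost:8000/graphql", json={"query": MUTATION})
    print(resp.json())

If the mutation succeeds, b64Thumbnail should come back null rather than raising a missing-argument error, which is the behavior PATCH 30 is after.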