From c3ccf9e8f8fb896b1a1bb85880f553484d72066d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 7 Mar 2025 07:47:33 -0500 Subject: [PATCH 1/7] feat(app): update to accomodate changes in data sources app --- AccessManager.py | 50 +++++ CacheManager.py | 40 ++++ Dockerfile | 2 + InternetArchiveInterface.py | 47 +++++ Jenkinsfile | 14 ++ PDAPInterface.py | 49 +++++ README.md | 3 + cache_url.py | 374 +++++++++++++++++++++--------------- enums.py | 15 ++ exceptions.py | 2 + requirements.txt | 4 +- 11 files changed, 442 insertions(+), 158 deletions(-) create mode 100644 AccessManager.py create mode 100644 CacheManager.py create mode 100644 InternetArchiveInterface.py create mode 100644 PDAPInterface.py create mode 100644 enums.py create mode 100644 exceptions.py diff --git a/AccessManager.py b/AccessManager.py new file mode 100644 index 0000000..d7bd03f --- /dev/null +++ b/AccessManager.py @@ -0,0 +1,50 @@ +import os + +import requests + + + +class AccessManager: + """ + Manages access to the API, handling logins and access token storage + """ + + def __init__( + self, + email: str, + password: str + ): + self.email = email + self.password = password + self.access_token = None + self.refresh_token = None + self.login() + + def get_bearer_authorization_header(self): + return {"Authorization": f"Bearer {self.access_token}"} + + def login(self): + response = requests.post( + f"{os.getenv('VITE_VUE_APP_BASE_URL')}/auth/login", + json={ + "email": self.email, + "password": self.password + }, + timeout=10 + ) + response.raise_for_status() + data = response.json() + self.access_token = data["access_token"] + self.refresh_token = data["refresh_token"] + + def refresh_access_token(self): + response = requests.post( + f"{os.getenv('VITE_VUE_APP_BASE_URL')}/auth/refresh-session", + headers={ + "Authorization": f"Bearer {self.refresh_token}" + }, + timeout=10 + ) + response.raise_for_status() + data = response.json() + self.access_token = data["access_token"] \ No newline at end of file diff --git a/CacheManager.py b/CacheManager.py new file mode 100644 index 0000000..80766d7 --- /dev/null +++ b/CacheManager.py @@ -0,0 +1,40 @@ +import datetime +import json +import os + + +class CacheManager: + """ + Manages an internal cache for tracking when source URLs were last cached + in the internet archive + This is to minimize the number of requests to the archive + """ + + def __init__(self): + # If the file does not exist, create it + if not os.path.exists("cache.json"): + with open("cache.json", "w") as f: + f.write("{}") + + # Load the cache + with open("cache.json", "r") as f: + self.cache = json.load(f) + + def save_cache(self): + with open("cache.json", "w") as f: + json.dump(self.cache, f) + + def datetime_to_str(self, last_cached: datetime.datetime) -> str: + return last_cached.strftime("%Y-%m-%d") + + def str_to_datetime(self, last_cached: str) -> datetime.datetime: + return datetime.datetime.strptime(last_cached, "%Y-%m-%d") + + def get_last_cached(self, source_url: str) -> datetime.datetime: + return self.str_to_datetime(self.cache.get(source_url)) + + def set_last_cached(self, source_url: str, last_cached: datetime.datetime): + self.cache[source_url] = self.datetime_to_str(last_cached) + + def has_source_url(self, source_url: str) -> bool: + return source_url in self.cache \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 6b0d429..e4f4598 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,8 @@ FROM python:3.11 WORKDIR /usr/src/app +RUN apt-get update && apt-get install -y curl + COPY requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt diff --git a/InternetArchiveInterface.py b/InternetArchiveInterface.py new file mode 100644 index 0000000..0c50836 --- /dev/null +++ b/InternetArchiveInterface.py @@ -0,0 +1,47 @@ +import time +from datetime import datetime + +import requests + +from exceptions import ArchiveLastCacheNotFoundError + + +class InternetArchiveInterface: + + def __init__(self, s3_keys: str): + self.s3_keys = s3_keys + + def get_website_info_data(self, source_url): + website_info = requests.get( + f"https://archive.org/wayback/available?url={source_url}", + timeout=10 + ) + website_info_data = website_info.json() + return website_info_data + + def get_website_info_data_last_cached(self, source_url): + website_info_data = self.get_website_info_data(source_url) + if not website_info_data["archived_snapshots"]: + raise ArchiveLastCacheNotFoundError + return datetime.strptime( + website_info_data["archived_snapshots"]["closest"]["timestamp"], + "%Y%m%d%H%M%S", + ) + + def save_to_internet_archive(self, entry: dict, source_url: str, wait_time: int): + """ + Wait then post to Internet Archive + :param entry: + :param source_url: + :param wait_time: The amount of time to wait + :return: + """ + api_url = f"http://web.archive.org/save/{source_url}" + time.sleep(wait_time) + requests.post( + api_url, + headers={"Authorization": f"LOW {self.s3_keys}"}, + timeout=10 + ) + # Update the last_cached date if cache is successful + entry["last_cached"] = datetime.now().strftime("%Y-%m-%d") diff --git a/Jenkinsfile b/Jenkinsfile index 1114241..88e3fa4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,12 +14,26 @@ pipeline { } stages { + stage('Retrieve JSON from Previous Build') { + steps { + copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() + script { + def jsonContent = readFile 'cache.json' + def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) + echo "Loaded JSON from previous build: ${jsonData}" + } + } + } stage('Run Automatic Archive') { steps { echo 'Running Automatic Archive...' sh 'python cache_url.py' } } + stage('Save cache') { + steps { + archiveArtifacts artifacts: 'cache.json', fingerprint: true + } } post { failure { diff --git a/PDAPInterface.py b/PDAPInterface.py new file mode 100644 index 0000000..6feabec --- /dev/null +++ b/PDAPInterface.py @@ -0,0 +1,49 @@ +import json +from datetime import datetime + +import requests + +from enums import UpdateFrequency + + +class PDAPInterface: + + def __init__(self, base_url: str): + self.base_url = base_url + + + def update_pdap_archives( + self, + entry: dict, + authorization_header: dict + ): + """ + Update data in PDAP archives + :param entry: + :return: + """ + response = requests.put( + f"{self.base_url}/archives", + json=entry, + headers=authorization_header, + timeout=10 + ) + response.raise_for_status() + + def get_from_pdap_archives( + self, + authorization_header: dict, + update_frequency: UpdateFrequency, + last_archived_before: datetime + ): + response = requests.get( + f"{self.base_url}/archives", + params={ + "update_frequency": update_frequency.value, + "last_archived_before": last_archived_before.isoformat() + }, + headers=authorization_header, + timeout=10 + ) + response.raise_for_status() + return response.json() \ No newline at end of file diff --git a/README.md b/README.md index d97f1a5..ac3519a 100644 --- a/README.md +++ b/README.md @@ -10,4 +10,7 @@ Requires the following environment variables to be set: ```text VUE_APP_PDAP_API_KEY= VITE_VUE_APP_BASE_URL= +PDAP_EMAIL= +PDAP_PASSWORD= +INTERNET_ARCHIVE_S3_KEYS= ``` \ No newline at end of file diff --git a/cache_url.py b/cache_url.py index 5d82c2f..1f216ef 100644 --- a/cache_url.py +++ b/cache_url.py @@ -1,18 +1,24 @@ import json from datetime import datetime, timedelta from dataclasses import dataclass + +from dotenv import load_dotenv from tqdm import tqdm import requests import os import time +from AccessManager import AccessManager +from CacheManager import CacheManager +from InternetArchiveInterface import InternetArchiveInterface +from PDAPInterface import PDAPInterface +from enums import UpdateFrequency +from exceptions import ArchiveLastCacheNotFoundError + # How long to wait in between archive requests, in seconds # Too many requests will result in the IP being temporarily blocked: https://archive.org/details/toomanyrequests_20191110 ARCHIVE_WAIT_TIME = 7 -class ArchiveLastCacheNotFoundError(Exception): - pass - @dataclass class ArchiveEntry: @@ -27,7 +33,7 @@ def from_dict(cls, dict_entry: dict): return cls(**dict_entry) -API_KEY = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY") +API_KEY = "Basic " + os.getenv("VUE_APP_PDAP_API_KEY") UPDATE_FREQUENCY_MAPPING = { "Incident-based": 7, "< Hourly": 1 / 24, @@ -44,69 +50,6 @@ def from_dict(cls, dict_entry: dict): "Other": None, } - -def archive_url(entry: dict): - """ - - :param entry: - :return: - """ - entry["broken_source_url_as_of"] = None - source_url = entry.get("source_url") - - try: - wait_then_post(entry, source_url, ARCHIVE_WAIT_TIME) - except Exception as error: - try: - wait_then_post(entry, source_url, 10) - except Exception as error: - print(str(error)) - # Send updated data to Data Sources - update_pdap_archives(entry) - -def wait_then_post(entry: dict, source_url: str, wait_time: int): - """ - Wait then post to Internet Archive - :param entry: - :param source_url: - :param wait_time: The amount of time to wait - :return: - """ - api_url = f"http://web.archive.org/save/{source_url}" - time.sleep(wait_time) - requests.post(api_url) - # Update the last_cached date if cache is successful - entry["last_cached"] = datetime.now().strftime("%Y-%m-%d") - - -def handle_missing_source_url(entry: dict): - """ - Record when url was found to be missing, - update PDAP archives, and throw exception - :param entry: - :return: - """ - entry["broken_source_url_as_of"] = datetime.now().strftime("%Y-%m-%d") - update_pdap_archives(entry) - raise Exception("No source_url") - - -def update_pdap_archives(entry: dict): - """ - Update data in PDAP archives - :param entry: - :return: - """ - entry_json = json.dumps(entry) - response = requests.put( - f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives", - json=entry_json, - headers={"Authorization": API_KEY}, - ) - response.raise_for_status() - - - def get_update_delta(update_frequency: str | None) -> timedelta: """ Calculate update delt based on entry's update frequency @@ -122,107 +65,224 @@ def get_update_delta(update_frequency: str | None) -> timedelta: return timedelta(days=int(update_delta)) -def get_website_info_data_last_cached(source_url) -> datetime: - website_info_data = get_website_info_data(source_url) - if not website_info_data["archived_snapshots"]: - raise ArchiveLastCacheNotFoundError - return datetime.strptime( - website_info_data["archived_snapshots"]["closest"]["timestamp"], - "%Y%m%d%H%M%S", - ) - - -def get_last_archived(last_archived: str | None, source_url: str) -> datetime: - """ - Get last archived date of website from Internet Archive. - :param entry: - :param source_url: - :return: - """ - if last_archived is not None: - try: - return datetime.strptime(last_archived, "%Y-%m-%d") - except ValueError: - return datetime.min - # Check if website exists in archive and compare archived website to current site - last_archived = datetime.min +def last_archived_or_datetime_min(last_archived): try: - website_info_data_last_cached = get_website_info_data_last_cached(source_url) - except ArchiveLastCacheNotFoundError: - return last_archived - if website_info_data_last_cached > last_archived: - return website_info_data_last_cached - return last_archived + return datetime.strptime(last_archived, "%Y-%m-%d") + except ValueError: + return datetime.min -def get_website_info_data(source_url): - website_info = requests.get( - f"https://archive.org/wayback/available?url={source_url}" - ) - website_info_data = website_info.json() - return website_info_data - +def missing_source_url(entry: dict): + return entry['source_url'] is None -def main(): - data = get_from_archives() - extract_url_info_and_archived_if_needed(data) +def is_overdue_for_update(last_archived, update_delta): + return last_archived + update_delta < datetime.now() -def extract_url_info_and_archived_if_needed(data: list[dict]): - """ +def get_past_date(reference_date: datetime, frequency: UpdateFrequency) -> datetime: + year_delta = timedelta(days=365) + + frequency_map = { + UpdateFrequency.WEEKLY: timedelta(weeks=1), + UpdateFrequency.DAILY: timedelta(days=1), + UpdateFrequency.RARELY_UPDATED: year_delta, # Arbitrary long period + UpdateFrequency.MORE_THAN_ANNUALLY: year_delta, + UpdateFrequency.ON_REQUEST: year_delta, # Arbitrary long period + UpdateFrequency.MONTHLY: timedelta(days=30), + UpdateFrequency.INCIDENT_BASED: year_delta, # Arbitrary long period + UpdateFrequency.ANNUALLY: year_delta, + UpdateFrequency.LESS_THAN_HOURLY: timedelta(minutes=30), # Arbitrary small period + UpdateFrequency.BI_WEEKLY: timedelta(weeks=2), + UpdateFrequency.HOURLY: timedelta(hours=1), + UpdateFrequency.QUARTERLY: timedelta(days=90), + } + + return reference_date - frequency_map.get(frequency, timedelta(days=0)) + + +class Archiver: + + def __init__( + self, + access_manager: AccessManager, + cache_manager: CacheManager, + internet_archive_interface: InternetArchiveInterface, + pdap_interface: PDAPInterface + ): + self.access_manager = access_manager + self.cache_manager = cache_manager + self.internet_archive_interface = internet_archive_interface + self.pdap_interface = pdap_interface + + def get_from_pdap_archives( + self, + update_frequency: UpdateFrequency, + last_archived_before: datetime + ): + return self.pdap_interface.get_from_pdap_archives( + authorization_header=self.access_manager.get_bearer_authorization_header(), + update_frequency=update_frequency, + last_archived_before=last_archived_before + ) + + def handle_missing_source_url(self, entry: dict): + entry["broken_source_url_as_of"] = datetime.now().strftime("%Y-%m-%d") + self.update_pdap_archives(entry) + raise Exception("No source_url") + + def get_last_archived( + self, + last_archived: str | None, + source_url: str, + ) -> datetime: + """ + Get last archived date of website from Internet Archive. + :param entry: + :param source_url: + :return: + """ + if last_archived is not None: + return last_archived_or_datetime_min(last_archived) + # Check if website exists in archive and compare archived website to current site + last_archived = datetime.min + try: + if self.cache_manager.has_source_url(source_url): + website_info_data_last_cached = self.cache_manager.get_last_cached(source_url) + else: + website_info_data_last_cached = self.internet_archive_interface.get_website_info_data_last_cached( + source_url + ) + self.cache_manager.set_last_cached(source_url, website_info_data_last_cached) + except ArchiveLastCacheNotFoundError: + self.cache_manager.set_last_cached(source_url, last_archived) + return last_archived + if website_info_data_last_cached > last_archived: + return website_info_data_last_cached + return last_archived - :param data: - :return: - """ - # Create a tuple of entries with missing source URLs - missing_source_url_entries = tuple(filter(missing_source_url, data)) - - # Handle entries with missing source URLs - print("Handling missing source urls") - for entry in tqdm(missing_source_url_entries): - handle_missing_source_url(entry) - - print("\nFinding entries that need updates") - non_missing_source_url_entries = tuple(filter(lambda e: not missing_source_url(e), data)) - entries_needing_updates = [] - for entry in tqdm(non_missing_source_url_entries): - if needs_updated(entry): - entries_needing_updates.append(entry) - - print(f"Updating {len(entries_needing_updates)} entries that need updates") - # Handle entries that need to be updated - for entry in tqdm(entries_needing_updates): + def needs_updated(self, entry: dict) -> bool: + """ + Check if entry needs to be updated + :param entry: + :return: + """ + last_archived = self.get_last_archived( + last_archived=entry["last_cached"], + source_url=entry["source_url"], + ) + update_delta = get_update_delta(entry["update_frequency"]) + return is_overdue_for_update(last_archived, update_delta) + + def get_entries_needing_updates(self, data): + non_missing_source_url_entries = tuple(filter(lambda e: not missing_source_url(e), data)) + entries_needing_updates = [] + for entry in tqdm(non_missing_source_url_entries): + if self.needs_updated(entry): + entries_needing_updates.append(entry) + self.cache_manager.save_cache() + return entries_needing_updates + + def update_pdap_archives( + self, + entry: dict, + ): + """ + Update data in PDAP archives + :param entry: + :return: + """ try: - archive_url(entry) + self.pdap_interface.update_pdap_archives( + entry=entry, + authorization_header=self.access_manager.get_bearer_authorization_header(), + ) except Exception as error: - print(str(error)) + # Try again after refreshing access token + self.access_manager.refresh_access_token() + self.pdap_interface.update_pdap_archives( + entry=entry, + authorization_header=self.access_manager.get_bearer_authorization_header(), + ) + + + def archive_url( + self, + entry: dict, + ): + """ + + :param entry: + :return: + """ + entry["broken_source_url_as_of"] = None + source_url = entry.get("source_url") -def missing_source_url(entry: dict): - return entry['source_url'] is None + try: + self.internet_archive_interface.save_to_internet_archive(entry, source_url, ARCHIVE_WAIT_TIME) + self.cache_manager.set_last_cached(source_url, datetime.now()) + except Exception as error: + try: + self.internet_archive_interface.save_to_internet_archive(entry, source_url, 10) + self.cache_manager.set_last_cached(source_url, datetime.now()) + except Exception as error: + print(str(error)) + # Send updated data to Data Sources + self.update_pdap_archives( + entry, + ) + + def update_entries(self, entries_needing_updates): + for entry in tqdm(entries_needing_updates): + self.archive_url(entry) + + def extract_url_info_and_archive_if_needed(self, data): + """ + :param data: + :return: + """ + # Create a tuple of entries with missing source URLs + missing_source_url_entries = tuple(filter(missing_source_url, data)) + + # Handle entries with missing source URLs + print("Handling missing source urls") + for entry in tqdm(missing_source_url_entries): + self.handle_missing_source_url(entry) + + print("\nFinding entries that need updates") + entries_needing_updates = self.get_entries_needing_updates(data) + + print(f"Updating {len(entries_needing_updates)} entries that need updates") + # Handle entries that need to be updated + self.update_entries(entries_needing_updates) + + def main(self): + today = datetime.today() + for update_frequency in UpdateFrequency: + print(f"Getting {update_frequency.value} entries...") + last_date = get_past_date(reference_date=today, frequency=update_frequency) + data = self.get_from_pdap_archives( + last_archived_before=last_date, + update_frequency=update_frequency + ) + self.extract_url_info_and_archive_if_needed( + data, + ) -def needs_updated(entry: dict) -> bool: - """ - Check if entry needs to be updated - :param entry: - :return: - """ - last_archived = get_last_archived(entry["last_cached"], entry["source_url"]) - update_delta = get_update_delta(entry["update_frequency"]) - return last_archived + update_delta < datetime.now() +if __name__ == "__main__": -def get_from_archives() -> list[dict]: - """ - Get data from PDAP Archive. - :param url: - :return: - """ - response = requests.get( - f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives", - headers={"Authorization": API_KEY}, + load_dotenv() + archiver = Archiver( + access_manager=AccessManager( + email=os.getenv("PDAP_EMAIL"), + password=os.getenv("PDAP_PASSWORD"), + ), + cache_manager=CacheManager(), + internet_archive_interface=InternetArchiveInterface( + s3_keys=os.getenv("INTERNET_ARCHIVE_S3_KEYS") + ), + pdap_interface=PDAPInterface( + base_url=os.getenv("VITE_VUE_APP_BASE_URL") + ) ) - response.raise_for_status() - return response.json() + archiver.main() - -if __name__ == "__main__": - main() diff --git a/enums.py b/enums.py new file mode 100644 index 0000000..4195a24 --- /dev/null +++ b/enums.py @@ -0,0 +1,15 @@ +from enum import Enum + +class UpdateFrequency(Enum): + WEEKLY = "Weekly" + DAILY = "Daily" + RARELY_UPDATED = "No updates / rarely updated" + MORE_THAN_ANNUALLY = "> Annually" + ON_REQUEST = "On request" + MONTHLY = "Monthly" + INCIDENT_BASED = "Incident-based" + ANNUALLY = "Annually" + LESS_THAN_HOURLY = "< Hourly" + BI_WEEKLY = "Bi-weekly" + HOURLY = "Hourly" + QUARTERLY = "Quarterly" \ No newline at end of file diff --git a/exceptions.py b/exceptions.py new file mode 100644 index 0000000..3d88c95 --- /dev/null +++ b/exceptions.py @@ -0,0 +1,2 @@ +class ArchiveLastCacheNotFoundError(Exception): + pass diff --git a/requirements.txt b/requirements.txt index b483a61..8805968 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,6 @@ requests==2.28.2 savepagenow==1.2.3 urllib3==1.26.15 pytest==8.2.1 -tqdm==4.66.4 \ No newline at end of file +tqdm==4.66.4 +python-dotenv~=1.0.1 +internetarchive~=5.2.1 \ No newline at end of file From c02e841b355c31ee4098e2fb7843fc17c2456e5e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 7 Mar 2025 07:54:08 -0500 Subject: [PATCH 2/7] feat(app): update to accomodate changes in data sources app --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 88e3fa4..6ede734 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,6 +34,7 @@ pipeline { steps { archiveArtifacts artifacts: 'cache.json', fingerprint: true } + } } post { failure { From 16e7001de1dccea6e8197031b3f9dcc23da62a77 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 7 Mar 2025 07:59:36 -0500 Subject: [PATCH 3/7] feat(app): update to accomodate changes in data sources app --- Jenkinsfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6ede734..0bfe4f5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,13 +15,13 @@ pipeline { stages { stage('Retrieve JSON from Previous Build') { - steps { + try { copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() - script { - def jsonContent = readFile 'cache.json' - def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) - echo "Loaded JSON from previous build: ${jsonData}" - } + def jsonContent = readFile 'cache.json' + def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) + echo "Loaded JSON from previous build: ${jsonData}" + } catch (Exception e) { + echo "No previous successful build found, proceeding without cache." } } stage('Run Automatic Archive') { From ca5457e627b76396159472222a7294c967eceb75 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 7 Mar 2025 08:01:16 -0500 Subject: [PATCH 4/7] feat(app): update to accomodate changes in data sources app --- Jenkinsfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0bfe4f5..f9dddfb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,13 +15,15 @@ pipeline { stages { stage('Retrieve JSON from Previous Build') { - try { - copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() - def jsonContent = readFile 'cache.json' - def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) - echo "Loaded JSON from previous build: ${jsonData}" - } catch (Exception e) { - echo "No previous successful build found, proceeding without cache." + steps { + try { + copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() + def jsonContent = readFile 'cache.json' + def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) + echo "Loaded JSON from previous build: ${jsonData}" + } catch (Exception e) { + echo "No previous successful build found, proceeding without cache." + } } } stage('Run Automatic Archive') { From 326df936d74e58c91b3d8bbf04fc79ad69f173ae Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 7 Mar 2025 08:02:16 -0500 Subject: [PATCH 5/7] feat(app): update to accomodate changes in data sources app --- Jenkinsfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f9dddfb..a7822ce 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,13 +16,15 @@ pipeline { stages { stage('Retrieve JSON from Previous Build') { steps { - try { - copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() - def jsonContent = readFile 'cache.json' - def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) - echo "Loaded JSON from previous build: ${jsonData}" - } catch (Exception e) { - echo "No previous successful build found, proceeding without cache." + script{ + try { + copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() + def jsonContent = readFile 'cache.json' + def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) + echo "Loaded JSON from previous build: ${jsonData}" + } catch (Exception e) { + echo "No previous successful build found, proceeding without cache." + } } } } From 240bebe07ea6914e07ddeb365e2155f794222843 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 7 Mar 2025 08:03:26 -0500 Subject: [PATCH 6/7] feat(app): update to accomodate changes in data sources app --- Jenkinsfile | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a7822ce..1114241 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,31 +14,12 @@ pipeline { } stages { - stage('Retrieve JSON from Previous Build') { - steps { - script{ - try { - copyArtifacts projectName: "${env.JOB_NAME}", filter: 'cache.json', selector: lastSuccessful() - def jsonContent = readFile 'cache.json' - def jsonData = new groovy.json.JsonSlurper().parseText(jsonContent) - echo "Loaded JSON from previous build: ${jsonData}" - } catch (Exception e) { - echo "No previous successful build found, proceeding without cache." - } - } - } - } stage('Run Automatic Archive') { steps { echo 'Running Automatic Archive...' sh 'python cache_url.py' } } - stage('Save cache') { - steps { - archiveArtifacts artifacts: 'cache.json', fingerprint: true - } - } } post { failure { From fb72b00520978de19e01aa744a35742a1f0106be Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 11 Mar 2025 14:58:35 -0400 Subject: [PATCH 7/7] fix(app): fix archives get to call use `YYYY-MM-DD` formatting --- PDAPInterface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PDAPInterface.py b/PDAPInterface.py index 6feabec..e456d5d 100644 --- a/PDAPInterface.py +++ b/PDAPInterface.py @@ -40,7 +40,7 @@ def get_from_pdap_archives( f"{self.base_url}/archives", params={ "update_frequency": update_frequency.value, - "last_archived_before": last_archived_before.isoformat() + "last_archived_before": last_archived_before.strftime("%Y-%m-%d") }, headers=authorization_header, timeout=10