Skip to content
This repository was archived by the owner on Oct 6, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions AccessManager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os

import requests



class AccessManager:
    """
    Manages access to the API, handling logins and access token storage
    """

    def __init__(self, email: str, password: str):
        # Credentials are retained so the session could be re-established.
        self.email = email
        self.password = password
        self.access_token = None
        self.refresh_token = None
        # Authenticate immediately so tokens are ready from construction.
        self.login()

    def get_bearer_authorization_header(self):
        """Build the Authorization header for authenticated API requests."""
        return {"Authorization": f"Bearer {self.access_token}"}

    def login(self):
        """Log in with the stored credentials and store both tokens."""
        base_url = os.getenv('VITE_VUE_APP_BASE_URL')
        credentials = {"email": self.email, "password": self.password}
        response = requests.post(
            f"{base_url}/auth/login",
            json=credentials,
            timeout=10,
        )
        response.raise_for_status()
        tokens = response.json()
        self.access_token = tokens["access_token"]
        self.refresh_token = tokens["refresh_token"]

    def refresh_access_token(self):
        """Exchange the stored refresh token for a fresh access token."""
        base_url = os.getenv('VITE_VUE_APP_BASE_URL')
        response = requests.post(
            f"{base_url}/auth/refresh-session",
            headers={"Authorization": f"Bearer {self.refresh_token}"},
            timeout=10,
        )
        response.raise_for_status()
        self.access_token = response.json()["access_token"]
40 changes: 40 additions & 0 deletions CacheManager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import datetime
import json
import os


class CacheManager:
    """
    Manages an internal cache for tracking when source URLs were last cached
    in the internet archive
    This is to minimize the number of requests to the archive
    """

    # Single JSON file mapping source URL -> "YYYY-MM-DD" last-cached date.
    # Hoisted to a constant so the path is defined in exactly one place.
    CACHE_FILE = "cache.json"

    def __init__(self):
        # Create an empty JSON object on first run so json.load below succeeds
        if not os.path.exists(self.CACHE_FILE):
            with open(self.CACHE_FILE, "w", encoding="utf-8") as f:
                f.write("{}")

        # Load the cache; changes are persisted only via save_cache()
        with open(self.CACHE_FILE, "r", encoding="utf-8") as f:
            self.cache = json.load(f)

    def save_cache(self):
        """Persist the in-memory cache back to disk."""
        with open(self.CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(self.cache, f)

    def datetime_to_str(self, last_cached: datetime.datetime) -> str:
        """Serialize a datetime into the cache's YYYY-MM-DD string format."""
        return last_cached.strftime("%Y-%m-%d")

    def str_to_datetime(self, last_cached: str) -> datetime.datetime:
        """Parse a YYYY-MM-DD cache string back into a datetime."""
        return datetime.datetime.strptime(last_cached, "%Y-%m-%d")

    def get_last_cached(self, source_url: str) -> datetime.datetime:
        """
        Return the datetime the given URL was last cached.

        NOTE: callers should check has_source_url() first — for a missing
        URL, .get() returns None and strptime(None) raises TypeError.
        """
        return self.str_to_datetime(self.cache.get(source_url))

    def set_last_cached(self, source_url: str, last_cached: datetime.datetime):
        """Record (in memory only) when the URL was last cached."""
        self.cache[source_url] = self.datetime_to_str(last_cached)

    def has_source_url(self, source_url: str) -> bool:
        """Return True if the URL already has an entry in the cache."""
        return source_url in self.cache
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ FROM python:3.11

WORKDIR /usr/src/app

RUN apt-get update && apt-get install -y curl

COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

Expand Down
47 changes: 47 additions & 0 deletions InternetArchiveInterface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import time
from datetime import datetime

import requests

from exceptions import ArchiveLastCacheNotFoundError


class InternetArchiveInterface:
    """Thin client for the Internet Archive availability and save APIs."""

    def __init__(self, s3_keys: str):
        # Credential string used in the "LOW access:secret" auth scheme.
        self.s3_keys = s3_keys

    def get_website_info_data(self, source_url):
        """
        Query the Wayback availability API for the given URL.

        :param source_url: URL to look up
        :return: decoded JSON payload from the availability API
        :raises requests.HTTPError: if the API responds with an error status
        """
        website_info = requests.get(
            "https://archive.org/wayback/available",
            # Pass the URL via params so requests URL-encodes it; the previous
            # f-string interpolation broke for URLs containing '?', '&' or
            # other reserved characters.
            params={"url": source_url},
            timeout=10,
        )
        website_info.raise_for_status()
        return website_info.json()

    def get_website_info_data_last_cached(self, source_url):
        """
        Return the datetime of the closest archived snapshot of the URL.

        :raises ArchiveLastCacheNotFoundError: if no snapshot exists
        """
        website_info_data = self.get_website_info_data(source_url)
        if not website_info_data["archived_snapshots"]:
            raise ArchiveLastCacheNotFoundError
        return datetime.strptime(
            website_info_data["archived_snapshots"]["closest"]["timestamp"],
            "%Y%m%d%H%M%S",
        )

    def save_to_internet_archive(self, entry: dict, source_url: str, wait_time: int):
        """
        Wait then post to Internet Archive

        :param entry: archive record; its "last_cached" field is updated
            in place on success
        :param source_url: URL to ask the archive to capture
        :param wait_time: The amount of time to wait (seconds) before posting,
            to rate-limit requests to the archive
        :return: None
        :raises requests.HTTPError: if the save request fails
        """
        api_url = f"http://web.archive.org/save/{source_url}"
        time.sleep(wait_time)
        response = requests.post(
            api_url,
            headers={"Authorization": f"LOW {self.s3_keys}"},
            timeout=10
        )
        # Update the last_cached date only if the cache is successful;
        # previously the date was recorded even when the POST failed.
        response.raise_for_status()
        entry["last_cached"] = datetime.now().strftime("%Y-%m-%d")
49 changes: 49 additions & 0 deletions PDAPInterface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import json
from datetime import datetime

import requests

from enums import UpdateFrequency


class PDAPInterface:
    """Client for the PDAP archives endpoints."""

    def __init__(self, base_url: str):
        self.base_url = base_url

    def update_pdap_archives(self, entry: dict, authorization_header: dict):
        """
        Update data in PDAP archives

        :param entry: archive record sent as the JSON request body
        :param authorization_header: bearer-token header dict
        :return: None
        :raises requests.HTTPError: if the API responds with an error status
        """
        url = f"{self.base_url}/archives"
        response = requests.put(
            url,
            json=entry,
            headers=authorization_header,
            timeout=10,
        )
        response.raise_for_status()

    def get_from_pdap_archives(
        self,
        authorization_header: dict,
        update_frequency: UpdateFrequency,
        last_archived_before: datetime,
    ):
        """
        Fetch archive entries filtered by update frequency and cutoff date.

        :param authorization_header: bearer-token header dict
        :param update_frequency: frequency bucket to filter on
        :param last_archived_before: only entries archived before this date
        :return: decoded JSON payload from the API
        :raises requests.HTTPError: if the API responds with an error status
        """
        cutoff = last_archived_before.strftime("%Y-%m-%d")
        query = {
            "update_frequency": update_frequency.value,
            "last_archived_before": cutoff,
        }
        response = requests.get(
            f"{self.base_url}/archives",
            params=query,
            headers=authorization_header,
            timeout=10,
        )
        response.raise_for_status()
        return response.json()
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,7 @@ Requires the following environment variables to be set:
```text
VUE_APP_PDAP_API_KEY=<YOUR_PDAP_API_KEY>
VITE_VUE_APP_BASE_URL=<YOUR_PDAP_API_URL>
PDAP_EMAIL=<YOUR_PDAP_EMAIL>
PDAP_PASSWORD=<YOUR_PDAP_PASSWORD>
INTERNET_ARCHIVE_S3_KEYS=<YOUR_INTERNET_ARCHIVE_S3_KEYS>
```
Loading