diff --git a/README.md b/README.md
index c81f0df..c4132a2 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,8 @@ specify them as command line arguments.
Clone the repo and install the dependencies:
```bash
-git clone https://github.com/yourusername/substack_scraper.git
-cd substack_scraper
+git clone https://github.com/yourusername/Substack2Markdown.git
+cd Substack2Markdown
# # Optionally create a virtual environment
# python -m venv venv
diff --git a/requirements.txt b/requirements.txt
index c58926a..7cff179 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,25 @@
+attrs==25.4.0
+beautifulsoup4==4.14.3
bs4==0.0.1
+certifi==2026.1.4
+charset-normalizer==3.4.4
+h11==0.16.0
html2text==2020.1.16
+idna==3.11
+Markdown==3.6
+outcome==1.3.0.post0
+packaging==26.0
+PySocks==1.7.1
+python-dotenv==1.2.1
requests==2.31.0
selenium==4.16.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.8.3
tqdm==4.66.1
-webdriver_manager==4.0.1
-Markdown==3.6
+trio==0.32.0
+trio-websocket==0.12.2
+typing_extensions==4.15.0
+urllib3==2.6.3
+webdriver-manager==4.0.1
+wsproto==1.3.2
diff --git a/substack_scraper.py b/substack_scraper.py
index 126d260..6601937 100644
--- a/substack_scraper.py
+++ b/substack_scraper.py
@@ -1,11 +1,12 @@
import argparse
import json
import os
+import sys
+import random
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple
from time import sleep
-
import html2text
import markdown
import requests
@@ -16,10 +17,14 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
-from webdriver_manager.microsoft import EdgeChromiumDriverManager
+from selenium.common.exceptions import SessionNotCreatedException, WebDriverException
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.edge.service import Service as EdgeService
+from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.common.exceptions import SessionNotCreatedException
-from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.microsoft import EdgeChromiumDriverManager
+
from urllib.parse import urlparse
from config import EMAIL, PASSWORD
@@ -28,42 +33,34 @@
BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files
BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files
HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page
-JSON_DATA_DIR: str = "data"
+JSON_DATA_DIR: str = "data"
NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts
def extract_main_part(url: str) -> str:
- parts = urlparse(url).netloc.split('.') # Parse the URL to get the netloc, and split on '.'
- return parts[1] if parts[0] == 'www' else parts[0] # Return the main part of the domain, while ignoring 'www' if
- # present
+ parts = urlparse(url).netloc.split('.')
+ return parts[1] if parts[0] == 'www' else parts[0]
def generate_html_file(author_name: str) -> None:
- """
- Generates a HTML file for the given author.
- """
if not os.path.exists(BASE_HTML_DIR):
os.makedirs(BASE_HTML_DIR)
- # Read JSON data
json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json')
with open(json_path, 'r', encoding='utf-8') as file:
essays_data = json.load(file)
- # Convert JSON data to a JSON string for embedding
embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4)
with open(HTML_TEMPLATE, 'r', encoding='utf-8') as file:
html_template = file.read()
- # Insert the JSON string into the script tag in the HTML template
- html_with_data = html_template.replace('', author_name).replace(
+ html_with_data = html_template.replace('', author_name).replace(
'',
f''
)
html_with_author = html_with_data.replace('author_name', author_name)
- # Write the modified HTML to a new file
html_output_path = os.path.join(BASE_HTML_DIR, f'{author_name}.html')
with open(html_output_path, 'w', encoding='utf-8') as file:
file.write(html_with_author)
@@ -92,62 +89,42 @@ def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str)
self.post_urls: List[str] = self.get_all_post_urls()
def get_all_post_urls(self) -> List[str]:
- """
- Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary.
- """
urls = self.fetch_urls_from_sitemap()
if not urls:
urls = self.fetch_urls_from_feed()
return self.filter_urls(urls, self.keywords)
def fetch_urls_from_sitemap(self) -> List[str]:
- """
- Fetches URLs from sitemap.xml.
- """
sitemap_url = f"{self.base_substack_url}sitemap.xml"
response = requests.get(sitemap_url)
-
if not response.ok:
print(f'Error fetching sitemap at {sitemap_url}: {response.status_code}')
return []
-
root = ET.fromstring(response.content)
urls = [element.text for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')]
return urls
def fetch_urls_from_feed(self) -> List[str]:
- """
- Fetches URLs from feed.xml.
- """
print('Falling back to feed.xml. This will only contain up to the 22 most recent posts.')
feed_url = f"{self.base_substack_url}feed.xml"
response = requests.get(feed_url)
-
if not response.ok:
print(f'Error fetching feed at {feed_url}: {response.status_code}')
return []
-
root = ET.fromstring(response.content)
urls = []
for item in root.findall('.//item'):
link = item.find('link')
if link is not None and link.text:
urls.append(link.text)
-
return urls
@staticmethod
def filter_urls(urls: List[str], keywords: List[str]) -> List[str]:
- """
- This method filters out URLs that contain certain keywords
- """
return [url for url in urls if all(keyword not in url for keyword in keywords)]
@staticmethod
def html_to_md(html_content: str) -> str:
- """
- This method converts HTML to Markdown
- """
if not isinstance(html_content, str):
raise ValueError("html_content must be a string")
h = html2text.HTML2Text()
@@ -157,45 +134,24 @@ def html_to_md(html_content: str) -> str:
@staticmethod
def save_to_file(filepath: str, content: str) -> None:
- """
- This method saves content to a file. Can be used to save HTML or Markdown
- """
if not isinstance(filepath, str):
raise ValueError("filepath must be a string")
-
if not isinstance(content, str):
raise ValueError("content must be a string")
-
if os.path.exists(filepath):
print(f"File already exists: {filepath}")
return
-
with open(filepath, 'w', encoding='utf-8') as file:
file.write(content)
@staticmethod
def md_to_html(md_content: str) -> str:
- """
- This method converts Markdown to HTML
- """
return markdown.markdown(md_content, extensions=['extra'])
-
def save_to_html_file(self, filepath: str, content: str) -> None:
- """
- This method saves HTML content to a file with a link to an external CSS file.
- """
- if not isinstance(filepath, str):
- raise ValueError("filepath must be a string")
-
- if not isinstance(content, str):
- raise ValueError("content must be a string")
-
- # Calculate the relative path from the HTML file to the CSS file
html_dir = os.path.dirname(filepath)
css_path = os.path.relpath("./assets/css/essay-styles.css", html_dir)
- css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths
-
+ css_path = css_path.replace("\\", "/")
html_content = f"""
@@ -212,65 +168,41 @@ def save_to_html_file(self, filepath: str, content: str) -> None: