-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmedium2dev.py
More file actions
445 lines (359 loc) · 18.8 KB
/
medium2dev.py
File metadata and controls
445 lines (359 loc) · 18.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
#!/usr/bin/env python3
"""
Medium2Dev - Convert Medium posts to DEV.to markdown format and optionally publish to DEV.to
"""
import argparse
import os
import re
import requests
import sys
import json
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import html2text
import time
# Configure the root logger once at import time: INFO and above, with each
# record showing timestamp, logger name, severity and message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-wide logger used by the converter class and the CLI entry point.
logger = logging.getLogger('medium2dev')
class Medium2Dev:
    """Convert a Medium post into DEV.to markdown and optionally publish it as a draft.

    ``convert()`` runs the whole pipeline: fetch the page, extract the article
    body, download its images, convert the HTML to markdown, prepend DEV.to
    front matter and write ``<slug>.md`` into the output directory.
    ``publish_to_devto()`` then posts that markdown to the DEV.to API.
    """

    # Seconds allowed for any single HTTP request; without an explicit timeout
    # ``requests`` waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )

    # Tags collected from the article container, and the subset that are headings.
    _CONTENT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'figure',
                     'img', 'blockquote', 'ul', 'ol', 'div']
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Markers used to recognise byline / UI / footer elements while extracting.
    _BYLINE_CLASSES = ('postMetaLockup', 'graf--authorName', 'authorLockup')
    _UI_TEXT_RE = re.compile(r'clap|follow|min read|sign up|bookmark|Listen|Share')
    _FOOTER_TEXT_RE = re.compile(r'In Plain English|Thank you for being a part of')
    _NOISE_PARAGRAPH_RE = re.compile(r'^\s*--\s*$|^\s*\d+\s*$')

    def __init__(self, url, output_dir=None, image_dir=None, api_key=None):
        """Store the conversion settings and make sure the image directory exists.

        Args:
            url: URL of the Medium post to convert.
            output_dir: Directory for the markdown file; defaults to the
                current working directory.
            image_dir: Directory for downloaded images; defaults to
                ``<output_dir>/images``.
            api_key: DEV.to API key, only needed by ``publish_to_devto``.
        """
        self.url = url
        self.output_dir = output_dir or os.getcwd()
        self.image_dir = image_dir or os.path.join(self.output_dir, 'images')
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self._USER_AGENT})
        self.medium_word_count = 0
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.image_dir, exist_ok=True)

    def fetch_article(self):
        """Download the article page and return its HTML text.

        Follows one JavaScript ``window.location.href = "..."`` redirect if the
        first response contains one.

        Returns:
            The HTML of the (possibly redirected) page.

        Raises:
            SystemExit: with status 1 when any request fails.
        """
        logger.info(f"Fetching article from {self.url}")
        # Browser-like headers sent with the page requests.
        headers = {
            'User-Agent': self._USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://medium.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        try:
            response = self.session.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            # Check if we need to handle a JavaScript redirect.
            match = re.search(r'window\.location\.href\s*=\s*"([^"]+)"', response.text)
            if match:
                # The target may be relative; resolve it against the page URL.
                redirect_url = urljoin(response.url, match.group(1))
                logger.info(f"Following redirect to {redirect_url}")
                response = self.session.get(redirect_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.error(f"Error fetching article: {e}")
            sys.exit(1)

    def extract_content(self, html_content):
        """Pull the title, publication date and body elements out of the page HTML.

        Also stores the word count of the kept elements in
        ``self.medium_word_count``.

        Args:
            html_content: HTML text of the Medium page.

        Returns:
            A dict with ``'title'`` (str), ``'date'`` (``YYYY-MM-DD`` or ``""``)
            and ``'content'`` (a new ``div`` tag holding the kept elements).

        Raises:
            SystemExit: with status 1 when no article container is found.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        title_tag = soup.find('h1')
        title = title_tag.text.strip() if title_tag else "Untitled Article"

        # The date is only used for the front matter; keep the part before 'T'.
        date_tag = soup.find('meta', property='article:published_time')
        date = date_tag.get('content', '').split('T')[0] if date_tag else ""

        # Try the <article> tag first, then two alternative containers.
        article_tag = (
            soup.find('article')
            or soup.select_one('div.section-content')
            or soup.find('div', class_='postArticle-content')
        )
        if not article_tag:
            logger.error("Could not find article content")
            sys.exit(1)

        # Collect the wanted elements into a fresh container.
        # NOTE(review): append() relocates a tag, so a matched tag nested in
        # another matched tag (e.g. <img> inside <figure>) appears to end up as
        # a direct child of the container — confirm this flattening is intended.
        content_div = soup.new_tag('div')
        for element in article_tag.find_all(self._CONTENT_TAGS):
            if element.name in self._HEADING_TAGS:
                # Skip the title (h1) since it goes into the front matter.
                if element.name == 'h1' and element.text.strip() == title:
                    continue
            else:
                # Author byline / metadata containers.
                class_text = str(element.get('class', []))
                if any(marker in class_text for marker in self._BYLINE_CLASSES):
                    continue
                # Claps, follow buttons, reading time and similar UI text.
                if element.find(string=self._UI_TEXT_RE):
                    continue
                # "In Plain English" publication footer.
                if element.find(string=self._FOOTER_TEXT_RE):
                    continue
            if element.name == 'p':
                text = element.text.strip()
                # Paragraphs holding only "--", a bare number or "·".
                if text == '·' or self._NOISE_PARAGRAPH_RE.match(text):
                    continue
            content_div.append(element)

        # Word count of what was kept, reported later next to the DEV.to count.
        content_text = ' '.join(child.get_text() for child in content_div.contents)
        self.medium_word_count = len(content_text.split())
        logger.info(f"Original Medium content word count: {self.medium_word_count}")

        return {
            'title': title,
            'date': date,
            'content': content_div
        }

    def _markdown_image_path(self, img_path):
        """Return the forward-slash path to ``img_path`` relative to the output directory."""
        try:
            relative = os.path.relpath(img_path, self.output_dir)
        except ValueError:
            # relpath() raises ValueError on Windows when the two paths are on
            # different drives; fall back to the absolute path.
            relative = os.path.abspath(img_path)
        # Markdown links use '/', regardless of the platform separator.
        return relative.replace(os.sep, '/')

    def download_images(self, content):
        """Download every content image and point its ``src`` at the local copy.

        Author avatars (64x64 / 88x88 resize URLs) are removed from the content
        instead of being downloaded. An image whose download fails keeps its
        original ``src`` and a warning is logged.

        Args:
            content: Parsed HTML tag whose ``<img>`` descendants are processed.

        Returns:
            The same ``content`` object, modified in place.
        """
        os.makedirs(self.image_dir, exist_ok=True)
        downloaded_count = 0
        for i, img in enumerate(content.find_all('img')):
            src = img.get('src')
            if not src:
                continue
            img_url = src if src.startswith(('http://', 'https://')) else urljoin(self.url, src)

            # Skip small profile images and icons.
            if 'resize:fill:64:64' in img_url or 'resize:fill:88:88' in img_url:
                img.decompose()
                continue

            # For Medium-hosted images, drop the resize segment and the query
            # string to ask for the full-size original.
            if 'miro.medium.com' in img_url:
                img_url = re.sub(r'/resize:[^/]+/', '/', img_url)
                img_url = img_url.split('?')[0]

            img_extension = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
            img_filename = f"image_{i+1}{img_extension}"
            img_path = os.path.join(self.image_dir, img_filename)

            try:
                logger.info(f"Downloading image: {img_url}")
                # The context manager closes the streamed connection even if
                # writing the file fails part-way.
                with self.session.get(img_url, stream=True, timeout=self.REQUEST_TIMEOUT) as img_response:
                    img_response.raise_for_status()
                    with open(img_path, 'wb') as f:
                        for chunk in img_response.iter_content(chunk_size=8192):
                            f.write(chunk)
                # Reference the local copy relative to where the markdown file
                # is written, so a custom image directory is linked correctly.
                img['src'] = self._markdown_image_path(img_path)
                downloaded_count += 1
            except requests.RequestException as e:
                logger.warning(f"Failed to download image {img_url}: {e}")

        logger.info(f"Downloaded {downloaded_count} content images")
        return content

    def convert_to_markdown(self, content):
        """Convert the extracted HTML content to markdown suitable for DEV.to.

        Args:
            content: Parsed HTML tag holding the article body; it is modified
                in place before conversion.

        Returns:
            The cleaned-up markdown text.
        """
        # Mark <pre> blocks that wrap a <code> element.
        for pre in content.find_all('pre'):
            if pre.find('code'):
                pre['class'] = 'highlight'

        # Use each figure's caption as the alt text of its image.
        for figure in content.find_all('figure'):
            figcaption = figure.find('figcaption')
            img = figure.find('img') if figcaption else None
            if img:
                img['alt'] = figcaption.text.strip()

        # Remove Medium-specific UI elements and metadata.
        for element in content.select('.postMetaLockup, .graf--pullquote, .section-divider, .js-actionMultirecommendCount, .js-actionRecommend'):
            element.decompose()
        # Remove share buttons, claps, and other interactive elements.
        for element in content.select('button, .buttonSet, .js-actionRecommend, .js-postMetaLockup'):
            element.decompose()

        h2t = html2text.HTML2Text()
        h2t.body_width = 0  # Don't wrap lines
        h2t.backquote_code_style = True  # Use fenced code blocks
        h2t.escape_snob = True  # Escape Markdown characters
        h2t.ignore_links = False
        h2t.ignore_images = False
        h2t.ignore_emphasis = False
        h2t.ignore_tables = False
        markdown = h2t.handle(str(content))

        return self._postprocess_markdown(markdown)

    @staticmethod
    def _normalize_headings(markdown):
        """Demote level-one headings and put a blank line before every heading.

        Lines between ``` fences are left untouched so that ``# comment`` lines
        inside code blocks are not mistaken for headings.
        """
        result = []
        in_fence = False
        for line in markdown.split('\n'):
            if line.lstrip().startswith('```'):
                in_fence = not in_fence
                result.append(line)
                continue
            if not in_fence and re.match(r'#{1,6} ', line):
                # "# Title" becomes "## Title".
                if line.startswith('# ') and len(line) > 2:
                    line = '#' + line
                # Ensure the heading is separated from preceding text.
                if result and result[-1].strip():
                    result.append('')
            result.append(line)
        return '\n'.join(result)

    @staticmethod
    def _postprocess_markdown(markdown):
        """Strip Medium UI leftovers from converted markdown and tidy its layout.

        Image references are kept exactly as produced by the converter, so the
        locally downloaded images stay linked in the output.
        """
        # Drop empty fenced code blocks.
        # NOTE(review): this also matches a closing fence directly followed by
        # an opening fence, which would merge two adjacent code blocks.
        markdown = re.sub(r'```\n\s*```', '', markdown)

        markdown = Medium2Dev._normalize_headings(markdown)

        # Remove link-only lines pointing at medium.com (footer links).
        markdown = re.sub(r'\n\s*\[.*?\]\(https?://medium\.com/.*?\)\s*\n', '\n\n', markdown)
        # Remove clap indicators and other Medium UI elements.
        markdown = re.sub(r'\d+\s*claps?', '', markdown)
        markdown = re.sub(r'Follow\s*\d+\s*min read', '', markdown)
        # Remove the "--", counter, "Listen", "Share" run at the very beginning.
        markdown = re.sub(r'^\s*--\s*\n+\d+\s*\n+Listen\s*\n+Share\s*\n+', '', markdown)
        markdown = re.sub(r'^\s*--\s*\n+\d+\s*\n+', '', markdown)
        markdown = re.sub(r'^\s*·\s*\n+', '', markdown)
        markdown = re.sub(r'^\s*\\--\s*\n+', '', markdown)
        # Fix code links format - change `[text](url)` to [`text`](url).
        markdown = re.sub(r'`\[(.*?)\]\((.*?)\)`', r'[`\1`](\2)', markdown)
        # Remove the "In Plain English" community footer through to the end of
        # the document; '#+' consumes the whole heading marker.
        markdown = re.sub(r'#+ In Plain English.*?$', '', markdown, flags=re.DOTALL)
        markdown = re.sub(r'_Thank you for being a part of the_.*?$', '', markdown, flags=re.DOTALL)
        # Remove an author byline at the very beginning: a first line that is
        # nothing but a link (any or empty text) to a medium.com host.
        markdown = re.sub(r'^\s*\[[^\]\n]*\]\(https://[^)\s]*medium\.com/[^)\s]*\)\s*\n+', '', markdown)
        # Clean up multiple blank lines.
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)

        # Remove any remaining "·" and "--" lines at the beginning of the document.
        lines = markdown.split('\n')
        leading = 0
        while leading < len(lines) and lines[leading].strip() in ('·', '--', '\\--'):
            leading += 1
        markdown = '\n'.join(lines[leading:])

        # Final cleanup of any remaining "--" lines.
        markdown = re.sub(r'\n\\--\n', '\n\n', markdown)
        markdown = re.sub(r'\n--\n', '\n\n', markdown)
        return markdown

    def generate_frontmatter(self, title, date):
        """Build the DEV.to front matter block for the article.

        The first URL path component (dashes and non-alphanumerics removed) is
        added as a leading tag when the path has more than one component and
        the component is not ``medium``, ``blog`` or ``posts``.

        Args:
            title: Article title.
            date: Publication date string, may be empty.

        Returns:
            The front matter text, starting with ``---`` and ending with
            ``---`` followed by a newline.
        """
        tags = ["aws", "tutorial", "programming"]  # Default tags

        path_components = urlparse(self.url).path.strip('/').split('/')
        if len(path_components) > 1:
            # The first component might be a publication name or topic.
            potential_tag = path_components[0].replace('-', '')
            if potential_tag and potential_tag not in ['medium', 'blog', 'posts']:
                potential_tag = re.sub(r'[^a-zA-Z0-9]', '', potential_tag)
                # Do not list the same tag twice.
                if potential_tag and potential_tag not in tags:
                    tags.insert(0, potential_tag)

        frontmatter = [
            "---",
            # A JSON string is a valid YAML double-quoted scalar, so titles
            # containing ':' or '"' no longer break the front matter.
            f"title: {json.dumps(title, ensure_ascii=False)}",
            "published: false",  # Set to false for draft
            f"date: \"{date}\"",  # Date in quotes to ensure proper formatting
            f"tags: {json.dumps(tags)}",
            f"canonical_url: \"{self.url}\"",  # URL in quotes to handle special characters
            "cover_image: ",
            "---\n"
        ]
        return "\n".join(frontmatter)

    def convert(self):
        """Run the full conversion and write the markdown file.

        Returns:
            A tuple ``(output_path, title, full_markdown)``.

        Raises:
            SystemExit: propagated from ``fetch_article`` / ``extract_content``.
        """
        html_content = self.fetch_article()
        extracted = self.extract_content(html_content)
        processed_content = self.download_images(extracted['content'])
        markdown_content = self.convert_to_markdown(processed_content)
        frontmatter = self.generate_frontmatter(extracted['title'], extracted['date'])
        full_markdown = frontmatter + "\n" + markdown_content

        # Name the file after the last URL path component; fall back to a fixed
        # name when the URL has no path, which would otherwise yield ".md".
        slug = urlparse(self.url).path.strip('/').split('/')[-1] or 'article'
        output_path = os.path.join(self.output_dir, f"{slug}.md")

        # The output directory may not exist yet when a custom image directory
        # outside of it was given.
        os.makedirs(self.output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(full_markdown)

        logger.info(f"Conversion complete! Output saved to {output_path}")
        return output_path, extracted['title'], full_markdown

    def publish_to_devto(self, title, markdown_content):
        """Post the converted markdown to DEV.to as an unpublished draft.

        Args:
            title: Article title sent in the request payload.
            markdown_content: Full markdown, including front matter, sent as
                ``body_markdown``.

        Returns:
            True when the API call succeeds; False when no API key is set or
            the request fails.
        """
        if not self.api_key:
            logger.error("No DEV.to API key provided. Skipping publish.")
            return False

        logger.info("Publishing to DEV.to as draft...")
        api_url = "https://dev.to/api/articles"
        headers = {
            "api-key": self.api_key,
            "Content-Type": "application/json"
        }
        article_data = {
            "article": {
                "title": title,
                "body_markdown": markdown_content,
                "published": False  # Set as draft
            }
        }

        try:
            response = requests.post(api_url, headers=headers, json=article_data, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Error publishing to DEV.to: {e}")
            # A Response is falsy for 4xx/5xx statuses, so compare against None
            # explicitly or the error body is never logged.
            failed_response = getattr(e, 'response', None)
            if failed_response is not None:
                logger.error(f"Response: {failed_response.text}")
            return False

        logger.info("Successfully published draft to DEV.to! URL: https://dev.to/dashboard/drafts")
        return True
def main():
    """Command-line entry point: convert a Medium post and optionally publish it.

    Parses the command line, resolves the DEV.to API key from ``--api-key`` or
    the ``DEVTO_API_KEY`` environment variable, runs the conversion and, when
    ``--publish`` is given, posts the result to DEV.to as a draft.

    Raises:
        SystemExit: with status 1 when publishing is requested without an API
            key (argparse also exits on invalid arguments).
    """
    parser = argparse.ArgumentParser(description='Convert Medium posts to DEV.to markdown format and optionally publish as draft')
    parser.add_argument('url', help='URL of the Medium post to convert')
    parser.add_argument('-o', '--output-dir', help='Directory to save the output markdown file')
    parser.add_argument('-i', '--image-dir', help='Directory to save downloaded images')
    parser.add_argument('-p', '--publish', action='store_true', help='Publish to DEV.to as draft')
    parser.add_argument('-k', '--api-key', help='DEV.to API key (if not set via DEVTO_API_KEY environment variable)')
    args = parser.parse_args()

    # The command-line key takes precedence over the environment variable.
    api_key = args.api_key or os.environ.get('DEVTO_API_KEY')

    # Fail before doing any network work if publishing cannot succeed.
    if args.publish and not api_key:
        logger.error("Publishing requested but no DEV.to API key provided. Set DEVTO_API_KEY environment variable or use --api-key.")
        sys.exit(1)

    converter = Medium2Dev(args.url, args.output_dir, args.image_dir, api_key)
    output_path, title, markdown_content = converter.convert()

    print(f"\nConversion successful! Output saved to: {output_path}")
    print(f"Images saved to: {converter.image_dir}")

    # Count the words of the body only. The pattern is anchored to the start
    # of the document and limited to one substitution so that later "---"
    # runs in the body (e.g. table separator rows) are not stripped as well.
    body_markdown = re.sub(r'\A---\n.*?\n---\n', '', markdown_content, count=1, flags=re.DOTALL)
    devto_word_count = len(body_markdown.split())

    if args.publish:
        if converter.publish_to_devto(title, markdown_content):
            print("Successfully published as draft to DEV.to!")
            # The comparison table is printed only after a successful publish.
            print("\nWord Count Comparison:")
            print("| Platform | Word Count |")
            print("|----------|------------|")
            print(f"| Medium | {converter.medium_word_count} |")
            print(f"| DEV.to | {devto_word_count} |")
        else:
            print("Failed to publish to DEV.to. Check logs for details.")


if __name__ == '__main__':
    main()