diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a295864
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index 471301b..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,2 +0,0 @@
-syntax:glob
-*.pyc
diff --git a/crawl.py b/crawl.py
index 22da38a..68fcf41 100644
--- a/crawl.py
+++ b/crawl.py
@@ -1,118 +1,121 @@
-import argparse
-import sys
-import locale
-import codecs
-import os
-from wikidot import Wikidot
-from rmaint import RepoMaintainer
-
-# TODO: Files.
-# TODO: Forum and comment pages.
-# TODO: Ability to download new transactions since last dump.
-#   We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
-
-rawStdout = sys.stdout
-rawStderr = sys.stderr
-sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace')
-sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace')
-
-parser = argparse.ArgumentParser(description='Queries Wikidot')
-parser.add_argument('site', help='URL of Wikidot site')
-# Actions
-parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
-parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
-parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
-parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
-parser.add_argument('--dump', type=str, help='Download page revisions to this directory')
-# Debug actions
-parser.add_argument('--list-pages-raw', action='store_true')
-parser.add_argument('--log-raw', action='store_true')
-# Action settings
-parser.add_argument('--page', type=str, help='Query only this page')
-parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
-parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository')
-# Common settings
-parser.add_argument('--debug', action='store_true', help='Print debug info')
-parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
-args = parser.parse_args()
-
-
-wd = Wikidot(args.site)
-wd.debug = args.debug
-wd.delay = args.delay
-
-
-def force_dirs(path):
-    try:
-        os.makedirs(path)
-    except OSError as exception:
-        if exception.errno != os.errno.EEXIST:
-            raise
-
-if args.list_pages_raw:
-	print wd.list_pages_raw(args.depth)
-
-elif args.list_pages:
-	for page in wd.list_pages(args.depth):
-		print page
-
-elif args.source:
-	if not args.page:
-		raise "Please specify --page for --source."
-	
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise "Page not found: "+args.page
-	
-	revs = wd.get_revisions(page_id, 1) # last revision
-	print wd.get_revision_source(revs[0]['id'])
-
-elif args.content:
-	if not args.page:
-		raise "Please specify --page for --source."
-	
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise "Page not found: "+args.page
-	
-	revs = wd.get_revisions(page_id, 1) # last revision
-	print wd.get_revision_version(revs[0]['id'])
-
-elif args.log_raw:
-	if not args.page:
-		raise "Please specify --page for --log."
-
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise "Page not found: "+args.page
-
-	print wd.get_revisions_raw(page_id, args.depth)
-
-
-elif args.log:
-	if not args.page:
-		raise "Please specify --page for --log."
-
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise "Page not found: "+args.page
-	for rev in wd.get_revisions(page_id, args.depth):
-		print unicode(rev)
-
-
-elif args.dump:
-	print "Downloading pages to "+args.dump
-	force_dirs(args.dump)
-	
-	rm = RepoMaintainer(wd, args.dump)
-	rm.debug = args.debug
-	rm.storeRevIds = args.revids
-	rm.buildRevisionList([args.page] if args.page else None, args.depth)
-	rm.openRepo()
-	
-	print "Downloading revisions..."
-	while rm.commitNext():
-		pass
-	
-	rm.cleanup()
-	print "Done."
+import argparse
+import sys
+import locale
+import codecs
+import os
+from wikidot import Wikidot
+from rmaint import RepoMaintainer
+
+# TODO: Files.
+# TODO: Forum and comment pages.
+# TODO: Ability to download new transactions since last dump.
+#   We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
+
+parser = argparse.ArgumentParser(description='Queries Wikidot')  # Command-line interface of the crawler
+parser.add_argument('site', help='URL of Wikidot site')
+# Actions
+parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
+parser.add_argument('--max-page-count', type=int, default='10000', help='Only list/fetch up to this amount of pages')  # stored as args.max_page_count
+parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
+parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
+parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
+parser.add_argument('--dump', type=str, help='Download page revisions to this directory')
+# Debug actions
+parser.add_argument('--list-pages-raw', action='store_true')
+parser.add_argument('--log-raw', action='store_true')
+# Action settings
+parser.add_argument('--page', type=str, help='Query only this page')
+parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')  # string defaults are converted by type=int
+parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True)  # NOTE(review): store_true + default=True is always True; the flag cannot turn it off
+parser.add_argument('--skip', type=str, help='Skip the specified revision')  # split on "," in the --dump branch
+parser.add_argument('--skip-pages', type=str, help='Skip the specified pages')  # split on "," in the --dump branch
+parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo')
+# Common settings
+parser.add_argument('--debug', action='store_true', help='Print debug info')
+parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
+args = parser.parse_args()
+
+# Wikidot client shared by every action below, configured from the parsed options
+wd = Wikidot(args.site)
+wd.debug = args.debug
+wd.delay = args.delay
+
+
+def force_dirs(path):  # Create the directory `path`, including missing parents
+    os.makedirs(path, exist_ok=True)  # exist_ok: an already existing directory is not an error
+
+if args.list_pages_raw:  # Dispatch: only the first action flag that is set runs
+    print(wd.list_pages_raw(limit=args.max_page_count))  # argparse dest of --max-page-count
+
+elif args.list_pages:
+    for page in wd.list_pages(limit=args.max_page_count):
+        print(page)
+
+elif args.source:
+    if not args.page:
+        raise Exception("Please specify --page for --source.")
+
+    page_id = wd.get_page_id(page_unix_name=args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+
+    revs = wd.get_revisions(page_id, 1) # last revision
+    print(wd.get_revision_source(revs[0]['id']))
+
+elif args.content:
+    if not args.page:
+        raise Exception("Please specify --page for --content.")  # message used to name --source
+
+    page_id = wd.get_page_id(page_unix_name=args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+
+    revs = wd.get_revisions(page_id, 1) # last revision
+    print(wd.get_revision_version(revs[0]['id']))
+
+elif args.log_raw:
+    if not args.page:
+        raise Exception("Please specify --page for --log.")
+
+    page_id = wd.get_page_id(page_unix_name=args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+
+    print(wd.get_revisions_raw(page_id, args.depth))
+
+
+elif args.log:
+    if not args.page:
+        raise Exception("Please specify --page for --log.")
+
+    page_id = wd.get_page_id(page_unix_name=args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+    for rev in wd.get_revisions(page_id, args.depth):
+        print(str(rev))
+
+
+elif args.dump:
+    print("Downloading pages to "+args.dump)
+    force_dirs(args.dump)
+
+    rm = RepoMaintainer(wd, args.dump)
+    rm.debug = args.debug
+    rm.storeRevIds = args.revids
+    rm.max_depth = args.depth
+    rm.max_page_count = args.max_page_count
+    rm.buildRevisionList([args.page] if args.page else None)
+    rm.openRepo()
+    # Skip lists arrive as comma-separated strings on the command line
+    if args.skip_pages:
+        rm.pages_to_skip = args.skip_pages.split(",")
+    if args.skip:
+        rm.revs_to_skip = args.skip.split(",")
+
+    print("Downloading revisions")
+    rm.fetchAll()
+    # Temporary files are only removed when --cleanup was requested
+    if args.cleanup:
+        rm.cleanup()
+
+    print("Done.")
diff --git a/hgpatch.py b/hgpatch.py
deleted file mode 100644
index 6d2ff12..0000000
--- a/hgpatch.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from mercurial import scmutil, osutil
-from types import MethodType
-from mercurial import encoding
-import codecs
-
-# Patches commit-message unicode handling on Python 2.x
-
-# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert
-# all input from "input encoding" (set in mercurial/encoding.py)
-
-# Problem 1:
-#   If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8,
-#   it'll still try to decode it to ASCII.
-# Solution:
-#   Patch this decoding function to pass unicode unchanged.
-
-old_fromlocal = None
-
-def better_fromlocal(s):
-	if isinstance(s, unicode):
-		return s.encode('utf-8')
-	global old_fromlocal
-	return old_fromlocal(s)
-
-old_fromlocal = encoding.fromlocal
-encoding.fromlocal = better_fromlocal
-
-
-# Problem 2:
-#   Separate from actual log, Mercurial stores commit message in commit-message.txt.
-#   Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails.
-# Solution:
-#   Patch virtual-fs open() function to use codecs.open wrapper in this particular case.
-
-old_vfs_call = None
-
-def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False):
-	fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
-	if path.endswith('last-message.txt'):
-		# Create a wrapper like codecs.open does:
-		info = codecs.lookup("utf-8")
-		fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
-		fp.encoding = 'utf-8'
-	return fp
-
-old_vfs_call = scmutil.vfs.__call__
-scmutil.vfs.__call__ = better_vfs_call
-
-
-
diff --git a/readme.md b/readme.md
index f66a0cc..641a570 100644
--- a/readme.md
+++ b/readme.md
@@ -1,30 +1,50 @@
-This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you:
-
-* List all pages on a site
-* See all revisions of a page
-* Query page source
-
-Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments!
-
-##### Examples:
-
-    crawl.py http://example.wikidot.com --dump ExampleRepo
-    crawl.py http://example.wikidot.com --log --page example-page
-
-It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers.
-
-Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed.
-
-##### Useful links:
-
-Wikidot code (very old) which simplifies things a bit:
-
-* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
-The descriptions for on-site modules are heavily correlated with AJAX ones:
-
-* http://www.wikidot.com/doc-modules:listpages-module
-
-Someone else did Wikidot AJAX:
-
-* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
+*This is a fork to make a permanent backup of the SCP wiki.*
+
+This is a Python command line client for relatively popular wiki hosting
+http://www.wikidot.com which lets you:
+
+* List all pages on a site
+* See all revisions of a page
+* Query page source
+
+Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, author and comments!
+
+##### Dependencies
+
+At least:
+
+* Python 3
+* python-beautifulsoup4
+* python-gitpython
+* python-requests
+* python-tqdm
+
+##### Examples:
+
+    crawl.py http://example.wikidot.com --dump ExampleRepo
+    crawl.py http://example.wikidot.com --log --page example-page
+
+It uses internal Wikidot AJAX requests to do its job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers.
+
+Downloading of large sites might take a while. If anything breaks, just restart the same command; it will continue from where it crashed.
+
+##### Useful links:
+
+Wikidot code (very old) which simplifies things a bit:
+
+* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+The descriptions for on-site modules are heavily correlated with AJAX ones:
+
+* http://www.wikidot.com/doc-modules:listpages-module
+
+Someone else did Wikidot AJAX:
+
+* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
+
+
+#### TODO
+
+ - Handle deleted images. Probably need to check the diff and check all pages for references if removed from one page.
+ - Handle tags (both added and removed).
+
diff --git a/rmaint.py b/rmaint.py
index 029319f..1ab383f 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -1,263 +1,535 @@
-import os
-import codecs
-from mercurial import commands, ui, hg
-import hgpatch
-import cPickle as pickle
-import wikidot
-
-# Repository builder and maintainer
-# Contains logic for actual loading and maintaining the repository over the course of its construction.
-
-# Usage:
-#   rm = RepoMaintainer(wikidot, path)
-#   rm.buildRevisionList(pages, depth)
-#   rm.openRepo()
-#   while rm.commitNext():
-#		pass
-#   rm.cleanup()
-
-# Talkative.
-
-class RepoMaintainer:
-	def __init__(self, wikidot, path):
-		# Settings
-		self.wd = wikidot			# Wikidot instance
-		self.path = path			# Path to repository
-		self.debug = False			# = True to enable more printing
-		self.storeRevIds = True		# = True to store .revid with each commit
-		
-		# Internal state
-		self.wrevs = None			# Compiled wikidot revision list (history)
-		
-		self.rev_no	= 0				# Next revision to process
-		self.last_names = {}		# Tracks page renames: name atm -> last name in repo
-		self.last_parents = {}		# Tracks page parent names: name atm -> last parent in repo
-		
-		self.ui = None				# Mercurial UI object
-		self.repo = None			# Mercurial repo object
-
-
-	#
-	# Saves and loads revision list from file
-	#
-	def saveWRevs(self):
-		fp = open(self.path+'\\.wrevs', 'wb')
-		pickle.dump(self.wrevs, fp)
-		fp.close()
-	
-	def loadWRevs(self):
-		fp = open(self.path+'\\.wrevs', 'rb')
-		self.wrevs = pickle.load(fp)
-		fp.close()
-
-	#
-	# Compiles a combined revision list for a given set of pages, or all pages on the site.
-	#  pages: compile history for these pages
-	#  depth: download at most this number of revisions.
-	#
-	# If there exists a cached revision list at the repository destination,
-	# it is loaded and no requests are made.
-	#
-	def buildRevisionList(self, pages = None, depth = 10000):
-		if os.path.isfile(self.path+'\\.wrevs'):
-			print "Loading cached revision list..."
-			self.loadWRevs()
-		else:
-			print "Building revision list..."
-			if not pages:
-				pages = self.wd.list_pages(10000)
-			self.wrevs = []
-			for page in pages:
-				print "Querying page: "+page
-				page_id = self.wd.get_page_id(page)
-				print "ID: "+str(page_id)
-				revs = self.wd.get_revisions(page_id, depth)
-				print "Revisions: "+str(len(revs))
-				for rev in revs:
-					self.wrevs.append({
-					  'page_id' : page_id,
-					  'page_name' : page, # name atm, not at revision time
-					  'rev_id' : rev['id'],
-					  'date' : rev['date'],
-					  'user' : rev['user'],
-					  'comment' : rev['comment'],
-					})
-			self.saveWRevs() # Save a cached copy
-			print ""
-		
-		
-		print "Total revisions: "+str(len(self.wrevs))
-		
-		print "Sorting revisions..."
-		self.wrevs.sort(key=lambda rev: rev['date'])
-		print ""
-		
-		if self.debug:
-			print "Revision list: "
-			for rev in self.wrevs:
-				print str(rev)+"\n"
-			print ""
-
-
-	#
-	# Saves and loads operational state from file
-	#
-	def saveState(self):
-		fp = open(self.path+'\\.wstate', 'wb')
-		pickle.dump(self.rev_no, fp)
-		pickle.dump(self.last_names, fp)
-		pickle.dump(self.last_parents, fp)
-		fp.close()
-	
-	def loadState(self):
-		fp = open(self.path+'\\.wstate', 'rb')
-		self.rev_no = pickle.load(fp)
-		self.last_names = pickle.load(fp)
-		try:
-			self.last_parents = pickle.load(fp)
-		except EOFError:
-			pass
-		fp.close()
-
-
-	#
-	# Initializes the construction process, after the revision list has been compiled.
-	# Either creates a new repo, or loads the existing one at the target path
-	# and restores its construction state.
-	#
-	def openRepo(self):
-		# Create a new repository or continue from aborted dump
-		self.ui=ui.ui()
-		self.last_names = {} # Tracks page renames: name atm -> last name in repo
-		self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
-		
-		if os.path.isfile(self.path+'\\.wstate'):
-			print "Continuing from aborted dump state..."
-			self.loadState()
-			self.repo = hg.repository(self.ui, self.path)
-		
-		else: # create a new repository (will fail if one exists)
-			print "Initializing repository..."
-			commands.init(self.ui, self.path)
-			self.repo = hg.repository(self.ui, self.path)
-			self.rev_no = 0
-			
-			if self.storeRevIds:
-				# Add revision id file to the new repo
-				fname = self.path+'\\.revid'
-				codecs.open(fname, "w", "UTF-8").close()
-				commands.add(self.ui, self.repo, str(fname))
-	
-	
-	#
-	# Takes an unprocessed revision from a revision log, fetches its data and commits it.
-	# Returns false if no unprocessed revisions remain.
-	#
-	def commitNext(self):
-		if self.rev_no >= len(self.wrevs):
-			return False
-			
-		rev = self.wrevs[self.rev_no]
-		source = self.wd.get_revision_source(rev['rev_id'])
-		# Page title and unix_name changes are only available through another request:
-		details = self.wd.get_revision_version(rev['rev_id'])
-		
-		# Store revision_id for last commit
-		# Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
-		if self.storeRevIds:
-			fname = self.path+'\\.revid'
-			outp = codecs.open(fname, "w", "UTF-8")
-			outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
-			outp.close()
-		
-		unixname = rev['page_name']
-		rev_unixname = details['unixname'] # may be different in revision than atm
-		
-		# Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
-		# The only way to know they were changed is revision comments, though evil people may trick us.
-		if rev['comment'].startswith('Parent page set to: "'):
-			# This is a parenting revision, remember the new parent
-			parent_unixname = rev['comment'][21:-2]
-			self.last_parents[unixname] = parent_unixname
-		else:
-			# Else use last parent_unixname we've recorded
-			parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
-		# There are also problems when parent page gets renamed -- see updateChildren
-		
-		# If the page is tracked and its name just changed, tell HG
-		rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname)
-		if rename:
-			self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
-			commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
-		
-		# Ouput contents
-		fname = self.path+'\\'+rev_unixname+'.txt'
-		outp = codecs.open(fname, "w", "UTF-8")
-		if details['title']:
-			outp.write('title:'+details['title']+'\n')
-		if parent_unixname:
-			outp.write('parent:'+parent_unixname+'\n')
-		outp.write(source)
-		outp.close()
-		
-		# Add new page
-		if not unixname in self.last_names: # never before seen
-			commands.add(self.ui, self.repo, str(fname))
-
-		self.last_names[unixname] = rev_unixname
-
-		# Commit
-		if rev['comment'] <> '':
-			commit_msg = rev_unixname + ': ' + rev['comment']
-		else:
-			commit_msg = rev_unixname
-		if rev['date']:
-			commit_date = str(rev['date']) + ' 0'
-		else:
-			commit_date = None
-		print "Commiting: "+str(self.rev_no)+'. '+commit_msg
-
-		commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
-		self.rev_no += 1
-
-		self.saveState() # Update operation state
-		return True
-
-
-	#
-	# Updates all children of the page to reflect parent's unixname change.
-	#
-	# Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
-	# A parent may then be renamed.
-	# Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
-	#
-	# Therefore, on every rename we must update all linked children in the same revision.
-	#
-	def updateChildren(self, oldunixname, newunixname):
-		for child in self.last_parents.keys():
-			if self.last_parents[child] == oldunixname:
-				self.updateParentField(child, self.last_parents[child], newunixname)
-	
-	#
-	# Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
-	# The rest of the file is preserved.
-	#
-	def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-		with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
-			content = f.readlines()
-		# Since this is all tracked by us, we KNOW there's a line in standard format somewhere
-		idx = content.index('parent:'+parent_oldunixname+'\n')
-		if idx < 0:
-			raise Exception("Cannot update child page "+child_unixname+": "
-				+"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
-		content[idx] = 'parent:'+parent_newunixname+'\n'
-		with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
-			f.writelines(content)
-
-
-	#
-	# Finalizes the construction process and deletes any temporary files.
-	#
-	def cleanup(self):
-		os.remove(self.path+'\\.wstate')
-		os.remove(self.path+'\\.wrevs')
\ No newline at end of file
+import wikidot
+
+# Basic python stuff
+import os
+import codecs
+import pickle as pickle
+import json
+
+# git stuff
+from git import Repo, Actor
+import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps
+import re # For sanitizing usernames to fake email addresses
+
+from tqdm import tqdm # for progress bar
+
+# Repository builder and maintainer
+# Contains logic for actual loading and maintaining the repository over the course of its construction.
+
+# Usage:
+#   rm = RepoMaintainer(wikidot, path)
+#   rm.buildRevisionList(pages)
+#   rm.openRepo()
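+#   rm.fetchAll()   # used by crawl.py; the old `while rm.commitNext(): pass` loop
+#                   # no longer works because commitNext() now takes the revision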
+#   rm.cleanup()
+
+# Talkative.
+
+class RepoMaintainer:
+    def __init__(self, wikidot, path):  # Stores settings and resets bookkeeping; no file or network access here
+        # Settings
+        self.wd = wikidot           # Wikidot instance
+        self.path = path            # Path to repository
+        self.debug = False          # = True to enable more printing
+        self.storeRevIds = True     # = True to store .revid with each commit
+
+        # Internal state
+        self.wrevs = None           # Compiled wikidot revision list (history)
+
+        self.rev_no = 0             # Next revision to process
+        self.last_names = {}        # Tracks page renames: name atm -> last name in repo
+        self.last_parents = {}      # Tracks page parent names: name atm -> last parent in repo
+
+        self.repo = None            # Git repo object
+        self.index = None           # Git current index object
+        self.max_depth = 10000      # download at most this number of revisions
+        self.max_page_count = 10000 # download at most this number of pages
+
+        self.pbar = None            # presumably the tqdm progress bar -- TODO confirm
+        self.first_fetched = 0      # For progress bar
+        self.fetched_revids = set() # Revision ids already fetched (loaded from .fetched.txt)
+
+        self.revs_to_skip = []      # Revision ids that commitNext skips
+        self.pages_to_skip = []     # Page names whose revisions commitNext skips
+
+
+    #
+    # Saves and loads revision list from file
+    #
+    def saveWRevs(self):
+        """Pickle self.wrevs to '<path>/.wrevs'; the file is closed even if dumping fails."""
+        with open(self.path+'/.wrevs', 'wb') as fp:
+            pickle.dump(self.wrevs, fp)
+
+    def loadWRevs(self):
+        """Load self.wrevs from '<path>/.wrevs' (pickle: only read files this tool wrote)."""
+        with open(self.path+'/.wrevs', 'rb') as fp:
+            self.wrevs = pickle.load(fp)
+
+    def savePages(self, pages):
+        """Pickle the given page list to '<path>/.pages'; the file is closed even on error."""
+        with open(self.path+'/.pages', 'wb') as fp:
+            pickle.dump(pages, fp)
+
+    def appendFetchedRevid(self, revid):
+        """Append one revision id (a string) as a new line of '<path>/.fetched.txt'."""
+        with open(self.path+'/.fetched.txt', 'a') as fp:
+            fp.write(revid + '\n')
+
+    def loadFetchedRevids(self):
+        with open(self.path+'/.fetched.txt', 'r') as fp:  # one revision id per line; closed after reading
+            self.fetched_revids = {line.rstrip() for line in fp}
+    def saveFailedImages(self):
+        """Write every entry of self.wd.failed_images to '<path>/.failed-images.txt', one per line."""
+        file_path = self.path + '/.failed-images.txt'
+        with open(file_path, 'w') as fp:
+            for failed in self.wd.failed_images:
+                fp.write(failed + '\n')
+
+    def loadFailedImages(self):
+        file_path = self.path + '/.failed-images.txt'
+        if os.path.isfile(file_path):  # a missing file leaves self.wd.failed_images untouched
+            with open(file_path, 'r') as fp:
+                self.wd.failed_images = {line.rstrip() for line in fp}
+
+    # Persistent metadata about the repo:
+    #  - Tracks page renames: name atm -> last name in repo
+    #  - Tracks page parent names: name atm -> last parent in repo
+    def saveMetadata(self):
+        """Write last_names and last_parents as JSON to '<path>/.metadata.json'."""
+        metadata = {'names': self.last_names, 'parents': self.last_parents }
+        with open(self.path+'/.metadata.json', 'w') as fp:
+            json.dump(metadata, fp)
+
+    def loadMetadata(self):
+        """Restore last_names/last_parents from '.metadata.json', then reload the fetched ids."""
+        with open(self.path+'/.metadata.json', 'r') as fp:
+            metadata = json.load(fp)
+        self.last_names = metadata['names']
+        self.last_parents = metadata['parents']
+        if os.path.isfile(self.path+'/.fetched.txt'):  # a missing id log is not an error
+            self.loadFetchedRevids()
+    #
+    # Compiles a combined revision list for a given set of pages, or all pages on the site.
+    #  pages: compile history for these pages
+    #
+    # If there exists a cached revision list at the repository destination,
+    # it is loaded and no requests are made.
+    #
+    def buildRevisionList(self, pages = None):
+        """Compile self.wrevs, the combined revision list, sorted by date.
+
+        pages: page names to query. When empty or None, the list comes from the
+               '.pages' cache or from self.wd.list_pages(). Entries cached in '.wrevs'
+               are reused; revision ids listed in '.fetched.txt' are not added again.
+        """
+        if os.path.isfile(self.path+'/.wrevs'):
+            print("Loading cached revision list...")
+            self.loadWRevs()
+        else:
+            self.wrevs = []
+            if self.debug:
+                print('No existing wrevs')
+
+        if os.path.isfile(self.path+'/.fetched.txt'):
+            self.loadFetchedRevids()
+            print(len(self.fetched_revids), 'revisions already fetched')
+        else:
+            self.fetched_revids = set()
+
+        if self.debug:
+            print("Building revision list...")
+
+        if not pages:
+            if os.path.isfile(self.path+'/.pages'):
+                print('Loading fetched pages')
+                # 'with' closes the cache file even if unpickling fails
+                with open(self.path+'/.pages', 'rb') as fp:
+                    pages = pickle.load(fp)
+
+            # NOTE(review): a cached list shorter than max_page_count is re-fetched too
+            if not pages or len(pages) < self.max_page_count:
+                if self.debug:
+                    print('Need to fetch pages')
+                pages = self.wd.list_pages(self.max_page_count)
+                self.savePages(pages)
+            elif self.debug:
+                print(len(pages), 'pages loaded')
+
+        # Pages that already have entries in the cached list are not queried again
+        fetched_pages = set()
+        for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'):
+            fetched_pages.add(wrev['page_name'])
+
+        if self.debug:
+            print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages)))
+
+        # Ask Wikidot for the revision log of every remaining page
+        for page in tqdm(pages, desc='Updating list of revisions to fetch'):
+            if page in fetched_pages:
+                continue
+
+            # TODO: more generic blacklisting
+            if page == "sandbox":
+                if self.debug:
+                    print("Skipping", page)
+                continue
+
+            page_id = self.wd.get_page_id(page)
+
+            if self.debug:
+                print("ID: "+str(page_id))
+
+            if page_id is None:
+                print('Page gone?', page)
+                continue
+
+            revs = self.wd.get_revisions(page_id=page_id, limit=self.max_depth)
+            for rev in revs:
+                if rev['id'] in self.fetched_revids:
+                    continue
+
+                self.wrevs.append({
+                  'page_id' : page_id,
+                  'page_name' : page, # current name, not at revision time (revisions can rename them)
+                  'rev_id' : rev['id'],
+                  'date' : rev['date'],
+                  'user' : rev['user'],
+                  'comment' : rev['comment'],
+                })
+            self.saveWRevs() # Save a cached copy after each page
+
+        print("Number of revisions already fetched", len(self.fetched_revids), len(self.wrevs))
+
+        if os.path.isfile(self.path+'/.metadata.json'):
+            self.loadMetadata()
+
+        print("")
+
+        print("Total revisions: "+str(len(self.wrevs)))
+
+        if self.debug:
+            print("Sorting revisions...")
+
+        self.wrevs.sort(key=lambda rev: rev['date'])
+
+        if self.debug:
+            if len(self.wrevs) < 100:
+                print("")
+                print("Revision list: ")
+                for rev in self.wrevs:
+                    print(str(rev)+"\n")
+                print("")
+            else:
+                print("Too many revisions, not printing everything")
+
+
+    #
+    # Saves and loads operational state from file
+    #
+    def saveState(self):
+        """Pickle self.rev_no to '<path>/.wstate'; the file is closed even if dumping fails."""
+        with open(self.path+'/.wstate', 'wb') as fp:
+            pickle.dump(self.rev_no, fp)
+    
+    def loadState(self):
+        """Restore self.rev_no from '<path>/.wstate'; do nothing when that file is absent."""
+        if not os.path.isfile(self.path+'/.wstate'):
+            return
+        with open(self.path+'/.wstate', 'rb') as fp:
+            self.rev_no = pickle.load(fp)
+
+
+    #
+    # Initializes the construction process, after the revision list has been compiled.
+    # Either creates a new repo, or loads the existing one at the target path
+    # and restores its construction state.
+    #
+    def openRepo(self):
+        """Open the Git repo at self.path, or create it (plus an initial commit if storeRevIds)."""
+        # NOTE(review): the two resets below discard anything loadMetadata() loaded earlier
+        self.last_names = {} # Tracks page renames: name atm -> last name in repo
+        self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
+        self.loadFailedImages()
+        if os.path.isdir(self.path+'/.git'):
+            print("Continuing from aborted dump state...")
+            self.loadState()
+            self.repo = Repo(self.path)
+            assert not self.repo.bare
+        else: # no .git directory yet: create a new repository
+            print("Initializing repository...")
+            self.repo = Repo.init(self.path)
+            self.rev_no = 0
+
+            if self.storeRevIds:
+                # Commit an empty revision id file; self.index is still None here, so use a local handle
+                fname = '.revid'
+                codecs.open(self.path + '/' + fname, "w", "UTF-8").close()
+                index = self.repo.index
+                index.add([fname])
+                index.commit("Initial creation of repo")
+        self.index = self.repo.index
+
+    #
+    # Takes an unprocessed revision from a revision log, fetches its data and commits it.
+    # Returns false if no unprocessed revisions remain.
+    #
+    def commitNext(self, rev):  # rev: revision dict; keys read here: rev_id, page_name, comment, date, user
+        if self.rev_no >= len(self.wrevs):
+            return False
+
+        if rev['rev_id'] in self.fetched_revids:  # already committed earlier: only advance the counter
+            self.rev_no += 1
+
+            self.saveState() # Update operation state
+            return True
+
+        if rev['rev_id'] in self.revs_to_skip:
+            print("Skipping", rev)
+            return True  # note: rev_no is not advanced on this path
+
+        unixname = rev['page_name']
+        if unixname in self.pages_to_skip:
+            print("Skipping", rev)
+            return True  # note: rev_no is not advanced on this path
+
+        source = self.wd.get_revision_source(rev['rev_id'])
+        # Page title and unix_name changes are only available through another request:
+        details = self.wd.get_revision_version(rev['rev_id'])  # keys read below: unixname, title, images
+
+        # Store revision_id for last commit
+        # Without this, empty commits (e.g. file uploads) will be skipped by Git
+        if self.storeRevIds:
+            fname = self.path+'/.revid'
+            outp = codecs.open(fname, "w", "UTF-8")
+            outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
+            outp.close()
+
+        rev_unixname = details['unixname'] # may be different in revision than atm
+
+        # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
+        # The only way to know they were changed is revision comments, though evil people may trick us.
+        if rev['comment'].startswith('Parent page set to: "'):
+            # This is a parenting revision, remember the new parent
+            parent_unixname = rev['comment'][21:-2]  # 21 == len of the prefix above; the last 2 characters are dropped too
+            if self.debug:
+                print('Parent changed', parent_unixname)
+            self.last_parents[unixname] = parent_unixname
+        else:
+            # Else use last parent_unixname we've recorded
+            parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
+
+        ## TODO: test
+        #if rev['comment'].startswith('Removed tags: ') or rev['comment'].startswith('Added tags: '):
+        #    self.updateTags(rev['comment'], rev_unixname)
+
+        # There are also problems when parent page gets renamed -- see updateChildren
+
+        # If the page is tracked and its name just changed, tell Git
+        fname = str(rev_unixname) + '.txt'  # rebinds fname: from here on it is the page file name relative to self.path
+        rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
+
+        commit_msg = ""
+
+        added_file_paths = []  # paths (relative to the repo) passed to index.add below
+
+        if rename:
+            name_rename_from = str(self.last_names[unixname])+'.txt'
+
+            if self.debug:
+                print("Moving renamed", name_rename_from, "to", fname)
+
+            self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
+
+            # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
+            if os.path.isfile(self.path + '/' + name_rename_from):
+                self.index.move([name_rename_from, fname], force=True)
+                commit_msg += "Renamed from " + str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' '
+            else:
+                print("Source file does not exist, probably deleted or renamed from already?", name_rename_from)
+
+        # Add new page
+        elif not os.path.isfile(self.path + '/' + fname): # never before seen
+            commit_msg += "Created "
+            if self.debug:
+                print("Adding", fname)
+        elif rev['comment'] == '':  # existing page, no comment: label the commit as a plain update
+            commit_msg += "Updated "
+
+        self.last_names[unixname] = rev_unixname
+
+        # Output contents
+        outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
+        if details['title']:
+            outp.write('title:' + details['title']+'\n')
+        if parent_unixname:
+            outp.write('parent:'+parent_unixname+'\n')  # same line format that updateParentField searches for
+        outp.write(source)
+        outp.close()
+
+        added_file_paths.append(str(fname))
+
+        commit_msg += rev_unixname
+
+        # Commit
+        if rev['comment'] != '':
+            commit_msg += ': ' + rev['comment']
+        else:
+            commit_msg += ' (no message)'
+        if rev['date']:
+            parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT
+            commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time)
+        else:
+            commit_date = None  # no date recorded for this revision
+
+        got_images = False;
+
+        # Add some spacing in the commit message
+        if len(details['images']) > 0:
+            commit_msg += '\n'
+
+        for image in details['images']:  # each image is a dict with 'src' (URL) and 'filepath' (relative to self.path)
+            if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']):
+                commit_msg += '\nAdded image: ' + image['src']
+                got_images = True
+                # If we do this gitpython barfs on itself
+                #added_file_paths.append(image['filepath'])
+            else:
+                self.saveFailedImages()  # runs on any falsy result from maybe_download_file
+
+
+        if got_images:
+            added_file_paths.append("images")  # adds the "images" path as a whole instead of individual files
+        print("Committing: " + str(self.rev_no) + '. '+commit_msg)
+
+        # Include metadata in the commit (if changed)
+        self.appendFetchedRevid(rev['rev_id'])
+        self.saveMetadata()
+        added_file_paths.append('.metadata.json')
+        self.index.add(added_file_paths)
+
+        username = str(rev['user'])
+        email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename  # synthetic address: sanitized user name @ site host
+        author = Actor(username, email)
+
+        commit = self.index.commit(commit_msg, author=author, author_date=commit_date)
+
+        if self.debug:
+            print('Committed', commit.name_rev, 'by', author)
+
+        self.fetched_revids.add(rev['rev_id'])
+
+        self.rev_no += 1
+        self.saveState() # Update operation state
+
+        return True
+
+    def fetchAll(self):
+        # Collect the revisions whose rev_id is not in fetched_revids, then commit each in list order.
+        pending = [
+            rev for rev in tqdm(self.wrevs, desc='Creating list of revisions to fetch')
+            if rev['rev_id'] not in self.fetched_revids]
+        for rev in tqdm(pending, desc='Downloading'):
+            self.commitNext(rev)
+
+    #
+    # Updates all children of the page to reflect parent's unixname change.
+    #
+    # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
+    # A parent may then be renamed.
+    # Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
+    #
+    # Therefore, on every rename we must update all linked children in the same revision.
+    #
+    def updateChildren(self, oldunixname, newunixname):
+        if self.debug:
+            print('Updating parents for', oldunixname, newunixname)
+        for child in list(self.last_parents.keys()):  # iterate over a snapshot of the keys
+            if self.last_parents[child] == oldunixname and oldunixname != newunixname:
+                self.updateParentField(child, oldunixname, newunixname)  # rewrites "parent:" in the child's file
+                self.last_parents[child] = newunixname  # otherwise the child's next revision would write the stale name back
+
+    def updateTags(self, comment, unixname):
+        # Rewrites the "tags:" line of <unixname>.txt according to a tag-change revision comment.
+        # comment is matched against "Removed tags: a, b." and "Added tags: c, d." (either part may be absent).
+        file_name = self.path+'/'+unixname+'.txt'
+        removed = []
+        removed_match = re.search(pattern = r'Removed tags: ([^.]+,?)\.', string = comment)
+        if removed_match is not None:
+            removed = removed_match.group(1).split(', ')
+
+        with codecs.open(file_name, "r", "UTF-8") as f:
+            content = f.readlines()
+
+        # Find the existing tags line, if any
+        idx = -1
+        for pos, line in enumerate(content):
+            if line.startswith('tags:'):
+                idx = pos
+                break
+
+        # Keep current tags that were not removed; the line is stored as "tags:a,b,c\n"
+        tags = []
+        if idx != -1:
+            for tag in content[idx][len('tags:'):].strip().split(','):
+                if tag and tag not in removed:
+                    tags.append(tag)
+
+        added_match = re.search(pattern = r'Added tags: ([^.]+,?)\.', string = comment)
+        if added_match is not None:
+            tags += [tag for tag in added_match.group(1).split(', ') if tag not in tags]
+
+        tags.sort()
+
+        newtagsline = 'tags:' + ','.join(tags) + '\n'
+        if idx != -1:
+            content[idx] = newtagsline
+        else:
+            content.insert(0, newtagsline) # no tags line yet: put it at the top of the file
+
+        # Write the whole file back with the updated tags line
+        with codecs.open(file_name, "w", "UTF-8") as f:
+            f.writelines(content)
+
+    #
+    # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
+    # The rest of the file is preserved.
+    #
+    def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
+        child_path = self.path+'/'+child_unixname+'.txt'
+        if not os.path.isfile(child_path):
+            print('Failed to find child file!', child_path)
+            return  # best effort: a missing child file is reported, not fatal
+        with codecs.open(child_path, "r", "UTF-8") as f:
+            content = f.readlines()
+        # Since this is all tracked by us, we expect a line in standard format somewhere
+        old_line = 'parent:'+parent_oldunixname+'\n'
+        if old_line not in content: # list.index() raises a bare ValueError on a miss, it never returns -1
+            raise Exception("Cannot update child page "+child_unixname+": "
+                +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.")
+        content[content.index(old_line)] = 'parent:'+parent_newunixname+'\n'  # only the first matching line is replaced
+        with codecs.open(child_path, "w", "UTF-8") as f:
+            f.writelines(content)
+
+
+    #
+    # Finalizes the construction process and deletes any temporary files.
+    #
+    def cleanup(self):
+        state_file = self.path+'/.wstate'
+        if not os.path.exists(state_file):
+            print("wstate does not exist?")
+        else:
+            os.remove(state_file)
+        revs_file = self.path+'/.wrevs'
+        if not os.path.exists(revs_file):
+            print("wrevs does not exist?")
+        else:
+            os.remove(revs_file)
+        pages_file = self.path+'/.pages'  # removed silently when present, no message when absent
+        if os.path.exists(pages_file):
+            os.remove(pages_file)
+        if self.rev_no > 0:  # commit the fetched-revision list only when rev_no is positive
+            self.index.add(['.fetched.txt'])
+            self.index.commit('Updating fetched revisions')
diff --git a/wikidot.py b/wikidot.py
index f01c59f..be378ea 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -1,193 +1,561 @@
-import requests
-import random
-from bs4 import BeautifulSoup
-import time
-
-# Implements various queries to Wikidot engine through its AJAX facilities
-
-
-class Wikidot:
-	def __init__(self, site):
-		self.site = site		# Wikidot site to query
-		self.delay = 200		# Delay between requests in msec
-		self.debug = False		# Print debug messages
-		self.next_timeslot = time.clock()	# Can call immediately
-
-
-	# To honor usage rules, we wait for self.delay between requests.
-	# Low-level query functions call this before every request to Wikidot./
-	def _wait_request_slot(self):
-		tm = time.clock()
-		if self.next_timeslot - tm > 0:
-			time.sleep(self.next_timeslot - tm)
-		self.next_timeslot = tm + self.delay / 1000
-		pass
-
-	# Makes a Wikidot AJAX query. Returns the response+title or throws an error.
-	def queryex(self, params):
-		token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
-		cookies = {"wikidot_token7": token}
-		params['wikidot_token7'] = token
-	
-		if self.debug:
-			print params
-			print cookies
-
-		self._wait_request_slot()
-		req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
-		json = req.json()
-		if json['status'] == 'ok':
-			return json['body'], (json['title'] if 'title' in json else '')
-		else:
-			raise req.text
-
-	# Same but only returns the body, most responses don't have titles
-	def query(self, params):
-		return self.queryex(params)[0]
-
-
-	# List all pages for the site.
-
-	# Raw version
-	# For the supported formats (module_body) see:
-	# See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
-	def list_pages_raw(self, limit):
-		res = self.query({
-		  'moduleName': 'list/ListPagesModule',
-		  'limit': limit if limit else '10000',
-		  'perPage': limit if limit else '10000',
-		  'module_body': '%%page_unix_name%%',
-		  'separate': 'false',
-		  'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
-		})
-		return res
-
-	# Client version
-	def list_pages(self, limit):
-		raw = self.list_pages_raw(limit).replace('<br/>',"\n")
-		soup = BeautifulSoup(raw, 'html.parser')
-		pages = []
-		for entry in soup.div.p.text.split('\n'):
-			pages.append(entry)
-		return pages
-
-
-	# Retrieves internal page_id by page unix_name.
-	# Page IDs are required for most of page functions.
-
-	def get_page_id(self, page_unix_name):
-		# The only freaking way to get page ID is to load the page! Wikidot!
-		self._wait_request_slot()
-		req = requests.request('GET', self.site+'/'+page_unix_name)
-		soup = BeautifulSoup(req.text, 'html.parser')
-		for item in soup.head.find_all('script'):
-			text = item.text
-			pos = text.find("WIKIREQUEST.info.pageId = ")
-			if pos >= 0:
-				pos += len("WIKIREQUEST.info.pageId = ")
-				crlf = text.find(";", pos)
-				if crlf >= 0:
-					return int(text[pos:crlf])
-				else:
-					return int(text[pos:])
-		return None
-
-
-	# Retrieves a list of revisions for a page.
-	# See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
-	# Raw version
-	def get_revisions_raw(self, page_id, limit):
-		res = self.query({
-		  'moduleName': 'history/PageRevisionListModule',
-		  'page_id': page_id,
-		  'page': '1',
-		  'perpage': limit if limit else '10000',
-		  'options': '{"all":true}'
-		})
-		
-		soup = BeautifulSoup(res, 'html.parser')
-		return soup.table.contents
-
-	# Client version
-	def get_revisions(self, page_id, limit):
-		revs = []
-		for tr in self.get_revisions_raw(page_id, limit):
-			if tr.name != 'tr': continue # there's a header + various junk
-
-			# RevID is stored as a value of an INPUT field
-			rev_id = tr.input['value'] if tr.input else None
-			if rev_id is None: continue # can't parse
-
-			# Unixtime is stored as a CSS class time_*
-			rev_date = 0
-			date_span = tr.find("span", attrs={"class": "odate"})
-			if date_span is not None:
-				for cls in date_span['class']:
-					if cls.startswith('time_'):
-						rev_date = int(cls[5:])
-
-			# Username in a last <a> under <span class="printuser">
-			user_span = tr.find("span", attrs={"class": "printuser"})
-			for last_a in user_span.find_all('a'): pass
-			rev_user = last_a.getText() if last_a else None
-			
-
-			# Comment is in the last TD of the row
-			last_td = None
-			for last_td in tr.find_all('td'): pass
-			rev_comment = last_td.getText() if last_td else ""
-
-			revs.append({
-				'id': rev_id,
-				'date': rev_date,
-				'user': rev_user,
-				'comment': rev_comment,
-			})
-		return revs
-
-
-	# Retrieves revision source for a revision.
-	# There's no raw version because there's nothing else in raw.
-	def get_revision_source(self, rev_id):
-		res = self.query({
-		  'moduleName': 'history/PageSourceModule',
-		  'revision_id': rev_id,
-		  # We don't need page id
-		})
-		# The source is HTMLified but BeautifulSoup's getText() will decode that
-		# - htmlentities
-		# - <br/>s in place of linebreaks
-		# - random real linebreaks (have to be ignored)
-		soup = BeautifulSoup(res, 'html.parser')
-		return soup.div.getText().lstrip(' \r\n')
-	
-	# Retrieves the rendered version + additional info unavailable in get_revision_source:
-	# * Title
-	# * Unixname at the time
-	def get_revision_version_raw(self, rev_id):
-		res = self.queryex({
-		  'moduleName': 'history/PageVersionModule',
-		  'revision_id': rev_id,
-		})
-		return res
-	
-	def get_revision_version(self, rev_id):
-		res = self.get_revision_version_raw(rev_id) # this has title!
-		soup = BeautifulSoup(res[0], 'html.parser')
-
-		# First table is a flyout with revision details. Remove and study it.
-		unixname = None
-		details = soup.find("div", attrs={"id": "page-version-info"}).extract()
-		for tr in details.find_all('tr'):
-			tds = tr.find_all('td')
-			if len(tds) < 2: continue
-			if tds[0].getText().strip() == 'Page name:':
-				unixname = tds[1].getText().strip()
-
-		return {
-		  'rev_id': rev_id,
-		  'unixname': unixname,
-		  'title': res[1],
-		  'content': unicode(soup), # only content remains
-		}
\ No newline at end of file
+import requests
+import random
+from bs4 import BeautifulSoup
+import time
+from urllib.parse import urlparse, urljoin
+from pprint import pprint
+import pathlib
+import hashlib
+import os
+import shutil
+import imghdr
+from timeit import default_timer as timer
+
+# Implements various queries to Wikidot engine through its AJAX facilities
+
+
+class Wikidot:
+    def __init__(self, site):
+        # Wikidot site to query, stored without a single trailing "/"
+        self.site = site[:-1] if site[-1] == '/' else site
+        # Lower-cased host name taken from the site URL
+        self.sitename = urlparse(site).hostname.lower()
+
+        self.debug = False      # Print debug messages
+        self.delay = 1000       # Delay between requests in msec
+        self.max_retries = 5    # Attempts per request before giving up
+        self.failed_images = set()  # URLs whose download already failed
+        # Earliest time the next request may be sent; "now" means a call can go out immediately
+        self.next_timeslot = time.process_time()
+
+    # Downloads file if it doesn't exist
+    def maybe_download_file(self, url, file_path):  # returns True only when a new file was downloaded and passed the image check
+        if url in self.failed_images:
+            if self.debug:
+                print(" ! ", url, "already failed, skipping")
+            return False
+
+        if os.path.exists(file_path):
+            if self.debug:
+                print(" - ", file_path, "exists, skipping")
+            return False
+
+        # The request slot is awaited inside the retry loop below, once per attempt
+
+        try:
+            dirpath = os.path.dirname(file_path)
+            os.makedirs(dirpath, exist_ok=True)
+        except OSError as e:
+            if e.errno == 36:  # errno 36 is reported as "Path too long" below
+                print("Path too long", e)
+                return False
+            else:
+                raise  # re-raise previously caught exception
+
+        if self.debug:
+            print(" < downloading", url, "to" ,file_path, "dirpath", dirpath)
+
+        # In case of e. g. 500 errors
+        retries = 0
+        while retries < self.max_retries:
+            self._wait_request_slot()
+
+            headers = requests.utils.default_headers()
+            # Pretty generic user-agent, but we append a unique token for us
+            # Makes wikimedia happy
+            headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
+            start = timer()
+
+            try:
+                req = requests.get(url, headers=headers, stream=True, timeout=30)  # headers were built above but never sent before
+            except requests.exceptions.RequestException:
+                print('request exception')
+
+                retries += 1
+                time.sleep(retries * retries * retries) # up to ~2 minutes
+                continue
+            except requests.packages.urllib3.exceptions.ReadTimeoutError:  # urllib3 itself is not imported in this file
+                print('read timeout')
+
+                retries += 1
+                time.sleep(retries * retries * retries) # up to ~2 minutes
+                continue
+
+            if req.status_code >= 500:
+                print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+                # In case of debug enabled, we already printed this above
+                if not self.debug:
+                    print(' - ', req)
+
+                retries += 1
+                time.sleep(retries * retries * retries)
+                continue
+
+            if req.status_code >= 400:  # client errors are not retried; remember the URL as failed
+                self.failed_images.add(url)
+                return False
+
+            try:
+                # In case of 404 errors or other stuff that indicates
+                # some bug in how we handle or request things
+                req.raise_for_status()
+
+                req.raw.decode_content = True
+                with open(file_path, 'wb') as out_file:
+                    shutil.copyfileobj(req.raw, out_file)
+
+                if imghdr.what(file_path) is None:  # not recognized as an image: discard the file
+                    print('Downloaded invalid image', url)
+                    os.remove(file_path)
+                    self.failed_images.add(url)
+                    return False
+
+
+                if self.debug:
+                    print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2))
+
+                return True
+            except OSError as e:
+                if e.errno == 36:
+                    print("Filename to long", e)
+                    return False
+                else:
+                    raise  # re-raise previously caught exception
+            except Exception as e:
+                print(' ! Failed to download', e, req, url)
+                raise e
+
+        print('Failed too many times for', url)  # all retries used up; the URL is not added to failed_images
+        return False
+
+    # To honor usage rules, we wait for self.delay between requests.
+    # Low-level query functions call this before every request to Wikidot.
+    def _wait_request_slot(self):
+        # Wall-clock time is needed here: process_time() does not advance while sleeping or waiting on I/O.
+        tm = time.monotonic()
+        if self.next_timeslot - tm > 0:
+            time.sleep(self.next_timeslot - tm)
+        # Schedule the next slot from whichever is later: now, or the slot we just waited for.
+        self.next_timeslot = max(tm, self.next_timeslot) + self.delay / 1000
+
+    # Makes a Wikidot AJAX query. Returns the response+title or throws an error.
+    def queryex(self, params, urlAppend = None):  # returns (body, title); title is '' when the response has none
+        token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
+        cookies = {"wikidot_token7": token}
+        params['wikidot_token7'] = token  # same token in cookie and form data; note this mutates the caller's dict
+
+        if self.debug:
+            print(' - ', params)
+            print(' - ', cookies)
+
+        url = self.site+'/ajax-module-connector.php'
+        if urlAppend is not None:
+            url += urlAppend
+
+        # In case of e. g. 500 errors
+        retries = 0
+        while retries < self.max_retries:
+            if retries > 0:
+                print(" ! retry", retries, "of", self.max_retries)
+
+            self._wait_request_slot()
+
+            start = timer()
+            try:
+                req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
+            except requests.exceptions.RequestException:  # any requests failure, not only timeouts
+                print('request timed out!')
+                retries += 1
+                time.sleep(retries * retries * retries)  # cubic back-off, in seconds
+                continue
+
+            if self.debug:
+                print(' * ajax request completed in', round(timer() - start, 2))
+
+            # Usually a 502 error, recovers immediately
+            if req.status_code >= 500:
+                retries += 1
+                print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+
+                # In case of debug enabled, we already printed this above
+                if not self.debug:
+                    print(req, params)
+
+                # Be nice, double wait delay for errors
+                self._wait_request_slot()
+
+                # Extra nice, sleep longer (cubic increase), hope for the
+                # server to recover
+                time.sleep(retries * retries * retries)
+
+                continue
+
+            try:
+                # In case of 404 errors or other stuff that indicates
+                # some bug in how we handle or request things
+                req.raise_for_status()
+            except Exception as e:  # only logged; execution continues to the JSON parse below
+                print(' ! Failed to get response from wikidot', e, req, url, params)
+
+            try:
+                json = req.json()  # local name; no json module is imported in this file's import block
+            except Exception as e:
+                print(' ! Failed to get response from wikidot', e, req, url, params)
+                if retries < self.max_retries:  # always true inside this loop, so the raise below is not reached
+                    retries += 1
+                    #self._wait_request_slot()
+                    time.sleep(retries * retries * retries)
+                    continue
+
+                raise e
+
+            if json['status'] == 'ok':
+                return json['body'], (json['title'] if 'title' in json else '')
+            else:
+                print(" ! error in response", json)
+
+                retries += 1
+                time.sleep(retries * retries * retries)
+                continue
+
+        print(' ! Failed too many times', url, params, cookies)
+        raise Exception('Failed too many times for ' + url)  # reached once retries hits max_retries
+
+    # Same but only returns the body, most responses don't have titles
+    def query(self, params, urlAppend = None):  # same arguments as queryex
+        return self.queryex(params, urlAppend)[0]  # element 0 of queryex's (body, title) result
+
+    # List all pages for the site.
+
+    # Raw version
+    # For the supported formats (module_body) see:
+    # https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
+    def list_pages_raw(self, limit, offset):
+        params = {
+          'moduleName': 'list/ListPagesModule',
+          'limit': limit or '10000',    # a falsy limit falls back to '10000'
+          'perPage': limit or '10000',
+          'module_body': '%%page_unix_name%%',
+          'separate': 'false',
+          'p': str(offset),             # listing page number, also appended to the URL below
+          'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
+        }
+        return self.query(params, '/p/' + str(offset))
+
+    # Client version
+    def list_pages(self, limit):  # returns the list of entries collected from every listing page that was fetched
+        offset = 1  # listing page number, starting at 1
+        pages = []
+
+        while True:
+            raw = self.list_pages_raw(limit, offset).replace('<br/>',"\n")  # one entry per line after the replace
+            soup = BeautifulSoup(raw, 'html.parser')
+
+
+            for entry in soup.div.p.text.split('\n'):  # entries are read from the first <p> of the first <div>
+                pages.append(entry)
+
+            if self.debug:
+                print(' - Pages found:', len(pages))
+
+            targets = soup.find_all('span','target')  # pager links; fewer than two is treated as "no next page"
+            if len(targets) < 2:
+                print(" ! Unable to find next listing page, not enough target spans")
+                break
+
+            next_url = targets[-1].a.get('href').split('/')  # last pager link; its final path segment is the page number
+            if len(next_url) > 0 and next_url[-1].isnumeric():
+                next_page = int(next_url[-1])
+
+                if self.debug:
+                    print(' - Next listing page', next_page)
+
+            else:
+                print(" ! invalid next url", next_url)
+                break
+
+            #next_page = int(targets[0].a.text)
+
+            current_spans = soup.find_all('span','current')
+            if len(current_spans) > 0:
+                current_page = int(current_spans[0].text)  # only used for the debug message below
+
+                if self.debug:
+                    print(' - Current listing page', current_page)
+
+            else:
+                print(" ! unable to find current page")
+                break;
+
+            if next_page != offset + 1:  # the last link does not point at the following page: stop paging
+                if self.debug:
+                    print(' ! Next page is wrong', next_page, 'hopefully at the end')
+                break
+
+            offset += 1
+
+            print(" - Fetching listing page", offset)
+
+        return pages
+
+
+    # Retrieves internal page_id by page unix_name.
+    # Page IDs are required for most of page functions.
+
+    def get_page_id(self, page_unix_name):  # returns the page id as int; raises Exception when it cannot be determined
+        # The only freaking way to get page ID is to load the page! Wikidot!
+        self._wait_request_slot()
+        url = self.site+'/'+page_unix_name + '/noredirect/true'
+
+        if self.debug:
+            print(" > fetching", url)
+
+        start = timer()
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                req = requests.request('GET', url, timeout=30)
+            except requests.exceptions.RequestException:  # any requests failure, not only timeouts
+                print('request timed out!')
+                retries += 1
+                time.sleep(retries * retries * retries)  # cubic back-off, in seconds
+                continue
+
+            if req.status_code >= 500:
+                print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+                retries += 1
+                time.sleep(retries * retries * retries)
+                continue
+
+            req.raise_for_status()  # 4xx responses raise here and are not retried
+            break
+        else:  # loop ended without break: every attempt failed, so there is no usable response to parse
+            raise Exception('Failed too many times for ' + url)
+
+        if self.debug:
+            print(' * page id request completed in', round(timer() - start, 2))
+
+        soup = BeautifulSoup(req.text, 'html.parser')
+        for item in soup.head.find_all('script'):
+            text = item.string
+            if text is None:  # script element without a single text child
+                continue
+
+            pos = text.find("WIKIREQUEST.info.pageId = ")
+            if pos >= 0:
+                pos += len("WIKIREQUEST.info.pageId = ")
+                crlf = text.find(";", pos)  # the id runs up to the next ';', or to the end of the script
+                if crlf >= 0:
+                    return int(text[pos:crlf])
+                else:
+                    return int(text[pos:])
+
+        raise Exception('Failed to get page_id for ' + page_unix_name)
+
+
+    # Retrieves a list of revisions for a page.
+    # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+    # Raw version
+    def get_revisions_raw(self, page_id, limit):
+        # Query the revision list module and return the child nodes of the first <table> in the reply
+        params = {
+          'moduleName': 'history/PageRevisionListModule',
+          'page_id': page_id,
+          'page': '1',
+          'perpage': limit or '10000',
+          'options': '{"all":true}'
+        }
+        html = self.query(params)
+        return BeautifulSoup(html, 'html.parser').table.contents
+
+    # Client version
+    def get_revisions(self, page_id, limit):  # returns a list of dicts: id, date, user, comment, attached_file
+        revs = []
+        raw = self.get_revisions_raw(page_id, limit)
+        for tr in raw:
+            if tr.name != 'tr': continue # there's a header + various junk
+
+            # RevID is stored as a value of an INPUT field
+            rev_id = tr.input['value'] if tr.input else None
+            if rev_id is None: continue # can't parse
+            attachment_action = tr.find("span", attrs={"title": "file/attachment action"})
+            attached_file = False  # becomes True when the row carries the attachment-action marker span
+            if attachment_action is not None:
+                attached_file = True
+                print(" - was attchment", rev_id)
+
+            # Unixtime is stored as a CSS class time_*
+            rev_date = 0  # stays 0 when no time_* class is found
+            date_span = tr.find("span", attrs={"class": "odate"})
+            if date_span is not None:
+                for cls in date_span['class']:
+                    if cls.startswith('time_'):
+                        rev_date = int(cls[5:])  # 5 == len('time_')
+            else:
+                print(" ! no odate found")
+
+            # Username in a last <a> under <span class="printuser">
+            user_span = tr.find("span", attrs={"class": "printuser"})  # NOTE(review): a row without this span would fail on the next lines
+            last_a = None
+            for last_a in user_span.find_all('a'): pass  # empty loop: leaves last_a bound to the final <a>
+            rev_user = last_a.getText() if last_a else None
+
+
+            # Comment is in the last TD of the row
+            last_td = None
+            for last_td in tr.find_all('td'): pass  # same idiom: keep the final <td>
+            rev_comment = last_td.getText() if last_td else ""
+
+            revs.append({
+                'id': rev_id,
+                'date': rev_date,
+                'user': rev_user,
+                'comment': rev_comment,
+                'attached_file': attached_file,
+            })
+        return revs
+
+    # topics in forum: http://www.scp-wiki.net/forum/c-###/sort/start
+    # -> div class 'title'
+    #   -> a href= http://www.scp-wiki.net/forum/t-####/foobar (foobar not important)
+
+    # posts in topic http://www.scp-wiki.net/forum/t-####/
+    # -> div id 'thread-container'
+    #   -> div class 'post-container'
+    #       -> div class = 'post', id = 'post-####'
+    #           -> div class 'title'
+    #           -> div class 'content'
+    #   -> div class 'post-container'
+    #       -> ...
+    #       -> div class 'post-container'
+    #           -> ...
+
+    #def get_forum_post_revisions(self, post_id):
+    #    res = self.query({
+    #      'moduleName': 'forum/sub/ForumPostRevisionsModule',
+    #      'postId': post_id,
+    #    })
+    #    revisions = []
+    #    soup = BeautifulSoup(res, 'html.parser')
+    #    for row in soup.find_all("tr"):
+    #        columns = row.find_all("td")
+
+    #        if len(columns) != 3:
+    #            raise Exception('Invalid row in post history for ' + str(post_id))
+
+    #        user = columns[0].find('a').getText()
+    #        time = columns[1].find('span').getText()
+    #        rev_id_js = columns[0].find('a')['href']
+    #        match = re.search(r'showRevision\(event, ([0-9]+)\)', rev_id_js)
+    #        rev_id = match.group(1)
+
+    #        revisions.append({
+    #            'id': rev_id,
+    #            'user': user,
+    #            'time': time,
+    #            })
+
+    # Retrieves revision source for a revision.
+    # There's no raw version because there's nothing else in raw.
+    def get_revision_source(self, rev_id):
+        """Return the source text of revision rev_id.
+
+        Queries the 'history/PageSourceModule' module through self.query()
+        and returns the text of the first <div> of the response with leading
+        spaces, carriage returns and newlines stripped.
+        """
+        params = {
+            'moduleName': 'history/PageSourceModule',
+            'revision_id': rev_id,
+            # The page id is not needed for this module
+        }
+        markup = self.query(params)
+
+        # The source arrives HTMLified; getText() decodes it back to text:
+        # - htmlentities
+        # - <br/>s in place of linebreaks
+        # - random real linebreaks (have to be ignored)
+        source_div = BeautifulSoup(markup, 'html.parser').div
+        text = source_div.getText()
+        return text.lstrip(' \r\n')
+
+    # Retrieves the rendered version + additional info unavailable in get_revision_source:
+    # * Title
+    # * Unixname at the time
+    #
+    # TODO: I think this could fetch the source as well, so we don't need to
+    # fetch two pages (the fetch source function above).
+    def get_revision_version_raw(self, rev_id):
+        """Query 'history/PageVersionModule' for rev_id via self.queryex().
+
+        The result of self.queryex() is returned unchanged;
+        get_revision_version() reads element 0 as the HTML body and
+        element 1 as the title.
+        """
+        request = {
+            'moduleName': 'history/PageVersionModule',
+            'revision_id': rev_id,
+        }
+        return self.queryex(request)
+
+    def get_revision_version(self, rev_id):
+        """Fetch the rendered version of a revision plus its metadata.
+
+        Returns a dict with the keys:
+          'rev_id'   - the rev_id argument, unchanged
+          'unixname' - text of the 'Page name:' row of the version info block
+          'title'    - element 1 of the get_revision_version_raw() result
+          'content'  - the rendered HTML with the version info block removed
+          'images'   - list of {"src", "filename", "filepath"} dicts, one per
+                       'scp-image-block' div that has a usable image URL
+
+        Raises Exception when the version info block or its 'Page name:'
+        row cannot be found.
+        """
+        res = self.get_revision_version_raw(rev_id) # this has title!
+        soup = BeautifulSoup(res[0], 'html.parser')
+
+        # Extract list of images
+
+        # TODO: to get the right revision that added them, we need to go back
+        # and amend the commits that are flagged as attached_file above,
+        # because we can't get the image file name or URL reliably until they
+        # are added to the page source, wikidot itself doesn't store this information.
+        # So much hassle for little value, we get the empty commits when images
+        # are added anyways.
+        images = []
+        for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}):
+            img_src = None
+            img_name = ""
+            full_link = img_div.find("a")
+            if full_link is not None:
+                # Check if it has a thumbnail, otherwise we can't trust that it is the original
+                img = full_link.find("img", attrs={"class": "enlarge"})
+                if img is not None:
+                    # .get() so a missing href/alt falls through to the
+                    # plain <img> lookup below instead of raising KeyError.
+                    img_src = full_link.get("href")
+                    img_name = img.get("alt", "")
+
+            if img_src is None:
+                img = img_div.find("img")
+                if img is not None:
+                    img_src = img.get("src")
+                    img_name = img.get("alt", "")
+
+            if img_src is None:
+                continue
+
+            # Just in case, I don't think it ever happens, but resolve '..'
+            # juuuust in case someone tries to be funny
+            # NOTE(review): joining "." onto an absolute URL appears to yield
+            # the URL's directory, so the file name below would come from the
+            # alt text, which is not sanitized here - confirm this is intended.
+            img_url = urlparse(urljoin(img_src, "."))
+
+            img_path = ""
+            if img_url.netloc != "":
+                img_path = img_url.netloc + "/"
+                # NOTE(review): this tests netloc, not img_path, so a second
+                # '/' is appended whenever netloc does not itself end in '/'.
+                # Left as is because callers may rely on the produced paths.
+                if img_url.netloc[-1] != '/':
+                    img_path += '/'
+
+            # Append the URL path without its leading slash
+            if img_url.path != "" and img_url.path[0] == '/':
+                img_path += img_url.path[1:]
+            else:
+                img_path += img_url.path
+
+            # A path that names a directory gets the alt text as file name
+            if img_path == "" or img_path[-1] == "/":
+                img_path += img_name
+
+            images.append({"src": img_src, "filename": img_name, "filepath": "images/" + img_path})
+
+        # First table is a flyout with revision details. Remove and study it.
+        details = soup.find("div", attrs={"id": "page-version-info"})
+        if details is None:
+            raise Exception('Failed to find page-version-info for ' + str(rev_id))
+        details.extract()
+
+        unixname = None
+        for tr in details.find_all('tr'):
+            tds = tr.find_all('td')
+            if len(tds) < 2: continue
+            if tds[0].getText().strip() == 'Page name:':
+                unixname = tds[1].getText().strip()
+
+        if unixname is None:
+            # str() so a non-string rev_id cannot turn this into a TypeError
+            raise Exception('Failed to find unixname for ' + str(rev_id))
+
+        return {
+          'rev_id': rev_id,
+          'unixname': unixname,
+          'title': res[1],
+          'content': str(soup), # only content remains
+          'images': images,
+        }