From 66461594d1443910294827fb19def13ce51db957 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 13:38:21 +0200
Subject: [PATCH 01/93] python3 does not support string exceptions

---
 crawl.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/crawl.py b/crawl.py
index 22da38a..ab47baf 100644
--- a/crawl.py
+++ b/crawl.py
@@ -47,7 +47,7 @@ def force_dirs(path):
         os.makedirs(path)
     except OSError as exception:
         if exception.errno != os.errno.EEXIST:
-            raise
+            raise exception
 
 if args.list_pages_raw:
 	print wd.list_pages_raw(args.depth)
@@ -58,44 +58,44 @@ def force_dirs(path):
 
 elif args.source:
 	if not args.page:
-		raise "Please specify --page for --source."
+		raise Exception("Please specify --page for --source."
 	
 	page_id = wd.get_page_id(args.page)
 	if not page_id:
-		raise "Page not found: "+args.page
+		raise Exception("Page not found: "+args.page)
 	
 	revs = wd.get_revisions(page_id, 1) # last revision
 	print wd.get_revision_source(revs[0]['id'])
 
 elif args.content:
 	if not args.page:
-		raise "Please specify --page for --source."
+		raise Exception("Please specify --page for --source.")
 	
 	page_id = wd.get_page_id(args.page)
 	if not page_id:
-		raise "Page not found: "+args.page
+		raise Exception("Page not found: "+args.page)
 	
 	revs = wd.get_revisions(page_id, 1) # last revision
 	print wd.get_revision_version(revs[0]['id'])
 
 elif args.log_raw:
 	if not args.page:
-		raise "Please specify --page for --log."
+		raise Exception("Please specify --page for --log.")
 
 	page_id = wd.get_page_id(args.page)
 	if not page_id:
-		raise "Page not found: "+args.page
+		raise Exception("Page not found: "+args.page)
 
 	print wd.get_revisions_raw(page_id, args.depth)
 
 
 elif args.log:
 	if not args.page:
-		raise "Please specify --page for --log."
+		raise Exception("Please specify --page for --log.")
 
 	page_id = wd.get_page_id(args.page)
 	if not page_id:
-		raise "Page not found: "+args.page
+		raise Exception("Page not found: "+args.page)
 	for rev in wd.get_revisions(page_id, args.depth):
 		print unicode(rev)
 

From a9360af935f1c737af5c8fa403646ebf272cc24e Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 13:39:55 +0200
Subject: [PATCH 02/93] dos2unix, missing )

---
 crawl.py | 236 +++++++++++++++++++++++++++----------------------------
 1 file changed, 118 insertions(+), 118 deletions(-)

diff --git a/crawl.py b/crawl.py
index ab47baf..566b70c 100644
--- a/crawl.py
+++ b/crawl.py
@@ -1,118 +1,118 @@
-import argparse
-import sys
-import locale
-import codecs
-import os
-from wikidot import Wikidot
-from rmaint import RepoMaintainer
-
-# TODO: Files.
-# TODO: Forum and comment pages.
-# TODO: Ability to download new transactions since last dump.
-#   We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
-
-rawStdout = sys.stdout
-rawStderr = sys.stderr
-sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace')
-sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace')
-
-parser = argparse.ArgumentParser(description='Queries Wikidot')
-parser.add_argument('site', help='URL of Wikidot site')
-# Actions
-parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
-parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
-parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
-parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
-parser.add_argument('--dump', type=str, help='Download page revisions to this directory')
-# Debug actions
-parser.add_argument('--list-pages-raw', action='store_true')
-parser.add_argument('--log-raw', action='store_true')
-# Action settings
-parser.add_argument('--page', type=str, help='Query only this page')
-parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
-parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository')
-# Common settings
-parser.add_argument('--debug', action='store_true', help='Print debug info')
-parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
-args = parser.parse_args()
-
-
-wd = Wikidot(args.site)
-wd.debug = args.debug
-wd.delay = args.delay
-
-
-def force_dirs(path):
-    try:
-        os.makedirs(path)
-    except OSError as exception:
-        if exception.errno != os.errno.EEXIST:
-            raise exception
-
-if args.list_pages_raw:
-	print wd.list_pages_raw(args.depth)
-
-elif args.list_pages:
-	for page in wd.list_pages(args.depth):
-		print page
-
-elif args.source:
-	if not args.page:
-		raise Exception("Please specify --page for --source."
-	
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-	
-	revs = wd.get_revisions(page_id, 1) # last revision
-	print wd.get_revision_source(revs[0]['id'])
-
-elif args.content:
-	if not args.page:
-		raise Exception("Please specify --page for --source.")
-	
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-	
-	revs = wd.get_revisions(page_id, 1) # last revision
-	print wd.get_revision_version(revs[0]['id'])
-
-elif args.log_raw:
-	if not args.page:
-		raise Exception("Please specify --page for --log.")
-
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-
-	print wd.get_revisions_raw(page_id, args.depth)
-
-
-elif args.log:
-	if not args.page:
-		raise Exception("Please specify --page for --log.")
-
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-	for rev in wd.get_revisions(page_id, args.depth):
-		print unicode(rev)
-
-
-elif args.dump:
-	print "Downloading pages to "+args.dump
-	force_dirs(args.dump)
-	
-	rm = RepoMaintainer(wd, args.dump)
-	rm.debug = args.debug
-	rm.storeRevIds = args.revids
-	rm.buildRevisionList([args.page] if args.page else None, args.depth)
-	rm.openRepo()
-	
-	print "Downloading revisions..."
-	while rm.commitNext():
-		pass
-	
-	rm.cleanup()
-	print "Done."
+import argparse
+import sys
+import locale
+import codecs
+import os
+from wikidot import Wikidot
+from rmaint import RepoMaintainer
+
+# TODO: Files.
+# TODO: Forum and comment pages.
+# TODO: Ability to download new transactions since last dump.
+#   We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
+
+rawStdout = sys.stdout
+rawStderr = sys.stderr
+sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace')
+sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace')
+
+parser = argparse.ArgumentParser(description='Queries Wikidot')
+parser.add_argument('site', help='URL of Wikidot site')
+# Actions
+parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
+parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
+parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
+parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
+parser.add_argument('--dump', type=str, help='Download page revisions to this directory')
+# Debug actions
+parser.add_argument('--list-pages-raw', action='store_true')
+parser.add_argument('--log-raw', action='store_true')
+# Action settings
+parser.add_argument('--page', type=str, help='Query only this page')
+parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
+parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository')
+# Common settings
+parser.add_argument('--debug', action='store_true', help='Print debug info')
+parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
+args = parser.parse_args()
+
+
+wd = Wikidot(args.site)
+wd.debug = args.debug
+wd.delay = args.delay
+
+
+def force_dirs(path):
+    try:
+        os.makedirs(path)
+    except OSError as exception:
+        if exception.errno != os.errno.EEXIST:
+            raise exception
+
+if args.list_pages_raw:
+	print wd.list_pages_raw(args.depth)
+
+elif args.list_pages:
+	for page in wd.list_pages(args.depth):
+		print page
+
+elif args.source:
+	if not args.page:
+		raise Exception("Please specify --page for --source.")
+	
+	page_id = wd.get_page_id(args.page)
+	if not page_id:
+		raise Exception("Page not found: "+args.page)
+	
+	revs = wd.get_revisions(page_id, 1) # last revision
+	print wd.get_revision_source(revs[0]['id'])
+
+elif args.content:
+	if not args.page:
+		raise Exception("Please specify --page for --source.")
+	
+	page_id = wd.get_page_id(args.page)
+	if not page_id:
+		raise Exception("Page not found: "+args.page)
+	
+	revs = wd.get_revisions(page_id, 1) # last revision
+	print wd.get_revision_version(revs[0]['id'])
+
+elif args.log_raw:
+	if not args.page:
+		raise Exception("Please specify --page for --log.")
+
+	page_id = wd.get_page_id(args.page)
+	if not page_id:
+		raise Exception("Page not found: "+args.page)
+
+	print wd.get_revisions_raw(page_id, args.depth)
+
+
+elif args.log:
+	if not args.page:
+		raise Exception("Please specify --page for --log.")
+
+	page_id = wd.get_page_id(args.page)
+	if not page_id:
+		raise Exception("Page not found: "+args.page)
+	for rev in wd.get_revisions(page_id, args.depth):
+		print unicode(rev)
+
+
+elif args.dump:
+	print "Downloading pages to "+args.dump
+	force_dirs(args.dump)
+	
+	rm = RepoMaintainer(wd, args.dump)
+	rm.debug = args.debug
+	rm.storeRevIds = args.revids
+	rm.buildRevisionList([args.page] if args.page else None, args.depth)
+	rm.openRepo()
+	
+	print "Downloading revisions..."
+	while rm.commitNext():
+		pass
+	
+	rm.cleanup()
+	print "Done."

From fcdc5bdfdf9f237b90ad61c42f24cc1ef5c8400d Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 13:42:23 +0200
Subject: [PATCH 03/93] run 2to3 on crawl.py

---
 crawl.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/crawl.py b/crawl.py
index 566b70c..08bbbf3 100644
--- a/crawl.py
+++ b/crawl.py
@@ -50,11 +50,11 @@ def force_dirs(path):
             raise exception
 
 if args.list_pages_raw:
-	print wd.list_pages_raw(args.depth)
+	print(wd.list_pages_raw(args.depth))
 
 elif args.list_pages:
 	for page in wd.list_pages(args.depth):
-		print page
+		print(page)
 
 elif args.source:
 	if not args.page:
@@ -65,7 +65,7 @@ def force_dirs(path):
 		raise Exception("Page not found: "+args.page)
 	
 	revs = wd.get_revisions(page_id, 1) # last revision
-	print wd.get_revision_source(revs[0]['id'])
+	print(wd.get_revision_source(revs[0]['id']))
 
 elif args.content:
 	if not args.page:
@@ -76,7 +76,7 @@ def force_dirs(path):
 		raise Exception("Page not found: "+args.page)
 	
 	revs = wd.get_revisions(page_id, 1) # last revision
-	print wd.get_revision_version(revs[0]['id'])
+	print(wd.get_revision_version(revs[0]['id']))
 
 elif args.log_raw:
 	if not args.page:
@@ -86,7 +86,7 @@ def force_dirs(path):
 	if not page_id:
 		raise Exception("Page not found: "+args.page)
 
-	print wd.get_revisions_raw(page_id, args.depth)
+	print(wd.get_revisions_raw(page_id, args.depth))
 
 
 elif args.log:
@@ -97,11 +97,11 @@ def force_dirs(path):
 	if not page_id:
 		raise Exception("Page not found: "+args.page)
 	for rev in wd.get_revisions(page_id, args.depth):
-		print unicode(rev)
+		print(str(rev))
 
 
 elif args.dump:
-	print "Downloading pages to "+args.dump
+	print("Downloading pages to "+args.dump)
 	force_dirs(args.dump)
 	
 	rm = RepoMaintainer(wd, args.dump)
@@ -110,9 +110,9 @@ def force_dirs(path):
 	rm.buildRevisionList([args.page] if args.page else None, args.depth)
 	rm.openRepo()
 	
-	print "Downloading revisions..."
+	print("Downloading revisions...")
 	while rm.commitNext():
 		pass
 	
 	rm.cleanup()
-	print "Done."
+	print("Done.")

From 8dcab9e35d1a478f5d0ebe78219187ccad5f979c Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 13:42:33 +0200
Subject: [PATCH 04/93] dos2unix on the rest

---
 .hgignore  |   4 +-
 hgpatch.py | 100 +++++-----
 readme.md  |  60 +++---
 rmaint.py  | 524 ++++++++++++++++++++++++++---------------------------
 wikidot.py | 384 +++++++++++++++++++--------------------
 5 files changed, 536 insertions(+), 536 deletions(-)

diff --git a/.hgignore b/.hgignore
index 471301b..a26d142 100644
--- a/.hgignore
+++ b/.hgignore
@@ -1,2 +1,2 @@
-syntax:glob
-*.pyc
+syntax:glob
+*.pyc
diff --git a/hgpatch.py b/hgpatch.py
index 6d2ff12..2d77769 100644
--- a/hgpatch.py
+++ b/hgpatch.py
@@ -1,50 +1,50 @@
-from mercurial import scmutil, osutil
-from types import MethodType
-from mercurial import encoding
-import codecs
-
-# Patches commit-message unicode handling on Python 2.x
-
-# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert
-# all input from "input encoding" (set in mercurial/encoding.py)
-
-# Problem 1:
-#   If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8,
-#   it'll still try to decode it to ASCII.
-# Solution:
-#   Patch this decoding function to pass unicode unchanged.
-
-old_fromlocal = None
-
-def better_fromlocal(s):
-	if isinstance(s, unicode):
-		return s.encode('utf-8')
-	global old_fromlocal
-	return old_fromlocal(s)
-
-old_fromlocal = encoding.fromlocal
-encoding.fromlocal = better_fromlocal
-
-
-# Problem 2:
-#   Separate from actual log, Mercurial stores commit message in commit-message.txt.
-#   Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails.
-# Solution:
-#   Patch virtual-fs open() function to use codecs.open wrapper in this particular case.
-
-old_vfs_call = None
-
-def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False):
-	fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
-	if path.endswith('last-message.txt'):
-		# Create a wrapper like codecs.open does:
-		info = codecs.lookup("utf-8")
-		fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
-		fp.encoding = 'utf-8'
-	return fp
-
-old_vfs_call = scmutil.vfs.__call__
-scmutil.vfs.__call__ = better_vfs_call
-
-
-
+from mercurial import scmutil, osutil
+from types import MethodType
+from mercurial import encoding
+import codecs
+
+# Patches commit-message unicode handling on Python 2.x
+
+# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert
+# all input from "input encoding" (set in mercurial/encoding.py)
+
+# Problem 1:
+#   If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8,
+#   it'll still try to decode it to ASCII.
+# Solution:
+#   Patch this decoding function to pass unicode unchanged.
+
+old_fromlocal = None
+
+def better_fromlocal(s):
+	if isinstance(s, unicode):
+		return s.encode('utf-8')
+	global old_fromlocal
+	return old_fromlocal(s)
+
+old_fromlocal = encoding.fromlocal
+encoding.fromlocal = better_fromlocal
+
+
+# Problem 2:
+#   Separate from actual log, Mercurial stores commit message in commit-message.txt.
+#   Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails.
+# Solution:
+#   Patch virtual-fs open() function to use codecs.open wrapper in this particular case.
+
+old_vfs_call = None
+
+def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False):
+	fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
+	if path.endswith('last-message.txt'):
+		# Create a wrapper like codecs.open does:
+		info = codecs.lookup("utf-8")
+		fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
+		fp.encoding = 'utf-8'
+	return fp
+
+old_vfs_call = scmutil.vfs.__call__
+scmutil.vfs.__call__ = better_vfs_call
+
+
+
diff --git a/readme.md b/readme.md
index f66a0cc..2458933 100644
--- a/readme.md
+++ b/readme.md
@@ -1,30 +1,30 @@
-This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you:
-
-* List all pages on a site
-* See all revisions of a page
-* Query page source
-
-Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments!
-
-##### Examples:
-
-    crawl.py http://example.wikidot.com --dump ExampleRepo
-    crawl.py http://example.wikidot.com --log --page example-page
-
-It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers.
-
-Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed.
-
-##### Useful links:
-
-Wikidot code (very old) which simplifies things a bit:
-
-* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
-The descriptions for on-site modules are heavily correlated with AJAX ones:
-
-* http://www.wikidot.com/doc-modules:listpages-module
-
-Someone else did Wikidot AJAX:
-
-* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
+This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you:
+
+* List all pages on a site
+* See all revisions of a page
+* Query page source
+
+Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments!
+
+##### Examples:
+
+    crawl.py http://example.wikidot.com --dump ExampleRepo
+    crawl.py http://example.wikidot.com --log --page example-page
+
+It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers.
+
+Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed.
+
+##### Useful links:
+
+Wikidot code (very old) which simplifies things a bit:
+
+* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+The descriptions for on-site modules are heavily correlated with AJAX ones:
+
+* http://www.wikidot.com/doc-modules:listpages-module
+
+Someone else did Wikidot AJAX:
+
+* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
diff --git a/rmaint.py b/rmaint.py
index 029319f..fe21027 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -1,263 +1,263 @@
-import os
-import codecs
-from mercurial import commands, ui, hg
-import hgpatch
-import cPickle as pickle
-import wikidot
-
-# Repository builder and maintainer
-# Contains logic for actual loading and maintaining the repository over the course of its construction.
-
-# Usage:
-#   rm = RepoMaintainer(wikidot, path)
-#   rm.buildRevisionList(pages, depth)
-#   rm.openRepo()
-#   while rm.commitNext():
-#		pass
-#   rm.cleanup()
-
-# Talkative.
-
-class RepoMaintainer:
-	def __init__(self, wikidot, path):
-		# Settings
-		self.wd = wikidot			# Wikidot instance
-		self.path = path			# Path to repository
-		self.debug = False			# = True to enable more printing
-		self.storeRevIds = True		# = True to store .revid with each commit
-		
-		# Internal state
-		self.wrevs = None			# Compiled wikidot revision list (history)
-		
-		self.rev_no	= 0				# Next revision to process
-		self.last_names = {}		# Tracks page renames: name atm -> last name in repo
-		self.last_parents = {}		# Tracks page parent names: name atm -> last parent in repo
-		
-		self.ui = None				# Mercurial UI object
-		self.repo = None			# Mercurial repo object
-
-
-	#
-	# Saves and loads revision list from file
-	#
-	def saveWRevs(self):
-		fp = open(self.path+'\\.wrevs', 'wb')
-		pickle.dump(self.wrevs, fp)
-		fp.close()
-	
-	def loadWRevs(self):
-		fp = open(self.path+'\\.wrevs', 'rb')
-		self.wrevs = pickle.load(fp)
-		fp.close()
-
-	#
-	# Compiles a combined revision list for a given set of pages, or all pages on the site.
-	#  pages: compile history for these pages
-	#  depth: download at most this number of revisions.
-	#
-	# If there exists a cached revision list at the repository destination,
-	# it is loaded and no requests are made.
-	#
-	def buildRevisionList(self, pages = None, depth = 10000):
-		if os.path.isfile(self.path+'\\.wrevs'):
-			print "Loading cached revision list..."
-			self.loadWRevs()
-		else:
-			print "Building revision list..."
-			if not pages:
-				pages = self.wd.list_pages(10000)
-			self.wrevs = []
-			for page in pages:
-				print "Querying page: "+page
-				page_id = self.wd.get_page_id(page)
-				print "ID: "+str(page_id)
-				revs = self.wd.get_revisions(page_id, depth)
-				print "Revisions: "+str(len(revs))
-				for rev in revs:
-					self.wrevs.append({
-					  'page_id' : page_id,
-					  'page_name' : page, # name atm, not at revision time
-					  'rev_id' : rev['id'],
-					  'date' : rev['date'],
-					  'user' : rev['user'],
-					  'comment' : rev['comment'],
-					})
-			self.saveWRevs() # Save a cached copy
-			print ""
-		
-		
-		print "Total revisions: "+str(len(self.wrevs))
-		
-		print "Sorting revisions..."
-		self.wrevs.sort(key=lambda rev: rev['date'])
-		print ""
-		
-		if self.debug:
-			print "Revision list: "
-			for rev in self.wrevs:
-				print str(rev)+"\n"
-			print ""
-
-
-	#
-	# Saves and loads operational state from file
-	#
-	def saveState(self):
-		fp = open(self.path+'\\.wstate', 'wb')
-		pickle.dump(self.rev_no, fp)
-		pickle.dump(self.last_names, fp)
-		pickle.dump(self.last_parents, fp)
-		fp.close()
-	
-	def loadState(self):
-		fp = open(self.path+'\\.wstate', 'rb')
-		self.rev_no = pickle.load(fp)
-		self.last_names = pickle.load(fp)
-		try:
-			self.last_parents = pickle.load(fp)
-		except EOFError:
-			pass
-		fp.close()
-
-
-	#
-	# Initializes the construction process, after the revision list has been compiled.
-	# Either creates a new repo, or loads the existing one at the target path
-	# and restores its construction state.
-	#
-	def openRepo(self):
-		# Create a new repository or continue from aborted dump
-		self.ui=ui.ui()
-		self.last_names = {} # Tracks page renames: name atm -> last name in repo
-		self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
-		
-		if os.path.isfile(self.path+'\\.wstate'):
-			print "Continuing from aborted dump state..."
-			self.loadState()
-			self.repo = hg.repository(self.ui, self.path)
-		
-		else: # create a new repository (will fail if one exists)
-			print "Initializing repository..."
-			commands.init(self.ui, self.path)
-			self.repo = hg.repository(self.ui, self.path)
-			self.rev_no = 0
-			
-			if self.storeRevIds:
-				# Add revision id file to the new repo
-				fname = self.path+'\\.revid'
-				codecs.open(fname, "w", "UTF-8").close()
-				commands.add(self.ui, self.repo, str(fname))
-	
-	
-	#
-	# Takes an unprocessed revision from a revision log, fetches its data and commits it.
-	# Returns false if no unprocessed revisions remain.
-	#
-	def commitNext(self):
-		if self.rev_no >= len(self.wrevs):
-			return False
-			
-		rev = self.wrevs[self.rev_no]
-		source = self.wd.get_revision_source(rev['rev_id'])
-		# Page title and unix_name changes are only available through another request:
-		details = self.wd.get_revision_version(rev['rev_id'])
-		
-		# Store revision_id for last commit
-		# Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
-		if self.storeRevIds:
-			fname = self.path+'\\.revid'
-			outp = codecs.open(fname, "w", "UTF-8")
-			outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
-			outp.close()
-		
-		unixname = rev['page_name']
-		rev_unixname = details['unixname'] # may be different in revision than atm
-		
-		# Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
-		# The only way to know they were changed is revision comments, though evil people may trick us.
-		if rev['comment'].startswith('Parent page set to: "'):
-			# This is a parenting revision, remember the new parent
-			parent_unixname = rev['comment'][21:-2]
-			self.last_parents[unixname] = parent_unixname
-		else:
-			# Else use last parent_unixname we've recorded
-			parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
-		# There are also problems when parent page gets renamed -- see updateChildren
-		
-		# If the page is tracked and its name just changed, tell HG
-		rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname)
-		if rename:
-			self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
-			commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
-		
-		# Ouput contents
-		fname = self.path+'\\'+rev_unixname+'.txt'
-		outp = codecs.open(fname, "w", "UTF-8")
-		if details['title']:
-			outp.write('title:'+details['title']+'\n')
-		if parent_unixname:
-			outp.write('parent:'+parent_unixname+'\n')
-		outp.write(source)
-		outp.close()
-		
-		# Add new page
-		if not unixname in self.last_names: # never before seen
-			commands.add(self.ui, self.repo, str(fname))
-
-		self.last_names[unixname] = rev_unixname
-
-		# Commit
-		if rev['comment'] <> '':
-			commit_msg = rev_unixname + ': ' + rev['comment']
-		else:
-			commit_msg = rev_unixname
-		if rev['date']:
-			commit_date = str(rev['date']) + ' 0'
-		else:
-			commit_date = None
-		print "Commiting: "+str(self.rev_no)+'. '+commit_msg
-
-		commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
-		self.rev_no += 1
-
-		self.saveState() # Update operation state
-		return True
-
-
-	#
-	# Updates all children of the page to reflect parent's unixname change.
-	#
-	# Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
-	# A parent may then be renamed.
-	# Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
-	#
-	# Therefore, on every rename we must update all linked children in the same revision.
-	#
-	def updateChildren(self, oldunixname, newunixname):
-		for child in self.last_parents.keys():
-			if self.last_parents[child] == oldunixname:
-				self.updateParentField(child, self.last_parents[child], newunixname)
-	
-	#
-	# Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
-	# The rest of the file is preserved.
-	#
-	def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-		with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
-			content = f.readlines()
-		# Since this is all tracked by us, we KNOW there's a line in standard format somewhere
-		idx = content.index('parent:'+parent_oldunixname+'\n')
-		if idx < 0:
-			raise Exception("Cannot update child page "+child_unixname+": "
-				+"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
-		content[idx] = 'parent:'+parent_newunixname+'\n'
-		with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
-			f.writelines(content)
-
-
-	#
-	# Finalizes the construction process and deletes any temporary files.
-	#
-	def cleanup(self):
-		os.remove(self.path+'\\.wstate')
+import os
+import codecs
+from mercurial import commands, ui, hg
+import hgpatch
+import cPickle as pickle
+import wikidot
+
+# Repository builder and maintainer
+# Contains logic for actual loading and maintaining the repository over the course of its construction.
+
+# Usage:
+#   rm = RepoMaintainer(wikidot, path)
+#   rm.buildRevisionList(pages, depth)
+#   rm.openRepo()
+#   while rm.commitNext():
+#		pass
+#   rm.cleanup()
+
+# Talkative.
+
+class RepoMaintainer:
+	def __init__(self, wikidot, path):
+		# Settings
+		self.wd = wikidot			# Wikidot instance
+		self.path = path			# Path to repository
+		self.debug = False			# = True to enable more printing
+		self.storeRevIds = True		# = True to store .revid with each commit
+		
+		# Internal state
+		self.wrevs = None			# Compiled wikidot revision list (history)
+		
+		self.rev_no	= 0				# Next revision to process
+		self.last_names = {}		# Tracks page renames: name atm -> last name in repo
+		self.last_parents = {}		# Tracks page parent names: name atm -> last parent in repo
+		
+		self.ui = None				# Mercurial UI object
+		self.repo = None			# Mercurial repo object
+
+
+	#
+	# Saves and loads revision list from file
+	#
+	def saveWRevs(self):
+		fp = open(self.path+'\\.wrevs', 'wb')
+		pickle.dump(self.wrevs, fp)
+		fp.close()
+	
+	def loadWRevs(self):
+		fp = open(self.path+'\\.wrevs', 'rb')
+		self.wrevs = pickle.load(fp)
+		fp.close()
+
+	#
+	# Compiles a combined revision list for a given set of pages, or all pages on the site.
+	#  pages: compile history for these pages
+	#  depth: download at most this number of revisions.
+	#
+	# If there exists a cached revision list at the repository destination,
+	# it is loaded and no requests are made.
+	#
+	def buildRevisionList(self, pages = None, depth = 10000):
+		if os.path.isfile(self.path+'\\.wrevs'):
+			print "Loading cached revision list..."
+			self.loadWRevs()
+		else:
+			print "Building revision list..."
+			if not pages:
+				pages = self.wd.list_pages(10000)
+			self.wrevs = []
+			for page in pages:
+				print "Querying page: "+page
+				page_id = self.wd.get_page_id(page)
+				print "ID: "+str(page_id)
+				revs = self.wd.get_revisions(page_id, depth)
+				print "Revisions: "+str(len(revs))
+				for rev in revs:
+					self.wrevs.append({
+					  'page_id' : page_id,
+					  'page_name' : page, # name atm, not at revision time
+					  'rev_id' : rev['id'],
+					  'date' : rev['date'],
+					  'user' : rev['user'],
+					  'comment' : rev['comment'],
+					})
+			self.saveWRevs() # Save a cached copy
+			print ""
+		
+		
+		print "Total revisions: "+str(len(self.wrevs))
+		
+		print "Sorting revisions..."
+		self.wrevs.sort(key=lambda rev: rev['date'])
+		print ""
+		
+		if self.debug:
+			print "Revision list: "
+			for rev in self.wrevs:
+				print str(rev)+"\n"
+			print ""
+
+
+	#
+	# Saves and loads operational state from file
+	#
+	def saveState(self):
+		fp = open(self.path+'\\.wstate', 'wb')
+		pickle.dump(self.rev_no, fp)
+		pickle.dump(self.last_names, fp)
+		pickle.dump(self.last_parents, fp)
+		fp.close()
+	
+	def loadState(self):
+		fp = open(self.path+'\\.wstate', 'rb')
+		self.rev_no = pickle.load(fp)
+		self.last_names = pickle.load(fp)
+		try:
+			self.last_parents = pickle.load(fp)
+		except EOFError:
+			pass
+		fp.close()
+
+
+	#
+	# Initializes the construction process, after the revision list has been compiled.
+	# Either creates a new repo, or loads the existing one at the target path
+	# and restores its construction state.
+	#
+	def openRepo(self):
+		# Create a new repository or continue from aborted dump
+		self.ui=ui.ui()
+		self.last_names = {} # Tracks page renames: name atm -> last name in repo
+		self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
+		
+		if os.path.isfile(self.path+'\\.wstate'):
+			print "Continuing from aborted dump state..."
+			self.loadState()
+			self.repo = hg.repository(self.ui, self.path)
+		
+		else: # create a new repository (will fail if one exists)
+			print "Initializing repository..."
+			commands.init(self.ui, self.path)
+			self.repo = hg.repository(self.ui, self.path)
+			self.rev_no = 0
+			
+			if self.storeRevIds:
+				# Add revision id file to the new repo
+				fname = self.path+'\\.revid'
+				codecs.open(fname, "w", "UTF-8").close()
+				commands.add(self.ui, self.repo, str(fname))
+	
+	
+	#
+	# Takes an unprocessed revision from a revision log, fetches its data and commits it.
+	# Returns false if no unprocessed revisions remain.
+	#
+	def commitNext(self):
+		if self.rev_no >= len(self.wrevs):
+			return False
+			
+		rev = self.wrevs[self.rev_no]
+		source = self.wd.get_revision_source(rev['rev_id'])
+		# Page title and unix_name changes are only available through another request:
+		details = self.wd.get_revision_version(rev['rev_id'])
+		
+		# Store revision_id for last commit
+		# Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
+		if self.storeRevIds:
+			fname = self.path+'\\.revid'
+			outp = codecs.open(fname, "w", "UTF-8")
+			outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
+			outp.close()
+		
+		unixname = rev['page_name']
+		rev_unixname = details['unixname'] # may be different in revision than atm
+		
+		# Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
+		# The only way to know they were changed is revision comments, though evil people may trick us.
+		if rev['comment'].startswith('Parent page set to: "'):
+			# This is a parenting revision, remember the new parent
+			parent_unixname = rev['comment'][21:-2]
+			self.last_parents[unixname] = parent_unixname
+		else:
+			# Else use last parent_unixname we've recorded
+			parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
+		# There are also problems when parent page gets renamed -- see updateChildren
+		
+		# If the page is tracked and its name just changed, tell HG
+		rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname)
+		if rename:
+			self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
+			commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
+		
+		# Ouput contents
+		fname = self.path+'\\'+rev_unixname+'.txt'
+		outp = codecs.open(fname, "w", "UTF-8")
+		if details['title']:
+			outp.write('title:'+details['title']+'\n')
+		if parent_unixname:
+			outp.write('parent:'+parent_unixname+'\n')
+		outp.write(source)
+		outp.close()
+		
+		# Add new page
+		if not unixname in self.last_names: # never before seen
+			commands.add(self.ui, self.repo, str(fname))
+
+		self.last_names[unixname] = rev_unixname
+
+		# Commit
+		if rev['comment'] <> '':
+			commit_msg = rev_unixname + ': ' + rev['comment']
+		else:
+			commit_msg = rev_unixname
+		if rev['date']:
+			commit_date = str(rev['date']) + ' 0'
+		else:
+			commit_date = None
+		print "Commiting: "+str(self.rev_no)+'. '+commit_msg
+
+		commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
+		self.rev_no += 1
+
+		self.saveState() # Update operation state
+		return True
+
+
+	#
+	# Updates all children of the page to reflect parent's unixname change.
+	#
+	# Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
+	# A parent may then be renamed.
+	# Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
+	#
+	# Therefore, on every rename we must update all linked children in the same revision.
+	#
+	def updateChildren(self, oldunixname, newunixname):
+		for child in self.last_parents.keys():
+			if self.last_parents[child] == oldunixname:
+				self.updateParentField(child, self.last_parents[child], newunixname)
+	
+	#
+	# Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
+	# The rest of the file is preserved.
+	#
+	def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
+		with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
+			content = f.readlines()
+		# Since this is all tracked by us, we KNOW there's a line in standard format somewhere
+		idx = content.index('parent:'+parent_oldunixname+'\n')
+		if idx < 0:
+			raise Exception("Cannot update child page "+child_unixname+": "
+				+"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
+		content[idx] = 'parent:'+parent_newunixname+'\n'
+		with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
+			f.writelines(content)
+
+
+	#
+	# Finalizes the construction process and deletes any temporary files.
+	#
+	def cleanup(self):
+		os.remove(self.path+'\\.wstate')
 		os.remove(self.path+'\\.wrevs')
\ No newline at end of file
diff --git a/wikidot.py b/wikidot.py
index f01c59f..df2252d 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -1,193 +1,193 @@
-import requests
-import random
-from bs4 import BeautifulSoup
-import time
-
-# Implements various queries to Wikidot engine through its AJAX facilities
-
-
-class Wikidot:
-	def __init__(self, site):
-		self.site = site		# Wikidot site to query
-		self.delay = 200		# Delay between requests in msec
-		self.debug = False		# Print debug messages
-		self.next_timeslot = time.clock()	# Can call immediately
-
-
-	# To honor usage rules, we wait for self.delay between requests.
-	# Low-level query functions call this before every request to Wikidot./
-	def _wait_request_slot(self):
-		tm = time.clock()
-		if self.next_timeslot - tm > 0:
-			time.sleep(self.next_timeslot - tm)
-		self.next_timeslot = tm + self.delay / 1000
-		pass
-
-	# Makes a Wikidot AJAX query. Returns the response+title or throws an error.
-	def queryex(self, params):
-		token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
-		cookies = {"wikidot_token7": token}
-		params['wikidot_token7'] = token
-	
-		if self.debug:
-			print params
-			print cookies
-
-		self._wait_request_slot()
-		req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
-		json = req.json()
-		if json['status'] == 'ok':
-			return json['body'], (json['title'] if 'title' in json else '')
-		else:
-			raise req.text
-
-	# Same but only returns the body, most responses don't have titles
-	def query(self, params):
-		return self.queryex(params)[0]
-
-
-	# List all pages for the site.
-
-	# Raw version
-	# For the supported formats (module_body) see:
-	# See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
-	def list_pages_raw(self, limit):
-		res = self.query({
-		  'moduleName': 'list/ListPagesModule',
-		  'limit': limit if limit else '10000',
-		  'perPage': limit if limit else '10000',
-		  'module_body': '%%page_unix_name%%',
-		  'separate': 'false',
-		  'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
-		})
-		return res
-
-	# Client version
-	def list_pages(self, limit):
-		raw = self.list_pages_raw(limit).replace('<br/>',"\n")
-		soup = BeautifulSoup(raw, 'html.parser')
-		pages = []
-		for entry in soup.div.p.text.split('\n'):
-			pages.append(entry)
-		return pages
-
-
-	# Retrieves internal page_id by page unix_name.
-	# Page IDs are required for most of page functions.
-
-	def get_page_id(self, page_unix_name):
-		# The only freaking way to get page ID is to load the page! Wikidot!
-		self._wait_request_slot()
-		req = requests.request('GET', self.site+'/'+page_unix_name)
-		soup = BeautifulSoup(req.text, 'html.parser')
-		for item in soup.head.find_all('script'):
-			text = item.text
-			pos = text.find("WIKIREQUEST.info.pageId = ")
-			if pos >= 0:
-				pos += len("WIKIREQUEST.info.pageId = ")
-				crlf = text.find(";", pos)
-				if crlf >= 0:
-					return int(text[pos:crlf])
-				else:
-					return int(text[pos:])
-		return None
-
-
-	# Retrieves a list of revisions for a page.
-	# See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
-	# Raw version
-	def get_revisions_raw(self, page_id, limit):
-		res = self.query({
-		  'moduleName': 'history/PageRevisionListModule',
-		  'page_id': page_id,
-		  'page': '1',
-		  'perpage': limit if limit else '10000',
-		  'options': '{"all":true}'
-		})
-		
-		soup = BeautifulSoup(res, 'html.parser')
-		return soup.table.contents
-
-	# Client version
-	def get_revisions(self, page_id, limit):
-		revs = []
-		for tr in self.get_revisions_raw(page_id, limit):
-			if tr.name != 'tr': continue # there's a header + various junk
-
-			# RevID is stored as a value of an INPUT field
-			rev_id = tr.input['value'] if tr.input else None
-			if rev_id is None: continue # can't parse
-
-			# Unixtime is stored as a CSS class time_*
-			rev_date = 0
-			date_span = tr.find("span", attrs={"class": "odate"})
-			if date_span is not None:
-				for cls in date_span['class']:
-					if cls.startswith('time_'):
-						rev_date = int(cls[5:])
-
-			# Username in a last <a> under <span class="printuser">
-			user_span = tr.find("span", attrs={"class": "printuser"})
-			for last_a in user_span.find_all('a'): pass
-			rev_user = last_a.getText() if last_a else None
-			
-
-			# Comment is in the last TD of the row
-			last_td = None
-			for last_td in tr.find_all('td'): pass
-			rev_comment = last_td.getText() if last_td else ""
-
-			revs.append({
-				'id': rev_id,
-				'date': rev_date,
-				'user': rev_user,
-				'comment': rev_comment,
-			})
-		return revs
-
-
-	# Retrieves revision source for a revision.
-	# There's no raw version because there's nothing else in raw.
-	def get_revision_source(self, rev_id):
-		res = self.query({
-		  'moduleName': 'history/PageSourceModule',
-		  'revision_id': rev_id,
-		  # We don't need page id
-		})
-		# The source is HTMLified but BeautifulSoup's getText() will decode that
-		# - htmlentities
-		# - <br/>s in place of linebreaks
-		# - random real linebreaks (have to be ignored)
-		soup = BeautifulSoup(res, 'html.parser')
-		return soup.div.getText().lstrip(' \r\n')
-	
-	# Retrieves the rendered version + additional info unavailable in get_revision_source:
-	# * Title
-	# * Unixname at the time
-	def get_revision_version_raw(self, rev_id):
-		res = self.queryex({
-		  'moduleName': 'history/PageVersionModule',
-		  'revision_id': rev_id,
-		})
-		return res
-	
-	def get_revision_version(self, rev_id):
-		res = self.get_revision_version_raw(rev_id) # this has title!
-		soup = BeautifulSoup(res[0], 'html.parser')
-
-		# First table is a flyout with revision details. Remove and study it.
-		unixname = None
-		details = soup.find("div", attrs={"id": "page-version-info"}).extract()
-		for tr in details.find_all('tr'):
-			tds = tr.find_all('td')
-			if len(tds) < 2: continue
-			if tds[0].getText().strip() == 'Page name:':
-				unixname = tds[1].getText().strip()
-
-		return {
-		  'rev_id': rev_id,
-		  'unixname': unixname,
-		  'title': res[1],
-		  'content': unicode(soup), # only content remains
+import requests
+import random
+from bs4 import BeautifulSoup
+import time
+
+# Implements various queries to Wikidot engine through its AJAX facilities
+
+
+class Wikidot:
+	def __init__(self, site):
+		self.site = site		# Wikidot site to query
+		self.delay = 200		# Delay between requests in msec
+		self.debug = False		# Print debug messages
+		self.next_timeslot = time.clock()	# Can call immediately
+
+
+	# To honor usage rules, we wait for self.delay between requests.
+	# Low-level query functions call this before every request to Wikidot./
+	def _wait_request_slot(self):
+		tm = time.clock()
+		if self.next_timeslot - tm > 0:
+			time.sleep(self.next_timeslot - tm)
+		self.next_timeslot = tm + self.delay / 1000
+		pass
+
+	# Makes a Wikidot AJAX query. Returns the response+title or throws an error.
+	def queryex(self, params):
+		token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
+		cookies = {"wikidot_token7": token}
+		params['wikidot_token7'] = token
+	
+		if self.debug:
+			print params
+			print cookies
+
+		self._wait_request_slot()
+		req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
+		json = req.json()
+		if json['status'] == 'ok':
+			return json['body'], (json['title'] if 'title' in json else '')
+		else:
+			raise req.text
+
+	# Same but only returns the body, most responses don't have titles
+	def query(self, params):
+		return self.queryex(params)[0]
+
+
+	# List all pages for the site.
+
+	# Raw version
+	# For the supported formats (module_body) see:
+	# See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
+	def list_pages_raw(self, limit):
+		res = self.query({
+		  'moduleName': 'list/ListPagesModule',
+		  'limit': limit if limit else '10000',
+		  'perPage': limit if limit else '10000',
+		  'module_body': '%%page_unix_name%%',
+		  'separate': 'false',
+		  'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
+		})
+		return res
+
+	# Client version
+	def list_pages(self, limit):
+		raw = self.list_pages_raw(limit).replace('<br/>',"\n")
+		soup = BeautifulSoup(raw, 'html.parser')
+		pages = []
+		for entry in soup.div.p.text.split('\n'):
+			pages.append(entry)
+		return pages
+
+
+	# Retrieves internal page_id by page unix_name.
+	# Page IDs are required for most of page functions.
+
+	def get_page_id(self, page_unix_name):
+		# The only freaking way to get page ID is to load the page! Wikidot!
+		self._wait_request_slot()
+		req = requests.request('GET', self.site+'/'+page_unix_name)
+		soup = BeautifulSoup(req.text, 'html.parser')
+		for item in soup.head.find_all('script'):
+			text = item.text
+			pos = text.find("WIKIREQUEST.info.pageId = ")
+			if pos >= 0:
+				pos += len("WIKIREQUEST.info.pageId = ")
+				crlf = text.find(";", pos)
+				if crlf >= 0:
+					return int(text[pos:crlf])
+				else:
+					return int(text[pos:])
+		return None
+
+
+	# Retrieves a list of revisions for a page.
+	# See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+	# Raw version
+	def get_revisions_raw(self, page_id, limit):
+		res = self.query({
+		  'moduleName': 'history/PageRevisionListModule',
+		  'page_id': page_id,
+		  'page': '1',
+		  'perpage': limit if limit else '10000',
+		  'options': '{"all":true}'
+		})
+		
+		soup = BeautifulSoup(res, 'html.parser')
+		return soup.table.contents
+
+	# Client version
+	def get_revisions(self, page_id, limit):
+		revs = []
+		for tr in self.get_revisions_raw(page_id, limit):
+			if tr.name != 'tr': continue # there's a header + various junk
+
+			# RevID is stored as a value of an INPUT field
+			rev_id = tr.input['value'] if tr.input else None
+			if rev_id is None: continue # can't parse
+
+			# Unixtime is stored as a CSS class time_*
+			rev_date = 0
+			date_span = tr.find("span", attrs={"class": "odate"})
+			if date_span is not None:
+				for cls in date_span['class']:
+					if cls.startswith('time_'):
+						rev_date = int(cls[5:])
+
+			# Username in a last <a> under <span class="printuser">
+			user_span = tr.find("span", attrs={"class": "printuser"})
+			for last_a in user_span.find_all('a'): pass
+			rev_user = last_a.getText() if last_a else None
+			
+
+			# Comment is in the last TD of the row
+			last_td = None
+			for last_td in tr.find_all('td'): pass
+			rev_comment = last_td.getText() if last_td else ""
+
+			revs.append({
+				'id': rev_id,
+				'date': rev_date,
+				'user': rev_user,
+				'comment': rev_comment,
+			})
+		return revs
+
+
+	# Retrieves revision source for a revision.
+	# There's no raw version because there's nothing else in raw.
+	def get_revision_source(self, rev_id):
+		res = self.query({
+		  'moduleName': 'history/PageSourceModule',
+		  'revision_id': rev_id,
+		  # We don't need page id
+		})
+		# The source is HTMLified but BeautifulSoup's getText() will decode that
+		# - htmlentities
+		# - <br/>s in place of linebreaks
+		# - random real linebreaks (have to be ignored)
+		soup = BeautifulSoup(res, 'html.parser')
+		return soup.div.getText().lstrip(' \r\n')
+	
+	# Retrieves the rendered version + additional info unavailable in get_revision_source:
+	# * Title
+	# * Unixname at the time
+	def get_revision_version_raw(self, rev_id):
+		res = self.queryex({
+		  'moduleName': 'history/PageVersionModule',
+		  'revision_id': rev_id,
+		})
+		return res
+	
+	def get_revision_version(self, rev_id):
+		res = self.get_revision_version_raw(rev_id) # this has title!
+		soup = BeautifulSoup(res[0], 'html.parser')
+
+		# First table is a flyout with revision details. Remove and study it.
+		unixname = None
+		details = soup.find("div", attrs={"id": "page-version-info"}).extract()
+		for tr in details.find_all('tr'):
+			tds = tr.find_all('td')
+			if len(tds) < 2: continue
+			if tds[0].getText().strip() == 'Page name:':
+				unixname = tds[1].getText().strip()
+
+		return {
+		  'rev_id': rev_id,
+		  'unixname': unixname,
+		  'title': res[1],
+		  'content': unicode(soup), # only content remains
 		}
\ No newline at end of file

From 68e9b67aa906547ff2b3b23e7dfe6ea50c6e2fa4 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 13:43:08 +0200
Subject: [PATCH 05/93] 2to3 on the rest

---
 crawl.py   | 12 ++++++------
 hgpatch.py |  2 +-
 rmaint.py  | 38 +++++++++++++++++++-------------------
 wikidot.py |  6 +++---
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/crawl.py b/crawl.py
index 08bbbf3..0516205 100644
--- a/crawl.py
+++ b/crawl.py
@@ -50,7 +50,7 @@ def force_dirs(path):
             raise exception
 
 if args.list_pages_raw:
-	print(wd.list_pages_raw(args.depth))
+	print((wd.list_pages_raw(args.depth)))
 
 elif args.list_pages:
 	for page in wd.list_pages(args.depth):
@@ -65,7 +65,7 @@ def force_dirs(path):
 		raise Exception("Page not found: "+args.page)
 	
 	revs = wd.get_revisions(page_id, 1) # last revision
-	print(wd.get_revision_source(revs[0]['id']))
+	print((wd.get_revision_source(revs[0]['id'])))
 
 elif args.content:
 	if not args.page:
@@ -76,7 +76,7 @@ def force_dirs(path):
 		raise Exception("Page not found: "+args.page)
 	
 	revs = wd.get_revisions(page_id, 1) # last revision
-	print(wd.get_revision_version(revs[0]['id']))
+	print((wd.get_revision_version(revs[0]['id'])))
 
 elif args.log_raw:
 	if not args.page:
@@ -86,7 +86,7 @@ def force_dirs(path):
 	if not page_id:
 		raise Exception("Page not found: "+args.page)
 
-	print(wd.get_revisions_raw(page_id, args.depth))
+	print((wd.get_revisions_raw(page_id, args.depth)))
 
 
 elif args.log:
@@ -97,11 +97,11 @@ def force_dirs(path):
 	if not page_id:
 		raise Exception("Page not found: "+args.page)
 	for rev in wd.get_revisions(page_id, args.depth):
-		print(str(rev))
+		print((str(rev)))
 
 
 elif args.dump:
-	print("Downloading pages to "+args.dump)
+	print(("Downloading pages to "+args.dump))
 	force_dirs(args.dump)
 	
 	rm = RepoMaintainer(wd, args.dump)
diff --git a/hgpatch.py b/hgpatch.py
index 2d77769..de363ba 100644
--- a/hgpatch.py
+++ b/hgpatch.py
@@ -17,7 +17,7 @@
 old_fromlocal = None
 
 def better_fromlocal(s):
-	if isinstance(s, unicode):
+	if isinstance(s, str):
 		return s.encode('utf-8')
 	global old_fromlocal
 	return old_fromlocal(s)
diff --git a/rmaint.py b/rmaint.py
index fe21027..bfc0c87 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -2,7 +2,7 @@
 import codecs
 from mercurial import commands, ui, hg
 import hgpatch
-import cPickle as pickle
+import pickle as pickle
 import wikidot
 
 # Repository builder and maintainer
@@ -60,19 +60,19 @@ def loadWRevs(self):
 	#
 	def buildRevisionList(self, pages = None, depth = 10000):
 		if os.path.isfile(self.path+'\\.wrevs'):
-			print "Loading cached revision list..."
+			print("Loading cached revision list...")
 			self.loadWRevs()
 		else:
-			print "Building revision list..."
+			print("Building revision list...")
 			if not pages:
 				pages = self.wd.list_pages(10000)
 			self.wrevs = []
 			for page in pages:
-				print "Querying page: "+page
+				print(("Querying page: "+page))
 				page_id = self.wd.get_page_id(page)
-				print "ID: "+str(page_id)
+				print(("ID: "+str(page_id)))
 				revs = self.wd.get_revisions(page_id, depth)
-				print "Revisions: "+str(len(revs))
+				print(("Revisions: "+str(len(revs))))
 				for rev in revs:
 					self.wrevs.append({
 					  'page_id' : page_id,
@@ -83,20 +83,20 @@ def buildRevisionList(self, pages = None, depth = 10000):
 					  'comment' : rev['comment'],
 					})
 			self.saveWRevs() # Save a cached copy
-			print ""
+			print("")
 		
 		
-		print "Total revisions: "+str(len(self.wrevs))
+		print(("Total revisions: "+str(len(self.wrevs))))
 		
-		print "Sorting revisions..."
+		print("Sorting revisions...")
 		self.wrevs.sort(key=lambda rev: rev['date'])
-		print ""
+		print("")
 		
 		if self.debug:
-			print "Revision list: "
+			print("Revision list: ")
 			for rev in self.wrevs:
-				print str(rev)+"\n"
-			print ""
+				print((str(rev)+"\n"))
+			print("")
 
 
 	#
@@ -132,12 +132,12 @@ def openRepo(self):
 		self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
 		
 		if os.path.isfile(self.path+'\\.wstate'):
-			print "Continuing from aborted dump state..."
+			print("Continuing from aborted dump state...")
 			self.loadState()
 			self.repo = hg.repository(self.ui, self.path)
 		
 		else: # create a new repository (will fail if one exists)
-			print "Initializing repository..."
+			print("Initializing repository...")
 			commands.init(self.ui, self.path)
 			self.repo = hg.repository(self.ui, self.path)
 			self.rev_no = 0
@@ -185,7 +185,7 @@ def commitNext(self):
 		# There are also problems when parent page gets renamed -- see updateChildren
 		
 		# If the page is tracked and its name just changed, tell HG
-		rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname)
+		rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
 		if rename:
 			self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
 			commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
@@ -207,7 +207,7 @@ def commitNext(self):
 		self.last_names[unixname] = rev_unixname
 
 		# Commit
-		if rev['comment'] <> '':
+		if rev['comment'] != '':
 			commit_msg = rev_unixname + ': ' + rev['comment']
 		else:
 			commit_msg = rev_unixname
@@ -215,7 +215,7 @@ def commitNext(self):
 			commit_date = str(rev['date']) + ' 0'
 		else:
 			commit_date = None
-		print "Commiting: "+str(self.rev_no)+'. '+commit_msg
+		print(("Commiting: "+str(self.rev_no)+'. '+commit_msg))
 
 		commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
 		self.rev_no += 1
@@ -234,7 +234,7 @@ def commitNext(self):
 	# Therefore, on every rename we must update all linked children in the same revision.
 	#
 	def updateChildren(self, oldunixname, newunixname):
-		for child in self.last_parents.keys():
+		for child in list(self.last_parents.keys()):
 			if self.last_parents[child] == oldunixname:
 				self.updateParentField(child, self.last_parents[child], newunixname)
 	
diff --git a/wikidot.py b/wikidot.py
index df2252d..ba1c218 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -30,8 +30,8 @@ def queryex(self, params):
 		params['wikidot_token7'] = token
 	
 		if self.debug:
-			print params
-			print cookies
+			print(params)
+			print(cookies)
 
 		self._wait_request_slot()
 		req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
@@ -189,5 +189,5 @@ def get_revision_version(self, rev_id):
 		  'rev_id': rev_id,
 		  'unixname': unixname,
 		  'title': res[1],
-		  'content': unicode(soup), # only content remains
+		  'content': str(soup), # only content remains
 		}
\ No newline at end of file

From 3f5fbd0c9bde859a48f3b47f746edb3cb445a417 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 13:54:35 +0200
Subject: [PATCH 06/93] tabs to spaces

---
 crawl.py   |  98 ++++++------
 hgpatch.py |  22 +--
 rmaint.py  | 456 ++++++++++++++++++++++++++---------------------------
 wikidot.py | 368 +++++++++++++++++++++---------------------
 4 files changed, 472 insertions(+), 472 deletions(-)

diff --git a/crawl.py b/crawl.py
index 0516205..5ca5dfe 100644
--- a/crawl.py
+++ b/crawl.py
@@ -50,69 +50,69 @@ def force_dirs(path):
             raise exception
 
 if args.list_pages_raw:
-	print((wd.list_pages_raw(args.depth)))
+    print((wd.list_pages_raw(args.depth)))
 
 elif args.list_pages:
-	for page in wd.list_pages(args.depth):
-		print(page)
+    for page in wd.list_pages(args.depth):
+        print(page)
 
 elif args.source:
-	if not args.page:
-		raise Exception("Please specify --page for --source.")
-	
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-	
-	revs = wd.get_revisions(page_id, 1) # last revision
-	print((wd.get_revision_source(revs[0]['id'])))
+    if not args.page:
+        raise Exception("Please specify --page for --source.")
+    
+    page_id = wd.get_page_id(args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+    
+    revs = wd.get_revisions(page_id, 1) # last revision
+    print((wd.get_revision_source(revs[0]['id'])))
 
 elif args.content:
-	if not args.page:
-		raise Exception("Please specify --page for --source.")
-	
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-	
-	revs = wd.get_revisions(page_id, 1) # last revision
-	print((wd.get_revision_version(revs[0]['id'])))
+    if not args.page:
+        raise Exception("Please specify --page for --source.")
+    
+    page_id = wd.get_page_id(args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+    
+    revs = wd.get_revisions(page_id, 1) # last revision
+    print((wd.get_revision_version(revs[0]['id'])))
 
 elif args.log_raw:
-	if not args.page:
-		raise Exception("Please specify --page for --log.")
+    if not args.page:
+        raise Exception("Please specify --page for --log.")
 
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
+    page_id = wd.get_page_id(args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
 
-	print((wd.get_revisions_raw(page_id, args.depth)))
+    print((wd.get_revisions_raw(page_id, args.depth)))
 
 
 elif args.log:
-	if not args.page:
-		raise Exception("Please specify --page for --log.")
+    if not args.page:
+        raise Exception("Please specify --page for --log.")
 
-	page_id = wd.get_page_id(args.page)
-	if not page_id:
-		raise Exception("Page not found: "+args.page)
-	for rev in wd.get_revisions(page_id, args.depth):
-		print((str(rev)))
+    page_id = wd.get_page_id(args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+    for rev in wd.get_revisions(page_id, args.depth):
+        print((str(rev)))
 
 
 elif args.dump:
-	print(("Downloading pages to "+args.dump))
-	force_dirs(args.dump)
-	
-	rm = RepoMaintainer(wd, args.dump)
-	rm.debug = args.debug
-	rm.storeRevIds = args.revids
-	rm.buildRevisionList([args.page] if args.page else None, args.depth)
-	rm.openRepo()
-	
-	print("Downloading revisions...")
-	while rm.commitNext():
-		pass
-	
-	rm.cleanup()
-	print("Done.")
+    print(("Downloading pages to "+args.dump))
+    force_dirs(args.dump)
+    
+    rm = RepoMaintainer(wd, args.dump)
+    rm.debug = args.debug
+    rm.storeRevIds = args.revids
+    rm.buildRevisionList([args.page] if args.page else None, args.depth)
+    rm.openRepo()
+    
+    print("Downloading revisions...")
+    while rm.commitNext():
+        pass
+    
+    rm.cleanup()
+    print("Done.")
diff --git a/hgpatch.py b/hgpatch.py
index de363ba..02aed23 100644
--- a/hgpatch.py
+++ b/hgpatch.py
@@ -17,10 +17,10 @@
 old_fromlocal = None
 
 def better_fromlocal(s):
-	if isinstance(s, str):
-		return s.encode('utf-8')
-	global old_fromlocal
-	return old_fromlocal(s)
+    if isinstance(s, str):
+        return s.encode('utf-8')
+    global old_fromlocal
+    return old_fromlocal(s)
 
 old_fromlocal = encoding.fromlocal
 encoding.fromlocal = better_fromlocal
@@ -35,13 +35,13 @@ def better_fromlocal(s):
 old_vfs_call = None
 
 def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False):
-	fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
-	if path.endswith('last-message.txt'):
-		# Create a wrapper like codecs.open does:
-		info = codecs.lookup("utf-8")
-		fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
-		fp.encoding = 'utf-8'
-	return fp
+    fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
+    if path.endswith('last-message.txt'):
+        # Create a wrapper like codecs.open does:
+        info = codecs.lookup("utf-8")
+        fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
+        fp.encoding = 'utf-8'
+    return fp
 
 old_vfs_call = scmutil.vfs.__call__
 scmutil.vfs.__call__ = better_vfs_call
diff --git a/rmaint.py b/rmaint.py
index bfc0c87..d64415d 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -13,251 +13,251 @@
 #   rm.buildRevisionList(pages, depth)
 #   rm.openRepo()
 #   while rm.commitNext():
-#		pass
+#       pass
 #   rm.cleanup()
 
 # Talkative.
 
 class RepoMaintainer:
-	def __init__(self, wikidot, path):
-		# Settings
-		self.wd = wikidot			# Wikidot instance
-		self.path = path			# Path to repository
-		self.debug = False			# = True to enable more printing
-		self.storeRevIds = True		# = True to store .revid with each commit
-		
-		# Internal state
-		self.wrevs = None			# Compiled wikidot revision list (history)
-		
-		self.rev_no	= 0				# Next revision to process
-		self.last_names = {}		# Tracks page renames: name atm -> last name in repo
-		self.last_parents = {}		# Tracks page parent names: name atm -> last parent in repo
-		
-		self.ui = None				# Mercurial UI object
-		self.repo = None			# Mercurial repo object
+    def __init__(self, wikidot, path):
+        # Settings
+        self.wd = wikidot           # Wikidot instance
+        self.path = path            # Path to repository
+        self.debug = False          # = True to enable more printing
+        self.storeRevIds = True     # = True to store .revid with each commit
+        
+        # Internal state
+        self.wrevs = None           # Compiled wikidot revision list (history)
+        
+        self.rev_no = 0             # Next revision to process
+        self.last_names = {}        # Tracks page renames: name atm -> last name in repo
+        self.last_parents = {}      # Tracks page parent names: name atm -> last parent in repo
+        
+        self.ui = None              # Mercurial UI object
+        self.repo = None            # Mercurial repo object
 
 
-	#
-	# Saves and loads revision list from file
-	#
-	def saveWRevs(self):
-		fp = open(self.path+'\\.wrevs', 'wb')
-		pickle.dump(self.wrevs, fp)
-		fp.close()
-	
-	def loadWRevs(self):
-		fp = open(self.path+'\\.wrevs', 'rb')
-		self.wrevs = pickle.load(fp)
-		fp.close()
+    #
+    # Saves and loads revision list from file
+    #
+    def saveWRevs(self):
+        fp = open(self.path+'\\.wrevs', 'wb')
+        pickle.dump(self.wrevs, fp)
+        fp.close()
+    
+    def loadWRevs(self):
+        fp = open(self.path+'\\.wrevs', 'rb')
+        self.wrevs = pickle.load(fp)
+        fp.close()
 
-	#
-	# Compiles a combined revision list for a given set of pages, or all pages on the site.
-	#  pages: compile history for these pages
-	#  depth: download at most this number of revisions.
-	#
-	# If there exists a cached revision list at the repository destination,
-	# it is loaded and no requests are made.
-	#
-	def buildRevisionList(self, pages = None, depth = 10000):
-		if os.path.isfile(self.path+'\\.wrevs'):
-			print("Loading cached revision list...")
-			self.loadWRevs()
-		else:
-			print("Building revision list...")
-			if not pages:
-				pages = self.wd.list_pages(10000)
-			self.wrevs = []
-			for page in pages:
-				print(("Querying page: "+page))
-				page_id = self.wd.get_page_id(page)
-				print(("ID: "+str(page_id)))
-				revs = self.wd.get_revisions(page_id, depth)
-				print(("Revisions: "+str(len(revs))))
-				for rev in revs:
-					self.wrevs.append({
-					  'page_id' : page_id,
-					  'page_name' : page, # name atm, not at revision time
-					  'rev_id' : rev['id'],
-					  'date' : rev['date'],
-					  'user' : rev['user'],
-					  'comment' : rev['comment'],
-					})
-			self.saveWRevs() # Save a cached copy
-			print("")
-		
-		
-		print(("Total revisions: "+str(len(self.wrevs))))
-		
-		print("Sorting revisions...")
-		self.wrevs.sort(key=lambda rev: rev['date'])
-		print("")
-		
-		if self.debug:
-			print("Revision list: ")
-			for rev in self.wrevs:
-				print((str(rev)+"\n"))
-			print("")
+    #
+    # Compiles a combined revision list for a given set of pages, or all pages on the site.
+    #  pages: compile history for these pages
+    #  depth: download at most this number of revisions.
+    #
+    # If there exists a cached revision list at the repository destination,
+    # it is loaded and no requests are made.
+    #
+    def buildRevisionList(self, pages = None, depth = 10000):
+        if os.path.isfile(self.path+'\\.wrevs'):
+            print("Loading cached revision list...")
+            self.loadWRevs()
+        else:
+            print("Building revision list...")
+            if not pages:
+                pages = self.wd.list_pages(10000)
+            self.wrevs = []
+            for page in pages:
+                print(("Querying page: "+page))
+                page_id = self.wd.get_page_id(page)
+                print(("ID: "+str(page_id)))
+                revs = self.wd.get_revisions(page_id, depth)
+                print(("Revisions: "+str(len(revs))))
+                for rev in revs:
+                    self.wrevs.append({
+                      'page_id' : page_id,
+                      'page_name' : page, # name atm, not at revision time
+                      'rev_id' : rev['id'],
+                      'date' : rev['date'],
+                      'user' : rev['user'],
+                      'comment' : rev['comment'],
+                    })
+            self.saveWRevs() # Save a cached copy
+            print("")
+        
+        
+        print(("Total revisions: "+str(len(self.wrevs))))
+        
+        print("Sorting revisions...")
+        self.wrevs.sort(key=lambda rev: rev['date'])
+        print("")
+        
+        if self.debug:
+            print("Revision list: ")
+            for rev in self.wrevs:
+                print((str(rev)+"\n"))
+            print("")
 
 
-	#
-	# Saves and loads operational state from file
-	#
-	def saveState(self):
-		fp = open(self.path+'\\.wstate', 'wb')
-		pickle.dump(self.rev_no, fp)
-		pickle.dump(self.last_names, fp)
-		pickle.dump(self.last_parents, fp)
-		fp.close()
-	
-	def loadState(self):
-		fp = open(self.path+'\\.wstate', 'rb')
-		self.rev_no = pickle.load(fp)
-		self.last_names = pickle.load(fp)
-		try:
-			self.last_parents = pickle.load(fp)
-		except EOFError:
-			pass
-		fp.close()
+    #
+    # Saves and loads operational state from file
+    #
+    def saveState(self):
+        fp = open(self.path+'\\.wstate', 'wb')
+        pickle.dump(self.rev_no, fp)
+        pickle.dump(self.last_names, fp)
+        pickle.dump(self.last_parents, fp)
+        fp.close()
+    
+    def loadState(self):
+        fp = open(self.path+'\\.wstate', 'rb')
+        self.rev_no = pickle.load(fp)
+        self.last_names = pickle.load(fp)
+        try:
+            self.last_parents = pickle.load(fp)
+        except EOFError:
+            pass
+        fp.close()
 
 
-	#
-	# Initializes the construction process, after the revision list has been compiled.
-	# Either creates a new repo, or loads the existing one at the target path
-	# and restores its construction state.
-	#
-	def openRepo(self):
-		# Create a new repository or continue from aborted dump
-		self.ui=ui.ui()
-		self.last_names = {} # Tracks page renames: name atm -> last name in repo
-		self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
-		
-		if os.path.isfile(self.path+'\\.wstate'):
-			print("Continuing from aborted dump state...")
-			self.loadState()
-			self.repo = hg.repository(self.ui, self.path)
-		
-		else: # create a new repository (will fail if one exists)
-			print("Initializing repository...")
-			commands.init(self.ui, self.path)
-			self.repo = hg.repository(self.ui, self.path)
-			self.rev_no = 0
-			
-			if self.storeRevIds:
-				# Add revision id file to the new repo
-				fname = self.path+'\\.revid'
-				codecs.open(fname, "w", "UTF-8").close()
-				commands.add(self.ui, self.repo, str(fname))
-	
-	
-	#
-	# Takes an unprocessed revision from a revision log, fetches its data and commits it.
-	# Returns false if no unprocessed revisions remain.
-	#
-	def commitNext(self):
-		if self.rev_no >= len(self.wrevs):
-			return False
-			
-		rev = self.wrevs[self.rev_no]
-		source = self.wd.get_revision_source(rev['rev_id'])
-		# Page title and unix_name changes are only available through another request:
-		details = self.wd.get_revision_version(rev['rev_id'])
-		
-		# Store revision_id for last commit
-		# Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
-		if self.storeRevIds:
-			fname = self.path+'\\.revid'
-			outp = codecs.open(fname, "w", "UTF-8")
-			outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
-			outp.close()
-		
-		unixname = rev['page_name']
-		rev_unixname = details['unixname'] # may be different in revision than atm
-		
-		# Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
-		# The only way to know they were changed is revision comments, though evil people may trick us.
-		if rev['comment'].startswith('Parent page set to: "'):
-			# This is a parenting revision, remember the new parent
-			parent_unixname = rev['comment'][21:-2]
-			self.last_parents[unixname] = parent_unixname
-		else:
-			# Else use last parent_unixname we've recorded
-			parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
-		# There are also problems when parent page gets renamed -- see updateChildren
-		
-		# If the page is tracked and its name just changed, tell HG
-		rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
-		if rename:
-			self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
-			commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
-		
-		# Ouput contents
-		fname = self.path+'\\'+rev_unixname+'.txt'
-		outp = codecs.open(fname, "w", "UTF-8")
-		if details['title']:
-			outp.write('title:'+details['title']+'\n')
-		if parent_unixname:
-			outp.write('parent:'+parent_unixname+'\n')
-		outp.write(source)
-		outp.close()
-		
-		# Add new page
-		if not unixname in self.last_names: # never before seen
-			commands.add(self.ui, self.repo, str(fname))
+    #
+    # Initializes the construction process, after the revision list has been compiled.
+    # Either creates a new repo, or loads the existing one at the target path
+    # and restores its construction state.
+    #
+    def openRepo(self):
+        # Create a new repository or continue from aborted dump
+        self.ui=ui.ui()
+        self.last_names = {} # Tracks page renames: name atm -> last name in repo
+        self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
+        
+        if os.path.isfile(self.path+'\\.wstate'):
+            print("Continuing from aborted dump state...")
+            self.loadState()
+            self.repo = hg.repository(self.ui, self.path)
+        
+        else: # create a new repository (will fail if one exists)
+            print("Initializing repository...")
+            commands.init(self.ui, self.path)
+            self.repo = hg.repository(self.ui, self.path)
+            self.rev_no = 0
+            
+            if self.storeRevIds:
+                # Add revision id file to the new repo
+                fname = self.path+'\\.revid'
+                codecs.open(fname, "w", "UTF-8").close()
+                commands.add(self.ui, self.repo, str(fname))
+    
+    
+    #
+    # Takes an unprocessed revision from a revision log, fetches its data and commits it.
+    # Returns false if no unprocessed revisions remain.
+    #
+    def commitNext(self):
+        if self.rev_no >= len(self.wrevs):
+            return False
+            
+        rev = self.wrevs[self.rev_no]
+        source = self.wd.get_revision_source(rev['rev_id'])
+        # Page title and unix_name changes are only available through another request:
+        details = self.wd.get_revision_version(rev['rev_id'])
+        
+        # Store revision_id for last commit
+        # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
+        if self.storeRevIds:
+            fname = self.path+'\\.revid'
+            outp = codecs.open(fname, "w", "UTF-8")
+            outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
+            outp.close()
+        
+        unixname = rev['page_name']
+        rev_unixname = details['unixname'] # may be different in revision than atm
+        
+        # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
+        # The only way to know they were changed is revision comments, though evil people may trick us.
+        if rev['comment'].startswith('Parent page set to: "'):
+            # This is a parenting revision, remember the new parent
+            parent_unixname = rev['comment'][21:-2]
+            self.last_parents[unixname] = parent_unixname
+        else:
+            # Else use last parent_unixname we've recorded
+            parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
+        # There are also problems when parent page gets renamed -- see updateChildren
+        
+        # If the page is tracked and its name just changed, tell HG
+        rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
+        if rename:
+            self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
+            commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
+        
+        # Ouput contents
+        fname = self.path+'\\'+rev_unixname+'.txt'
+        outp = codecs.open(fname, "w", "UTF-8")
+        if details['title']:
+            outp.write('title:'+details['title']+'\n')
+        if parent_unixname:
+            outp.write('parent:'+parent_unixname+'\n')
+        outp.write(source)
+        outp.close()
+        
+        # Add new page
+        if not unixname in self.last_names: # never before seen
+            commands.add(self.ui, self.repo, str(fname))
 
-		self.last_names[unixname] = rev_unixname
+        self.last_names[unixname] = rev_unixname
 
-		# Commit
-		if rev['comment'] != '':
-			commit_msg = rev_unixname + ': ' + rev['comment']
-		else:
-			commit_msg = rev_unixname
-		if rev['date']:
-			commit_date = str(rev['date']) + ' 0'
-		else:
-			commit_date = None
-		print(("Commiting: "+str(self.rev_no)+'. '+commit_msg))
+        # Commit
+        if rev['comment'] != '':
+            commit_msg = rev_unixname + ': ' + rev['comment']
+        else:
+            commit_msg = rev_unixname
+        if rev['date']:
+            commit_date = str(rev['date']) + ' 0'
+        else:
+            commit_date = None
+        print(("Commiting: "+str(self.rev_no)+'. '+commit_msg))
 
-		commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
-		self.rev_no += 1
+        commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
+        self.rev_no += 1
 
-		self.saveState() # Update operation state
-		return True
+        self.saveState() # Update operation state
+        return True
 
 
-	#
-	# Updates all children of the page to reflect parent's unixname change.
-	#
-	# Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
-	# A parent may then be renamed.
-	# Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
-	#
-	# Therefore, on every rename we must update all linked children in the same revision.
-	#
-	def updateChildren(self, oldunixname, newunixname):
-		for child in list(self.last_parents.keys()):
-			if self.last_parents[child] == oldunixname:
-				self.updateParentField(child, self.last_parents[child], newunixname)
-	
-	#
-	# Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
-	# The rest of the file is preserved.
-	#
-	def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-		with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
-			content = f.readlines()
-		# Since this is all tracked by us, we KNOW there's a line in standard format somewhere
-		idx = content.index('parent:'+parent_oldunixname+'\n')
-		if idx < 0:
-			raise Exception("Cannot update child page "+child_unixname+": "
-				+"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
-		content[idx] = 'parent:'+parent_newunixname+'\n'
-		with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
-			f.writelines(content)
+    #
+    # Updates all children of the page to reflect parent's unixname change.
+    #
+    # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
+    # A parent may then be renamed.
+    # Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
+    #
+    # Therefore, on every rename we must update all linked children in the same revision.
+    #
+    def updateChildren(self, oldunixname, newunixname):
+        for child in list(self.last_parents.keys()):
+            if self.last_parents[child] == oldunixname:
+                self.updateParentField(child, self.last_parents[child], newunixname)
+    
+    #
+    # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
+    # The rest of the file is preserved.
+    #
+    def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
+        with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
+            content = f.readlines()
+        # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
+        idx = content.index('parent:'+parent_oldunixname+'\n')
+        if idx < 0:
+            raise Exception("Cannot update child page "+child_unixname+": "
+                +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
+        content[idx] = 'parent:'+parent_newunixname+'\n'
+        with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
+            f.writelines(content)
 
 
-	#
-	# Finalizes the construction process and deletes any temporary files.
-	#
-	def cleanup(self):
-		os.remove(self.path+'\\.wstate')
-		os.remove(self.path+'\\.wrevs')
\ No newline at end of file
+    #
+    # Finalizes the construction process and deletes any temporary files.
+    #
+    def cleanup(self):
+        os.remove(self.path+'\\.wstate')
+        os.remove(self.path+'\\.wrevs')
\ No newline at end of file
diff --git a/wikidot.py b/wikidot.py
index ba1c218..4760f5f 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -7,187 +7,187 @@
 
 
 class Wikidot:
-	def __init__(self, site):
-		self.site = site		# Wikidot site to query
-		self.delay = 200		# Delay between requests in msec
-		self.debug = False		# Print debug messages
-		self.next_timeslot = time.clock()	# Can call immediately
-
-
-	# To honor usage rules, we wait for self.delay between requests.
-	# Low-level query functions call this before every request to Wikidot./
-	def _wait_request_slot(self):
-		tm = time.clock()
-		if self.next_timeslot - tm > 0:
-			time.sleep(self.next_timeslot - tm)
-		self.next_timeslot = tm + self.delay / 1000
-		pass
-
-	# Makes a Wikidot AJAX query. Returns the response+title or throws an error.
-	def queryex(self, params):
-		token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
-		cookies = {"wikidot_token7": token}
-		params['wikidot_token7'] = token
-	
-		if self.debug:
-			print(params)
-			print(cookies)
-
-		self._wait_request_slot()
-		req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
-		json = req.json()
-		if json['status'] == 'ok':
-			return json['body'], (json['title'] if 'title' in json else '')
-		else:
-			raise req.text
-
-	# Same but only returns the body, most responses don't have titles
-	def query(self, params):
-		return self.queryex(params)[0]
-
-
-	# List all pages for the site.
-
-	# Raw version
-	# For the supported formats (module_body) see:
-	# See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
-	def list_pages_raw(self, limit):
-		res = self.query({
-		  'moduleName': 'list/ListPagesModule',
-		  'limit': limit if limit else '10000',
-		  'perPage': limit if limit else '10000',
-		  'module_body': '%%page_unix_name%%',
-		  'separate': 'false',
-		  'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
-		})
-		return res
-
-	# Client version
-	def list_pages(self, limit):
-		raw = self.list_pages_raw(limit).replace('<br/>',"\n")
-		soup = BeautifulSoup(raw, 'html.parser')
-		pages = []
-		for entry in soup.div.p.text.split('\n'):
-			pages.append(entry)
-		return pages
-
-
-	# Retrieves internal page_id by page unix_name.
-	# Page IDs are required for most of page functions.
-
-	def get_page_id(self, page_unix_name):
-		# The only freaking way to get page ID is to load the page! Wikidot!
-		self._wait_request_slot()
-		req = requests.request('GET', self.site+'/'+page_unix_name)
-		soup = BeautifulSoup(req.text, 'html.parser')
-		for item in soup.head.find_all('script'):
-			text = item.text
-			pos = text.find("WIKIREQUEST.info.pageId = ")
-			if pos >= 0:
-				pos += len("WIKIREQUEST.info.pageId = ")
-				crlf = text.find(";", pos)
-				if crlf >= 0:
-					return int(text[pos:crlf])
-				else:
-					return int(text[pos:])
-		return None
-
-
-	# Retrieves a list of revisions for a page.
-	# See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
-	# Raw version
-	def get_revisions_raw(self, page_id, limit):
-		res = self.query({
-		  'moduleName': 'history/PageRevisionListModule',
-		  'page_id': page_id,
-		  'page': '1',
-		  'perpage': limit if limit else '10000',
-		  'options': '{"all":true}'
-		})
-		
-		soup = BeautifulSoup(res, 'html.parser')
-		return soup.table.contents
-
-	# Client version
-	def get_revisions(self, page_id, limit):
-		revs = []
-		for tr in self.get_revisions_raw(page_id, limit):
-			if tr.name != 'tr': continue # there's a header + various junk
-
-			# RevID is stored as a value of an INPUT field
-			rev_id = tr.input['value'] if tr.input else None
-			if rev_id is None: continue # can't parse
-
-			# Unixtime is stored as a CSS class time_*
-			rev_date = 0
-			date_span = tr.find("span", attrs={"class": "odate"})
-			if date_span is not None:
-				for cls in date_span['class']:
-					if cls.startswith('time_'):
-						rev_date = int(cls[5:])
-
-			# Username in a last <a> under <span class="printuser">
-			user_span = tr.find("span", attrs={"class": "printuser"})
-			for last_a in user_span.find_all('a'): pass
-			rev_user = last_a.getText() if last_a else None
-			
-
-			# Comment is in the last TD of the row
-			last_td = None
-			for last_td in tr.find_all('td'): pass
-			rev_comment = last_td.getText() if last_td else ""
-
-			revs.append({
-				'id': rev_id,
-				'date': rev_date,
-				'user': rev_user,
-				'comment': rev_comment,
-			})
-		return revs
-
-
-	# Retrieves revision source for a revision.
-	# There's no raw version because there's nothing else in raw.
-	def get_revision_source(self, rev_id):
-		res = self.query({
-		  'moduleName': 'history/PageSourceModule',
-		  'revision_id': rev_id,
-		  # We don't need page id
-		})
-		# The source is HTMLified but BeautifulSoup's getText() will decode that
-		# - htmlentities
-		# - <br/>s in place of linebreaks
-		# - random real linebreaks (have to be ignored)
-		soup = BeautifulSoup(res, 'html.parser')
-		return soup.div.getText().lstrip(' \r\n')
-	
-	# Retrieves the rendered version + additional info unavailable in get_revision_source:
-	# * Title
-	# * Unixname at the time
-	def get_revision_version_raw(self, rev_id):
-		res = self.queryex({
-		  'moduleName': 'history/PageVersionModule',
-		  'revision_id': rev_id,
-		})
-		return res
-	
-	def get_revision_version(self, rev_id):
-		res = self.get_revision_version_raw(rev_id) # this has title!
-		soup = BeautifulSoup(res[0], 'html.parser')
-
-		# First table is a flyout with revision details. Remove and study it.
-		unixname = None
-		details = soup.find("div", attrs={"id": "page-version-info"}).extract()
-		for tr in details.find_all('tr'):
-			tds = tr.find_all('td')
-			if len(tds) < 2: continue
-			if tds[0].getText().strip() == 'Page name:':
-				unixname = tds[1].getText().strip()
-
-		return {
-		  'rev_id': rev_id,
-		  'unixname': unixname,
-		  'title': res[1],
-		  'content': str(soup), # only content remains
-		}
\ No newline at end of file
+    def __init__(self, site):
+        self.site = site        # Wikidot site to query
+        self.delay = 200        # Delay between requests in msec
+        self.debug = False      # Print debug messages
+        self.next_timeslot = time.clock()   # Can call immediately
+
+
+    # To honor usage rules, we wait for self.delay between requests.
+    # Low-level query functions call this before every request to Wikidot./
+    def _wait_request_slot(self):
+        tm = time.clock()
+        if self.next_timeslot - tm > 0:
+            time.sleep(self.next_timeslot - tm)
+        self.next_timeslot = tm + self.delay / 1000
+        pass
+
+    # Makes a Wikidot AJAX query. Returns the response+title or throws an error.
+    def queryex(self, params):
+        token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
+        cookies = {"wikidot_token7": token}
+        params['wikidot_token7'] = token
+    
+        if self.debug:
+            print(params)
+            print(cookies)
+
+        self._wait_request_slot()
+        req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
+        json = req.json()
+        if json['status'] == 'ok':
+            return json['body'], (json['title'] if 'title' in json else '')
+        else:
+            raise req.text
+
+    # Same but only returns the body, most responses don't have titles
+    def query(self, params):
+        return self.queryex(params)[0]
+
+
+    # List all pages for the site.
+
+    # Raw version
+    # For the supported formats (module_body) see:
+    # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
+    def list_pages_raw(self, limit):
+        res = self.query({
+          'moduleName': 'list/ListPagesModule',
+          'limit': limit if limit else '10000',
+          'perPage': limit if limit else '10000',
+          'module_body': '%%page_unix_name%%',
+          'separate': 'false',
+          'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
+        })
+        return res
+
+    # Client version
+    def list_pages(self, limit):
+        raw = self.list_pages_raw(limit).replace('<br/>',"\n")
+        soup = BeautifulSoup(raw, 'html.parser')
+        pages = []
+        for entry in soup.div.p.text.split('\n'):
+            pages.append(entry)
+        return pages
+
+
+    # Retrieves internal page_id by page unix_name.
+    # Page IDs are required for most of page functions.
+
+    def get_page_id(self, page_unix_name):
+        # The only freaking way to get page ID is to load the page! Wikidot!
+        self._wait_request_slot()
+        req = requests.request('GET', self.site+'/'+page_unix_name)
+        soup = BeautifulSoup(req.text, 'html.parser')
+        for item in soup.head.find_all('script'):
+            text = item.text
+            pos = text.find("WIKIREQUEST.info.pageId = ")
+            if pos >= 0:
+                pos += len("WIKIREQUEST.info.pageId = ")
+                crlf = text.find(";", pos)
+                if crlf >= 0:
+                    return int(text[pos:crlf])
+                else:
+                    return int(text[pos:])
+        return None
+
+
+    # Retrieves a list of revisions for a page.
+    # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+    # Raw version
+    def get_revisions_raw(self, page_id, limit):
+        res = self.query({
+          'moduleName': 'history/PageRevisionListModule',
+          'page_id': page_id,
+          'page': '1',
+          'perpage': limit if limit else '10000',
+          'options': '{"all":true}'
+        })
+        
+        soup = BeautifulSoup(res, 'html.parser')
+        return soup.table.contents
+
+    # Client version
+    def get_revisions(self, page_id, limit):
+        revs = []
+        for tr in self.get_revisions_raw(page_id, limit):
+            if tr.name != 'tr': continue # there's a header + various junk
+
+            # RevID is stored as a value of an INPUT field
+            rev_id = tr.input['value'] if tr.input else None
+            if rev_id is None: continue # can't parse
+
+            # Unixtime is stored as a CSS class time_*
+            rev_date = 0
+            date_span = tr.find("span", attrs={"class": "odate"})
+            if date_span is not None:
+                for cls in date_span['class']:
+                    if cls.startswith('time_'):
+                        rev_date = int(cls[5:])
+
+            # Username in a last <a> under <span class="printuser">
+            user_span = tr.find("span", attrs={"class": "printuser"})
+            for last_a in user_span.find_all('a'): pass
+            rev_user = last_a.getText() if last_a else None
+            
+
+            # Comment is in the last TD of the row
+            last_td = None
+            for last_td in tr.find_all('td'): pass
+            rev_comment = last_td.getText() if last_td else ""
+
+            revs.append({
+                'id': rev_id,
+                'date': rev_date,
+                'user': rev_user,
+                'comment': rev_comment,
+            })
+        return revs
+
+
+    # Retrieves revision source for a revision.
+    # There's no raw version because there's nothing else in raw.
+    def get_revision_source(self, rev_id):
+        res = self.query({
+          'moduleName': 'history/PageSourceModule',
+          'revision_id': rev_id,
+          # We don't need page id
+        })
+        # The source is HTMLified but BeautifulSoup's getText() will decode that
+        # - htmlentities
+        # - <br/>s in place of linebreaks
+        # - random real linebreaks (have to be ignored)
+        soup = BeautifulSoup(res, 'html.parser')
+        return soup.div.getText().lstrip(' \r\n')
+    
+    # Retrieves the rendered version + additional info unavailable in get_revision_source:
+    # * Title
+    # * Unixname at the time
+    def get_revision_version_raw(self, rev_id):
+        res = self.queryex({
+          'moduleName': 'history/PageVersionModule',
+          'revision_id': rev_id,
+        })
+        return res
+    
+    def get_revision_version(self, rev_id):
+        res = self.get_revision_version_raw(rev_id) # this has title!
+        soup = BeautifulSoup(res[0], 'html.parser')
+
+        # First table is a flyout with revision details. Remove and study it.
+        unixname = None
+        details = soup.find("div", attrs={"id": "page-version-info"}).extract()
+        for tr in details.find_all('tr'):
+            tds = tr.find_all('td')
+            if len(tds) < 2: continue
+            if tds[0].getText().strip() == 'Page name:':
+                unixname = tds[1].getText().strip()
+
+        return {
+          'rev_id': rev_id,
+          'unixname': unixname,
+          'title': res[1],
+          'content': str(soup), # only content remains
+        }
\ No newline at end of file

From 31421883b307c66139a65ff5bb73792e2b8ebcec Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 14:01:36 +0200
Subject: [PATCH 07/93] don't kill stdout and stderr

---
 crawl.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/crawl.py b/crawl.py
index 5ca5dfe..409eed1 100644
--- a/crawl.py
+++ b/crawl.py
@@ -11,11 +11,6 @@
 # TODO: Ability to download new transactions since last dump.
 #   We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
 
-rawStdout = sys.stdout
-rawStderr = sys.stderr
-sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace')
-sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace')
-
 parser = argparse.ArgumentParser(description='Queries Wikidot')
 parser.add_argument('site', help='URL of Wikidot site')
 # Actions

From 19c0bc83b1aa10ae01ca707c851a61b75e3361e8 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:15:02 +0200
Subject: [PATCH 08/93] os.errno doesn't exist anymore in python3, but exist_ok
 does

---
 crawl.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/crawl.py b/crawl.py
index 409eed1..9fd0bab 100644
--- a/crawl.py
+++ b/crawl.py
@@ -38,11 +38,7 @@
 
 
 def force_dirs(path):
-    try:
-        os.makedirs(path)
-    except OSError as exception:
-        if exception.errno != os.errno.EEXIST:
-            raise exception
+    os.makedirs(path, exist_ok=True)
 
 if args.list_pages_raw:
     print((wd.list_pages_raw(args.depth)))
@@ -98,16 +94,16 @@ def force_dirs(path):
 elif args.dump:
     print(("Downloading pages to "+args.dump))
     force_dirs(args.dump)
-    
+
     rm = RepoMaintainer(wd, args.dump)
     rm.debug = args.debug
     rm.storeRevIds = args.revids
     rm.buildRevisionList([args.page] if args.page else None, args.depth)
     rm.openRepo()
-    
+
     print("Downloading revisions...")
     while rm.commitNext():
         pass
-    
+
     rm.cleanup()
     print("Done.")

From a69ef7cdc32463e87d169d0012ce39070100e43f Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:18:16 +0200
Subject: [PATCH 09/93] port to git, mercurial doesn't have a python3 API (at
 least not stable enough to be available by default)

---
 rmaint.py  | 75 ++++++++++++++++++++++++++++++++----------------------
 wikidot.py | 15 ++++++-----
 2 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index d64415d..dc56153 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -1,9 +1,14 @@
+import wikidot
+
+# Basic python stuff
 import os
 import codecs
-from mercurial import commands, ui, hg
-import hgpatch
 import pickle as pickle
-import wikidot
+
+# git stuff
+from git import Repo, Actor
+import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps
+import re # For sanitizing usernames to fake email addresses
 
 # Repository builder and maintainer
 # Contains logic for actual loading and maintaining the repository over the course of its construction.
@@ -25,16 +30,16 @@ def __init__(self, wikidot, path):
         self.path = path            # Path to repository
         self.debug = False          # = True to enable more printing
         self.storeRevIds = True     # = True to store .revid with each commit
-        
+
         # Internal state
         self.wrevs = None           # Compiled wikidot revision list (history)
-        
+
         self.rev_no = 0             # Next revision to process
         self.last_names = {}        # Tracks page renames: name atm -> last name in repo
         self.last_parents = {}      # Tracks page parent names: name atm -> last parent in repo
-        
-        self.ui = None              # Mercurial UI object
-        self.repo = None            # Mercurial repo object
+
+        self.repo = None            # Git repo object
+        self.index = None           # Git current index object
 
 
     #
@@ -127,28 +132,28 @@ def loadState(self):
     #
     def openRepo(self):
         # Create a new repository or continue from aborted dump
-        self.ui=ui.ui()
         self.last_names = {} # Tracks page renames: name atm -> last name in repo
         self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
         
         if os.path.isfile(self.path+'\\.wstate'):
             print("Continuing from aborted dump state...")
             self.loadState()
-            self.repo = hg.repository(self.ui, self.path)
-        
+            self.repo = Repo(self.path)
+            assert not self.repo.bare
+
         else: # create a new repository (will fail if one exists)
             print("Initializing repository...")
-            commands.init(self.ui, self.path)
-            self.repo = hg.repository(self.ui, self.path)
+            self.repo = Repo.init(self.path)
             self.rev_no = 0
-            
+
             if self.storeRevIds:
                 # Add revision id file to the new repo
-                fname = self.path+'\\.revid'
-                codecs.open(fname, "w", "UTF-8").close()
-                commands.add(self.ui, self.repo, str(fname))
-    
-    
+                fname = '/.revid'
+                codecs.open(self.path + fname, "w", "UTF-8").close()
+                self.repo.index.add([fname])
+                self.index.commit("Initial creation of repo")
+        self.index = self.repo.index
+
     #
     # Takes an unprocessed revision from a revision log, fetches its data and commits it.
     # Returns false if no unprocessed revisions remain.
@@ -156,23 +161,23 @@ def openRepo(self):
     def commitNext(self):
         if self.rev_no >= len(self.wrevs):
             return False
-            
+
         rev = self.wrevs[self.rev_no]
         source = self.wd.get_revision_source(rev['rev_id'])
         # Page title and unix_name changes are only available through another request:
         details = self.wd.get_revision_version(rev['rev_id'])
-        
+
         # Store revision_id for last commit
-        # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
+        # Without this, empty commits (e.g. file uploads) will be skipped by Git
         if self.storeRevIds:
             fname = self.path+'\\.revid'
             outp = codecs.open(fname, "w", "UTF-8")
             outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
             outp.close()
-        
+
         unixname = rev['page_name']
         rev_unixname = details['unixname'] # may be different in revision than atm
-        
+
         # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
         # The only way to know they were changed is revision comments, though evil people may trick us.
         if rev['comment'].startswith('Parent page set to: "'):
@@ -183,13 +188,13 @@ def commitNext(self):
             # Else use last parent_unixname we've recorded
             parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
         # There are also problems when parent page gets renamed -- see updateChildren
-        
+
         # If the page is tracked and its name just changed, tell HG
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
         if rename:
             self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
-            commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
-        
+            self.index.move([str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt'])
+
         # Ouput contents
         fname = self.path+'\\'+rev_unixname+'.txt'
         outp = codecs.open(fname, "w", "UTF-8")
@@ -199,10 +204,13 @@ def commitNext(self):
             outp.write('parent:'+parent_unixname+'\n')
         outp.write(source)
         outp.close()
-        
+
         # Add new page
         if not unixname in self.last_names: # never before seen
-            commands.add(self.ui, self.repo, str(fname))
+            if self.debug:
+                print("adding", fname)
+
+            self.index.add([str(fname)])
 
         self.last_names[unixname] = rev_unixname
 
@@ -212,12 +220,17 @@ def commitNext(self):
         else:
             commit_msg = rev_unixname
         if rev['date']:
-            commit_date = str(rev['date']) + ' 0'
+            parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT
+            commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time)
         else:
             commit_date = None
         print(("Commiting: "+str(self.rev_no)+'. '+commit_msg))
 
-        commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
+        username = str(rev['user'])
+        email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename
+
+        author = Actor(username, email)
+        commit = self.index.commit(commit_msg, author=author, commit_date=commit_date)
         self.rev_no += 1
 
         self.saveState() # Update operation state
diff --git a/wikidot.py b/wikidot.py
index 4760f5f..ba34f39 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -2,6 +2,7 @@
 import random
 from bs4 import BeautifulSoup
 import time
+from urllib.parse import urlparse
 
 # Implements various queries to Wikidot engine through its AJAX facilities
 
@@ -9,6 +10,7 @@
 class Wikidot:
     def __init__(self, site):
         self.site = site        # Wikidot site to query
+        self.sitename = urlparse(site).hostname.lower()
         self.delay = 200        # Delay between requests in msec
         self.debug = False      # Print debug messages
         self.next_timeslot = time.clock()   # Can call immediately
@@ -28,7 +30,7 @@ def queryex(self, params):
         token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
         cookies = {"wikidot_token7": token}
         params['wikidot_token7'] = token
-    
+
         if self.debug:
             print(params)
             print(cookies)
@@ -36,6 +38,7 @@ def queryex(self, params):
         self._wait_request_slot()
         req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
         json = req.json()
+
         if json['status'] == 'ok':
             return json['body'], (json['title'] if 'title' in json else '')
         else:
@@ -105,7 +108,7 @@ def get_revisions_raw(self, page_id, limit):
           'perpage': limit if limit else '10000',
           'options': '{"all":true}'
         })
-        
+
         soup = BeautifulSoup(res, 'html.parser')
         return soup.table.contents
 
@@ -131,7 +134,7 @@ def get_revisions(self, page_id, limit):
             user_span = tr.find("span", attrs={"class": "printuser"})
             for last_a in user_span.find_all('a'): pass
             rev_user = last_a.getText() if last_a else None
-            
+
 
             # Comment is in the last TD of the row
             last_td = None
@@ -161,7 +164,7 @@ def get_revision_source(self, rev_id):
         # - random real linebreaks (have to be ignored)
         soup = BeautifulSoup(res, 'html.parser')
         return soup.div.getText().lstrip(' \r\n')
-    
+
     # Retrieves the rendered version + additional info unavailable in get_revision_source:
     # * Title
     # * Unixname at the time
@@ -171,7 +174,7 @@ def get_revision_version_raw(self, rev_id):
           'revision_id': rev_id,
         })
         return res
-    
+
     def get_revision_version(self, rev_id):
         res = self.get_revision_version_raw(rev_id) # this has title!
         soup = BeautifulSoup(res[0], 'html.parser')
@@ -190,4 +193,4 @@ def get_revision_version(self, rev_id):
           'unixname': unixname,
           'title': res[1],
           'content': str(soup), # only content remains
-        }
\ No newline at end of file
+        }

From 8d9232d7f99f41bd6d74225bf8662885fce7c330 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:18:27 +0200
Subject: [PATCH 10/93] use proper path separators

---
 rmaint.py | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index dc56153..d5edec3 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -46,12 +46,12 @@ def __init__(self, wikidot, path):
     # Saves and loads revision list from file
     #
     def saveWRevs(self):
-        fp = open(self.path+'\\.wrevs', 'wb')
+        fp = open(self.path+'/.wrevs', 'wb')
         pickle.dump(self.wrevs, fp)
         fp.close()
-    
+
     def loadWRevs(self):
-        fp = open(self.path+'\\.wrevs', 'rb')
+        fp = open(self.path+'/.wrevs', 'rb')
         self.wrevs = pickle.load(fp)
         fp.close()
 
@@ -64,7 +64,7 @@ def loadWRevs(self):
     # it is loaded and no requests are made.
     #
     def buildRevisionList(self, pages = None, depth = 10000):
-        if os.path.isfile(self.path+'\\.wrevs'):
+        if os.path.isfile(self.path+'/.wrevs'):
             print("Loading cached revision list...")
             self.loadWRevs()
         else:
@@ -108,14 +108,14 @@ def buildRevisionList(self, pages = None, depth = 10000):
     # Saves and loads operational state from file
     #
     def saveState(self):
-        fp = open(self.path+'\\.wstate', 'wb')
+        fp = open(self.path+'/.wstate', 'wb')
         pickle.dump(self.rev_no, fp)
         pickle.dump(self.last_names, fp)
         pickle.dump(self.last_parents, fp)
         fp.close()
     
     def loadState(self):
-        fp = open(self.path+'\\.wstate', 'rb')
+        fp = open(self.path+'/.wstate', 'rb')
         self.rev_no = pickle.load(fp)
         self.last_names = pickle.load(fp)
         try:
@@ -134,8 +134,8 @@ def openRepo(self):
         # Create a new repository or continue from aborted dump
         self.last_names = {} # Tracks page renames: name atm -> last name in repo
         self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
-        
-        if os.path.isfile(self.path+'\\.wstate'):
+
+        if os.path.isfile(self.path+'/.wstate'):
             print("Continuing from aborted dump state...")
             self.loadState()
             self.repo = Repo(self.path)
@@ -170,7 +170,7 @@ def commitNext(self):
         # Store revision_id for last commit
         # Without this, empty commits (e.g. file uploads) will be skipped by Git
         if self.storeRevIds:
-            fname = self.path+'\\.revid'
+            fname = self.path+'/.revid'
             outp = codecs.open(fname, "w", "UTF-8")
             outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
             outp.close()
@@ -192,12 +192,15 @@ def commitNext(self):
         # If the page is tracked and its name just changed, tell HG
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
         if rename:
+            if self.debug:
+                print("moving", str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt')
+
             self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
             self.index.move([str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt'])
 
         # Ouput contents
-        fname = self.path+'\\'+rev_unixname+'.txt'
-        outp = codecs.open(fname, "w", "UTF-8")
+        fname = rev_unixname+'.txt'
+        outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
         if details['title']:
             outp.write('title:'+details['title']+'\n')
         if parent_unixname:
@@ -233,6 +236,9 @@ def commitNext(self):
         commit = self.index.commit(commit_msg, author=author, commit_date=commit_date)
         self.rev_no += 1
 
+        if self.debug:
+            print('committed', commit.name_rev, 'by', author)
+
         self.saveState() # Update operation state
         return True
 
@@ -250,13 +256,13 @@ def updateChildren(self, oldunixname, newunixname):
         for child in list(self.last_parents.keys()):
             if self.last_parents[child] == oldunixname:
                 self.updateParentField(child, self.last_parents[child], newunixname)
-    
+
     #
     # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
     # The rest of the file is preserved.
     #
     def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-        with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
+        with codecs.open(self.path+'/'+child_unixname+'.txt', "r", "UTF-8") as f:
             content = f.readlines()
         # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
         idx = content.index('parent:'+parent_oldunixname+'\n')
@@ -264,7 +270,7 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna
             raise Exception("Cannot update child page "+child_unixname+": "
                 +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
         content[idx] = 'parent:'+parent_newunixname+'\n'
-        with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
+        with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f:
             f.writelines(content)
 
 
@@ -272,5 +278,5 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna
     # Finalizes the construction process and deletes any temporary files.
     #
     def cleanup(self):
-        os.remove(self.path+'\\.wstate')
-        os.remove(self.path+'\\.wrevs')
\ No newline at end of file
+        os.remove(self.path+'/.wstate')
+        os.remove(self.path+'/.wrevs')

From 4990468370d7abc43dae77f77434493900632821 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:18:49 +0200
Subject: [PATCH 11/93] don't need the mercurial monkeypatching anymore

---
 hgpatch.py | 50 --------------------------------------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 hgpatch.py

diff --git a/hgpatch.py b/hgpatch.py
deleted file mode 100644
index 02aed23..0000000
--- a/hgpatch.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from mercurial import scmutil, osutil
-from types import MethodType
-from mercurial import encoding
-import codecs
-
-# Patches commit-message unicode handling on Python 2.x
-
-# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert
-# all input from "input encoding" (set in mercurial/encoding.py)
-
-# Problem 1:
-#   If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8,
-#   it'll still try to decode it to ASCII.
-# Solution:
-#   Patch this decoding function to pass unicode unchanged.
-
-old_fromlocal = None
-
-def better_fromlocal(s):
-    if isinstance(s, str):
-        return s.encode('utf-8')
-    global old_fromlocal
-    return old_fromlocal(s)
-
-old_fromlocal = encoding.fromlocal
-encoding.fromlocal = better_fromlocal
-
-
-# Problem 2:
-#   Separate from actual log, Mercurial stores commit message in commit-message.txt.
-#   Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails.
-# Solution:
-#   Patch virtual-fs open() function to use codecs.open wrapper in this particular case.
-
-old_vfs_call = None
-
-def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False):
-    fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
-    if path.endswith('last-message.txt'):
-        # Create a wrapper like codecs.open does:
-        info = codecs.lookup("utf-8")
-        fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
-        fp.encoding = 'utf-8'
-    return fp
-
-old_vfs_call = scmutil.vfs.__call__
-scmutil.vfs.__call__ = better_vfs_call
-
-
-

From 110415dcba5f7091d12e73a3c191b316cd1044d1 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:21:07 +0200
Subject: [PATCH 12/93] 'better' commit message when no message from author

---
 rmaint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index d5edec3..8174848 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -189,7 +189,7 @@ def commitNext(self):
             parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
         # There are also problems when parent page gets renamed -- see updateChildren
 
-        # If the page is tracked and its name just changed, tell HG
+        # If the page is tracked and its name just changed, tell Git
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
         if rename:
             if self.debug:
@@ -221,7 +221,7 @@ def commitNext(self):
         if rev['comment'] != '':
             commit_msg = rev_unixname + ': ' + rev['comment']
         else:
-            commit_msg = rev_unixname
+            commit_msg = 'Updated ' + rev_unixname + ' (no message)'
         if rev['date']:
             parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT
             commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time)

From 134fc1b57f792e2d7fd48e687397eaea62bf7b0a Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:21:23 +0200
Subject: [PATCH 13/93] update readme explaining it now uses git

---
 readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index 2458933..755fdc1 100644
--- a/readme.md
+++ b/readme.md
@@ -4,7 +4,7 @@ This is a Python command line client for relatively popular wiki hosting http://
 * See all revisions of a page
 * Query page source
 
-Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments!
+Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, author and comments!
 
 ##### Examples:
 

From f397b8ccfba816d55005a0e593300777035d6993 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 15:22:53 +0200
Subject: [PATCH 14/93] .hgignore -> .gitignore

---
 .gitignore | 2 ++
 .hgignore  | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 .hgignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a295864
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index a26d142..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,2 +0,0 @@
-syntax:glob
-*.pyc

From 3e6e9f1d3be09a1f2b6222daeab99ee568f8f486 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 17:30:49 +0200
Subject: [PATCH 15/93] actually commit changes

---
 rmaint.py  | 21 ++++++++++++++-------
 wikidot.py |  2 ++
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 8174848..b6ed5c9 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -193,10 +193,10 @@ def commitNext(self):
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
         if rename:
             if self.debug:
-                print("moving", str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt')
+                print("moving", str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt')
 
             self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
-            self.index.move([str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt'])
+            self.index.move([str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt'])
 
         # Ouput contents
         fname = rev_unixname+'.txt'
@@ -208,31 +208,38 @@ def commitNext(self):
         outp.write(source)
         outp.close()
 
+        commit_msg = ""
+
         # Add new page
         if not unixname in self.last_names: # never before seen
+            commit_msg += "Created "
             if self.debug:
                 print("adding", fname)
+        elif rev['comment'] == '':
+            commit_msg += "Updated "
 
-            self.index.add([str(fname)])
-
-        self.last_names[unixname] = rev_unixname
+        commit_msg += rev_unixname
 
         # Commit
         if rev['comment'] != '':
-            commit_msg = rev_unixname + ': ' + rev['comment']
+            commit_msg += ': ' + rev['comment']
         else:
-            commit_msg = 'Updated ' + rev_unixname + ' (no message)'
+            commit_msg += ' (no message)'
         if rev['date']:
             parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT
             commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time)
         else:
             commit_date = None
+
         print(("Commiting: "+str(self.rev_no)+'. '+commit_msg))
 
         username = str(rev['user'])
         email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename
 
         author = Actor(username, email)
+
+        self.index.add([str(fname)])
+        self.last_names[unixname] = rev_unixname
         commit = self.index.commit(commit_msg, author=author, commit_date=commit_date)
         self.rev_no += 1
 
diff --git a/wikidot.py b/wikidot.py
index ba34f39..977c286 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -72,6 +72,8 @@ def list_pages(self, limit):
         pages = []
         for entry in soup.div.p.text.split('\n'):
             pages.append(entry)
+        if self.debug:
+            print('Pages found:', len(pages))
         return pages
 
 

From 61838a1b41cdf5e47ca3ff439a6cdb5de61222bf Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 21 Jul 2019 18:17:37 +0200
Subject: [PATCH 16/93] handle more than 250 pages

---
 wikidot.py | 64 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 12 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 977c286..39c61f8 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -26,7 +26,7 @@ def _wait_request_slot(self):
         pass
 
     # Makes a Wikidot AJAX query. Returns the response+title or throws an error.
-    def queryex(self, params):
+    def queryex(self, params, urlAppend = None):
         token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
         cookies = {"wikidot_token7": token}
         params['wikidot_token7'] = token
@@ -36,17 +36,22 @@ def queryex(self, params):
             print(cookies)
 
         self._wait_request_slot()
-        req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
+        url = self.site+'/ajax-module-connector.php'
+        if urlAppend is not None:
+            url += urlAppend
+        print('url', url)
+        req = requests.request('POST', url, data=params, cookies=cookies)
         json = req.json()
 
+        print(json)
         if json['status'] == 'ok':
             return json['body'], (json['title'] if 'title' in json else '')
         else:
             raise req.text
 
     # Same but only returns the body, most responses don't have titles
-    def query(self, params):
-        return self.queryex(params)[0]
+    def query(self, params, urlAppend = None):
+        return self.queryex(params, urlAppend)[0]
 
 
     # List all pages for the site.
@@ -54,26 +59,61 @@ def query(self, params):
     # Raw version
     # For the supported formats (module_body) see:
     # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
-    def list_pages_raw(self, limit):
+    def list_pages_raw(self, limit, offset):
         res = self.query({
           'moduleName': 'list/ListPagesModule',
           'limit': limit if limit else '10000',
           'perPage': limit if limit else '10000',
           'module_body': '%%page_unix_name%%',
           'separate': 'false',
+          'p': str(offset),
           'order': 'dateCreatedDesc',  # This way limit makes sense. This is also the default
-        })
+        }, '/p/' + str(offset))
         return res
 
     # Client version
     def list_pages(self, limit):
-        raw = self.list_pages_raw(limit).replace('<br/>',"\n")
-        soup = BeautifulSoup(raw, 'html.parser')
+        offset = 1
         pages = []
-        for entry in soup.div.p.text.split('\n'):
-            pages.append(entry)
-        if self.debug:
-            print('Pages found:', len(pages))
+
+        while True:
+            raw = self.list_pages_raw(limit, offset).replace('<br/>',"\n")
+            soup = BeautifulSoup(raw, 'html.parser')
+
+
+            for entry in soup.div.p.text.split('\n'):
+                pages.append(entry)
+            if self.debug:
+                print('Pages found:', len(pages))
+
+            targets = soup.find_all('span','target')
+            if len(targets) < 2:
+                print("unable to find next target")
+                break
+
+            next_url = targets[-1].a.get('href').split('/')
+            if len(next_url) > 0 and next_url[-1].isnumeric():
+                next_page = int(next_url[-1])
+                print('next page', next_page)
+            else:
+                print("invalid next url", next_url)
+                break
+
+            #next_page = int(targets[0].a.text)
+
+            current_spans = soup.find_all('span','current')
+            if len(current_spans) > 0:
+                current_page = int(current_spans[0].text)
+                print('current page', current_page)
+            else:
+                print("unable to find current page")
+                break;
+
+            if next_page != offset + 1:
+                print('next page is wrong', next_page)
+                break
+
+            offset += 1
         return pages
 
 

From c63bf3bc1acfe8e62cc4d273cd47335fc2b94c49 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 27 Jul 2019 12:13:02 +0200
Subject: [PATCH 17/93] cache fetched pages

---
 rmaint.py | 81 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 18 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index b6ed5c9..70993c6 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -55,6 +55,11 @@ def loadWRevs(self):
         self.wrevs = pickle.load(fp)
         fp.close()
 
+    def savePages(self, pages):
+        fp = open(self.path+'/.pages', 'wb')
+        pickle.dump(pages, fp)
+        fp.close()
+
     #
     # Compiles a combined revision list for a given set of pages, or all pages on the site.
     #  pages: compile history for these pages
@@ -68,32 +73,72 @@ def buildRevisionList(self, pages = None, depth = 10000):
             print("Loading cached revision list...")
             self.loadWRevs()
         else:
-            print("Building revision list...")
+            self.wrevs = []
+            print('no wrevs')
+
+        print("Building revision list...")
+        if not pages:
+            if os.path.isfile(self.path+'/.pages'):
+                print('loading fetched pages')
+                fp = open(self.path+'/.pages', 'rb')
+                pages = pickle.load(fp)
+                fp.close()
+
+            print('need to fetch pages')
             if not pages:
                 pages = self.wd.list_pages(10000)
-            self.wrevs = []
-            for page in pages:
-                print(("Querying page: "+page))
-                page_id = self.wd.get_page_id(page)
-                print(("ID: "+str(page_id)))
-                revs = self.wd.get_revisions(page_id, depth)
-                print(("Revisions: "+str(len(revs))))
-                for rev in revs:
-                    self.wrevs.append({
-                      'page_id' : page_id,
-                      'page_name' : page, # name atm, not at revision time
-                      'rev_id' : rev['id'],
-                      'date' : rev['date'],
-                      'user' : rev['user'],
-                      'comment' : rev['comment'],
-                    })
+                self.savePages(pages)
+
+
+        fetched_pages = []
+
+        for wrev in self.wrevs:
+            page_name = wrev['page_name']
+
+            if page_name in fetched_pages:
+                continue
+
+            fetched_pages.append(page_name)
+
+        print("fetched " + str(len(fetched_pages)) + " of " + str(len(pages)))
+
+        #self.wrevs = []
+        fetched = 0
+        for page in pages:
+            if page in fetched_pages:
+                print('already fetched', page)
+                continue
+
+            print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages)))
+            fetched += 1
+            page_id = self.wd.get_page_id(page)
+            print(("ID: "+str(page_id)))
+            if page_id is None:
+                print('page lost', page)
+                continue
+
+            revs = self.wd.get_revisions(page_id, depth)
+            print(("Revisions: "+str(len(revs))))
+            for rev in revs:
+                self.wrevs.append({
+                  'page_id' : page_id,
+                  'page_name' : page, # name atm, not at revision time
+                  'rev_id' : rev['id'],
+                  'date' : rev['date'],
+                  'user' : rev['user'],
+                  'comment' : rev['comment'],
+                })
             self.saveWRevs() # Save a cached copy
-            print("")
+        self.saveWRevs() # Save a cached copy
+        os.remove(self.path+'/.pages')
+        print("")
         
         
         print(("Total revisions: "+str(len(self.wrevs))))
         
         print("Sorting revisions...")
+        print(self.wrevs[0])
+        print(self.wrevs[0]['date'])
         self.wrevs.sort(key=lambda rev: rev['date'])
         print("")
         

From 92a05100fd998132d52413bc69b7c8c3cf915dfe Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 27 Jul 2019 12:13:26 +0200
Subject: [PATCH 18/93] less debug spam, fix exception for python3
 compatibility

---
 wikidot.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 39c61f8..a6d38e7 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -39,15 +39,14 @@ def queryex(self, params, urlAppend = None):
         url = self.site+'/ajax-module-connector.php'
         if urlAppend is not None:
             url += urlAppend
-        print('url', url)
+
         req = requests.request('POST', url, data=params, cookies=cookies)
         json = req.json()
 
-        print(json)
         if json['status'] == 'ok':
             return json['body'], (json['title'] if 'title' in json else '')
         else:
-            raise req.text
+            raise Exception(req.text)
 
     # Same but only returns the body, most responses don't have titles
     def query(self, params, urlAppend = None):

From 149136d2f11f608b4c514543ead5697ab2cba21b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 27 Jul 2019 12:13:41 +0200
Subject: [PATCH 19/93] missing declaration

---
 wikidot.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wikidot.py b/wikidot.py
index a6d38e7..53a0c69 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -173,6 +173,7 @@ def get_revisions(self, page_id, limit):
 
             # Username in a last <a> under <span class="printuser">
             user_span = tr.find("span", attrs={"class": "printuser"})
+            last_a = None
             for last_a in user_span.find_all('a'): pass
             rev_user = last_a.getText() if last_a else None
 

From dd0738a6e912795713ae0e34e9bcc9c2a42f972a Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 27 Jul 2019 12:14:34 +0200
Subject: [PATCH 20/93] less debug spam, fix skipping already fetched

---
 rmaint.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index 70993c6..89060a5 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -106,7 +106,11 @@ def buildRevisionList(self, pages = None, depth = 10000):
         fetched = 0
         for page in pages:
             if page in fetched_pages:
-                print('already fetched', page)
+                #print('already fetched', page)
+                continue
+
+            if page == "sandbox":
+                print("Skipping", page)
                 continue
 
             print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages)))

From 335f1c8726e1e1c015e093c92ff1875f092a2ae0 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 27 Jul 2019 12:14:50 +0200
Subject: [PATCH 21/93] check for .git when checking if there's an existing
 repo

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index 89060a5..5d3f00d 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -184,7 +184,7 @@ def openRepo(self):
         self.last_names = {} # Tracks page renames: name atm -> last name in repo
         self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
 
-        if os.path.isfile(self.path+'/.wstate'):
+        if os.path.isfile(self.path+'/.git'):
             print("Continuing from aborted dump state...")
             self.loadState()
             self.repo = Repo(self.path)

From f6bd4e78db7550d81ac7941ef8c9bb3fd96bc926 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 27 Jul 2019 15:00:41 +0200
Subject: [PATCH 22/93] fix renames

---
 rmaint.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 5d3f00d..d696b2b 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -134,7 +134,10 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 })
             self.saveWRevs() # Save a cached copy
         self.saveWRevs() # Save a cached copy
-        os.remove(self.path+'/.pages')
+
+        if os.path.isfile(self.path+'/.pages'):
+            os.remove(self.path+'/.pages')
+
         print("")
         
         
@@ -239,16 +242,24 @@ def commitNext(self):
         # There are also problems when parent page gets renamed -- see updateChildren
 
         # If the page is tracked and its name just changed, tell Git
+        fname = str(rev_unixname) + '.txt'
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
+
         if rename:
+            name_rename_from = str(self.last_names[unixname])+'.txt'
+
             if self.debug:
-                print("moving", str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt')
+                print("moving", name_rename_from, "to", fname)
 
             self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
-            self.index.move([str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt'])
+
+            # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
+            if os.path.isfile(self.path + '/' + name_rename_from):
+                self.index.move([name_rename_from, fname], force=True)
+            else:
+                print("source file does not exist, probably deleted or renamed from already", name_rename_from)
 
         # Ouput contents
-        fname = rev_unixname+'.txt'
         outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
         if details['title']:
             outp.write('title:'+details['title']+'\n')

From 974ddb0f4495bec15cfd38a91c8cd2b418df7caf Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 14:17:06 +0200
Subject: [PATCH 23/93] fix cleanup, store fetched IDs so we don't fetch again
 later

---
 rmaint.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index d696b2b..d056964 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -33,6 +33,7 @@ def __init__(self, wikidot, path):
 
         # Internal state
         self.wrevs = None           # Compiled wikidot revision list (history)
+        self.fetcheds_revids = []   # Compiled wikidot revision list (history)
 
         self.rev_no = 0             # Next revision to process
         self.last_names = {}        # Tracks page renames: name atm -> last name in repo
@@ -60,6 +61,15 @@ def savePages(self, pages):
         pickle.dump(pages, fp)
         fp.close()
 
+    def saveFetched(self):
+        fp = open(self.path+'/.fetched', 'wb')
+        pickle.dump(self.fetched_revids, fp)
+        fp.close()
+
+    def loadFetched(self):
+        fp = open(self.path+'/.fetched', 'rb')
+        self.fetched_revids = pickle.load(fp)
+        fp.close()
     #
     # Compiles a combined revision list for a given set of pages, or all pages on the site.
     #  pages: compile history for these pages
@@ -76,6 +86,11 @@ def buildRevisionList(self, pages = None, depth = 10000):
             self.wrevs = []
             print('no wrevs')
 
+        if os.path.isfile(self.path+'/.fetched'):
+            loadFetched()
+        else:
+            self.fetched_revids = []
+
         print("Building revision list...")
         if not pages:
             if os.path.isfile(self.path+'/.pages'):
@@ -124,6 +139,9 @@ def buildRevisionList(self, pages = None, depth = 10000):
             revs = self.wd.get_revisions(page_id, depth)
             print(("Revisions: "+str(len(revs))))
             for rev in revs:
+                if rev['id'] in self.fetched_revids:
+                    continue
+
                 self.wrevs.append({
                   'page_id' : page_id,
                   'page_name' : page, # name atm, not at revision time
@@ -133,10 +151,6 @@ def buildRevisionList(self, pages = None, depth = 10000):
                   'comment' : rev['comment'],
                 })
             self.saveWRevs() # Save a cached copy
-        self.saveWRevs() # Save a cached copy
-
-        if os.path.isfile(self.path+'/.pages'):
-            os.remove(self.path+'/.pages')
 
         print("")
         
@@ -306,7 +320,11 @@ def commitNext(self):
         if self.debug:
             print('committed', commit.name_rev, 'by', author)
 
+        self.fetched_revids.append(rev['rev_id'])
+        self.saveFetched()
+
         self.saveState() # Update operation state
+
         return True
 
 
@@ -347,3 +365,7 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna
     def cleanup(self):
         os.remove(self.path+'/.wstate')
         os.remove(self.path+'/.wrevs')
+
+        if os.path.isfile(self.path+'/.pages'):
+            os.remove(self.path+'/.pages')
+

From 7bf42b30b9f66da67a58826c582ca38fd648342e Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 14:17:19 +0200
Subject: [PATCH 24/93] more verbose output when returned json fails to parse

---
 wikidot.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wikidot.py b/wikidot.py
index 53a0c69..2bc5eb4 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -41,7 +41,12 @@ def queryex(self, params, urlAppend = None):
             url += urlAppend
 
         req = requests.request('POST', url, data=params, cookies=cookies)
-        json = req.json()
+        try:
+            json = req.json()
+        except JSONDecodeError as e:
+            print(e, req, url, params)
+            raise e
+        #print(json)
 
         if json['status'] == 'ok':
             return json['body'], (json['title'] if 'title' in json else '')

From 04e93240a19ab3de327b1358e1c69566c32e9bb6 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 14:43:00 +0200
Subject: [PATCH 25/93] fix storing/skipping of already fetched revisions

---
 rmaint.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index d056964..7f136c3 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -87,7 +87,7 @@ def buildRevisionList(self, pages = None, depth = 10000):
             print('no wrevs')
 
         if os.path.isfile(self.path+'/.fetched'):
-            loadFetched()
+            self.loadFetched()
         else:
             self.fetched_revids = []
 
@@ -140,6 +140,7 @@ def buildRevisionList(self, pages = None, depth = 10000):
             print(("Revisions: "+str(len(revs))))
             for rev in revs:
                 if rev['id'] in self.fetched_revids:
+                    print(rev['id'], 'already fetched')
                     continue
 
                 self.wrevs.append({
@@ -229,6 +230,14 @@ def commitNext(self):
             return False
 
         rev = self.wrevs[self.rev_no]
+
+        if rev['rev_id'] in self.fetched_revids:
+            print(rev['rev_id'], 'already fetched')
+            self.rev_no += 1
+
+            self.saveState() # Update operation state
+            return True
+
         source = self.wd.get_revision_source(rev['rev_id'])
         # Page title and unix_name changes are only available through another request:
         details = self.wd.get_revision_version(rev['rev_id'])

From ab82f222570f2e2c62eb678015061028d7422c7c Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 14:43:16 +0200
Subject: [PATCH 26/93] track renames with symlinks

---
 rmaint.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/rmaint.py b/rmaint.py
index 7f136c3..41a8604 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -266,6 +266,12 @@ def commitNext(self):
 
         # If the page is tracked and its name just changed, tell Git
         fname = str(rev_unixname) + '.txt'
+
+        # We track renames as symlinks to try to emulate how it handles redirects from old to new names
+        # But if it is overwritten, don't write into the symlinked file, create a new one
+        if os.path.islink(self.path + '/' + fname):
+            os.remove(self.path + '/' + fname)
+
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
 
         if rename:
@@ -279,6 +285,10 @@ def commitNext(self):
             # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
             if os.path.isfile(self.path + '/' + name_rename_from):
                 self.index.move([name_rename_from, fname], force=True)
+
+                # Because the wiki redirects
+                os.symlink(fname, self.path + '/' + name_rename_from)
+                self.index.add([name_rename_from])
             else:
                 print("source file does not exist, probably deleted or renamed from already", name_rename_from)
 

From bf568d512924a77942a45d32893a2d20fe469295 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 15:05:27 +0200
Subject: [PATCH 27/93] avoid making the terminal backlog useless when scraping
 scp

---
 rmaint.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 41a8604..c7d0bf8 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -165,10 +165,13 @@ def buildRevisionList(self, pages = None, depth = 10000):
         print("")
         
         if self.debug:
-            print("Revision list: ")
-            for rev in self.wrevs:
-                print((str(rev)+"\n"))
-            print("")
+            if len(self.wrevs) < 100:
+                print("Revision list: ")
+                for rev in self.wrevs:
+                    print((str(rev)+"\n"))
+                print("")
+            else:
+                print("Too many revisions, not printing everything")
 
 
     #

From 8850e83cf6e6009eeee15f0b56be09aaa8b701c1 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 15:06:43 +0200
Subject: [PATCH 28/93] Revert "track renames with symlinks"

Renames don't automatically redirect, so stop trying to emulate that.
Instead we properly track redirect pages.

This reverts commit ab82f222570f2e2c62eb678015061028d7422c7c.
---
 rmaint.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index c7d0bf8..e195b5f 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -269,12 +269,6 @@ def commitNext(self):
 
         # If the page is tracked and its name just changed, tell Git
         fname = str(rev_unixname) + '.txt'
-
-        # We track renames as symlinks to try to emulate how it handles redirects from old to new names
-        # But if it is overwritten, don't write into the symlinked file, create a new one
-        if os.path.islink(self.path + '/' + fname):
-            os.remove(self.path + '/' + fname)
-
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
 
         if rename:
@@ -288,10 +282,6 @@ def commitNext(self):
             # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
             if os.path.isfile(self.path + '/' + name_rename_from):
                 self.index.move([name_rename_from, fname], force=True)
-
-                # Because the wiki redirects
-                os.symlink(fname, self.path + '/' + name_rename_from)
-                self.index.add([name_rename_from])
             else:
                 print("source file does not exist, probably deleted or renamed from already", name_rename_from)
 

From d129d91ddfa4490f9d6bcba9d6f8a2e454b6ecf1 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 15:07:34 +0200
Subject: [PATCH 29/93] track redirect pages correctly

---
 wikidot.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wikidot.py b/wikidot.py
index 2bc5eb4..885719c 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -127,7 +127,7 @@ def list_pages(self, limit):
     def get_page_id(self, page_unix_name):
         # The only freaking way to get page ID is to load the page! Wikidot!
         self._wait_request_slot()
-        req = requests.request('GET', self.site+'/'+page_unix_name)
+        req = requests.request('GET', self.site+'/'+page_unix_name + '/noredirect/true')
         soup = BeautifulSoup(req.text, 'html.parser')
         for item in soup.head.find_all('script'):
             text = item.text

From 23e6412ae787ce58e1ddf9a46da53974fb9eb6eb Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 28 Jul 2019 15:16:22 +0200
Subject: [PATCH 30/93] control debug spam

---
 rmaint.py  | 60 ++++++++++++++++++++++++++++++++----------------------
 wikidot.py | 22 ++++++++++++++------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index e195b5f..68dd275 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -84,22 +84,27 @@ def buildRevisionList(self, pages = None, depth = 10000):
             self.loadWRevs()
         else:
             self.wrevs = []
-            print('no wrevs')
+            if self.debug:
+                print('No existing wrevs')
 
         if os.path.isfile(self.path+'/.fetched'):
             self.loadFetched()
         else:
             self.fetched_revids = []
 
-        print("Building revision list...")
+        if self.debug:
+            print("Building revision list...")
+
         if not pages:
             if os.path.isfile(self.path+'/.pages'):
-                print('loading fetched pages')
+                print('Loading fetched pages')
                 fp = open(self.path+'/.pages', 'rb')
                 pages = pickle.load(fp)
                 fp.close()
 
-            print('need to fetch pages')
+            if self.debug:
+                print('Need to fetch pages')
+
             if not pages:
                 pages = self.wd.list_pages(10000)
                 self.savePages(pages)
@@ -115,29 +120,35 @@ def buildRevisionList(self, pages = None, depth = 10000):
 
             fetched_pages.append(page_name)
 
-        print("fetched " + str(len(fetched_pages)) + " of " + str(len(pages)))
+        if self.debug:
+            print("Already fetched " + str(len(fetched_pages)) + " of " + str(len(pages)))
 
-        #self.wrevs = []
         fetched = 0
         for page in pages:
             if page in fetched_pages:
                 #print('already fetched', page)
                 continue
 
+            # TODO: more generic blacklisting
             if page == "sandbox":
-                print("Skipping", page)
+                if self.debug:
+                    print("Skipping", page)
                 continue
 
-            print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages)))
+            if self.debug:
+                print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages)))
             fetched += 1
             page_id = self.wd.get_page_id(page)
-            print(("ID: "+str(page_id)))
+
+            if self.debug:
+                print(("ID: "+str(page_id)))
+
             if page_id is None:
-                print('page lost', page)
+                print('Page gone?', page)
                 continue
 
             revs = self.wd.get_revisions(page_id, depth)
-            print(("Revisions: "+str(len(revs))))
+            print("Revisions to fetch: "+str(len(revs)))
             for rev in revs:
                 if rev['id'] in self.fetched_revids:
                     print(rev['id'], 'already fetched')
@@ -154,18 +165,17 @@ def buildRevisionList(self, pages = None, depth = 10000):
             self.saveWRevs() # Save a cached copy
 
         print("")
-        
-        
+
         print(("Total revisions: "+str(len(self.wrevs))))
-        
-        print("Sorting revisions...")
-        print(self.wrevs[0])
-        print(self.wrevs[0]['date'])
+
+        if self.debug:
+            print("Sorting revisions...")
+
         self.wrevs.sort(key=lambda rev: rev['date'])
-        print("")
         
         if self.debug:
             if len(self.wrevs) < 100:
+                print("")
                 print("Revision list: ")
                 for rev in self.wrevs:
                     print((str(rev)+"\n"))
@@ -235,7 +245,9 @@ def commitNext(self):
         rev = self.wrevs[self.rev_no]
 
         if rev['rev_id'] in self.fetched_revids:
-            print(rev['rev_id'], 'already fetched')
+            if self.debug:
+                print(rev['rev_id'], 'already fetched')
+
             self.rev_no += 1
 
             self.saveState() # Update operation state
@@ -275,7 +287,7 @@ def commitNext(self):
             name_rename_from = str(self.last_names[unixname])+'.txt'
 
             if self.debug:
-                print("moving", name_rename_from, "to", fname)
+                print("Moving renamed", name_rename_from, "to", fname)
 
             self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
 
@@ -283,7 +295,7 @@ def commitNext(self):
             if os.path.isfile(self.path + '/' + name_rename_from):
                 self.index.move([name_rename_from, fname], force=True)
             else:
-                print("source file does not exist, probably deleted or renamed from already", name_rename_from)
+                print("Source file does not exist, probably deleted or renamed from already?", name_rename_from)
 
         # Ouput contents
         outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
@@ -300,7 +312,7 @@ def commitNext(self):
         if not unixname in self.last_names: # never before seen
             commit_msg += "Created "
             if self.debug:
-                print("adding", fname)
+                print("Adding", fname)
         elif rev['comment'] == '':
             commit_msg += "Updated "
 
@@ -317,7 +329,7 @@ def commitNext(self):
         else:
             commit_date = None
 
-        print(("Commiting: "+str(self.rev_no)+'. '+commit_msg))
+        print("Committing: " + str(self.rev_no) + '. '+commit_msg)
 
         username = str(rev['user'])
         email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename
@@ -330,7 +342,7 @@ def commitNext(self):
         self.rev_no += 1
 
         if self.debug:
-            print('committed', commit.name_rev, 'by', author)
+            print('Committed', commit.name_rev, 'by', author)
 
         self.fetched_revids.append(rev['rev_id'])
         self.saveFetched()
diff --git a/wikidot.py b/wikidot.py
index 885719c..07d44ff 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -44,9 +44,8 @@ def queryex(self, params, urlAppend = None):
         try:
             json = req.json()
         except JSONDecodeError as e:
-            print(e, req, url, params)
+            print('Failed to parse response from wikidot', e, req, url, params)
             raise e
-        #print(json)
 
         if json['status'] == 'ok':
             return json['body'], (json['title'] if 'title' in json else '')
@@ -87,18 +86,22 @@ def list_pages(self, limit):
 
             for entry in soup.div.p.text.split('\n'):
                 pages.append(entry)
+
             if self.debug:
                 print('Pages found:', len(pages))
 
             targets = soup.find_all('span','target')
             if len(targets) < 2:
-                print("unable to find next target")
+                print("Unable to find next listing page, not enough target spans")
                 break
 
             next_url = targets[-1].a.get('href').split('/')
             if len(next_url) > 0 and next_url[-1].isnumeric():
                 next_page = int(next_url[-1])
-                print('next page', next_page)
+
+                if self.debug:
+                    print('Next listing page', next_page)
+
             else:
                 print("invalid next url", next_url)
                 break
@@ -108,16 +111,23 @@ def list_pages(self, limit):
             current_spans = soup.find_all('span','current')
             if len(current_spans) > 0:
                 current_page = int(current_spans[0].text)
-                print('current page', current_page)
+
+                if self.debug:
+                    print('Current listing page', current_page)
+
             else:
                 print("unable to find current page")
                 break;
 
             if next_page != offset + 1:
-                print('next page is wrong', next_page)
+                if self.debug:
+                    print('Next page is wrong', next_page, 'hopefully at the end')
                 break
 
             offset += 1
+
+            print("Fetching listing page", offset)
+
         return pages
 
 

From b40b560fb32dc9d0a73aaaf4b399189931cf7c96 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Mon, 29 Jul 2019 10:24:24 +0200
Subject: [PATCH 31/93] more cleaning of debug output

---
 rmaint.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 68dd275..a930fee 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -102,16 +102,23 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 pages = pickle.load(fp)
                 fp.close()
 
-            if self.debug:
-                print('Need to fetch pages')
 
             if not pages:
+                if self.debug:
+                    print('Need to fetch pages')
                 pages = self.wd.list_pages(10000)
                 self.savePages(pages)
+            elif self.debug:
+                print(len(pages), 'pages loaded')
 
 
         fetched_pages = []
 
+        if self.debug:
+            print('Collecting already pages we already got revisions for')
+
+        # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time)
+        # Should use a set/hashmap/whatever python calls it
         for wrev in self.wrevs:
             page_name = wrev['page_name']
 
@@ -121,12 +128,11 @@ def buildRevisionList(self, pages = None, depth = 10000):
             fetched_pages.append(page_name)
 
         if self.debug:
-            print("Already fetched " + str(len(fetched_pages)) + " of " + str(len(pages)))
+            print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages)))
 
         fetched = 0
         for page in pages:
             if page in fetched_pages:
-                #print('already fetched', page)
                 continue
 
             # TODO: more generic blacklisting

From 5a90bb195511d7ba627c3520d3f99dcc04ba6ca2 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Mon, 29 Jul 2019 10:24:33 +0200
Subject: [PATCH 32/93] fix dates in commits

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index a930fee..ade4fce 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -344,7 +344,7 @@ def commitNext(self):
 
         self.index.add([str(fname)])
         self.last_names[unixname] = rev_unixname
-        commit = self.index.commit(commit_msg, author=author, commit_date=commit_date)
+        commit = self.index.commit(commit_msg, author=author, commit_date=commit_date, author_date=commit_date)
         self.rev_no += 1
 
         if self.debug:

From e487cdd7260a8d42f2177782cfa0c798550e6016 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Mon, 29 Jul 2019 10:30:02 +0200
Subject: [PATCH 33/93] let commit date be the current datetime, it makes more
 sense

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index ade4fce..4c0bae5 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -344,7 +344,7 @@ def commitNext(self):
 
         self.index.add([str(fname)])
         self.last_names[unixname] = rev_unixname
-        commit = self.index.commit(commit_msg, author=author, commit_date=commit_date, author_date=commit_date)
+        commit = self.index.commit(commit_msg, author=author, author_date=commit_date)
         self.rev_no += 1
 
         if self.debug:

From 94fa6ae5bf6561aafc55f94fe5a1297596c942c1 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 4 Aug 2019 14:31:18 +0200
Subject: [PATCH 34/93] python doesn't have this already?

---
 rmaint.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 4c0bae5..4508c50 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -393,9 +393,12 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna
     # Finalizes the construction process and deletes any temporary files.
     #
     def cleanup(self):
-        os.remove(self.path+'/.wstate')
-        os.remove(self.path+'/.wrevs')
+        if os.path.exists(self.path+'/.wstate'):
+            os.remove(self.path+'/.wstate')
 
-        if os.path.isfile(self.path+'/.pages'):
+        if os.path.exists(self.path+'/.wrevs'):
+            os.remove(self.path+'/.wrevs')
+
+        if os.path.exists(self.path+'/.pages'):
             os.remove(self.path+'/.pages')
 

From f9175f3ce941bef932e2a8ce35bdb1dc7785541b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 4 Aug 2019 14:36:12 +0200
Subject: [PATCH 35/93] retry in case of gateway errors, which seem to be
 semi-frequent and quickly recovered from

---
 wikidot.py | 50 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 07d44ff..e02caf4 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -14,6 +14,7 @@ def __init__(self, site):
         self.delay = 200        # Delay between requests in msec
         self.debug = False      # Print debug messages
         self.next_timeslot = time.clock()   # Can call immediately
+        self.max_retries = 5
 
 
     # To honor usage rules, we wait for self.delay between requests.
@@ -35,22 +36,47 @@ def queryex(self, params, urlAppend = None):
             print(params)
             print(cookies)
 
-        self._wait_request_slot()
         url = self.site+'/ajax-module-connector.php'
         if urlAppend is not None:
             url += urlAppend
 
-        req = requests.request('POST', url, data=params, cookies=cookies)
-        try:
-            json = req.json()
-        except JSONDecodeError as e:
-            print('Failed to parse response from wikidot', e, req, url, params)
-            raise e
-
-        if json['status'] == 'ok':
-            return json['body'], (json['title'] if 'title' in json else '')
-        else:
-            raise Exception(req.text)
+        # In case of e. g. 500 errors
+        retries = 0
+        while retries < self.max_retries:
+            self._wait_request_slot()
+
+            req = requests.request('POST', url, data=params, cookies=cookies)
+
+            # Usually a 502 error, recovers immediately
+            if req.status_code >= 500:
+                retries += 1
+                print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+
+                # In case of debug enabled, we already printed this above
+                if not self.debug:
+                    print(req, params)
+
+                # Be nice, double wait delay for errors
+                self._wait_request_slot()
+
+                continue
+
+            try:
+                # In case of 404 errors or other stuff that indicates
+                # some bug in how we handle or request things
+                req.raise_for_status()
+                json = req.json()
+            except Exception as e:
+                print('Failed to get response from wikidot', e, req, url, params)
+                raise e
+
+            if json['status'] == 'ok':
+                return json['body'], (json['title'] if 'title' in json else '')
+            else:
+                raise Exception(req.text)
+
+        print('Failed too many times', url, params, cookies)
+        raise Exception('Failed too many times for ' + url)
 
     # Same but only returns the body, most responses don't have titles
     def query(self, params, urlAppend = None):

From 7070b38ac2e61a2434e8bf5726b852e729bc6a0e Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 4 Aug 2019 15:33:36 +0200
Subject: [PATCH 36/93] disable removing state tracking files, we want them if
 we continuously update our mirror

---
 crawl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl.py b/crawl.py
index 9fd0bab..25d9609 100644
--- a/crawl.py
+++ b/crawl.py
@@ -105,5 +105,5 @@ def force_dirs(path):
     while rm.commitNext():
         pass
 
-    rm.cleanup()
+    # rm.cleanup()
     print("Done.")

From d103e4db1eff7dbd890c431892ecc821bd1b123a Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 4 Aug 2019 15:37:47 +0200
Subject: [PATCH 37/93] improve tracking of created files (not entirely sure
 why it didn't work), commit message when renaming

---
 rmaint.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 4508c50..a746e52 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -204,9 +204,11 @@ def loadState(self):
         fp = open(self.path+'/.wstate', 'rb')
         self.rev_no = pickle.load(fp)
         self.last_names = pickle.load(fp)
+
         try:
             self.last_parents = pickle.load(fp)
-        except EOFError:
+        except EOFError as e:
+            print('EOFError while loading wstate', e)
             pass
         fp.close()
 
@@ -289,6 +291,8 @@ def commitNext(self):
         fname = str(rev_unixname) + '.txt'
         rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
 
+        commit_msg = ""
+
         if rename:
             name_rename_from = str(self.last_names[unixname])+'.txt'
 
@@ -300,9 +304,18 @@ def commitNext(self):
             # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
             if os.path.isfile(self.path + '/' + name_rename_from):
                 self.index.move([name_rename_from, fname], force=True)
+                commit_msg += "Renamed from " str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' '
             else:
                 print("Source file does not exist, probably deleted or renamed from already?", name_rename_from)
 
+        # Add new page
+        elif not os.path.isfile(self.path + '/' + fname): # never before seen
+            commit_msg += "Created "
+            if self.debug:
+                print("Adding", fname)
+        elif rev['comment'] == '':
+            commit_msg += "Updated "
+
         # Ouput contents
         outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
         if details['title']:
@@ -312,16 +325,6 @@ def commitNext(self):
         outp.write(source)
         outp.close()
 
-        commit_msg = ""
-
-        # Add new page
-        if not unixname in self.last_names: # never before seen
-            commit_msg += "Created "
-            if self.debug:
-                print("Adding", fname)
-        elif rev['comment'] == '':
-            commit_msg += "Updated "
-
         commit_msg += rev_unixname
 
         # Commit

From 1955c2869a445a3e1a1594fb002bb81ecc0dca3f Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 4 Aug 2019 16:48:40 +0200
Subject: [PATCH 38/93] persist metadata (renames etc.) in the git repo

---
 crawl.py  |  2 +-
 rmaint.py | 69 ++++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/crawl.py b/crawl.py
index 25d9609..9fd0bab 100644
--- a/crawl.py
+++ b/crawl.py
@@ -105,5 +105,5 @@ def force_dirs(path):
     while rm.commitNext():
         pass
 
-    # rm.cleanup()
+    rm.cleanup()
     print("Done.")
diff --git a/rmaint.py b/rmaint.py
index a746e52..42def44 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -4,6 +4,7 @@
 import os
 import codecs
 import pickle as pickle
+import json
 
 # git stuff
 from git import Repo, Actor
@@ -61,15 +62,31 @@ def savePages(self, pages):
         pickle.dump(pages, fp)
         fp.close()
 
-    def saveFetched(self):
-        fp = open(self.path+'/.fetched', 'wb')
-        pickle.dump(self.fetched_revids, fp)
+    def appendFetchedRevid(self, revid):
+        fp = open(self.path+'/.fetched.txt', 'a')
+        fp.write(revid + '\n')
         fp.close()
 
-    def loadFetched(self):
-        fp = open(self.path+'/.fetched', 'rb')
-        self.fetched_revids = pickle.load(fp)
+    def loadFetchedRevids(self):
+        self.fetched_revids = [line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')]
+
+    # Persistent metadata about the repo:
+    #  - Tracks page renames: name atm -> last name in repo
+    #  - Tracks page parent names: name atm -> last parent in repo
+    def saveMetadata(self):
+        metadata = {'names': self.last_names, 'parents': self.last_parents }
+        fp = open(self.path+'/.metadata.json', 'w')
+        json.dump(metadata, fp)
+        fp.close()
+
+    def loadMetadata(self):
+        fp = open(self.path+'/.metadata.json', 'r')
+        metadata = json.load(fp)
+        self.last_names = metadata['names']
+        self.last_parents = metadata['parents']
         fp.close()
+
+        self.loadFetchedRevids()
     #
     # Compiles a combined revision list for a given set of pages, or all pages on the site.
     #  pages: compile history for these pages
@@ -87,8 +104,9 @@ def buildRevisionList(self, pages = None, depth = 10000):
             if self.debug:
                 print('No existing wrevs')
 
-        if os.path.isfile(self.path+'/.fetched'):
-            self.loadFetched()
+        if os.path.isfile(self.path+'/.fetched.txt'):
+            self.loadFetchedRevids()
+            print(self.fetched_revids)
         else:
             self.fetched_revids = []
 
@@ -111,7 +129,6 @@ def buildRevisionList(self, pages = None, depth = 10000):
             elif self.debug:
                 print(len(pages), 'pages loaded')
 
-
         fetched_pages = []
 
         if self.debug:
@@ -170,6 +187,9 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 })
             self.saveWRevs() # Save a cached copy
 
+        if os.path.isfile(self.path+'/.metadata.json'):
+            self.loadMetadata()
+
         print("")
 
         print(("Total revisions: "+str(len(self.wrevs))))
@@ -196,20 +216,11 @@ def buildRevisionList(self, pages = None, depth = 10000):
     def saveState(self):
         fp = open(self.path+'/.wstate', 'wb')
         pickle.dump(self.rev_no, fp)
-        pickle.dump(self.last_names, fp)
-        pickle.dump(self.last_parents, fp)
         fp.close()
     
     def loadState(self):
         fp = open(self.path+'/.wstate', 'rb')
         self.rev_no = pickle.load(fp)
-        self.last_names = pickle.load(fp)
-
-        try:
-            self.last_parents = pickle.load(fp)
-        except EOFError as e:
-            print('EOFError while loading wstate', e)
-            pass
         fp.close()
 
 
@@ -304,7 +315,7 @@ def commitNext(self):
             # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
             if os.path.isfile(self.path + '/' + name_rename_from):
                 self.index.move([name_rename_from, fname], force=True)
-                commit_msg += "Renamed from " str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' '
+                commit_msg += "Renamed from " + str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' '
             else:
                 print("Source file does not exist, probably deleted or renamed from already?", name_rename_from)
 
@@ -316,6 +327,8 @@ def commitNext(self):
         elif rev['comment'] == '':
             commit_msg += "Updated "
 
+        self.last_names[unixname] = rev_unixname
+
         # Ouput contents
         outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
         if details['title']:
@@ -340,22 +353,23 @@ def commitNext(self):
 
         print("Committing: " + str(self.rev_no) + '. '+commit_msg)
 
+        # Include metadata in the commit (if changed)
+        self.appendFetchedRevid(rev['rev_id'])
+        self.saveMetadata()
+        self.index.add([str(fname), '.metadata.json'])
+
         username = str(rev['user'])
         email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename
-
         author = Actor(username, email)
 
-        self.index.add([str(fname)])
-        self.last_names[unixname] = rev_unixname
         commit = self.index.commit(commit_msg, author=author, author_date=commit_date)
-        self.rev_no += 1
 
         if self.debug:
             print('Committed', commit.name_rev, 'by', author)
 
         self.fetched_revids.append(rev['rev_id'])
-        self.saveFetched()
 
+        self.rev_no += 1
         self.saveState() # Update operation state
 
         return True
@@ -398,10 +412,17 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna
     def cleanup(self):
         if os.path.exists(self.path+'/.wstate'):
             os.remove(self.path+'/.wstate')
+        else:
+            print("wstate does not exist?")
 
         if os.path.exists(self.path+'/.wrevs'):
             os.remove(self.path+'/.wrevs')
+        else:
+            print("wrevs does not exist?")
 
         if os.path.exists(self.path+'/.pages'):
             os.remove(self.path+'/.pages')
 
+        if self.rev_no > 0:
+            self.index.add(['.fetched.txt'])
+            self.index.commit('Updating fetched revisions')

From 3d08cc21872b752fd3c729ba51249276f4d687ea Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 10:46:08 +0200
Subject: [PATCH 39/93] add dependencies to readme

---
 readme.md | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index 755fdc1..c13d967 100644
--- a/readme.md
+++ b/readme.md
@@ -1,4 +1,7 @@
-This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you:
+*This is a fork to make a permanent backup of the SCP wiki.*
+
+This is a Python command line client for relatively popular wiki hosting
+http://www.wikidot.com which lets you:
 
 * List all pages on a site
 * See all revisions of a page
@@ -6,6 +9,14 @@ This is a Python command line client for relatively popular wiki hosting http://
 
 Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, author and comments!
 
+##### Dependencies
+
+At least:
+
+* Python 3
+* python-beautifulsoup4
+* python-gitpython
+
 ##### Examples:
 
     crawl.py http://example.wikidot.com --dump ExampleRepo

From b29c0eee94949600e0e2f9c88fb8ca6f50d491fd Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 10:47:54 +0200
Subject: [PATCH 40/93] python's time.clock() is gone

---
 wikidot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index e02caf4..2595bdc 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -13,14 +13,14 @@ def __init__(self, site):
         self.sitename = urlparse(site).hostname.lower()
         self.delay = 200        # Delay between requests in msec
         self.debug = False      # Print debug messages
-        self.next_timeslot = time.clock()   # Can call immediately
+        self.next_timeslot = time.process_time()   # Can call immediately
         self.max_retries = 5
 
 
     # To honor usage rules, we wait for self.delay between requests.
     # Low-level query functions call this before every request to Wikidot./
     def _wait_request_slot(self):
-        tm = time.clock()
+        tm = time.process_time()
         if self.next_timeslot - tm > 0:
             time.sleep(self.next_timeslot - tm)
         self.next_timeslot = tm + self.delay / 1000

From bcf32407d2c8cc2e084edc1ff773631355f8997c Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 11:01:42 +0200
Subject: [PATCH 41/93] bs (appropriate name) apparently has changed its API

---
 wikidot.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/wikidot.py b/wikidot.py
index 2595bdc..bbf6fb0 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -166,7 +166,11 @@ def get_page_id(self, page_unix_name):
         req = requests.request('GET', self.site+'/'+page_unix_name + '/noredirect/true')
         soup = BeautifulSoup(req.text, 'html.parser')
         for item in soup.head.find_all('script'):
-            text = item.text
+            text = item.string
+            if text is None:
+                print("No text in script item", item)
+                continue
+
             pos = text.find("WIKIREQUEST.info.pageId = ")
             if pos >= 0:
                 pos += len("WIKIREQUEST.info.pageId = ")

From 817c4d050a701adb020543faae6a16f5ff3d7288 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 11:02:00 +0200
Subject: [PATCH 42/93] avoid double / in the URLs

---
 wikidot.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wikidot.py b/wikidot.py
index bbf6fb0..29b6c65 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -10,6 +10,10 @@
 class Wikidot:
     def __init__(self, site):
         self.site = site        # Wikidot site to query
+
+        # strip out trailing /, if it exists
+        if self.site[-1] == '/':
+            self.site = self.site[:-1]
         self.sitename = urlparse(site).hostname.lower()
         self.delay = 200        # Delay between requests in msec
         self.debug = False      # Print debug messages

From fd918ad8116d9094f18a97374765eb926b6a3d1b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 11:02:19 +0200
Subject: [PATCH 43/93] print URLs we fetch in debug output

---
 wikidot.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wikidot.py b/wikidot.py
index 29b6c65..e0b4e26 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -167,7 +167,12 @@ def list_pages(self, limit):
     def get_page_id(self, page_unix_name):
         # The only freaking way to get page ID is to load the page! Wikidot!
         self._wait_request_slot()
-        req = requests.request('GET', self.site+'/'+page_unix_name + '/noredirect/true')
+        url = self.site+'/'+page_unix_name + '/noredirect/true';
+
+        if self.debug:
+            print("fetching", url)
+
+        req = requests.request('GET', url)
         soup = BeautifulSoup(req.text, 'html.parser')
         for item in soup.head.find_all('script'):
             text = item.string

From 2fcec563645f7db1d3396b5966adda8ea6065178 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 12:12:53 +0200
Subject: [PATCH 44/93] try to have more robust fetching (longer waiting on
 errors)

---
 wikidot.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wikidot.py b/wikidot.py
index e0b4e26..828ced8 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -63,6 +63,10 @@ def queryex(self, params, urlAppend = None):
                 # Be nice, double wait delay for errors
                 self._wait_request_slot()
 
+                # Extra nice, sleep longer (expoential increase), hope for the
+                # server to recover
+                time.sleep(retries * retries * self.delay)
+
                 continue
 
             try:

From c4d907ef07a66a866b1a1d8a4d9fb1b788b51343 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 12:13:47 +0200
Subject: [PATCH 45/93] extract list of embedded images

---
 wikidot.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/wikidot.py b/wikidot.py
index 828ced8..4fba067 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -279,6 +279,29 @@ def get_revision_version(self, rev_id):
         res = self.get_revision_version_raw(rev_id) # this has title!
         soup = BeautifulSoup(res[0], 'html.parser')
 
+        images = []
+        for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}):
+            img_src = None
+            img_name = ""
+            full_link = img_div.find("a")
+            if full_link is not None:
+                # Check if it has a thumbnail, otherwise we can't trust that it is the original
+                img = full_link.find("img", attrs={"class": "enlarge"})
+                if img is not None:
+                    img_src = full_link["href"]
+                    img_name = img["alt"]
+
+            if img_src is None:
+                img = img_div.find("img")
+                if img is not None:
+                    img_src = img["src"]
+                    img_name = img["alt"]
+
+            if img_src is not None:
+                # Just in case, I don't think it ever happens
+                img_name = img_name.replace("/", "_forward_slash_")
+                images.append({"src": img_src, "filename": img_name})
+
         # First table is a flyout with revision details. Remove and study it.
         unixname = None
         details = soup.find("div", attrs={"id": "page-version-info"}).extract()
@@ -293,4 +316,5 @@ def get_revision_version(self, rev_id):
           'unixname': unixname,
           'title': res[1],
           'content': str(soup), # only content remains
+          'images': images,
         }

From 8d0a5eeafe5c8368ee1ff2b6d136d7b7217a22a4 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 14:03:01 +0200
Subject: [PATCH 46/93] fix image downloading (TODO: make it add them in the
 right commit, now it just downloads them as they appear)

---
 rmaint.py  |  18 +++++++-
 wikidot.py | 120 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 128 insertions(+), 10 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 42def44..5cb56aa 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -304,6 +304,8 @@ def commitNext(self):
 
         commit_msg = ""
 
+        added_file_paths = []
+
         if rename:
             name_rename_from = str(self.last_names[unixname])+'.txt'
 
@@ -332,12 +334,14 @@ def commitNext(self):
         # Ouput contents
         outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
         if details['title']:
-            outp.write('title:'+details['title']+'\n')
+            outp.write('title:' + details['title']+'\n')
         if parent_unixname:
             outp.write('parent:'+parent_unixname+'\n')
         outp.write(source)
         outp.close()
 
+        added_file_paths.append(str(fname))
+
         commit_msg += rev_unixname
 
         # Commit
@@ -351,12 +355,22 @@ def commitNext(self):
         else:
             commit_date = None
 
+        got_images = False;
+        for image in details['images']:
+            if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']):
+                got_images = True
+                # If we do this gitpython barfs on itself
+                #added_file_paths.append(image['filepath'])
+
+        if got_images:
+            added_file_paths.append("images")
         print("Committing: " + str(self.rev_no) + '. '+commit_msg)
 
         # Include metadata in the commit (if changed)
         self.appendFetchedRevid(rev['rev_id'])
         self.saveMetadata()
-        self.index.add([str(fname), '.metadata.json'])
+        added_file_paths.append('.metadata.json')
+        self.index.add(added_file_paths)
 
         username = str(rev['user'])
         email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename
diff --git a/wikidot.py b/wikidot.py
index 4fba067..188f9f5 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -2,7 +2,12 @@
 import random
 from bs4 import BeautifulSoup
 import time
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urljoin
+from pprint import pprint
+import pathlib
+import hashlib
+import os
+import shutil
 
 # Implements various queries to Wikidot engine through its AJAX facilities
 
@@ -20,6 +25,65 @@ def __init__(self, site):
         self.next_timeslot = time.process_time()   # Can call immediately
         self.max_retries = 5
 
+    # Downloads file if it doesn't exist
+    def maybe_download_file(self, url, file_path):
+        self._wait_request_slot()
+
+        path = pathlib.Path(file_path)
+        if path.exists():
+            if self.debug:
+                print(file_path, "exists, skipping")
+            return False
+
+        dirpath = path.resolve().relative_to(pathlib.Path.cwd()).parent
+        os.makedirs(dirpath, exist_ok=True)
+
+        if self.debug:
+            print("downloading", url, "to" ,file_path, "dirpath", dirpath)
+
+        # In case of e. g. 500 errors
+        retries = 0
+        while retries < self.max_retries:
+            self._wait_request_slot()
+
+            headers = requests.utils.default_headers()
+            # Pretty generic user-agent, but we append a unique none for us
+            # Makes wikimedia happy
+            headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
+            req = requests.get(url, stream=True, )
+
+            if req.status_code >= 500:
+                retries += 1
+                print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+
+                # In case of debug enabled, we already printed this above
+                if not self.debug:
+                    print(req)
+
+                # Be nice, double wait delay for errors
+                self._wait_request_slot()
+
+                # Extra nice, sleep longer (expoential increase), hope for the
+                # server to recover
+                time.sleep(retries * retries * self.delay)
+
+                continue
+
+            try:
+                # In case of 404 errors or other stuff that indicates
+                # some bug in how we handle or request things
+                req.raise_for_status()
+
+                req.raw.decode_content = True
+                with open(file_path, 'wb') as out_file:
+                    shutil.copyfileobj(req.raw, out_file)
+
+                return True
+            except Exception as e:
+                print('Failed to download', e, req, url)
+                raise e
+
+        return False
 
     # To honor usage rules, we wait for self.delay between requests.
     # Low-level query functions call this before every request to Wikidot./
@@ -28,6 +92,7 @@ def _wait_request_slot(self):
         if self.next_timeslot - tm > 0:
             time.sleep(self.next_timeslot - tm)
         self.next_timeslot = tm + self.delay / 1000
+
         pass
 
     # Makes a Wikidot AJAX query. Returns the response+title or throws an error.
@@ -90,7 +155,6 @@ def queryex(self, params, urlAppend = None):
     def query(self, params, urlAppend = None):
         return self.queryex(params, urlAppend)[0]
 
-
     # List all pages for the site.
 
     # Raw version
@@ -181,7 +245,7 @@ def get_page_id(self, page_unix_name):
         for item in soup.head.find_all('script'):
             text = item.string
             if text is None:
-                print("No text in script item", item)
+                #print("No text in script item", item)
                 continue
 
             pos = text.find("WIKIREQUEST.info.pageId = ")
@@ -209,17 +273,25 @@ def get_revisions_raw(self, page_id, limit):
         })
 
         soup = BeautifulSoup(res, 'html.parser')
+        print("revisions raw")
         return soup.table.contents
 
     # Client version
     def get_revisions(self, page_id, limit):
         revs = []
-        for tr in self.get_revisions_raw(page_id, limit):
+        raw = self.get_revisions_raw(page_id, limit)
+        for tr in raw:
             if tr.name != 'tr': continue # there's a header + various junk
 
             # RevID is stored as a value of an INPUT field
             rev_id = tr.input['value'] if tr.input else None
             if rev_id is None: continue # can't parse
+            attachment_action = tr.find("span", attrs={"title": "file/attachment action"})
+            attached_file = False
+            if attachment_action is not None:
+                attached_file = True
+                #pprint(raw)
+                print("was attchment", rev_id)
 
             # Unixtime is stored as a CSS class time_*
             rev_date = 0
@@ -228,6 +300,8 @@ def get_revisions(self, page_id, limit):
                 for cls in date_span['class']:
                     if cls.startswith('time_'):
                         rev_date = int(cls[5:])
+            else:
+                print("no odate found")
 
             # Username in a last <a> under <span class="printuser">
             user_span = tr.find("span", attrs={"class": "printuser"})
@@ -246,6 +320,7 @@ def get_revisions(self, page_id, limit):
                 'date': rev_date,
                 'user': rev_user,
                 'comment': rev_comment,
+                'attached_file': attached_file,
             })
         return revs
 
@@ -262,12 +337,18 @@ def get_revision_source(self, rev_id):
         # - htmlentities
         # - <br/>s in place of linebreaks
         # - random real linebreaks (have to be ignored)
+        if self.debug:
+            print("revision source:")
+            #pprint(res)
         soup = BeautifulSoup(res, 'html.parser')
         return soup.div.getText().lstrip(' \r\n')
 
     # Retrieves the rendered version + additional info unavailable in get_revision_source:
     # * Title
     # * Unixname at the time
+    #
+    # TODO: I think this could fetch the source as well, so we don't need to
+    # fetch two pages (the fetch source function above).
     def get_revision_version_raw(self, rev_id):
         res = self.queryex({
           'moduleName': 'history/PageVersionModule',
@@ -279,6 +360,8 @@ def get_revision_version(self, rev_id):
         res = self.get_revision_version_raw(rev_id) # this has title!
         soup = BeautifulSoup(res[0], 'html.parser')
 
+
+        # Extract list of images
         images = []
         for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}):
             img_src = None
@@ -297,10 +380,31 @@ def get_revision_version(self, rev_id):
                     img_src = img["src"]
                     img_name = img["alt"]
 
-            if img_src is not None:
-                # Just in case, I don't think it ever happens
-                img_name = img_name.replace("/", "_forward_slash_")
-                images.append({"src": img_src, "filename": img_name})
+            if img_src is None:
+                continue
+
+            # Just in case, I don't think it ever happens, but resolve '..'
+            # juuuust in case someone tries to be funny
+            img_url = urlparse(urljoin(img_src, "."))
+            url_path = pathlib.Path(img_url.path)
+
+            img_path = ""
+            if img_url.netloc != "":
+                img_path = img_url.netloc + "/"
+                if img_url.netloc[-1] != '/':
+                    img_path += '/'
+
+            if img_url.path != "" and img_url.path[0] == '/':
+                img_path += img_url.path[1:]
+            else:
+                img_path += img_url.path
+
+            if img_path == "" or img_path[-1] == "/":
+                img_path += img_name
+
+            images.append({"src": img_src, "filename": img_name, "filepath": "images/" + img_path})
+
+
 
         # First table is a flyout with revision details. Remove and study it.
         unixname = None

From 2eabf6b127b6d5a5b245cc9291dd19eca05e5d0a Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Tue, 14 Jul 2020 14:16:17 +0200
Subject: [PATCH 47/93] add comment explaining why we can't get the images in
 the right commit

---
 wikidot.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 188f9f5..85a0d9c 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -290,7 +290,6 @@ def get_revisions(self, page_id, limit):
             attached_file = False
             if attachment_action is not None:
                 attached_file = True
-                #pprint(raw)
                 print("was attchment", rev_id)
 
             # Unixtime is stored as a CSS class time_*
@@ -339,7 +338,6 @@ def get_revision_source(self, rev_id):
         # - random real linebreaks (have to be ignored)
         if self.debug:
             print("revision source:")
-            #pprint(res)
         soup = BeautifulSoup(res, 'html.parser')
         return soup.div.getText().lstrip(' \r\n')
 
@@ -362,6 +360,13 @@ def get_revision_version(self, rev_id):
 
 
         # Extract list of images
+
+        # TODO: to get the right revision that added them, we need to go back
+        # and amend the commits that are flagged as attached_file above,
+        # because we can't get the image file name or URL reliably until they
+        # are added to the page source, wikidot itself doesn't store this information.
+        # So much hassle for little value, we get the empty commits when images
+        # are added anyways.
         images = []
         for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}):
             img_src = None

From 8afada4ced927180d00c5cadc082202512de5b2b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Fri, 17 Jul 2020 11:23:59 +0200
Subject: [PATCH 48/93] fuck python, this suddenly didn't work on my server

---
 wikidot.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 85a0d9c..9ed7d95 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -29,13 +29,12 @@ def __init__(self, site):
     def maybe_download_file(self, url, file_path):
         self._wait_request_slot()
 
-        path = pathlib.Path(file_path)
-        if path.exists():
+        if os.path.exists(file_path):
             if self.debug:
                 print(file_path, "exists, skipping")
             return False
 
-        dirpath = path.resolve().relative_to(pathlib.Path.cwd()).parent
+        dirpath = os.path.dirname(file_path)
         os.makedirs(dirpath, exist_ok=True)
 
         if self.debug:

From 74923b193149f3c762fea6ff587b4011e731b844 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Fri, 17 Jul 2020 11:24:28 +0200
Subject: [PATCH 49/93] re-try in case of json errors, seems like they are
 spurious

---
 wikidot.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/wikidot.py b/wikidot.py
index 9ed7d95..5163701 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -137,9 +137,19 @@ def queryex(self, params, urlAppend = None):
                 # In case of 404 errors or other stuff that indicates
                 # some bug in how we handle or request things
                 req.raise_for_status()
+            except Exception as e:
+                print('Failed to get response from wikidot', e, req, url, params)
+
+            try:
                 json = req.json()
             except Exception as e:
                 print('Failed to get response from wikidot', e, req, url, params)
+                if retries < self.max_retries:
+                    retries += 1
+                    self._wait_request_slot()
+                    time.sleep(retries * retries * self.delay)
+                    continue
+
                 raise e
 
             if json['status'] == 'ok':

From 9ed8019e17f45296ad30005f908d5283a1d110fb Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 18 Jul 2020 12:30:13 +0200
Subject: [PATCH 50/93] less debug spam

---
 rmaint.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 5cb56aa..3db3084 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -171,10 +171,11 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 continue
 
             revs = self.wd.get_revisions(page_id, depth)
-            print("Revisions to fetch: "+str(len(revs)))
+            print("Revisions to fetch: " + str(len(revs)))
+            already_fetched = 0
             for rev in revs:
                 if rev['id'] in self.fetched_revids:
-                    print(rev['id'], 'already fetched')
+                    already_fetched += 1
                     continue
 
                 self.wrevs.append({
@@ -187,6 +188,8 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 })
             self.saveWRevs() # Save a cached copy
 
+            print("Revisions already fetched", already_fetched)
+
         if os.path.isfile(self.path+'/.metadata.json'):
             self.loadMetadata()
 
@@ -265,7 +268,7 @@ def commitNext(self):
 
         if rev['rev_id'] in self.fetched_revids:
             if self.debug:
-                print(rev['rev_id'], 'already fetched')
+                print(rev['rev_id'], 'already fetched, yet called on to fetch again')
 
             self.rev_no += 1
 

From 49da2cfd17d402c2e0e85e821cdacf6d6e9e11d6 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 18 Jul 2020 13:40:38 +0200
Subject: [PATCH 51/93] don't need to wait for a download slot if we're not
 downloading

---
 wikidot.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 5163701..cb6b52e 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -27,13 +27,13 @@ def __init__(self, site):
 
     # Downloads file if it doesn't exist
     def maybe_download_file(self, url, file_path):
-        self._wait_request_slot()
-
         if os.path.exists(file_path):
             if self.debug:
                 print(file_path, "exists, skipping")
             return False
 
+        self._wait_request_slot()
+
         dirpath = os.path.dirname(file_path)
         os.makedirs(dirpath, exist_ok=True)
 
@@ -345,8 +345,6 @@ def get_revision_source(self, rev_id):
         # - htmlentities
         # - <br/>s in place of linebreaks
         # - random real linebreaks (have to be ignored)
-        if self.debug:
-            print("revision source:")
         soup = BeautifulSoup(res, 'html.parser')
         return soup.div.getText().lstrip(' \r\n')
 

From e6da37708bdbb9fc937c9c2c6ec7eb7ff727cae0 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 18 Jul 2020 13:42:59 +0200
Subject: [PATCH 52/93] better use of named parameters and stuff

---
 crawl.py  | 17 ++++++++++-------
 rmaint.py | 21 ++++++++++++---------
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/crawl.py b/crawl.py
index 9fd0bab..3b9f486 100644
--- a/crawl.py
+++ b/crawl.py
@@ -15,6 +15,7 @@
 parser.add_argument('site', help='URL of Wikidot site')
 # Actions
 parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
+parser.add_argument('--max-page-count', type=int, default='10000', help='Only list/fetch up to this amount of pages')
 parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
 parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
 parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
@@ -41,17 +42,17 @@ def force_dirs(path):
     os.makedirs(path, exist_ok=True)
 
 if args.list_pages_raw:
-    print((wd.list_pages_raw(args.depth)))
+    print((wd.list_pages_raw(limit = args.max_pages_count)))
 
 elif args.list_pages:
-    for page in wd.list_pages(args.depth):
+    for page in wd.list_pages(limit = args.max_pages_count):
         print(page)
 
 elif args.source:
     if not args.page:
         raise Exception("Please specify --page for --source.")
     
-    page_id = wd.get_page_id(args.page)
+    page_id = wd.get_page_id(page_unix_name=args.page)
     if not page_id:
         raise Exception("Page not found: "+args.page)
     
@@ -62,7 +63,7 @@ def force_dirs(path):
     if not args.page:
         raise Exception("Please specify --page for --source.")
     
-    page_id = wd.get_page_id(args.page)
+    page_id = wd.get_page_id(page_unix_name=args.page)
     if not page_id:
         raise Exception("Page not found: "+args.page)
     
@@ -73,7 +74,7 @@ def force_dirs(path):
     if not args.page:
         raise Exception("Please specify --page for --log.")
 
-    page_id = wd.get_page_id(args.page)
+    page_id = wd.get_page_id(page_unix_name=args.page)
     if not page_id:
         raise Exception("Page not found: "+args.page)
 
@@ -84,7 +85,7 @@ def force_dirs(path):
     if not args.page:
         raise Exception("Please specify --page for --log.")
 
-    page_id = wd.get_page_id(args.page)
+    page_id = wd.get_page_id(page_unix_name=args.page)
     if not page_id:
         raise Exception("Page not found: "+args.page)
     for rev in wd.get_revisions(page_id, args.depth):
@@ -98,7 +99,9 @@ def force_dirs(path):
     rm = RepoMaintainer(wd, args.dump)
     rm.debug = args.debug
     rm.storeRevIds = args.revids
-    rm.buildRevisionList([args.page] if args.page else None, args.depth)
+    rm.max_depth = args.depth
+    rm.max_page_count = args.max_page_count
+    rm.buildRevisionList([args.page] if args.page else None)
     rm.openRepo()
 
     print("Downloading revisions...")
diff --git a/rmaint.py b/rmaint.py
index 3db3084..9450b0c 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -16,7 +16,7 @@
 
 # Usage:
 #   rm = RepoMaintainer(wikidot, path)
-#   rm.buildRevisionList(pages, depth)
+#   rm.buildRevisionList(pages)
 #   rm.openRepo()
 #   while rm.commitNext():
 #       pass
@@ -42,6 +42,12 @@ def __init__(self, wikidot, path):
 
         self.repo = None            # Git repo object
         self.index = None           # Git current index object
+        self.max_depth = 10000      # download at most this number of revisions
+        self.max_page_count = 10000 # download at most this number of pages
+
+        self.pbar = None
+        self.first_fetched = 0      # For progress bar
+        self.fetched_revids = set()
 
 
     #
@@ -90,12 +96,11 @@ def loadMetadata(self):
     #
     # Compiles a combined revision list for a given set of pages, or all pages on the site.
     #  pages: compile history for these pages
-    #  depth: download at most this number of revisions.
     #
     # If there exists a cached revision list at the repository destination,
     # it is loaded and no requests are made.
     #
-    def buildRevisionList(self, pages = None, depth = 10000):
+    def buildRevisionList(self, pages = None):
         if os.path.isfile(self.path+'/.wrevs'):
             print("Loading cached revision list...")
             self.loadWRevs()
@@ -121,10 +126,10 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 fp.close()
 
 
-            if not pages:
+            if not pages or len(pages) < self.max_page_count:
                 if self.debug:
                     print('Need to fetch pages')
-                pages = self.wd.list_pages(10000)
+                pages = self.wd.list_pages(self.max_page_count)
                 self.savePages(pages)
             elif self.debug:
                 print(len(pages), 'pages loaded')
@@ -170,10 +175,8 @@ def buildRevisionList(self, pages = None, depth = 10000):
                 print('Page gone?', page)
                 continue
 
-            revs = self.wd.get_revisions(page_id, depth)
-            print("Revisions to fetch: " + str(len(revs)))
-            already_fetched = 0
-            for rev in revs:
+            revs = self.wd.get_revisions(page_id=page_id, limit=max_depth)
+            for rev in tqdm(revs, desc='Adding revisions from page ' + page_id):
                 if rev['id'] in self.fetched_revids:
                     already_fetched += 1
                     continue

From c2479c959e090aa35461fc1d5a1e38c49c55d0fd Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 18 Jul 2020 13:44:12 +0200
Subject: [PATCH 53/93] add progress bars with tqdm

---
 crawl.py  |  5 ++---
 rmaint.py | 25 ++++++++++++++-----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/crawl.py b/crawl.py
index 3b9f486..b6a4f2d 100644
--- a/crawl.py
+++ b/crawl.py
@@ -104,9 +104,8 @@ def force_dirs(path):
     rm.buildRevisionList([args.page] if args.page else None)
     rm.openRepo()
 
-    print("Downloading revisions...")
-    while rm.commitNext():
-        pass
+    print("Downloading revisions")
+    rm.fetchAll()
 
     rm.cleanup()
     print("Done.")
diff --git a/rmaint.py b/rmaint.py
index 9450b0c..80d8da4 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -11,6 +11,8 @@
 import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps
 import re # For sanitizing usernames to fake email addresses
 
+from tqdm import tqdm # for progress bar
+
 # Repository builder and maintainer
 # Contains logic for actual loading and maintaining the repository over the course of its construction.
 
@@ -141,7 +143,7 @@ def buildRevisionList(self, pages = None):
 
         # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time)
         # Should use a set/hashmap/whatever python calls it
-        for wrev in self.wrevs:
+        for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'):
             page_name = wrev['page_name']
 
             if page_name in fetched_pages:
@@ -153,7 +155,7 @@ def buildRevisionList(self, pages = None):
             print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages)))
 
         fetched = 0
-        for page in pages:
+        for page in tqdm(pages, desc='Updating list of revisions to fetch'):
             if page in fetched_pages:
                 continue
 
@@ -178,12 +180,11 @@ def buildRevisionList(self, pages = None):
             revs = self.wd.get_revisions(page_id=page_id, limit=max_depth)
             for rev in tqdm(revs, desc='Adding revisions from page ' + page_id):
                 if rev['id'] in self.fetched_revids:
-                    already_fetched += 1
                     continue
 
                 self.wrevs.append({
                   'page_id' : page_id,
-                  'page_name' : page, # name atm, not at revision time
+                  'page_name' : page, # current name, not at revision time (revisions can rename them)
                   'rev_id' : rev['id'],
                   'date' : rev['date'],
                   'user' : rev['user'],
@@ -191,7 +192,7 @@ def buildRevisionList(self, pages = None):
                 })
             self.saveWRevs() # Save a cached copy
 
-            print("Revisions already fetched", already_fetched)
+            print("Number of revisions already fetched", len(revs) - len(self.wrevs))
 
         if os.path.isfile(self.path+'/.metadata.json'):
             self.loadMetadata()
@@ -263,16 +264,11 @@ def openRepo(self):
     # Takes an unprocessed revision from a revision log, fetches its data and commits it.
     # Returns false if no unprocessed revisions remain.
     #
-    def commitNext(self):
+    def commitNext(self, rev):
         if self.rev_no >= len(self.wrevs):
             return False
 
-        rev = self.wrevs[self.rev_no]
-
         if rev['rev_id'] in self.fetched_revids:
-            if self.debug:
-                print(rev['rev_id'], 'already fetched, yet called on to fetch again')
-
             self.rev_no += 1
 
             self.saveState() # Update operation state
@@ -394,6 +390,13 @@ def commitNext(self):
 
         return True
 
+    def fetchAll(self):
+        to_fetch = []
+        for rev in tqdm(self.wrevs, desc='Creating list of revisions to fetch'):
+            if rev['rev_id'] not in self.fetched_revids:
+                to_fetch.append(rev)
+        for rev in tqdm(to_fetch, desc='Downloading'):
+            self.commitNext(rev)
 
     #
     # Updates all children of the page to reflect parent's unixname change.

From 511c6eb800b935262f890d7364679f4a5ad00bfb Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 18 Jul 2020 13:44:22 +0200
Subject: [PATCH 54/93] remove unused

---
 rmaint.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index 80d8da4..8e98335 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -36,7 +36,6 @@ def __init__(self, wikidot, path):
 
         # Internal state
         self.wrevs = None           # Compiled wikidot revision list (history)
-        self.fetcheds_revids = []   # Compiled wikidot revision list (history)
 
         self.rev_no = 0             # Next revision to process
         self.last_names = {}        # Tracks page renames: name atm -> last name in repo

From 9c43ea7ab4055c0fd7b739ccafbf14e95e44f426 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sat, 18 Jul 2020 13:44:41 +0200
Subject: [PATCH 55/93] don't be dumb, use sets, massive speedup

---
 rmaint.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 8e98335..3a9ba4a 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -75,7 +75,7 @@ def appendFetchedRevid(self, revid):
         fp.close()
 
     def loadFetchedRevids(self):
-        self.fetched_revids = [line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')]
+        self.fetched_revids = set([line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')])
 
     # Persistent metadata about the repo:
     #  - Tracks page renames: name atm -> last name in repo
@@ -112,9 +112,9 @@ def buildRevisionList(self, pages = None):
 
         if os.path.isfile(self.path+'/.fetched.txt'):
             self.loadFetchedRevids()
-            print(self.fetched_revids)
+            print(len(self.fetched_revids), 'revisions already fetched')
         else:
-            self.fetched_revids = []
+            self.fetched_revids = set()
 
         if self.debug:
             print("Building revision list...")
@@ -135,11 +135,7 @@ def buildRevisionList(self, pages = None):
             elif self.debug:
                 print(len(pages), 'pages loaded')
 
-        fetched_pages = []
-
-        if self.debug:
-            print('Collecting already pages we already got revisions for')
-
+        fetched_pages = set()
         # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time)
         # Should use a set/hashmap/whatever python calls it
         for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'):
@@ -148,7 +144,7 @@ def buildRevisionList(self, pages = None):
             if page_name in fetched_pages:
                 continue
 
-            fetched_pages.append(page_name)
+            fetched_pages.add(page_name)
 
         if self.debug:
             print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages)))
@@ -382,7 +378,7 @@ def commitNext(self, rev):
         if self.debug:
             print('Committed', commit.name_rev, 'by', author)
 
-        self.fetched_revids.append(rev['rev_id'])
+        self.fetched_revids.add(rev['rev_id'])
 
         self.rev_no += 1
         self.saveState() # Update operation state

From e2a763b61de1872d8998ab5e8fccc13c67035ae4 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 09:37:12 +0200
Subject: [PATCH 56/93] fix default argument

---
 crawl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl.py b/crawl.py
index b6a4f2d..d90c0ca 100644
--- a/crawl.py
+++ b/crawl.py
@@ -26,7 +26,7 @@
 # Action settings
 parser.add_argument('--page', type=str, help='Query only this page')
 parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
-parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository')
+parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True)
 # Common settings
 parser.add_argument('--debug', action='store_true', help='Print debug info')
 parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')

From 5cc5fdbde58e320dc3376380dff3cdd7961c93c5 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 09:37:36 +0200
Subject: [PATCH 57/93] fix status output

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index 3a9ba4a..f7b0bfc 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -187,7 +187,7 @@ def buildRevisionList(self, pages = None):
                 })
             self.saveWRevs() # Save a cached copy
 
-            print("Number of revisions already fetched", len(revs) - len(self.wrevs))
+        print("Number of revisions already fetched", len(revs) - len(self.wrevs))
 
         if os.path.isfile(self.path+'/.metadata.json'):
             self.loadMetadata()

From 674bddac96a62e95b48666ecb117c572cb996b87 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 09:37:50 +0200
Subject: [PATCH 58/93] fix check for existing repo

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index f7b0bfc..a20db9b 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -236,7 +236,7 @@ def openRepo(self):
         self.last_names = {} # Tracks page renames: name atm -> last name in repo
         self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
 
-        if os.path.isfile(self.path+'/.git'):
+        if os.path.isdir(self.path+'/.git'):
             print("Continuing from aborted dump state...")
             self.loadState()
             self.repo = Repo(self.path)

From 2f1a9f6c560984bc8703577a9e0ac925ff0063e3 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 09:38:03 +0200
Subject: [PATCH 59/93] add todo

---
 readme.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/readme.md b/readme.md
index c13d967..e59e568 100644
--- a/readme.md
+++ b/readme.md
@@ -39,3 +39,8 @@ The descriptions for on-site modules are heavily correlated with AJAX ones:
 Someone else did Wikidot AJAX:
 
 * https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
+
+
+#### TODO
+
+Handle deleted images.

From 858b7ed223a6254e7bf1eb359f79cee832ac054f Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 09:38:39 +0200
Subject: [PATCH 60/93] 404 for images is not fatal

---
 wikidot.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/wikidot.py b/wikidot.py
index cb6b52e..d36e85d 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -51,6 +51,9 @@ def maybe_download_file(self, url, file_path):
             headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
             req = requests.get(url, stream=True, )
 
+            if req.status_code == 404:
+                return False
+
             if req.status_code >= 500:
                 retries += 1
                 print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))

From 29e17918b4335a22e04104f2775c65ff7239271b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 10:08:28 +0200
Subject: [PATCH 61/93] fix

---
 rmaint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index a20db9b..d0663b4 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -162,7 +162,6 @@ def buildRevisionList(self, pages = None):
 
             if self.debug:
                 print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages)))
-            fetched += 1
             page_id = self.wd.get_page_id(page)
 
             if self.debug:
@@ -174,6 +173,7 @@ def buildRevisionList(self, pages = None):
 
             revs = self.wd.get_revisions(page_id=page_id, limit=max_depth)
             for rev in tqdm(revs, desc='Adding revisions from page ' + page_id):
+                fetched += 1
                 if rev['id'] in self.fetched_revids:
                     continue
 
@@ -187,7 +187,7 @@ def buildRevisionList(self, pages = None):
                 })
             self.saveWRevs() # Save a cached copy
 
-        print("Number of revisions already fetched", len(revs) - len(self.wrevs))
+        print("Number of revisions already fetched", len(self.fetched_revids), len(self.wrevs))
 
         if os.path.isfile(self.path+'/.metadata.json'):
             self.loadMetadata()

From d24acca2bb0e014e70106312eb844cd7eb273e5b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 10:08:52 +0200
Subject: [PATCH 62/93] fix relative path

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index d0663b4..346adc3 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -249,7 +249,7 @@ def openRepo(self):
 
             if self.storeRevIds:
                 # Add revision id file to the new repo
-                fname = '/.revid'
+                fname = self.path + '/.revid'
                 codecs.open(self.path + fname, "w", "UTF-8").close()
                 self.repo.index.add([fname])
                 self.index.commit("Initial creation of repo")

From 938bb680421b2a0070381c3d475655b0b50d1891 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 10:09:16 +0200
Subject: [PATCH 63/93] add timeouts

---
 wikidot.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index d36e85d..7d57432 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -49,7 +49,7 @@ def maybe_download_file(self, url, file_path):
             # Pretty generic user-agent, but we append a unique none for us
             # Makes wikimedia happy
             headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
-            req = requests.get(url, stream=True, )
+            req = requests.get(url, stream=True, timeout=30)
 
             if req.status_code == 404:
                 return False
@@ -116,7 +116,7 @@ def queryex(self, params, urlAppend = None):
         while retries < self.max_retries:
             self._wait_request_slot()
 
-            req = requests.request('POST', url, data=params, cookies=cookies)
+            req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
 
             # Usually a 502 error, recovers immediately
             if req.status_code >= 500:
@@ -252,7 +252,7 @@ def get_page_id(self, page_unix_name):
         if self.debug:
             print("fetching", url)
 
-        req = requests.request('GET', url)
+        req = requests.request('GET', url, timeout=30)
         soup = BeautifulSoup(req.text, 'html.parser')
         for item in soup.head.find_all('script'):
             text = item.string

From e0d9e4b0df36f46c04d4ae13a5fa581e8803541b Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 10:09:39 +0200
Subject: [PATCH 64/93] python's time.sleep is in seconds, not milliseconds

---
 wikidot.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 7d57432..15d390b 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -67,7 +67,7 @@ def maybe_download_file(self, url, file_path):
 
                 # Extra nice, sleep longer (expoential increase), hope for the
                 # server to recover
-                time.sleep(retries * retries * self.delay)
+                time.sleep(retries * retries * self.delay / 1000)
 
                 continue
 
@@ -114,6 +114,9 @@ def queryex(self, params, urlAppend = None):
         # In case of e. g. 500 errors
         retries = 0
         while retries < self.max_retries:
+            if retries > 0:
+                print("retry", retries, "of", self.max_retries)
+
             self._wait_request_slot()
 
             req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
@@ -132,7 +135,7 @@ def queryex(self, params, urlAppend = None):
 
                 # Extra nice, sleep longer (expoential increase), hope for the
                 # server to recover
-                time.sleep(retries * retries * self.delay)
+                time.sleep(retries * retries * self.delay / 1000)
 
                 continue
 
@@ -149,14 +152,20 @@ def queryex(self, params, urlAppend = None):
                 print('Failed to get response from wikidot', e, req, url, params)
                 if retries < self.max_retries:
                     retries += 1
-                    self._wait_request_slot()
-                    time.sleep(retries * retries * self.delay)
+                    #self._wait_request_slot()
+                    time.sleep(retries * retries * self.delay / 1000)
                     continue
 
                 raise e
 
             if json['status'] == 'ok':
                 return json['body'], (json['title'] if 'title' in json else '')
+            elif retries < self.max_retries:
+                print("error in response", json)
+                retries += 1
+                print("sleeping for", retries * retries * self.delay);
+                #self._wait_request_slot()
+                time.sleep(retries * retries * self.delay / 1000)
             else:
                 raise Exception(req.text)
 

From b01c253bf625d9aa4dbeb5c5eb01e327cb8395ef Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 10:10:08 +0200
Subject: [PATCH 65/93] support for skipping select revisions

---
 crawl.py  | 4 ++++
 rmaint.py | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/crawl.py b/crawl.py
index d90c0ca..367dbf0 100644
--- a/crawl.py
+++ b/crawl.py
@@ -27,6 +27,7 @@
 parser.add_argument('--page', type=str, help='Query only this page')
 parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
 parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True)
+parser.add_argument('--skip', type=str, help='Skip the specified revision')
 # Common settings
 parser.add_argument('--debug', action='store_true', help='Print debug info')
 parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
@@ -104,6 +105,9 @@ def force_dirs(path):
     rm.buildRevisionList([args.page] if args.page else None)
     rm.openRepo()
 
+    if args.skip:
+        rm.revs_to_skip = [args.skip]
+
     print("Downloading revisions")
     rm.fetchAll()
 
diff --git a/rmaint.py b/rmaint.py
index 346adc3..8a568a8 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -50,6 +50,8 @@ def __init__(self, wikidot, path):
         self.first_fetched = 0      # For progress bar
         self.fetched_revids = set()
 
+        self.revs_to_skip = []
+
 
     #
     # Saves and loads revision list from file
@@ -269,6 +271,10 @@ def commitNext(self, rev):
             self.saveState() # Update operation state
             return True
 
+        if rev['rev_id'] in self.revs_to_skip:
+            print("Skipping", rev)
+            return True
+
         source = self.wd.get_revision_source(rev['rev_id'])
         # Page title and unix_name changes are only available through another request:
         details = self.wd.get_revision_version(rev['rev_id'])

From b52dc93b77097a8c2134cd2e559cb5356e6db6b3 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:08:57 +0200
Subject: [PATCH 66/93] time how long a download takes, remove invalid images
 (usually 404 errors with wrong status in return header)

---
 wikidot.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 15d390b..a53384d 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -8,6 +8,8 @@
 import hashlib
 import os
 import shutil
+import imghdr
+from timeit import default_timer as timer
 
 # Implements various queries to Wikidot engine through its AJAX facilities
 
@@ -22,23 +24,23 @@ def __init__(self, site):
         self.sitename = urlparse(site).hostname.lower()
         self.delay = 200        # Delay between requests in msec
         self.debug = False      # Print debug messages
-        self.next_timeslot = time.process_time()   # Can call immediately
+        self.next_timeslot = timer()   # Can call immediately
         self.max_retries = 5
 
     # Downloads file if it doesn't exist
     def maybe_download_file(self, url, file_path):
         if os.path.exists(file_path):
             if self.debug:
-                print(file_path, "exists, skipping")
+                print(" - ", file_path, "exists, skipping")
             return False
 
-        self._wait_request_slot()
+        #self._wait_request_slot()
 
         dirpath = os.path.dirname(file_path)
         os.makedirs(dirpath, exist_ok=True)
 
         if self.debug:
-            print("downloading", url, "to" ,file_path, "dirpath", dirpath)
+            print(" < downloading", url, "to" ,file_path, "dirpath", dirpath)
 
         # In case of e. g. 500 errors
         retries = 0
@@ -49,6 +51,7 @@ def maybe_download_file(self, url, file_path):
             # Pretty generic user-agent, but we append a unique none for us
             # Makes wikimedia happy
             headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
+            start = timer()
             req = requests.get(url, stream=True, timeout=30)
 
             if req.status_code == 404:
@@ -56,11 +59,11 @@ def maybe_download_file(self, url, file_path):
 
             if req.status_code >= 500:
                 retries += 1
-                print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+                print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
 
                 # In case of debug enabled, we already printed this above
                 if not self.debug:
-                    print(req)
+                    print(' - ', req)
 
                 # Be nice, double wait delay for errors
                 self._wait_request_slot()
@@ -80,9 +83,18 @@ def maybe_download_file(self, url, file_path):
                 with open(file_path, 'wb') as out_file:
                     shutil.copyfileobj(req.raw, out_file)
 
+                if imghdr.what(file_path) is None:
+                    print('Downloaded invalid image', url)
+                    os.remove(file_path)
+                    return False
+
+
+                if self.debug:
+                    print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2))
+
                 return True
             except Exception as e:
-                print('Failed to download', e, req, url)
+                print(' ! Failed to download', e, req, url)
                 raise e
 
         return False

From bae36607c054677c9bdfc3a44536d86de4a23344 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:09:07 +0200
Subject: [PATCH 67/93] add tags to todo

---
 readme.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index e59e568..354a887 100644
--- a/readme.md
+++ b/readme.md
@@ -43,4 +43,6 @@ Someone else did Wikidot AJAX:
 
 #### TODO
 
-Handle deleted images.
+ - Handle deleted images. Probably need to check the diff and check all pages for references if removed from one page.
+ - Handle tags (both added and removed).
+

From 4dc15fdafe926fe06186d17ad4323749e734b3d5 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:10:10 +0200
Subject: [PATCH 68/93] improve debug output

---
 rmaint.py  |  5 +++++
 wikidot.py | 50 +++++++++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 8a568a8..2d32713 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -295,6 +295,8 @@ def commitNext(self, rev):
         if rev['comment'].startswith('Parent page set to: "'):
             # This is a parenting revision, remember the new parent
             parent_unixname = rev['comment'][21:-2]
+            if self.debug:
+                print('Parent changed', parent_unixname)
             self.last_parents[unixname] = parent_unixname
         else:
             # Else use last parent_unixname we've recorded
@@ -409,6 +411,9 @@ def fetchAll(self):
     # Therefore, on every rename we must update all linked children in the same revision.
     #
     def updateChildren(self, oldunixname, newunixname):
+        if self.debug:
+            print('Updating parents for', oldunixname, newunixname)
+
         for child in list(self.last_parents.keys()):
             if self.last_parents[child] == oldunixname:
                 self.updateParentField(child, self.last_parents[child], newunixname)
diff --git a/wikidot.py b/wikidot.py
index a53384d..32e3ace 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -116,8 +116,8 @@ def queryex(self, params, urlAppend = None):
         params['wikidot_token7'] = token
 
         if self.debug:
-            print(params)
-            print(cookies)
+            print(' - ', params)
+            print(' - ', cookies)
 
         url = self.site+'/ajax-module-connector.php'
         if urlAppend is not None:
@@ -127,16 +127,20 @@ def queryex(self, params, urlAppend = None):
         retries = 0
         while retries < self.max_retries:
             if retries > 0:
-                print("retry", retries, "of", self.max_retries)
+                print(" ! retry", retries, "of", self.max_retries)
 
             self._wait_request_slot()
 
+            start = timer()
             req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
 
+            if self.debug:
+                print(' * ajax request completed in', round(timer() - start, 2))
+
             # Usually a 502 error, recovers immediately
             if req.status_code >= 500:
                 retries += 1
-                print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+                print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
 
                 # In case of debug enabled, we already printed this above
                 if not self.debug:
@@ -156,12 +160,12 @@ def queryex(self, params, urlAppend = None):
                 # some bug in how we handle or request things
                 req.raise_for_status()
             except Exception as e:
-                print('Failed to get response from wikidot', e, req, url, params)
+                print(' ! Failed to get response from wikidot', e, req, url, params)
 
             try:
                 json = req.json()
             except Exception as e:
-                print('Failed to get response from wikidot', e, req, url, params)
+                print(' ! Failed to get response from wikidot', e, req, url, params)
                 if retries < self.max_retries:
                     retries += 1
                     #self._wait_request_slot()
@@ -171,17 +175,18 @@ def queryex(self, params, urlAppend = None):
                 raise e
 
             if json['status'] == 'ok':
+
                 return json['body'], (json['title'] if 'title' in json else '')
             elif retries < self.max_retries:
-                print("error in response", json)
+                print(" ! error in response", json)
                 retries += 1
-                print("sleeping for", retries * retries * self.delay);
+                print(" ! sleeping for", retries * retries * self.delay);
                 #self._wait_request_slot()
                 time.sleep(retries * retries * self.delay / 1000)
             else:
                 raise Exception(req.text)
 
-        print('Failed too many times', url, params, cookies)
+        print(' ! Failed too many times', url, params, cookies)
         raise Exception('Failed too many times for ' + url)
 
     # Same but only returns the body, most responses don't have titles
@@ -219,11 +224,11 @@ def list_pages(self, limit):
                 pages.append(entry)
 
             if self.debug:
-                print('Pages found:', len(pages))
+                print(' - Pages found:', len(pages))
 
             targets = soup.find_all('span','target')
             if len(targets) < 2:
-                print("Unable to find next listing page, not enough target spans")
+                print(" ! Unable to find next listing page, not enough target spans")
                 break
 
             next_url = targets[-1].a.get('href').split('/')
@@ -231,10 +236,10 @@ def list_pages(self, limit):
                 next_page = int(next_url[-1])
 
                 if self.debug:
-                    print('Next listing page', next_page)
+                    print(' - Next listing page', next_page)
 
             else:
-                print("invalid next url", next_url)
+                print(" ! invalid next url", next_url)
                 break
 
             #next_page = int(targets[0].a.text)
@@ -244,20 +249,20 @@ def list_pages(self, limit):
                 current_page = int(current_spans[0].text)
 
                 if self.debug:
-                    print('Current listing page', current_page)
+                    print(' - Current listing page', current_page)
 
             else:
-                print("unable to find current page")
+                print(" ! unable to find current page")
                 break;
 
             if next_page != offset + 1:
                 if self.debug:
-                    print('Next page is wrong', next_page, 'hopefully at the end')
+                    print(' ! Next page is wrong', next_page, 'hopefully at the end')
                 break
 
             offset += 1
 
-            print("Fetching listing page", offset)
+            print(" - Fetching listing page", offset)
 
         return pages
 
@@ -271,9 +276,13 @@ def get_page_id(self, page_unix_name):
         url = self.site+'/'+page_unix_name + '/noredirect/true';
 
         if self.debug:
-            print("fetching", url)
+            print(" > fetching", url)
 
+        start = timer()
         req = requests.request('GET', url, timeout=30)
+        if self.debug:
+            print(' * page id request completed in', round(timer() - start, 2))
+
         soup = BeautifulSoup(req.text, 'html.parser')
         for item in soup.head.find_all('script'):
             text = item.string
@@ -306,7 +315,6 @@ def get_revisions_raw(self, page_id, limit):
         })
 
         soup = BeautifulSoup(res, 'html.parser')
-        print("revisions raw")
         return soup.table.contents
 
     # Client version
@@ -323,7 +331,7 @@ def get_revisions(self, page_id, limit):
             attached_file = False
             if attachment_action is not None:
                 attached_file = True
-                print("was attchment", rev_id)
+                print(" - was attchment", rev_id)
 
             # Unixtime is stored as a CSS class time_*
             rev_date = 0
@@ -333,7 +341,7 @@ def get_revisions(self, page_id, limit):
                     if cls.startswith('time_'):
                         rev_date = int(cls[5:])
             else:
-                print("no odate found")
+                print(" ! no odate found")
 
             # Username in a last <a> under <span class="printuser">
             user_span = tr.find("span", attrs={"class": "printuser"})

From 251b7068afa0345ca8ae1279b830cbb5604734f1 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:10:36 +0200
Subject: [PATCH 69/93] skip updating parent history if not actually changed

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index 2d32713..cd69f7a 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -415,7 +415,7 @@ def updateChildren(self, oldunixname, newunixname):
             print('Updating parents for', oldunixname, newunixname)
 
         for child in list(self.last_parents.keys()):
-            if self.last_parents[child] == oldunixname:
+            if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname:
                 self.updateParentField(child, self.last_parents[child], newunixname)
 
     #

From 4ec2b3bd67de91c47bfe659853839ce1d8aae84d Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:20:55 +0200
Subject: [PATCH 70/93] mention added images in commit message

---
 rmaint.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/rmaint.py b/rmaint.py
index cd69f7a..493006a 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -361,12 +361,19 @@ def commitNext(self, rev):
             commit_date = None
 
         got_images = False;
+
+        # Add some spacing in the commit message
+        if len(details['images']) > 0:
+            commit_msg += '\n'
+
         for image in details['images']:
             if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']):
+                commit_msg += '\nAdded image: ' + image['src']
                 got_images = True
                 # If we do this gitpython barfs on itself
                 #added_file_paths.append(image['filepath'])
 
+
         if got_images:
             added_file_paths.append("images")
         print("Committing: " + str(self.rev_no) + '. '+commit_msg)

From 22a3f1ed36e71e7c51920fe4061f272b9c95d33f Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:32:51 +0200
Subject: [PATCH 71/93] added some dependencies

---
 readme.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/readme.md b/readme.md
index 354a887..641a570 100644
--- a/readme.md
+++ b/readme.md
@@ -16,6 +16,8 @@ At least:
 * Python 3
 * python-beautifulsoup4
 * python-gitpython
+* python-requests
+* python-tqdm
 
 ##### Examples:
 

From d38d51414d7afcdc5d9aff98204a9b2bd3d3c5fb Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 11:40:58 +0200
Subject: [PATCH 72/93] avoid retrying images that we know are invalid (i. e.
 not temporary download failures)

---
 rmaint.py  | 16 ++++++++++++++++
 wikidot.py |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/rmaint.py b/rmaint.py
index 493006a..ae89248 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -79,6 +79,19 @@ def appendFetchedRevid(self, revid):
     def loadFetchedRevids(self):
         self.fetched_revids = set([line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')])
 
+    def saveFailedImages(self):
+        file_path = self.path + '/.failed-images.txt'
+        fp = open(file_path, 'w')
+        for failed in self.wd.failed_images:
+            fp.write(failed + '\n')
+        fp.close()
+
+    def loadFailedImages(self):
+        file_path = self.path + '/.failed-images.txt'
+        if not os.path.isfile(file_path):
+            return
+        self.self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')])
+
     # Persistent metadata about the repo:
     #  - Tracks page renames: name atm -> last name in repo
     #  - Tracks page parent names: name atm -> last parent in repo
@@ -237,6 +250,7 @@ def openRepo(self):
         # Create a new repository or continue from aborted dump
         self.last_names = {} # Tracks page renames: name atm -> last name in repo
         self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
+        self.loadFailedImages()
 
         if os.path.isdir(self.path+'/.git'):
             print("Continuing from aborted dump state...")
@@ -372,6 +386,8 @@ def commitNext(self, rev):
                 got_images = True
                 # If we do this gitpython barfs on itself
                 #added_file_paths.append(image['filepath'])
+            else:
+                self.saveFailedImages()
 
 
         if got_images:
diff --git a/wikidot.py b/wikidot.py
index 32e3ace..6b8e2f6 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -26,9 +26,15 @@ def __init__(self, site):
         self.debug = False      # Print debug messages
         self.next_timeslot = timer()   # Can call immediately
         self.max_retries = 5
+        self.failed_images = set()
 
     # Downloads file if it doesn't exist
     def maybe_download_file(self, url, file_path):
+        if url in self.failed_images:
+            if self.debug:
+                print(" ! ", url, "already failed, skipping")
+            return False
+
         if os.path.exists(file_path):
             if self.debug:
                 print(" - ", file_path, "exists, skipping")
@@ -55,6 +61,7 @@ def maybe_download_file(self, url, file_path):
             req = requests.get(url, stream=True, timeout=30)
 
             if req.status_code == 404:
+                self.failed_images.add(url)
                 return False
 
             if req.status_code >= 500:
@@ -86,6 +93,7 @@ def maybe_download_file(self, url, file_path):
                 if imghdr.what(file_path) is None:
                     print('Downloaded invalid image', url)
                     os.remove(file_path)
+                    self.failed_images.add(url)
                     return False
 
 

From 1b3f608c45c9f57dd045cf95e75763a1e94301e7 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:04:06 +0200
Subject: [PATCH 73/93] implement tag handling, not tested

---
 rmaint.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/rmaint.py b/rmaint.py
index ae89248..18a594d 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -315,6 +315,11 @@ def commitNext(self, rev):
         else:
             # Else use last parent_unixname we've recorded
             parent_unixname =  self.last_parents[unixname] if unixname in self.last_parents else None
+
+        ## TODO: test
+        #if rev['comment'].startswith('Removed tags: ') or rev['comment'].startswith('Added tags: '):
+        #    self.updateTags(rev['comment'], rev_unixname)
+
         # There are also problems when parent page gets renamed -- see updateChildren
 
         # If the page is tracked and its name just changed, tell Git
@@ -441,6 +446,48 @@ def updateChildren(self, oldunixname, newunixname):
             if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname:
                 self.updateParentField(child, self.last_parents[child], newunixname)
 
+    def updateTags(self, comment, unixname):
+        file_name = self.path+'/'+unixname+'.txt'
+        removed = []
+        removed_match = re.search(pattern = r'Removed tags: ([^.]+,?)\.')
+        if removed_match is not None:
+            removed = removed_match.group(1).split(', ')
+
+        tags = []
+
+        with codecs.open(file_name, "r", "UTF-8") as f:
+            content = f.readlines()
+
+        tagsline = None
+        for line in content:
+            if line.startswith('tags:'):
+                tagsline = line
+                break
+
+        # Father forgive me for the indentation depth
+        idx = -1
+        if tagsline is not None:
+            idx = content.index(tagsline)
+            for tag in tagsline.split(','):
+                if not tag in removed:
+                    tags.append(tag)
+
+
+        added_match = re.search(pattern = r'Added tags: ([^.]+,?)\.')
+        if added_match is not None:
+            tags += added_match.group(1).split(', ')
+
+        tags.sort()
+
+        newtagsline = 'tags:' + ','.join(tags) + '\n'
+        if idx != -1:
+            contents[idx] = newtagsline
+        else:
+            contents = newtagsline + contents
+
+        with codecs.open(file_name, "w", "UTF-8") as f:
+            f.writelines(content)
+
     #
     # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
     # The rest of the file is preserved.

From 26e8977ca29cf99b9a7326fa4903edb62e3a0686 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:05:02 +0200
Subject: [PATCH 74/93] bump default delay, be nice

---
 wikidot.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wikidot.py b/wikidot.py
index 6b8e2f6..bc26d51 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -22,7 +22,7 @@ def __init__(self, site):
         if self.site[-1] == '/':
             self.site = self.site[:-1]
         self.sitename = urlparse(site).hostname.lower()
-        self.delay = 200        # Delay between requests in msec
+        self.delay = 1000        # Delay between requests in msec
         self.debug = False      # Print debug messages
         self.next_timeslot = timer()   # Can call immediately
         self.max_retries = 5

From b991fe7cc001f09dd93ba64968337c003376e5ed Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:13:08 +0200
Subject: [PATCH 75/93] doh: go back to time.process_time() for next_timeslot

---
 wikidot.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wikidot.py b/wikidot.py
index bc26d51..bf06dd2 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -24,7 +24,7 @@ def __init__(self, site):
         self.sitename = urlparse(site).hostname.lower()
         self.delay = 1000        # Delay between requests in msec
         self.debug = False      # Print debug messages
-        self.next_timeslot = timer()   # Can call immediately
+        self.next_timeslot = time.process_time()   # Can call immediately
         self.max_retries = 5
         self.failed_images = set()
 

From c8c8ed879e93bddaeeae95f6e093526b68c557e8 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:24:15 +0200
Subject: [PATCH 76/93] fix typo: self.self.wd -> self.wd in loadFailedImages

---
 rmaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index 18a594d..1f2eb2d 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -90,7 +90,7 @@ def loadFailedImages(self):
         file_path = self.path + '/.failed-images.txt'
         if not os.path.isfile(file_path):
             return
-        self.self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')])
+        self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')])
 
     # Persistent metadata about the repo:
     #  - Tracks page renames: name atm -> last name in repo

From 44a5fc11aa2295ab089bb1a5e6609aaf65d00e94 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:34:04 +0200
Subject: [PATCH 77/93] handle timeouts

---
 wikidot.py | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index bf06dd2..0f5c777 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -58,7 +58,15 @@ def maybe_download_file(self, url, file_path):
             # Makes wikimedia happy
             headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
             start = timer()
-            req = requests.get(url, stream=True, timeout=30)
+
+            try:
+                req = requests.get(url, stream=True, timeout=30)
+            except requests.exceptions.ReadTimeout:
+                print('request timed out!')
+
+                retries += 1
+                time.sleep(retries * retries * self.delay / 1000)
+                continue
 
             if req.status_code == 404:
                 self.failed_images.add(url)
@@ -140,7 +148,13 @@ def queryex(self, params, urlAppend = None):
             self._wait_request_slot()
 
             start = timer()
-            req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
+            try:
+                req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
+            except requests.exceptions.ReadTimeout:
+                print('request timed out!')
+                retries += 1
+                time.sleep(retries * retries * self.delay / 1000)
+                continue
 
             if self.debug:
                 print(' * ajax request completed in', round(timer() - start, 2))
@@ -287,7 +301,26 @@ def get_page_id(self, page_unix_name):
             print(" > fetching", url)
 
         start = timer()
-        req = requests.request('GET', url, timeout=30)
+        retries = 0
+        req = None
+        while retries < self.max_retries:
+            try:
+                req = requests.request('GET', url, timeout=30)
+            except requests.exceptions.ReadTimeout:
+                print('request timed out!')
+                retries += 1
+                time.sleep(retries * retries * self.delay / 1000)
+                continue
+
+            if req.status_code >= 500:
+                print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+                retries += 1
+                time.sleep(retries * retries * self.delay / 1000)
+                continue
+
+            req.raise_for_status()
+            break
+
         if self.debug:
             print(' * page id request completed in', round(timer() - start, 2))
 

From 404a1e4cc4b11085f54945cb53ac2bb5a903e013 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:34:38 +0200
Subject: [PATCH 78/93] make some errors that should be fatal fatal

---
 wikidot.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 0f5c777..3c29b7a 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -113,7 +113,7 @@ def maybe_download_file(self, url, file_path):
                 print(' ! Failed to download', e, req, url)
                 raise e
 
-        return False
+        raise Exception('Failed too many times for', url)
 
     # To honor usage rules, we wait for self.delay between requests.
     # Low-level query functions call this before every request to Wikidot./
@@ -339,7 +339,8 @@ def get_page_id(self, page_unix_name):
                     return int(text[pos:crlf])
                 else:
                     return int(text[pos:])
-        return None
+
+        raise Exception('Failed to get page_id for ' + page_unix_name)
 
 
     # Retrieves a list of revisions for a page.
@@ -500,6 +501,9 @@ def get_revision_version(self, rev_id):
             if tds[0].getText().strip() == 'Page name:':
                 unixname = tds[1].getText().strip()
 
+        if unixname is None:
+            raise Exception('Failed to find unixname for ' + rev_id)
+
         return {
           'rev_id': rev_id,
           'unixname': unixname,

From e0b27c39345fb1f97202f5211d2dd4e0d7dc00f3 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:34:58 +0200
Subject: [PATCH 79/93] avoid such long delays; it usually recovers immediately

---
 wikidot.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 3c29b7a..2283d38 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -73,20 +73,13 @@ def maybe_download_file(self, url, file_path):
                 return False
 
             if req.status_code >= 500:
-                retries += 1
                 print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
-
                 # In case of debug enabled, we already printed this above
                 if not self.debug:
                     print(' - ', req)
 
-                # Be nice, double wait delay for errors
-                self._wait_request_slot()
-
-                # Extra nice, sleep longer (expoential increase), hope for the
-                # server to recover
+                retries += 1
                 time.sleep(retries * retries * self.delay / 1000)
-
                 continue
 
             try:

From cf88384b57707279fe06c18603ca56fedf294dd5 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 12:35:05 +0200
Subject: [PATCH 80/93] simplify

---
 wikidot.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 2283d38..c6f047e 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -190,16 +190,13 @@ def queryex(self, params, urlAppend = None):
                 raise e
 
             if json['status'] == 'ok':
-
                 return json['body'], (json['title'] if 'title' in json else '')
-            elif retries < self.max_retries:
+            else:
                 print(" ! error in response", json)
+
                 retries += 1
-                print(" ! sleeping for", retries * retries * self.delay);
-                #self._wait_request_slot()
                 time.sleep(retries * retries * self.delay / 1000)
-            else:
-                raise Exception(req.text)
+                continue
 
         print(' ! Failed too many times', url, params, cookies)
         raise Exception('Failed too many times for ' + url)
@@ -432,7 +429,6 @@ def get_revision_version(self, rev_id):
         res = self.get_revision_version_raw(rev_id) # this has title!
         soup = BeautifulSoup(res[0], 'html.parser')
 
-
         # Extract list of images
 
         # TODO: to get the right revision that added them, we need to go back

From c9b7f536c05c48a4c116fdeee7780ee9ee269277 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Thu, 30 Jul 2020 16:44:34 +0200
Subject: [PATCH 81/93] fix initial fetch

---
 rmaint.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 1f2eb2d..d355af8 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -151,8 +151,7 @@ def buildRevisionList(self, pages = None):
                 print(len(pages), 'pages loaded')
 
         fetched_pages = set()
-        # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time)
-        # Should use a set/hashmap/whatever python calls it
+
         for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'):
             page_name = wrev['page_name']
 
@@ -175,8 +174,7 @@ def buildRevisionList(self, pages = None):
                     print("Skipping", page)
                 continue
 
-            if self.debug:
-                print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages)))
+            fetched += 1
             page_id = self.wd.get_page_id(page)
 
             if self.debug:
@@ -186,9 +184,8 @@ def buildRevisionList(self, pages = None):
                 print('Page gone?', page)
                 continue
 
-            revs = self.wd.get_revisions(page_id=page_id, limit=max_depth)
-            for rev in tqdm(revs, desc='Adding revisions from page ' + page_id):
-                fetched += 1
+            revs = self.wd.get_revisions(page_id=page_id, limit=self.max_depth)
+            for rev in revs:
                 if rev['id'] in self.fetched_revids:
                     continue
 

From 9b526bf0414079d32e86f7261127a5ab05d42c88 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Fri, 31 Jul 2020 11:36:27 +0200
Subject: [PATCH 82/93] don't always cleanup

---
 crawl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crawl.py b/crawl.py
index 367dbf0..fe2a9a0 100644
--- a/crawl.py
+++ b/crawl.py
@@ -28,6 +28,7 @@
 parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
 parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True)
 parser.add_argument('--skip', type=str, help='Skip the specified revision')
+parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo')
 # Common settings
 parser.add_argument('--debug', action='store_true', help='Print debug info')
 parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
@@ -111,5 +112,7 @@ def force_dirs(path):
     print("Downloading revisions")
     rm.fetchAll()
 
-    rm.cleanup()
+    if args.cleanup:
+        rm.cleanup()
+
     print("Done.")

From cdcb096a06aeb77d8ef7c5c3116cbd63dc35826a Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Fri, 31 Jul 2020 11:37:16 +0200
Subject: [PATCH 83/93] better throttling when requests fail

---
 wikidot.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index c6f047e..926e69d 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -65,7 +65,7 @@ def maybe_download_file(self, url, file_path):
                 print('request timed out!')
 
                 retries += 1
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries) # up to ~2 minutes
                 continue
 
             if req.status_code == 404:
@@ -79,7 +79,7 @@ def maybe_download_file(self, url, file_path):
                     print(' - ', req)
 
                 retries += 1
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries)
                 continue
 
             try:
@@ -146,7 +146,7 @@ def queryex(self, params, urlAppend = None):
             except requests.exceptions.ReadTimeout:
                 print('request timed out!')
                 retries += 1
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries)
                 continue
 
             if self.debug:
@@ -166,7 +166,7 @@ def queryex(self, params, urlAppend = None):
 
                 # Extra nice, sleep longer (expoential increase), hope for the
                 # server to recover
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries)
 
                 continue
 
@@ -184,7 +184,7 @@ def queryex(self, params, urlAppend = None):
                 if retries < self.max_retries:
                     retries += 1
                     #self._wait_request_slot()
-                    time.sleep(retries * retries * self.delay / 1000)
+                    time.sleep(retries * retries * retries)
                     continue
 
                 raise e
@@ -195,7 +195,7 @@ def queryex(self, params, urlAppend = None):
                 print(" ! error in response", json)
 
                 retries += 1
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries)
                 continue
 
         print(' ! Failed too many times', url, params, cookies)
@@ -299,13 +299,13 @@ def get_page_id(self, page_unix_name):
             except requests.exceptions.ReadTimeout:
                 print('request timed out!')
                 retries += 1
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries)
                 continue
 
             if req.status_code >= 500:
                 print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
                 retries += 1
-                time.sleep(retries * retries * self.delay / 1000)
+                time.sleep(retries * retries * retries)
                 continue
 
             req.raise_for_status()

From d3eeb75a328d6fa1031144996a9beb4a8762bc75 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 2 Aug 2020 11:51:29 +0200
Subject: [PATCH 84/93] fix starting from scratch

---
 rmaint.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rmaint.py b/rmaint.py
index d355af8..ee0bfb3 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -233,6 +233,8 @@ def saveState(self):
         fp.close()
     
     def loadState(self):
+        if not os.path.isfile(self.path+'/.wstate'):
+            return
         fp = open(self.path+'/.wstate', 'rb')
         self.rev_no = pickle.load(fp)
         fp.close()

From f6cfe018e468f6a374d709024d94ede11491df83 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 2 Aug 2020 11:52:05 +0200
Subject: [PATCH 85/93] fix path to revid file

---
 rmaint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index ee0bfb3..f48458b 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -264,8 +264,8 @@ def openRepo(self):
 
             if self.storeRevIds:
                 # Add revision id file to the new repo
-                fname = self.path + '/.revid'
-                codecs.open(self.path + fname, "w", "UTF-8").close()
+                fname = '.revid'
+                codecs.open(self.path + '/' + fname, "w", "UTF-8").close()
                 self.repo.index.add([fname])
                 self.index.commit("Initial creation of repo")
         self.index = self.repo.index

From 2d96bf813c8865ae8caa53a86b517fd295667741 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 2 Aug 2020 11:54:55 +0200
Subject: [PATCH 86/93] ignore minor error

---
 rmaint.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/rmaint.py b/rmaint.py
index f48458b..2edff0c 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -492,7 +492,13 @@ def updateTags(self, comment, unixname):
     # The rest of the file is preserved.
     #
     def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-        with codecs.open(self.path+'/'+child_unixname+'.txt', "r", "UTF-8") as f:
+        child_path = self.path+'/'+child_unixname+'.txt'
+
+        ## TODO: find out when this happens
+        if not os.path.isfile(child_path):
+            print('Failed to find child file!', child_path)
+            return
+        with codecs.open(child_path, "r", "UTF-8") as f:
             content = f.readlines()
         # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
         idx = content.index('parent:'+parent_oldunixname+'\n')

From b8cd79f121cea456e3019c9a1021bfb4cb9ef3c5 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 2 Aug 2020 11:55:15 +0200
Subject: [PATCH 87/93] retry on any requests exception, not just ReadTimeout

---
 wikidot.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index 926e69d..ea36821 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -61,7 +61,7 @@ def maybe_download_file(self, url, file_path):
 
             try:
                 req = requests.get(url, stream=True, timeout=30)
-            except requests.exceptions.ReadTimeout:
+            except requests.exceptions.RequestException:
                 print('request timed out!')
 
                 retries += 1
@@ -143,7 +143,7 @@ def queryex(self, params, urlAppend = None):
             start = timer()
             try:
                 req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
-            except requests.exceptions.ReadTimeout:
+            except requests.exceptions.RequestException:
                 print('request timed out!')
                 retries += 1
                 time.sleep(retries * retries * retries)
@@ -296,7 +296,7 @@ def get_page_id(self, page_unix_name):
         while retries < self.max_retries:
             try:
                 req = requests.request('GET', url, timeout=30)
-            except requests.exceptions.ReadTimeout:
+            except requests.exceptions.RequestException:
                 print('request timed out!')
                 retries += 1
                 time.sleep(retries * retries * retries)

From 6be1b90c958e48a18f2a002e30bf35cca09a11c3 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 2 Aug 2020 11:59:51 +0200
Subject: [PATCH 88/93] move code around

---
 rmaint.py | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index 2edff0c..a7191b9 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -445,6 +445,33 @@ def updateChildren(self, oldunixname, newunixname):
             if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname:
                 self.updateParentField(child, self.last_parents[child], newunixname)
 
+    #
+    # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
+    # The rest of the file is preserved.
+    #
+    def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
+        child_path = self.path+'/'+child_unixname+'.txt'
+
+        ## TODO: find out when this happens
+        # The child name is gotten from the commit message, so not very reliable
+        if not os.path.isfile(child_path):
+            print('Failed to find child file!', child_path)
+            return
+        with codecs.open(child_path, "r", "UTF-8") as f:
+            content = f.readlines()
+        # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
+        idx = content.index('parent:'+parent_oldunixname+'\n')
+        if idx < 0:
+            raise Exception("Cannot update child page "+child_unixname+": "
+                +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
+        content[idx] = 'parent:'+parent_newunixname+'\n'
+        with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f:
+            f.writelines(content)
+
+    #
+    # Updates the tags field in the file
+    # Not used (yet)
+    #
     def updateTags(self, comment, unixname):
         file_name = self.path+'/'+unixname+'.txt'
         removed = []
@@ -487,28 +514,6 @@ def updateTags(self, comment, unixname):
         with codecs.open(file_name, "w", "UTF-8") as f:
             f.writelines(content)
 
-    #
-    # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
-    # The rest of the file is preserved.
-    #
-    def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-        child_path = self.path+'/'+child_unixname+'.txt'
-
-        ## TODO: find out when this happens
-        if not os.path.isfile(child_path):
-            print('Failed to find child file!', child_path)
-            return
-        with codecs.open(child_path, "r", "UTF-8") as f:
-            content = f.readlines()
-        # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
-        idx = content.index('parent:'+parent_oldunixname+'\n')
-        if idx < 0:
-            raise Exception("Cannot update child page "+child_unixname+": "
-                +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
-        content[idx] = 'parent:'+parent_newunixname+'\n'
-        with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f:
-            f.writelines(content)
-
 
     #
     # Finalizes the construction process and deletes any temporary files.

From 77490e2fedd0e410b108925220be3eb0949e24af Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 2 Aug 2020 12:00:35 +0200
Subject: [PATCH 89/93] start on forum scraping support

---
 wikidot.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/wikidot.py b/wikidot.py
index ea36821..dd039d7 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -396,6 +396,45 @@ def get_revisions(self, page_id, limit):
             })
         return revs
 
+    # topics in forum: http://www.scp-wiki.net/forum/c-###/sort/start
+    # -> div class 'title'
+    #   -> a href= http://www.scp-wiki.net/forum/t-####/foobar (foobar not important)
+
+    # posts in topic http://www.scp-wiki.net/forum/t-####/
+    # -> div id 'thread-container'
+    #   -> div class 'post-container'
+    #       -> div class = 'post', id = 'post-####'
+    #           -> div class 'title'
+    #           -> div class 'content'
+    #   -> div class 'post-container'
+    #       -> ...
+    #       -> div class 'post-container'
+    #           -> ...
+
+    #def get_forum_post_revisions(self, post_id):
+    #    res = self.query({
+    #      'moduleName': 'forum/sub/ForumPostRevisionsModule',
+    #      'postId': post_id,
+    #    })
+    #    revisions = []
+    #    soup = BeautifulSoup(res, 'html.parser')
+    #    for row in soup.find_all("tr"):
+    #        columns = row.find_all("td")
+
+    #        if len(columns) != 3:
+    #            raise Exception('Invalid row in post history for ' + str(post_id))
+
+    #        user = columns[0].find('a').getText()
+    #        time = columns[1].find('span').getText()
+    #        rev_id_js = columns[0].find('a')['href']
+    #        match = re.search(r'showRevision\(event, ([0-9]+)\)', rev_id_js)
+    #        rev_id = match.group(1)
+
+    #        revisions.append({
+    #            'id': rev_id,
+    #            'user': user,
+    #            'time': time,
+    #            })
 
     # Retrieves revision source for a revision.
     # There's no raw version because there's nothing else in raw.

From b197beb572dcd3c17fe3b8ebd79fbf366dc067d2 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 23 Aug 2020 00:25:45 +0200
Subject: [PATCH 90/93] support for skipping entire pages (for pages that fail
 for some reason)

---
 crawl.py  | 3 +++
 rmaint.py | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/crawl.py b/crawl.py
index fe2a9a0..5201eeb 100644
--- a/crawl.py
+++ b/crawl.py
@@ -28,6 +28,7 @@
 parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
 parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True)
 parser.add_argument('--skip', type=str, help='Skip the specified revision')
+parser.add_argument('--skip-pages', type=str, help='Skip the specified pages')
 parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo')
 # Common settings
 parser.add_argument('--debug', action='store_true', help='Print debug info')
@@ -106,6 +107,8 @@ def force_dirs(path):
     rm.buildRevisionList([args.page] if args.page else None)
     rm.openRepo()
 
+    if args.skip_pages:
+        rm.pages_to_skip = args.skip_pages.split(",")
     if args.skip:
         rm.revs_to_skip = [args.skip]
 
diff --git a/rmaint.py b/rmaint.py
index a7191b9..b9a2287 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -51,6 +51,7 @@ def __init__(self, wikidot, path):
         self.fetched_revids = set()
 
         self.revs_to_skip = []
+        self.pages_to_skip = []
 
 
     #
@@ -288,6 +289,11 @@ def commitNext(self, rev):
             print("Skipping", rev)
             return True
 
+        unixname = rev['page_name']
+        if unixname in self.pages_to_skip:
+            print("Skipping", rev)
+            return True
+
         source = self.wd.get_revision_source(rev['rev_id'])
         # Page title and unix_name changes are only available through another request:
         details = self.wd.get_revision_version(rev['rev_id'])
@@ -300,7 +306,6 @@ def commitNext(self, rev):
             outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
             outp.close()
 
-        unixname = rev['page_name']
         rev_unixname = details['unixname'] # may be different in revision than atm
 
         # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.

From 603d1647a830dd5644c3bb11e1523e2ce9e1d740 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 23 Aug 2020 00:26:05 +0200
Subject: [PATCH 91/93] fix support for skipping multiple revisions

---
 crawl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl.py b/crawl.py
index 5201eeb..68fcf41 100644
--- a/crawl.py
+++ b/crawl.py
@@ -110,7 +110,7 @@ def force_dirs(path):
     if args.skip_pages:
         rm.pages_to_skip = args.skip_pages.split(",")
     if args.skip:
-        rm.revs_to_skip = [args.skip]
+        rm.revs_to_skip = args.skip.split(",")
 
     print("Downloading revisions")
     rm.fetchAll()

From 3524f541b73a4d15d82992516b54fcf3cd997e30 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 23 Aug 2020 00:26:32 +0200
Subject: [PATCH 92/93] move updateParentField back below updateTags

---
 rmaint.py | 47 ++++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/rmaint.py b/rmaint.py
index b9a2287..1ab383f 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -450,33 +450,6 @@ def updateChildren(self, oldunixname, newunixname):
             if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname:
                 self.updateParentField(child, self.last_parents[child], newunixname)
 
-    #
-    # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
-    # The rest of the file is preserved.
-    #
-    def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
-        child_path = self.path+'/'+child_unixname+'.txt'
-
-        ## TODO: find out when this happens
-        # The child name is gotten from the commit message, so not very reliable
-        if not os.path.isfile(child_path):
-            print('Failed to find child file!', child_path)
-            return
-        with codecs.open(child_path, "r", "UTF-8") as f:
-            content = f.readlines()
-        # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
-        idx = content.index('parent:'+parent_oldunixname+'\n')
-        if idx < 0:
-            raise Exception("Cannot update child page "+child_unixname+": "
-                +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
-        content[idx] = 'parent:'+parent_newunixname+'\n'
-        with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f:
-            f.writelines(content)
-
-    #
-    # Updates the tags field in the file
-    # Not used (yet)
-    #
     def updateTags(self, comment, unixname):
         file_name = self.path+'/'+unixname+'.txt'
         removed = []
@@ -519,6 +492,26 @@ def updateTags(self, comment, unixname):
         with codecs.open(file_name, "w", "UTF-8") as f:
             f.writelines(content)
 
+    #
+    # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
+    # The rest of the file is preserved.
+    #
+    def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
+        child_path = self.path+'/'+child_unixname+'.txt'
+        if not os.path.isfile(child_path):
+            print('Failed to find child file!', child_path)
+            return
+        with codecs.open(child_path, "r", "UTF-8") as f:
+            content = f.readlines()
+        # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
+        idx = content.index('parent:'+parent_oldunixname+'\n')
+        if idx < 0:
+            raise Exception("Cannot update child page "+child_unixname+": "
+                +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
+        content[idx] = 'parent:'+parent_newunixname+'\n'
+        with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f:
+            f.writelines(content)
+
 
     #
     # Finalizes the construction process and deletes any temporary files.

From f23b0ffd56b8561eb00b0e621d95ee2989772389 Mon Sep 17 00:00:00 2001
From: "Martin T. H. Sandsmark" <martin.sandsmark@kde.org>
Date: Sun, 23 Aug 2020 00:27:17 +0200
Subject: [PATCH 93/93] fix robustness when downloading images

---
 wikidot.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/wikidot.py b/wikidot.py
index dd039d7..be378ea 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -42,8 +42,15 @@ def maybe_download_file(self, url, file_path):
 
         #self._wait_request_slot()
 
-        dirpath = os.path.dirname(file_path)
-        os.makedirs(dirpath, exist_ok=True)
+        try:
+            dirpath = os.path.dirname(file_path)
+            os.makedirs(dirpath, exist_ok=True)
+        except OSError as e:
+            if e.errno == 36:
+                print("Path too long", e)
+                return False
+            else:
+                raise  # re-raise previously caught exception
 
         if self.debug:
             print(" < downloading", url, "to" ,file_path, "dirpath", dirpath)
@@ -62,15 +69,17 @@ def maybe_download_file(self, url, file_path):
             try:
                 req = requests.get(url, stream=True, timeout=30)
             except requests.exceptions.RequestException:
-                print('request timed out!')
+                print('request exception')
 
                 retries += 1
                 time.sleep(retries * retries * retries) # up to ~2 minutes
                 continue
+            except urllib3.exceptions.ReadTimeoutError:
+                print('read timeout')
 
-            if req.status_code == 404:
-                self.failed_images.add(url)
-                return False
+                retries += 1
+                time.sleep(retries * retries * retries) # up to ~2 minutes
+                continue
 
             if req.status_code >= 500:
                 print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
@@ -82,6 +91,10 @@ def maybe_download_file(self, url, file_path):
                 time.sleep(retries * retries * retries)
                 continue
 
+            if req.status_code >= 400:
+                self.failed_images.add(url)
+                return False
+
             try:
                 # In case of 404 errors or other stuff that indicates
                 # some bug in how we handle or request things
@@ -102,11 +115,18 @@ def maybe_download_file(self, url, file_path):
                     print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2))
 
                 return True
+            except OSError as e:
+                if e.errno == 36:
+                    print("Filename to long", e)
+                    return False
+                else:
+                    raise  # re-raise previously caught exception
             except Exception as e:
                 print(' ! Failed to download', e, req, url)
                 raise e
 
-        raise Exception('Failed too many times for', url)
+        print('Failed too many times for', url)
+        return False
 
     # To honor usage rules, we wait for self.delay between requests.
     # Low-level query functions call this before every request to Wikidot./