diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a295864
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index 471301b..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,2 +0,0 @@
-syntax:glob
-*.pyc
diff --git a/crawl.py b/crawl.py
index 22da38a..68fcf41 100644
--- a/crawl.py
+++ b/crawl.py
@@ -1,118 +1,121 @@
-import argparse
-import sys
-import locale
-import codecs
-import os
-from wikidot import Wikidot
-from rmaint import RepoMaintainer
-
-# TODO: Files.
-# TODO: Forum and comment pages.
-# TODO: Ability to download new transactions since last dump.
-# We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
-
-rawStdout = sys.stdout
-rawStderr = sys.stderr
-sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace')
-sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace')
-
-parser = argparse.ArgumentParser(description='Queries Wikidot')
-parser.add_argument('site', help='URL of Wikidot site')
-# Actions
-parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
-parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
-parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
-parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
-parser.add_argument('--dump', type=str, help='Download page revisions to this directory')
-# Debug actions
-parser.add_argument('--list-pages-raw', action='store_true')
-parser.add_argument('--log-raw', action='store_true')
-# Action settings
-parser.add_argument('--page', type=str, help='Query only this page')
-parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
-parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository')
-# Common settings
-parser.add_argument('--debug', action='store_true', help='Print debug info')
-parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
-args = parser.parse_args()
-
-
-wd = Wikidot(args.site)
-wd.debug = args.debug
-wd.delay = args.delay
-
-
-def force_dirs(path):
- try:
- os.makedirs(path)
- except OSError as exception:
- if exception.errno != os.errno.EEXIST:
- raise
-
-if args.list_pages_raw:
- print wd.list_pages_raw(args.depth)
-
-elif args.list_pages:
- for page in wd.list_pages(args.depth):
- print page
-
-elif args.source:
- if not args.page:
- raise "Please specify --page for --source."
-
- page_id = wd.get_page_id(args.page)
- if not page_id:
- raise "Page not found: "+args.page
-
- revs = wd.get_revisions(page_id, 1) # last revision
- print wd.get_revision_source(revs[0]['id'])
-
-elif args.content:
- if not args.page:
- raise "Please specify --page for --source."
-
- page_id = wd.get_page_id(args.page)
- if not page_id:
- raise "Page not found: "+args.page
-
- revs = wd.get_revisions(page_id, 1) # last revision
- print wd.get_revision_version(revs[0]['id'])
-
-elif args.log_raw:
- if not args.page:
- raise "Please specify --page for --log."
-
- page_id = wd.get_page_id(args.page)
- if not page_id:
- raise "Page not found: "+args.page
-
- print wd.get_revisions_raw(page_id, args.depth)
-
-
-elif args.log:
- if not args.page:
- raise "Please specify --page for --log."
-
- page_id = wd.get_page_id(args.page)
- if not page_id:
- raise "Page not found: "+args.page
- for rev in wd.get_revisions(page_id, args.depth):
- print unicode(rev)
-
-
-elif args.dump:
- print "Downloading pages to "+args.dump
- force_dirs(args.dump)
-
- rm = RepoMaintainer(wd, args.dump)
- rm.debug = args.debug
- rm.storeRevIds = args.revids
- rm.buildRevisionList([args.page] if args.page else None, args.depth)
- rm.openRepo()
-
- print "Downloading revisions..."
- while rm.commitNext():
- pass
-
- rm.cleanup()
- print "Done."
+import argparse
+import sys
+import locale
+import codecs
+import os
+from wikidot import Wikidot
+from rmaint import RepoMaintainer
+
+# TODO: Files.
+# TODO: Forum and comment pages.
+# TODO: Ability to download new transactions since last dump.
+# We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump)
+
+parser = argparse.ArgumentParser(description='Queries Wikidot')
+parser.add_argument('site', help='URL of Wikidot site')
+# Actions
+parser.add_argument('--list-pages', action='store_true', help='List all pages on this site')
+parser.add_argument('--max-page-count', type=int, default='10000', help='Only list/fetch up to this amount of pages')
+parser.add_argument('--source', action='store_true', help='Print page source (requires --page)')
+parser.add_argument('--content', action='store_true', help='Print page content (requires --page)')
+parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)')
+parser.add_argument('--dump', type=str, help='Download page revisions to this directory')
+# Debug actions
+parser.add_argument('--list-pages-raw', action='store_true')
+parser.add_argument('--log-raw', action='store_true')
+# Action settings
+parser.add_argument('--page', type=str, help='Query only this page')
+parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions')
+parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True)
+parser.add_argument('--skip', type=str, help='Skip the specified revision')
+parser.add_argument('--skip-pages', type=str, help='Skip the specified pages')
+parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo')
+# Common settings
+parser.add_argument('--debug', action='store_true', help='Print debug info')
+parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot')
+args = parser.parse_args()
+
+
+wd = Wikidot(args.site)
+wd.debug = args.debug
+wd.delay = args.delay
+
+
+def force_dirs(path):
+ os.makedirs(path, exist_ok=True)
+
+if args.list_pages_raw:
+    print(wd.list_pages_raw(limit=args.max_page_count))  # dest of --max-page-count is max_page_count
+
+elif args.list_pages:
+    for page in wd.list_pages(limit=args.max_page_count):
+        print(page)
+
+elif args.source:
+ if not args.page:
+ raise Exception("Please specify --page for --source.")
+
+ page_id = wd.get_page_id(page_unix_name=args.page)
+ if not page_id:
+ raise Exception("Page not found: "+args.page)
+
+ revs = wd.get_revisions(page_id, 1) # last revision
+ print((wd.get_revision_source(revs[0]['id'])))
+
+elif args.content:
+    if not args.page:
+        raise Exception("Please specify --page for --content.")
+
+    page_id = wd.get_page_id(page_unix_name=args.page)
+    if not page_id:
+        raise Exception("Page not found: "+args.page)
+
+    revs = wd.get_revisions(page_id, 1) # last revision only
+    print(wd.get_revision_version(revs[0]['id']))
+
+elif args.log_raw:
+ if not args.page:
+ raise Exception("Please specify --page for --log.")
+
+ page_id = wd.get_page_id(page_unix_name=args.page)
+ if not page_id:
+ raise Exception("Page not found: "+args.page)
+
+ print((wd.get_revisions_raw(page_id, args.depth)))
+
+
+elif args.log:
+ if not args.page:
+ raise Exception("Please specify --page for --log.")
+
+ page_id = wd.get_page_id(page_unix_name=args.page)
+ if not page_id:
+ raise Exception("Page not found: "+args.page)
+ for rev in wd.get_revisions(page_id, args.depth):
+ print((str(rev)))
+
+
+elif args.dump:
+ print(("Downloading pages to "+args.dump))
+ force_dirs(args.dump)
+
+ rm = RepoMaintainer(wd, args.dump)
+ rm.debug = args.debug
+ rm.storeRevIds = args.revids
+ rm.max_depth = args.depth
+ rm.max_page_count = args.max_page_count
+ rm.buildRevisionList([args.page] if args.page else None)
+ rm.openRepo()
+
+ if args.skip_pages:
+ rm.pages_to_skip = args.skip_pages.split(",")
+ if args.skip:
+ rm.revs_to_skip = args.skip.split(",")
+
+ print("Downloading revisions")
+ rm.fetchAll()
+
+ if args.cleanup:
+ rm.cleanup()
+
+ print("Done.")
diff --git a/hgpatch.py b/hgpatch.py
deleted file mode 100644
index 6d2ff12..0000000
--- a/hgpatch.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from mercurial import scmutil, osutil
-from types import MethodType
-from mercurial import encoding
-import codecs
-
-# Patches commit-message unicode handling on Python 2.x
-
-# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert
-# all input from "input encoding" (set in mercurial/encoding.py)
-
-# Problem 1:
-# If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8,
-# it'll still try to decode it to ASCII.
-# Solution:
-# Patch this decoding function to pass unicode unchanged.
-
-old_fromlocal = None
-
-def better_fromlocal(s):
- if isinstance(s, unicode):
- return s.encode('utf-8')
- global old_fromlocal
- return old_fromlocal(s)
-
-old_fromlocal = encoding.fromlocal
-encoding.fromlocal = better_fromlocal
-
-
-# Problem 2:
-# Separate from actual log, Mercurial stores commit message in commit-message.txt.
-# Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails.
-# Solution:
-# Patch virtual-fs open() function to use codecs.open wrapper in this particular case.
-
-old_vfs_call = None
-
-def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False):
- fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose)
- if path.endswith('last-message.txt'):
- # Create a wrapper like codecs.open does:
- info = codecs.lookup("utf-8")
- fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict')
- fp.encoding = 'utf-8'
- return fp
-
-old_vfs_call = scmutil.vfs.__call__
-scmutil.vfs.__call__ = better_vfs_call
-
-
-
diff --git a/readme.md b/readme.md
index f66a0cc..641a570 100644
--- a/readme.md
+++ b/readme.md
@@ -1,30 +1,50 @@
-This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you:
-
-* List all pages on a site
-* See all revisions of a page
-* Query page source
-
-Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments!
-
-##### Examples:
-
- crawl.py http://example.wikidot.com --dump ExampleRepo
- crawl.py http://example.wikidot.com --log --page example-page
-
-It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers.
-
-Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed.
-
-##### Useful links:
-
-Wikidot code (very old) which simplifies things a bit:
-
-* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
-The descriptions for on-site modules are heavily correlated with AJAX ones:
-
-* http://www.wikidot.com/doc-modules:listpages-module
-
-Someone else did Wikidot AJAX:
-
-* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
+*This is a fork to make a permanent backup of the SCP wiki.*
+
+This is a Python command line client for relatively popular wiki hosting
+http://www.wikidot.com which lets you:
+
+* List all pages on a site
+* See all revisions of a page
+* Query page source
+
+Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, author and comments!
+
+##### Dependencies
+
+At least:
+
+* Python 3
+* python-beautifulsoup4
+* python-gitpython
+* python-requests
+* python-tqdm
+
+##### Examples:
+
+ crawl.py http://example.wikidot.com --dump ExampleRepo
+ crawl.py http://example.wikidot.com --log --page example-page
+
+It uses internal Wikidot AJAX requests to do its job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers.
+
+Downloading large sites might take a while. If anything breaks, just restart the same command; it will continue from where it crashed.
+
+##### Useful links:
+
+Wikidot code (very old) which simplifies things a bit:
+
+* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+The descriptions for on-site modules are heavily correlated with AJAX ones:
+
+* http://www.wikidot.com/doc-modules:listpages-module
+
+Someone else did Wikidot AJAX:
+
+* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py
+
+
+#### TODO
+
+ - Handle deleted images. Probably need to check the diff and check all pages for references if removed from one page.
+ - Handle tags (both added and removed).
+
diff --git a/rmaint.py b/rmaint.py
index 029319f..1ab383f 100644
--- a/rmaint.py
+++ b/rmaint.py
@@ -1,263 +1,535 @@
-import os
-import codecs
-from mercurial import commands, ui, hg
-import hgpatch
-import cPickle as pickle
-import wikidot
-
-# Repository builder and maintainer
-# Contains logic for actual loading and maintaining the repository over the course of its construction.
-
-# Usage:
-# rm = RepoMaintainer(wikidot, path)
-# rm.buildRevisionList(pages, depth)
-# rm.openRepo()
-# while rm.commitNext():
-# pass
-# rm.cleanup()
-
-# Talkative.
-
-class RepoMaintainer:
- def __init__(self, wikidot, path):
- # Settings
- self.wd = wikidot # Wikidot instance
- self.path = path # Path to repository
- self.debug = False # = True to enable more printing
- self.storeRevIds = True # = True to store .revid with each commit
-
- # Internal state
- self.wrevs = None # Compiled wikidot revision list (history)
-
- self.rev_no = 0 # Next revision to process
- self.last_names = {} # Tracks page renames: name atm -> last name in repo
- self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
-
- self.ui = None # Mercurial UI object
- self.repo = None # Mercurial repo object
-
-
- #
- # Saves and loads revision list from file
- #
- def saveWRevs(self):
- fp = open(self.path+'\\.wrevs', 'wb')
- pickle.dump(self.wrevs, fp)
- fp.close()
-
- def loadWRevs(self):
- fp = open(self.path+'\\.wrevs', 'rb')
- self.wrevs = pickle.load(fp)
- fp.close()
-
- #
- # Compiles a combined revision list for a given set of pages, or all pages on the site.
- # pages: compile history for these pages
- # depth: download at most this number of revisions.
- #
- # If there exists a cached revision list at the repository destination,
- # it is loaded and no requests are made.
- #
- def buildRevisionList(self, pages = None, depth = 10000):
- if os.path.isfile(self.path+'\\.wrevs'):
- print "Loading cached revision list..."
- self.loadWRevs()
- else:
- print "Building revision list..."
- if not pages:
- pages = self.wd.list_pages(10000)
- self.wrevs = []
- for page in pages:
- print "Querying page: "+page
- page_id = self.wd.get_page_id(page)
- print "ID: "+str(page_id)
- revs = self.wd.get_revisions(page_id, depth)
- print "Revisions: "+str(len(revs))
- for rev in revs:
- self.wrevs.append({
- 'page_id' : page_id,
- 'page_name' : page, # name atm, not at revision time
- 'rev_id' : rev['id'],
- 'date' : rev['date'],
- 'user' : rev['user'],
- 'comment' : rev['comment'],
- })
- self.saveWRevs() # Save a cached copy
- print ""
-
-
- print "Total revisions: "+str(len(self.wrevs))
-
- print "Sorting revisions..."
- self.wrevs.sort(key=lambda rev: rev['date'])
- print ""
-
- if self.debug:
- print "Revision list: "
- for rev in self.wrevs:
- print str(rev)+"\n"
- print ""
-
-
- #
- # Saves and loads operational state from file
- #
- def saveState(self):
- fp = open(self.path+'\\.wstate', 'wb')
- pickle.dump(self.rev_no, fp)
- pickle.dump(self.last_names, fp)
- pickle.dump(self.last_parents, fp)
- fp.close()
-
- def loadState(self):
- fp = open(self.path+'\\.wstate', 'rb')
- self.rev_no = pickle.load(fp)
- self.last_names = pickle.load(fp)
- try:
- self.last_parents = pickle.load(fp)
- except EOFError:
- pass
- fp.close()
-
-
- #
- # Initializes the construction process, after the revision list has been compiled.
- # Either creates a new repo, or loads the existing one at the target path
- # and restores its construction state.
- #
- def openRepo(self):
- # Create a new repository or continue from aborted dump
- self.ui=ui.ui()
- self.last_names = {} # Tracks page renames: name atm -> last name in repo
- self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
-
- if os.path.isfile(self.path+'\\.wstate'):
- print "Continuing from aborted dump state..."
- self.loadState()
- self.repo = hg.repository(self.ui, self.path)
-
- else: # create a new repository (will fail if one exists)
- print "Initializing repository..."
- commands.init(self.ui, self.path)
- self.repo = hg.repository(self.ui, self.path)
- self.rev_no = 0
-
- if self.storeRevIds:
- # Add revision id file to the new repo
- fname = self.path+'\\.revid'
- codecs.open(fname, "w", "UTF-8").close()
- commands.add(self.ui, self.repo, str(fname))
-
-
- #
- # Takes an unprocessed revision from a revision log, fetches its data and commits it.
- # Returns false if no unprocessed revisions remain.
- #
- def commitNext(self):
- if self.rev_no >= len(self.wrevs):
- return False
-
- rev = self.wrevs[self.rev_no]
- source = self.wd.get_revision_source(rev['rev_id'])
- # Page title and unix_name changes are only available through another request:
- details = self.wd.get_revision_version(rev['rev_id'])
-
- # Store revision_id for last commit
- # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial
- if self.storeRevIds:
- fname = self.path+'\\.revid'
- outp = codecs.open(fname, "w", "UTF-8")
- outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
- outp.close()
-
- unixname = rev['page_name']
- rev_unixname = details['unixname'] # may be different in revision than atm
-
- # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
- # The only way to know they were changed is revision comments, though evil people may trick us.
- if rev['comment'].startswith('Parent page set to: "'):
- # This is a parenting revision, remember the new parent
- parent_unixname = rev['comment'][21:-2]
- self.last_parents[unixname] = parent_unixname
- else:
- # Else use last parent_unixname we've recorded
- parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None
- # There are also problems when parent page gets renamed -- see updateChildren
-
- # If the page is tracked and its name just changed, tell HG
- rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname)
- if rename:
- self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
- commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt')
-
- # Ouput contents
- fname = self.path+'\\'+rev_unixname+'.txt'
- outp = codecs.open(fname, "w", "UTF-8")
- if details['title']:
- outp.write('title:'+details['title']+'\n')
- if parent_unixname:
- outp.write('parent:'+parent_unixname+'\n')
- outp.write(source)
- outp.close()
-
- # Add new page
- if not unixname in self.last_names: # never before seen
- commands.add(self.ui, self.repo, str(fname))
-
- self.last_names[unixname] = rev_unixname
-
- # Commit
- if rev['comment'] <> '':
- commit_msg = rev_unixname + ': ' + rev['comment']
- else:
- commit_msg = rev_unixname
- if rev['date']:
- commit_date = str(rev['date']) + ' 0'
- else:
- commit_date = None
- print "Commiting: "+str(self.rev_no)+'. '+commit_msg
-
- commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date)
- self.rev_no += 1
-
- self.saveState() # Update operation state
- return True
-
-
- #
- # Updates all children of the page to reflect parent's unixname change.
- #
- # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body.
- # A parent may then be renamed.
- # Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
- #
- # Therefore, on every rename we must update all linked children in the same revision.
- #
- def updateChildren(self, oldunixname, newunixname):
- for child in self.last_parents.keys():
- if self.last_parents[child] == oldunixname:
- self.updateParentField(child, self.last_parents[child], newunixname)
-
- #
- # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
- # The rest of the file is preserved.
- #
- def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
- with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f:
- content = f.readlines()
- # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
- idx = content.index('parent:'+parent_oldunixname+'\n')
- if idx < 0:
- raise Exception("Cannot update child page "+child_unixname+": "
- +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
- content[idx] = 'parent:'+parent_newunixname+'\n'
- with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f:
- f.writelines(content)
-
-
- #
- # Finalizes the construction process and deletes any temporary files.
- #
- def cleanup(self):
- os.remove(self.path+'\\.wstate')
- os.remove(self.path+'\\.wrevs')
\ No newline at end of file
+import wikidot
+
+# Basic python stuff
+import os
+import codecs
+import pickle as pickle
+import json
+
+# git stuff
+from git import Repo, Actor
+import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps
+import re # For sanitizing usernames to fake email addresses
+
+from tqdm import tqdm # for progress bar
+
+# Repository builder and maintainer
+# Contains logic for actual loading and maintaining the repository over the course of its construction.
+
+# Usage:
+# rm = RepoMaintainer(wikidot, path)
+# rm.buildRevisionList(pages)
+# rm.openRepo()
+# rm.fetchAll()
+# if cleanup_requested:
+# rm.cleanup()
+
+# Talkative.
+
+class RepoMaintainer:
+ def __init__(self, wikidot, path):
+ # Settings
+ self.wd = wikidot # Wikidot instance
+ self.path = path # Path to repository
+ self.debug = False # = True to enable more printing
+ self.storeRevIds = True # = True to store .revid with each commit
+
+ # Internal state
+ self.wrevs = None # Compiled wikidot revision list (history)
+
+ self.rev_no = 0 # Next revision to process
+ self.last_names = {} # Tracks page renames: name atm -> last name in repo
+ self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
+
+ self.repo = None # Git repo object
+ self.index = None # Git current index object
+ self.max_depth = 10000 # download at most this number of revisions
+ self.max_page_count = 10000 # download at most this number of pages
+
+ self.pbar = None
+ self.first_fetched = 0 # For progress bar
+ self.fetched_revids = set()
+
+ self.revs_to_skip = []
+ self.pages_to_skip = []
+
+
+ #
+ # Saves and loads revision list from file
+ #
+ def saveWRevs(self):
+     """Pickle the compiled revision list (self.wrevs) to <path>/.wrevs."""
+     with open(self.path+'/.wrevs', 'wb') as fp:
+         pickle.dump(self.wrevs, fp)
+
+ def loadWRevs(self):
+     """Restore self.wrevs from <path>/.wrevs; unpickling is only safe for files this tool wrote."""
+     with open(self.path+'/.wrevs', 'rb') as fp:
+         self.wrevs = pickle.load(fp)
+
+ def savePages(self, pages):
+     """Pickle the given page list to <path>/.pages so a later run can reuse it."""
+     with open(self.path+'/.pages', 'wb') as fp:
+         pickle.dump(pages, fp)
+
+ def appendFetchedRevid(self, revid):
+     """Append one revision id as a new line of <path>/.fetched.txt."""
+     with open(self.path+'/.fetched.txt', 'a') as fp:
+         fp.write(revid + '\n')
+
+ def loadFetchedRevids(self):
+ self.fetched_revids = set([line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')])
+
+ def saveFailedImages(self):
+     """Rewrite <path>/.failed-images.txt with every entry of self.wd.failed_images, one per line."""
+     file_path = self.path + '/.failed-images.txt'
+     with open(file_path, 'w') as fp:
+         for failed in self.wd.failed_images:
+             fp.write(failed + '\n')
+
+ def loadFailedImages(self):
+     """Load self.wd.failed_images from <path>/.failed-images.txt; do nothing if the file is absent."""
+     if os.path.isfile(self.path + '/.failed-images.txt'):
+         with open(self.path + '/.failed-images.txt', 'r') as fp:
+             self.wd.failed_images = set(line.rstrip() for line in fp)
+
+ # Persistent metadata about the repo:
+ # - Tracks page renames: name atm -> last name in repo
+ # - Tracks page parent names: name atm -> last parent in repo
+ def saveMetadata(self):
+     """Write self.last_names and self.last_parents as JSON to <path>/.metadata.json."""
+     metadata = {'names': self.last_names, 'parents': self.last_parents }
+     with open(self.path+'/.metadata.json', 'w') as fp:
+         json.dump(metadata, fp)
+
+ def loadMetadata(self):
+     """Restore rename/parent tracking from <path>/.metadata.json, then reload the fetched revision ids."""
+     with open(self.path+'/.metadata.json', 'r') as fp:
+         metadata = json.load(fp)
+     self.last_names = metadata['names']
+     self.last_parents = metadata['parents']
+
+     self.loadFetchedRevids()
+ #
+ # Compiles a combined revision list for a given set of pages, or all pages on the site.
+ # pages: compile history for these pages
+ #
+ # If there exists a cached revision list at the repository destination,
+ # it is loaded and no requests are made.
+ #
+ def buildRevisionList(self, pages = None):
+ if os.path.isfile(self.path+'/.wrevs'):
+ print("Loading cached revision list...")
+ self.loadWRevs()
+ else:
+ self.wrevs = []
+ if self.debug:
+ print('No existing wrevs')
+
+ if os.path.isfile(self.path+'/.fetched.txt'):
+ self.loadFetchedRevids()
+ print(len(self.fetched_revids), 'revisions already fetched')
+ else:
+ self.fetched_revids = set()
+
+ if self.debug:
+ print("Building revision list...")
+
+ if not pages:
+ if os.path.isfile(self.path+'/.pages'):
+ print('Loading fetched pages')
+ fp = open(self.path+'/.pages', 'rb')
+ pages = pickle.load(fp)
+ fp.close()
+
+
+ if not pages or len(pages) < self.max_page_count:
+ if self.debug:
+ print('Need to fetch pages')
+ pages = self.wd.list_pages(self.max_page_count)
+ self.savePages(pages)
+ elif self.debug:
+ print(len(pages), 'pages loaded')
+
+ fetched_pages = set()
+
+ for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'):
+ page_name = wrev['page_name']
+
+ if page_name in fetched_pages:
+ continue
+
+ fetched_pages.add(page_name)
+
+ if self.debug:
+ print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages)))
+
+ fetched = 0
+ for page in tqdm(pages, desc='Updating list of revisions to fetch'):
+ if page in fetched_pages:
+ continue
+
+ # TODO: more generic blacklisting
+ if page == "sandbox":
+ if self.debug:
+ print("Skipping", page)
+ continue
+
+ fetched += 1
+ page_id = self.wd.get_page_id(page)
+
+ if self.debug:
+ print(("ID: "+str(page_id)))
+
+ if page_id is None:
+ print('Page gone?', page)
+ continue
+
+ revs = self.wd.get_revisions(page_id=page_id, limit=self.max_depth)
+ for rev in revs:
+ if rev['id'] in self.fetched_revids:
+ continue
+
+ self.wrevs.append({
+ 'page_id' : page_id,
+ 'page_name' : page, # current name, not at revision time (revisions can rename them)
+ 'rev_id' : rev['id'],
+ 'date' : rev['date'],
+ 'user' : rev['user'],
+ 'comment' : rev['comment'],
+ })
+ self.saveWRevs() # Save a cached copy
+
+ print("Number of revisions already fetched", len(self.fetched_revids), len(self.wrevs))
+
+ if os.path.isfile(self.path+'/.metadata.json'):
+ self.loadMetadata()
+
+ print("")
+
+ print(("Total revisions: "+str(len(self.wrevs))))
+
+ if self.debug:
+ print("Sorting revisions...")
+
+ self.wrevs.sort(key=lambda rev: rev['date'])
+
+ if self.debug:
+ if len(self.wrevs) < 100:
+ print("")
+ print("Revision list: ")
+ for rev in self.wrevs:
+ print((str(rev)+"\n"))
+ print("")
+ else:
+ print("Too many revisions, not printing everything")
+
+
+ #
+ # Saves and loads operational state from file
+ #
+ def saveState(self):
+     """Pickle the index of the next revision to process (self.rev_no) to <path>/.wstate."""
+     with open(self.path+'/.wstate', 'wb') as fp:
+         pickle.dump(self.rev_no, fp)
+
+ def loadState(self):
+     """Restore self.rev_no from <path>/.wstate; leave it untouched when no state file exists."""
+     if not os.path.isfile(self.path+'/.wstate'):
+         return
+     with open(self.path+'/.wstate', 'rb') as fp:
+         self.rev_no = pickle.load(fp)
+
+
+ #
+ # Initializes the construction process, after the revision list has been compiled.
+ # Either creates a new repo, or loads the existing one at the target path
+ # and restores its construction state.
+ #
+ def openRepo(self):
+     """Open the Git repo at self.path to resume a dump, or initialise a fresh one there."""
+     self.last_names = {} # Tracks page renames: name atm -> last name in repo
+     self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo
+     self.loadFailedImages()
+
+     if os.path.isdir(self.path+'/.git'):
+         print("Continuing from aborted dump state...")
+         self.loadState()
+         self.repo = Repo(self.path)
+         assert not self.repo.bare
+
+     else: # create a new repository (will fail if one exists)
+         print("Initializing repository...")
+         self.repo = Repo.init(self.path)
+         self.rev_no = 0
+
+         if self.storeRevIds:
+             # Add the revision id file; self.index is not assigned yet, so go through self.repo.index
+             fname = '.revid'
+             codecs.open(self.path + '/' + fname, "w", "UTF-8").close()
+             self.repo.index.add([fname])
+             self.repo.index.commit("Initial creation of repo")
+     self.index = self.repo.index
+
+ #
+ # Takes an unprocessed revision from a revision log, fetches its data and commits it.
+ # Returns false if no unprocessed revisions remain.
+ #
+ def commitNext(self, rev):
+ if self.rev_no >= len(self.wrevs):
+ return False
+
+ if rev['rev_id'] in self.fetched_revids:
+ self.rev_no += 1
+
+ self.saveState() # Update operation state
+ return True
+
+ if rev['rev_id'] in self.revs_to_skip:
+ print("Skipping", rev)
+ return True
+
+ unixname = rev['page_name']
+ if unixname in self.pages_to_skip:
+ print("Skipping", rev)
+ return True
+
+ source = self.wd.get_revision_source(rev['rev_id'])
+ # Page title and unix_name changes are only available through another request:
+ details = self.wd.get_revision_version(rev['rev_id'])
+
+ # Store revision_id for last commit
+ # Without this, empty commits (e.g. file uploads) will be skipped by Git
+ if self.storeRevIds:
+ fname = self.path+'/.revid'
+ outp = codecs.open(fname, "w", "UTF-8")
+ outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway
+ outp.close()
+
+ rev_unixname = details['unixname'] # may be different in revision than atm
+
+ # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history.
+ # The only way to know they were changed is revision comments, though evil people may trick us.
+ if rev['comment'].startswith('Parent page set to: "'):
+ # This is a parenting revision, remember the new parent
+ parent_unixname = rev['comment'][21:-2]
+ if self.debug:
+ print('Parent changed', parent_unixname)
+ self.last_parents[unixname] = parent_unixname
+ else:
+ # Else use last parent_unixname we've recorded
+ parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None
+
+ ## TODO: test
+ #if rev['comment'].startswith('Removed tags: ') or rev['comment'].startswith('Added tags: '):
+ # self.updateTags(rev['comment'], rev_unixname)
+
+ # There are also problems when parent page gets renamed -- see updateChildren
+
+ # If the page is tracked and its name just changed, tell Git
+ fname = str(rev_unixname) + '.txt'
+ rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname)
+
+ commit_msg = ""
+
+ added_file_paths = []
+
+ if rename:
+ name_rename_from = str(self.last_names[unixname])+'.txt'
+
+ if self.debug:
+ print("Moving renamed", name_rename_from, "to", fname)
+
+ self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there
+
+ # Try to do the best we can, these situations usually stem from vandalism people have cleaned up
+ if os.path.isfile(self.path + '/' + name_rename_from):
+ self.index.move([name_rename_from, fname], force=True)
+ commit_msg += "Renamed from " + str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' '
+ else:
+ print("Source file does not exist, probably deleted or renamed from already?", name_rename_from)
+
+ # Add new page
+ elif not os.path.isfile(self.path + '/' + fname): # never before seen
+ commit_msg += "Created "
+ if self.debug:
+ print("Adding", fname)
+ elif rev['comment'] == '':
+ commit_msg += "Updated "
+
+ self.last_names[unixname] = rev_unixname
+
+ # Output contents
+ outp = codecs.open(self.path + '/' + fname, "w", "UTF-8")
+ if details['title']:
+ outp.write('title:' + details['title']+'\n')
+ if parent_unixname:
+ outp.write('parent:'+parent_unixname+'\n')
+ outp.write(source)
+ outp.close()
+
+ added_file_paths.append(str(fname))
+
+ commit_msg += rev_unixname
+
+ # Commit
+ if rev['comment'] != '':
+ commit_msg += ': ' + rev['comment']
+ else:
+ commit_msg += ' (no message)'
+ if rev['date']:
+ parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT
+ commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time)
+ else:
+ commit_date = None
+
+ got_images = False;
+
+ # Add some spacing in the commit message
+ if len(details['images']) > 0:
+ commit_msg += '\n'
+
+ for image in details['images']:
+ if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']):
+ commit_msg += '\nAdded image: ' + image['src']
+ got_images = True
+ # If we do this gitpython barfs on itself
+ #added_file_paths.append(image['filepath'])
+ else:
+ self.saveFailedImages()
+
+
+ if got_images:
+ added_file_paths.append("images")
+ print("Committing: " + str(self.rev_no) + '. '+commit_msg)
+
+ # Include metadata in the commit (if changed)
+ self.appendFetchedRevid(rev['rev_id'])
+ self.saveMetadata()
+ added_file_paths.append('.metadata.json')
+ self.index.add(added_file_paths)
+
+ username = str(rev['user'])
+ email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename
+ author = Actor(username, email)
+
+ commit = self.index.commit(commit_msg, author=author, author_date=commit_date)
+
+ if self.debug:
+ print('Committed', commit.name_rev, 'by', author)
+
+ self.fetched_revids.add(rev['rev_id'])
+
+ self.rev_no += 1
+ self.saveState() # Update operation state
+
+ return True
+
+ def fetchAll(self):
+ to_fetch = []
+ for rev in tqdm(self.wrevs, desc='Creating list of revisions to fetch'):
+ if rev['rev_id'] not in self.fetched_revids:
+ to_fetch.append(rev)
+ for rev in tqdm(to_fetch, desc='Downloading'):
+ self.commitNext(rev)
+
+ #
+ # Updates all children of the page to reflect parent's unixname change.
+ #
+ # Any page may be assigned a parent, which adds an entry to the revision log. We store this as parent:unixname in the page body.
+ # A parent may then be renamed.
+ # Wikidot logs no additional changes for child pages, yet they stay linked to the parent.
+ #
+ # Therefore, on every rename we must update all linked children in the same revision.
+ #
+ def updateChildren(self, oldunixname, newunixname):
+ if self.debug:
+ print('Updating parents for', oldunixname, newunixname)
+
+ for child in list(self.last_parents.keys()):
+ if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname:
+ self.updateParentField(child, self.last_parents[child], newunixname)
+
+ def updateTags(self, comment, unixname):
+ file_name = self.path+'/'+unixname+'.txt'
+ removed = []
+ removed_match = re.search(pattern = r'Removed tags: ([^.]+,?)\.')
+ if removed_match is not None:
+ removed = removed_match.group(1).split(', ')
+
+ tags = []
+
+ with codecs.open(file_name, "r", "UTF-8") as f:
+ content = f.readlines()
+
+ tagsline = None
+ for line in content:
+ if line.startswith('tags:'):
+ tagsline = line
+ break
+
+ # Father forgive me for the indentation depth
+ idx = -1
+ if tagsline is not None:
+ idx = content.index(tagsline)
+ for tag in tagsline.split(','):
+ if not tag in removed:
+ tags.append(tag)
+
+
+ added_match = re.search(pattern = r'Added tags: ([^.]+,?)\.')
+ if added_match is not None:
+ tags += added_match.group(1).split(', ')
+
+ tags.sort()
+
+ newtagsline = 'tags:' + ','.join(tags) + '\n'
+ if idx != -1:
+ contents[idx] = newtagsline
+ else:
+ contents = newtagsline + contents
+
+ with codecs.open(file_name, "w", "UTF-8") as f:
+ f.writelines(content)
+
+ #
+ # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname.
+ # The rest of the file is preserved.
+ #
+ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname):
+ child_path = self.path+'/'+child_unixname+'.txt'
+ if not os.path.isfile(child_path):
+ print('Failed to find child file!', child_path)
+ return
+ with codecs.open(child_path, "r", "UTF-8") as f:
+ content = f.readlines()
+ # Since this is all tracked by us, we KNOW there's a line in standard format somewhere
+ idx = content.index('parent:'+parent_oldunixname+'\n')
+ if idx < 0:
+ raise Exception("Cannot update child page "+child_unixname+": "
+ +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it.");
+ content[idx] = 'parent:'+parent_newunixname+'\n'
+ with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f:
+ f.writelines(content)
+
+
+ #
+ # Finalizes the construction process and deletes any temporary files.
+ #
+ def cleanup(self):
+ if os.path.exists(self.path+'/.wstate'):
+ os.remove(self.path+'/.wstate')
+ else:
+ print("wstate does not exist?")
+
+ if os.path.exists(self.path+'/.wrevs'):
+ os.remove(self.path+'/.wrevs')
+ else:
+ print("wrevs does not exist?")
+
+ if os.path.exists(self.path+'/.pages'):
+ os.remove(self.path+'/.pages')
+
+ if self.rev_no > 0:
+ self.index.add(['.fetched.txt'])
+ self.index.commit('Updating fetched revisions')
diff --git a/wikidot.py b/wikidot.py
index f01c59f..be378ea 100644
--- a/wikidot.py
+++ b/wikidot.py
@@ -1,193 +1,561 @@
-import requests
-import random
-from bs4 import BeautifulSoup
-import time
-
-# Implements various queries to Wikidot engine through its AJAX facilities
-
-
-class Wikidot:
- def __init__(self, site):
- self.site = site # Wikidot site to query
- self.delay = 200 # Delay between requests in msec
- self.debug = False # Print debug messages
- self.next_timeslot = time.clock() # Can call immediately
-
-
- # To honor usage rules, we wait for self.delay between requests.
- # Low-level query functions call this before every request to Wikidot./
- def _wait_request_slot(self):
- tm = time.clock()
- if self.next_timeslot - tm > 0:
- time.sleep(self.next_timeslot - tm)
- self.next_timeslot = tm + self.delay / 1000
- pass
-
- # Makes a Wikidot AJAX query. Returns the response+title or throws an error.
- def queryex(self, params):
- token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
- cookies = {"wikidot_token7": token}
- params['wikidot_token7'] = token
-
- if self.debug:
- print params
- print cookies
-
- self._wait_request_slot()
- req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies)
- json = req.json()
- if json['status'] == 'ok':
- return json['body'], (json['title'] if 'title' in json else '')
- else:
- raise req.text
-
- # Same but only returns the body, most responses don't have titles
- def query(self, params):
- return self.queryex(params)[0]
-
-
- # List all pages for the site.
-
- # Raw version
- # For the supported formats (module_body) see:
- # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
- def list_pages_raw(self, limit):
- res = self.query({
- 'moduleName': 'list/ListPagesModule',
- 'limit': limit if limit else '10000',
- 'perPage': limit if limit else '10000',
- 'module_body': '%%page_unix_name%%',
- 'separate': 'false',
- 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default
- })
- return res
-
- # Client version
- def list_pages(self, limit):
- raw = self.list_pages_raw(limit).replace('
',"\n")
- soup = BeautifulSoup(raw, 'html.parser')
- pages = []
- for entry in soup.div.p.text.split('\n'):
- pages.append(entry)
- return pages
-
-
- # Retrieves internal page_id by page unix_name.
- # Page IDs are required for most of page functions.
-
- def get_page_id(self, page_unix_name):
- # The only freaking way to get page ID is to load the page! Wikidot!
- self._wait_request_slot()
- req = requests.request('GET', self.site+'/'+page_unix_name)
- soup = BeautifulSoup(req.text, 'html.parser')
- for item in soup.head.find_all('script'):
- text = item.text
- pos = text.find("WIKIREQUEST.info.pageId = ")
- if pos >= 0:
- pos += len("WIKIREQUEST.info.pageId = ")
- crlf = text.find(";", pos)
- if crlf >= 0:
- return int(text[pos:crlf])
- else:
- return int(text[pos:])
- return None
-
-
- # Retrieves a list of revisions for a page.
- # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
-
- # Raw version
- def get_revisions_raw(self, page_id, limit):
- res = self.query({
- 'moduleName': 'history/PageRevisionListModule',
- 'page_id': page_id,
- 'page': '1',
- 'perpage': limit if limit else '10000',
- 'options': '{"all":true}'
- })
-
- soup = BeautifulSoup(res, 'html.parser')
- return soup.table.contents
-
- # Client version
- def get_revisions(self, page_id, limit):
- revs = []
- for tr in self.get_revisions_raw(page_id, limit):
- if tr.name != 'tr': continue # there's a header + various junk
-
- # RevID is stored as a value of an INPUT field
- rev_id = tr.input['value'] if tr.input else None
- if rev_id is None: continue # can't parse
-
- # Unixtime is stored as a CSS class time_*
- rev_date = 0
- date_span = tr.find("span", attrs={"class": "odate"})
- if date_span is not None:
- for cls in date_span['class']:
- if cls.startswith('time_'):
- rev_date = int(cls[5:])
-
- # Username in a last under
- user_span = tr.find("span", attrs={"class": "printuser"})
- for last_a in user_span.find_all('a'): pass
- rev_user = last_a.getText() if last_a else None
-
-
- # Comment is in the last TD of the row
- last_td = None
- for last_td in tr.find_all('td'): pass
- rev_comment = last_td.getText() if last_td else ""
-
- revs.append({
- 'id': rev_id,
- 'date': rev_date,
- 'user': rev_user,
- 'comment': rev_comment,
- })
- return revs
-
-
- # Retrieves revision source for a revision.
- # There's no raw version because there's nothing else in raw.
- def get_revision_source(self, rev_id):
- res = self.query({
- 'moduleName': 'history/PageSourceModule',
- 'revision_id': rev_id,
- # We don't need page id
- })
- # The source is HTMLified but BeautifulSoup's getText() will decode that
- # - htmlentities
- # -
s in place of linebreaks
- # - random real linebreaks (have to be ignored)
- soup = BeautifulSoup(res, 'html.parser')
- return soup.div.getText().lstrip(' \r\n')
-
- # Retrieves the rendered version + additional info unavailable in get_revision_source:
- # * Title
- # * Unixname at the time
- def get_revision_version_raw(self, rev_id):
- res = self.queryex({
- 'moduleName': 'history/PageVersionModule',
- 'revision_id': rev_id,
- })
- return res
-
- def get_revision_version(self, rev_id):
- res = self.get_revision_version_raw(rev_id) # this has title!
- soup = BeautifulSoup(res[0], 'html.parser')
-
- # First table is a flyout with revision details. Remove and study it.
- unixname = None
- details = soup.find("div", attrs={"id": "page-version-info"}).extract()
- for tr in details.find_all('tr'):
- tds = tr.find_all('td')
- if len(tds) < 2: continue
- if tds[0].getText().strip() == 'Page name:':
- unixname = tds[1].getText().strip()
-
- return {
- 'rev_id': rev_id,
- 'unixname': unixname,
- 'title': res[1],
- 'content': unicode(soup), # only content remains
- }
\ No newline at end of file
+import requests
+import random
+from bs4 import BeautifulSoup
+import time
+from urllib.parse import urlparse, urljoin
+from pprint import pprint
+import pathlib
+import hashlib
+import os
+import shutil
+import imghdr
+from timeit import default_timer as timer
+
+# Implements various queries to Wikidot engine through its AJAX facilities
+
+
+class Wikidot:
+ def __init__(self, site):
+ self.site = site # Wikidot site to query
+
+ # strip out trailing /, if it exists
+ if self.site[-1] == '/':
+ self.site = self.site[:-1]
+ self.sitename = urlparse(site).hostname.lower()
+ self.delay = 1000 # Delay between requests in msec
+ self.debug = False # Print debug messages
+ self.next_timeslot = time.process_time() # Can call immediately
+ self.max_retries = 5
+ self.failed_images = set()
+
+ # Downloads file if it doesn't exist
+ def maybe_download_file(self, url, file_path):
+ if url in self.failed_images:
+ if self.debug:
+ print(" ! ", url, "already failed, skipping")
+ return False
+
+ if os.path.exists(file_path):
+ if self.debug:
+ print(" - ", file_path, "exists, skipping")
+ return False
+
+ #self._wait_request_slot()
+
+ try:
+ dirpath = os.path.dirname(file_path)
+ os.makedirs(dirpath, exist_ok=True)
+ except OSError as e:
+ if e.errno == 36:
+ print("Path too long", e)
+ return False
+ else:
+ raise # re-raise previously caught exception
+
+ if self.debug:
+ print(" < downloading", url, "to" ,file_path, "dirpath", dirpath)
+
+ # In case of e. g. 500 errors
+ retries = 0
+ while retries < self.max_retries:
+ self._wait_request_slot()
+
+ headers = requests.utils.default_headers()
+ # Pretty generic user-agent, but we append a unique one for us
+ # Makes wikimedia happy
+ headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"})
+ start = timer()
+
+ try:
+ req = requests.get(url, stream=True, timeout=30)
+ except requests.exceptions.RequestException:
+ print('request exception')
+
+ retries += 1
+ time.sleep(retries * retries * retries) # up to ~2 minutes
+ continue
+ except urllib3.exceptions.ReadTimeoutError:
+ print('read timeout')
+
+ retries += 1
+ time.sleep(retries * retries * retries) # up to ~2 minutes
+ continue
+
+ if req.status_code >= 500:
+ print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+ # In case of debug enabled, we already printed this above
+ if not self.debug:
+ print(' - ', req)
+
+ retries += 1
+ time.sleep(retries * retries * retries)
+ continue
+
+ if req.status_code >= 400:
+ self.failed_images.add(url)
+ return False
+
+ try:
+ # In case of 404 errors or other stuff that indicates
+ # some bug in how we handle or request things
+ req.raise_for_status()
+
+ req.raw.decode_content = True
+ with open(file_path, 'wb') as out_file:
+ shutil.copyfileobj(req.raw, out_file)
+
+ if imghdr.what(file_path) is None:
+ print('Downloaded invalid image', url)
+ os.remove(file_path)
+ self.failed_images.add(url)
+ return False
+
+
+ if self.debug:
+ print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2))
+
+ return True
+ except OSError as e:
+ if e.errno == 36:
+ print("Filename to long", e)
+ return False
+ else:
+ raise # re-raise previously caught exception
+ except Exception as e:
+ print(' ! Failed to download', e, req, url)
+ raise e
+
+ print('Failed too many times for', url)
+ return False
+
+ # To honor usage rules, we wait for self.delay between requests.
+ # Low-level query functions call this before every request to Wikidot.
+ def _wait_request_slot(self):
+ tm = time.process_time()
+ if self.next_timeslot - tm > 0:
+ time.sleep(self.next_timeslot - tm)
+ self.next_timeslot = tm + self.delay / 1000
+
+ pass
+
+ # Makes a Wikidot AJAX query. Returns the response+title or throws an error.
+ def queryex(self, params, urlAppend = None):
+ token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8))
+ cookies = {"wikidot_token7": token}
+ params['wikidot_token7'] = token
+
+ if self.debug:
+ print(' - ', params)
+ print(' - ', cookies)
+
+ url = self.site+'/ajax-module-connector.php'
+ if urlAppend is not None:
+ url += urlAppend
+
+ # In case of e. g. 500 errors
+ retries = 0
+ while retries < self.max_retries:
+ if retries > 0:
+ print(" ! retry", retries, "of", self.max_retries)
+
+ self._wait_request_slot()
+
+ start = timer()
+ try:
+ req = requests.request('POST', url, data=params, cookies=cookies, timeout=30)
+ except requests.exceptions.RequestException:
+ print('request timed out!')
+ retries += 1
+ time.sleep(retries * retries * retries)
+ continue
+
+ if self.debug:
+ print(' * ajax request completed in', round(timer() - start, 2))
+
+ # Usually a 502 error, recovers immediately
+ if req.status_code >= 500:
+ retries += 1
+ print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+
+ # In case of debug enabled, we already printed this above
+ if not self.debug:
+ print(req, params)
+
+ # Be nice, double wait delay for errors
+ self._wait_request_slot()
+
+ # Extra nice, sleep longer (delay grows as retries cubed), hope for the
+ # server to recover
+ time.sleep(retries * retries * retries)
+
+ continue
+
+ try:
+ # In case of 404 errors or other stuff that indicates
+ # some bug in how we handle or request things
+ req.raise_for_status()
+ except Exception as e:
+ print(' ! Failed to get response from wikidot', e, req, url, params)
+
+ try:
+ json = req.json()
+ except Exception as e:
+ print(' ! Failed to get response from wikidot', e, req, url, params)
+ if retries < self.max_retries:
+ retries += 1
+ #self._wait_request_slot()
+ time.sleep(retries * retries * retries)
+ continue
+
+ raise e
+
+ if json['status'] == 'ok':
+ return json['body'], (json['title'] if 'title' in json else '')
+ else:
+ print(" ! error in response", json)
+
+ retries += 1
+ time.sleep(retries * retries * retries)
+ continue
+
+ print(' ! Failed too many times', url, params, cookies)
+ raise Exception('Failed too many times for ' + url)
+
+ # Same but only returns the body, most responses don't have titles
+ def query(self, params, urlAppend = None):
+ return self.queryex(params, urlAppend)[0]
+
+ # List all pages for the site.
+
+ # Raw version
+ # For the supported formats (module_body) see:
+ # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php
+ def list_pages_raw(self, limit, offset):
+ res = self.query({
+ 'moduleName': 'list/ListPagesModule',
+ 'limit': limit if limit else '10000',
+ 'perPage': limit if limit else '10000',
+ 'module_body': '%%page_unix_name%%',
+ 'separate': 'false',
+ 'p': str(offset),
+ 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default
+ }, '/p/' + str(offset))
+ return res
+
+ # Client version
+ def list_pages(self, limit):
+ offset = 1
+ pages = []
+
+ while True:
+ raw = self.list_pages_raw(limit, offset).replace('
',"\n")
+ soup = BeautifulSoup(raw, 'html.parser')
+
+
+ for entry in soup.div.p.text.split('\n'):
+ pages.append(entry)
+
+ if self.debug:
+ print(' - Pages found:', len(pages))
+
+ targets = soup.find_all('span','target')
+ if len(targets) < 2:
+ print(" ! Unable to find next listing page, not enough target spans")
+ break
+
+ next_url = targets[-1].a.get('href').split('/')
+ if len(next_url) > 0 and next_url[-1].isnumeric():
+ next_page = int(next_url[-1])
+
+ if self.debug:
+ print(' - Next listing page', next_page)
+
+ else:
+ print(" ! invalid next url", next_url)
+ break
+
+ #next_page = int(targets[0].a.text)
+
+ current_spans = soup.find_all('span','current')
+ if len(current_spans) > 0:
+ current_page = int(current_spans[0].text)
+
+ if self.debug:
+ print(' - Current listing page', current_page)
+
+ else:
+ print(" ! unable to find current page")
+ break;
+
+ if next_page != offset + 1:
+ if self.debug:
+ print(' ! Next page is wrong', next_page, 'hopefully at the end')
+ break
+
+ offset += 1
+
+ print(" - Fetching listing page", offset)
+
+ return pages
+
+
+ # Retrieves internal page_id by page unix_name.
+ # Page IDs are required for most of page functions.
+
+ def get_page_id(self, page_unix_name):
+ # The only freaking way to get page ID is to load the page! Wikidot!
+ self._wait_request_slot()
+ url = self.site+'/'+page_unix_name + '/noredirect/true';
+
+ if self.debug:
+ print(" > fetching", url)
+
+ start = timer()
+ retries = 0
+ req = None
+ while retries < self.max_retries:
+ try:
+ req = requests.request('GET', url, timeout=30)
+ except requests.exceptions.RequestException:
+ print('request timed out!')
+ retries += 1
+ time.sleep(retries * retries * retries)
+ continue
+
+ if req.status_code >= 500:
+ print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries))
+ retries += 1
+ time.sleep(retries * retries * retries)
+ continue
+
+ req.raise_for_status()
+ break
+
+ if self.debug:
+ print(' * page id request completed in', round(timer() - start, 2))
+
+ soup = BeautifulSoup(req.text, 'html.parser')
+ for item in soup.head.find_all('script'):
+ text = item.string
+ if text is None:
+ #print("No text in script item", item)
+ continue
+
+ pos = text.find("WIKIREQUEST.info.pageId = ")
+ if pos >= 0:
+ pos += len("WIKIREQUEST.info.pageId = ")
+ crlf = text.find(";", pos)
+ if crlf >= 0:
+ return int(text[pos:crlf])
+ else:
+ return int(text[pos:])
+
+ raise Exception('Failed to get page_id for ' + page_unix_name)
+
+
+ # Retrieves a list of revisions for a page.
+ # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php
+
+ # Raw version
+ def get_revisions_raw(self, page_id, limit):
+ res = self.query({
+ 'moduleName': 'history/PageRevisionListModule',
+ 'page_id': page_id,
+ 'page': '1',
+ 'perpage': limit if limit else '10000',
+ 'options': '{"all":true}'
+ })
+
+ soup = BeautifulSoup(res, 'html.parser')
+ return soup.table.contents
+
+ # Client version
+ def get_revisions(self, page_id, limit):
+ revs = []
+ raw = self.get_revisions_raw(page_id, limit)
+ for tr in raw:
+ if tr.name != 'tr': continue # there's a header + various junk
+
+ # RevID is stored as a value of an INPUT field
+ rev_id = tr.input['value'] if tr.input else None
+ if rev_id is None: continue # can't parse
+ attachment_action = tr.find("span", attrs={"title": "file/attachment action"})
+ attached_file = False
+ if attachment_action is not None:
+ attached_file = True
+ print(" - was attchment", rev_id)
+
+ # Unixtime is stored as a CSS class time_*
+ rev_date = 0
+ date_span = tr.find("span", attrs={"class": "odate"})
+ if date_span is not None:
+ for cls in date_span['class']:
+ if cls.startswith('time_'):
+ rev_date = int(cls[5:])
+ else:
+ print(" ! no odate found")
+
+ # Username is the text of the last <a> under the "printuser" <span>
+ user_span = tr.find("span", attrs={"class": "printuser"})
+ last_a = None
+ for last_a in user_span.find_all('a'): pass
+ rev_user = last_a.getText() if last_a else None
+
+
+ # Comment is in the last TD of the row
+ last_td = None
+ for last_td in tr.find_all('td'): pass
+ rev_comment = last_td.getText() if last_td else ""
+
+ revs.append({
+ 'id': rev_id,
+ 'date': rev_date,
+ 'user': rev_user,
+ 'comment': rev_comment,
+ 'attached_file': attached_file,
+ })
+ return revs
+
+ # topics in forum: http://www.scp-wiki.net/forum/c-###/sort/start
+ # -> div class 'title'
+ # -> a href= http://www.scp-wiki.net/forum/t-####/foobar (foobar not important)
+
+ # posts in topic http://www.scp-wiki.net/forum/t-####/
+ # -> div id 'thread-container'
+ # -> div class 'post-container'
+ # -> div class = 'post', id = 'post-####'
+ # -> div class 'title'
+ # -> div class 'content'
+ # -> div class 'post-container'
+ # -> ...
+ # -> div class 'post-container'
+ # -> ...
+
+ #def get_forum_post_revisions(self, post_id):
+ # res = self.query({
+ # 'moduleName': 'forum/sub/ForumPostRevisionsModule',
+ # 'postId': post_id,
+ # })
+ # revisions = []
+ # soup = BeautifulSoup(res, 'html.parser')
+ # for row in soup.find_all("tr"):
+ # columns = row.find_all("td")
+
+ # if len(columns) != 3:
+ # raise Exception('Invalid row in post history for ' + str(post_id))
+
+ # user = columns[0].find('a').getText()
+ # time = columns[1].find('span').getText()
+ # rev_id_js = columns[0].find('a')['href']
+ # match = re.search(r'showRevision\(event, ([0-9]+)\)', rev_id_js)
+ # rev_id = match.group(1)
+
+ # revisions.append({
+ # 'id': rev_id,
+ # 'user': user,
+ # 'time': time,
+ # })
+
+ # Retrieves revision source for a revision.
+ # There's no raw version because there's nothing else in raw.
+ def get_revision_source(self, rev_id):
+ res = self.query({
+ 'moduleName': 'history/PageSourceModule',
+ 'revision_id': rev_id,
+ # We don't need page id
+ })
+ # The source is HTMLified but BeautifulSoup's getText() will decode that
+ # - htmlentities
+ # - <br> tags in place of linebreaks
+ # - random real linebreaks (have to be ignored)
+ soup = BeautifulSoup(res, 'html.parser')
+ return soup.div.getText().lstrip(' \r\n')
+
+ # Retrieves the rendered version + additional info unavailable in get_revision_source:
+ # * Title
+ # * Unixname at the time
+ #
+ # TODO: I think this could fetch the source as well, so we don't need to
+ # fetch two pages (the fetch source function above).
+ def get_revision_version_raw(self, rev_id):
+ res = self.queryex({
+ 'moduleName': 'history/PageVersionModule',
+ 'revision_id': rev_id,
+ })
+ return res
+
+ def get_revision_version(self, rev_id):
+ res = self.get_revision_version_raw(rev_id) # this has title!
+ soup = BeautifulSoup(res[0], 'html.parser')
+
+ # Extract list of images
+
+ # TODO: to get the right revision that added them, we need to go back
+ # and amend the commits that are flagged as attached_file above,
+ # because we can't get the image file name or URL reliably until they
+ # are added to the page source, wikidot itself doesn't store this information.
+ # So much hassle for little value, we get the empty commits when images
+ # are added anyways.
+ images = []
+ for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}):
+ img_src = None
+ img_name = ""
+ full_link = img_div.find("a")
+ if full_link is not None:
+ # Check if it has a thumbnail, otherwise we can't trust that it is the original
+ img = full_link.find("img", attrs={"class": "enlarge"})
+ if img is not None:
+ img_src = full_link["href"]
+ img_name = img["alt"]
+
+ if img_src is None:
+ img = img_div.find("img")
+ if img is not None:
+ img_src = img["src"]
+ img_name = img["alt"]
+
+ if img_src is None:
+ continue
+
+ # Just in case, I don't think it ever happens, but resolve '..'
+ # juuuust in case someone tries to be funny
+ img_url = urlparse(urljoin(img_src, "."))
+ url_path = pathlib.Path(img_url.path)
+
+ img_path = ""
+ if img_url.netloc != "":
+ img_path = img_url.netloc + "/"
+ if img_url.netloc[-1] != '/':
+ img_path += '/'
+
+ if img_url.path != "" and img_url.path[0] == '/':
+ img_path += img_url.path[1:]
+ else:
+ img_path += img_url.path
+
+ if img_path == "" or img_path[-1] == "/":
+ img_path += img_name
+
+ images.append({"src": img_src, "filename": img_name, "filepath": "images/" + img_path})
+
+
+
+ # First table is a flyout with revision details. Remove and study it.
+ unixname = None
+ details = soup.find("div", attrs={"id": "page-version-info"}).extract()
+ for tr in details.find_all('tr'):
+ tds = tr.find_all('td')
+ if len(tds) < 2: continue
+ if tds[0].getText().strip() == 'Page name:':
+ unixname = tds[1].getText().strip()
+
+ if unixname is None:
+ raise Exception('Failed to find unixname for ' + rev_id)
+
+ return {
+ 'rev_id': rev_id,
+ 'unixname': unixname,
+ 'title': res[1],
+ 'content': str(soup), # only content remains
+ 'images': images,
+ }