diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a295864 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +__pycache__ diff --git a/.hgignore b/.hgignore deleted file mode 100644 index 471301b..0000000 --- a/.hgignore +++ /dev/null @@ -1,2 +0,0 @@ -syntax:glob -*.pyc diff --git a/crawl.py b/crawl.py index 22da38a..68fcf41 100644 --- a/crawl.py +++ b/crawl.py @@ -1,118 +1,121 @@ -import argparse -import sys -import locale -import codecs -import os -from wikidot import Wikidot -from rmaint import RepoMaintainer - -# TODO: Files. -# TODO: Forum and comment pages. -# TODO: Ability to download new transactions since last dump. -# We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump) - -rawStdout = sys.stdout -rawStderr = sys.stderr -sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace') -sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace') - -parser = argparse.ArgumentParser(description='Queries Wikidot') -parser.add_argument('site', help='URL of Wikidot site') -# Actions -parser.add_argument('--list-pages', action='store_true', help='List all pages on this site') -parser.add_argument('--source', action='store_true', help='Print page source (requires --page)') -parser.add_argument('--content', action='store_true', help='Print page content (requires --page)') -parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)') -parser.add_argument('--dump', type=str, help='Download page revisions to this directory') -# Debug actions -parser.add_argument('--list-pages-raw', action='store_true') -parser.add_argument('--log-raw', action='store_true') -# Action settings -parser.add_argument('--page', type=str, help='Query only this page') -parser.add_argument('--depth', type=int, default='10000', 
help='Query only last N revisions') -parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository') -# Common settings -parser.add_argument('--debug', action='store_true', help='Print debug info') -parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') -args = parser.parse_args() - - -wd = Wikidot(args.site) -wd.debug = args.debug -wd.delay = args.delay - - -def force_dirs(path): - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != os.errno.EEXIST: - raise - -if args.list_pages_raw: - print wd.list_pages_raw(args.depth) - -elif args.list_pages: - for page in wd.list_pages(args.depth): - print page - -elif args.source: - if not args.page: - raise "Please specify --page for --source." - - page_id = wd.get_page_id(args.page) - if not page_id: - raise "Page not found: "+args.page - - revs = wd.get_revisions(page_id, 1) # last revision - print wd.get_revision_source(revs[0]['id']) - -elif args.content: - if not args.page: - raise "Please specify --page for --source." - - page_id = wd.get_page_id(args.page) - if not page_id: - raise "Page not found: "+args.page - - revs = wd.get_revisions(page_id, 1) # last revision - print wd.get_revision_version(revs[0]['id']) - -elif args.log_raw: - if not args.page: - raise "Please specify --page for --log." - - page_id = wd.get_page_id(args.page) - if not page_id: - raise "Page not found: "+args.page - - print wd.get_revisions_raw(page_id, args.depth) - - -elif args.log: - if not args.page: - raise "Please specify --page for --log." 
- - page_id = wd.get_page_id(args.page) - if not page_id: - raise "Page not found: "+args.page - for rev in wd.get_revisions(page_id, args.depth): - print unicode(rev) - - -elif args.dump: - print "Downloading pages to "+args.dump - force_dirs(args.dump) - - rm = RepoMaintainer(wd, args.dump) - rm.debug = args.debug - rm.storeRevIds = args.revids - rm.buildRevisionList([args.page] if args.page else None, args.depth) - rm.openRepo() - - print "Downloading revisions..." - while rm.commitNext(): - pass - - rm.cleanup() - print "Done." +import argparse +import sys +import locale +import codecs +import os +from wikidot import Wikidot +from rmaint import RepoMaintainer + +# TODO: Files. +# TODO: Forum and comment pages. +# TODO: Ability to download new transactions since last dump. +# We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump) + +parser = argparse.ArgumentParser(description='Queries Wikidot') +parser.add_argument('site', help='URL of Wikidot site') +# Actions +parser.add_argument('--list-pages', action='store_true', help='List all pages on this site') +parser.add_argument('--max-page-count', type=int, default='10000', help='Only list/fetch up to this amount of pages') +parser.add_argument('--source', action='store_true', help='Print page source (requires --page)') +parser.add_argument('--content', action='store_true', help='Print page content (requires --page)') +parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)') +parser.add_argument('--dump', type=str, help='Download page revisions to this directory') +# Debug actions +parser.add_argument('--list-pages-raw', action='store_true') +parser.add_argument('--log-raw', action='store_true') +# Action settings +parser.add_argument('--page', type=str, help='Query only this page') +parser.add_argument('--depth', type=int, default='10000', 
help='Query only last N revisions') +parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True) +parser.add_argument('--skip', type=str, help='Skip the specified revision') +parser.add_argument('--skip-pages', type=str, help='Skip the specified pages') +parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo') +# Common settings +parser.add_argument('--debug', action='store_true', help='Print debug info') +parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') +args = parser.parse_args() + + +wd = Wikidot(args.site) +wd.debug = args.debug +wd.delay = args.delay + + +def force_dirs(path): + os.makedirs(path, exist_ok=True) + +if args.list_pages_raw: + print((wd.list_pages_raw(limit = args.max_page_count))) # argparse stores --max-page-count as args.max_page_count + +elif args.list_pages: + for page in wd.list_pages(limit = args.max_page_count): + print(page) + +elif args.source: + if not args.page: + raise Exception("Please specify --page for --source.") + + page_id = wd.get_page_id(page_unix_name=args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + revs = wd.get_revisions(page_id, 1) # last revision + print((wd.get_revision_source(revs[0]['id']))) + +elif args.content: + if not args.page: + raise Exception("Please specify --page for --content.") + + page_id = wd.get_page_id(page_unix_name=args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + revs = wd.get_revisions(page_id, 1) # last revision + print((wd.get_revision_version(revs[0]['id']))) + +elif args.log_raw: + if not args.page: + raise Exception("Please specify --page for --log-raw.") + + page_id = wd.get_page_id(page_unix_name=args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + print((wd.get_revisions_raw(page_id, args.depth))) + + +elif args.log: + if not args.page: + raise Exception("Please specify --page for --log.") + + page_id =
wd.get_page_id(page_unix_name=args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + for rev in wd.get_revisions(page_id, args.depth): + print((str(rev))) + + +elif args.dump: + print(("Downloading pages to "+args.dump)) + force_dirs(args.dump) + + rm = RepoMaintainer(wd, args.dump) + rm.debug = args.debug + rm.storeRevIds = args.revids + rm.max_depth = args.depth + rm.max_page_count = args.max_page_count + rm.buildRevisionList([args.page] if args.page else None) + rm.openRepo() + + if args.skip_pages: + rm.pages_to_skip = args.skip_pages.split(",") + if args.skip: + rm.revs_to_skip = args.skip.split(",") + + print("Downloading revisions") + rm.fetchAll() + + if args.cleanup: + rm.cleanup() + + print("Done.") diff --git a/hgpatch.py b/hgpatch.py deleted file mode 100644 index 6d2ff12..0000000 --- a/hgpatch.py +++ /dev/null @@ -1,50 +0,0 @@ -from mercurial import scmutil, osutil -from types import MethodType -from mercurial import encoding -import codecs - -# Patches commit-message unicode handling on Python 2.x - -# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert -# all input from "input encoding" (set in mercurial/encoding.py) - -# Problem 1: -# If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8, -# it'll still try to decode it to ASCII. -# Solution: -# Patch this decoding function to pass unicode unchanged. - -old_fromlocal = None - -def better_fromlocal(s): - if isinstance(s, unicode): - return s.encode('utf-8') - global old_fromlocal - return old_fromlocal(s) - -old_fromlocal = encoding.fromlocal -encoding.fromlocal = better_fromlocal - - -# Problem 2: -# Separate from actual log, Mercurial stores commit message in commit-message.txt. -# Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails. -# Solution: -# Patch virtual-fs open() function to use codecs.open wrapper in this particular case. 
- -old_vfs_call = None - -def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False): - fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose) - if path.endswith('last-message.txt'): - # Create a wrapper like codecs.open does: - info = codecs.lookup("utf-8") - fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict') - fp.encoding = 'utf-8' - return fp - -old_vfs_call = scmutil.vfs.__call__ -scmutil.vfs.__call__ = better_vfs_call - - - diff --git a/readme.md b/readme.md index f66a0cc..641a570 100644 --- a/readme.md +++ b/readme.md @@ -1,30 +1,50 @@ -This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you: - -* List all pages on a site -* See all revisions of a page -* Query page source - -Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments! - -##### Examples: - - crawl.py http://example.wikidot.com --dump ExampleRepo - crawl.py http://example.wikidot.com --log --page example-page - -It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers. - -Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed. 
- -##### Useful links: - -Wikidot code (very old) which simplifies things a bit: - -* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php - -The descriptions for on-site modules are heavily correlated with AJAX ones: - -* http://www.wikidot.com/doc-modules:listpages-module - -Someone else did Wikidot AJAX: - -* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py +*This is a fork to make a permanent backup of the SCP wiki.* + +This is a Python command line client for the relatively popular wiki hosting +http://www.wikidot.com which lets you: + +* List all pages on a site +* See all revisions of a page +* Query page source + +Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, authors and comments! + +##### Dependencies + +At least: + +* Python 3 +* python-beautifulsoup4 +* python-gitpython +* python-requests +* python-tqdm + +##### Examples: + + crawl.py http://example.wikidot.com --dump ExampleRepo + crawl.py http://example.wikidot.com --log --page example-page + +It uses internal Wikidot AJAX requests to do its job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers. + +Downloading of large sites might take a while. If anything breaks, just restart the same command; it'll continue from where it crashed. + +##### Useful links: + +Wikidot code (very old) which simplifies things a bit: + +* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php + +The descriptions for on-site modules are heavily correlated with AJAX ones: + +* http://www.wikidot.com/doc-modules:listpages-module + +Someone else did Wikidot AJAX: + +* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py + + +#### TODO + + - Handle deleted images. Probably need to check the diff and check all pages for references if removed from one page. + - Handle tags (both added and removed).
+ diff --git a/rmaint.py b/rmaint.py index 029319f..1ab383f 100644 --- a/rmaint.py +++ b/rmaint.py @@ -1,263 +1,535 @@ -import os -import codecs -from mercurial import commands, ui, hg -import hgpatch -import cPickle as pickle -import wikidot - -# Repository builder and maintainer -# Contains logic for actual loading and maintaining the repository over the course of its construction. - -# Usage: -# rm = RepoMaintainer(wikidot, path) -# rm.buildRevisionList(pages, depth) -# rm.openRepo() -# while rm.commitNext(): -# pass -# rm.cleanup() - -# Talkative. - -class RepoMaintainer: - def __init__(self, wikidot, path): - # Settings - self.wd = wikidot # Wikidot instance - self.path = path # Path to repository - self.debug = False # = True to enable more printing - self.storeRevIds = True # = True to store .revid with each commit - - # Internal state - self.wrevs = None # Compiled wikidot revision list (history) - - self.rev_no = 0 # Next revision to process - self.last_names = {} # Tracks page renames: name atm -> last name in repo - self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - self.ui = None # Mercurial UI object - self.repo = None # Mercurial repo object - - - # - # Saves and loads revision list from file - # - def saveWRevs(self): - fp = open(self.path+'\\.wrevs', 'wb') - pickle.dump(self.wrevs, fp) - fp.close() - - def loadWRevs(self): - fp = open(self.path+'\\.wrevs', 'rb') - self.wrevs = pickle.load(fp) - fp.close() - - # - # Compiles a combined revision list for a given set of pages, or all pages on the site. - # pages: compile history for these pages - # depth: download at most this number of revisions. - # - # If there exists a cached revision list at the repository destination, - # it is loaded and no requests are made. - # - def buildRevisionList(self, pages = None, depth = 10000): - if os.path.isfile(self.path+'\\.wrevs'): - print "Loading cached revision list..." 
- self.loadWRevs() - else: - print "Building revision list..." - if not pages: - pages = self.wd.list_pages(10000) - self.wrevs = [] - for page in pages: - print "Querying page: "+page - page_id = self.wd.get_page_id(page) - print "ID: "+str(page_id) - revs = self.wd.get_revisions(page_id, depth) - print "Revisions: "+str(len(revs)) - for rev in revs: - self.wrevs.append({ - 'page_id' : page_id, - 'page_name' : page, # name atm, not at revision time - 'rev_id' : rev['id'], - 'date' : rev['date'], - 'user' : rev['user'], - 'comment' : rev['comment'], - }) - self.saveWRevs() # Save a cached copy - print "" - - - print "Total revisions: "+str(len(self.wrevs)) - - print "Sorting revisions..." - self.wrevs.sort(key=lambda rev: rev['date']) - print "" - - if self.debug: - print "Revision list: " - for rev in self.wrevs: - print str(rev)+"\n" - print "" - - - # - # Saves and loads operational state from file - # - def saveState(self): - fp = open(self.path+'\\.wstate', 'wb') - pickle.dump(self.rev_no, fp) - pickle.dump(self.last_names, fp) - pickle.dump(self.last_parents, fp) - fp.close() - - def loadState(self): - fp = open(self.path+'\\.wstate', 'rb') - self.rev_no = pickle.load(fp) - self.last_names = pickle.load(fp) - try: - self.last_parents = pickle.load(fp) - except EOFError: - pass - fp.close() - - - # - # Initializes the construction process, after the revision list has been compiled. - # Either creates a new repo, or loads the existing one at the target path - # and restores its construction state. - # - def openRepo(self): - # Create a new repository or continue from aborted dump - self.ui=ui.ui() - self.last_names = {} # Tracks page renames: name atm -> last name in repo - self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - if os.path.isfile(self.path+'\\.wstate'): - print "Continuing from aborted dump state..." 
- self.loadState() - self.repo = hg.repository(self.ui, self.path) - - else: # create a new repository (will fail if one exists) - print "Initializing repository..." - commands.init(self.ui, self.path) - self.repo = hg.repository(self.ui, self.path) - self.rev_no = 0 - - if self.storeRevIds: - # Add revision id file to the new repo - fname = self.path+'\\.revid' - codecs.open(fname, "w", "UTF-8").close() - commands.add(self.ui, self.repo, str(fname)) - - - # - # Takes an unprocessed revision from a revision log, fetches its data and commits it. - # Returns false if no unprocessed revisions remain. - # - def commitNext(self): - if self.rev_no >= len(self.wrevs): - return False - - rev = self.wrevs[self.rev_no] - source = self.wd.get_revision_source(rev['rev_id']) - # Page title and unix_name changes are only available through another request: - details = self.wd.get_revision_version(rev['rev_id']) - - # Store revision_id for last commit - # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial - if self.storeRevIds: - fname = self.path+'\\.revid' - outp = codecs.open(fname, "w", "UTF-8") - outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway - outp.close() - - unixname = rev['page_name'] - rev_unixname = details['unixname'] # may be different in revision than atm - - # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. - # The only way to know they were changed is revision comments, though evil people may trick us. 
- if rev['comment'].startswith('Parent page set to: "'): - # This is a parenting revision, remember the new parent - parent_unixname = rev['comment'][21:-2] - self.last_parents[unixname] = parent_unixname - else: - # Else use last parent_unixname we've recorded - parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None - # There are also problems when parent page gets renamed -- see updateChildren - - # If the page is tracked and its name just changed, tell HG - rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname) - if rename: - self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there - commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') - - # Ouput contents - fname = self.path+'\\'+rev_unixname+'.txt' - outp = codecs.open(fname, "w", "UTF-8") - if details['title']: - outp.write('title:'+details['title']+'\n') - if parent_unixname: - outp.write('parent:'+parent_unixname+'\n') - outp.write(source) - outp.close() - - # Add new page - if not unixname in self.last_names: # never before seen - commands.add(self.ui, self.repo, str(fname)) - - self.last_names[unixname] = rev_unixname - - # Commit - if rev['comment'] <> '': - commit_msg = rev_unixname + ': ' + rev['comment'] - else: - commit_msg = rev_unixname - if rev['date']: - commit_date = str(rev['date']) + ' 0' - else: - commit_date = None - print "Commiting: "+str(self.rev_no)+'. '+commit_msg - - commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) - self.rev_no += 1 - - self.saveState() # Update operation state - return True - - - # - # Updates all children of the page to reflect parent's unixname change. - # - # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body. - # A parent may then be renamed. 
- # Wikidot logs no additional changes for child pages, yet they stay linked to the parent. - # - # Therefore, on every rename we must update all linked children in the same revision. - # - def updateChildren(self, oldunixname, newunixname): - for child in self.last_parents.keys(): - if self.last_parents[child] == oldunixname: - self.updateParentField(child, self.last_parents[child], newunixname) - - # - # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. - # The rest of the file is preserved. - # - def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f: - content = f.readlines() - # Since this is all tracked by us, we KNOW there's a line in standard format somewhere - idx = content.index('parent:'+parent_oldunixname+'\n') - if idx < 0: - raise Exception("Cannot update child page "+child_unixname+": " - +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); - content[idx] = 'parent:'+parent_newunixname+'\n' - with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f: - f.writelines(content) - - - # - # Finalizes the construction process and deletes any temporary files. - # - def cleanup(self): - os.remove(self.path+'\\.wstate') - os.remove(self.path+'\\.wrevs') \ No newline at end of file +import wikidot + +# Basic python stuff +import os +import codecs +import pickle as pickle +import json + +# git stuff +from git import Repo, Actor +import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps +import re # For sanitizing usernames to fake email addresses + +from tqdm import tqdm # for progress bar + +# Repository builder and maintainer +# Contains logic for actual loading and maintaining the repository over the course of its construction. 
+ +# Usage: +# rm = RepoMaintainer(wikidot, path) +# rm.buildRevisionList(pages) +# rm.openRepo() +# rm.fetchAll() # calls commitNext(rev) for every revision not yet fetched +# # (commitNext takes the revision to commit, so it cannot be looped bare) +# rm.cleanup() + +# Talkative. + +class RepoMaintainer: + def __init__(self, wikidot, path): + # Settings + self.wd = wikidot # Wikidot instance + self.path = path # Path to repository + self.debug = False # = True to enable more printing + self.storeRevIds = True # = True to store .revid with each commit + + # Internal state + self.wrevs = None # Compiled wikidot revision list (history) + + self.rev_no = 0 # Next revision to process + self.last_names = {} # Tracks page renames: name atm -> last name in repo + self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo + + self.repo = None # Git repo object + self.index = None # Git current index object + self.max_depth = 10000 # download at most this number of revisions + self.max_page_count = 10000 # download at most this number of pages + + self.pbar = None + self.first_fetched = 0 # For progress bar + self.fetched_revids = set() + + self.revs_to_skip = [] + self.pages_to_skip = [] + + + # + # Saves and loads revision list from file + # + def saveWRevs(self): + fp = open(self.path+'/.wrevs', 'wb') + pickle.dump(self.wrevs, fp) + fp.close() + + def loadWRevs(self): + fp = open(self.path+'/.wrevs', 'rb') + self.wrevs = pickle.load(fp) + fp.close() + + def savePages(self, pages): + fp = open(self.path+'/.pages', 'wb') + pickle.dump(pages, fp) + fp.close() + + def appendFetchedRevid(self, revid): + fp = open(self.path+'/.fetched.txt', 'a') + fp.write(revid + '\n') + fp.close() + + def loadFetchedRevids(self): + self.fetched_revids = set([line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')]) + + def saveFailedImages(self): + file_path = self.path + '/.failed-images.txt' + fp = open(file_path, 'w') + for failed in self.wd.failed_images: + fp.write(failed + '\n') + fp.close() + + def loadFailedImages(self): + file_path = self.path + '/.failed-images.txt'
+ if not os.path.isfile(file_path): + return + self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')]) + + # Persistent metadata about the repo: + # - Tracks page renames: name atm -> last name in repo + # - Tracks page parent names: name atm -> last parent in repo + def saveMetadata(self): + metadata = {'names': self.last_names, 'parents': self.last_parents } + fp = open(self.path+'/.metadata.json', 'w') + json.dump(metadata, fp) + fp.close() + + def loadMetadata(self): + fp = open(self.path+'/.metadata.json', 'r') + metadata = json.load(fp) + self.last_names = metadata['names'] + self.last_parents = metadata['parents'] + fp.close() + + self.loadFetchedRevids() + # + # Compiles a combined revision list for a given set of pages, or all pages on the site. + # pages: compile history for these pages + # + # If there exists a cached revision list at the repository destination, + # it is loaded and no requests are made. + # + def buildRevisionList(self, pages = None): + if os.path.isfile(self.path+'/.wrevs'): + print("Loading cached revision list...") + self.loadWRevs() + else: + self.wrevs = [] + if self.debug: + print('No existing wrevs') + + if os.path.isfile(self.path+'/.fetched.txt'): + self.loadFetchedRevids() + print(len(self.fetched_revids), 'revisions already fetched') + else: + self.fetched_revids = set() + + if self.debug: + print("Building revision list...") + + if not pages: + if os.path.isfile(self.path+'/.pages'): + print('Loading fetched pages') + fp = open(self.path+'/.pages', 'rb') + pages = pickle.load(fp) + fp.close() + + + if not pages or len(pages) < self.max_page_count: + if self.debug: + print('Need to fetch pages') + pages = self.wd.list_pages(self.max_page_count) + self.savePages(pages) + elif self.debug: + print(len(pages), 'pages loaded') + + fetched_pages = set() + + for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'): + page_name = wrev['page_name'] + + if page_name in fetched_pages: + 
continue + + fetched_pages.add(page_name) + + if self.debug: + print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages))) + + fetched = 0 + for page in tqdm(pages, desc='Updating list of revisions to fetch'): + if page in fetched_pages: + continue + + # TODO: more generic blacklisting + if page == "sandbox": + if self.debug: + print("Skipping", page) + continue + + fetched += 1 + page_id = self.wd.get_page_id(page) + + if self.debug: + print(("ID: "+str(page_id))) + + if page_id is None: + print('Page gone?', page) + continue + + revs = self.wd.get_revisions(page_id=page_id, limit=self.max_depth) + for rev in revs: + if rev['id'] in self.fetched_revids: + continue + + self.wrevs.append({ + 'page_id' : page_id, + 'page_name' : page, # current name, not at revision time (revisions can rename them) + 'rev_id' : rev['id'], + 'date' : rev['date'], + 'user' : rev['user'], + 'comment' : rev['comment'], + }) + self.saveWRevs() # Save a cached copy + + print("Number of revisions already fetched", len(self.fetched_revids), len(self.wrevs)) + + if os.path.isfile(self.path+'/.metadata.json'): + self.loadMetadata() + + print("") + + print(("Total revisions: "+str(len(self.wrevs)))) + + if self.debug: + print("Sorting revisions...") + + self.wrevs.sort(key=lambda rev: rev['date']) + + if self.debug: + if len(self.wrevs) < 100: + print("") + print("Revision list: ") + for rev in self.wrevs: + print((str(rev)+"\n")) + print("") + else: + print("Too many revisions, not printing everything") + + + # + # Saves and loads operational state from file + # + def saveState(self): + fp = open(self.path+'/.wstate', 'wb') + pickle.dump(self.rev_no, fp) + fp.close() + + def loadState(self): + if not os.path.isfile(self.path+'/.wstate'): + return + fp = open(self.path+'/.wstate', 'rb') + self.rev_no = pickle.load(fp) + fp.close() + + + # + # Initializes the construction process, after the revision list has been compiled. 
+ # Either creates a new repo, or loads the existing one at the target path + # and restores its construction state. + # + def openRepo(self): + # Create a new repository or continue from aborted dump + self.last_names = self.last_names or {} # Tracks page renames: name atm -> last name in repo (kept when buildRevisionList already loaded .metadata.json) + self.last_parents = self.last_parents or {} # Tracks page parent names: name atm -> last parent in repo (kept for the same reason) + self.loadFailedImages() + + if os.path.isdir(self.path+'/.git'): + print("Continuing from aborted dump state...") + self.loadState() + self.repo = Repo(self.path) + assert not self.repo.bare + + else: # create a new repository (will fail if one exists) + print("Initializing repository...") + self.repo = Repo.init(self.path) + self.rev_no = 0 + + if self.storeRevIds: + # Add revision id file to the new repo + fname = '.revid' + codecs.open(self.path + '/' + fname, "w", "UTF-8").close() + self.repo.index.add([fname]) + self.repo.index.commit("Initial creation of repo") # self.index is still None here; it is only assigned on the next line + self.index = self.repo.index + + # + # Takes an unprocessed revision from a revision log, fetches its data and commits it. + # Returns false if no unprocessed revisions remain. + # + def commitNext(self, rev): + if self.rev_no >= len(self.wrevs): + return False + + if rev['rev_id'] in self.fetched_revids: + self.rev_no += 1 + + self.saveState() # Update operation state + return True + + if rev['rev_id'] in self.revs_to_skip: + print("Skipping", rev) + return True + + unixname = rev['page_name'] + if unixname in self.pages_to_skip: + print("Skipping", rev) + return True + + source = self.wd.get_revision_source(rev['rev_id']) + # Page title and unix_name changes are only available through another request: + details = self.wd.get_revision_version(rev['rev_id']) + + # Store revision_id for last commit + # Without this, empty commits (e.g.
file uploads) will be skipped by Git + if self.storeRevIds: + fname = self.path+'/.revid' + outp = codecs.open(fname, "w", "UTF-8") + outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway + outp.close() + + rev_unixname = details['unixname'] # may be different in revision than atm + + # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. + # The only way to know they were changed is revision comments, though evil people may trick us. + if rev['comment'].startswith('Parent page set to: "'): + # This is a parenting revision, remember the new parent + parent_unixname = rev['comment'][21:-2] + if self.debug: + print('Parent changed', parent_unixname) + self.last_parents[unixname] = parent_unixname + else: + # Else use last parent_unixname we've recorded + parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None + + ## TODO: test + #if rev['comment'].startswith('Removed tags: ') or rev['comment'].startswith('Added tags: '): + # self.updateTags(rev['comment'], rev_unixname) + + # There are also problems when parent page gets renamed -- see updateChildren + + # If the page is tracked and its name just changed, tell Git + fname = str(rev_unixname) + '.txt' + rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) + + commit_msg = "" + + added_file_paths = [] + + if rename: + name_rename_from = str(self.last_names[unixname])+'.txt' + + if self.debug: + print("Moving renamed", name_rename_from, "to", fname) + + self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there + + # Try to do the best we can, these situations usually stem from vandalism people have cleaned up + if os.path.isfile(self.path + '/' + name_rename_from): + self.index.move([name_rename_from, fname], force=True) + commit_msg += "Renamed from " + str(self.last_names[unixname]) + ' 
to ' + str(rev_unixname) + ' ' + else: + print("Source file does not exist, probably deleted or renamed from already?", name_rename_from) + + # Add new page + elif not os.path.isfile(self.path + '/' + fname): # never before seen + commit_msg += "Created " + if self.debug: + print("Adding", fname) + elif rev['comment'] == '': + commit_msg += "Updated " + + self.last_names[unixname] = rev_unixname + + # Ouput contents + outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") + if details['title']: + outp.write('title:' + details['title']+'\n') + if parent_unixname: + outp.write('parent:'+parent_unixname+'\n') + outp.write(source) + outp.close() + + added_file_paths.append(str(fname)) + + commit_msg += rev_unixname + + # Commit + if rev['comment'] != '': + commit_msg += ': ' + rev['comment'] + else: + commit_msg += ' (no message)' + if rev['date']: + parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT + commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time) + else: + commit_date = None + + got_images = False; + + # Add some spacing in the commit message + if len(details['images']) > 0: + commit_msg += '\n' + + for image in details['images']: + if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']): + commit_msg += '\nAdded image: ' + image['src'] + got_images = True + # If we do this gitpython barfs on itself + #added_file_paths.append(image['filepath']) + else: + self.saveFailedImages() + + + if got_images: + added_file_paths.append("images") + print("Committing: " + str(self.rev_no) + '. 
'+commit_msg) + + # Include metadata in the commit (if changed) + self.appendFetchedRevid(rev['rev_id']) + self.saveMetadata() + added_file_paths.append('.metadata.json') + self.index.add(added_file_paths) + + username = str(rev['user']) + email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename + author = Actor(username, email) + + commit = self.index.commit(commit_msg, author=author, author_date=commit_date) + + if self.debug: + print('Committed', commit.name_rev, 'by', author) + + self.fetched_revids.add(rev['rev_id']) + + self.rev_no += 1 + self.saveState() # Update operation state + + return True + + def fetchAll(self): + to_fetch = [] + for rev in tqdm(self.wrevs, desc='Creating list of revisions to fetch'): + if rev['rev_id'] not in self.fetched_revids: + to_fetch.append(rev) + for rev in tqdm(to_fetch, desc='Downloading'): + self.commitNext(rev) + + # + # Updates all children of the page to reflect parent's unixname change. + # + # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body. + # A parent may then be renamed. + # Wikidot logs no additional changes for child pages, yet they stay linked to the parent. + # + # Therefore, on every rename we must update all linked children in the same revision. 
+ # + def updateChildren(self, oldunixname, newunixname): + if self.debug: + print('Updating parents for', oldunixname, newunixname) + + for child in list(self.last_parents.keys()): + if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname: + self.updateParentField(child, self.last_parents[child], newunixname) + + def updateTags(self, comment, unixname): + file_name = self.path+'/'+unixname+'.txt' + removed = [] + removed_match = re.search(pattern = r'Removed tags: ([^.]+,?)\.') + if removed_match is not None: + removed = removed_match.group(1).split(', ') + + tags = [] + + with codecs.open(file_name, "r", "UTF-8") as f: + content = f.readlines() + + tagsline = None + for line in content: + if line.startswith('tags:'): + tagsline = line + break + + # Father forgive me for the indentation depth + idx = -1 + if tagsline is not None: + idx = content.index(tagsline) + for tag in tagsline.split(','): + if not tag in removed: + tags.append(tag) + + + added_match = re.search(pattern = r'Added tags: ([^.]+,?)\.') + if added_match is not None: + tags += added_match.group(1).split(', ') + + tags.sort() + + newtagsline = 'tags:' + ','.join(tags) + '\n' + if idx != -1: + contents[idx] = newtagsline + else: + contents = newtagsline + contents + + with codecs.open(file_name, "w", "UTF-8") as f: + f.writelines(content) + + # + # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. + # The rest of the file is preserved. 
+ # + def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): + child_path = self.path+'/'+child_unixname+'.txt' + if not os.path.isfile(child_path): + print('Failed to find child file!', child_path) + return + with codecs.open(child_path, "r", "UTF-8") as f: + content = f.readlines() + # Since this is all tracked by us, we KNOW there's a line in standard format somewhere + idx = content.index('parent:'+parent_oldunixname+'\n') + if idx < 0: + raise Exception("Cannot update child page "+child_unixname+": " + +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); + content[idx] = 'parent:'+parent_newunixname+'\n' + with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f: + f.writelines(content) + + + # + # Finalizes the construction process and deletes any temporary files. + # + def cleanup(self): + if os.path.exists(self.path+'/.wstate'): + os.remove(self.path+'/.wstate') + else: + print("wstate does not exist?") + + if os.path.exists(self.path+'/.wrevs'): + os.remove(self.path+'/.wrevs') + else: + print("wrevs does not exist?") + + if os.path.exists(self.path+'/.pages'): + os.remove(self.path+'/.pages') + + if self.rev_no > 0: + self.index.add(['.fetched.txt']) + self.index.commit('Updating fetched revisions') diff --git a/wikidot.py b/wikidot.py index f01c59f..be378ea 100644 --- a/wikidot.py +++ b/wikidot.py @@ -1,193 +1,561 @@ -import requests -import random -from bs4 import BeautifulSoup -import time - -# Implements various queries to Wikidot engine through its AJAX facilities - - -class Wikidot: - def __init__(self, site): - self.site = site # Wikidot site to query - self.delay = 200 # Delay between requests in msec - self.debug = False # Print debug messages - self.next_timeslot = time.clock() # Can call immediately - - - # To honor usage rules, we wait for self.delay between requests. 
- # Low-level query functions call this before every request to Wikidot./ - def _wait_request_slot(self): - tm = time.clock() - if self.next_timeslot - tm > 0: - time.sleep(self.next_timeslot - tm) - self.next_timeslot = tm + self.delay / 1000 - pass - - # Makes a Wikidot AJAX query. Returns the response+title or throws an error. - def queryex(self, params): - token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) - cookies = {"wikidot_token7": token} - params['wikidot_token7'] = token - - if self.debug: - print params - print cookies - - self._wait_request_slot() - req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) - json = req.json() - if json['status'] == 'ok': - return json['body'], (json['title'] if 'title' in json else '') - else: - raise req.text - - # Same but only returns the body, most responses don't have titles - def query(self, params): - return self.queryex(params)[0] - - - # List all pages for the site. - - # Raw version - # For the supported formats (module_body) see: - # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php - def list_pages_raw(self, limit): - res = self.query({ - 'moduleName': 'list/ListPagesModule', - 'limit': limit if limit else '10000', - 'perPage': limit if limit else '10000', - 'module_body': '%%page_unix_name%%', - 'separate': 'false', - 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default - }) - return res - - # Client version - def list_pages(self, limit): - raw = self.list_pages_raw(limit).replace('
',"\n") - soup = BeautifulSoup(raw, 'html.parser') - pages = [] - for entry in soup.div.p.text.split('\n'): - pages.append(entry) - return pages - - - # Retrieves internal page_id by page unix_name. - # Page IDs are required for most of page functions. - - def get_page_id(self, page_unix_name): - # The only freaking way to get page ID is to load the page! Wikidot! - self._wait_request_slot() - req = requests.request('GET', self.site+'/'+page_unix_name) - soup = BeautifulSoup(req.text, 'html.parser') - for item in soup.head.find_all('script'): - text = item.text - pos = text.find("WIKIREQUEST.info.pageId = ") - if pos >= 0: - pos += len("WIKIREQUEST.info.pageId = ") - crlf = text.find(";", pos) - if crlf >= 0: - return int(text[pos:crlf]) - else: - return int(text[pos:]) - return None - - - # Retrieves a list of revisions for a page. - # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php - - # Raw version - def get_revisions_raw(self, page_id, limit): - res = self.query({ - 'moduleName': 'history/PageRevisionListModule', - 'page_id': page_id, - 'page': '1', - 'perpage': limit if limit else '10000', - 'options': '{"all":true}' - }) - - soup = BeautifulSoup(res, 'html.parser') - return soup.table.contents - - # Client version - def get_revisions(self, page_id, limit): - revs = [] - for tr in self.get_revisions_raw(page_id, limit): - if tr.name != 'tr': continue # there's a header + various junk - - # RevID is stored as a value of an INPUT field - rev_id = tr.input['value'] if tr.input else None - if rev_id is None: continue # can't parse - - # Unixtime is stored as a CSS class time_* - rev_date = 0 - date_span = tr.find("span", attrs={"class": "odate"}) - if date_span is not None: - for cls in date_span['class']: - if cls.startswith('time_'): - rev_date = int(cls[5:]) - - # Username in a last under - user_span = tr.find("span", attrs={"class": "printuser"}) - for last_a in user_span.find_all('a'): pass - rev_user = 
last_a.getText() if last_a else None - - - # Comment is in the last TD of the row - last_td = None - for last_td in tr.find_all('td'): pass - rev_comment = last_td.getText() if last_td else "" - - revs.append({ - 'id': rev_id, - 'date': rev_date, - 'user': rev_user, - 'comment': rev_comment, - }) - return revs - - - # Retrieves revision source for a revision. - # There's no raw version because there's nothing else in raw. - def get_revision_source(self, rev_id): - res = self.query({ - 'moduleName': 'history/PageSourceModule', - 'revision_id': rev_id, - # We don't need page id - }) - # The source is HTMLified but BeautifulSoup's getText() will decode that - # - htmlentities - # -
s in place of linebreaks - # - random real linebreaks (have to be ignored) - soup = BeautifulSoup(res, 'html.parser') - return soup.div.getText().lstrip(' \r\n') - - # Retrieves the rendered version + additional info unavailable in get_revision_source: - # * Title - # * Unixname at the time - def get_revision_version_raw(self, rev_id): - res = self.queryex({ - 'moduleName': 'history/PageVersionModule', - 'revision_id': rev_id, - }) - return res - - def get_revision_version(self, rev_id): - res = self.get_revision_version_raw(rev_id) # this has title! - soup = BeautifulSoup(res[0], 'html.parser') - - # First table is a flyout with revision details. Remove and study it. - unixname = None - details = soup.find("div", attrs={"id": "page-version-info"}).extract() - for tr in details.find_all('tr'): - tds = tr.find_all('td') - if len(tds) < 2: continue - if tds[0].getText().strip() == 'Page name:': - unixname = tds[1].getText().strip() - - return { - 'rev_id': rev_id, - 'unixname': unixname, - 'title': res[1], - 'content': unicode(soup), # only content remains - } \ No newline at end of file +import requests +import random +from bs4 import BeautifulSoup +import time +from urllib.parse import urlparse, urljoin +from pprint import pprint +import pathlib +import hashlib +import os +import shutil +import imghdr +from timeit import default_timer as timer + +# Implements various queries to Wikidot engine through its AJAX facilities + + +class Wikidot: + def __init__(self, site): + self.site = site # Wikidot site to query + + # strip out trailing /, if it exists + if self.site[-1] == '/': + self.site = self.site[:-1] + self.sitename = urlparse(site).hostname.lower() + self.delay = 1000 # Delay between requests in msec + self.debug = False # Print debug messages + self.next_timeslot = time.process_time() # Can call immediately + self.max_retries = 5 + self.failed_images = set() + + # Downloads file if it doesn't exist + def maybe_download_file(self, url, file_path): + if url in 
self.failed_images: + if self.debug: + print(" ! ", url, "already failed, skipping") + return False + + if os.path.exists(file_path): + if self.debug: + print(" - ", file_path, "exists, skipping") + return False + + #self._wait_request_slot() + + try: + dirpath = os.path.dirname(file_path) + os.makedirs(dirpath, exist_ok=True) + except OSError as e: + if e.errno == 36: + print("Path too long", e) + return False + else: + raise # re-raise previously caught exception + + if self.debug: + print(" < downloading", url, "to" ,file_path, "dirpath", dirpath) + + # In case of e. g. 500 errors + retries = 0 + while retries < self.max_retries: + self._wait_request_slot() + + headers = requests.utils.default_headers() + # Pretty generic user-agent, but we append a unique none for us + # Makes wikimedia happy + headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"}) + start = timer() + + try: + req = requests.get(url, stream=True, timeout=30) + except requests.exceptions.RequestException: + print('request exception') + + retries += 1 + time.sleep(retries * retries * retries) # up to ~2 minutes + continue + except urllib3.exceptions.ReadTimeoutError: + print('read timeout') + + retries += 1 + time.sleep(retries * retries * retries) # up to ~2 minutes + continue + + if req.status_code >= 500: + print(' ! 
500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + # In case of debug enabled, we already printed this above + if not self.debug: + print(' - ', req) + + retries += 1 + time.sleep(retries * retries * retries) + continue + + if req.status_code >= 400: + self.failed_images.add(url) + return False + + try: + # In case of 404 errors or other stuff that indicates + # some bug in how we handle or request things + req.raise_for_status() + + req.raw.decode_content = True + with open(file_path, 'wb') as out_file: + shutil.copyfileobj(req.raw, out_file) + + if imghdr.what(file_path) is None: + print('Downloaded invalid image', url) + os.remove(file_path) + self.failed_images.add(url) + return False + + + if self.debug: + print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2)) + + return True + except OSError as e: + if e.errno == 36: + print("Filename to long", e) + return False + else: + raise # re-raise previously caught exception + except Exception as e: + print(' ! Failed to download', e, req, url) + raise e + + print('Failed too many times for', url) + return False + + # To honor usage rules, we wait for self.delay between requests. + # Low-level query functions call this before every request to Wikidot./ + def _wait_request_slot(self): + tm = time.process_time() + if self.next_timeslot - tm > 0: + time.sleep(self.next_timeslot - tm) + self.next_timeslot = tm + self.delay / 1000 + + pass + + # Makes a Wikidot AJAX query. Returns the response+title or throws an error. + def queryex(self, params, urlAppend = None): + token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) + cookies = {"wikidot_token7": token} + params['wikidot_token7'] = token + + if self.debug: + print(' - ', params) + print(' - ', cookies) + + url = self.site+'/ajax-module-connector.php' + if urlAppend is not None: + url += urlAppend + + # In case of e. g. 
500 errors + retries = 0 + while retries < self.max_retries: + if retries > 0: + print(" ! retry", retries, "of", self.max_retries) + + self._wait_request_slot() + + start = timer() + try: + req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) + except requests.exceptions.RequestException: + print('request timed out!') + retries += 1 + time.sleep(retries * retries * retries) + continue + + if self.debug: + print(' * ajax request completed in', round(timer() - start, 2)) + + # Usually a 502 error, recovers immediately + if req.status_code >= 500: + retries += 1 + print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + + # In case of debug enabled, we already printed this above + if not self.debug: + print(req, params) + + # Be nice, double wait delay for errors + self._wait_request_slot() + + # Extra nice, sleep longer (expoential increase), hope for the + # server to recover + time.sleep(retries * retries * retries) + + continue + + try: + # In case of 404 errors or other stuff that indicates + # some bug in how we handle or request things + req.raise_for_status() + except Exception as e: + print(' ! Failed to get response from wikidot', e, req, url, params) + + try: + json = req.json() + except Exception as e: + print(' ! Failed to get response from wikidot', e, req, url, params) + if retries < self.max_retries: + retries += 1 + #self._wait_request_slot() + time.sleep(retries * retries * retries) + continue + + raise e + + if json['status'] == 'ok': + return json['body'], (json['title'] if 'title' in json else '') + else: + print(" ! error in response", json) + + retries += 1 + time.sleep(retries * retries * retries) + continue + + print(' ! 
Failed too many times', url, params, cookies) + raise Exception('Failed too many times for ' + url) + + # Same but only returns the body, most responses don't have titles + def query(self, params, urlAppend = None): + return self.queryex(params, urlAppend)[0] + + # List all pages for the site. + + # Raw version + # For the supported formats (module_body) see: + # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php + def list_pages_raw(self, limit, offset): + res = self.query({ + 'moduleName': 'list/ListPagesModule', + 'limit': limit if limit else '10000', + 'perPage': limit if limit else '10000', + 'module_body': '%%page_unix_name%%', + 'separate': 'false', + 'p': str(offset), + 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default + }, '/p/' + str(offset)) + return res + + # Client version + def list_pages(self, limit): + offset = 1 + pages = [] + + while True: + raw = self.list_pages_raw(limit, offset).replace('
',"\n") + soup = BeautifulSoup(raw, 'html.parser') + + + for entry in soup.div.p.text.split('\n'): + pages.append(entry) + + if self.debug: + print(' - Pages found:', len(pages)) + + targets = soup.find_all('span','target') + if len(targets) < 2: + print(" ! Unable to find next listing page, not enough target spans") + break + + next_url = targets[-1].a.get('href').split('/') + if len(next_url) > 0 and next_url[-1].isnumeric(): + next_page = int(next_url[-1]) + + if self.debug: + print(' - Next listing page', next_page) + + else: + print(" ! invalid next url", next_url) + break + + #next_page = int(targets[0].a.text) + + current_spans = soup.find_all('span','current') + if len(current_spans) > 0: + current_page = int(current_spans[0].text) + + if self.debug: + print(' - Current listing page', current_page) + + else: + print(" ! unable to find current page") + break; + + if next_page != offset + 1: + if self.debug: + print(' ! Next page is wrong', next_page, 'hopefully at the end') + break + + offset += 1 + + print(" - Fetching listing page", offset) + + return pages + + + # Retrieves internal page_id by page unix_name. + # Page IDs are required for most of page functions. + + def get_page_id(self, page_unix_name): + # The only freaking way to get page ID is to load the page! Wikidot! + self._wait_request_slot() + url = self.site+'/'+page_unix_name + '/noredirect/true'; + + if self.debug: + print(" > fetching", url) + + start = timer() + retries = 0 + req = None + while retries < self.max_retries: + try: + req = requests.request('GET', url, timeout=30) + except requests.exceptions.RequestException: + print('request timed out!') + retries += 1 + time.sleep(retries * retries * retries) + continue + + if req.status_code >= 500: + print(' ! 
500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + retries += 1 + time.sleep(retries * retries * retries) + continue + + req.raise_for_status() + break + + if self.debug: + print(' * page id request completed in', round(timer() - start, 2)) + + soup = BeautifulSoup(req.text, 'html.parser') + for item in soup.head.find_all('script'): + text = item.string + if text is None: + #print("No text in script item", item) + continue + + pos = text.find("WIKIREQUEST.info.pageId = ") + if pos >= 0: + pos += len("WIKIREQUEST.info.pageId = ") + crlf = text.find(";", pos) + if crlf >= 0: + return int(text[pos:crlf]) + else: + return int(text[pos:]) + + raise Exception('Failed to get page_id for ' + page_unix_name) + + + # Retrieves a list of revisions for a page. + # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php + + # Raw version + def get_revisions_raw(self, page_id, limit): + res = self.query({ + 'moduleName': 'history/PageRevisionListModule', + 'page_id': page_id, + 'page': '1', + 'perpage': limit if limit else '10000', + 'options': '{"all":true}' + }) + + soup = BeautifulSoup(res, 'html.parser') + return soup.table.contents + + # Client version + def get_revisions(self, page_id, limit): + revs = [] + raw = self.get_revisions_raw(page_id, limit) + for tr in raw: + if tr.name != 'tr': continue # there's a header + various junk + + # RevID is stored as a value of an INPUT field + rev_id = tr.input['value'] if tr.input else None + if rev_id is None: continue # can't parse + attachment_action = tr.find("span", attrs={"title": "file/attachment action"}) + attached_file = False + if attachment_action is not None: + attached_file = True + print(" - was attchment", rev_id) + + # Unixtime is stored as a CSS class time_* + rev_date = 0 + date_span = tr.find("span", attrs={"class": "odate"}) + if date_span is not None: + for cls in date_span['class']: + if cls.startswith('time_'): + rev_date = 
int(cls[5:]) + else: + print(" ! no odate found") + + # Username in a last
under + user_span = tr.find("span", attrs={"class": "printuser"}) + last_a = None + for last_a in user_span.find_all('a'): pass + rev_user = last_a.getText() if last_a else None + + + # Comment is in the last TD of the row + last_td = None + for last_td in tr.find_all('td'): pass + rev_comment = last_td.getText() if last_td else "" + + revs.append({ + 'id': rev_id, + 'date': rev_date, + 'user': rev_user, + 'comment': rev_comment, + 'attached_file': attached_file, + }) + return revs + + # topics in forum: http://www.scp-wiki.net/forum/c-###/sort/start + # -> div class 'title' + # -> a href= http://www.scp-wiki.net/forum/t-####/foobar (foobar not important) + + # posts in topic http://www.scp-wiki.net/forum/t-####/ + # -> div id 'thread-container' + # -> div class 'post-container' + # -> div class = 'post', id = 'post-####' + # -> div class 'title' + # -> div class 'content' + # -> div class 'post-container' + # -> ... + # -> div class 'post-container' + # -> ... + + #def get_forum_post_revisions(self, post_id): + # res = self.query({ + # 'moduleName': 'forum/sub/ForumPostRevisionsModule', + # 'postId': post_id, + # }) + # revisions = [] + # soup = BeautifulSoup(res, 'html.parser') + # for row in soup.find_all("tr"): + # columns = row.find_all("td") + + # if len(columns) != 3: + # raise Exception('Invalid row in post history for ' + str(post_id)) + + # user = columns[0].find('a').getText() + # time = columns[1].find('span').getText() + # rev_id_js = columns[0].find('a')['href'] + # match = re.search(r'showRevision\(event, ([0-9]+)\)', rev_id_js) + # rev_id = match.group(1) + + # revisions.append({ + # 'id': rev_id, + # 'user': user, + # 'time': time, + # }) + + # Retrieves revision source for a revision. + # There's no raw version because there's nothing else in raw. 
+ def get_revision_source(self, rev_id): + res = self.query({ + 'moduleName': 'history/PageSourceModule', + 'revision_id': rev_id, + # We don't need page id + }) + # The source is HTMLified but BeautifulSoup's getText() will decode that + # - htmlentities + # -
s in place of linebreaks + # - random real linebreaks (have to be ignored) + soup = BeautifulSoup(res, 'html.parser') + return soup.div.getText().lstrip(' \r\n') + + # Retrieves the rendered version + additional info unavailable in get_revision_source: + # * Title + # * Unixname at the time + # + # TODO: I think this could fetch the source as well, so we don't need to + # fetch two pages (the fetch source function above). + def get_revision_version_raw(self, rev_id): + res = self.queryex({ + 'moduleName': 'history/PageVersionModule', + 'revision_id': rev_id, + }) + return res + + def get_revision_version(self, rev_id): + res = self.get_revision_version_raw(rev_id) # this has title! + soup = BeautifulSoup(res[0], 'html.parser') + + # Extract list of images + + # TODO: to get the right revision that added them, we need to go back + # and amend the commits that are flagged as attached_file above, + # because we can't get the image file name or URL reliably until they + # are added to the page source, wikidot itself doesn't store this information. + # So much hassle for little value, we get the empty commits when images + # are added anyways. + images = [] + for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}): + img_src = None + img_name = "" + full_link = img_div.find("a") + if full_link is not None: + # Check if it has a thumbnail, otherwise we can't trust that it is the original + img = full_link.find("img", attrs={"class": "enlarge"}) + if img is not None: + img_src = full_link["href"] + img_name = img["alt"] + + if img_src is None: + img = img_div.find("img") + if img is not None: + img_src = img["src"] + img_name = img["alt"] + + if img_src is None: + continue + + # Just in case, I don't think it ever happens, but resolve '..' 
+ # juuuust in case someone tries to be funny + img_url = urlparse(urljoin(img_src, ".")) + url_path = pathlib.Path(img_url.path) + + img_path = "" + if img_url.netloc != "": + img_path = img_url.netloc + "/" + if img_url.netloc[-1] != '/': + img_path += '/' + + if img_url.path != "" and img_url.path[0] == '/': + img_path += img_url.path[1:] + else: + img_path += img_url.path + + if img_path == "" or img_path[-1] == "/": + img_path += img_name + + images.append({"src": img_src, "filename": img_name, "filepath": "images/" + img_path}) + + + + # First table is a flyout with revision details. Remove and study it. + unixname = None + details = soup.find("div", attrs={"id": "page-version-info"}).extract() + for tr in details.find_all('tr'): + tds = tr.find_all('td') + if len(tds) < 2: continue + if tds[0].getText().strip() == 'Page name:': + unixname = tds[1].getText().strip() + + if unixname is None: + raise Exception('Failed to find unixname for ' + rev_id) + + return { + 'rev_id': rev_id, + 'unixname': unixname, + 'title': res[1], + 'content': str(soup), # only content remains + 'images': images, + }