From 66461594d1443910294827fb19def13ce51db957 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 13:38:21 +0200 Subject: [PATCH 01/93] python3 does not support string exceptions --- crawl.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crawl.py b/crawl.py index 22da38a..ab47baf 100644 --- a/crawl.py +++ b/crawl.py @@ -47,7 +47,7 @@ def force_dirs(path): os.makedirs(path) except OSError as exception: if exception.errno != os.errno.EEXIST: - raise + raise exception if args.list_pages_raw: print wd.list_pages_raw(args.depth) @@ -58,44 +58,44 @@ def force_dirs(path): elif args.source: if not args.page: - raise "Please specify --page for --source." + raise Exception("Please specify --page for --source." page_id = wd.get_page_id(args.page) if not page_id: - raise "Page not found: "+args.page + raise Exception("Page not found: "+args.page) revs = wd.get_revisions(page_id, 1) # last revision print wd.get_revision_source(revs[0]['id']) elif args.content: if not args.page: - raise "Please specify --page for --source." + raise Exception("Please specify --page for --source.") page_id = wd.get_page_id(args.page) if not page_id: - raise "Page not found: "+args.page + raise Exception("Page not found: "+args.page) revs = wd.get_revisions(page_id, 1) # last revision print wd.get_revision_version(revs[0]['id']) elif args.log_raw: if not args.page: - raise "Please specify --page for --log." + raise Exception("Please specify --page for --log.") page_id = wd.get_page_id(args.page) if not page_id: - raise "Page not found: "+args.page + raise Exception("Page not found: "+args.page) print wd.get_revisions_raw(page_id, args.depth) elif args.log: if not args.page: - raise "Please specify --page for --log." + raise Exception("Please specify --page for --log.") page_id = wd.get_page_id(args.page) if not page_id: - raise "Page not found: "+args.page + raise Exception("Page not found: "+args.page) for rev in wd.get_revisions(page_id, args.depth): print unicode(rev) From a9360af935f1c737af5c8fa403646ebf272cc24e Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 13:39:55 +0200 Subject: [PATCH 02/93] dos2unix, missing ) --- crawl.py | 236 +++++++++++++++++++++++++++---------------------------- 1 file changed, 118 insertions(+), 118 deletions(-) diff --git a/crawl.py b/crawl.py index ab47baf..566b70c 100644 --- a/crawl.py +++ b/crawl.py @@ -1,118 +1,118 @@ -import argparse -import sys -import locale -import codecs -import os -from wikidot import Wikidot -from rmaint import RepoMaintainer - -# TODO: Files. -# TODO: Forum and comment pages. -# TODO: Ability to download new transactions since last dump. -# We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump) - -rawStdout = sys.stdout -rawStderr = sys.stderr -sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace') -sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace') - -parser = argparse.ArgumentParser(description='Queries Wikidot') -parser.add_argument('site', help='URL of Wikidot site') -# Actions -parser.add_argument('--list-pages', action='store_true', help='List all pages on this site') -parser.add_argument('--source', action='store_true', help='Print page source (requires --page)') -parser.add_argument('--content', action='store_true', help='Print page content (requires --page)') -parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)') -parser.add_argument('--dump', type=str, help='Download page revisions to this directory') -# Debug actions -parser.add_argument('--list-pages-raw', action='store_true') -parser.add_argument('--log-raw', action='store_true') -# Action settings -parser.add_argument('--page', type=str, help='Query only this page') -parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions') -parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository') -# Common settings -parser.add_argument('--debug', action='store_true', help='Print debug info') -parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') -args = parser.parse_args() - - -wd = Wikidot(args.site) -wd.debug = args.debug -wd.delay = args.delay - - -def force_dirs(path): - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != os.errno.EEXIST: - raise exception - -if args.list_pages_raw: - print wd.list_pages_raw(args.depth) - -elif args.list_pages: - for page in wd.list_pages(args.depth): - print page - -elif args.source: - if not args.page: - raise Exception("Please specify --page for --source." - - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - - revs = wd.get_revisions(page_id, 1) # last revision - print wd.get_revision_source(revs[0]['id']) - -elif args.content: - if not args.page: - raise Exception("Please specify --page for --source.") - - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - - revs = wd.get_revisions(page_id, 1) # last revision - print wd.get_revision_version(revs[0]['id']) - -elif args.log_raw: - if not args.page: - raise Exception("Please specify --page for --log.") - - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - - print wd.get_revisions_raw(page_id, args.depth) - - -elif args.log: - if not args.page: - raise Exception("Please specify --page for --log.") - - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - for rev in wd.get_revisions(page_id, args.depth): - print unicode(rev) - - -elif args.dump: - print "Downloading pages to "+args.dump - force_dirs(args.dump) - - rm = RepoMaintainer(wd, args.dump) - rm.debug = args.debug - rm.storeRevIds = args.revids - rm.buildRevisionList([args.page] if args.page else None, args.depth) - rm.openRepo() - - print "Downloading revisions..." - while rm.commitNext(): - pass - - rm.cleanup() - print "Done." +import argparse +import sys +import locale +import codecs +import os +from wikidot import Wikidot +from rmaint import RepoMaintainer + +# TODO: Files. +# TODO: Forum and comment pages. +# TODO: Ability to download new transactions since last dump. +# We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump) + +rawStdout = sys.stdout +rawStderr = sys.stderr +sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace') +sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace') + +parser = argparse.ArgumentParser(description='Queries Wikidot') +parser.add_argument('site', help='URL of Wikidot site') +# Actions +parser.add_argument('--list-pages', action='store_true', help='List all pages on this site') +parser.add_argument('--source', action='store_true', help='Print page source (requires --page)') +parser.add_argument('--content', action='store_true', help='Print page content (requires --page)') +parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)') +parser.add_argument('--dump', type=str, help='Download page revisions to this directory') +# Debug actions +parser.add_argument('--list-pages-raw', action='store_true') +parser.add_argument('--log-raw', action='store_true') +# Action settings +parser.add_argument('--page', type=str, help='Query only this page') +parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions') +parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository') +# Common settings +parser.add_argument('--debug', action='store_true', help='Print debug info') +parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') +args = parser.parse_args() + + +wd = Wikidot(args.site) +wd.debug = args.debug +wd.delay = args.delay + + +def force_dirs(path): + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != os.errno.EEXIST: + raise exception + +if args.list_pages_raw: + print wd.list_pages_raw(args.depth) + +elif args.list_pages: + for page in wd.list_pages(args.depth): + print page + +elif args.source: + if not args.page: + raise Exception("Please specify --page for --source.") + + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + revs = wd.get_revisions(page_id, 1) # last revision + print wd.get_revision_source(revs[0]['id']) + +elif args.content: + if not args.page: + raise Exception("Please specify --page for --source.") + + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + revs = wd.get_revisions(page_id, 1) # last revision + print wd.get_revision_version(revs[0]['id']) + +elif args.log_raw: + if not args.page: + raise Exception("Please specify --page for --log.") + + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + print wd.get_revisions_raw(page_id, args.depth) + + +elif args.log: + if not args.page: + raise Exception("Please specify --page for --log.") + + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + for rev in wd.get_revisions(page_id, args.depth): + print unicode(rev) + + +elif args.dump: + print "Downloading pages to "+args.dump + force_dirs(args.dump) + + rm = RepoMaintainer(wd, args.dump) + rm.debug = args.debug + rm.storeRevIds = args.revids + rm.buildRevisionList([args.page] if args.page else None, args.depth) + rm.openRepo() + + print "Downloading revisions..." + while rm.commitNext(): + pass + + rm.cleanup() + print "Done." From fcdc5bdfdf9f237b90ad61c42f24cc1ef5c8400d Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 13:42:23 +0200 Subject: [PATCH 03/93] run 2to3 on crawl.py --- crawl.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crawl.py b/crawl.py index 566b70c..08bbbf3 100644 --- a/crawl.py +++ b/crawl.py @@ -50,11 +50,11 @@ def force_dirs(path): raise exception if args.list_pages_raw: - print wd.list_pages_raw(args.depth) + print(wd.list_pages_raw(args.depth)) elif args.list_pages: for page in wd.list_pages(args.depth): - print page + print(page) elif args.source: if not args.page: @@ -65,7 +65,7 @@ def force_dirs(path): raise Exception("Page not found: "+args.page) revs = wd.get_revisions(page_id, 1) # last revision - print wd.get_revision_source(revs[0]['id']) + print(wd.get_revision_source(revs[0]['id'])) elif args.content: if not args.page: @@ -76,7 +76,7 @@ def force_dirs(path): raise Exception("Page not found: "+args.page) revs = wd.get_revisions(page_id, 1) # last revision - print wd.get_revision_version(revs[0]['id']) + print(wd.get_revision_version(revs[0]['id'])) elif args.log_raw: if not args.page: @@ -86,7 +86,7 @@ def force_dirs(path): if not page_id: raise Exception("Page not found: "+args.page) - print wd.get_revisions_raw(page_id, args.depth) + print(wd.get_revisions_raw(page_id, args.depth)) elif args.log: @@ -97,11 +97,11 @@ def force_dirs(path): if not page_id: raise Exception("Page not found: "+args.page) for rev in wd.get_revisions(page_id, args.depth): - print unicode(rev) + print(str(rev)) elif args.dump: - print "Downloading pages to "+args.dump + print("Downloading pages to "+args.dump) force_dirs(args.dump) rm = RepoMaintainer(wd, args.dump) @@ -110,9 +110,9 @@ def force_dirs(path): rm.buildRevisionList([args.page] if args.page else None, args.depth) rm.openRepo() - print "Downloading revisions..." + print("Downloading revisions...") while rm.commitNext(): pass rm.cleanup() - print "Done." + print("Done.") From 8dcab9e35d1a478f5d0ebe78219187ccad5f979c Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 13:42:33 +0200 Subject: [PATCH 04/93] dos2unix on the rest --- .hgignore | 4 +- hgpatch.py | 100 +++++----- readme.md | 60 +++--- rmaint.py | 524 ++++++++++++++++++++++++++--------------------------- wikidot.py | 384 +++++++++++++++++++-------------------- 5 files changed, 536 insertions(+), 536 deletions(-) diff --git a/.hgignore b/.hgignore index 471301b..a26d142 100644 --- a/.hgignore +++ b/.hgignore @@ -1,2 +1,2 @@ -syntax:glob -*.pyc +syntax:glob +*.pyc diff --git a/hgpatch.py b/hgpatch.py index 6d2ff12..2d77769 100644 --- a/hgpatch.py +++ b/hgpatch.py @@ -1,50 +1,50 @@ -from mercurial import scmutil, osutil -from types import MethodType -from mercurial import encoding -import codecs - -# Patches commit-message unicode handling on Python 2.x - -# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert -# all input from "input encoding" (set in mercurial/encoding.py) - -# Problem 1: -# If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8, -# it'll still try to decode it to ASCII. -# Solution: -# Patch this decoding function to pass unicode unchanged. - -old_fromlocal = None - -def better_fromlocal(s): - if isinstance(s, unicode): - return s.encode('utf-8') - global old_fromlocal - return old_fromlocal(s) - -old_fromlocal = encoding.fromlocal -encoding.fromlocal = better_fromlocal - - -# Problem 2: -# Separate from actual log, Mercurial stores commit message in commit-message.txt. -# Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails. -# Solution: -# Patch virtual-fs open() function to use codecs.open wrapper in this particular case. - -old_vfs_call = None - -def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False): - fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose) - if path.endswith('last-message.txt'): - # Create a wrapper like codecs.open does: - info = codecs.lookup("utf-8") - fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict') - fp.encoding = 'utf-8' - return fp - -old_vfs_call = scmutil.vfs.__call__ -scmutil.vfs.__call__ = better_vfs_call - - - +from mercurial import scmutil, osutil +from types import MethodType +from mercurial import encoding +import codecs + +# Patches commit-message unicode handling on Python 2.x + +# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert +# all input from "input encoding" (set in mercurial/encoding.py) + +# Problem 1: +# If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8, +# it'll still try to decode it to ASCII. +# Solution: +# Patch this decoding function to pass unicode unchanged. + +old_fromlocal = None + +def better_fromlocal(s): + if isinstance(s, unicode): + return s.encode('utf-8') + global old_fromlocal + return old_fromlocal(s) + +old_fromlocal = encoding.fromlocal +encoding.fromlocal = better_fromlocal + + +# Problem 2: +# Separate from actual log, Mercurial stores commit message in commit-message.txt. +# Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails. +# Solution: +# Patch virtual-fs open() function to use codecs.open wrapper in this particular case. + +old_vfs_call = None + +def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False): + fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose) + if path.endswith('last-message.txt'): + # Create a wrapper like codecs.open does: + info = codecs.lookup("utf-8") + fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict') + fp.encoding = 'utf-8' + return fp + +old_vfs_call = scmutil.vfs.__call__ +scmutil.vfs.__call__ = better_vfs_call + + + diff --git a/readme.md b/readme.md index f66a0cc..2458933 100644 --- a/readme.md +++ b/readme.md @@ -1,30 +1,30 @@ -This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you: - -* List all pages on a site -* See all revisions of a page -* Query page source - -Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments! - -##### Examples: - - crawl.py http://example.wikidot.com --dump ExampleRepo - crawl.py http://example.wikidot.com --log --page example-page - -It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers. - -Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed. - -##### Useful links: - -Wikidot code (very old) which simplifies things a bit: - -* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php - -The descriptions for on-site modules are heavily correlated with AJAX ones: - -* http://www.wikidot.com/doc-modules:listpages-module - -Someone else did Wikidot AJAX: - -* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py +This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you: + +* List all pages on a site +* See all revisions of a page +* Query page source + +Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments! + +##### Examples: + + crawl.py http://example.wikidot.com --dump ExampleRepo + crawl.py http://example.wikidot.com --log --page example-page + +It uses internal Wikidot AJAX requests to do it's job. If you're from Wikidot, please don't break it. Thank you! We'll try to be nice and not put a load on your servers. + +Downloading of large sites might take a while. If anything breaks, just restart the same command, it'll continue from where it crashed. + +##### Useful links: + +Wikidot code (very old) which simplifies things a bit: + +* https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php + +The descriptions for on-site modules are heavily correlated with AJAX ones: + +* http://www.wikidot.com/doc-modules:listpages-module + +Someone else did Wikidot AJAX: + +* https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py diff --git a/rmaint.py b/rmaint.py index 029319f..fe21027 100644 --- a/rmaint.py +++ b/rmaint.py @@ -1,263 +1,263 @@ -import os -import codecs -from mercurial import commands, ui, hg -import hgpatch -import cPickle as pickle -import wikidot - -# Repository builder and maintainer -# Contains logic for actual loading and maintaining the repository over the course of its construction. - -# Usage: -# rm = RepoMaintainer(wikidot, path) -# rm.buildRevisionList(pages, depth) -# rm.openRepo() -# while rm.commitNext(): -# pass -# rm.cleanup() - -# Talkative. - -class RepoMaintainer: - def __init__(self, wikidot, path): - # Settings - self.wd = wikidot # Wikidot instance - self.path = path # Path to repository - self.debug = False # = True to enable more printing - self.storeRevIds = True # = True to store .revid with each commit - - # Internal state - self.wrevs = None # Compiled wikidot revision list (history) - - self.rev_no = 0 # Next revision to process - self.last_names = {} # Tracks page renames: name atm -> last name in repo - self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - self.ui = None # Mercurial UI object - self.repo = None # Mercurial repo object - - - # - # Saves and loads revision list from file - # - def saveWRevs(self): - fp = open(self.path+'\\.wrevs', 'wb') - pickle.dump(self.wrevs, fp) - fp.close() - - def loadWRevs(self): - fp = open(self.path+'\\.wrevs', 'rb') - self.wrevs = pickle.load(fp) - fp.close() - - # - # Compiles a combined revision list for a given set of pages, or all pages on the site. - # pages: compile history for these pages - # depth: download at most this number of revisions. - # - # If there exists a cached revision list at the repository destination, - # it is loaded and no requests are made. - # - def buildRevisionList(self, pages = None, depth = 10000): - if os.path.isfile(self.path+'\\.wrevs'): - print "Loading cached revision list..." - self.loadWRevs() - else: - print "Building revision list..." - if not pages: - pages = self.wd.list_pages(10000) - self.wrevs = [] - for page in pages: - print "Querying page: "+page - page_id = self.wd.get_page_id(page) - print "ID: "+str(page_id) - revs = self.wd.get_revisions(page_id, depth) - print "Revisions: "+str(len(revs)) - for rev in revs: - self.wrevs.append({ - 'page_id' : page_id, - 'page_name' : page, # name atm, not at revision time - 'rev_id' : rev['id'], - 'date' : rev['date'], - 'user' : rev['user'], - 'comment' : rev['comment'], - }) - self.saveWRevs() # Save a cached copy - print "" - - - print "Total revisions: "+str(len(self.wrevs)) - - print "Sorting revisions..." - self.wrevs.sort(key=lambda rev: rev['date']) - print "" - - if self.debug: - print "Revision list: " - for rev in self.wrevs: - print str(rev)+"\n" - print "" - - - # - # Saves and loads operational state from file - # - def saveState(self): - fp = open(self.path+'\\.wstate', 'wb') - pickle.dump(self.rev_no, fp) - pickle.dump(self.last_names, fp) - pickle.dump(self.last_parents, fp) - fp.close() - - def loadState(self): - fp = open(self.path+'\\.wstate', 'rb') - self.rev_no = pickle.load(fp) - self.last_names = pickle.load(fp) - try: - self.last_parents = pickle.load(fp) - except EOFError: - pass - fp.close() - - - # - # Initializes the construction process, after the revision list has been compiled. - # Either creates a new repo, or loads the existing one at the target path - # and restores its construction state. - # - def openRepo(self): - # Create a new repository or continue from aborted dump - self.ui=ui.ui() - self.last_names = {} # Tracks page renames: name atm -> last name in repo - self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - if os.path.isfile(self.path+'\\.wstate'): - print "Continuing from aborted dump state..." - self.loadState() - self.repo = hg.repository(self.ui, self.path) - - else: # create a new repository (will fail if one exists) - print "Initializing repository..." - commands.init(self.ui, self.path) - self.repo = hg.repository(self.ui, self.path) - self.rev_no = 0 - - if self.storeRevIds: - # Add revision id file to the new repo - fname = self.path+'\\.revid' - codecs.open(fname, "w", "UTF-8").close() - commands.add(self.ui, self.repo, str(fname)) - - - # - # Takes an unprocessed revision from a revision log, fetches its data and commits it. - # Returns false if no unprocessed revisions remain. - # - def commitNext(self): - if self.rev_no >= len(self.wrevs): - return False - - rev = self.wrevs[self.rev_no] - source = self.wd.get_revision_source(rev['rev_id']) - # Page title and unix_name changes are only available through another request: - details = self.wd.get_revision_version(rev['rev_id']) - - # Store revision_id for last commit - # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial - if self.storeRevIds: - fname = self.path+'\\.revid' - outp = codecs.open(fname, "w", "UTF-8") - outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway - outp.close() - - unixname = rev['page_name'] - rev_unixname = details['unixname'] # may be different in revision than atm - - # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. - # The only way to know they were changed is revision comments, though evil people may trick us. - if rev['comment'].startswith('Parent page set to: "'): - # This is a parenting revision, remember the new parent - parent_unixname = rev['comment'][21:-2] - self.last_parents[unixname] = parent_unixname - else: - # Else use last parent_unixname we've recorded - parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None - # There are also problems when parent page gets renamed -- see updateChildren - - # If the page is tracked and its name just changed, tell HG - rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname) - if rename: - self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there - commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') - - # Ouput contents - fname = self.path+'\\'+rev_unixname+'.txt' - outp = codecs.open(fname, "w", "UTF-8") - if details['title']: - outp.write('title:'+details['title']+'\n') - if parent_unixname: - outp.write('parent:'+parent_unixname+'\n') - outp.write(source) - outp.close() - - # Add new page - if not unixname in self.last_names: # never before seen - commands.add(self.ui, self.repo, str(fname)) - - self.last_names[unixname] = rev_unixname - - # Commit - if rev['comment'] <> '': - commit_msg = rev_unixname + ': ' + rev['comment'] - else: - commit_msg = rev_unixname - if rev['date']: - commit_date = str(rev['date']) + ' 0' - else: - commit_date = None - print "Commiting: "+str(self.rev_no)+'. '+commit_msg - - commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) - self.rev_no += 1 - - self.saveState() # Update operation state - return True - - - # - # Updates all children of the page to reflect parent's unixname change. - # - # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body. - # A parent may then be renamed. - # Wikidot logs no additional changes for child pages, yet they stay linked to the parent. - # - # Therefore, on every rename we must update all linked children in the same revision. - # - def updateChildren(self, oldunixname, newunixname): - for child in self.last_parents.keys(): - if self.last_parents[child] == oldunixname: - self.updateParentField(child, self.last_parents[child], newunixname) - - # - # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. - # The rest of the file is preserved. - # - def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f: - content = f.readlines() - # Since this is all tracked by us, we KNOW there's a line in standard format somewhere - idx = content.index('parent:'+parent_oldunixname+'\n') - if idx < 0: - raise Exception("Cannot update child page "+child_unixname+": " - +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); - content[idx] = 'parent:'+parent_newunixname+'\n' - with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f: - f.writelines(content) - - - # - # Finalizes the construction process and deletes any temporary files. - # - def cleanup(self): - os.remove(self.path+'\\.wstate') +import os +import codecs +from mercurial import commands, ui, hg +import hgpatch +import cPickle as pickle +import wikidot + +# Repository builder and maintainer +# Contains logic for actual loading and maintaining the repository over the course of its construction. + +# Usage: +# rm = RepoMaintainer(wikidot, path) +# rm.buildRevisionList(pages, depth) +# rm.openRepo() +# while rm.commitNext(): +# pass +# rm.cleanup() + +# Talkative. + +class RepoMaintainer: + def __init__(self, wikidot, path): + # Settings + self.wd = wikidot # Wikidot instance + self.path = path # Path to repository + self.debug = False # = True to enable more printing + self.storeRevIds = True # = True to store .revid with each commit + + # Internal state + self.wrevs = None # Compiled wikidot revision list (history) + + self.rev_no = 0 # Next revision to process + self.last_names = {} # Tracks page renames: name atm -> last name in repo + self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo + + self.ui = None # Mercurial UI object + self.repo = None # Mercurial repo object + + + # + # Saves and loads revision list from file + # + def saveWRevs(self): + fp = open(self.path+'\\.wrevs', 'wb') + pickle.dump(self.wrevs, fp) + fp.close() + + def loadWRevs(self): + fp = open(self.path+'\\.wrevs', 'rb') + self.wrevs = pickle.load(fp) + fp.close() + + # + # Compiles a combined revision list for a given set of pages, or all pages on the site. + # pages: compile history for these pages + # depth: download at most this number of revisions. + # + # If there exists a cached revision list at the repository destination, + # it is loaded and no requests are made. + # + def buildRevisionList(self, pages = None, depth = 10000): + if os.path.isfile(self.path+'\\.wrevs'): + print "Loading cached revision list..." + self.loadWRevs() + else: + print "Building revision list..." + if not pages: + pages = self.wd.list_pages(10000) + self.wrevs = [] + for page in pages: + print "Querying page: "+page + page_id = self.wd.get_page_id(page) + print "ID: "+str(page_id) + revs = self.wd.get_revisions(page_id, depth) + print "Revisions: "+str(len(revs)) + for rev in revs: + self.wrevs.append({ + 'page_id' : page_id, + 'page_name' : page, # name atm, not at revision time + 'rev_id' : rev['id'], + 'date' : rev['date'], + 'user' : rev['user'], + 'comment' : rev['comment'], + }) + self.saveWRevs() # Save a cached copy + print "" + + + print "Total revisions: "+str(len(self.wrevs)) + + print "Sorting revisions..." + self.wrevs.sort(key=lambda rev: rev['date']) + print "" + + if self.debug: + print "Revision list: " + for rev in self.wrevs: + print str(rev)+"\n" + print "" + + + # + # Saves and loads operational state from file + # + def saveState(self): + fp = open(self.path+'\\.wstate', 'wb') + pickle.dump(self.rev_no, fp) + pickle.dump(self.last_names, fp) + pickle.dump(self.last_parents, fp) + fp.close() + + def loadState(self): + fp = open(self.path+'\\.wstate', 'rb') + self.rev_no = pickle.load(fp) + self.last_names = pickle.load(fp) + try: + self.last_parents = pickle.load(fp) + except EOFError: + pass + fp.close() + + + # + # Initializes the construction process, after the revision list has been compiled. + # Either creates a new repo, or loads the existing one at the target path + # and restores its construction state. + # + def openRepo(self): + # Create a new repository or continue from aborted dump + self.ui=ui.ui() + self.last_names = {} # Tracks page renames: name atm -> last name in repo + self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo + + if os.path.isfile(self.path+'\\.wstate'): + print "Continuing from aborted dump state..." + self.loadState() + self.repo = hg.repository(self.ui, self.path) + + else: # create a new repository (will fail if one exists) + print "Initializing repository..." + commands.init(self.ui, self.path) + self.repo = hg.repository(self.ui, self.path) + self.rev_no = 0 + + if self.storeRevIds: + # Add revision id file to the new repo + fname = self.path+'\\.revid' + codecs.open(fname, "w", "UTF-8").close() + commands.add(self.ui, self.repo, str(fname)) + + + # + # Takes an unprocessed revision from a revision log, fetches its data and commits it. + # Returns false if no unprocessed revisions remain. + # + def commitNext(self): + if self.rev_no >= len(self.wrevs): + return False + + rev = self.wrevs[self.rev_no] + source = self.wd.get_revision_source(rev['rev_id']) + # Page title and unix_name changes are only available through another request: + details = self.wd.get_revision_version(rev['rev_id']) + + # Store revision_id for last commit + # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial + if self.storeRevIds: + fname = self.path+'\\.revid' + outp = codecs.open(fname, "w", "UTF-8") + outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway + outp.close() + + unixname = rev['page_name'] + rev_unixname = details['unixname'] # may be different in revision than atm + + # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. + # The only way to know they were changed is revision comments, though evil people may trick us. + if rev['comment'].startswith('Parent page set to: "'): + # This is a parenting revision, remember the new parent + parent_unixname = rev['comment'][21:-2] + self.last_parents[unixname] = parent_unixname + else: + # Else use last parent_unixname we've recorded + parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None + # There are also problems when parent page gets renamed -- see updateChildren + + # If the page is tracked and its name just changed, tell HG + rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname) + if rename: + self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there + commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') + + # Ouput contents + fname = self.path+'\\'+rev_unixname+'.txt' + outp = codecs.open(fname, "w", "UTF-8") + if details['title']: + outp.write('title:'+details['title']+'\n') + if parent_unixname: + outp.write('parent:'+parent_unixname+'\n') + outp.write(source) + outp.close() + + # Add new page + if not unixname in self.last_names: # never before seen + commands.add(self.ui, self.repo, str(fname)) + + self.last_names[unixname] = rev_unixname + + # Commit + if rev['comment'] <> '': + commit_msg = rev_unixname + ': ' + rev['comment'] + else: + commit_msg = rev_unixname + if rev['date']: + commit_date = str(rev['date']) + ' 0' + else: + commit_date = None + print "Commiting: "+str(self.rev_no)+'. '+commit_msg + + commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) + self.rev_no += 1 + + self.saveState() # Update operation state + return True + + + # + # Updates all children of the page to reflect parent's unixname change. + # + # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body. + # A parent may then be renamed. + # Wikidot logs no additional changes for child pages, yet they stay linked to the parent. + # + # Therefore, on every rename we must update all linked children in the same revision. + # + def updateChildren(self, oldunixname, newunixname): + for child in self.last_parents.keys(): + if self.last_parents[child] == oldunixname: + self.updateParentField(child, self.last_parents[child], newunixname) + + # + # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. + # The rest of the file is preserved. + # + def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): + with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f: + content = f.readlines() + # Since this is all tracked by us, we KNOW there's a line in standard format somewhere + idx = content.index('parent:'+parent_oldunixname+'\n') + if idx < 0: + raise Exception("Cannot update child page "+child_unixname+": " + +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); + content[idx] = 'parent:'+parent_newunixname+'\n' + with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f: + f.writelines(content) + + + # + # Finalizes the construction process and deletes any temporary files. + # + def cleanup(self): + os.remove(self.path+'\\.wstate') os.remove(self.path+'\\.wrevs') \ No newline at end of file diff --git a/wikidot.py b/wikidot.py index f01c59f..df2252d 100644 --- a/wikidot.py +++ b/wikidot.py @@ -1,193 +1,193 @@ -import requests -import random -from bs4 import BeautifulSoup -import time - -# Implements various queries to Wikidot engine through its AJAX facilities - - -class Wikidot: - def __init__(self, site): - self.site = site # Wikidot site to query - self.delay = 200 # Delay between requests in msec - self.debug = False # Print debug messages - self.next_timeslot = time.clock() # Can call immediately - - - # To honor usage rules, we wait for self.delay between requests. - # Low-level query functions call this before every request to Wikidot./ - def _wait_request_slot(self): - tm = time.clock() - if self.next_timeslot - tm > 0: - time.sleep(self.next_timeslot - tm) - self.next_timeslot = tm + self.delay / 1000 - pass - - # Makes a Wikidot AJAX query. Returns the response+title or throws an error. - def queryex(self, params): - token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) - cookies = {"wikidot_token7": token} - params['wikidot_token7'] = token - - if self.debug: - print params - print cookies - - self._wait_request_slot() - req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) - json = req.json() - if json['status'] == 'ok': - return json['body'], (json['title'] if 'title' in json else '') - else: - raise req.text - - # Same but only returns the body, most responses don't have titles - def query(self, params): - return self.queryex(params)[0] - - - # List all pages for the site. - - # Raw version - # For the supported formats (module_body) see: - # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php - def list_pages_raw(self, limit): - res = self.query({ - 'moduleName': 'list/ListPagesModule', - 'limit': limit if limit else '10000', - 'perPage': limit if limit else '10000', - 'module_body': '%%page_unix_name%%', - 'separate': 'false', - 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default - }) - return res - - # Client version - def list_pages(self, limit): - raw = self.list_pages_raw(limit).replace('
',"\n") - soup = BeautifulSoup(raw, 'html.parser') - pages = [] - for entry in soup.div.p.text.split('\n'): - pages.append(entry) - return pages - - - # Retrieves internal page_id by page unix_name. - # Page IDs are required for most of page functions. - - def get_page_id(self, page_unix_name): - # The only freaking way to get page ID is to load the page! Wikidot! - self._wait_request_slot() - req = requests.request('GET', self.site+'/'+page_unix_name) - soup = BeautifulSoup(req.text, 'html.parser') - for item in soup.head.find_all('script'): - text = item.text - pos = text.find("WIKIREQUEST.info.pageId = ") - if pos >= 0: - pos += len("WIKIREQUEST.info.pageId = ") - crlf = text.find(";", pos) - if crlf >= 0: - return int(text[pos:crlf]) - else: - return int(text[pos:]) - return None - - - # Retrieves a list of revisions for a page. - # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php - - # Raw version - def get_revisions_raw(self, page_id, limit): - res = self.query({ - 'moduleName': 'history/PageRevisionListModule', - 'page_id': page_id, - 'page': '1', - 'perpage': limit if limit else '10000', - 'options': '{"all":true}' - }) - - soup = BeautifulSoup(res, 'html.parser') - return soup.table.contents - - # Client version - def get_revisions(self, page_id, limit): - revs = [] - for tr in self.get_revisions_raw(page_id, limit): - if tr.name != 'tr': continue # there's a header + various junk - - # RevID is stored as a value of an INPUT field - rev_id = tr.input['value'] if tr.input else None - if rev_id is None: continue # can't parse - - # Unixtime is stored as a CSS class time_* - rev_date = 0 - date_span = tr.find("span", attrs={"class": "odate"}) - if date_span is not None: - for cls in date_span['class']: - if cls.startswith('time_'): - rev_date = int(cls[5:]) - - # Username in a last under - user_span = tr.find("span", attrs={"class": "printuser"}) - for last_a in user_span.find_all('a'): pass - rev_user = last_a.getText() if last_a else None - - - # Comment is in the last TD of the row - last_td = None - for last_td in tr.find_all('td'): pass - rev_comment = last_td.getText() if last_td else "" - - revs.append({ - 'id': rev_id, - 'date': rev_date, - 'user': rev_user, - 'comment': rev_comment, - }) - return revs - - - # Retrieves revision source for a revision. - # There's no raw version because there's nothing else in raw. - def get_revision_source(self, rev_id): - res = self.query({ - 'moduleName': 'history/PageSourceModule', - 'revision_id': rev_id, - # We don't need page id - }) - # The source is HTMLified but BeautifulSoup's getText() will decode that - # - htmlentities - # -
s in place of linebreaks - # - random real linebreaks (have to be ignored) - soup = BeautifulSoup(res, 'html.parser') - return soup.div.getText().lstrip(' \r\n') - - # Retrieves the rendered version + additional info unavailable in get_revision_source: - # * Title - # * Unixname at the time - def get_revision_version_raw(self, rev_id): - res = self.queryex({ - 'moduleName': 'history/PageVersionModule', - 'revision_id': rev_id, - }) - return res - - def get_revision_version(self, rev_id): - res = self.get_revision_version_raw(rev_id) # this has title! - soup = BeautifulSoup(res[0], 'html.parser') - - # First table is a flyout with revision details. Remove and study it. - unixname = None - details = soup.find("div", attrs={"id": "page-version-info"}).extract() - for tr in details.find_all('tr'): - tds = tr.find_all('td') - if len(tds) < 2: continue - if tds[0].getText().strip() == 'Page name:': - unixname = tds[1].getText().strip() - - return { - 'rev_id': rev_id, - 'unixname': unixname, - 'title': res[1], - 'content': unicode(soup), # only content remains +import requests +import random +from bs4 import BeautifulSoup +import time + +# Implements various queries to Wikidot engine through its AJAX facilities + + +class Wikidot: + def __init__(self, site): + self.site = site # Wikidot site to query + self.delay = 200 # Delay between requests in msec + self.debug = False # Print debug messages + self.next_timeslot = time.clock() # Can call immediately + + + # To honor usage rules, we wait for self.delay between requests. + # Low-level query functions call this before every request to Wikidot./ + def _wait_request_slot(self): + tm = time.clock() + if self.next_timeslot - tm > 0: + time.sleep(self.next_timeslot - tm) + self.next_timeslot = tm + self.delay / 1000 + pass + + # Makes a Wikidot AJAX query. Returns the response+title or throws an error. + def queryex(self, params): + token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) + cookies = {"wikidot_token7": token} + params['wikidot_token7'] = token + + if self.debug: + print params + print cookies + + self._wait_request_slot() + req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) + json = req.json() + if json['status'] == 'ok': + return json['body'], (json['title'] if 'title' in json else '') + else: + raise req.text + + # Same but only returns the body, most responses don't have titles + def query(self, params): + return self.queryex(params)[0] + + + # List all pages for the site. + + # Raw version + # For the supported formats (module_body) see: + # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php + def list_pages_raw(self, limit): + res = self.query({ + 'moduleName': 'list/ListPagesModule', + 'limit': limit if limit else '10000', + 'perPage': limit if limit else '10000', + 'module_body': '%%page_unix_name%%', + 'separate': 'false', + 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default + }) + return res + + # Client version + def list_pages(self, limit): + raw = self.list_pages_raw(limit).replace('
',"\n") + soup = BeautifulSoup(raw, 'html.parser') + pages = [] + for entry in soup.div.p.text.split('\n'): + pages.append(entry) + return pages + + + # Retrieves internal page_id by page unix_name. + # Page IDs are required for most of page functions. + + def get_page_id(self, page_unix_name): + # The only freaking way to get page ID is to load the page! Wikidot! + self._wait_request_slot() + req = requests.request('GET', self.site+'/'+page_unix_name) + soup = BeautifulSoup(req.text, 'html.parser') + for item in soup.head.find_all('script'): + text = item.text + pos = text.find("WIKIREQUEST.info.pageId = ") + if pos >= 0: + pos += len("WIKIREQUEST.info.pageId = ") + crlf = text.find(";", pos) + if crlf >= 0: + return int(text[pos:crlf]) + else: + return int(text[pos:]) + return None + + + # Retrieves a list of revisions for a page. + # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php + + # Raw version + def get_revisions_raw(self, page_id, limit): + res = self.query({ + 'moduleName': 'history/PageRevisionListModule', + 'page_id': page_id, + 'page': '1', + 'perpage': limit if limit else '10000', + 'options': '{"all":true}' + }) + + soup = BeautifulSoup(res, 'html.parser') + return soup.table.contents + + # Client version + def get_revisions(self, page_id, limit): + revs = [] + for tr in self.get_revisions_raw(page_id, limit): + if tr.name != 'tr': continue # there's a header + various junk + + # RevID is stored as a value of an INPUT field + rev_id = tr.input['value'] if tr.input else None + if rev_id is None: continue # can't parse + + # Unixtime is stored as a CSS class time_* + rev_date = 0 + date_span = tr.find("span", attrs={"class": "odate"}) + if date_span is not None: + for cls in date_span['class']: + if cls.startswith('time_'): + rev_date = int(cls[5:]) + + # Username in a last
under + user_span = tr.find("span", attrs={"class": "printuser"}) + for last_a in user_span.find_all('a'): pass + rev_user = last_a.getText() if last_a else None + + + # Comment is in the last TD of the row + last_td = None + for last_td in tr.find_all('td'): pass + rev_comment = last_td.getText() if last_td else "" + + revs.append({ + 'id': rev_id, + 'date': rev_date, + 'user': rev_user, + 'comment': rev_comment, + }) + return revs + + + # Retrieves revision source for a revision. + # There's no raw version because there's nothing else in raw. + def get_revision_source(self, rev_id): + res = self.query({ + 'moduleName': 'history/PageSourceModule', + 'revision_id': rev_id, + # We don't need page id + }) + # The source is HTMLified but BeautifulSoup's getText() will decode that + # - htmlentities + # -
s in place of linebreaks + # - random real linebreaks (have to be ignored) + soup = BeautifulSoup(res, 'html.parser') + return soup.div.getText().lstrip(' \r\n') + + # Retrieves the rendered version + additional info unavailable in get_revision_source: + # * Title + # * Unixname at the time + def get_revision_version_raw(self, rev_id): + res = self.queryex({ + 'moduleName': 'history/PageVersionModule', + 'revision_id': rev_id, + }) + return res + + def get_revision_version(self, rev_id): + res = self.get_revision_version_raw(rev_id) # this has title! + soup = BeautifulSoup(res[0], 'html.parser') + + # First table is a flyout with revision details. Remove and study it. + unixname = None + details = soup.find("div", attrs={"id": "page-version-info"}).extract() + for tr in details.find_all('tr'): + tds = tr.find_all('td') + if len(tds) < 2: continue + if tds[0].getText().strip() == 'Page name:': + unixname = tds[1].getText().strip() + + return { + 'rev_id': rev_id, + 'unixname': unixname, + 'title': res[1], + 'content': unicode(soup), # only content remains } \ No newline at end of file From 68e9b67aa906547ff2b3b23e7dfe6ea50c6e2fa4 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 13:43:08 +0200 Subject: [PATCH 05/93] 2to3 on the rest --- crawl.py | 12 ++++++------ hgpatch.py | 2 +- rmaint.py | 38 +++++++++++++++++++------------------- wikidot.py | 6 +++--- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/crawl.py b/crawl.py index 08bbbf3..0516205 100644 --- a/crawl.py +++ b/crawl.py @@ -50,7 +50,7 @@ def force_dirs(path): raise exception if args.list_pages_raw: - print(wd.list_pages_raw(args.depth)) + print((wd.list_pages_raw(args.depth))) elif args.list_pages: for page in wd.list_pages(args.depth): @@ -65,7 +65,7 @@ def force_dirs(path): raise Exception("Page not found: "+args.page) revs = wd.get_revisions(page_id, 1) # last revision - print(wd.get_revision_source(revs[0]['id'])) + print((wd.get_revision_source(revs[0]['id']))) elif args.content: if not args.page: @@ -76,7 +76,7 @@ def force_dirs(path): raise Exception("Page not found: "+args.page) revs = wd.get_revisions(page_id, 1) # last revision - print(wd.get_revision_version(revs[0]['id'])) + print((wd.get_revision_version(revs[0]['id']))) elif args.log_raw: if not args.page: @@ -86,7 +86,7 @@ def force_dirs(path): if not page_id: raise Exception("Page not found: "+args.page) - print(wd.get_revisions_raw(page_id, args.depth)) + print((wd.get_revisions_raw(page_id, args.depth))) elif args.log: @@ -97,11 +97,11 @@ def force_dirs(path): if not page_id: raise Exception("Page not found: "+args.page) for rev in wd.get_revisions(page_id, args.depth): - print(str(rev)) + print((str(rev))) elif args.dump: - print("Downloading pages to "+args.dump) + print(("Downloading pages to "+args.dump)) force_dirs(args.dump) rm = RepoMaintainer(wd, args.dump) diff --git a/hgpatch.py b/hgpatch.py index 2d77769..de363ba 100644 --- a/hgpatch.py +++ b/hgpatch.py @@ -17,7 +17,7 @@ old_fromlocal = None def better_fromlocal(s): - if isinstance(s, unicode): + if isinstance(s, str): return s.encode('utf-8') global old_fromlocal return old_fromlocal(s) diff --git a/rmaint.py b/rmaint.py index fe21027..bfc0c87 100644 --- a/rmaint.py +++ b/rmaint.py @@ -2,7 +2,7 @@ import codecs from mercurial import commands, ui, hg import hgpatch -import cPickle as pickle +import pickle as pickle import wikidot # Repository builder and maintainer @@ -60,19 +60,19 @@ def loadWRevs(self): # def buildRevisionList(self, pages = None, depth = 10000): if os.path.isfile(self.path+'\\.wrevs'): - print "Loading cached revision list..." + print("Loading cached revision list...") self.loadWRevs() else: - print "Building revision list..." + print("Building revision list...") if not pages: pages = self.wd.list_pages(10000) self.wrevs = [] for page in pages: - print "Querying page: "+page + print(("Querying page: "+page)) page_id = self.wd.get_page_id(page) - print "ID: "+str(page_id) + print(("ID: "+str(page_id))) revs = self.wd.get_revisions(page_id, depth) - print "Revisions: "+str(len(revs)) + print(("Revisions: "+str(len(revs)))) for rev in revs: self.wrevs.append({ 'page_id' : page_id, @@ -83,20 +83,20 @@ def buildRevisionList(self, pages = None, depth = 10000): 'comment' : rev['comment'], }) self.saveWRevs() # Save a cached copy - print "" + print("") - print "Total revisions: "+str(len(self.wrevs)) + print(("Total revisions: "+str(len(self.wrevs)))) - print "Sorting revisions..." + print("Sorting revisions...") self.wrevs.sort(key=lambda rev: rev['date']) - print "" + print("") if self.debug: - print "Revision list: " + print("Revision list: ") for rev in self.wrevs: - print str(rev)+"\n" - print "" + print((str(rev)+"\n")) + print("") # @@ -132,12 +132,12 @@ def openRepo(self): self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo if os.path.isfile(self.path+'\\.wstate'): - print "Continuing from aborted dump state..." + print("Continuing from aborted dump state...") self.loadState() self.repo = hg.repository(self.ui, self.path) else: # create a new repository (will fail if one exists) - print "Initializing repository..." + print("Initializing repository...") commands.init(self.ui, self.path) self.repo = hg.repository(self.ui, self.path) self.rev_no = 0 @@ -185,7 +185,7 @@ def commitNext(self): # There are also problems when parent page gets renamed -- see updateChildren # If the page is tracked and its name just changed, tell HG - rename = (unixname in self.last_names) and (self.last_names[unixname] <> rev_unixname) + rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') @@ -207,7 +207,7 @@ def commitNext(self): self.last_names[unixname] = rev_unixname # Commit - if rev['comment'] <> '': + if rev['comment'] != '': commit_msg = rev_unixname + ': ' + rev['comment'] else: commit_msg = rev_unixname @@ -215,7 +215,7 @@ def commitNext(self): commit_date = str(rev['date']) + ' 0' else: commit_date = None - print "Commiting: "+str(self.rev_no)+'. '+commit_msg + print(("Commiting: "+str(self.rev_no)+'. '+commit_msg)) commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) self.rev_no += 1 @@ -234,7 +234,7 @@ def commitNext(self): # Therefore, on every rename we must update all linked children in the same revision. # def updateChildren(self, oldunixname, newunixname): - for child in self.last_parents.keys(): + for child in list(self.last_parents.keys()): if self.last_parents[child] == oldunixname: self.updateParentField(child, self.last_parents[child], newunixname) diff --git a/wikidot.py b/wikidot.py index df2252d..ba1c218 100644 --- a/wikidot.py +++ b/wikidot.py @@ -30,8 +30,8 @@ def queryex(self, params): params['wikidot_token7'] = token if self.debug: - print params - print cookies + print(params) + print(cookies) self._wait_request_slot() req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) @@ -189,5 +189,5 @@ def get_revision_version(self, rev_id): 'rev_id': rev_id, 'unixname': unixname, 'title': res[1], - 'content': unicode(soup), # only content remains + 'content': str(soup), # only content remains } \ No newline at end of file From 3f5fbd0c9bde859a48f3b47f746edb3cb445a417 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 13:54:35 +0200 Subject: [PATCH 06/93] tabs to spaces --- crawl.py | 98 ++++++------ hgpatch.py | 22 +-- rmaint.py | 456 ++++++++++++++++++++++++++--------------------------- wikidot.py | 368 +++++++++++++++++++++--------------------- 4 files changed, 472 insertions(+), 472 deletions(-) diff --git a/crawl.py b/crawl.py index 0516205..5ca5dfe 100644 --- a/crawl.py +++ b/crawl.py @@ -50,69 +50,69 @@ def force_dirs(path): raise exception if args.list_pages_raw: - print((wd.list_pages_raw(args.depth))) + print((wd.list_pages_raw(args.depth))) elif args.list_pages: - for page in wd.list_pages(args.depth): - print(page) + for page in wd.list_pages(args.depth): + print(page) elif args.source: - if not args.page: - raise Exception("Please specify --page for --source.") - - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - - revs = wd.get_revisions(page_id, 1) # last revision - print((wd.get_revision_source(revs[0]['id']))) + if not args.page: + raise Exception("Please specify --page for --source.") + + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + revs = wd.get_revisions(page_id, 1) # last revision + print((wd.get_revision_source(revs[0]['id']))) elif args.content: - if not args.page: - raise Exception("Please specify --page for --source.") - - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - - revs = wd.get_revisions(page_id, 1) # last revision - print((wd.get_revision_version(revs[0]['id']))) + if not args.page: + raise Exception("Please specify --page for --source.") + + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + + revs = wd.get_revisions(page_id, 1) # last revision + print((wd.get_revision_version(revs[0]['id']))) elif args.log_raw: - if not args.page: - raise Exception("Please specify --page for --log.") + if not args.page: + raise Exception("Please specify --page for --log.") - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) - print((wd.get_revisions_raw(page_id, args.depth))) + print((wd.get_revisions_raw(page_id, args.depth))) elif args.log: - if not args.page: - raise Exception("Please specify --page for --log.") + if not args.page: + raise Exception("Please specify --page for --log.") - page_id = wd.get_page_id(args.page) - if not page_id: - raise Exception("Page not found: "+args.page) - for rev in wd.get_revisions(page_id, args.depth): - print((str(rev))) + page_id = wd.get_page_id(args.page) + if not page_id: + raise Exception("Page not found: "+args.page) + for rev in wd.get_revisions(page_id, args.depth): + print((str(rev))) elif args.dump: - print(("Downloading pages to "+args.dump)) - force_dirs(args.dump) - - rm = RepoMaintainer(wd, args.dump) - rm.debug = args.debug - rm.storeRevIds = args.revids - rm.buildRevisionList([args.page] if args.page else None, args.depth) - rm.openRepo() - - print("Downloading revisions...") - while rm.commitNext(): - pass - - rm.cleanup() - print("Done.") + print(("Downloading pages to "+args.dump)) + force_dirs(args.dump) + + rm = RepoMaintainer(wd, args.dump) + rm.debug = args.debug + rm.storeRevIds = args.revids + rm.buildRevisionList([args.page] if args.page else None, args.depth) + rm.openRepo() + + print("Downloading revisions...") + while rm.commitNext(): + pass + + rm.cleanup() + print("Done.") diff --git a/hgpatch.py b/hgpatch.py index de363ba..02aed23 100644 --- a/hgpatch.py +++ b/hgpatch.py @@ -17,10 +17,10 @@ old_fromlocal = None def better_fromlocal(s): - if isinstance(s, str): - return s.encode('utf-8') - global old_fromlocal - return old_fromlocal(s) + if isinstance(s, str): + return s.encode('utf-8') + global old_fromlocal + return old_fromlocal(s) old_fromlocal = encoding.fromlocal encoding.fromlocal = better_fromlocal @@ -35,13 +35,13 @@ def better_fromlocal(s): old_vfs_call = None def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False): - fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose) - if path.endswith('last-message.txt'): - # Create a wrapper like codecs.open does: - info = codecs.lookup("utf-8") - fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict') - fp.encoding = 'utf-8' - return fp + fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose) + if path.endswith('last-message.txt'): + # Create a wrapper like codecs.open does: + info = codecs.lookup("utf-8") + fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict') + fp.encoding = 'utf-8' + return fp old_vfs_call = scmutil.vfs.__call__ scmutil.vfs.__call__ = better_vfs_call diff --git a/rmaint.py b/rmaint.py index bfc0c87..d64415d 100644 --- a/rmaint.py +++ b/rmaint.py @@ -13,251 +13,251 @@ # rm.buildRevisionList(pages, depth) # rm.openRepo() # while rm.commitNext(): -# pass +# pass # rm.cleanup() # Talkative. class RepoMaintainer: - def __init__(self, wikidot, path): - # Settings - self.wd = wikidot # Wikidot instance - self.path = path # Path to repository - self.debug = False # = True to enable more printing - self.storeRevIds = True # = True to store .revid with each commit - - # Internal state - self.wrevs = None # Compiled wikidot revision list (history) - - self.rev_no = 0 # Next revision to process - self.last_names = {} # Tracks page renames: name atm -> last name in repo - self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - self.ui = None # Mercurial UI object - self.repo = None # Mercurial repo object + def __init__(self, wikidot, path): + # Settings + self.wd = wikidot # Wikidot instance + self.path = path # Path to repository + self.debug = False # = True to enable more printing + self.storeRevIds = True # = True to store .revid with each commit + + # Internal state + self.wrevs = None # Compiled wikidot revision list (history) + + self.rev_no = 0 # Next revision to process + self.last_names = {} # Tracks page renames: name atm -> last name in repo + self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo + + self.ui = None # Mercurial UI object + self.repo = None # Mercurial repo object - # - # Saves and loads revision list from file - # - def saveWRevs(self): - fp = open(self.path+'\\.wrevs', 'wb') - pickle.dump(self.wrevs, fp) - fp.close() - - def loadWRevs(self): - fp = open(self.path+'\\.wrevs', 'rb') - self.wrevs = pickle.load(fp) - fp.close() + # + # Saves and loads revision list from file + # + def saveWRevs(self): + fp = open(self.path+'\\.wrevs', 'wb') + pickle.dump(self.wrevs, fp) + fp.close() + + def loadWRevs(self): + fp = open(self.path+'\\.wrevs', 'rb') + self.wrevs = pickle.load(fp) + fp.close() - # - # Compiles a combined revision list for a given set of pages, or all pages on the site. - # pages: compile history for these pages - # depth: download at most this number of revisions. - # - # If there exists a cached revision list at the repository destination, - # it is loaded and no requests are made. - # - def buildRevisionList(self, pages = None, depth = 10000): - if os.path.isfile(self.path+'\\.wrevs'): - print("Loading cached revision list...") - self.loadWRevs() - else: - print("Building revision list...") - if not pages: - pages = self.wd.list_pages(10000) - self.wrevs = [] - for page in pages: - print(("Querying page: "+page)) - page_id = self.wd.get_page_id(page) - print(("ID: "+str(page_id))) - revs = self.wd.get_revisions(page_id, depth) - print(("Revisions: "+str(len(revs)))) - for rev in revs: - self.wrevs.append({ - 'page_id' : page_id, - 'page_name' : page, # name atm, not at revision time - 'rev_id' : rev['id'], - 'date' : rev['date'], - 'user' : rev['user'], - 'comment' : rev['comment'], - }) - self.saveWRevs() # Save a cached copy - print("") - - - print(("Total revisions: "+str(len(self.wrevs)))) - - print("Sorting revisions...") - self.wrevs.sort(key=lambda rev: rev['date']) - print("") - - if self.debug: - print("Revision list: ") - for rev in self.wrevs: - print((str(rev)+"\n")) - print("") + # + # Compiles a combined revision list for a given set of pages, or all pages on the site. + # pages: compile history for these pages + # depth: download at most this number of revisions. + # + # If there exists a cached revision list at the repository destination, + # it is loaded and no requests are made. + # + def buildRevisionList(self, pages = None, depth = 10000): + if os.path.isfile(self.path+'\\.wrevs'): + print("Loading cached revision list...") + self.loadWRevs() + else: + print("Building revision list...") + if not pages: + pages = self.wd.list_pages(10000) + self.wrevs = [] + for page in pages: + print(("Querying page: "+page)) + page_id = self.wd.get_page_id(page) + print(("ID: "+str(page_id))) + revs = self.wd.get_revisions(page_id, depth) + print(("Revisions: "+str(len(revs)))) + for rev in revs: + self.wrevs.append({ + 'page_id' : page_id, + 'page_name' : page, # name atm, not at revision time + 'rev_id' : rev['id'], + 'date' : rev['date'], + 'user' : rev['user'], + 'comment' : rev['comment'], + }) + self.saveWRevs() # Save a cached copy + print("") + + + print(("Total revisions: "+str(len(self.wrevs)))) + + print("Sorting revisions...") + self.wrevs.sort(key=lambda rev: rev['date']) + print("") + + if self.debug: + print("Revision list: ") + for rev in self.wrevs: + print((str(rev)+"\n")) + print("") - # - # Saves and loads operational state from file - # - def saveState(self): - fp = open(self.path+'\\.wstate', 'wb') - pickle.dump(self.rev_no, fp) - pickle.dump(self.last_names, fp) - pickle.dump(self.last_parents, fp) - fp.close() - - def loadState(self): - fp = open(self.path+'\\.wstate', 'rb') - self.rev_no = pickle.load(fp) - self.last_names = pickle.load(fp) - try: - self.last_parents = pickle.load(fp) - except EOFError: - pass - fp.close() + # + # Saves and loads operational state from file + # + def saveState(self): + fp = open(self.path+'\\.wstate', 'wb') + pickle.dump(self.rev_no, fp) + pickle.dump(self.last_names, fp) + pickle.dump(self.last_parents, fp) + fp.close() + + def loadState(self): + fp = open(self.path+'\\.wstate', 'rb') + self.rev_no = pickle.load(fp) + self.last_names = pickle.load(fp) + try: + self.last_parents = pickle.load(fp) + except EOFError: + pass + fp.close() - # - # Initializes the construction process, after the revision list has been compiled. - # Either creates a new repo, or loads the existing one at the target path - # and restores its construction state. - # - def openRepo(self): - # Create a new repository or continue from aborted dump - self.ui=ui.ui() - self.last_names = {} # Tracks page renames: name atm -> last name in repo - self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - if os.path.isfile(self.path+'\\.wstate'): - print("Continuing from aborted dump state...") - self.loadState() - self.repo = hg.repository(self.ui, self.path) - - else: # create a new repository (will fail if one exists) - print("Initializing repository...") - commands.init(self.ui, self.path) - self.repo = hg.repository(self.ui, self.path) - self.rev_no = 0 - - if self.storeRevIds: - # Add revision id file to the new repo - fname = self.path+'\\.revid' - codecs.open(fname, "w", "UTF-8").close() - commands.add(self.ui, self.repo, str(fname)) - - - # - # Takes an unprocessed revision from a revision log, fetches its data and commits it. - # Returns false if no unprocessed revisions remain. - # - def commitNext(self): - if self.rev_no >= len(self.wrevs): - return False - - rev = self.wrevs[self.rev_no] - source = self.wd.get_revision_source(rev['rev_id']) - # Page title and unix_name changes are only available through another request: - details = self.wd.get_revision_version(rev['rev_id']) - - # Store revision_id for last commit - # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial - if self.storeRevIds: - fname = self.path+'\\.revid' - outp = codecs.open(fname, "w", "UTF-8") - outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway - outp.close() - - unixname = rev['page_name'] - rev_unixname = details['unixname'] # may be different in revision than atm - - # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. - # The only way to know they were changed is revision comments, though evil people may trick us. - if rev['comment'].startswith('Parent page set to: "'): - # This is a parenting revision, remember the new parent - parent_unixname = rev['comment'][21:-2] - self.last_parents[unixname] = parent_unixname - else: - # Else use last parent_unixname we've recorded - parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None - # There are also problems when parent page gets renamed -- see updateChildren - - # If the page is tracked and its name just changed, tell HG - rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) - if rename: - self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there - commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') - - # Ouput contents - fname = self.path+'\\'+rev_unixname+'.txt' - outp = codecs.open(fname, "w", "UTF-8") - if details['title']: - outp.write('title:'+details['title']+'\n') - if parent_unixname: - outp.write('parent:'+parent_unixname+'\n') - outp.write(source) - outp.close() - - # Add new page - if not unixname in self.last_names: # never before seen - commands.add(self.ui, self.repo, str(fname)) + # + # Initializes the construction process, after the revision list has been compiled. + # Either creates a new repo, or loads the existing one at the target path + # and restores its construction state. + # + def openRepo(self): + # Create a new repository or continue from aborted dump + self.ui=ui.ui() + self.last_names = {} # Tracks page renames: name atm -> last name in repo + self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo + + if os.path.isfile(self.path+'\\.wstate'): + print("Continuing from aborted dump state...") + self.loadState() + self.repo = hg.repository(self.ui, self.path) + + else: # create a new repository (will fail if one exists) + print("Initializing repository...") + commands.init(self.ui, self.path) + self.repo = hg.repository(self.ui, self.path) + self.rev_no = 0 + + if self.storeRevIds: + # Add revision id file to the new repo + fname = self.path+'\\.revid' + codecs.open(fname, "w", "UTF-8").close() + commands.add(self.ui, self.repo, str(fname)) + + + # + # Takes an unprocessed revision from a revision log, fetches its data and commits it. + # Returns false if no unprocessed revisions remain. + # + def commitNext(self): + if self.rev_no >= len(self.wrevs): + return False + + rev = self.wrevs[self.rev_no] + source = self.wd.get_revision_source(rev['rev_id']) + # Page title and unix_name changes are only available through another request: + details = self.wd.get_revision_version(rev['rev_id']) + + # Store revision_id for last commit + # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial + if self.storeRevIds: + fname = self.path+'\\.revid' + outp = codecs.open(fname, "w", "UTF-8") + outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway + outp.close() + + unixname = rev['page_name'] + rev_unixname = details['unixname'] # may be different in revision than atm + + # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. + # The only way to know they were changed is revision comments, though evil people may trick us. + if rev['comment'].startswith('Parent page set to: "'): + # This is a parenting revision, remember the new parent + parent_unixname = rev['comment'][21:-2] + self.last_parents[unixname] = parent_unixname + else: + # Else use last parent_unixname we've recorded + parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None + # There are also problems when parent page gets renamed -- see updateChildren + + # If the page is tracked and its name just changed, tell HG + rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) + if rename: + self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there + commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') + + # Ouput contents + fname = self.path+'\\'+rev_unixname+'.txt' + outp = codecs.open(fname, "w", "UTF-8") + if details['title']: + outp.write('title:'+details['title']+'\n') + if parent_unixname: + outp.write('parent:'+parent_unixname+'\n') + outp.write(source) + outp.close() + + # Add new page + if not unixname in self.last_names: # never before seen + commands.add(self.ui, self.repo, str(fname)) - self.last_names[unixname] = rev_unixname + self.last_names[unixname] = rev_unixname - # Commit - if rev['comment'] != '': - commit_msg = rev_unixname + ': ' + rev['comment'] - else: - commit_msg = rev_unixname - if rev['date']: - commit_date = str(rev['date']) + ' 0' - else: - commit_date = None - print(("Commiting: "+str(self.rev_no)+'. '+commit_msg)) + # Commit + if rev['comment'] != '': + commit_msg = rev_unixname + ': ' + rev['comment'] + else: + commit_msg = rev_unixname + if rev['date']: + commit_date = str(rev['date']) + ' 0' + else: + commit_date = None + print(("Commiting: "+str(self.rev_no)+'. '+commit_msg)) - commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) - self.rev_no += 1 + commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) + self.rev_no += 1 - self.saveState() # Update operation state - return True + self.saveState() # Update operation state + return True - # - # Updates all children of the page to reflect parent's unixname change. - # - # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body. - # A parent may then be renamed. - # Wikidot logs no additional changes for child pages, yet they stay linked to the parent. - # - # Therefore, on every rename we must update all linked children in the same revision. - # - def updateChildren(self, oldunixname, newunixname): - for child in list(self.last_parents.keys()): - if self.last_parents[child] == oldunixname: - self.updateParentField(child, self.last_parents[child], newunixname) - - # - # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. - # The rest of the file is preserved. - # - def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f: - content = f.readlines() - # Since this is all tracked by us, we KNOW there's a line in standard format somewhere - idx = content.index('parent:'+parent_oldunixname+'\n') - if idx < 0: - raise Exception("Cannot update child page "+child_unixname+": " - +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); - content[idx] = 'parent:'+parent_newunixname+'\n' - with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f: - f.writelines(content) + # + # Updates all children of the page to reflect parent's unixname change. + # + # Any page may be assigned a parent, which adds entry to revision log. We store this as parent:unixname in the page body. + # A parent may then be renamed. + # Wikidot logs no additional changes for child pages, yet they stay linked to the parent. + # + # Therefore, on every rename we must update all linked children in the same revision. + # + def updateChildren(self, oldunixname, newunixname): + for child in list(self.last_parents.keys()): + if self.last_parents[child] == oldunixname: + self.updateParentField(child, self.last_parents[child], newunixname) + + # + # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. + # The rest of the file is preserved. + # + def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): + with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f: + content = f.readlines() + # Since this is all tracked by us, we KNOW there's a line in standard format somewhere + idx = content.index('parent:'+parent_oldunixname+'\n') + if idx < 0: + raise Exception("Cannot update child page "+child_unixname+": " + +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); + content[idx] = 'parent:'+parent_newunixname+'\n' + with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f: + f.writelines(content) - # - # Finalizes the construction process and deletes any temporary files. - # - def cleanup(self): - os.remove(self.path+'\\.wstate') - os.remove(self.path+'\\.wrevs') \ No newline at end of file + # + # Finalizes the construction process and deletes any temporary files. + # + def cleanup(self): + os.remove(self.path+'\\.wstate') + os.remove(self.path+'\\.wrevs') \ No newline at end of file diff --git a/wikidot.py b/wikidot.py index ba1c218..4760f5f 100644 --- a/wikidot.py +++ b/wikidot.py @@ -7,187 +7,187 @@ class Wikidot: - def __init__(self, site): - self.site = site # Wikidot site to query - self.delay = 200 # Delay between requests in msec - self.debug = False # Print debug messages - self.next_timeslot = time.clock() # Can call immediately - - - # To honor usage rules, we wait for self.delay between requests. - # Low-level query functions call this before every request to Wikidot./ - def _wait_request_slot(self): - tm = time.clock() - if self.next_timeslot - tm > 0: - time.sleep(self.next_timeslot - tm) - self.next_timeslot = tm + self.delay / 1000 - pass - - # Makes a Wikidot AJAX query. Returns the response+title or throws an error. - def queryex(self, params): - token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) - cookies = {"wikidot_token7": token} - params['wikidot_token7'] = token - - if self.debug: - print(params) - print(cookies) - - self._wait_request_slot() - req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) - json = req.json() - if json['status'] == 'ok': - return json['body'], (json['title'] if 'title' in json else '') - else: - raise req.text - - # Same but only returns the body, most responses don't have titles - def query(self, params): - return self.queryex(params)[0] - - - # List all pages for the site. - - # Raw version - # For the supported formats (module_body) see: - # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php - def list_pages_raw(self, limit): - res = self.query({ - 'moduleName': 'list/ListPagesModule', - 'limit': limit if limit else '10000', - 'perPage': limit if limit else '10000', - 'module_body': '%%page_unix_name%%', - 'separate': 'false', - 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default - }) - return res - - # Client version - def list_pages(self, limit): - raw = self.list_pages_raw(limit).replace('
',"\n") - soup = BeautifulSoup(raw, 'html.parser') - pages = [] - for entry in soup.div.p.text.split('\n'): - pages.append(entry) - return pages - - - # Retrieves internal page_id by page unix_name. - # Page IDs are required for most of page functions. - - def get_page_id(self, page_unix_name): - # The only freaking way to get page ID is to load the page! Wikidot! - self._wait_request_slot() - req = requests.request('GET', self.site+'/'+page_unix_name) - soup = BeautifulSoup(req.text, 'html.parser') - for item in soup.head.find_all('script'): - text = item.text - pos = text.find("WIKIREQUEST.info.pageId = ") - if pos >= 0: - pos += len("WIKIREQUEST.info.pageId = ") - crlf = text.find(";", pos) - if crlf >= 0: - return int(text[pos:crlf]) - else: - return int(text[pos:]) - return None - - - # Retrieves a list of revisions for a page. - # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php - - # Raw version - def get_revisions_raw(self, page_id, limit): - res = self.query({ - 'moduleName': 'history/PageRevisionListModule', - 'page_id': page_id, - 'page': '1', - 'perpage': limit if limit else '10000', - 'options': '{"all":true}' - }) - - soup = BeautifulSoup(res, 'html.parser') - return soup.table.contents - - # Client version - def get_revisions(self, page_id, limit): - revs = [] - for tr in self.get_revisions_raw(page_id, limit): - if tr.name != 'tr': continue # there's a header + various junk - - # RevID is stored as a value of an INPUT field - rev_id = tr.input['value'] if tr.input else None - if rev_id is None: continue # can't parse - - # Unixtime is stored as a CSS class time_* - rev_date = 0 - date_span = tr.find("span", attrs={"class": "odate"}) - if date_span is not None: - for cls in date_span['class']: - if cls.startswith('time_'): - rev_date = int(cls[5:]) - - # Username in a last
under - user_span = tr.find("span", attrs={"class": "printuser"}) - for last_a in user_span.find_all('a'): pass - rev_user = last_a.getText() if last_a else None - - - # Comment is in the last TD of the row - last_td = None - for last_td in tr.find_all('td'): pass - rev_comment = last_td.getText() if last_td else "" - - revs.append({ - 'id': rev_id, - 'date': rev_date, - 'user': rev_user, - 'comment': rev_comment, - }) - return revs - - - # Retrieves revision source for a revision. - # There's no raw version because there's nothing else in raw. - def get_revision_source(self, rev_id): - res = self.query({ - 'moduleName': 'history/PageSourceModule', - 'revision_id': rev_id, - # We don't need page id - }) - # The source is HTMLified but BeautifulSoup's getText() will decode that - # - htmlentities - # -
s in place of linebreaks - # - random real linebreaks (have to be ignored) - soup = BeautifulSoup(res, 'html.parser') - return soup.div.getText().lstrip(' \r\n') - - # Retrieves the rendered version + additional info unavailable in get_revision_source: - # * Title - # * Unixname at the time - def get_revision_version_raw(self, rev_id): - res = self.queryex({ - 'moduleName': 'history/PageVersionModule', - 'revision_id': rev_id, - }) - return res - - def get_revision_version(self, rev_id): - res = self.get_revision_version_raw(rev_id) # this has title! - soup = BeautifulSoup(res[0], 'html.parser') - - # First table is a flyout with revision details. Remove and study it. - unixname = None - details = soup.find("div", attrs={"id": "page-version-info"}).extract() - for tr in details.find_all('tr'): - tds = tr.find_all('td') - if len(tds) < 2: continue - if tds[0].getText().strip() == 'Page name:': - unixname = tds[1].getText().strip() - - return { - 'rev_id': rev_id, - 'unixname': unixname, - 'title': res[1], - 'content': str(soup), # only content remains - } \ No newline at end of file + def __init__(self, site): + self.site = site # Wikidot site to query + self.delay = 200 # Delay between requests in msec + self.debug = False # Print debug messages + self.next_timeslot = time.clock() # Can call immediately + + + # To honor usage rules, we wait for self.delay between requests. + # Low-level query functions call this before every request to Wikidot./ + def _wait_request_slot(self): + tm = time.clock() + if self.next_timeslot - tm > 0: + time.sleep(self.next_timeslot - tm) + self.next_timeslot = tm + self.delay / 1000 + pass + + # Makes a Wikidot AJAX query. Returns the response+title or throws an error. + def queryex(self, params): + token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) + cookies = {"wikidot_token7": token} + params['wikidot_token7'] = token + + if self.debug: + print(params) + print(cookies) + + self._wait_request_slot() + req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) + json = req.json() + if json['status'] == 'ok': + return json['body'], (json['title'] if 'title' in json else '') + else: + raise req.text + + # Same but only returns the body, most responses don't have titles + def query(self, params): + return self.queryex(params)[0] + + + # List all pages for the site. + + # Raw version + # For the supported formats (module_body) see: + # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php + def list_pages_raw(self, limit): + res = self.query({ + 'moduleName': 'list/ListPagesModule', + 'limit': limit if limit else '10000', + 'perPage': limit if limit else '10000', + 'module_body': '%%page_unix_name%%', + 'separate': 'false', + 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default + }) + return res + + # Client version + def list_pages(self, limit): + raw = self.list_pages_raw(limit).replace('
',"\n") + soup = BeautifulSoup(raw, 'html.parser') + pages = [] + for entry in soup.div.p.text.split('\n'): + pages.append(entry) + return pages + + + # Retrieves internal page_id by page unix_name. + # Page IDs are required for most of page functions. + + def get_page_id(self, page_unix_name): + # The only freaking way to get page ID is to load the page! Wikidot! + self._wait_request_slot() + req = requests.request('GET', self.site+'/'+page_unix_name) + soup = BeautifulSoup(req.text, 'html.parser') + for item in soup.head.find_all('script'): + text = item.text + pos = text.find("WIKIREQUEST.info.pageId = ") + if pos >= 0: + pos += len("WIKIREQUEST.info.pageId = ") + crlf = text.find(";", pos) + if crlf >= 0: + return int(text[pos:crlf]) + else: + return int(text[pos:]) + return None + + + # Retrieves a list of revisions for a page. + # See https://github.com/gabrys/wikidot/blob/master/php/modules/history/PageRevisionListModule.php + + # Raw version + def get_revisions_raw(self, page_id, limit): + res = self.query({ + 'moduleName': 'history/PageRevisionListModule', + 'page_id': page_id, + 'page': '1', + 'perpage': limit if limit else '10000', + 'options': '{"all":true}' + }) + + soup = BeautifulSoup(res, 'html.parser') + return soup.table.contents + + # Client version + def get_revisions(self, page_id, limit): + revs = [] + for tr in self.get_revisions_raw(page_id, limit): + if tr.name != 'tr': continue # there's a header + various junk + + # RevID is stored as a value of an INPUT field + rev_id = tr.input['value'] if tr.input else None + if rev_id is None: continue # can't parse + + # Unixtime is stored as a CSS class time_* + rev_date = 0 + date_span = tr.find("span", attrs={"class": "odate"}) + if date_span is not None: + for cls in date_span['class']: + if cls.startswith('time_'): + rev_date = int(cls[5:]) + + # Username in a last
under + user_span = tr.find("span", attrs={"class": "printuser"}) + for last_a in user_span.find_all('a'): pass + rev_user = last_a.getText() if last_a else None + + + # Comment is in the last TD of the row + last_td = None + for last_td in tr.find_all('td'): pass + rev_comment = last_td.getText() if last_td else "" + + revs.append({ + 'id': rev_id, + 'date': rev_date, + 'user': rev_user, + 'comment': rev_comment, + }) + return revs + + + # Retrieves revision source for a revision. + # There's no raw version because there's nothing else in raw. + def get_revision_source(self, rev_id): + res = self.query({ + 'moduleName': 'history/PageSourceModule', + 'revision_id': rev_id, + # We don't need page id + }) + # The source is HTMLified but BeautifulSoup's getText() will decode that + # - htmlentities + # -
s in place of linebreaks + # - random real linebreaks (have to be ignored) + soup = BeautifulSoup(res, 'html.parser') + return soup.div.getText().lstrip(' \r\n') + + # Retrieves the rendered version + additional info unavailable in get_revision_source: + # * Title + # * Unixname at the time + def get_revision_version_raw(self, rev_id): + res = self.queryex({ + 'moduleName': 'history/PageVersionModule', + 'revision_id': rev_id, + }) + return res + + def get_revision_version(self, rev_id): + res = self.get_revision_version_raw(rev_id) # this has title! + soup = BeautifulSoup(res[0], 'html.parser') + + # First table is a flyout with revision details. Remove and study it. + unixname = None + details = soup.find("div", attrs={"id": "page-version-info"}).extract() + for tr in details.find_all('tr'): + tds = tr.find_all('td') + if len(tds) < 2: continue + if tds[0].getText().strip() == 'Page name:': + unixname = tds[1].getText().strip() + + return { + 'rev_id': rev_id, + 'unixname': unixname, + 'title': res[1], + 'content': str(soup), # only content remains + } \ No newline at end of file From 31421883b307c66139a65ff5bb73792e2b8ebcec Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 14:01:36 +0200 Subject: [PATCH 07/93] don't kill stdout and stderr --- crawl.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/crawl.py b/crawl.py index 5ca5dfe..409eed1 100644 --- a/crawl.py +++ b/crawl.py @@ -11,11 +11,6 @@ # TODO: Ability to download new transactions since last dump. # We'll probably check the last revision time, then query all transactions and select those with greater revision time (not equal, since we would have downloaded equals at the previous dump) -rawStdout = sys.stdout -rawStderr = sys.stderr -sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, 'xmlcharrefreplace') -sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr, 'xmlcharrefreplace') - parser = argparse.ArgumentParser(description='Queries Wikidot') parser.add_argument('site', help='URL of Wikidot site') # Actions From 19c0bc83b1aa10ae01ca707c851a61b75e3361e8 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:15:02 +0200 Subject: [PATCH 08/93] os.errno doesn't exist anymore in python3, but exist_ok does --- crawl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/crawl.py b/crawl.py index 409eed1..9fd0bab 100644 --- a/crawl.py +++ b/crawl.py @@ -38,11 +38,7 @@ def force_dirs(path): - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != os.errno.EEXIST: - raise exception + os.makedirs(path, exist_ok=True) if args.list_pages_raw: print((wd.list_pages_raw(args.depth))) @@ -98,16 +94,16 @@ def force_dirs(path): elif args.dump: print(("Downloading pages to "+args.dump)) force_dirs(args.dump) - + rm = RepoMaintainer(wd, args.dump) rm.debug = args.debug rm.storeRevIds = args.revids rm.buildRevisionList([args.page] if args.page else None, args.depth) rm.openRepo() - + print("Downloading revisions...") while rm.commitNext(): pass - + rm.cleanup() print("Done.") From a69ef7cdc32463e87d169d0012ce39070100e43f Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:18:16 +0200 Subject: [PATCH 09/93] port to git, mercurial doesn't have a python3 API (at least not stable enough to be available by default) --- rmaint.py | 75 ++++++++++++++++++++++++++++++++---------------------- wikidot.py | 15 ++++++----- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/rmaint.py b/rmaint.py index d64415d..dc56153 100644 --- a/rmaint.py +++ b/rmaint.py @@ -1,9 +1,14 @@ +import wikidot + +# Basic python stuff import os import codecs -from mercurial import commands, ui, hg -import hgpatch import pickle as pickle -import wikidot + +# git stuff +from git import Repo, Actor +import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps +import re # For sanitizing usernames to fake email addresses # Repository builder and maintainer # Contains logic for actual loading and maintaining the repository over the course of its construction. @@ -25,16 +30,16 @@ def __init__(self, wikidot, path): self.path = path # Path to repository self.debug = False # = True to enable more printing self.storeRevIds = True # = True to store .revid with each commit - + # Internal state self.wrevs = None # Compiled wikidot revision list (history) - + self.rev_no = 0 # Next revision to process self.last_names = {} # Tracks page renames: name atm -> last name in repo self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - self.ui = None # Mercurial UI object - self.repo = None # Mercurial repo object + + self.repo = None # Git repo object + self.index = None # Git current index object # @@ -127,28 +132,28 @@ def loadState(self): # def openRepo(self): # Create a new repository or continue from aborted dump - self.ui=ui.ui() self.last_names = {} # Tracks page renames: name atm -> last name in repo self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo if os.path.isfile(self.path+'\\.wstate'): print("Continuing from aborted dump state...") self.loadState() - self.repo = hg.repository(self.ui, self.path) - + self.repo = Repo(self.path) + assert not self.repo.bare + else: # create a new repository (will fail if one exists) print("Initializing repository...") - commands.init(self.ui, self.path) - self.repo = hg.repository(self.ui, self.path) + self.repo = Repo.init(self.path) self.rev_no = 0 - + if self.storeRevIds: # Add revision id file to the new repo - fname = self.path+'\\.revid' - codecs.open(fname, "w", "UTF-8").close() - commands.add(self.ui, self.repo, str(fname)) - - + fname = '/.revid' + codecs.open(self.path + fname, "w", "UTF-8").close() + self.repo.index.add([fname]) + self.index.commit("Initial creation of repo") + self.index = self.repo.index + # # Takes an unprocessed revision from a revision log, fetches its data and commits it. # Returns false if no unprocessed revisions remain. @@ -156,23 +161,23 @@ def openRepo(self): def commitNext(self): if self.rev_no >= len(self.wrevs): return False - + rev = self.wrevs[self.rev_no] source = self.wd.get_revision_source(rev['rev_id']) # Page title and unix_name changes are only available through another request: details = self.wd.get_revision_version(rev['rev_id']) - + # Store revision_id for last commit - # Without this, empty commits (e.g. file uploads) will be skipped by Mercurial + # Without this, empty commits (e.g. file uploads) will be skipped by Git if self.storeRevIds: fname = self.path+'\\.revid' outp = codecs.open(fname, "w", "UTF-8") outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway outp.close() - + unixname = rev['page_name'] rev_unixname = details['unixname'] # may be different in revision than atm - + # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. # The only way to know they were changed is revision comments, though evil people may trick us. if rev['comment'].startswith('Parent page set to: "'): @@ -183,13 +188,13 @@ def commitNext(self): # Else use last parent_unixname we've recorded parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None # There are also problems when parent page gets renamed -- see updateChildren - + # If the page is tracked and its name just changed, tell HG rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there - commands.rename(self.ui, self.repo, self.path+'\\'+str(self.last_names[unixname])+'.txt', self.path+'\\'+str(rev_unixname)+'.txt') - + self.index.move([str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt']) + # Ouput contents fname = self.path+'\\'+rev_unixname+'.txt' outp = codecs.open(fname, "w", "UTF-8") @@ -199,10 +204,13 @@ def commitNext(self): outp.write('parent:'+parent_unixname+'\n') outp.write(source) outp.close() - + # Add new page if not unixname in self.last_names: # never before seen - commands.add(self.ui, self.repo, str(fname)) + if self.debug: + print("adding", fname) + + self.index.add([str(fname)]) self.last_names[unixname] = rev_unixname @@ -212,12 +220,17 @@ def commitNext(self): else: commit_msg = rev_unixname if rev['date']: - commit_date = str(rev['date']) + ' 0' + parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT + commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time) else: commit_date = None print(("Commiting: "+str(self.rev_no)+'. '+commit_msg)) - commands.commit(self.ui, self.repo, message=commit_msg, user=rev['user'], date=commit_date) + username = str(rev['user']) + email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename + + author = Actor(username, email) + commit = self.index.commit(commit_msg, author=author, commit_date=commit_date) self.rev_no += 1 self.saveState() # Update operation state diff --git a/wikidot.py b/wikidot.py index 4760f5f..ba34f39 100644 --- a/wikidot.py +++ b/wikidot.py @@ -2,6 +2,7 @@ import random from bs4 import BeautifulSoup import time +from urllib.parse import urlparse # Implements various queries to Wikidot engine through its AJAX facilities @@ -9,6 +10,7 @@ class Wikidot: def __init__(self, site): self.site = site # Wikidot site to query + self.sitename = urlparse(site).hostname.lower() self.delay = 200 # Delay between requests in msec self.debug = False # Print debug messages self.next_timeslot = time.clock() # Can call immediately @@ -28,7 +30,7 @@ def queryex(self, params): token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) cookies = {"wikidot_token7": token} params['wikidot_token7'] = token - + if self.debug: print(params) print(cookies) @@ -36,6 +38,7 @@ def queryex(self, params): self._wait_request_slot() req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) json = req.json() + if json['status'] == 'ok': return json['body'], (json['title'] if 'title' in json else '') else: @@ -105,7 +108,7 @@ def get_revisions_raw(self, page_id, limit): 'perpage': limit if limit else '10000', 'options': '{"all":true}' }) - + soup = BeautifulSoup(res, 'html.parser') return soup.table.contents @@ -131,7 +134,7 @@ def get_revisions(self, page_id, limit): user_span = tr.find("span", attrs={"class": "printuser"}) for last_a in user_span.find_all('a'): pass rev_user = last_a.getText() if last_a else None - + # Comment is in the last TD of the row last_td = None @@ -161,7 +164,7 @@ def get_revision_source(self, rev_id): # - random real linebreaks (have to be ignored) soup = BeautifulSoup(res, 'html.parser') return soup.div.getText().lstrip(' \r\n') - + # Retrieves the rendered version + additional info unavailable in get_revision_source: # * Title # * Unixname at the time @@ -171,7 +174,7 @@ def get_revision_version_raw(self, rev_id): 'revision_id': rev_id, }) return res - + def get_revision_version(self, rev_id): res = self.get_revision_version_raw(rev_id) # this has title! soup = BeautifulSoup(res[0], 'html.parser') @@ -190,4 +193,4 @@ def get_revision_version(self, rev_id): 'unixname': unixname, 'title': res[1], 'content': str(soup), # only content remains - } \ No newline at end of file + } From 8d9232d7f99f41bd6d74225bf8662885fce7c330 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:18:27 +0200 Subject: [PATCH 10/93] use proper path separators --- rmaint.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/rmaint.py b/rmaint.py index dc56153..d5edec3 100644 --- a/rmaint.py +++ b/rmaint.py @@ -46,12 +46,12 @@ def __init__(self, wikidot, path): # Saves and loads revision list from file # def saveWRevs(self): - fp = open(self.path+'\\.wrevs', 'wb') + fp = open(self.path+'/.wrevs', 'wb') pickle.dump(self.wrevs, fp) fp.close() - + def loadWRevs(self): - fp = open(self.path+'\\.wrevs', 'rb') + fp = open(self.path+'/.wrevs', 'rb') self.wrevs = pickle.load(fp) fp.close() @@ -64,7 +64,7 @@ def loadWRevs(self): # it is loaded and no requests are made. # def buildRevisionList(self, pages = None, depth = 10000): - if os.path.isfile(self.path+'\\.wrevs'): + if os.path.isfile(self.path+'/.wrevs'): print("Loading cached revision list...") self.loadWRevs() else: @@ -108,14 +108,14 @@ def buildRevisionList(self, pages = None, depth = 10000): # Saves and loads operational state from file # def saveState(self): - fp = open(self.path+'\\.wstate', 'wb') + fp = open(self.path+'/.wstate', 'wb') pickle.dump(self.rev_no, fp) pickle.dump(self.last_names, fp) pickle.dump(self.last_parents, fp) fp.close() def loadState(self): - fp = open(self.path+'\\.wstate', 'rb') + fp = open(self.path+'/.wstate', 'rb') self.rev_no = pickle.load(fp) self.last_names = pickle.load(fp) try: @@ -134,8 +134,8 @@ def openRepo(self): # Create a new repository or continue from aborted dump self.last_names = {} # Tracks page renames: name atm -> last name in repo self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - - if os.path.isfile(self.path+'\\.wstate'): + + if os.path.isfile(self.path+'/.wstate'): print("Continuing from aborted dump state...") self.loadState() self.repo = Repo(self.path) @@ -170,7 +170,7 @@ def commitNext(self): # Store revision_id for last commit # Without this, empty commits (e.g. file uploads) will be skipped by Git if self.storeRevIds: - fname = self.path+'\\.revid' + fname = self.path+'/.revid' outp = codecs.open(fname, "w", "UTF-8") outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway outp.close() @@ -192,12 +192,15 @@ def commitNext(self): # If the page is tracked and its name just changed, tell HG rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: + if self.debug: + print("moving", str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt') + self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there self.index.move([str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt']) # Ouput contents - fname = self.path+'\\'+rev_unixname+'.txt' - outp = codecs.open(fname, "w", "UTF-8") + fname = rev_unixname+'.txt' + outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") if details['title']: outp.write('title:'+details['title']+'\n') if parent_unixname: @@ -233,6 +236,9 @@ def commitNext(self): commit = self.index.commit(commit_msg, author=author, commit_date=commit_date) self.rev_no += 1 + if self.debug: + print('committed', commit.name_rev, 'by', author) + self.saveState() # Update operation state return True @@ -250,13 +256,13 @@ def updateChildren(self, oldunixname, newunixname): for child in list(self.last_parents.keys()): if self.last_parents[child] == oldunixname: self.updateParentField(child, self.last_parents[child], newunixname) - + # # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. # The rest of the file is preserved. # def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - with codecs.open(self.path+'\\'+child_unixname+'.txt', "r", "UTF-8") as f: + with codecs.open(self.path+'/'+child_unixname+'.txt', "r", "UTF-8") as f: content = f.readlines() # Since this is all tracked by us, we KNOW there's a line in standard format somewhere idx = content.index('parent:'+parent_oldunixname+'\n') @@ -264,7 +270,7 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna raise Exception("Cannot update child page "+child_unixname+": " +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); content[idx] = 'parent:'+parent_newunixname+'\n' - with codecs.open(self.path+'\\'+child_unixname+'.txt', "w", "UTF-8") as f: + with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f: f.writelines(content) @@ -272,5 +278,5 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna # Finalizes the construction process and deletes any temporary files. # def cleanup(self): - os.remove(self.path+'\\.wstate') - os.remove(self.path+'\\.wrevs') \ No newline at end of file + os.remove(self.path+'/.wstate') + os.remove(self.path+'/.wrevs') From 4990468370d7abc43dae77f77434493900632821 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:18:49 +0200 Subject: [PATCH 11/93] don't need the mercurial monkeypatching anymore --- hgpatch.py | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 hgpatch.py diff --git a/hgpatch.py b/hgpatch.py deleted file mode 100644 index 02aed23..0000000 --- a/hgpatch.py +++ /dev/null @@ -1,50 +0,0 @@ -from mercurial import scmutil, osutil -from types import MethodType -from mercurial import encoding -import codecs - -# Patches commit-message unicode handling on Python 2.x - -# Mercurial is internally unicode. But because it runs from ASCII console, it tries to convert -# all input from "input encoding" (set in mercurial/encoding.py) - -# Problem 1: -# If you just pass it u'unicode string', it'll fail. Even if you set "input encoding" to utf-8, -# it'll still try to decode it to ASCII. -# Solution: -# Patch this decoding function to pass unicode unchanged. - -old_fromlocal = None - -def better_fromlocal(s): - if isinstance(s, str): - return s.encode('utf-8') - global old_fromlocal - return old_fromlocal(s) - -old_fromlocal = encoding.fromlocal -encoding.fromlocal = better_fromlocal - - -# Problem 2: -# Separate from actual log, Mercurial stores commit message in commit-message.txt. -# Unfortunately it uses default Python 2.x file.open which expects ASCII and auto-conversion fails. -# Solution: -# Patch virtual-fs open() function to use codecs.open wrapper in this particular case. - -old_vfs_call = None - -def better_vfs_call(self, path, mode="r", text=False, atomictemp=False, notindexed=False, backgroundclose=False): - fp = old_vfs_call(self, path, mode, text, atomictemp, notindexed, backgroundclose) - if path.endswith('last-message.txt'): - # Create a wrapper like codecs.open does: - info = codecs.lookup("utf-8") - fp = codecs.StreamReaderWriter(fp, info.streamreader, info.streamwriter, 'strict') - fp.encoding = 'utf-8' - return fp - -old_vfs_call = scmutil.vfs.__call__ -scmutil.vfs.__call__ = better_vfs_call - - - From 110415dcba5f7091d12e73a3c191b316cd1044d1 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:21:07 +0200 Subject: [PATCH 12/93] 'better' commit message when no message from author --- rmaint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rmaint.py b/rmaint.py index d5edec3..8174848 100644 --- a/rmaint.py +++ b/rmaint.py @@ -189,7 +189,7 @@ def commitNext(self): parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None # There are also problems when parent page gets renamed -- see updateChildren - # If the page is tracked and its name just changed, tell HG + # If the page is tracked and its name just changed, tell Git rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: if self.debug: @@ -221,7 +221,7 @@ def commitNext(self): if rev['comment'] != '': commit_msg = rev_unixname + ': ' + rev['comment'] else: - commit_msg = rev_unixname + commit_msg = 'Updated ' + rev_unixname + ' (no message)' if rev['date']: parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time) From 134fc1b57f792e2d7fd48e687397eaea62bf7b0a Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:21:23 +0200 Subject: [PATCH 13/93] update readme explaining it now uses git --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 2458933..755fdc1 100644 --- a/readme.md +++ b/readme.md @@ -4,7 +4,7 @@ This is a Python command line client for relatively popular wiki hosting http:// * See all revisions of a page * Query page source -Most interestingly, it allows you to download the whole site as a Mercurial repository, with proper commit dates and comments! +Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, author and comments! ##### Examples: From f397b8ccfba816d55005a0e593300777035d6993 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 15:22:53 +0200 Subject: [PATCH 14/93] .hgignore -> .gitignore --- .gitignore | 2 ++ .hgignore | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 .gitignore delete mode 100644 .hgignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a295864 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +__pycache__ diff --git a/.hgignore b/.hgignore deleted file mode 100644 index a26d142..0000000 --- a/.hgignore +++ /dev/null @@ -1,2 +0,0 @@ -syntax:glob -*.pyc From 3e6e9f1d3be09a1f2b6222daeab99ee568f8f486 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 17:30:49 +0200 Subject: [PATCH 15/93] actually commit changes --- rmaint.py | 21 ++++++++++++++------- wikidot.py | 2 ++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/rmaint.py b/rmaint.py index 8174848..b6ed5c9 100644 --- a/rmaint.py +++ b/rmaint.py @@ -193,10 +193,10 @@ def commitNext(self): rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: if self.debug: - print("moving", str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt') + print("moving", str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt') self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there - self.index.move([str(self.last_names[unixname])+'.txt', +str(rev_unixname)+'.txt']) + self.index.move([str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt']) # Ouput contents fname = rev_unixname+'.txt' @@ -208,31 +208,38 @@ def commitNext(self): outp.write(source) outp.close() + commit_msg = "" + # Add new page if not unixname in self.last_names: # never before seen + commit_msg += "Created " if self.debug: print("adding", fname) + elif rev['comment'] == '': + commit_msg += "Updated " - self.index.add([str(fname)]) - - self.last_names[unixname] = rev_unixname + commit_msg += rev_unixname # Commit if rev['comment'] != '': - commit_msg = rev_unixname + ': ' + rev['comment'] + commit_msg += ': ' + rev['comment'] else: - commit_msg = 'Updated ' + rev_unixname + ' (no message)' + commit_msg += ' (no message)' if rev['date']: parsed_time = time.gmtime(int(rev['date'])) # TODO: assumes GMT commit_date = time.strftime('%Y-%m-%d %H:%M:%S', parsed_time) else: commit_date = None + print(("Commiting: "+str(self.rev_no)+'. '+commit_msg)) username = str(rev['user']) email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename author = Actor(username, email) + + self.index.add([str(fname)]) + self.last_names[unixname] = rev_unixname commit = self.index.commit(commit_msg, author=author, commit_date=commit_date) self.rev_no += 1 diff --git a/wikidot.py b/wikidot.py index ba34f39..977c286 100644 --- a/wikidot.py +++ b/wikidot.py @@ -72,6 +72,8 @@ def list_pages(self, limit): pages = [] for entry in soup.div.p.text.split('\n'): pages.append(entry) + if self.debug: + print('Pages found:', len(pages)) return pages From 61838a1b41cdf5e47ca3ff439a6cdb5de61222bf Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 21 Jul 2019 18:17:37 +0200 Subject: [PATCH 16/93] handle more than 250 pages --- wikidot.py | 64 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/wikidot.py b/wikidot.py index 977c286..39c61f8 100644 --- a/wikidot.py +++ b/wikidot.py @@ -26,7 +26,7 @@ def _wait_request_slot(self): pass # Makes a Wikidot AJAX query. Returns the response+title or throws an error. - def queryex(self, params): + def queryex(self, params, urlAppend = None): token = "".join(random.choice('abcdefghijklmnopqrstuvwxyz0123456789') for i in range(8)) cookies = {"wikidot_token7": token} params['wikidot_token7'] = token @@ -36,17 +36,22 @@ def queryex(self, params): print(cookies) self._wait_request_slot() - req = requests.request('POST', self.site+'/ajax-module-connector.php', data=params, cookies=cookies) + url = self.site+'/ajax-module-connector.php' + if urlAppend is not None: + url += urlAppend + print('url', url) + req = requests.request('POST', url, data=params, cookies=cookies) json = req.json() + print(json) if json['status'] == 'ok': return json['body'], (json['title'] if 'title' in json else '') else: raise req.text # Same but only returns the body, most responses don't have titles - def query(self, params): - return self.queryex(params)[0] + def query(self, params, urlAppend = None): + return self.queryex(params, urlAppend)[0] # List all pages for the site. @@ -54,26 +59,61 @@ def query(self, params): # Raw version # For the supported formats (module_body) see: # See https://github.com/gabrys/wikidot/blob/master/php/modules/list/ListPagesModule.php - def list_pages_raw(self, limit): + def list_pages_raw(self, limit, offset): res = self.query({ 'moduleName': 'list/ListPagesModule', 'limit': limit if limit else '10000', 'perPage': limit if limit else '10000', 'module_body': '%%page_unix_name%%', 'separate': 'false', + 'p': str(offset), 'order': 'dateCreatedDesc', # This way limit makes sense. This is also the default - }) + }, '/p/' + str(offset)) return res # Client version def list_pages(self, limit): - raw = self.list_pages_raw(limit).replace('
',"\n") - soup = BeautifulSoup(raw, 'html.parser') + offset = 1 pages = [] - for entry in soup.div.p.text.split('\n'): - pages.append(entry) - if self.debug: - print('Pages found:', len(pages)) + + while True: + raw = self.list_pages_raw(limit, offset).replace('
',"\n") + soup = BeautifulSoup(raw, 'html.parser') + + + for entry in soup.div.p.text.split('\n'): + pages.append(entry) + if self.debug: + print('Pages found:', len(pages)) + + targets = soup.find_all('span','target') + if len(targets) < 2: + print("unable to find next target") + break + + next_url = targets[-1].a.get('href').split('/') + if len(next_url) > 0 and next_url[-1].isnumeric(): + next_page = int(next_url[-1]) + print('next page', next_page) + else: + print("invalid next url", next_url) + break + + #next_page = int(targets[0].a.text) + + current_spans = soup.find_all('span','current') + if len(current_spans) > 0: + current_page = int(current_spans[0].text) + print('current page', current_page) + else: + print("unable to find current page") + break; + + if next_page != offset + 1: + print('next page is wrong', next_page) + break + + offset += 1 return pages From c63bf3bc1acfe8e62cc4d273cd47335fc2b94c49 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 27 Jul 2019 12:13:02 +0200 Subject: [PATCH 17/93] cache fetched pages --- rmaint.py | 81 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/rmaint.py b/rmaint.py index b6ed5c9..70993c6 100644 --- a/rmaint.py +++ b/rmaint.py @@ -55,6 +55,11 @@ def loadWRevs(self): self.wrevs = pickle.load(fp) fp.close() + def savePages(self, pages): + fp = open(self.path+'/.pages', 'wb') + pickle.dump(pages, fp) + fp.close() + # # Compiles a combined revision list for a given set of pages, or all pages on the site. # pages: compile history for these pages @@ -68,32 +73,72 @@ def buildRevisionList(self, pages = None, depth = 10000): print("Loading cached revision list...") self.loadWRevs() else: - print("Building revision list...") + self.wrevs = [] + print('no wrevs') + + print("Building revision list...") + if not pages: + if os.path.isfile(self.path+'/.pages'): + print('loading fetched pages') + fp = open(self.path+'/.pages', 'rb') + pages = pickle.load(fp) + fp.close() + + print('need to fetch pages') if not pages: pages = self.wd.list_pages(10000) - self.wrevs = [] - for page in pages: - print(("Querying page: "+page)) - page_id = self.wd.get_page_id(page) - print(("ID: "+str(page_id))) - revs = self.wd.get_revisions(page_id, depth) - print(("Revisions: "+str(len(revs)))) - for rev in revs: - self.wrevs.append({ - 'page_id' : page_id, - 'page_name' : page, # name atm, not at revision time - 'rev_id' : rev['id'], - 'date' : rev['date'], - 'user' : rev['user'], - 'comment' : rev['comment'], - }) + self.savePages(pages) + + + fetched_pages = [] + + for wrev in self.wrevs: + page_name = wrev['page_name'] + + if page_name in fetched_pages: + continue + + fetched_pages.append(page_name) + + print("fetched " + str(len(fetched_pages)) + " of " + str(len(pages))) + + #self.wrevs = [] + fetched = 0 + for page in pages: + if page in fetched_pages: + print('already fetched', page) + continue + + print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages))) + fetched += 1 + page_id = self.wd.get_page_id(page) + print(("ID: "+str(page_id))) + if page_id is None: + print('page lost', page) + continue + + revs = self.wd.get_revisions(page_id, depth) + print(("Revisions: "+str(len(revs)))) + for rev in revs: + self.wrevs.append({ + 'page_id' : page_id, + 'page_name' : page, # name atm, not at revision time + 'rev_id' : rev['id'], + 'date' : rev['date'], + 'user' : rev['user'], + 'comment' : rev['comment'], + }) self.saveWRevs() # Save a cached copy - print("") + self.saveWRevs() # Save a cached copy + os.remove(self.path+'/.pages') + print("") print(("Total revisions: "+str(len(self.wrevs)))) print("Sorting revisions...") + print(self.wrevs[0]) + print(self.wrevs[0]['date']) self.wrevs.sort(key=lambda rev: rev['date']) print("") From 92a05100fd998132d52413bc69b7c8c3cf915dfe Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 27 Jul 2019 12:13:26 +0200 Subject: [PATCH 18/93] less debug spam, fix exception for python3 compatibility --- wikidot.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/wikidot.py b/wikidot.py index 39c61f8..a6d38e7 100644 --- a/wikidot.py +++ b/wikidot.py @@ -39,15 +39,14 @@ def queryex(self, params, urlAppend = None): url = self.site+'/ajax-module-connector.php' if urlAppend is not None: url += urlAppend - print('url', url) + req = requests.request('POST', url, data=params, cookies=cookies) json = req.json() - print(json) if json['status'] == 'ok': return json['body'], (json['title'] if 'title' in json else '') else: - raise req.text + raise Exception(req.text) # Same but only returns the body, most responses don't have titles def query(self, params, urlAppend = None): From 149136d2f11f608b4c514543ead5697ab2cba21b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 27 Jul 2019 12:13:41 +0200 Subject: [PATCH 19/93] missing declaration --- wikidot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wikidot.py b/wikidot.py index a6d38e7..53a0c69 100644 --- a/wikidot.py +++ b/wikidot.py @@ -173,6 +173,7 @@ def get_revisions(self, page_id, limit): # Username in a last
under user_span = tr.find("span", attrs={"class": "printuser"}) + last_a = None for last_a in user_span.find_all('a'): pass rev_user = last_a.getText() if last_a else None From dd0738a6e912795713ae0e34e9bcc9c2a42f972a Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 27 Jul 2019 12:14:34 +0200 Subject: [PATCH 20/93] less debug spam, fix skipping already fetched --- rmaint.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index 70993c6..89060a5 100644 --- a/rmaint.py +++ b/rmaint.py @@ -106,7 +106,11 @@ def buildRevisionList(self, pages = None, depth = 10000): fetched = 0 for page in pages: if page in fetched_pages: - print('already fetched', page) + #print('already fetched', page) + continue + + if page == "sandbox": + print("Skipping", page) continue print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages))) From 335f1c8726e1e1c015e093c92ff1875f092a2ae0 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 27 Jul 2019 12:14:50 +0200 Subject: [PATCH 21/93] check for .git when checking if there's an existing repo --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index 89060a5..5d3f00d 100644 --- a/rmaint.py +++ b/rmaint.py @@ -184,7 +184,7 @@ def openRepo(self): self.last_names = {} # Tracks page renames: name atm -> last name in repo self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - if os.path.isfile(self.path+'/.wstate'): + if os.path.isfile(self.path+'/.git'): print("Continuing from aborted dump state...") self.loadState() self.repo = Repo(self.path) From f6bd4e78db7550d81ac7941ef8c9bb3fd96bc926 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 27 Jul 2019 15:00:41 +0200 Subject: [PATCH 22/93] fix renames --- rmaint.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/rmaint.py b/rmaint.py index 5d3f00d..d696b2b 100644 --- a/rmaint.py +++ b/rmaint.py @@ -134,7 +134,10 @@ def buildRevisionList(self, pages = None, depth = 10000): }) self.saveWRevs() # Save a cached copy self.saveWRevs() # Save a cached copy - os.remove(self.path+'/.pages') + + if os.path.isfile(self.path+'/.pages'): + os.remove(self.path+'/.pages') + print("") @@ -239,16 +242,24 @@ def commitNext(self): # There are also problems when parent page gets renamed -- see updateChildren # If the page is tracked and its name just changed, tell Git + fname = str(rev_unixname) + '.txt' rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) + if rename: + name_rename_from = str(self.last_names[unixname])+'.txt' + if self.debug: - print("moving", str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt') + print("moving", name_rename_from, "to", fname) self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there - self.index.move([str(self.last_names[unixname])+'.txt', str(rev_unixname)+'.txt']) + + # Try to do the best we can, these situations usually stem from vandalism people have cleaned up + if os.path.isfile(self.path + '/' + name_rename_from): + self.index.move([name_rename_from, fname], force=True) + else: + print("source file does not exist, probably deleted or renamed from already", name_rename_from) # Ouput contents - fname = rev_unixname+'.txt' outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") if details['title']: outp.write('title:'+details['title']+'\n') From 974ddb0f4495bec15cfd38a91c8cd2b418df7caf Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 14:17:06 +0200 Subject: [PATCH 23/93] fix cleanup, store fetched IDs so we don't fetch again later --- rmaint.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/rmaint.py b/rmaint.py index d696b2b..d056964 100644 --- a/rmaint.py +++ b/rmaint.py @@ -33,6 +33,7 @@ def __init__(self, wikidot, path): # Internal state self.wrevs = None # Compiled wikidot revision list (history) + self.fetcheds_revids = [] # Compiled wikidot revision list (history) self.rev_no = 0 # Next revision to process self.last_names = {} # Tracks page renames: name atm -> last name in repo @@ -60,6 +61,15 @@ def savePages(self, pages): pickle.dump(pages, fp) fp.close() + def saveFetched(self): + fp = open(self.path+'/.fetched', 'wb') + pickle.dump(self.fetched_revids, fp) + fp.close() + + def loadFetched(self): + fp = open(self.path+'/.fetched', 'rb') + self.fetched_revids = pickle.load(fp) + fp.close() # # Compiles a combined revision list for a given set of pages, or all pages on the site. # pages: compile history for these pages @@ -76,6 +86,11 @@ def buildRevisionList(self, pages = None, depth = 10000): self.wrevs = [] print('no wrevs') + if os.path.isfile(self.path+'/.fetched'): + loadFetched() + else: + self.fetched_revids = [] + print("Building revision list...") if not pages: if os.path.isfile(self.path+'/.pages'): @@ -124,6 +139,9 @@ def buildRevisionList(self, pages = None, depth = 10000): revs = self.wd.get_revisions(page_id, depth) print(("Revisions: "+str(len(revs)))) for rev in revs: + if rev['id'] in self.fetched_revids: + continue + self.wrevs.append({ 'page_id' : page_id, 'page_name' : page, # name atm, not at revision time @@ -133,10 +151,6 @@ def buildRevisionList(self, pages = None, depth = 10000): 'comment' : rev['comment'], }) self.saveWRevs() # Save a cached copy - self.saveWRevs() # Save a cached copy - - if os.path.isfile(self.path+'/.pages'): - os.remove(self.path+'/.pages') print("") @@ -306,7 +320,11 @@ def commitNext(self): if self.debug: print('committed', commit.name_rev, 'by', author) + self.fetched_revids.append(rev['rev_id']) + self.saveFetched() + self.saveState() # Update operation state + return True @@ -347,3 +365,7 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna def cleanup(self): os.remove(self.path+'/.wstate') os.remove(self.path+'/.wrevs') + + if os.path.isfile(self.path+'/.pages'): + os.remove(self.path+'/.pages') + From 7bf42b30b9f66da67a58826c582ca38fd648342e Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 14:17:19 +0200 Subject: [PATCH 24/93] more verbose output when returned json fails to parse --- wikidot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wikidot.py b/wikidot.py index 53a0c69..2bc5eb4 100644 --- a/wikidot.py +++ b/wikidot.py @@ -41,7 +41,12 @@ def queryex(self, params, urlAppend = None): url += urlAppend req = requests.request('POST', url, data=params, cookies=cookies) - json = req.json() + try: + json = req.json() + except JSONDecodeError as e: + print(e, req, url, params) + raise e + #print(json) if json['status'] == 'ok': return json['body'], (json['title'] if 'title' in json else '') From 04e93240a19ab3de327b1358e1c69566c32e9bb6 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 14:43:00 +0200 Subject: [PATCH 25/93] fix storing/skipping of already fetched revisions --- rmaint.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index d056964..7f136c3 100644 --- a/rmaint.py +++ b/rmaint.py @@ -87,7 +87,7 @@ def buildRevisionList(self, pages = None, depth = 10000): print('no wrevs') if os.path.isfile(self.path+'/.fetched'): - loadFetched() + self.loadFetched() else: self.fetched_revids = [] @@ -140,6 +140,7 @@ def buildRevisionList(self, pages = None, depth = 10000): print(("Revisions: "+str(len(revs)))) for rev in revs: if rev['id'] in self.fetched_revids: + print(rev['id'], 'already fetched') continue self.wrevs.append({ @@ -229,6 +230,14 @@ def commitNext(self): return False rev = self.wrevs[self.rev_no] + + if rev['rev_id'] in self.fetched_revids: + print(rev['rev_id'], 'already fetched') + self.rev_no += 1 + + self.saveState() # Update operation state + return True + source = self.wd.get_revision_source(rev['rev_id']) # Page title and unix_name changes are only available through another request: details = self.wd.get_revision_version(rev['rev_id']) From ab82f222570f2e2c62eb678015061028d7422c7c Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 14:43:16 +0200 Subject: [PATCH 26/93] track renames with symlinks --- rmaint.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rmaint.py b/rmaint.py index 7f136c3..41a8604 100644 --- a/rmaint.py +++ b/rmaint.py @@ -266,6 +266,12 @@ def commitNext(self): # If the page is tracked and its name just changed, tell Git fname = str(rev_unixname) + '.txt' + + # We track renames as symlinks to try to emulate how it handles redirects from old to new names + # But if it is overwritten, don't write into the symlinked file, create a new one + if os.path.islink(self.path + '/' + fname): + os.remove(self.path + '/' + fname) + rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: @@ -279,6 +285,10 @@ def commitNext(self): # Try to do the best we can, these situations usually stem from vandalism people have cleaned up if os.path.isfile(self.path + '/' + name_rename_from): self.index.move([name_rename_from, fname], force=True) + + # Because the wiki redirects + os.symlink(fname, self.path + '/' + name_rename_from) + self.index.add([name_rename_from]) else: print("source file does not exist, probably deleted or renamed from already", name_rename_from) From bf568d512924a77942a45d32893a2d20fe469295 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 15:05:27 +0200 Subject: [PATCH 27/93] avoid making the terminal backlog useless when scraping scp --- rmaint.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rmaint.py b/rmaint.py index 41a8604..c7d0bf8 100644 --- a/rmaint.py +++ b/rmaint.py @@ -165,10 +165,13 @@ def buildRevisionList(self, pages = None, depth = 10000): print("") if self.debug: - print("Revision list: ") - for rev in self.wrevs: - print((str(rev)+"\n")) - print("") + if len(self.wrevs) < 100: + print("Revision list: ") + for rev in self.wrevs: + print((str(rev)+"\n")) + print("") + else: + print("Too many revisions, not printing everything") # From 8850e83cf6e6009eeee15f0b56be09aaa8b701c1 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 15:06:43 +0200 Subject: [PATCH 28/93] Revert "track renames with symlinks" Renames don't automatically redirect, so stop trying to emulate that. Instead we properly track redirect pages. This reverts commit ab82f222570f2e2c62eb678015061028d7422c7c. --- rmaint.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/rmaint.py b/rmaint.py index c7d0bf8..e195b5f 100644 --- a/rmaint.py +++ b/rmaint.py @@ -269,12 +269,6 @@ def commitNext(self): # If the page is tracked and its name just changed, tell Git fname = str(rev_unixname) + '.txt' - - # We track renames as symlinks to try to emulate how it handles redirects from old to new names - # But if it is overwritten, don't write into the symlinked file, create a new one - if os.path.islink(self.path + '/' + fname): - os.remove(self.path + '/' + fname) - rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) if rename: @@ -288,10 +282,6 @@ def commitNext(self): # Try to do the best we can, these situations usually stem from vandalism people have cleaned up if os.path.isfile(self.path + '/' + name_rename_from): self.index.move([name_rename_from, fname], force=True) - - # Because the wiki redirects - os.symlink(fname, self.path + '/' + name_rename_from) - self.index.add([name_rename_from]) else: print("source file does not exist, probably deleted or renamed from already", name_rename_from) From d129d91ddfa4490f9d6bcba9d6f8a2e454b6ecf1 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 15:07:34 +0200 Subject: [PATCH 29/93] track redirect pages correctly --- wikidot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikidot.py b/wikidot.py index 2bc5eb4..885719c 100644 --- a/wikidot.py +++ b/wikidot.py @@ -127,7 +127,7 @@ def list_pages(self, limit): def get_page_id(self, page_unix_name): # The only freaking way to get page ID is to load the page! Wikidot! self._wait_request_slot() - req = requests.request('GET', self.site+'/'+page_unix_name) + req = requests.request('GET', self.site+'/'+page_unix_name + '/noredirect/true') soup = BeautifulSoup(req.text, 'html.parser') for item in soup.head.find_all('script'): text = item.text From 23e6412ae787ce58e1ddf9a46da53974fb9eb6eb Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 28 Jul 2019 15:16:22 +0200 Subject: [PATCH 30/93] control debug spam --- rmaint.py | 60 ++++++++++++++++++++++++++++++++---------------------- wikidot.py | 22 ++++++++++++++------ 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/rmaint.py b/rmaint.py index e195b5f..68dd275 100644 --- a/rmaint.py +++ b/rmaint.py @@ -84,22 +84,27 @@ def buildRevisionList(self, pages = None, depth = 10000): self.loadWRevs() else: self.wrevs = [] - print('no wrevs') + if self.debug: + print('No existing wrevs') if os.path.isfile(self.path+'/.fetched'): self.loadFetched() else: self.fetched_revids = [] - print("Building revision list...") + if self.debug: + print("Building revision list...") + if not pages: if os.path.isfile(self.path+'/.pages'): - print('loading fetched pages') + print('Loading fetched pages') fp = open(self.path+'/.pages', 'rb') pages = pickle.load(fp) fp.close() - print('need to fetch pages') + if self.debug: + print('Need to fetch pages') + if not pages: pages = self.wd.list_pages(10000) self.savePages(pages) @@ -115,29 +120,35 @@ def buildRevisionList(self, pages = None, depth = 10000): fetched_pages.append(page_name) - print("fetched " + str(len(fetched_pages)) + " of " + str(len(pages))) + if self.debug: + print("Already fetched " + str(len(fetched_pages)) + " of " + str(len(pages))) - #self.wrevs = [] fetched = 0 for page in pages: if page in fetched_pages: #print('already fetched', page) continue + # TODO: more generic blacklisting if page == "sandbox": - print("Skipping", page) + if self.debug: + print("Skipping", page) continue - print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages))) + if self.debug: + print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages))) fetched += 1 page_id = self.wd.get_page_id(page) - print(("ID: "+str(page_id))) + + if self.debug: + print(("ID: "+str(page_id))) + if page_id is None: - print('page lost', page) + print('Page gone?', page) continue revs = self.wd.get_revisions(page_id, depth) - print(("Revisions: "+str(len(revs)))) + print("Revisions to fetch: "+str(len(revs))) for rev in revs: if rev['id'] in self.fetched_revids: print(rev['id'], 'already fetched') @@ -154,18 +165,17 @@ def buildRevisionList(self, pages = None, depth = 10000): self.saveWRevs() # Save a cached copy print("") - - + print(("Total revisions: "+str(len(self.wrevs)))) - - print("Sorting revisions...") - print(self.wrevs[0]) - print(self.wrevs[0]['date']) + + if self.debug: + print("Sorting revisions...") + self.wrevs.sort(key=lambda rev: rev['date']) - print("") if self.debug: if len(self.wrevs) < 100: + print("") print("Revision list: ") for rev in self.wrevs: print((str(rev)+"\n")) @@ -235,7 +245,9 @@ def commitNext(self): rev = self.wrevs[self.rev_no] if rev['rev_id'] in self.fetched_revids: - print(rev['rev_id'], 'already fetched') + if self.debug: + print(rev['rev_id'], 'already fetched') + self.rev_no += 1 self.saveState() # Update operation state @@ -275,7 +287,7 @@ def commitNext(self): name_rename_from = str(self.last_names[unixname])+'.txt' if self.debug: - print("moving", name_rename_from, "to", fname) + print("Moving renamed", name_rename_from, "to", fname) self.updateChildren(self.last_names[unixname], rev_unixname) # Update children which reference us -- see comments there @@ -283,7 +295,7 @@ def commitNext(self): if os.path.isfile(self.path + '/' + name_rename_from): self.index.move([name_rename_from, fname], force=True) else: - print("source file does not exist, probably deleted or renamed from already", name_rename_from) + print("Source file does not exist, probably deleted or renamed from already?", name_rename_from) # Ouput contents outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") @@ -300,7 +312,7 @@ def commitNext(self): if not unixname in self.last_names: # never before seen commit_msg += "Created " if self.debug: - print("adding", fname) + print("Adding", fname) elif rev['comment'] == '': commit_msg += "Updated " @@ -317,7 +329,7 @@ def commitNext(self): else: commit_date = None - print(("Commiting: "+str(self.rev_no)+'. '+commit_msg)) + print("Committing: " + str(self.rev_no) + '. '+commit_msg) username = str(rev['user']) email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename @@ -330,7 +342,7 @@ def commitNext(self): self.rev_no += 1 if self.debug: - print('committed', commit.name_rev, 'by', author) + print('Committed', commit.name_rev, 'by', author) self.fetched_revids.append(rev['rev_id']) self.saveFetched() diff --git a/wikidot.py b/wikidot.py index 885719c..07d44ff 100644 --- a/wikidot.py +++ b/wikidot.py @@ -44,9 +44,8 @@ def queryex(self, params, urlAppend = None): try: json = req.json() except JSONDecodeError as e: - print(e, req, url, params) + print('Failed to parse response from wikidot', e, req, url, params) raise e - #print(json) if json['status'] == 'ok': return json['body'], (json['title'] if 'title' in json else '') @@ -87,18 +86,22 @@ def list_pages(self, limit): for entry in soup.div.p.text.split('\n'): pages.append(entry) + if self.debug: print('Pages found:', len(pages)) targets = soup.find_all('span','target') if len(targets) < 2: - print("unable to find next target") + print("Unable to find next listing page, not enough target spans") break next_url = targets[-1].a.get('href').split('/') if len(next_url) > 0 and next_url[-1].isnumeric(): next_page = int(next_url[-1]) - print('next page', next_page) + + if self.debug: + print('Next listing page', next_page) + else: print("invalid next url", next_url) break @@ -108,16 +111,23 @@ def list_pages(self, limit): current_spans = soup.find_all('span','current') if len(current_spans) > 0: current_page = int(current_spans[0].text) - print('current page', current_page) + + if self.debug: + print('Current listing page', current_page) + else: print("unable to find current page") break; if next_page != offset + 1: - print('next page is wrong', next_page) + if self.debug: + print('Next page is wrong', next_page, 'hopefully at the end') break offset += 1 + + print("Fetching listing page", offset) + return pages From b40b560fb32dc9d0a73aaaf4b399189931cf7c96 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Mon, 29 Jul 2019 10:24:24 +0200 Subject: [PATCH 31/93] more cleaning of debug output --- rmaint.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/rmaint.py b/rmaint.py index 68dd275..a930fee 100644 --- a/rmaint.py +++ b/rmaint.py @@ -102,16 +102,23 @@ def buildRevisionList(self, pages = None, depth = 10000): pages = pickle.load(fp) fp.close() - if self.debug: - print('Need to fetch pages') if not pages: + if self.debug: + print('Need to fetch pages') pages = self.wd.list_pages(10000) self.savePages(pages) + elif self.debug: + print(len(pages), 'pages loaded') fetched_pages = [] + if self.debug: + print('Collecting already pages we already got revisions for') + + # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time) + # Should use a set/hashmap/whatever python calls it for wrev in self.wrevs: page_name = wrev['page_name'] @@ -121,12 +128,11 @@ def buildRevisionList(self, pages = None, depth = 10000): fetched_pages.append(page_name) if self.debug: - print("Already fetched " + str(len(fetched_pages)) + " of " + str(len(pages))) + print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages))) fetched = 0 for page in pages: if page in fetched_pages: - #print('already fetched', page) continue # TODO: more generic blacklisting From 5a90bb195511d7ba627c3520d3f99dcc04ba6ca2 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Mon, 29 Jul 2019 10:24:33 +0200 Subject: [PATCH 32/93] fix dates in commits --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index a930fee..ade4fce 100644 --- a/rmaint.py +++ b/rmaint.py @@ -344,7 +344,7 @@ def commitNext(self): self.index.add([str(fname)]) self.last_names[unixname] = rev_unixname - commit = self.index.commit(commit_msg, author=author, commit_date=commit_date) + commit = self.index.commit(commit_msg, author=author, commit_date=commit_date, author_date=commit_date) self.rev_no += 1 if self.debug: From e487cdd7260a8d42f2177782cfa0c798550e6016 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Mon, 29 Jul 2019 10:30:02 +0200 Subject: [PATCH 33/93] let commit date be the current datetime, it makes more sense --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index ade4fce..4c0bae5 100644 --- a/rmaint.py +++ b/rmaint.py @@ -344,7 +344,7 @@ def commitNext(self): self.index.add([str(fname)]) self.last_names[unixname] = rev_unixname - commit = self.index.commit(commit_msg, author=author, commit_date=commit_date, author_date=commit_date) + commit = self.index.commit(commit_msg, author=author, author_date=commit_date) self.rev_no += 1 if self.debug: From 94fa6ae5bf6561aafc55f94fe5a1297596c942c1 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 4 Aug 2019 14:31:18 +0200 Subject: [PATCH 34/93] python doesn't have this already? --- rmaint.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rmaint.py b/rmaint.py index 4c0bae5..4508c50 100644 --- a/rmaint.py +++ b/rmaint.py @@ -393,9 +393,12 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna # Finalizes the construction process and deletes any temporary files. # def cleanup(self): - os.remove(self.path+'/.wstate') - os.remove(self.path+'/.wrevs') + if os.path.exists(self.path+'/.wstate'): + os.remove(self.path+'/.wstate') - if os.path.isfile(self.path+'/.pages'): + if os.path.exists(self.path+'/.wrevs'): + os.remove(self.path+'/.wrevs') + + if os.path.exists(self.path+'/.pages'): os.remove(self.path+'/.pages') From f9175f3ce941bef932e2a8ce35bdb1dc7785541b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 4 Aug 2019 14:36:12 +0200 Subject: [PATCH 35/93] retry in case of gateway errors, which seem to be semi-frequent and quickly recovered from --- wikidot.py | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/wikidot.py b/wikidot.py index 07d44ff..e02caf4 100644 --- a/wikidot.py +++ b/wikidot.py @@ -14,6 +14,7 @@ def __init__(self, site): self.delay = 200 # Delay between requests in msec self.debug = False # Print debug messages self.next_timeslot = time.clock() # Can call immediately + self.max_retries = 5 # To honor usage rules, we wait for self.delay between requests. @@ -35,22 +36,47 @@ def queryex(self, params, urlAppend = None): print(params) print(cookies) - self._wait_request_slot() url = self.site+'/ajax-module-connector.php' if urlAppend is not None: url += urlAppend - req = requests.request('POST', url, data=params, cookies=cookies) - try: - json = req.json() - except JSONDecodeError as e: - print('Failed to parse response from wikidot', e, req, url, params) - raise e - - if json['status'] == 'ok': - return json['body'], (json['title'] if 'title' in json else '') - else: - raise Exception(req.text) + # In case of e. g. 500 errors + retries = 0 + while retries < self.max_retries: + self._wait_request_slot() + + req = requests.request('POST', url, data=params, cookies=cookies) + + # Usually a 502 error, recovers immediately + if req.status_code >= 500: + retries += 1 + print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + + # In case of debug enabled, we already printed this above + if not self.debug: + print(req, params) + + # Be nice, double wait delay for errors + self._wait_request_slot() + + continue + + try: + # In case of 404 errors or other stuff that indicates + # some bug in how we handle or request things + req.raise_for_status() + json = req.json() + except Exception as e: + print('Failed to get response from wikidot', e, req, url, params) + raise e + + if json['status'] == 'ok': + return json['body'], (json['title'] if 'title' in json else '') + else: + raise Exception(req.text) + + print('Failed too many times', url, params, cookies) + raise Exception('Failed too many times for ' + url) # Same but only returns the body, most responses don't have titles def query(self, params, urlAppend = None): From 7070b38ac2e61a2434e8bf5726b852e729bc6a0e Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 4 Aug 2019 15:33:36 +0200 Subject: [PATCH 36/93] disable removing state tracking files, we want them if we continously update our mirror --- crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl.py b/crawl.py index 9fd0bab..25d9609 100644 --- a/crawl.py +++ b/crawl.py @@ -105,5 +105,5 @@ def force_dirs(path): while rm.commitNext(): pass - rm.cleanup() + # rm.cleanup() print("Done.") From d103e4db1eff7dbd890c431892ecc821bd1b123a Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 4 Aug 2019 15:37:47 +0200 Subject: [PATCH 37/93] improve tracking of created files (not entirely sure why it didn't work), commit message when renaming --- rmaint.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/rmaint.py b/rmaint.py index 4508c50..a746e52 100644 --- a/rmaint.py +++ b/rmaint.py @@ -204,9 +204,11 @@ def loadState(self): fp = open(self.path+'/.wstate', 'rb') self.rev_no = pickle.load(fp) self.last_names = pickle.load(fp) + try: self.last_parents = pickle.load(fp) - except EOFError: + except EOFError as e: + print('EOFError while loading wstate', e) pass fp.close() @@ -289,6 +291,8 @@ def commitNext(self): fname = str(rev_unixname) + '.txt' rename = (unixname in self.last_names) and (self.last_names[unixname] != rev_unixname) + commit_msg = "" + if rename: name_rename_from = str(self.last_names[unixname])+'.txt' @@ -300,9 +304,18 @@ def commitNext(self): # Try to do the best we can, these situations usually stem from vandalism people have cleaned up if os.path.isfile(self.path + '/' + name_rename_from): self.index.move([name_rename_from, fname], force=True) + commit_msg += "Renamed from " str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' ' else: print("Source file does not exist, probably deleted or renamed from already?", name_rename_from) + # Add new page + elif not os.path.isfile(self.path + '/' + fname): # never before seen + commit_msg += "Created " + if self.debug: + print("Adding", fname) + elif rev['comment'] == '': + commit_msg += "Updated " + # Ouput contents outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") if details['title']: @@ -312,16 +325,6 @@ def commitNext(self): outp.write(source) outp.close() - commit_msg = "" - - # Add new page - if not unixname in self.last_names: # never before seen - commit_msg += "Created " - if self.debug: - print("Adding", fname) - elif rev['comment'] == '': - commit_msg += "Updated " - commit_msg += rev_unixname # Commit From 1955c2869a445a3e1a1594fb002bb81ecc0dca3f Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 4 Aug 2019 16:48:40 +0200 Subject: [PATCH 38/93] persist metadata (renames etc.) in the git repo --- crawl.py | 2 +- rmaint.py | 69 ++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/crawl.py b/crawl.py index 25d9609..9fd0bab 100644 --- a/crawl.py +++ b/crawl.py @@ -105,5 +105,5 @@ def force_dirs(path): while rm.commitNext(): pass - # rm.cleanup() + rm.cleanup() print("Done.") diff --git a/rmaint.py b/rmaint.py index a746e52..42def44 100644 --- a/rmaint.py +++ b/rmaint.py @@ -4,6 +4,7 @@ import os import codecs import pickle as pickle +import json # git stuff from git import Repo, Actor @@ -61,15 +62,31 @@ def savePages(self, pages): pickle.dump(pages, fp) fp.close() - def saveFetched(self): - fp = open(self.path+'/.fetched', 'wb') - pickle.dump(self.fetched_revids, fp) + def appendFetchedRevid(self, revid): + fp = open(self.path+'/.fetched.txt', 'a') + fp.write(revid + '\n') fp.close() - def loadFetched(self): - fp = open(self.path+'/.fetched', 'rb') - self.fetched_revids = pickle.load(fp) + def loadFetchedRevids(self): + self.fetched_revids = [line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')] + + # Persistent metadata about the repo: + # - Tracks page renames: name atm -> last name in repo + # - Tracks page parent names: name atm -> last parent in repo + def saveMetadata(self): + metadata = {'names': self.last_names, 'parents': self.last_parents } + fp = open(self.path+'/.metadata.json', 'w') + json.dump(metadata, fp) + fp.close() + + def loadMetadata(self): + fp = open(self.path+'/.metadata.json', 'r') + metadata = json.load(fp) + self.last_names = metadata['names'] + self.last_parents = metadata['parents'] fp.close() + + self.loadFetchedRevids() # # Compiles a combined revision list for a given set of pages, or all pages on the site. # pages: compile history for these pages @@ -87,8 +104,9 @@ def buildRevisionList(self, pages = None, depth = 10000): if self.debug: print('No existing wrevs') - if os.path.isfile(self.path+'/.fetched'): - self.loadFetched() + if os.path.isfile(self.path+'/.fetched.txt'): + self.loadFetchedRevids() + print(self.fetched_revids) else: self.fetched_revids = [] @@ -111,7 +129,6 @@ def buildRevisionList(self, pages = None, depth = 10000): elif self.debug: print(len(pages), 'pages loaded') - fetched_pages = [] if self.debug: @@ -170,6 +187,9 @@ def buildRevisionList(self, pages = None, depth = 10000): }) self.saveWRevs() # Save a cached copy + if os.path.isfile(self.path+'/.metadata.json'): + self.loadMetadata() + print("") print(("Total revisions: "+str(len(self.wrevs)))) @@ -196,20 +216,11 @@ def buildRevisionList(self, pages = None, depth = 10000): def saveState(self): fp = open(self.path+'/.wstate', 'wb') pickle.dump(self.rev_no, fp) - pickle.dump(self.last_names, fp) - pickle.dump(self.last_parents, fp) fp.close() def loadState(self): fp = open(self.path+'/.wstate', 'rb') self.rev_no = pickle.load(fp) - self.last_names = pickle.load(fp) - - try: - self.last_parents = pickle.load(fp) - except EOFError as e: - print('EOFError while loading wstate', e) - pass fp.close() @@ -304,7 +315,7 @@ def commitNext(self): # Try to do the best we can, these situations usually stem from vandalism people have cleaned up if os.path.isfile(self.path + '/' + name_rename_from): self.index.move([name_rename_from, fname], force=True) - commit_msg += "Renamed from " str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' ' + commit_msg += "Renamed from " + str(self.last_names[unixname]) + ' to ' + str(rev_unixname) + ' ' else: print("Source file does not exist, probably deleted or renamed from already?", name_rename_from) @@ -316,6 +327,8 @@ def commitNext(self): elif rev['comment'] == '': commit_msg += "Updated " + self.last_names[unixname] = rev_unixname + # Ouput contents outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") if details['title']: @@ -340,22 +353,23 @@ def commitNext(self): print("Committing: " + str(self.rev_no) + '. '+commit_msg) + # Include metadata in the commit (if changed) + self.appendFetchedRevid(rev['rev_id']) + self.saveMetadata() + self.index.add([str(fname), '.metadata.json']) + username = str(rev['user']) email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename - author = Actor(username, email) - self.index.add([str(fname)]) - self.last_names[unixname] = rev_unixname commit = self.index.commit(commit_msg, author=author, author_date=commit_date) - self.rev_no += 1 if self.debug: print('Committed', commit.name_rev, 'by', author) self.fetched_revids.append(rev['rev_id']) - self.saveFetched() + self.rev_no += 1 self.saveState() # Update operation state return True @@ -398,10 +412,17 @@ def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixna def cleanup(self): if os.path.exists(self.path+'/.wstate'): os.remove(self.path+'/.wstate') + else: + print("wstate does not exist?") if os.path.exists(self.path+'/.wrevs'): os.remove(self.path+'/.wrevs') + else: + print("wrevs does not exist?") if os.path.exists(self.path+'/.pages'): os.remove(self.path+'/.pages') + if self.rev_no > 0: + self.index.add(['.fetched.txt']) + self.index.commit('Updating fetched revisions') From 3d08cc21872b752fd3c729ba51249276f4d687ea Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 10:46:08 +0200 Subject: [PATCH 39/93] add dependencies to readme --- readme.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 755fdc1..c13d967 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,7 @@ -This is a Python command line client for relatively popular wiki hosting http://www.wikidot.com which lets you: +*This is a fork to make a permanent backup of the SCP wiki.* + +This is a Python command line client for relatively popular wiki hosting +http://www.wikidot.com which lets you: * List all pages on a site * See all revisions of a page @@ -6,6 +9,14 @@ This is a Python command line client for relatively popular wiki hosting http:// Most interestingly, it allows you to download the whole site as a Git repository, with proper commit dates, author and comments! +##### Dependencies + +At least: + +* Python 3 +* python-beautifulsoup4 +* python-gitpython + ##### Examples: crawl.py http://example.wikidot.com --dump ExampleRepo From b29c0eee94949600e0e2f9c88fb8ca6f50d491fd Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 10:47:54 +0200 Subject: [PATCH 40/93] python's time.clock() is gone --- wikidot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wikidot.py b/wikidot.py index e02caf4..2595bdc 100644 --- a/wikidot.py +++ b/wikidot.py @@ -13,14 +13,14 @@ def __init__(self, site): self.sitename = urlparse(site).hostname.lower() self.delay = 200 # Delay between requests in msec self.debug = False # Print debug messages - self.next_timeslot = time.clock() # Can call immediately + self.next_timeslot = time.process_time() # Can call immediately self.max_retries = 5 # To honor usage rules, we wait for self.delay between requests. # Low-level query functions call this before every request to Wikidot./ def _wait_request_slot(self): - tm = time.clock() + tm = time.process_time() if self.next_timeslot - tm > 0: time.sleep(self.next_timeslot - tm) self.next_timeslot = tm + self.delay / 1000 From bcf32407d2c8cc2e084edc1ff773631355f8997c Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 11:01:42 +0200 Subject: [PATCH 41/93] bs (appropriate name) apparently has changed its API --- wikidot.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wikidot.py b/wikidot.py index 2595bdc..bbf6fb0 100644 --- a/wikidot.py +++ b/wikidot.py @@ -166,7 +166,11 @@ def get_page_id(self, page_unix_name): req = requests.request('GET', self.site+'/'+page_unix_name + '/noredirect/true') soup = BeautifulSoup(req.text, 'html.parser') for item in soup.head.find_all('script'): - text = item.text + text = item.string + if text is None: + print("No text in script item", item) + continue + pos = text.find("WIKIREQUEST.info.pageId = ") if pos >= 0: pos += len("WIKIREQUEST.info.pageId = ") From 817c4d050a701adb020543faae6a16f5ff3d7288 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 11:02:00 +0200 Subject: [PATCH 42/93] avoid double / in the URLs --- wikidot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wikidot.py b/wikidot.py index bbf6fb0..29b6c65 100644 --- a/wikidot.py +++ b/wikidot.py @@ -10,6 +10,10 @@ class Wikidot: def __init__(self, site): self.site = site # Wikidot site to query + + # strip out trailing /, if it exists + if self.site[-1] == '/': + self.site = self.site[:-1] self.sitename = urlparse(site).hostname.lower() self.delay = 200 # Delay between requests in msec self.debug = False # Print debug messages From fd918ad8116d9094f18a97374765eb926b6a3d1b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 11:02:19 +0200 Subject: [PATCH 43/93] print URLs we fetch in debug output --- wikidot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wikidot.py b/wikidot.py index 29b6c65..e0b4e26 100644 --- a/wikidot.py +++ b/wikidot.py @@ -167,7 +167,12 @@ def list_pages(self, limit): def get_page_id(self, page_unix_name): # The only freaking way to get page ID is to load the page! Wikidot! self._wait_request_slot() - req = requests.request('GET', self.site+'/'+page_unix_name + '/noredirect/true') + url = self.site+'/'+page_unix_name + '/noredirect/true'; + + if self.debug: + print("fetching", url) + + req = requests.request('GET', url) soup = BeautifulSoup(req.text, 'html.parser') for item in soup.head.find_all('script'): text = item.string From 2fcec563645f7db1d3396b5966adda8ea6065178 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 12:12:53 +0200 Subject: [PATCH 44/93] try to have more robust fetching (longer waiting on errors) --- wikidot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wikidot.py b/wikidot.py index e0b4e26..828ced8 100644 --- a/wikidot.py +++ b/wikidot.py @@ -63,6 +63,10 @@ def queryex(self, params, urlAppend = None): # Be nice, double wait delay for errors self._wait_request_slot() + # Extra nice, sleep longer (expoential increase), hope for the + # server to recover + time.sleep(retries * retries * self.delay) + continue try: From c4d907ef07a66a866b1a1d8a4d9fb1b788b51343 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 12:13:47 +0200 Subject: [PATCH 45/93] extract list of embedded images --- wikidot.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/wikidot.py b/wikidot.py index 828ced8..4fba067 100644 --- a/wikidot.py +++ b/wikidot.py @@ -279,6 +279,29 @@ def get_revision_version(self, rev_id): res = self.get_revision_version_raw(rev_id) # this has title! soup = BeautifulSoup(res[0], 'html.parser') + images = [] + for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}): + img_src = None + img_name = "" + full_link = img_div.find("a") + if full_link is not None: + # Check if it has a thumbnail, otherwise we can't trust that it is the original + img = full_link.find("img", attrs={"class": "enlarge"}) + if img is not None: + img_src = full_link["href"] + img_name = img["alt"] + + if img_src is None: + img = img_div.find("img") + if img is not None: + img_src = img["src"] + img_name = img["alt"] + + if img_src is not None: + # Just in case, I don't think it ever happens + img_name = img_name.replace("/", "_forward_slash_") + images.append({"src": img_src, "filename": img_name}) + # First table is a flyout with revision details. Remove and study it. unixname = None details = soup.find("div", attrs={"id": "page-version-info"}).extract() @@ -293,4 +316,5 @@ def get_revision_version(self, rev_id): 'unixname': unixname, 'title': res[1], 'content': str(soup), # only content remains + 'images': images, } From 8d0a5eeafe5c8368ee1ff2b6d136d7b7217a22a4 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 14:03:01 +0200 Subject: [PATCH 46/93] fix image downloading (TODO: make it add them in the right commit, now it just downloads them as they appear) --- rmaint.py | 18 +++++++- wikidot.py | 120 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 128 insertions(+), 10 deletions(-) diff --git a/rmaint.py b/rmaint.py index 42def44..5cb56aa 100644 --- a/rmaint.py +++ b/rmaint.py @@ -304,6 +304,8 @@ def commitNext(self): commit_msg = "" + added_file_paths = [] + if rename: name_rename_from = str(self.last_names[unixname])+'.txt' @@ -332,12 +334,14 @@ def commitNext(self): # Ouput contents outp = codecs.open(self.path + '/' + fname, "w", "UTF-8") if details['title']: - outp.write('title:'+details['title']+'\n') + outp.write('title:' + details['title']+'\n') if parent_unixname: outp.write('parent:'+parent_unixname+'\n') outp.write(source) outp.close() + added_file_paths.append(str(fname)) + commit_msg += rev_unixname # Commit @@ -351,12 +355,22 @@ def commitNext(self): else: commit_date = None + got_images = False; + for image in details['images']: + if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']): + got_images = True + # If we do this gitpython barfs on itself + #added_file_paths.append(image['filepath']) + + if got_images: + added_file_paths.append("images") print("Committing: " + str(self.rev_no) + '. '+commit_msg) # Include metadata in the commit (if changed) self.appendFetchedRevid(rev['rev_id']) self.saveMetadata() - self.index.add([str(fname), '.metadata.json']) + added_file_paths.append('.metadata.json') + self.index.add(added_file_paths) username = str(rev['user']) email = re.sub(pattern = r'[^a-zA-Z0-9\-.+]', repl='', string=username).lower() + '@' + self.wd.sitename diff --git a/wikidot.py b/wikidot.py index 4fba067..188f9f5 100644 --- a/wikidot.py +++ b/wikidot.py @@ -2,7 +2,12 @@ import random from bs4 import BeautifulSoup import time -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin +from pprint import pprint +import pathlib +import hashlib +import os +import shutil # Implements various queries to Wikidot engine through its AJAX facilities @@ -20,6 +25,65 @@ def __init__(self, site): self.next_timeslot = time.process_time() # Can call immediately self.max_retries = 5 + # Downloads file if it doesn't exist + def maybe_download_file(self, url, file_path): + self._wait_request_slot() + + path = pathlib.Path(file_path) + if path.exists(): + if self.debug: + print(file_path, "exists, skipping") + return False + + dirpath = path.resolve().relative_to(pathlib.Path.cwd()).parent + os.makedirs(dirpath, exist_ok=True) + + if self.debug: + print("downloading", url, "to" ,file_path, "dirpath", dirpath) + + # In case of e. g. 500 errors + retries = 0 + while retries < self.max_retries: + self._wait_request_slot() + + headers = requests.utils.default_headers() + # Pretty generic user-agent, but we append a unique none for us + # Makes wikimedia happy + headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"}) + req = requests.get(url, stream=True, ) + + if req.status_code >= 500: + retries += 1 + print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + + # In case of debug enabled, we already printed this above + if not self.debug: + print(req) + + # Be nice, double wait delay for errors + self._wait_request_slot() + + # Extra nice, sleep longer (expoential increase), hope for the + # server to recover + time.sleep(retries * retries * self.delay) + + continue + + try: + # In case of 404 errors or other stuff that indicates + # some bug in how we handle or request things + req.raise_for_status() + + req.raw.decode_content = True + with open(file_path, 'wb') as out_file: + shutil.copyfileobj(req.raw, out_file) + + return True + except Exception as e: + print('Failed to download', e, req, url) + raise e + + return False # To honor usage rules, we wait for self.delay between requests. # Low-level query functions call this before every request to Wikidot./ @@ -28,6 +92,7 @@ def _wait_request_slot(self): if self.next_timeslot - tm > 0: time.sleep(self.next_timeslot - tm) self.next_timeslot = tm + self.delay / 1000 + pass # Makes a Wikidot AJAX query. Returns the response+title or throws an error. @@ -90,7 +155,6 @@ def queryex(self, params, urlAppend = None): def query(self, params, urlAppend = None): return self.queryex(params, urlAppend)[0] - # List all pages for the site. # Raw version @@ -181,7 +245,7 @@ def get_page_id(self, page_unix_name): for item in soup.head.find_all('script'): text = item.string if text is None: - print("No text in script item", item) + #print("No text in script item", item) continue pos = text.find("WIKIREQUEST.info.pageId = ") @@ -209,17 +273,25 @@ def get_revisions_raw(self, page_id, limit): }) soup = BeautifulSoup(res, 'html.parser') + print("revisions raw") return soup.table.contents # Client version def get_revisions(self, page_id, limit): revs = [] - for tr in self.get_revisions_raw(page_id, limit): + raw = self.get_revisions_raw(page_id, limit) + for tr in raw: if tr.name != 'tr': continue # there's a header + various junk # RevID is stored as a value of an INPUT field rev_id = tr.input['value'] if tr.input else None if rev_id is None: continue # can't parse + attachment_action = tr.find("span", attrs={"title": "file/attachment action"}) + attached_file = False + if attachment_action is not None: + attached_file = True + #pprint(raw) + print("was attchment", rev_id) # Unixtime is stored as a CSS class time_* rev_date = 0 @@ -228,6 +300,8 @@ def get_revisions(self, page_id, limit): for cls in date_span['class']: if cls.startswith('time_'): rev_date = int(cls[5:]) + else: + print("no odate found") # Username in a last under user_span = tr.find("span", attrs={"class": "printuser"}) @@ -246,6 +320,7 @@ def get_revisions(self, page_id, limit): 'date': rev_date, 'user': rev_user, 'comment': rev_comment, + 'attached_file': attached_file, }) return revs @@ -262,12 +337,18 @@ def get_revision_source(self, rev_id): # - htmlentities # -
s in place of linebreaks # - random real linebreaks (have to be ignored) + if self.debug: + print("revision source:") + #pprint(res) soup = BeautifulSoup(res, 'html.parser') return soup.div.getText().lstrip(' \r\n') # Retrieves the rendered version + additional info unavailable in get_revision_source: # * Title # * Unixname at the time + # + # TODO: I think this could fetch the source as well, so we don't need to + # fetch two pages (the fetch source function above). def get_revision_version_raw(self, rev_id): res = self.queryex({ 'moduleName': 'history/PageVersionModule', @@ -279,6 +360,8 @@ def get_revision_version(self, rev_id): res = self.get_revision_version_raw(rev_id) # this has title! soup = BeautifulSoup(res[0], 'html.parser') + + # Extract list of images images = [] for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}): img_src = None @@ -297,10 +380,31 @@ def get_revision_version(self, rev_id): img_src = img["src"] img_name = img["alt"] - if img_src is not None: - # Just in case, I don't think it ever happens - img_name = img_name.replace("/", "_forward_slash_") - images.append({"src": img_src, "filename": img_name}) + if img_src is None: + continue + + # Just in case, I don't think it ever happens, but resolve '..' + # juuuust in case someone tries to be funny + img_url = urlparse(urljoin(img_src, ".")) + url_path = pathlib.Path(img_url.path) + + img_path = "" + if img_url.netloc != "": + img_path = img_url.netloc + "/" + if img_url.netloc[-1] != '/': + img_path += '/' + + if img_url.path != "" and img_url.path[0] == '/': + img_path += img_url.path[1:] + else: + img_path += img_url.path + + if img_path == "" or img_path[-1] == "/": + img_path += img_name + + images.append({"src": img_src, "filename": img_name, "filepath": "images/" + img_path}) + + # First table is a flyout with revision details. Remove and study it. unixname = None From 2eabf6b127b6d5a5b245cc9291dd19eca05e5d0a Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Tue, 14 Jul 2020 14:16:17 +0200 Subject: [PATCH 47/93] add comment explaining why we can't get the images in the right commit --- wikidot.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/wikidot.py b/wikidot.py index 188f9f5..85a0d9c 100644 --- a/wikidot.py +++ b/wikidot.py @@ -290,7 +290,6 @@ def get_revisions(self, page_id, limit): attached_file = False if attachment_action is not None: attached_file = True - #pprint(raw) print("was attchment", rev_id) # Unixtime is stored as a CSS class time_* @@ -339,7 +338,6 @@ def get_revision_source(self, rev_id): # - random real linebreaks (have to be ignored) if self.debug: print("revision source:") - #pprint(res) soup = BeautifulSoup(res, 'html.parser') return soup.div.getText().lstrip(' \r\n') @@ -362,6 +360,13 @@ def get_revision_version(self, rev_id): # Extract list of images + + # TODO: to get the right revision that added them, we need to go back + # and amend the commits that are flagged as attached_file above, + # because we can't get the image file name or URL reliably until they + # are added to the page source, wikidot itself doesn't store this information. + # So much hassle for little value, we get the empty commits when images + # are added anyways. images = [] for img_div in soup.find_all("div", attrs={"class": "scp-image-block"}): img_src = None From 8afada4ced927180d00c5cadc082202512de5b2b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Fri, 17 Jul 2020 11:23:59 +0200 Subject: [PATCH 48/93] fuck python, this suddendly didn't work on my server --- wikidot.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/wikidot.py b/wikidot.py index 85a0d9c..9ed7d95 100644 --- a/wikidot.py +++ b/wikidot.py @@ -29,13 +29,12 @@ def __init__(self, site): def maybe_download_file(self, url, file_path): self._wait_request_slot() - path = pathlib.Path(file_path) - if path.exists(): + if os.path.exists(file_path): if self.debug: print(file_path, "exists, skipping") return False - dirpath = path.resolve().relative_to(pathlib.Path.cwd()).parent + dirpath = os.path.dirname(file_path) os.makedirs(dirpath, exist_ok=True) if self.debug: From 74923b193149f3c762fea6ff587b4011e731b844 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Fri, 17 Jul 2020 11:24:28 +0200 Subject: [PATCH 49/93] re-try in case of json errors, seems like they are spurious --- wikidot.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/wikidot.py b/wikidot.py index 9ed7d95..5163701 100644 --- a/wikidot.py +++ b/wikidot.py @@ -137,9 +137,19 @@ def queryex(self, params, urlAppend = None): # In case of 404 errors or other stuff that indicates # some bug in how we handle or request things req.raise_for_status() + except Exception as e: + print('Failed to get response from wikidot', e, req, url, params) + + try: json = req.json() except Exception as e: print('Failed to get response from wikidot', e, req, url, params) + if retries < self.max_retries: + retries += 1 + self._wait_request_slot() + time.sleep(retries * retries * self.delay) + continue + raise e if json['status'] == 'ok': From 9ed8019e17f45296ad30005f908d5283a1d110fb Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 18 Jul 2020 12:30:13 +0200 Subject: [PATCH 50/93] less debug spam --- rmaint.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rmaint.py b/rmaint.py index 5cb56aa..3db3084 100644 --- a/rmaint.py +++ b/rmaint.py @@ -171,10 +171,11 @@ def buildRevisionList(self, pages = None, depth = 10000): continue revs = self.wd.get_revisions(page_id, depth) - print("Revisions to fetch: "+str(len(revs))) + print("Revisions to fetch: " + str(len(revs))) + already_fetched = 0 for rev in revs: if rev['id'] in self.fetched_revids: - print(rev['id'], 'already fetched') + already_fetched += 1 continue self.wrevs.append({ @@ -187,6 +188,8 @@ def buildRevisionList(self, pages = None, depth = 10000): }) self.saveWRevs() # Save a cached copy + print("Revisions already fetched", already_fetched) + if os.path.isfile(self.path+'/.metadata.json'): self.loadMetadata() @@ -265,7 +268,7 @@ def commitNext(self): if rev['rev_id'] in self.fetched_revids: if self.debug: - print(rev['rev_id'], 'already fetched') + print(rev['rev_id'], 'already fetched, yet called on to fetch again') self.rev_no += 1 From 49da2cfd17d402c2e0e85e821cdacf6d6e9e11d6 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 18 Jul 2020 13:40:38 +0200 Subject: [PATCH 51/93] don't need to wait for a download slot if we're not downloading --- wikidot.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/wikidot.py b/wikidot.py index 5163701..cb6b52e 100644 --- a/wikidot.py +++ b/wikidot.py @@ -27,13 +27,13 @@ def __init__(self, site): # Downloads file if it doesn't exist def maybe_download_file(self, url, file_path): - self._wait_request_slot() - if os.path.exists(file_path): if self.debug: print(file_path, "exists, skipping") return False + self._wait_request_slot() + dirpath = os.path.dirname(file_path) os.makedirs(dirpath, exist_ok=True) @@ -345,8 +345,6 @@ def get_revision_source(self, rev_id): # - htmlentities # -
s in place of linebreaks # - random real linebreaks (have to be ignored) - if self.debug: - print("revision source:") soup = BeautifulSoup(res, 'html.parser') return soup.div.getText().lstrip(' \r\n') From e6da37708bdbb9fc937c9c2c6ec7eb7ff727cae0 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 18 Jul 2020 13:42:59 +0200 Subject: [PATCH 52/93] better use of named parameters and stuff --- crawl.py | 17 ++++++++++------- rmaint.py | 21 ++++++++++++--------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/crawl.py b/crawl.py index 9fd0bab..3b9f486 100644 --- a/crawl.py +++ b/crawl.py @@ -15,6 +15,7 @@ parser.add_argument('site', help='URL of Wikidot site') # Actions parser.add_argument('--list-pages', action='store_true', help='List all pages on this site') +parser.add_argument('--max-page-count', type=int, default='10000', help='Only list/fetch up to this amount of pages') parser.add_argument('--source', action='store_true', help='Print page source (requires --page)') parser.add_argument('--content', action='store_true', help='Print page content (requires --page)') parser.add_argument('--log', action='store_true', help='Print page revision log (requires --page)') @@ -41,17 +42,17 @@ def force_dirs(path): os.makedirs(path, exist_ok=True) if args.list_pages_raw: - print((wd.list_pages_raw(args.depth))) + print((wd.list_pages_raw(limit = args.max_pages_count))) elif args.list_pages: - for page in wd.list_pages(args.depth): + for page in wd.list_pages(limit = args.max_pages_count): print(page) elif args.source: if not args.page: raise Exception("Please specify --page for --source.") - page_id = wd.get_page_id(args.page) + page_id = wd.get_page_id(page_unix_name=args.page) if not page_id: raise Exception("Page not found: "+args.page) @@ -62,7 +63,7 @@ def force_dirs(path): if not args.page: raise Exception("Please specify --page for --source.") - page_id = wd.get_page_id(args.page) + page_id = wd.get_page_id(page_unix_name=args.page) if not page_id: raise Exception("Page not found: "+args.page) @@ -73,7 +74,7 @@ def force_dirs(path): if not args.page: raise Exception("Please specify --page for --log.") - page_id = wd.get_page_id(args.page) + page_id = wd.get_page_id(page_unix_name=args.page) if not page_id: raise Exception("Page not found: "+args.page) @@ -84,7 +85,7 @@ def force_dirs(path): if not args.page: raise Exception("Please specify --page for --log.") - page_id = wd.get_page_id(args.page) + page_id = wd.get_page_id(page_unix_name=args.page) if not page_id: raise Exception("Page not found: "+args.page) for rev in wd.get_revisions(page_id, args.depth): @@ -98,7 +99,9 @@ def force_dirs(path): rm = RepoMaintainer(wd, args.dump) rm.debug = args.debug rm.storeRevIds = args.revids - rm.buildRevisionList([args.page] if args.page else None, args.depth) + rm.max_depth = args.depth + rm.max_page_count = args.max_page_count + rm.buildRevisionList([args.page] if args.page else None) rm.openRepo() print("Downloading revisions...") diff --git a/rmaint.py b/rmaint.py index 3db3084..9450b0c 100644 --- a/rmaint.py +++ b/rmaint.py @@ -16,7 +16,7 @@ # Usage: # rm = RepoMaintainer(wikidot, path) -# rm.buildRevisionList(pages, depth) +# rm.buildRevisionList(pages) # rm.openRepo() # while rm.commitNext(): # pass @@ -42,6 +42,12 @@ def __init__(self, wikidot, path): self.repo = None # Git repo object self.index = None # Git current index object + self.max_depth = 10000 # download at most this number of revisions + self.max_page_count = 10000 # download at most this number of pages + + self.pbar = None + self.first_fetched = 0 # For progress bar + self.fetched_revids = set() # @@ -90,12 +96,11 @@ def loadMetadata(self): # # Compiles a combined revision list for a given set of pages, or all pages on the site. # pages: compile history for these pages - # depth: download at most this number of revisions. # # If there exists a cached revision list at the repository destination, # it is loaded and no requests are made. # - def buildRevisionList(self, pages = None, depth = 10000): + def buildRevisionList(self, pages = None): if os.path.isfile(self.path+'/.wrevs'): print("Loading cached revision list...") self.loadWRevs() @@ -121,10 +126,10 @@ def buildRevisionList(self, pages = None, depth = 10000): fp.close() - if not pages: + if not pages or len(pages) < self.max_page_count: if self.debug: print('Need to fetch pages') - pages = self.wd.list_pages(10000) + pages = self.wd.list_pages(self.max_page_count) self.savePages(pages) elif self.debug: print(len(pages), 'pages loaded') @@ -170,10 +175,8 @@ def buildRevisionList(self, pages = None, depth = 10000): print('Page gone?', page) continue - revs = self.wd.get_revisions(page_id, depth) - print("Revisions to fetch: " + str(len(revs))) - already_fetched = 0 - for rev in revs: + revs = self.wd.get_revisions(page_id=page_id, limit=max_depth) + for rev in tqdm(revs, desc='Adding revisions from page ' + page_id): if rev['id'] in self.fetched_revids: already_fetched += 1 continue From c2479c959e090aa35461fc1d5a1e38c49c55d0fd Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 18 Jul 2020 13:44:12 +0200 Subject: [PATCH 53/93] add progress bars with tqdm --- crawl.py | 5 ++--- rmaint.py | 25 ++++++++++++++----------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/crawl.py b/crawl.py index 3b9f486..b6a4f2d 100644 --- a/crawl.py +++ b/crawl.py @@ -104,9 +104,8 @@ def force_dirs(path): rm.buildRevisionList([args.page] if args.page else None) rm.openRepo() - print("Downloading revisions...") - while rm.commitNext(): - pass + print("Downloading revisions") + rm.fetchAll() rm.cleanup() print("Done.") diff --git a/rmaint.py b/rmaint.py index 9450b0c..80d8da4 100644 --- a/rmaint.py +++ b/rmaint.py @@ -11,6 +11,8 @@ import time # For parsing unix epoch timestamps from wikidot and convert to normal timestamps import re # For sanitizing usernames to fake email addresses +from tqdm import tqdm # for progress bar + # Repository builder and maintainer # Contains logic for actual loading and maintaining the repository over the course of its construction. @@ -141,7 +143,7 @@ def buildRevisionList(self, pages = None): # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time) # Should use a set/hashmap/whatever python calls it - for wrev in self.wrevs: + for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'): page_name = wrev['page_name'] if page_name in fetched_pages: @@ -153,7 +155,7 @@ def buildRevisionList(self, pages = None): print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages))) fetched = 0 - for page in pages: + for page in tqdm(pages, desc='Updating list of revisions to fetch'): if page in fetched_pages: continue @@ -178,12 +180,11 @@ def buildRevisionList(self, pages = None): revs = self.wd.get_revisions(page_id=page_id, limit=max_depth) for rev in tqdm(revs, desc='Adding revisions from page ' + page_id): if rev['id'] in self.fetched_revids: - already_fetched += 1 continue self.wrevs.append({ 'page_id' : page_id, - 'page_name' : page, # name atm, not at revision time + 'page_name' : page, # current name, not at revision time (revisions can rename them) 'rev_id' : rev['id'], 'date' : rev['date'], 'user' : rev['user'], @@ -191,7 +192,7 @@ def buildRevisionList(self, pages = None): }) self.saveWRevs() # Save a cached copy - print("Revisions already fetched", already_fetched) + print("Number of revisions already fetched", len(revs) - len(self.wrevs)) if os.path.isfile(self.path+'/.metadata.json'): self.loadMetadata() @@ -263,16 +264,11 @@ def openRepo(self): # Takes an unprocessed revision from a revision log, fetches its data and commits it. # Returns false if no unprocessed revisions remain. # - def commitNext(self): + def commitNext(self, rev): if self.rev_no >= len(self.wrevs): return False - rev = self.wrevs[self.rev_no] - if rev['rev_id'] in self.fetched_revids: - if self.debug: - print(rev['rev_id'], 'already fetched, yet called on to fetch again') - self.rev_no += 1 self.saveState() # Update operation state @@ -394,6 +390,13 @@ def commitNext(self): return True + def fetchAll(self): + to_fetch = [] + for rev in tqdm(self.wrevs, desc='Creating list of revisions to fetch'): + if rev['rev_id'] not in self.fetched_revids: + to_fetch.append(rev) + for rev in tqdm(to_fetch, desc='Downloading'): + self.commitNext(rev) # # Updates all children of the page to reflect parent's unixname change. From 511c6eb800b935262f890d7364679f4a5ad00bfb Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 18 Jul 2020 13:44:22 +0200 Subject: [PATCH 54/93] remove unused --- rmaint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index 80d8da4..8e98335 100644 --- a/rmaint.py +++ b/rmaint.py @@ -36,7 +36,6 @@ def __init__(self, wikidot, path): # Internal state self.wrevs = None # Compiled wikidot revision list (history) - self.fetcheds_revids = [] # Compiled wikidot revision list (history) self.rev_no = 0 # Next revision to process self.last_names = {} # Tracks page renames: name atm -> last name in repo From 9c43ea7ab4055c0fd7b739ccafbf14e95e44f426 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sat, 18 Jul 2020 13:44:41 +0200 Subject: [PATCH 55/93] don't be dumb, use sets, massive speedup --- rmaint.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/rmaint.py b/rmaint.py index 8e98335..3a9ba4a 100644 --- a/rmaint.py +++ b/rmaint.py @@ -75,7 +75,7 @@ def appendFetchedRevid(self, revid): fp.close() def loadFetchedRevids(self): - self.fetched_revids = [line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')] + self.fetched_revids = set([line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')]) # Persistent metadata about the repo: # - Tracks page renames: name atm -> last name in repo @@ -112,9 +112,9 @@ def buildRevisionList(self, pages = None): if os.path.isfile(self.path+'/.fetched.txt'): self.loadFetchedRevids() - print(self.fetched_revids) + print(len(self.fetched_revids), 'revisions already fetched') else: - self.fetched_revids = [] + self.fetched_revids = set() if self.debug: print("Building revision list...") @@ -135,11 +135,7 @@ def buildRevisionList(self, pages = None): elif self.debug: print(len(pages), 'pages loaded') - fetched_pages = [] - - if self.debug: - print('Collecting already pages we already got revisions for') - + fetched_pages = set() # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time) # Should use a set/hashmap/whatever python calls it for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'): @@ -148,7 +144,7 @@ def buildRevisionList(self, pages = None): if page_name in fetched_pages: continue - fetched_pages.append(page_name) + fetched_pages.add(page_name) if self.debug: print("Already fetched revisions for " + str(len(fetched_pages)) + " of " + str(len(pages))) @@ -382,7 +378,7 @@ def commitNext(self, rev): if self.debug: print('Committed', commit.name_rev, 'by', author) - self.fetched_revids.append(rev['rev_id']) + self.fetched_revids.add(rev['rev_id']) self.rev_no += 1 self.saveState() # Update operation state From e2a763b61de1872d8998ab5e8fccc13c67035ae4 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 09:37:12 +0200 Subject: [PATCH 56/93] fix default argument --- crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl.py b/crawl.py index b6a4f2d..d90c0ca 100644 --- a/crawl.py +++ b/crawl.py @@ -26,7 +26,7 @@ # Action settings parser.add_argument('--page', type=str, help='Query only this page') parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions') -parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository') +parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True) # Common settings parser.add_argument('--debug', action='store_true', help='Print debug info') parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') From 5cc5fdbde58e320dc3376380dff3cdd7961c93c5 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 09:37:36 +0200 Subject: [PATCH 57/93] fix status output --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index 3a9ba4a..f7b0bfc 100644 --- a/rmaint.py +++ b/rmaint.py @@ -187,7 +187,7 @@ def buildRevisionList(self, pages = None): }) self.saveWRevs() # Save a cached copy - print("Number of revisions already fetched", len(revs) - len(self.wrevs)) + print("Number of revisions already fetched", len(revs) - len(self.wrevs)) if os.path.isfile(self.path+'/.metadata.json'): self.loadMetadata() From 674bddac96a62e95b48666ecb117c572cb996b87 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 09:37:50 +0200 Subject: [PATCH 58/93] fix check for existing repo --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index f7b0bfc..a20db9b 100644 --- a/rmaint.py +++ b/rmaint.py @@ -236,7 +236,7 @@ def openRepo(self): self.last_names = {} # Tracks page renames: name atm -> last name in repo self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo - if os.path.isfile(self.path+'/.git'): + if os.path.isdir(self.path+'/.git'): print("Continuing from aborted dump state...") self.loadState() self.repo = Repo(self.path) From 2f1a9f6c560984bc8703577a9e0ac925ff0063e3 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 09:38:03 +0200 Subject: [PATCH 59/93] add todo --- readme.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/readme.md b/readme.md index c13d967..e59e568 100644 --- a/readme.md +++ b/readme.md @@ -39,3 +39,8 @@ The descriptions for on-site modules are heavily correlated with AJAX ones: Someone else did Wikidot AJAX: * https://github.com/kerel-fs/ogn-rdb/blob/master/wikidotcrawler.py + + +#### TODO + +Handle deleted images. From 858b7ed223a6254e7bf1eb359f79cee832ac054f Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 09:38:39 +0200 Subject: [PATCH 60/93] 404 for images is not fatal --- wikidot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wikidot.py b/wikidot.py index cb6b52e..d36e85d 100644 --- a/wikidot.py +++ b/wikidot.py @@ -51,6 +51,9 @@ def maybe_download_file(self, url, file_path): headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"}) req = requests.get(url, stream=True, ) + if req.status_code == 404: + return False + if req.status_code >= 500: retries += 1 print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) From 29e17918b4335a22e04104f2775c65ff7239271b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 10:08:28 +0200 Subject: [PATCH 61/93] fix --- rmaint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rmaint.py b/rmaint.py index a20db9b..d0663b4 100644 --- a/rmaint.py +++ b/rmaint.py @@ -162,7 +162,6 @@ def buildRevisionList(self, pages = None): if self.debug: print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages))) - fetched += 1 page_id = self.wd.get_page_id(page) if self.debug: @@ -174,6 +173,7 @@ def buildRevisionList(self, pages = None): revs = self.wd.get_revisions(page_id=page_id, limit=max_depth) for rev in tqdm(revs, desc='Adding revisions from page ' + page_id): + fetched += 1 if rev['id'] in self.fetched_revids: continue @@ -187,7 +187,7 @@ def buildRevisionList(self, pages = None): }) self.saveWRevs() # Save a cached copy - print("Number of revisions already fetched", len(revs) - len(self.wrevs)) + print("Number of revisions already fetched", len(self.fetched_revids), len(self.wrevs)) if os.path.isfile(self.path+'/.metadata.json'): self.loadMetadata() From d24acca2bb0e014e70106312eb844cd7eb273e5b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 10:08:52 +0200 Subject: [PATCH 62/93] fix relative path --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index d0663b4..346adc3 100644 --- a/rmaint.py +++ b/rmaint.py @@ -249,7 +249,7 @@ def openRepo(self): if self.storeRevIds: # Add revision id file to the new repo - fname = '/.revid' + fname = self.path + '/.revid' codecs.open(self.path + fname, "w", "UTF-8").close() self.repo.index.add([fname]) self.index.commit("Initial creation of repo") From 938bb680421b2a0070381c3d475655b0b50d1891 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 10:09:16 +0200 Subject: [PATCH 63/93] add timeouts --- wikidot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wikidot.py b/wikidot.py index d36e85d..7d57432 100644 --- a/wikidot.py +++ b/wikidot.py @@ -49,7 +49,7 @@ def maybe_download_file(self, url, file_path): # Pretty generic user-agent, but we append a unique none for us # Makes wikimedia happy headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"}) - req = requests.get(url, stream=True, ) + req = requests.get(url, stream=True, timeout=30) if req.status_code == 404: return False @@ -116,7 +116,7 @@ def queryex(self, params, urlAppend = None): while retries < self.max_retries: self._wait_request_slot() - req = requests.request('POST', url, data=params, cookies=cookies) + req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) # Usually a 502 error, recovers immediately if req.status_code >= 500: @@ -252,7 +252,7 @@ def get_page_id(self, page_unix_name): if self.debug: print("fetching", url) - req = requests.request('GET', url) + req = requests.request('GET', url, timeout=30) soup = BeautifulSoup(req.text, 'html.parser') for item in soup.head.find_all('script'): text = item.string From e0d9e4b0df36f46c04d4ae13a5fa581e8803541b Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 10:09:39 +0200 Subject: [PATCH 64/93] python's time.sleep is in seconds, not milliseconds --- wikidot.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/wikidot.py b/wikidot.py index 7d57432..15d390b 100644 --- a/wikidot.py +++ b/wikidot.py @@ -67,7 +67,7 @@ def maybe_download_file(self, url, file_path): # Extra nice, sleep longer (expoential increase), hope for the # server to recover - time.sleep(retries * retries * self.delay) + time.sleep(retries * retries * self.delay / 1000) continue @@ -114,6 +114,9 @@ def queryex(self, params, urlAppend = None): # In case of e. g. 500 errors retries = 0 while retries < self.max_retries: + if retries > 0: + print("retry", retries, "of", self.max_retries) + self._wait_request_slot() req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) @@ -132,7 +135,7 @@ def queryex(self, params, urlAppend = None): # Extra nice, sleep longer (expoential increase), hope for the # server to recover - time.sleep(retries * retries * self.delay) + time.sleep(retries * retries * self.delay / 1000) continue @@ -149,14 +152,20 @@ def queryex(self, params, urlAppend = None): print('Failed to get response from wikidot', e, req, url, params) if retries < self.max_retries: retries += 1 - self._wait_request_slot() - time.sleep(retries * retries * self.delay) + #self._wait_request_slot() + time.sleep(retries * retries * self.delay / 1000) continue raise e if json['status'] == 'ok': return json['body'], (json['title'] if 'title' in json else '') + elif retries < self.max_retries: + print("error in response", json) + retries += 1 + print("sleeping for", retries * retries * self.delay); + #self._wait_request_slot() + time.sleep(retries * retries * self.delay / 1000) else: raise Exception(req.text) From b01c253bf625d9aa4dbeb5c5eb01e327cb8395ef Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 10:10:08 +0200 Subject: [PATCH 65/93] support for skipping select revisions --- crawl.py | 4 ++++ rmaint.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/crawl.py b/crawl.py index d90c0ca..367dbf0 100644 --- a/crawl.py +++ b/crawl.py @@ -27,6 +27,7 @@ parser.add_argument('--page', type=str, help='Query only this page') parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions') parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True) +parser.add_argument('--skip', type=str, help='Skip the specified revision') # Common settings parser.add_argument('--debug', action='store_true', help='Print debug info') parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') @@ -104,6 +105,9 @@ def force_dirs(path): rm.buildRevisionList([args.page] if args.page else None) rm.openRepo() + if args.skip: + rm.revs_to_skip = [args.skip] + print("Downloading revisions") rm.fetchAll() diff --git a/rmaint.py b/rmaint.py index 346adc3..8a568a8 100644 --- a/rmaint.py +++ b/rmaint.py @@ -50,6 +50,8 @@ def __init__(self, wikidot, path): self.first_fetched = 0 # For progress bar self.fetched_revids = set() + self.revs_to_skip = [] + # # Saves and loads revision list from file @@ -269,6 +271,10 @@ def commitNext(self, rev): self.saveState() # Update operation state return True + if rev['rev_id'] in self.revs_to_skip: + print("Skipping", rev) + return True + source = self.wd.get_revision_source(rev['rev_id']) # Page title and unix_name changes are only available through another request: details = self.wd.get_revision_version(rev['rev_id']) From b52dc93b77097a8c2134cd2e559cb5356e6db6b3 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:08:57 +0200 Subject: [PATCH 66/93] time how long a download takes, remove invalid images (usually 404 errors with wrong status in return header) --- wikidot.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/wikidot.py b/wikidot.py index 15d390b..a53384d 100644 --- a/wikidot.py +++ b/wikidot.py @@ -8,6 +8,8 @@ import hashlib import os import shutil +import imghdr +from timeit import default_timer as timer # Implements various queries to Wikidot engine through its AJAX facilities @@ -22,23 +24,23 @@ def __init__(self, site): self.sitename = urlparse(site).hostname.lower() self.delay = 200 # Delay between requests in msec self.debug = False # Print debug messages - self.next_timeslot = time.process_time() # Can call immediately + self.next_timeslot = timer() # Can call immediately self.max_retries = 5 # Downloads file if it doesn't exist def maybe_download_file(self, url, file_path): if os.path.exists(file_path): if self.debug: - print(file_path, "exists, skipping") + print(" - ", file_path, "exists, skipping") return False - self._wait_request_slot() + #self._wait_request_slot() dirpath = os.path.dirname(file_path) os.makedirs(dirpath, exist_ok=True) if self.debug: - print("downloading", url, "to" ,file_path, "dirpath", dirpath) + print(" < downloading", url, "to" ,file_path, "dirpath", dirpath) # In case of e. g. 500 errors retries = 0 @@ -49,6 +51,7 @@ def maybe_download_file(self, url, file_path): # Pretty generic user-agent, but we append a unique none for us # Makes wikimedia happy headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"}) + start = timer() req = requests.get(url, stream=True, timeout=30) if req.status_code == 404: @@ -56,11 +59,11 @@ def maybe_download_file(self, url, file_path): if req.status_code >= 500: retries += 1 - print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) # In case of debug enabled, we already printed this above if not self.debug: - print(req) + print(' - ', req) # Be nice, double wait delay for errors self._wait_request_slot() @@ -80,9 +83,18 @@ def maybe_download_file(self, url, file_path): with open(file_path, 'wb') as out_file: shutil.copyfileobj(req.raw, out_file) + if imghdr.what(file_path) is None: + print('Downloaded invalid image', url) + os.remove(file_path) + return False + + + if self.debug: + print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2)) + return True except Exception as e: - print('Failed to download', e, req, url) + print(' ! Failed to download', e, req, url) raise e return False From bae36607c054677c9bdfc3a44536d86de4a23344 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:09:07 +0200 Subject: [PATCH 67/93] add tags to todo --- readme.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index e59e568..354a887 100644 --- a/readme.md +++ b/readme.md @@ -43,4 +43,6 @@ Someone else did Wikidot AJAX: #### TODO -Handle deleted images. + - Handle deleted images. Probably need to check the diff and check all pages for references if removed from one page. + - Handle tags (both added and removed). + From 4dc15fdafe926fe06186d17ad4323749e734b3d5 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:10:10 +0200 Subject: [PATCH 68/93] improve debug output --- rmaint.py | 5 +++++ wikidot.py | 50 +++++++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/rmaint.py b/rmaint.py index 8a568a8..2d32713 100644 --- a/rmaint.py +++ b/rmaint.py @@ -295,6 +295,8 @@ def commitNext(self, rev): if rev['comment'].startswith('Parent page set to: "'): # This is a parenting revision, remember the new parent parent_unixname = rev['comment'][21:-2] + if self.debug: + print('Parent changed', parent_unixname) self.last_parents[unixname] = parent_unixname else: # Else use last parent_unixname we've recorded @@ -409,6 +411,9 @@ def fetchAll(self): # Therefore, on every rename we must update all linked children in the same revision. # def updateChildren(self, oldunixname, newunixname): + if self.debug: + print('Updating parents for', oldunixname, newunixname) + for child in list(self.last_parents.keys()): if self.last_parents[child] == oldunixname: self.updateParentField(child, self.last_parents[child], newunixname) diff --git a/wikidot.py b/wikidot.py index a53384d..32e3ace 100644 --- a/wikidot.py +++ b/wikidot.py @@ -116,8 +116,8 @@ def queryex(self, params, urlAppend = None): params['wikidot_token7'] = token if self.debug: - print(params) - print(cookies) + print(' - ', params) + print(' - ', cookies) url = self.site+'/ajax-module-connector.php' if urlAppend is not None: @@ -127,16 +127,20 @@ def queryex(self, params, urlAppend = None): retries = 0 while retries < self.max_retries: if retries > 0: - print("retry", retries, "of", self.max_retries) + print(" ! retry", retries, "of", self.max_retries) self._wait_request_slot() + start = timer() req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) + if self.debug: + print(' * ajax request completed in', round(timer() - start, 2)) + # Usually a 502 error, recovers immediately if req.status_code >= 500: retries += 1 - print('500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) # In case of debug enabled, we already printed this above if not self.debug: @@ -156,12 +160,12 @@ def queryex(self, params, urlAppend = None): # some bug in how we handle or request things req.raise_for_status() except Exception as e: - print('Failed to get response from wikidot', e, req, url, params) + print(' ! Failed to get response from wikidot', e, req, url, params) try: json = req.json() except Exception as e: - print('Failed to get response from wikidot', e, req, url, params) + print(' ! Failed to get response from wikidot', e, req, url, params) if retries < self.max_retries: retries += 1 #self._wait_request_slot() @@ -171,17 +175,18 @@ def queryex(self, params, urlAppend = None): raise e if json['status'] == 'ok': + return json['body'], (json['title'] if 'title' in json else '') elif retries < self.max_retries: - print("error in response", json) + print(" ! error in response", json) retries += 1 - print("sleeping for", retries * retries * self.delay); + print(" ! sleeping for", retries * retries * self.delay); #self._wait_request_slot() time.sleep(retries * retries * self.delay / 1000) else: raise Exception(req.text) - print('Failed too many times', url, params, cookies) + print(' ! Failed too many times', url, params, cookies) raise Exception('Failed too many times for ' + url) # Same but only returns the body, most responses don't have titles @@ -219,11 +224,11 @@ def list_pages(self, limit): pages.append(entry) if self.debug: - print('Pages found:', len(pages)) + print(' - Pages found:', len(pages)) targets = soup.find_all('span','target') if len(targets) < 2: - print("Unable to find next listing page, not enough target spans") + print(" ! Unable to find next listing page, not enough target spans") break next_url = targets[-1].a.get('href').split('/') @@ -231,10 +236,10 @@ def list_pages(self, limit): next_page = int(next_url[-1]) if self.debug: - print('Next listing page', next_page) + print(' - Next listing page', next_page) else: - print("invalid next url", next_url) + print(" ! invalid next url", next_url) break #next_page = int(targets[0].a.text) @@ -244,20 +249,20 @@ def list_pages(self, limit): current_page = int(current_spans[0].text) if self.debug: - print('Current listing page', current_page) + print(' - Current listing page', current_page) else: - print("unable to find current page") + print(" ! unable to find current page") break; if next_page != offset + 1: if self.debug: - print('Next page is wrong', next_page, 'hopefully at the end') + print(' ! Next page is wrong', next_page, 'hopefully at the end') break offset += 1 - print("Fetching listing page", offset) + print(" - Fetching listing page", offset) return pages @@ -271,9 +276,13 @@ def get_page_id(self, page_unix_name): url = self.site+'/'+page_unix_name + '/noredirect/true'; if self.debug: - print("fetching", url) + print(" > fetching", url) + start = timer() req = requests.request('GET', url, timeout=30) + if self.debug: + print(' * page id request completed in', round(timer() - start, 2)) + soup = BeautifulSoup(req.text, 'html.parser') for item in soup.head.find_all('script'): text = item.string @@ -306,7 +315,6 @@ def get_revisions_raw(self, page_id, limit): }) soup = BeautifulSoup(res, 'html.parser') - print("revisions raw") return soup.table.contents # Client version @@ -323,7 +331,7 @@ def get_revisions(self, page_id, limit): attached_file = False if attachment_action is not None: attached_file = True - print("was attchment", rev_id) + print(" - was attchment", rev_id) # Unixtime is stored as a CSS class time_* rev_date = 0 @@ -333,7 +341,7 @@ def get_revisions(self, page_id, limit): if cls.startswith('time_'): rev_date = int(cls[5:]) else: - print("no odate found") + print(" ! no odate found") # Username in a last
under user_span = tr.find("span", attrs={"class": "printuser"}) From 251b7068afa0345ca8ae1279b830cbb5604734f1 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:10:36 +0200 Subject: [PATCH 69/93] skip updating parent history if not actually changed --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index 2d32713..cd69f7a 100644 --- a/rmaint.py +++ b/rmaint.py @@ -415,7 +415,7 @@ def updateChildren(self, oldunixname, newunixname): print('Updating parents for', oldunixname, newunixname) for child in list(self.last_parents.keys()): - if self.last_parents[child] == oldunixname: + if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname: self.updateParentField(child, self.last_parents[child], newunixname) # From 4ec2b3bd67de91c47bfe659853839ce1d8aae84d Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:20:55 +0200 Subject: [PATCH 70/93] mention added images in commit message --- rmaint.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/rmaint.py b/rmaint.py index cd69f7a..493006a 100644 --- a/rmaint.py +++ b/rmaint.py @@ -361,12 +361,19 @@ def commitNext(self, rev): commit_date = None got_images = False; + + # Add some spacing in the commit message + if len(details['images']) > 0: + commit_msg += '\n' + for image in details['images']: if self.wd.maybe_download_file(image['src'], self.path + '/' + image['filepath']): + commit_msg += '\nAdded image: ' + image['src'] got_images = True # If we do this gitpython barfs on itself #added_file_paths.append(image['filepath']) + if got_images: added_file_paths.append("images") print("Committing: " + str(self.rev_no) + '. '+commit_msg) From 22a3f1ed36e71e7c51920fe4061f272b9c95d33f Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:32:51 +0200 Subject: [PATCH 71/93] added some dependencies --- readme.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/readme.md b/readme.md index 354a887..641a570 100644 --- a/readme.md +++ b/readme.md @@ -16,6 +16,8 @@ At least: * Python 3 * python-beautifulsoup4 * python-gitpython +* python-requests +* python-tqdm ##### Examples: From d38d51414d7afcdc5d9aff98204a9b2bd3d3c5fb Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 11:40:58 +0200 Subject: [PATCH 72/93] avoid retrying images that we know are invalid (i. e. not temporary download failures) --- rmaint.py | 16 ++++++++++++++++ wikidot.py | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/rmaint.py b/rmaint.py index 493006a..ae89248 100644 --- a/rmaint.py +++ b/rmaint.py @@ -79,6 +79,19 @@ def appendFetchedRevid(self, revid): def loadFetchedRevids(self): self.fetched_revids = set([line.rstrip() for line in open(self.path+'/.fetched.txt', 'r')]) + def saveFailedImages(self): + file_path = self.path + '/.failed-images.txt' + fp = open(file_path, 'w') + for failed in self.wd.failed_images: + fp.write(failed + '\n') + fp.close() + + def loadFailedImages(self): + file_path = self.path + '/.failed-images.txt' + if not os.path.isfile(file_path): + return + self.self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')]) + # Persistent metadata about the repo: # - Tracks page renames: name atm -> last name in repo # - Tracks page parent names: name atm -> last parent in repo @@ -237,6 +250,7 @@ def openRepo(self): # Create a new repository or continue from aborted dump self.last_names = {} # Tracks page renames: name atm -> last name in repo self.last_parents = {} # Tracks page parent names: name atm -> last parent in repo + self.loadFailedImages() if os.path.isdir(self.path+'/.git'): print("Continuing from aborted dump state...") @@ -372,6 +386,8 @@ def commitNext(self, rev): got_images = True # If we do this gitpython barfs on itself #added_file_paths.append(image['filepath']) + else: + self.saveFailedImages() if got_images: diff --git a/wikidot.py b/wikidot.py index 32e3ace..6b8e2f6 100644 --- a/wikidot.py +++ b/wikidot.py @@ -26,9 +26,15 @@ def __init__(self, site): self.debug = False # Print debug messages self.next_timeslot = timer() # Can call immediately self.max_retries = 5 + self.failed_images = set() # Downloads file if it doesn't exist def maybe_download_file(self, url, file_path): + if url in self.failed_images: + if self.debug: + print(" ! ", url, "already failed, skipping") + return False + if os.path.exists(file_path): if self.debug: print(" - ", file_path, "exists, skipping") @@ -55,6 +61,7 @@ def maybe_download_file(self, url, file_path): req = requests.get(url, stream=True, timeout=30) if req.status_code == 404: + self.failed_images.add(url) return False if req.status_code >= 500: @@ -86,6 +93,7 @@ def maybe_download_file(self, url, file_path): if imghdr.what(file_path) is None: print('Downloaded invalid image', url) os.remove(file_path) + self.failed_images.add(url) return False From 1b3f608c45c9f57dd045cf95e75763a1e94301e7 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:04:06 +0200 Subject: [PATCH 73/93] implement tag handling, not tested --- rmaint.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/rmaint.py b/rmaint.py index ae89248..18a594d 100644 --- a/rmaint.py +++ b/rmaint.py @@ -315,6 +315,11 @@ def commitNext(self, rev): else: # Else use last parent_unixname we've recorded parent_unixname = self.last_parents[unixname] if unixname in self.last_parents else None + + ## TODO: test + #if rev['comment'].startswith('Removed tags: ') or rev['comment'].startswith('Added tags: '): + # self.updateTags(rev['comment'], rev_unixname) + # There are also problems when parent page gets renamed -- see updateChildren # If the page is tracked and its name just changed, tell Git @@ -441,6 +446,48 @@ def updateChildren(self, oldunixname, newunixname): if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname: self.updateParentField(child, self.last_parents[child], newunixname) + def updateTags(self, comment, unixname): + file_name = self.path+'/'+unixname+'.txt' + removed = [] + removed_match = re.search(pattern = r'Removed tags: ([^.]+,?)\.') + if removed_match is not None: + removed = removed_match.group(1).split(', ') + + tags = [] + + with codecs.open(file_name, "r", "UTF-8") as f: + content = f.readlines() + + tagsline = None + for line in content: + if line.startswith('tags:'): + tagsline = line + break + + # Father forgive me for the indentation depth + idx = -1 + if tagsline is not None: + idx = content.index(tagsline) + for tag in tagsline.split(','): + if not tag in removed: + tags.append(tag) + + + added_match = re.search(pattern = r'Added tags: ([^.]+,?)\.') + if added_match is not None: + tags += added_match.group(1).split(', ') + + tags.sort() + + newtagsline = 'tags:' + ','.join(tags) + '\n' + if idx != -1: + contents[idx] = newtagsline + else: + contents = newtagsline + contents + + with codecs.open(file_name, "w", "UTF-8") as f: + f.writelines(content) + # # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. # The rest of the file is preserved. From 26e8977ca29cf99b9a7326fa4903edb62e3a0686 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:05:02 +0200 Subject: [PATCH 74/93] bump default delay, be nice --- wikidot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikidot.py b/wikidot.py index 6b8e2f6..bc26d51 100644 --- a/wikidot.py +++ b/wikidot.py @@ -22,7 +22,7 @@ def __init__(self, site): if self.site[-1] == '/': self.site = self.site[:-1] self.sitename = urlparse(site).hostname.lower() - self.delay = 200 # Delay between requests in msec + self.delay = 1000 # Delay between requests in msec self.debug = False # Print debug messages self.next_timeslot = timer() # Can call immediately self.max_retries = 5 From b991fe7cc001f09dd93ba64968337c003376e5ed Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:13:08 +0200 Subject: [PATCH 75/93] doh --- wikidot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikidot.py b/wikidot.py index bc26d51..bf06dd2 100644 --- a/wikidot.py +++ b/wikidot.py @@ -24,7 +24,7 @@ def __init__(self, site): self.sitename = urlparse(site).hostname.lower() self.delay = 1000 # Delay between requests in msec self.debug = False # Print debug messages - self.next_timeslot = timer() # Can call immediately + self.next_timeslot = time.process_time() # Can call immediately self.max_retries = 5 self.failed_images = set() From c8c8ed879e93bddaeeae95f6e093526b68c557e8 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:24:15 +0200 Subject: [PATCH 76/93] typo --- rmaint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index 18a594d..1f2eb2d 100644 --- a/rmaint.py +++ b/rmaint.py @@ -90,7 +90,7 @@ def loadFailedImages(self): file_path = self.path + '/.failed-images.txt' if not os.path.isfile(file_path): return - self.self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')]) + self.wd.failed_images = set([line.rstrip() for line in open(file_path, 'r')]) # Persistent metadata about the repo: # - Tracks page renames: name atm -> last name in repo From 44a5fc11aa2295ab089bb1a5e6609aaf65d00e94 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:34:04 +0200 Subject: [PATCH 77/93] handle timeouts --- wikidot.py | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/wikidot.py b/wikidot.py index bf06dd2..0f5c777 100644 --- a/wikidot.py +++ b/wikidot.py @@ -58,7 +58,15 @@ def maybe_download_file(self, url, file_path): # Makes wikimedia happy headers.update({ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0 wdotcrawler/1.0"}) start = timer() - req = requests.get(url, stream=True, timeout=30) + + try: + req = requests.get(url, stream=True, timeout=30) + except requests.exceptions.ReadTimeout: + print('request timed out!') + + retries += 1 + time.sleep(retries * retries * self.delay / 1000) + continue if req.status_code == 404: self.failed_images.add(url) @@ -140,7 +148,13 @@ def queryex(self, params, urlAppend = None): self._wait_request_slot() start = timer() - req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) + try: + req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) + except requests.exceptions.ReadTimeout: + print('request timed out!') + retries += 1 + time.sleep(retries * retries * self.delay / 1000) + continue if self.debug: print(' * ajax request completed in', round(timer() - start, 2)) @@ -287,7 +301,26 @@ def get_page_id(self, page_unix_name): print(" > fetching", url) start = timer() - req = requests.request('GET', url, timeout=30) + retries = 0 + req = None + while retries < self.max_retries: + try: + req = requests.request('GET', url, timeout=30) + except requests.exceptions.ReadTimeout: + print('request timed out!') + retries += 1 + time.sleep(retries * retries * self.delay / 1000) + continue + + if req.status_code >= 500: + print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) + retries += 1 + time.sleep(retries * retries * self.delay / 1000) + continue + + req.raise_for_status() + break + if self.debug: print(' * page id request completed in', round(timer() - start, 2)) From 404a1e4cc4b11085f54945cb53ac2bb5a903e013 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:34:38 +0200 Subject: [PATCH 78/93] make some errors that should be fatal fatal --- wikidot.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/wikidot.py b/wikidot.py index 0f5c777..3c29b7a 100644 --- a/wikidot.py +++ b/wikidot.py @@ -113,7 +113,7 @@ def maybe_download_file(self, url, file_path): print(' ! Failed to download', e, req, url) raise e - return False + raise Exception('Failed too many times for', url) # To honor usage rules, we wait for self.delay between requests. # Low-level query functions call this before every request to Wikidot./ @@ -339,7 +339,8 @@ def get_page_id(self, page_unix_name): return int(text[pos:crlf]) else: return int(text[pos:]) - return None + + raise Exception('Failed to get page_id for ' + page_unix_name) # Retrieves a list of revisions for a page. @@ -500,6 +501,9 @@ def get_revision_version(self, rev_id): if tds[0].getText().strip() == 'Page name:': unixname = tds[1].getText().strip() + if unixname is None: + raise Exception('Failed to find unixname for ' + rev_id) + return { 'rev_id': rev_id, 'unixname': unixname, From e0b27c39345fb1f97202f5211d2dd4e0d7dc00f3 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:34:58 +0200 Subject: [PATCH 79/93] avoid so long delays, it usually recovers immediately --- wikidot.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/wikidot.py b/wikidot.py index 3c29b7a..2283d38 100644 --- a/wikidot.py +++ b/wikidot.py @@ -73,20 +73,13 @@ def maybe_download_file(self, url, file_path): return False if req.status_code >= 500: - retries += 1 print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) - # In case of debug enabled, we already printed this above if not self.debug: print(' - ', req) - # Be nice, double wait delay for errors - self._wait_request_slot() - - # Extra nice, sleep longer (expoential increase), hope for the - # server to recover + retries += 1 time.sleep(retries * retries * self.delay / 1000) - continue try: From cf88384b57707279fe06c18603ca56fedf294dd5 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 12:35:05 +0200 Subject: [PATCH 80/93] simplify --- wikidot.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/wikidot.py b/wikidot.py index 2283d38..c6f047e 100644 --- a/wikidot.py +++ b/wikidot.py @@ -190,16 +190,13 @@ def queryex(self, params, urlAppend = None): raise e if json['status'] == 'ok': - return json['body'], (json['title'] if 'title' in json else '') - elif retries < self.max_retries: + else: print(" ! error in response", json) + retries += 1 - print(" ! sleeping for", retries * retries * self.delay); - #self._wait_request_slot() time.sleep(retries * retries * self.delay / 1000) - else: - raise Exception(req.text) + continue print(' ! Failed too many times', url, params, cookies) raise Exception('Failed too many times for ' + url) @@ -432,7 +429,6 @@ def get_revision_version(self, rev_id): res = self.get_revision_version_raw(rev_id) # this has title! soup = BeautifulSoup(res[0], 'html.parser') - # Extract list of images # TODO: to get the right revision that added them, we need to go back From c9b7f536c05c48a4c116fdeee7780ee9ee269277 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Thu, 30 Jul 2020 16:44:34 +0200 Subject: [PATCH 81/93] fix initial fetch --- rmaint.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/rmaint.py b/rmaint.py index 1f2eb2d..d355af8 100644 --- a/rmaint.py +++ b/rmaint.py @@ -151,8 +151,7 @@ def buildRevisionList(self, pages = None): print(len(pages), 'pages loaded') fetched_pages = set() - # TODO: I don't know python, but this is highly suboptimal (and takes a ton of time) - # Should use a set/hashmap/whatever python calls it + for wrev in tqdm(self.wrevs, desc='Collecting pages we already got revisions for'): page_name = wrev['page_name'] @@ -175,8 +174,7 @@ def buildRevisionList(self, pages = None): print("Skipping", page) continue - if self.debug: - print("Querying page: " + page + " " + str(fetched) + "/" + str(len(pages) - len(fetched_pages))) + fetched += 1 page_id = self.wd.get_page_id(page) if self.debug: @@ -186,9 +184,8 @@ def buildRevisionList(self, pages = None): print('Page gone?', page) continue - revs = self.wd.get_revisions(page_id=page_id, limit=max_depth) - for rev in tqdm(revs, desc='Adding revisions from page ' + page_id): - fetched += 1 + revs = self.wd.get_revisions(page_id=page_id, limit=self.max_depth) + for rev in revs: if rev['id'] in self.fetched_revids: continue From 9b526bf0414079d32e86f7261127a5ab05d42c88 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Fri, 31 Jul 2020 11:36:27 +0200 Subject: [PATCH 82/93] don't always cleanup --- crawl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crawl.py b/crawl.py index 367dbf0..fe2a9a0 100644 --- a/crawl.py +++ b/crawl.py @@ -28,6 +28,7 @@ parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions') parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True) parser.add_argument('--skip', type=str, help='Skip the specified revision') +parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo') # Common settings parser.add_argument('--debug', action='store_true', help='Print debug info') parser.add_argument('--delay', type=int, default='200', help='Delay between consequent calls to Wikidot') @@ -111,5 +112,7 @@ def force_dirs(path): print("Downloading revisions") rm.fetchAll() - rm.cleanup() + if args.cleanup: + rm.cleanup() + print("Done.") From cdcb096a06aeb77d8ef7c5c3116cbd63dc35826a Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Fri, 31 Jul 2020 11:37:16 +0200 Subject: [PATCH 83/93] better throttling when requests fail --- wikidot.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/wikidot.py b/wikidot.py index c6f047e..926e69d 100644 --- a/wikidot.py +++ b/wikidot.py @@ -65,7 +65,7 @@ def maybe_download_file(self, url, file_path): print('request timed out!') retries += 1 - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) # up to ~2 minutes continue if req.status_code == 404: @@ -79,7 +79,7 @@ def maybe_download_file(self, url, file_path): print(' - ', req) retries += 1 - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue try: @@ -146,7 +146,7 @@ def queryex(self, params, urlAppend = None): except requests.exceptions.ReadTimeout: print('request timed out!') retries += 1 - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue if self.debug: @@ -166,7 +166,7 @@ def queryex(self, params, urlAppend = None): # Extra nice, sleep longer (expoential increase), hope for the # server to recover - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue @@ -184,7 +184,7 @@ def queryex(self, params, urlAppend = None): if retries < self.max_retries: retries += 1 #self._wait_request_slot() - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue raise e @@ -195,7 +195,7 @@ def queryex(self, params, urlAppend = None): print(" ! error in response", json) retries += 1 - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue print(' ! Failed too many times', url, params, cookies) @@ -299,13 +299,13 @@ def get_page_id(self, page_unix_name): except requests.exceptions.ReadTimeout: print('request timed out!') retries += 1 - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue if req.status_code >= 500: print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) retries += 1 - time.sleep(retries * retries * self.delay / 1000) + time.sleep(retries * retries * retries) continue req.raise_for_status() From d3eeb75a328d6fa1031144996a9beb4a8762bc75 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 2 Aug 2020 11:51:29 +0200 Subject: [PATCH 84/93] fix starting from scratch --- rmaint.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rmaint.py b/rmaint.py index d355af8..ee0bfb3 100644 --- a/rmaint.py +++ b/rmaint.py @@ -233,6 +233,8 @@ def saveState(self): fp.close() def loadState(self): + if not os.path.isfile(self.path+'/.wstate'): + return fp = open(self.path+'/.wstate', 'rb') self.rev_no = pickle.load(fp) fp.close() From f6cfe018e468f6a374d709024d94ede11491df83 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 2 Aug 2020 11:52:05 +0200 Subject: [PATCH 85/93] fix path to revid file --- rmaint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rmaint.py b/rmaint.py index ee0bfb3..f48458b 100644 --- a/rmaint.py +++ b/rmaint.py @@ -264,8 +264,8 @@ def openRepo(self): if self.storeRevIds: # Add revision id file to the new repo - fname = self.path + '/.revid' - codecs.open(self.path + fname, "w", "UTF-8").close() + fname = '.revid' + codecs.open(self.path + '/' + fname, "w", "UTF-8").close() self.repo.index.add([fname]) self.index.commit("Initial creation of repo") self.index = self.repo.index From 2d96bf813c8865ae8caa53a86b517fd295667741 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 2 Aug 2020 11:54:55 +0200 Subject: [PATCH 86/93] ignore minor error --- rmaint.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/rmaint.py b/rmaint.py index f48458b..2edff0c 100644 --- a/rmaint.py +++ b/rmaint.py @@ -492,7 +492,13 @@ def updateTags(self, comment, unixname): # The rest of the file is preserved. # def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - with codecs.open(self.path+'/'+child_unixname+'.txt', "r", "UTF-8") as f: + child_path = self.path+'/'+child_unixname+'.txt' + + ## TODO: find out when this happens + if not os.path.isfile(child_path): + print('Failed to find child file!', child_path) + return + with codecs.open(child_path, "r", "UTF-8") as f: content = f.readlines() # Since this is all tracked by us, we KNOW there's a line in standard format somewhere idx = content.index('parent:'+parent_oldunixname+'\n') From b8cd79f121cea456e3019c9a1021bfb4cb9ef3c5 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 2 Aug 2020 11:55:15 +0200 Subject: [PATCH 87/93] annoying --- wikidot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wikidot.py b/wikidot.py index 926e69d..ea36821 100644 --- a/wikidot.py +++ b/wikidot.py @@ -61,7 +61,7 @@ def maybe_download_file(self, url, file_path): try: req = requests.get(url, stream=True, timeout=30) - except requests.exceptions.ReadTimeout: + except requests.exceptions.RequestException: print('request timed out!') retries += 1 @@ -143,7 +143,7 @@ def queryex(self, params, urlAppend = None): start = timer() try: req = requests.request('POST', url, data=params, cookies=cookies, timeout=30) - except requests.exceptions.ReadTimeout: + except requests.exceptions.RequestException: print('request timed out!') retries += 1 time.sleep(retries * retries * retries) @@ -296,7 +296,7 @@ def get_page_id(self, page_unix_name): while retries < self.max_retries: try: req = requests.request('GET', url, timeout=30) - except requests.exceptions.ReadTimeout: + except requests.exceptions.RequestException: print('request timed out!') retries += 1 time.sleep(retries * retries * retries) From 6be1b90c958e48a18f2a002e30bf35cca09a11c3 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 2 Aug 2020 11:59:51 +0200 Subject: [PATCH 88/93] move code around --- rmaint.py | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/rmaint.py b/rmaint.py index 2edff0c..a7191b9 100644 --- a/rmaint.py +++ b/rmaint.py @@ -445,6 +445,33 @@ def updateChildren(self, oldunixname, newunixname): if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname: self.updateParentField(child, self.last_parents[child], newunixname) + # + # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. + # The rest of the file is preserved. + # + def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): + child_path = self.path+'/'+child_unixname+'.txt' + + ## TODO: find out when this happens + # The child name is gotten from the commit message, so not very reliable + if not os.path.isfile(child_path): + print('Failed to find child file!', child_path) + return + with codecs.open(child_path, "r", "UTF-8") as f: + content = f.readlines() + # Since this is all tracked by us, we KNOW there's a line in standard format somewhere + idx = content.index('parent:'+parent_oldunixname+'\n') + if idx < 0: + raise Exception("Cannot update child page "+child_unixname+": " + +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); + content[idx] = 'parent:'+parent_newunixname+'\n' + with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f: + f.writelines(content) + + # + # Updates the tags field in the file + # Not used (yet) + # def updateTags(self, comment, unixname): file_name = self.path+'/'+unixname+'.txt' removed = [] @@ -487,28 +514,6 @@ def updateTags(self, comment, unixname): with codecs.open(file_name, "w", "UTF-8") as f: f.writelines(content) - # - # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. - # The rest of the file is preserved. - # - def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - child_path = self.path+'/'+child_unixname+'.txt' - - ## TODO: find out when this happens - if not os.path.isfile(child_path): - print('Failed to find child file!', child_path) - return - with codecs.open(child_path, "r", "UTF-8") as f: - content = f.readlines() - # Since this is all tracked by us, we KNOW there's a line in standard format somewhere - idx = content.index('parent:'+parent_oldunixname+'\n') - if idx < 0: - raise Exception("Cannot update child page "+child_unixname+": " - +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); - content[idx] = 'parent:'+parent_newunixname+'\n' - with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f: - f.writelines(content) - # # Finalizes the construction process and deletes any temporary files. From 77490e2fedd0e410b108925220be3eb0949e24af Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 2 Aug 2020 12:00:35 +0200 Subject: [PATCH 89/93] start on forum scraping support --- wikidot.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/wikidot.py b/wikidot.py index ea36821..dd039d7 100644 --- a/wikidot.py +++ b/wikidot.py @@ -396,6 +396,45 @@ def get_revisions(self, page_id, limit): }) return revs + # topics in forum: http://www.scp-wiki.net/forum/c-###/sort/start + # -> div class 'title' + # -> a href= http://www.scp-wiki.net/forum/t-####/foobar (foobar not important) + + # posts in topic http://www.scp-wiki.net/forum/t-####/ + # -> div id 'thread-container' + # -> div class 'post-container' + # -> div class = 'post', id = 'post-####' + # -> div class 'title' + # -> div class 'content' + # -> div class 'post-container' + # -> ... + # -> div class 'post-container' + # -> ... + + #def get_forum_post_revisions(self, post_id): + # res = self.query({ + # 'moduleName': 'forum/sub/ForumPostRevisionsModule', + # 'postId': post_id, + # }) + # revisions = [] + # soup = BeautifulSoup(res, 'html.parser') + # for row in soup.find_all("tr"): + # columns = row.find_all("td") + + # if len(columns) != 3: + # raise Exception('Invalid row in post history for ' + str(post_id)) + + # user = columns[0].find('a').getText() + # time = columns[1].find('span').getText() + # rev_id_js = columns[0].find('a')['href'] + # match = re.search(r'showRevision\(event, ([0-9]+)\)', rev_id_js) + # rev_id = match.group(1) + + # revisions.append({ + # 'id': rev_id, + # 'user': user, + # 'time': time, + # }) # Retrieves revision source for a revision. # There's no raw version because there's nothing else in raw. From b197beb572dcd3c17fe3b8ebd79fbf366dc067d2 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 23 Aug 2020 00:25:45 +0200 Subject: [PATCH 90/93] support for skipping entire pages (for pages that fail for some reason) --- crawl.py | 3 +++ rmaint.py | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/crawl.py b/crawl.py index fe2a9a0..5201eeb 100644 --- a/crawl.py +++ b/crawl.py @@ -28,6 +28,7 @@ parser.add_argument('--depth', type=int, default='10000', help='Query only last N revisions') parser.add_argument('--revids', action='store_true', help='Store last revision ids in the repository', default=True) parser.add_argument('--skip', type=str, help='Skip the specified revision') +parser.add_argument('--skip-pages', type=str, help='Skip the specified pages') parser.add_argument('--cleanup', action='store_true', help='Clean up after downloading repo') # Common settings parser.add_argument('--debug', action='store_true', help='Print debug info') @@ -106,6 +107,8 @@ def force_dirs(path): rm.buildRevisionList([args.page] if args.page else None) rm.openRepo() + if args.skip_pages: + rm.pages_to_skip = args.skip_pages.split(",") if args.skip: rm.revs_to_skip = [args.skip] diff --git a/rmaint.py b/rmaint.py index a7191b9..b9a2287 100644 --- a/rmaint.py +++ b/rmaint.py @@ -51,6 +51,7 @@ def __init__(self, wikidot, path): self.fetched_revids = set() self.revs_to_skip = [] + self.pages_to_skip = [] # @@ -288,6 +289,11 @@ def commitNext(self, rev): print("Skipping", rev) return True + unixname = rev['page_name'] + if unixname in self.pages_to_skip: + print("Skipping", rev) + return True + source = self.wd.get_revision_source(rev['rev_id']) # Page title and unix_name changes are only available through another request: details = self.wd.get_revision_version(rev['rev_id']) @@ -300,7 +306,6 @@ def commitNext(self, rev): outp.write(rev['rev_id']) # rev_ids are unique amongst all pages, and only one page changes in each commit anyway outp.close() - unixname = rev['page_name'] rev_unixname = details['unixname'] # may be different in revision than atm # Unfortunately, there's no exposed way in Wikidot to see page breadcrumbs at any point in history. From 603d1647a830dd5644c3bb11e1523e2ce9e1d740 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 23 Aug 2020 00:26:05 +0200 Subject: [PATCH 91/93] fix support for skipping multiple revisions --- crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl.py b/crawl.py index 5201eeb..68fcf41 100644 --- a/crawl.py +++ b/crawl.py @@ -110,7 +110,7 @@ def force_dirs(path): if args.skip_pages: rm.pages_to_skip = args.skip_pages.split(",") if args.skip: - rm.revs_to_skip = [args.skip] + rm.revs_to_skip = args.skip.split(",") print("Downloading revisions") rm.fetchAll() From 3524f541b73a4d15d82992516b54fcf3cd997e30 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 23 Aug 2020 00:26:32 +0200 Subject: [PATCH 92/93] fix --- rmaint.py | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/rmaint.py b/rmaint.py index b9a2287..1ab383f 100644 --- a/rmaint.py +++ b/rmaint.py @@ -450,33 +450,6 @@ def updateChildren(self, oldunixname, newunixname): if self.last_parents[child] == oldunixname and self.last_parents[child] != newunixname: self.updateParentField(child, self.last_parents[child], newunixname) - # - # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. - # The rest of the file is preserved. - # - def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): - child_path = self.path+'/'+child_unixname+'.txt' - - ## TODO: find out when this happens - # The child name is gotten from the commit message, so not very reliable - if not os.path.isfile(child_path): - print('Failed to find child file!', child_path) - return - with codecs.open(child_path, "r", "UTF-8") as f: - content = f.readlines() - # Since this is all tracked by us, we KNOW there's a line in standard format somewhere - idx = content.index('parent:'+parent_oldunixname+'\n') - if idx < 0: - raise Exception("Cannot update child page "+child_unixname+": " - +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); - content[idx] = 'parent:'+parent_newunixname+'\n' - with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f: - f.writelines(content) - - # - # Updates the tags field in the file - # Not used (yet) - # def updateTags(self, comment, unixname): file_name = self.path+'/'+unixname+'.txt' removed = [] @@ -519,6 +492,26 @@ def updateTags(self, comment, unixname): with codecs.open(file_name, "w", "UTF-8") as f: f.writelines(content) + # + # Processes a page file and updates "parent:..." string to reflect a change in parent's unixname. + # The rest of the file is preserved. + # + def updateParentField(self, child_unixname, parent_oldunixname, parent_newunixname): + child_path = self.path+'/'+child_unixname+'.txt' + if not os.path.isfile(child_path): + print('Failed to find child file!', child_path) + return + with codecs.open(child_path, "r", "UTF-8") as f: + content = f.readlines() + # Since this is all tracked by us, we KNOW there's a line in standard format somewhere + idx = content.index('parent:'+parent_oldunixname+'\n') + if idx < 0: + raise Exception("Cannot update child page "+child_unixname+": " + +"it is expected to have parent set to "+parent_oldunixname+", but there seems to be no such record in it."); + content[idx] = 'parent:'+parent_newunixname+'\n' + with codecs.open(self.path+'/'+child_unixname+'.txt', "w", "UTF-8") as f: + f.writelines(content) + # # Finalizes the construction process and deletes any temporary files. From f23b0ffd56b8561eb00b0e621d95ee2989772389 Mon Sep 17 00:00:00 2001 From: "Martin T. H. Sandsmark" Date: Sun, 23 Aug 2020 00:27:17 +0200 Subject: [PATCH 93/93] fix robustness when downloading images --- wikidot.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/wikidot.py b/wikidot.py index dd039d7..be378ea 100644 --- a/wikidot.py +++ b/wikidot.py @@ -42,8 +42,15 @@ def maybe_download_file(self, url, file_path): #self._wait_request_slot() - dirpath = os.path.dirname(file_path) - os.makedirs(dirpath, exist_ok=True) + try: + dirpath = os.path.dirname(file_path) + os.makedirs(dirpath, exist_ok=True) + except OSError as e: + if e.errno == 36: + print("Path too long", e) + return False + else: + raise # re-raise previously caught exception if self.debug: print(" < downloading", url, "to" ,file_path, "dirpath", dirpath) @@ -62,15 +69,17 @@ def maybe_download_file(self, url, file_path): try: req = requests.get(url, stream=True, timeout=30) except requests.exceptions.RequestException: - print('request timed out!') + print('request exception') retries += 1 time.sleep(retries * retries * retries) # up to ~2 minutes continue + except urllib3.exceptions.ReadTimeoutError: + print('read timeout') - if req.status_code == 404: - self.failed_images.add(url) - return False + retries += 1 + time.sleep(retries * retries * retries) # up to ~2 minutes + continue if req.status_code >= 500: print(' ! 500 error for ' + url + ', retries ' + str(retries) + '/' + str(self.max_retries)) @@ -82,6 +91,10 @@ def maybe_download_file(self, url, file_path): time.sleep(retries * retries * retries) continue + if req.status_code >= 400: + self.failed_images.add(url) + return False + try: # In case of 404 errors or other stuff that indicates # some bug in how we handle or request things @@ -102,11 +115,18 @@ def maybe_download_file(self, url, file_path): print(" - downloaded file size", os.path.getsize(file_path), "in", round(timer() - start, 2)) return True + except OSError as e: + if e.errno == 36: + print("Filename to long", e) + return False + else: + raise # re-raise previously caught exception except Exception as e: print(' ! Failed to download', e, req, url) raise e - raise Exception('Failed too many times for', url) + print('Failed too many times for', url) + return False # To honor usage rules, we wait for self.delay between requests. # Low-level query functions call this before every request to Wikidot./