Mw2html
From Noah.org
Jump to navigation | Jump to search
This was originally from here. It's not as useful to me now, but I keep it around because I haven't found much else that does the same thing.
#!/usr/bin/env python """mw2html - Mediawiki to static HTML I use this to create a personal website from a local mediawiki installation. No search functionality. Hacks the Monobook skin and the produced HTML. Connelly Barnes 2005. Public domain. """ __version__ = '0.1.0.0' import re import sys import getopt import random import urllib import urllib2 import textwrap import urlparse import os, os.path import errno import sha try: set except: from sets import Set as set try: import htmldata except: print 'Requires Python htmldata module:' print ' http://oregonstate.edu/~barnesc/htmldata/' sys.exit() MOVE_HREF = 'movehref' MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->' INDEX_HTML = 'index.html' url_filename_cache = {} wrote_file_set = set() MONOBOOK_SKIN = 'monobook' # Constant identifier for Monobook. class Config: """ Instances contain all options passed at the command line. """ def __init__(self, rooturl, outdir, flatten=True, lower=True, index=None, clean=True, sidebar=None, hack_skin=True, made_by=True, overwrite=False, footer=None, skin=MONOBOOK_SKIN, move_href=True, remove_png=True, remove_history=True): self.rooturl = rooturl self.outdir = os.path.abspath(outdir) self.flatten = flatten self.lower = lower self.index = index self.clean = clean self.sidebar = sidebar self.hack_skin = hack_skin self.made_by = made_by self.overwrite = overwrite self.footer = footer self.skin = skin self.move_href = move_href if self.sidebar is not None: self.sidebar = os.path.abspath(self.sidebar) if self.footer is not None: self.footer = os.path.abspath(self.footer) self.remove_png = remove_png self.remove_history = remove_history def post_filename_transform(filename, config): """ User-customizable filename transform. Here filename is the full filename in the output directory. Returns modified full filename. """ return filename def monobook_fix_html_sidebar(doc, config): """ Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output. 
""" if config.made_by: doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=') SIDEBAR_ID = 'SIDEBAR' + sha.new(str(random.random())).hexdigest() # Remove sidebar HTML doc = re.sub( r'(<!-- end content -->)[\s\S]+?' + r'(<!-- end of the left \(by default at least\) column -->)', r'\1<div class="visualClear"></div></div></div></div>' + SIDEBAR_ID + r'\2', doc) pre_sidebar = """ <div id="column-one"> <div id="p-cactions" class="portlet"></div> <div class="portlet" id="p-personal"></div> <div class="portlet" id="p-logo"></div> <div class="portlet" id="p-nav"> """ post_sidebar = """ </div> <div id="p-search" class="portlet"></div> <div class="portlet" id="p-tb"></div> </div> <!-- end left column --> """ sidebar_content = '' if config.sidebar != None: f = open(config.sidebar, 'rU') sidebar_content = f.read() f.close() sidebar_content = pre_sidebar + sidebar_content + post_sidebar doc = doc.replace(SIDEBAR_ID, sidebar_content) doc = re.sub( r'<div id="f-poweredbyico">[\s\S]+?(<ul id="f-list">)', r'\1', doc) # Remove edit links doc = re.sub(r'<div class="editsection"[\s\S]+?</div>', r'', doc) # Remove page has been accessed X times list item. doc = re.sub(r'<li id="f-viewcount">[\s\S]+?</li>', r'', doc) # Remove disclaimers list item. doc = re.sub(r'<li id="f-disclaimer">[\s\S]+?</li>', r'', doc) # Replace remaining text with footer, if available. if config.footer is not None: s1 = '<div id="footer">' s2 = '</div>' i1 = doc.index(s1) i2 = doc.index(s2, i1) f = open(config.footer, 'rU') footer_text = f.read() f.close() doc = doc[:i1+len(s1)] + footer_text + doc[i2:] return doc def fix_move_href_tags(doc, config): """ Return copy of doc with all MOVE_HREF tags removed. 
""" while '<' + MOVE_HREF in doc: i1 = doc.index('<' + MOVE_HREF) i2 = doc.index('</' + MOVE_HREF, i1+1) i3 = doc.index('>', i2+1) (start, end) = (i1, i3+1) tags = htmldata.tagextract(doc[start:end]) assert tags[0][0] == MOVE_HREF assert tags[-1][0] == '/' + MOVE_HREF href = tags[0][1].get('href', '') new_tags = [] for tag in tags[1:-1]: if len(tag) == 2: if 'href' in tag[1]: if href == '': continue tag[1]['href'] = href new_tags += [tag] doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:] return doc def html_remove_image_history(doc, config): """ Remove image history and links to information. """ doc = re.sub(r'<h2>Image history</h2>[\s\S]+?</ul>', r'', doc) doc = re.sub(r'<h2>Image links</h2>[\s\S]+?</ul>', r'', doc) return doc def post_html_transform(doc, url, config): """ User-customizable HTML transform. Given an HTML document (with URLs already rewritten), returns modified HTML document. """ if config.hack_skin: if config.skin == MONOBOOK_SKIN: doc = monobook_fix_html_sidebar(doc, config) doc = monobook_hack_skin_html(doc, config) else: raise ValueError('unknown skin') if config.move_href: doc = fix_move_href_tags(doc, config) if config.remove_history: doc = html_remove_image_history(doc, config) return doc def monobook_hack_skin_html(doc, config): """ Hacks Monobook HTML output: use CSS ids for hacked skin. See monobook_hack_skin_css. """ doc = doc.replace('<div id="globalWrapper">', '<div id="globalWrapperHacked">') doc = doc.replace('<div id="footer">', '<div id="footerHacked">') doc = doc.replace('</body>', '<br></body>') return doc def monobook_hack_skin_css(doc, url, config): """ Hacks Mediawiki 1.4beta6 Monobook main CSS file for better looks. Removes flower background. Defines *Hacked CSS ids, so we can add an orange bar at the top, and clear the orange bar right above the footer. """ if not url.endswith('monobook/main.css'): return doc doc = "/* Monobook skin automatically modified by mw2html. 
*/" + doc doc = doc.replace('url("headbg.jpg")', '') doc += """ /* Begin hacks by mw2html */ #globalWrapperHacked { font-size:127%; width: 100%; background-color: White; border-top: 1px solid #fabd23; border-bottom: 1px solid #fabd23; margin: 0.6em 0em 1em 0em; padding: 0em 0em 1.2em 0em; } #footerHacked { background-color: White; margin: 0.6em 0em 0em 0em; padding: 0.4em 0em 0em 0em; text-align: center; font-size: 90%; } #footerHacked li { display: inline; margin: 0 1.3em; } """ c1 = '#column-one { padding-top: 160px; }' c2 = '#column-one { padding-top: 3.0em; }' assert c1 in doc doc = doc.replace(c1, '/* edit by mw2html */\n' + c2 + '\n/* end edit by mw2html */\n') # Remove external link icons. if config.remove_png: doc = re.sub(r'#bodyContent a\[href \^="http://"\][\s\S]+?\}', r'', doc) return doc def post_css_transform(doc, url, config): """ User-customizable CSS transform. Given a CSS document (with URLs already rewritten), returns modified CSS document. """ if config.hack_skin: if config.skin == MONOBOOK_SKIN: doc = monobook_hack_skin_css(doc, url, config) else: raise ValueError('unknown skin') return doc def url_to_filename(url, config): """ Translate a full url to a full filename (in local OS format) under outdir. """ url = split_section(url)[0] if url in url_filename_cache: return url_filename_cache[url] part = url if part.lower().startswith('http://'): part = part[len('http://'):] L = part.strip('/').split('/') L = [urllib.quote_plus(x) for x in L] if len(L) <= 1 or not '.' in L[-1]: # url ends with a directory name. Store it under index.html. L += [INDEX_HTML] # Local filename relative to outdir # (More transformations are made to this below...). subfile = os.sep.join(L) # Fix up extension based on mime type. 
fix_ext = True try: f = urllib2.urlopen(url) except urllib2.URLError, e: fix_ext = False if fix_ext: mimetype = f.info().type.lower().split(' ')[0] # Maps mimetype to file extension MIME_MAP = { 'image/jpeg': 'jpg', 'image/png': 'png', 'image/gif': 'gif', 'image/tiff': 'tiff', 'text/plain': 'txt', 'text/html': 'html', 'text/rtf': 'rtf', 'text/css': 'css', 'text/sgml': 'sgml', 'text/xml': 'xml', 'application/zip': 'zip' } if mimetype in MIME_MAP: (root, ext) = os.path.splitext(subfile) ext = '.' + MIME_MAP[mimetype] subfile = root + ext if config.lower: subfile = subfile.lower() ans = os.path.join(config.outdir, subfile) if config.flatten: ans = flatten_filename(url, config, ans) if config.clean: ans = clean_filename(url, config, ans) if config.index != None: ans = move_to_index_if_needed(config, ans) ans = find_unused_filename(ans, file_exists_in_written_set) ans = post_filename_transform(ans, config) # Cache and return answer. wrote_file_set.add(os.path.normcase(os.path.normpath(ans))) url_filename_cache[url] = ans return ans def file_exists_in_written_set(filename): return os.path.normcase(os.path.normpath(filename)) in wrote_file_set def find_unused_filename(filename, exists=os.path.exists): """ Return 'file' if 'file' doesn't exist, otherwise 'file1', 'file2', etc. Existance is determined by the callable exists(), which takes a filename and returns a boolean. """ if not exists(filename): return filename (head, tail) = os.path.split(filename) i = 1 while True: numbered = (os.path.splitext(tail)[0] + str(i) + os.path.splitext(tail)[1]) fullname = os.path.join(head, numbered) if not exists(fullname): return fullname i += 1 def clean_filename(url, config, ans): # Split outdir and our file/dir under outdir # (Note: ans may not be a valid filename) (par, ans) = (ans[:len(config.outdir)], ans[len(config.outdir):]) if ans.startswith(os.sep): ans = ans[1:] # Replace % escape codes with underscores, dashes with underscores. 
while '%%' in ans: ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%')+2:] while '%25' in ans: ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25')+5:] while '%' in ans: ans = ans[:ans.index('%')] + '_' + ans[ans.index('%')+3:] ans = ans.replace('-', '_') while '__' in ans: ans = ans.replace('__', '_') while '_.' in ans: ans = ans.replace('_.', '.') # Rename math thumbnails if '/math/' in url: tail = os.path.split(ans)[1] if os.path.splitext(tail)[1] == '.png': tail = os.path.splitext(tail)[0] if set(tail) <= set('0123456789abcdef') and len(tail) == 32: ans = 'math_' + sha.new(tail).hexdigest()[:4] + '.png' return os.path.join(par, ans) def move_to_index_if_needed(config, ans): if ans.endswith(config.index): ans = ans[:len(ans)-len(config.index)] + INDEX_HTML return ans def flatten_filename(url, config, filename): def get_fullname(relname): return os.path.join(config.outdir, relname) orig_ext = os.path.splitext(filename)[1] (head, tail) = os.path.split(filename) if tail == INDEX_HTML: (head, tail) = os.path.split(head) ans = tail if os.path.splitext(ans)[1] != orig_ext: ans = os.path.splitext(ans)[0] + orig_ext return os.path.join(config.outdir, ans) def split_section(url): """ Splits into (head, tail), where head contains no '#' and is max length. """ if '#' in url: i = url.index('#') return (url[:i], url[i:]) else: return (url, '') def rewrite_external_url(url, config): """ Rewrite any URL that could not be stored locally. To not rewrite any external URLs, simply return url. """ # If could not be stored locally, but in same domain, return ''. if get_domain(url) == get_domain(config.rooturl): return '' return url def url_to_relative(url, cururl, config): """ Translate a full url to a filename (in URL format) relative to cururl. 
""" cururl = split_section(cururl)[0] (url, section) = split_section(url) L1 = url_to_filename(url, config).replace(os.sep, '/').split('/') L2 = url_to_filename(cururl, config).replace(os.sep, '/').split('/') while L1 != [] and L2 != [] and L1[0] == L2[0]: L1 = L1[1:] L2 = L2[1:] return urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section def parse_css(doc, url, config): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links found in the CSS. """ new_urls = [] L = htmldata.urlextract(doc, url, 'text/css') for item in L: # Store url locally. u = item.url new_urls += [u] item.url = url_to_relative(u, url, config) newdoc = htmldata.urljoin(doc, L) newdoc = post_css_transform(newdoc, url, config) return (newdoc, new_urls) def get_domain(u): """ Get domain of URL. """ ans = urlparse.urlparse(u)[1] if ':' in ans: ans = ans[:ans.index(':')] return ans def should_follow(rooturl, url): """ Returns boolean for whether url should be spidered. Given that 'url' was linked to from site 'rooturl', return whether 'url' should be spidered as well. """ # False if different domains. if get_domain(rooturl) != get_domain(url): return False # False if multiple query fields. if url.count('&') >= 1: return False if 'MediaWiki:' in url or 'Special:' in url: return False return True def parse_html(doc, url, config): """ Returns (modified_doc, new_urls), where new_urls are absolute URLs for all links we want to spider in the HTML. """ BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>' END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>' new_urls = [] # Temporarily "get rid" of comments so htmldata will find the URLs # in the funky "<!--[if" HTML hackery for IE. doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE) doc = doc.replace('-->', END_COMMENT_REPLACE) L = htmldata.urlextract(doc, url, 'text/html') for item in L: u = item.url if should_follow(url, u): # Store url locally. 
new_urls += [u] item.url = url_to_relative(u, url, config) else: item.url = rewrite_external_url(item.url, config) newdoc = htmldata.urljoin(doc, L) newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--') newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->') newdoc = post_html_transform(newdoc, url, config) return (newdoc, new_urls) def run(config, out=sys.stdout): """ Code interface. """ if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'): out.write('Please do not use robots with the Wikipedia site.\n') out.write('Instead, install the Wikipedia database locally and use mw2html on\n') out.write('your local installation. See the Mediawiki site for more information.\n') sys.exit(1) # Number of files saved n = 0 if not config.overwrite and os.path.exists(config.outdir): out.write('Error: Directory exists: ' + str(config.outdir) ) sys.exit(1) complete = set() pending = set([config.rooturl]) while len(pending) > 0: url = pending.pop() if url in complete: continue complete.add(url) try: f = urllib2.urlopen(url) except urllib2.URLError, e: try: out.write(str(e.code) + ': ' + url + '\n\n') except: out.write('Error opening: ' + url + '\n\n') continue doc = f.read() mimetype = f.info().type.lower().split(' ')[0] f.close() new_urls = [] if mimetype == 'text/html': (doc, new_urls) = parse_html(doc, url, config) elif mimetype == 'text/css': (doc, new_urls) = parse_css(doc, url, config) # Enqueue URLs that we haven't yet spidered. for u in new_urls: if u not in complete: # Strip off any #section link. if '#' in u: u = u[:u.index('#')] pending.add(u) mode = ['wb', 'w'][mimetype.startswith('text')] # Save modified content to disk. filename = url_to_filename(url, config) # Make parent directory if it doesn't exist. try: os.makedirs(os.path.split(filename)[0]) except OSError, e: if e.errno != errno.EEXIST: raise # Not really needed since we checked that the directory # outdir didn't exist at the top of run(), but let's double check. 
if os.path.exists(filename) and not config.overwrite: out.write('File already exists: ' + str(filename)) sys.exit(1) f = open(filename, mode) f.write(doc) f.close() out.write(url + '\n => ' + filename + '\n\n') n += 1 out.write(str(n) + ' file(s) saved\n') def usage(): """ Print command line options. """ usage_str = """ mw2html url outdir [options] Converts an entire Mediawiki site into static HTML. Tested only with Mediawiki 1.4beta6 Monobook output. WARNING: This is a recursive robot that ignores robots.txt. Use with care. url - URL of mediawiki page to convert to static HTML. outdir - Output directory. -f, --force - Overwrite existing files in outdir. --no-flatten - Do not flatten directory structure. --no-lower - Retain original case for output filenames and dirs. --no-clean - Do not clean up filenames (clean replaces non-alphanumeric chars with _, renames math thumbs). --no-hack-skin - Do not modify skin CSS and HTML for looks. --no-made-by - Suppress "generated by" comment in HTML source. --no-move-href - Disable <movehref> tag. [1] --no-remove-png - Retain external link PNG icons. --no-remove-history - Retain image history and links to information. -l, --left=a.html - Paste HTML fragment file into left sidebar. -t, --top=a.html - Paste HTML fragment file into top horiz bar. -b, --bottom=a.html - Paste HTML fragment file into footer horiz bar. -i, --index=filename - Move given filename in outdir to index.html. Example Usage: mw2html http://127.0.0.1/mywiki/ out -f -i main_page.html -l sidebar.html Freezes wiki into 'out' directory, moves main_page.html => index.html, assumes sidebar.html is defined in the current directory. [1]. The <movehref> tag. Wiki syntax: <html><movehref href="a"></html>...<html></movehref></html>. When enabled, this tag will cause all href= attributes inside of it to be set to the given location. This is useful for linking images. 
In MediaWiki, for the <html> tag to work, one needs to enable $wgRawHtml and $wgWhitelistEdit in LocalSettings.php. A <movehref> tag with no href field will remove all links inside it. """ print textwrap.dedent(usage_str.strip('\n')) sys.exit(1) def main(): """ Command line interface. """ try: (opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fl:t:b:i:', ['force', 'no-flatten', 'no-lower', 'no-clean', 'no-hack-skin', 'no-made-by', 'left=', 'top=', 'bottom=', 'index=', 'no-move-href', 'no-remove-png', 'no-remove-history']) except getopt.GetoptError: usage() # Parse non-option arguments try: (rooturl, outdir) = args except ValueError: usage() config = Config(rooturl=rooturl, outdir=outdir) # Parse option arguments for (opt, arg) in opts: if opt in ['-f', '--force']: config.overwrite = True if opt in ['--no-flatten']: config.flatten = False if opt in ['--no-lower']: config.lower = False if opt in ['--no-clean']: config.clean = False if opt in ['--no-hack-skin']: config.hack_skin = False if opt in ['--no-made-by']: config.made_by = False if opt in ['--no-move-href']: config.move_href = False if opt in ['--no-remove-png']: config.remove_png = False if opt in ['--no-remove-history']: config.remove_history = False if opt in ['-l', '--left']: config.sidebar = os.path.abspath(arg) if opt in ['-t', '--top']: raise NotImplementedError config.header = os.path.abspath(arg) if opt in ['-b', '--bottom']: config.footer = os.path.abspath(arg) if opt in ['-i', '--index']: config.index = arg # Run program run(config) if __name__ == '__main__': main()