Mw2html
From Noah.org
This was originally from here. It's not as useful to me now, but I keep it around because there isn't much else I've found that will do the same thing.
#!/usr/bin/env python
"""mw2html - Mediawiki to static HTML
I use this to create a personal website from a local mediawiki
installation. No search functionality. Hacks the Monobook skin and
the produced HTML.
Connelly Barnes 2005. Public domain.
"""
__version__ = '0.1.0.0'
import re
import sys
import getopt
import random
import urllib
import urllib2
import textwrap
import urlparse
import os, os.path
import errno
import sha
# Very old interpreters have no builtin ``set``; fall back to the sets module.
# Only NameError is caught so that unrelated failures are not hidden.
try:
    set
except NameError:
    from sets import Set as set
# htmldata is a third-party module; without it nothing below can work, so
# report the problem on stderr and exit with a failure status.
try:
    import htmldata
except ImportError:
    sys.stderr.write('Requires Python htmldata module:\n')
    sys.stderr.write(' http://oregonstate.edu/~barnesc/htmldata/\n')
    sys.exit(1)
# Name of the custom wiki tag processed by fix_move_href_tags().
MOVE_HREF = 'movehref'
# HTML comment inserted in front of the <html> tag when config.made_by is set.
MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->'
# File name used for directory-style URLs and for the page chosen as index.
INDEX_HTML = 'index.html'
# Maps url -> output filename; filled and consulted by url_to_filename().
url_filename_cache = {}
# Normalized output filenames already handed out by url_to_filename().
wrote_file_set = set()
MONOBOOK_SKIN = 'monobook' # Constant identifier for Monobook.
class Config:
    """
    Holds every option that can be given on the command line.

    Path-like options (outdir, sidebar, footer) are stored as absolute
    paths; sidebar and footer stay None when they are not supplied.
    All other options are stored exactly as passed.
    """
    def __init__(self, rooturl, outdir,
                 flatten=True, lower=True, index=None, clean=True,
                 sidebar=None, hack_skin=True,
                 made_by=True, overwrite=False, footer=None,
                 skin=MONOBOOK_SKIN, move_href=True,
                 remove_png=True, remove_history=True):
        # Resolve the optional fragment files up front.
        if sidebar is not None:
            sidebar = os.path.abspath(sidebar)
        if footer is not None:
            footer = os.path.abspath(footer)

        # Where to read from and where to write to.
        self.rooturl = rooturl
        self.outdir = os.path.abspath(outdir)

        # Filename handling.
        self.flatten = flatten
        self.lower = lower
        self.index = index
        self.clean = clean
        self.overwrite = overwrite

        # HTML / CSS post-processing.
        self.sidebar = sidebar
        self.footer = footer
        self.hack_skin = hack_skin
        self.skin = skin
        self.made_by = made_by
        self.move_href = move_href
        self.remove_png = remove_png
        self.remove_history = remove_history
def post_filename_transform(filename, config):
    """
    Hook for user-defined filename rewriting.

    Receives the full output filename (inside the output directory) and
    the active configuration, and returns the full filename to use.
    The stock implementation hands the name back untouched.
    """
    transformed = filename
    return transformed
def monobook_fix_html_sidebar(doc, config):
    """
    Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output.

    Steps, in order:
      * optionally insert MADE_BY_COMMENT before the <html> tag,
      * replace the generated left column with the contents of
        config.sidebar (or an empty sidebar when it is None),
      * drop the "powered by" icon, edit-section links, the view counter
        and the disclaimer footer item,
      * optionally replace the footer contents with config.footer.

    Pages that do not contain a '<div id="footer">' block are left without
    a custom footer instead of raising ValueError.

    Returns the modified document.
    """
    if config.made_by:
        doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')

    # Random placeholder that marks where the sidebar goes.  It contains
    # only hex digits, so it is safe inside the regex replacement template.
    SIDEBAR_ID = 'SIDEBAR' + '%040x' % random.getrandbits(160)

    # Remove sidebar HTML
    doc = re.sub(
        r'(<!-- end content -->)[\s\S]+?' +
        r'(<!-- end of the left \(by default at least\) column -->)',
        r'\1<div class="visualClear"></div></div></div></div>' + SIDEBAR_ID + r'\2', doc)

    pre_sidebar = """
<div id="column-one">
<div id="p-cactions" class="portlet"></div>
<div class="portlet" id="p-personal"></div>
<div class="portlet" id="p-logo"></div>
<div class="portlet" id="p-nav">
"""
    post_sidebar = """
</div>
<div id="p-search" class="portlet"></div>
<div class="portlet" id="p-tb"></div>
</div>
<!-- end left column -->
"""

    sidebar_content = ''
    if config.sidebar is not None:
        f = open(config.sidebar, 'rU')
        try:
            sidebar_content = f.read()
        finally:
            # Close the handle even when reading fails.
            f.close()
    sidebar_content = pre_sidebar + sidebar_content + post_sidebar
    doc = doc.replace(SIDEBAR_ID, sidebar_content)

    # Remove the "powered by" icon that precedes the footer list.
    doc = re.sub(
        r'<div id="f-poweredbyico">[\s\S]+?(<ul id="f-list">)',
        r'\1', doc)
    # Remove edit links
    doc = re.sub(r'<div class="editsection"[\s\S]+?</div>', r'', doc)
    # Remove page has been accessed X times list item.
    doc = re.sub(r'<li id="f-viewcount">[\s\S]+?</li>', r'', doc)
    # Remove disclaimers list item.
    doc = re.sub(r'<li id="f-disclaimer">[\s\S]+?</li>', r'', doc)

    # Replace remaining text with footer, if available.
    if config.footer is not None:
        s1 = '<div id="footer">'
        s2 = '</div>'
        i1 = doc.find(s1)
        i2 = -1
        if i1 != -1:
            i2 = doc.find(s2, i1)
        # Only touch the page when both footer delimiters are present.
        if i1 != -1 and i2 != -1:
            f = open(config.footer, 'rU')
            try:
                footer_text = f.read()
            finally:
                f.close()
            doc = doc[:i1+len(s1)] + footer_text + doc[i2:]
    return doc
def fix_move_href_tags(doc, config):
    """
    Strip every MOVE_HREF element from doc and return the result.

    For each <movehref ...>...</movehref> span the enclosed tags are
    re-emitted without the wrapper.  Enclosed tags that carry an href
    attribute get it overwritten with the wrapper's href; when the wrapper
    has no href (or an empty one) those tags are dropped instead.

    Presumably htmldata.tagextract() yields 2-item (name, attrs) entries
    for tags and plain strings for text -- verify against htmldata.
    """
    opener = '<' + MOVE_HREF
    closer = '</' + MOVE_HREF
    while opener in doc:
        # Locate the first wrapper: from its '<' to the '>' ending its close tag.
        start = doc.index(opener)
        close_at = doc.index(closer, start + 1)
        end = doc.index('>', close_at + 1) + 1

        tags = htmldata.tagextract(doc[start:end])
        assert tags[0][0] == MOVE_HREF
        assert tags[-1][0] == '/' + MOVE_HREF
        href = tags[0][1].get('href', '')

        kept = []
        for tag in tags[1:-1]:
            if len(tag) == 2 and 'href' in tag[1]:
                if href == '':
                    # No target given: remove the link tag altogether.
                    continue
                tag[1]['href'] = href
            kept.append(tag)

        doc = doc[:start] + htmldata.tagjoin(kept) + doc[end:]
    return doc
def html_remove_image_history(doc, config):
    """
    Drop the "Image history" and "Image links" sections from doc.

    Each section is removed from its <h2> heading up to and including the
    first following </ul>.  Returns the modified document.
    """
    for heading in ('Image history', 'Image links'):
        pattern = r'<h2>' + heading + r'</h2>[\s\S]+?</ul>'
        doc = re.sub(pattern, r'', doc)
    return doc
def post_html_transform(doc, url, config):
    """
    Hook for user-defined HTML rewriting.

    Takes an HTML document whose URLs were already rewritten and returns
    the modified document.  Depending on config it applies the Monobook
    skin hacks, resolves MOVE_HREF tags and strips image history.
    Raises ValueError when skin hacking is requested for an unknown skin.
    """
    if config.hack_skin:
        if config.skin != MONOBOOK_SKIN:
            raise ValueError('unknown skin')
        doc = monobook_fix_html_sidebar(doc, config)
        doc = monobook_hack_skin_html(doc, config)

    if config.move_href:
        doc = fix_move_href_tags(doc, config)

    if config.remove_history:
        doc = html_remove_image_history(doc, config)

    return doc
def monobook_hack_skin_html(doc, config):
    """
    Point Monobook HTML at the hacked CSS ids.

    Renames the globalWrapper and footer divs to their *Hacked
    counterparts and adds a <br> before </body>.
    See monobook_hack_skin_css.
    """
    substitutions = (
        ('<div id="globalWrapper">', '<div id="globalWrapperHacked">'),
        ('<div id="footer">', '<div id="footerHacked">'),
        ('</body>', '<br></body>'),
    )
    for (old, new) in substitutions:
        doc = doc.replace(old, new)
    return doc
def monobook_hack_skin_css(doc, url, config):
    """
    Hacks Mediawiki 1.4beta6 Monobook main CSS file for better looks.

    Removes flower background. Defines *Hacked CSS ids, so we can add
    an orange bar at the top, and clear the orange bar right above the
    footer.

    Only the stylesheet whose url ends with 'monobook/main.css' is
    modified; any other document is returned unchanged.  When the
    expected '#column-one' padding rule is missing, that single edit is
    skipped rather than aborting the whole run.
    """
    if not url.endswith('monobook/main.css'):
        return doc

    doc = "/* Monobook skin automatically modified by mw2html. */" + doc
    doc = doc.replace('url("headbg.jpg")', '')

    doc += """
/* Begin hacks by mw2html */
#globalWrapperHacked {
font-size:127%;
width: 100%;
background-color: White;
border-top: 1px solid #fabd23;
border-bottom: 1px solid #fabd23;
margin: 0.6em 0em 1em 0em;
padding: 0em 0em 1.2em 0em;
}
#footerHacked {
background-color: White;
margin: 0.6em 0em 0em 0em;
padding: 0.4em 0em 0em 0em;
text-align: center;
font-size: 90%;
}
#footerHacked li {
display: inline;
margin: 0 1.3em;
}
"""

    # Shrink the space that was reserved above the left column.
    c1 = '#column-one { padding-top: 160px; }'
    c2 = '#column-one { padding-top: 3.0em; }'
    if c1 in doc:
        doc = doc.replace(c1, '/* edit by mw2html */\n' + c2 +
                          '\n/* end edit by mw2html */\n')

    # Remove external link icons.
    if config.remove_png:
        doc = re.sub(r'#bodyContent a\[href \^="http://"\][\s\S]+?\}', r'', doc)

    return doc
def post_css_transform(doc, url, config):
    """
    Hook for user-defined CSS rewriting.

    Takes a CSS document whose URLs were already rewritten and returns
    the modified document.  With skin hacking enabled the Monobook CSS
    hack is applied; any other skin raises ValueError.
    """
    if not config.hack_skin:
        return doc
    if config.skin != MONOBOOK_SKIN:
        raise ValueError('unknown skin')
    return monobook_hack_skin_css(doc, url, config)
def url_to_filename(url, config):
    """
    Translate a full url to a full filename (in local OS format) under outdir.

    Any '#section' suffix is ignored.  Results are memoized in
    url_filename_cache, and each returned name is recorded in
    wrote_file_set so that two different urls never map to the same file.

    The url is fetched once to learn its mime type, which decides the
    file extension; when the fetch fails with URLError the extension
    derived from the url is kept.
    """
    url = split_section(url)[0]
    if url in url_filename_cache:
        return url_filename_cache[url]

    # Drop the scheme so the host becomes the first path component.
    part = url
    for scheme in ('http://', 'https://'):
        if part.lower().startswith(scheme):
            part = part[len(scheme):]
            break

    L = [urllib.quote_plus(x) for x in part.strip('/').split('/')]
    if len(L) <= 1 or '.' not in L[-1]:
        # url ends with a directory name. Store it under index.html.
        L.append(INDEX_HTML)

    # Local filename relative to outdir
    # (More transformations are made to this below...).
    subfile = os.sep.join(L)

    # Maps mimetype to file extension
    MIME_MAP = {
        'image/jpeg': 'jpg', 'image/png': 'png', 'image/gif': 'gif',
        'image/tiff': 'tiff', 'text/plain': 'txt', 'text/html': 'html',
        'text/rtf': 'rtf', 'text/css': 'css', 'text/sgml': 'sgml',
        'text/xml': 'xml', 'application/zip': 'zip'
    }

    # Fix up extension based on mime type.
    try:
        f = urllib2.urlopen(url)
    except urllib2.URLError:
        mimetype = None
    else:
        try:
            mimetype = f.info().type.lower().split(' ')[0]
        finally:
            # Only the headers are needed; release the connection.
            f.close()
    if mimetype in MIME_MAP:
        root = os.path.splitext(subfile)[0]
        subfile = root + '.' + MIME_MAP[mimetype]

    if config.lower:
        subfile = subfile.lower()

    ans = os.path.join(config.outdir, subfile)

    if config.flatten:
        ans = flatten_filename(url, config, ans)
    if config.clean:
        ans = clean_filename(url, config, ans)
    if config.index is not None:
        ans = move_to_index_if_needed(config, ans)

    ans = find_unused_filename(ans, file_exists_in_written_set)
    ans = post_filename_transform(ans, config)

    # Cache and return answer.
    wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
    url_filename_cache[url] = ans
    return ans
def file_exists_in_written_set(filename):
    """
    Return whether filename, after path and case normalization, is
    already recorded in wrote_file_set.
    """
    key = os.path.normcase(os.path.normpath(filename))
    return key in wrote_file_set
def find_unused_filename(filename, exists=os.path.exists):
    """
    Return 'file' if 'file' doesn't exist, otherwise 'file1', 'file2', etc.

    The number is inserted between the base name and the extension.
    Existence is determined by the callable exists(), which takes
    a filename and returns a boolean.
    """
    if not exists(filename):
        return filename

    (folder, name) = os.path.split(filename)
    (stem, suffix) = os.path.splitext(name)

    counter = 1
    candidate = os.path.join(folder, stem + str(counter) + suffix)
    while exists(candidate):
        counter += 1
        candidate = os.path.join(folder, stem + str(counter) + suffix)
    return candidate
def clean_filename(url, config, ans):
    """
    Tidy up the part of filename ans that lies below config.outdir.

    '%' escape codes and dashes become underscores, runs of underscores
    are collapsed, and an underscore right before a dot is dropped.
    For urls containing '/math/', a .png whose base name is 32 lowercase
    hex digits is renamed to 'math_' plus four hex digits of a SHA digest.

    Returns the cleaned full filename.
    """
    def squash(text, marker, width):
        # Replace each occurrence of marker, together with the characters
        # that follow it up to width in total, by a single underscore.
        pos = text.find(marker)
        while pos != -1:
            text = text[:pos] + '_' + text[pos + width:]
            pos = text.find(marker)
        return text

    # Split outdir and our file/dir under outdir
    # (Note: ans may not be a valid filename)
    prefix_len = len(config.outdir)
    par = ans[:prefix_len]
    ans = ans[prefix_len:]
    if ans.startswith(os.sep):
        ans = ans[1:]

    # Replace % escape codes with underscores, dashes with underscores.
    ans = squash(ans, '%%', 2)
    ans = squash(ans, '%25', 5)
    ans = squash(ans, '%', 3)
    ans = ans.replace('-', '_')
    while '__' in ans:
        ans = ans.replace('__', '_')
    while '_.' in ans:
        ans = ans.replace('_.', '.')

    # Rename math thumbnails
    if '/math/' in url:
        (stem, ext) = os.path.splitext(os.path.split(ans)[1])
        if ext == '.png':
            if set(stem) <= set('0123456789abcdef') and len(stem) == 32:
                ans = 'math_' + sha.new(stem).hexdigest()[:4] + '.png'

    return os.path.join(par, ans)
def move_to_index_if_needed(config, ans):
    """
    Rename the page chosen with the index option to INDEX_HTML.

    ans is a full output filename.  It is rewritten only when it names
    config.index itself -- the whole string, or everything after a path
    separator -- so files whose names merely end with the index name
    (e.g. 'not_main_page.html' for index 'main_page.html') are left alone.

    Returns the possibly rewritten filename.
    """
    index = config.index
    if ans == index or ans.endswith(os.sep + index):
        ans = ans[:len(ans) - len(index)] + INDEX_HTML
    return ans
def flatten_filename(url, config, filename):
    """
    Collapse filename into a single file placed directly in config.outdir.

    The last path component is kept; when that component is INDEX_HTML
    the name of its parent directory is used instead, with the original
    extension re-attached.  The url argument is not used.

    Returns the flattened full filename.
    """
    orig_ext = os.path.splitext(filename)[1]
    (head, tail) = os.path.split(filename)
    if tail == INDEX_HTML:
        # Directory-style page: name the file after the directory.
        tail = os.path.split(head)[1]
    (stem, ext) = os.path.splitext(tail)
    if ext != orig_ext:
        tail = stem + orig_ext
    return os.path.join(config.outdir, tail)
def split_section(url):
    """
    Split url at its first '#'.

    Returns (head, tail): head is everything before the first '#', and
    tail is the rest including the '#' (empty when url has no '#').
    """
    cut = url.find('#')
    if cut < 0:
        return (url, '')
    return (url[:cut], url[cut:])
def rewrite_external_url(url, config):
    """
    Rewrite a URL that was not stored locally.

    A URL in the same domain as config.rooturl becomes the empty string;
    any other URL is returned unchanged.  To leave all such URLs alone,
    simply return url.
    """
    same_site = get_domain(url) == get_domain(config.rooturl)
    if same_site:
        # Could not be stored locally, but lives on our own site.
        return ''
    return url
def url_to_relative(url, cururl, config):
    """
    Express the local file of url as a URL-style path relative to the
    local file of cururl.

    The '#section' part of cururl is ignored; the one of url is appended
    unchanged to the quoted relative path.
    """
    cururl = split_section(cururl)[0]
    (url, section) = split_section(url)

    target = url_to_filename(url, config).replace(os.sep, '/').split('/')
    current = url_to_filename(cururl, config).replace(os.sep, '/').split('/')

    # Count the leading path components shared by both files.
    shared = 0
    limit = min(len(target), len(current))
    while shared < limit and target[shared] == current[shared]:
        shared += 1

    # One '../' per remaining directory level of the current file.
    climbs = len(current) - shared - 1
    relative = '../' * climbs + '/'.join(target[shared:])
    return urllib.quote(relative) + section
def parse_css(doc, url, config):
    """
    Rewrite the links of a CSS document to local relative paths.

    Returns (modified_doc, new_urls), where new_urls holds the URL of
    every link htmldata.urlextract() found in the CSS, as it was before
    rewriting.
    """
    found = htmldata.urlextract(doc, url, 'text/css')
    new_urls = []
    for entry in found:
        # Store url locally.
        absolute = entry.url
        new_urls.append(absolute)
        entry.url = url_to_relative(absolute, url, config)

    rewritten = htmldata.urljoin(doc, found)
    rewritten = post_css_transform(rewritten, url, config)
    return (rewritten, new_urls)
def get_domain(u):
    """
    Return the network location of URL u, cut at the first ':' (so a
    port number, if present, is removed).
    """
    netloc = urlparse.urlparse(u)[1]
    return netloc.split(':', 1)[0]
def should_follow(rooturl, url):
    """
    Decide whether url, linked to from site rooturl, should be spidered.

    Returns False for a url in another domain, a url containing '&',
    or a url that mentions the 'MediaWiki:' or 'Special:' namespaces;
    True otherwise.
    """
    # Stay on the same domain.
    if get_domain(rooturl) != get_domain(url):
        return False
    # Skip urls with multiple query fields.
    if '&' in url:
        return False
    for namespace in ('MediaWiki:', 'Special:'):
        if namespace in url:
            return False
    return True
def parse_html(doc, url, config):
    """
    Rewrite the links of an HTML document.

    Links accepted by should_follow() are pointed at their local relative
    path and collected; all other links go through rewrite_external_url().

    Returns (modified_doc, new_urls), where new_urls are the URLs of the
    links we want to spider, as they were before rewriting.
    """
    open_token = '<BEGINCOMMENT-' + str(random.random()) + '>'
    close_token = '<ENDCOMMENT-' + str(random.random()) + '>'

    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', open_token).replace('-->', close_token)

    found = htmldata.urlextract(doc, url, 'text/html')
    new_urls = []
    for entry in found:
        target = entry.url
        if not should_follow(url, target):
            entry.url = rewrite_external_url(entry.url, config)
            continue
        # Store url locally.
        new_urls.append(target)
        entry.url = url_to_relative(target, url, config)

    rewritten = htmldata.urljoin(doc, found)
    # Put the comment delimiters back.
    rewritten = rewritten.replace(open_token, '<!--')
    rewritten = rewritten.replace(close_token, '-->')
    rewritten = post_html_transform(rewritten, url, config)
    return (rewritten, new_urls)
def run(config, out=sys.stdout):
    """
    Code interface.

    Spiders the site starting at config.rooturl and writes every fetched
    document below config.outdir.  HTML and CSS documents are rewritten
    via parse_html() / parse_css(); everything else is stored as fetched.
    Progress and error messages are written to out.

    Exits with status 1 when the root url is on wikipedia.org, when
    outdir already exists and overwriting is off, or when a target file
    already exists and overwriting is off.  URLs that cannot be opened
    are reported and skipped.
    """
    if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
        out.write('Please do not use robots with the Wikipedia site.\n')
        out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
        out.write('your local installation. See the Mediawiki site for more information.\n')
        sys.exit(1)

    if not config.overwrite and os.path.exists(config.outdir):
        out.write('Error: Directory exists: ' + str(config.outdir) + '\n')
        sys.exit(1)

    # Number of files saved
    n = 0
    complete = set()
    pending = set([config.rooturl])

    while pending:
        url = pending.pop()
        if url in complete:
            continue
        complete.add(url)

        try:
            f = urllib2.urlopen(url)
        except urllib2.URLError as e:
            # Only HTTP errors carry a status code.
            code = getattr(e, 'code', None)
            if code is not None:
                out.write(str(code) + ': ' + url + '\n\n')
            else:
                out.write('Error opening: ' + url + '\n\n')
            continue

        try:
            doc = f.read()
            mimetype = f.info().type.lower().split(' ')[0]
        finally:
            # Release the connection even when reading fails.
            f.close()

        new_urls = []
        if mimetype == 'text/html':
            (doc, new_urls) = parse_html(doc, url, config)
        elif mimetype == 'text/css':
            (doc, new_urls) = parse_css(doc, url, config)

        # Enqueue URLs that we haven't yet spidered.
        for u in new_urls:
            # Strip off any #section link before checking for duplicates.
            u = split_section(u)[0]
            if u not in complete:
                pending.add(u)

        # Text documents are written in text mode, everything else as binary.
        if mimetype.startswith('text'):
            mode = 'w'
        else:
            mode = 'wb'

        # Save modified content to disk.
        filename = url_to_filename(url, config)

        # Make parent directory if it doesn't exist.
        try:
            os.makedirs(os.path.split(filename)[0])
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        # Not really needed since we checked that the directory
        # outdir didn't exist at the top of run(), but let's double check.
        if os.path.exists(filename) and not config.overwrite:
            out.write('File already exists: ' + str(filename) + '\n')
            sys.exit(1)

        with open(filename, mode) as outfile:
            outfile.write(doc)

        out.write(url + '\n => ' + filename + '\n\n')
        n += 1

    out.write(str(n) + ' file(s) saved\n')
def usage():
    """
    Write the command line help to standard output, then exit with
    status 1.
    """
    usage_str = """
mw2html url outdir [options]
Converts an entire Mediawiki site into static HTML.
Tested only with Mediawiki 1.4beta6 Monobook output.
WARNING: This is a recursive robot that ignores robots.txt. Use with care.
url - URL of mediawiki page to convert to static HTML.
outdir - Output directory.
-f, --force - Overwrite existing files in outdir.
--no-flatten - Do not flatten directory structure.
--no-lower - Retain original case for output filenames and dirs.
--no-clean - Do not clean up filenames (clean replaces
non-alphanumeric chars with _, renames math thumbs).
--no-hack-skin - Do not modify skin CSS and HTML for looks.
--no-made-by - Suppress "generated by" comment in HTML source.
--no-move-href - Disable <movehref> tag. [1]
--no-remove-png - Retain external link PNG icons.
--no-remove-history - Retain image history and links to information.
-l, --left=a.html - Paste HTML fragment file into left sidebar.
-t, --top=a.html - Paste HTML fragment file into top horiz bar.
-b, --bottom=a.html - Paste HTML fragment file into footer horiz bar.
-i, --index=filename - Move given filename in outdir to index.html.
Example Usage:
mw2html http://127.0.0.1/mywiki/ out -f -i main_page.html -l sidebar.html
Freezes wiki into 'out' directory, moves main_page.html => index.html,
assumes sidebar.html is defined in the current directory.
[1]. The <movehref> tag.
Wiki syntax: <html><movehref href="a"></html>...<html></movehref></html>.
When enabled, this tag will cause all href= attributes inside of it to be
set to the given location. This is useful for linking images.
In MediaWiki, for the <html> tag to work, one needs to enable $wgRawHtml
and $wgWhitelistEdit in LocalSettings.php. A <movehref> tag with no href
field will remove all links inside it.
"""
    help_text = textwrap.dedent(usage_str.strip('\n'))
    sys.stdout.write(help_text + '\n')
    sys.exit(1)
def main():
    """
    Command line interface.

    Parses sys.argv, builds a Config from the two positional arguments
    (url, outdir) and the options, then calls run().  Invalid options or
    a wrong number of positional arguments print the usage text and exit.
    The -t/--top option is accepted by the parser but raises
    NotImplementedError.
    """
    try:
        (opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fl:t:b:i:',
            ['force', 'no-flatten', 'no-lower', 'no-clean',
             'no-hack-skin', 'no-made-by', 'left=',
             'top=', 'bottom=', 'index=', 'no-move-href',
             'no-remove-png', 'no-remove-history'])
    except getopt.GetoptError:
        usage()

    # Parse non-option arguments
    try:
        (rooturl, outdir) = args
    except ValueError:
        usage()
    config = Config(rooturl=rooturl, outdir=outdir)

    # Options that simply switch one Config attribute off.
    switches = {
        '--no-flatten': 'flatten',
        '--no-lower': 'lower',
        '--no-clean': 'clean',
        '--no-hack-skin': 'hack_skin',
        '--no-made-by': 'made_by',
        '--no-move-href': 'move_href',
        '--no-remove-png': 'remove_png',
        '--no-remove-history': 'remove_history',
    }

    # Parse option arguments
    for (opt, arg) in opts:
        if opt in switches:
            setattr(config, switches[opt], False)
        elif opt in ('-f', '--force'):
            config.overwrite = True
        elif opt in ('-l', '--left'):
            config.sidebar = os.path.abspath(arg)
        elif opt in ('-t', '--top'):
            # Documented in the usage text, but not supported yet.
            raise NotImplementedError('-t/--top is not implemented')
        elif opt in ('-b', '--bottom'):
            config.footer = os.path.abspath(arg)
        elif opt in ('-i', '--index'):
            config.index = arg

    # Run program
    run(config)


if __name__ == '__main__':
    main()