Revision as of 15:17, 25 August 2008 by Root (Talk | contribs)

Jump to: navigation, search

This was originally from here. It's not as useful to me now, but I keep it around because I haven't found much else that does the same thing.

#!/usr/bin/env python
"""mw2html - Mediawiki to static HTML

I use this to create a personal website from a local mediawiki
installation.  No search functionality.  Hacks the Monobook skin and
the produced HTML.

Connelly Barnes 2005.  Public domain.
"""

__version__ = ''

import re
import sys
import getopt
import random
import urllib
import urllib2
import textwrap
import urlparse
import os, os.path
import errno
import sha

  from sets import Set as set

  import htmldata
  print 'Requires Python htmldata module:'
  print ''

# Identifier for the Monobook skin (the default value of Config.skin).
MONOBOOK_SKIN      = 'monobook'

# Name of the pseudo-tag stripped from pages by fix_move_href_tags().
MOVE_HREF          = 'movehref'

# Comment inserted before the <html> tag when config.made_by is set.
MADE_BY_COMMENT    = '<!-- Content generated by Mediawiki and mw2html -->'

# File name used for URLs that end in a directory name.
INDEX_HTML         = 'index.html'

# Memo for url_to_filename(): url (without #section) -> local filename.
url_filename_cache = dict()

# Normalized filenames consulted by file_exists_in_written_set().
wrote_file_set     = set()

class Config:
  """
  Instances contain all options passed at the command line.

  Every constructor argument is stored as an attribute of the same
  name.  outdir is always converted to an absolute path; sidebar and
  footer are converted to absolute paths when they are not None.
  """
  def __init__(self, rooturl, outdir,
               flatten=True, lower=True, index=None, clean=True,
               sidebar=None, hack_skin=True,
               made_by=True, overwrite=False, footer=None,
               skin=MONOBOOK_SKIN, move_href=True,
               remove_png=True, remove_history=True):
    self.rooturl         = rooturl
    self.outdir          = os.path.abspath(outdir)
    self.flatten         = flatten
    self.lower           = lower
    self.index           = index
    self.clean           = clean
    self.sidebar         = sidebar
    self.hack_skin       = hack_skin
    self.made_by         = made_by
    self.overwrite       = overwrite
    self.footer          = footer
    # skin is compared against MONOBOOK_SKIN by the post_*_transform
    # functions, so it must be stored on the instance.
    self.skin            = skin
    self.move_href       = move_href
    if self.sidebar is not None:
      self.sidebar       = os.path.abspath(self.sidebar)
    if self.footer is not None:
      self.footer        = os.path.abspath(self.footer)
    self.remove_png      = remove_png
    self.remove_history  = remove_history

def post_filename_transform(filename, config):
  """
  User-customizable filename transform.

  Here filename is the full filename in the output directory.
  Returns modified full filename.  The default implementation returns
  filename unchanged and ignores config.
  """
  return filename

def _read_html_fragment(filename):
  """Return the full text of the HTML fragment file, closing it afterwards."""
  f = open(filename, 'rU')
  try:
    return f.read()
  finally:
    f.close()

def monobook_fix_html_sidebar(doc, config):
  """
  Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output.

  Replaces the generated left column with the contents of the file
  config.sidebar (an empty sidebar when it is None), strips the
  "powered by" icon, edit links, view counter and disclaimer entries,
  and, when config.footer is set, replaces the footer contents with
  that file.  Also inserts MADE_BY_COMMENT before the <html> tag when
  config.made_by is true.

  Returns the modified document.  Raises ValueError when config.footer
  is set but the document contains no '<div id="footer">' element.
  """
  if config.made_by:
    doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')

  # Random placeholder marking where the new sidebar goes; random so it
  # cannot collide with text that is already in the page.
  SIDEBAR_ID = 'SIDEBAR' + sha.new(str(random.random())).hexdigest()

  # Remove sidebar HTML
  doc = re.sub(
    r'(<!-- end content -->)[\s\S]+?' +
    r'(<!-- end of the left \(by default at least\) column -->)',
    r'\1<div class="visualClear"></div></div></div></div>' + SIDEBAR_ID + r'\2', doc)

  pre_sidebar = """
    <div id="column-one">
      <div id="p-cactions" class="portlet"></div>
      <div class="portlet" id="p-personal"></div>
      <div class="portlet" id="p-logo"></div>
      <div class="portlet" id="p-nav">
    """

  # Closes the two <div> elements (p-nav, column-one) that pre_sidebar
  # leaves open.
  post_sidebar = """
      </div>
      <div id="p-search" class="portlet"></div>
      <div class="portlet" id="p-tb"></div>
    </div>
    <!-- end left column -->
    """

  sidebar_content = ''
  if config.sidebar is not None:
    sidebar_content = _read_html_fragment(config.sidebar)

  sidebar_content = pre_sidebar + sidebar_content + post_sidebar

  doc = doc.replace(SIDEBAR_ID, sidebar_content)

  # Remove the "powered by" icon that precedes the footer list.
  doc = re.sub(
    r'<div id="f-poweredbyico">[\s\S]+?(<ul id="f-list">)',
    r'\1', doc)

  # Remove edit links
  doc = re.sub(r'<div class="editsection"[\s\S]+?</div>', r'', doc)

  # Remove page has been accessed X times list item.
  doc = re.sub(r'<li id="f-viewcount">[\s\S]+?</li>', r'', doc)

  # Remove disclaimers list item.
  doc = re.sub(r'<li id="f-disclaimer">[\s\S]+?</li>', r'', doc)

  # Replace remaining text with footer, if available.
  if config.footer is not None:
    s1 = '<div id="footer">'
    s2 = '</div>'
    i1 = doc.index(s1)
    i2 = doc.index(s2, i1)
    footer_text = _read_html_fragment(config.footer)
    doc = doc[:i1+len(s1)] + footer_text + doc[i2:]

  return doc

def fix_move_href_tags(doc, config):
  """
  Return copy of doc with all MOVE_HREF tags removed.

  Each <movehref href="X">...</movehref> element is replaced by its
  contents, with every href attribute found inside set to X.  When the
  movehref tag has no href (or an empty one), inner tags that carry an
  href attribute are dropped instead of being rewritten.

  config is unused.  Raises ValueError when an opening tag has no
  matching closing tag.
  """
  while '<' + MOVE_HREF in doc:
    # Locate one complete <movehref ...> ... </movehref> span.
    i1 = doc.index('<' + MOVE_HREF)
    i2 = doc.index('</' + MOVE_HREF, i1+1)
    i3 = doc.index('>', i2+1)
    (start, end) = (i1, i3+1)
    # NOTE(review): assumes htmldata.tagextract returns text as strings
    # and tags as (name, attrs) pairs -- confirm against htmldata docs.
    tags = htmldata.tagextract(doc[start:end])
    assert tags[0][0] == MOVE_HREF
    assert tags[-1][0] == '/' + MOVE_HREF
    href = tags[0][1].get('href', '')
    new_tags = []
    for tag in tags[1:-1]:
      if len(tag) == 2:
        if 'href' in tag[1]:
          if href == '':
            # No target given: drop the linking tag entirely.
            continue
          tag[1]['href'] = href
      new_tags += [tag]
    doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
  return doc

def html_remove_image_history(doc, config):
  """
  Remove image history and links to information.

  Deletes everything from an "Image history" or "Image links" <h2>
  heading up to and including the next </ul>.  config is unused.
  """
  doc = re.sub(r'<h2>Image history</h2>[\s\S]+?</ul>', r'', doc)
  doc = re.sub(r'<h2>Image links</h2>[\s\S]+?</ul>', r'', doc)
  return doc

def post_html_transform(doc, url, config):
  """
  User-customizable HTML transform.

  Given an HTML document (with URLs already rewritten), returns
  modified HTML document.

  Applies, in order and each only when enabled in config: the skin
  hacks (hack_skin), removal of movehref tags (move_href) and removal
  of image history sections (remove_history).  url is unused.

  Raises ValueError when hack_skin is set and config.skin is not
  MONOBOOK_SKIN.
  """
  if config.hack_skin:
    if config.skin == MONOBOOK_SKIN:
      doc = monobook_fix_html_sidebar(doc, config)
      doc = monobook_hack_skin_html(doc, config)
    else:
      raise ValueError('unknown skin')
  if config.move_href:
    doc = fix_move_href_tags(doc, config)
  if config.remove_history:
    doc = html_remove_image_history(doc, config)
  return doc

def monobook_hack_skin_html(doc, config):
  """
  Hacks Monobook HTML output: use CSS ids for hacked skin.

  Renames the globalWrapper and footer div ids to their *Hacked
  variants and inserts a <br> before </body>.  config is unused.

  See monobook_hack_skin_css.
  """
  doc = doc.replace('<div id="globalWrapper">', '<div id="globalWrapperHacked">')
  doc = doc.replace('<div id="footer">', '<div id="footerHacked">')
  doc = doc.replace('</body>', '<br></body>')
  return doc

def monobook_hack_skin_css(doc, url, config):
  """
  Hacks Mediawiki 1.4beta6 Monobook main CSS file for better looks.

  Removes flower background.  Defines *Hacked CSS ids, so we can add
  an orange bar at the top, and clear the orange bar right above the
  footer.

  Any url that does not end with 'monobook/main.css' is returned
  unchanged.  When config.remove_png is true the rule for external
  link icons is removed as well.

  Raises AssertionError when the stylesheet does not contain the
  expected '#column-one' padding rule.
  """
  if not url.endswith('monobook/main.css'):
    return doc

  doc = "/* Monobook skin automatically modified by mw2html. */" + doc
  doc = doc.replace('url("headbg.jpg")', '')

  doc += """
    /* Begin hacks by mw2html */

    #globalWrapperHacked {
      width: 100%;
      background-color: White;
      border-top: 1px solid #fabd23;
      border-bottom: 1px solid #fabd23;
      margin: 0.6em 0em 1em 0em;
      padding: 0em 0em 1.2em 0em;
    }

    #footerHacked {
      background-color: White;
      margin: 0.6em 0em 0em 0em;
      padding: 0.4em 0em 0em 0em;
      text-align: center;
      font-size: 90%;
    }

    #footerHacked li {
      display: inline;
      margin: 0 1.3em;
    }
    """

  # Shrink the space the skin reserves above the left column.
  c1 = '#column-one { padding-top: 160px; }'
  c2 = '#column-one { padding-top: 3.0em; }'
  assert c1 in doc, 'unexpected Monobook main.css: missing ' + c1

  doc = doc.replace(c1, '/* edit by mw2html */\n' + c2 +
                        '\n/* end edit by mw2html */\n')

  # Remove external link icons.
  if config.remove_png:
    doc = re.sub(r'#bodyContent a\[href \^="http://"\][\s\S]+?\}', r'', doc)

  return doc

def post_css_transform(doc, url, config):
  """
  User-customizable CSS transform.

  Given a CSS document (with URLs already rewritten), returns
  modified CSS document.

  Only acts when config.hack_skin is set.  Raises ValueError when
  hack_skin is set and config.skin is not MONOBOOK_SKIN.
  """
  if config.hack_skin:
    if config.skin == MONOBOOK_SKIN:
      doc = monobook_hack_skin_css(doc, url, config)
    else:
      raise ValueError('unknown skin')
  return doc

def url_to_filename(url, config):
  """
  Translate a full url to a full filename (in local OS format) under outdir.

  Any #section part of url is ignored.  Results are memoized in
  url_filename_cache, so the same url always maps to the same file.

  The url is fetched once to learn its MIME type, which decides the
  file extension; when the fetch fails the extension is left alone.
  The name is then lower-cased, flattened, cleaned and moved to
  index.html according to config, and finally made unique among the
  names already handed out.
  """
  url = split_section(url)[0]
  if url in url_filename_cache:
    return url_filename_cache[url]

  part = url
  if part.lower().startswith('http://'):
    part = part[len('http://'):]
  L = part.strip('/').split('/')
  L = [urllib.quote_plus(x) for x in L]
  if len(L) <= 1 or not '.' in L[-1]:
    # url ends with a directory name.  Store it under index.html.
    L += [INDEX_HTML]

  # Local filename relative to outdir
  # (More transformations are made to this below...).
  subfile = os.sep.join(L)

  # Fix up extension based on mime type.
  mimetype = None
  try:
    f = urllib2.urlopen(url)
  except urllib2.URLError:
    # Could not fetch the url: keep the extension taken from the url.
    pass
  else:
    try:
      mimetype = f.info().gettype().split(' ')[0]
    finally:
      f.close()

  # Maps mimetype to file extension
  MIME_MAP = {
   'image/jpeg': 'jpg', 'image/png': 'png', 'image/gif': 'gif',
   'image/tiff': 'tiff', 'text/plain': 'txt', 'text/html': 'html',
   'text/rtf': 'rtf', 'text/css': 'css', 'text/sgml': 'sgml',
   'text/xml': 'xml', 'application/zip': 'zip'
  }

  if mimetype in MIME_MAP:
    (root, ext) = os.path.splitext(subfile)
    ext = '.' + MIME_MAP[mimetype]
    subfile = root + ext

  if config.lower:
    subfile = subfile.lower()

  ans = os.path.join(config.outdir, subfile)

  if config.flatten:
    ans = flatten_filename(url, config, ans)

  if config.clean:
    ans = clean_filename(url, config, ans)

  if config.index is not None:
    ans = move_to_index_if_needed(config, ans)

  ans = find_unused_filename(ans, file_exists_in_written_set)

  # Remember the name so that a later url mapping to the same file gets
  # a numbered name instead of overwriting this one.
  wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))

  ans = post_filename_transform(ans, config)

  # Cache and return answer.
  url_filename_cache[url] = ans
  return ans

def file_exists_in_written_set(filename):
  """
  Return whether filename, after path and case normalization, is a
  member of wrote_file_set.
  """
  normalized = os.path.normpath(filename)
  key = os.path.normcase(normalized)
  return key in wrote_file_set

def find_unused_filename(filename, exists=os.path.exists):
  """
  Return 'file' if 'file' doesn't exist, otherwise 'file1', 'file2', etc.

  The number is inserted before the extension ('a.txt' -> 'a1.txt').

  Existence is determined by the callable exists(), which takes
  a filename and returns a boolean.
  """
  if not exists(filename):
    return filename
  (head, tail) = os.path.split(filename)
  (stem, ext) = os.path.splitext(tail)
  i = 1
  while True:
    numbered = stem + str(i) + ext
    fullname = os.path.join(head, numbered)
    if not exists(fullname):
      return fullname
    i += 1

def clean_filename(url, config, ans):
  """
  Return ans (a full filename under config.outdir) with the part below
  outdir cleaned up.

  % escape codes and dashes become underscores, runs of underscores
  are collapsed, and an underscore directly before a dot is dropped.
  PNG files from a '/math/' url whose name is 32 hex digits are renamed
  to 'math_' plus the first four hex digits of the name's SHA digest.
  """
  # Split outdir and our file/dir under outdir
  # (Note: ans may not be a valid filename)
  (par, ans) = (ans[:len(config.outdir)], ans[len(config.outdir):])
  if ans.startswith(os.sep):
    ans = ans[1:]

  # Replace % escape codes with underscores, dashes with underscores.
  # Each entry is (marker, number of characters replaced); order matters
  # because the longer markers must be consumed before the bare '%'.
  for (marker, width) in (('%%', 2), ('%25', 5), ('%', 3)):
    while marker in ans:
      i = ans.index(marker)
      ans = ans[:i] + '_' + ans[i+width:]
  ans = ans.replace('-', '_')
  while '__' in ans:
    ans = ans.replace('__', '_')
  while '_.' in ans:
    ans = ans.replace('_.', '.')

  # Rename math thumbnails
  if '/math/' in url:
    tail = os.path.split(ans)[1]
    if os.path.splitext(tail)[1] == '.png':
      tail = os.path.splitext(tail)[0]
      if set(tail) <= set('0123456789abcdef') and len(tail) == 32:
        ans = 'math_' + sha.new(tail).hexdigest()[:4] + '.png'
  return os.path.join(par, ans)

def move_to_index_if_needed(config, ans):
  """
  Return ans with a trailing config.index replaced by INDEX_HTML.

  The match must cover a whole path component, so with index
  'main_page.html' the file 'foo_main_page.html' is left alone.  An
  empty or None config.index never matches.
  """
  index = config.index
  if not index:
    return ans
  if ans == index or ans.endswith(os.sep + index):
    ans = ans[:len(ans)-len(index)] + INDEX_HTML
  return ans

def flatten_filename(url, config, filename):
  """
  Return filename moved directly into config.outdir.

  Only the last path component is kept; for '<dir>/index.html' the
  directory name is used instead.  The result always carries the
  extension of the original filename.  url is unused.
  """
  orig_ext = os.path.splitext(filename)[1]
  (head, tail) = os.path.split(filename)
  if tail == INDEX_HTML:
    # Name the page after its directory rather than 'index.html'.
    tail = os.path.split(head)[1]
  ans = tail
  if os.path.splitext(ans)[1] != orig_ext:
    ans = os.path.splitext(ans)[0] + orig_ext
  return os.path.join(config.outdir, ans)

def split_section(url):
  """
  Splits into (head, tail), where head contains no '#' and is max length.

  tail is '' when url has no '#'; otherwise it starts with the first '#'.
  """
  if '#' in url:
    i = url.index('#')
    return (url[:i], url[i:])
  else:
    return (url, '')

def rewrite_external_url(url, config):
  """
  Rewrite any URL that could not be stored locally.

  To not rewrite any external URLs, simply return url.

  The default implementation blanks URLs in the same domain as
  config.rooturl and returns every other URL unchanged.
  """
  # If could not be stored locally, but in same domain, return ''.
  if get_domain(url) == get_domain(config.rooturl):
    return ''
  return url

def url_to_relative(url, cururl, config):
  """
  Translate a full url to a filename (in URL format) relative to cururl.

  Both URLs are mapped to local files with url_to_filename(); the
  result is the quoted relative path from cururl's file to url's file,
  with url's #section (if any) appended.
  """
  cururl = split_section(cururl)[0]
  (url, section) = split_section(url)

  L1 = url_to_filename(url,    config).replace(os.sep, '/').split('/')
  L2 = url_to_filename(cururl, config).replace(os.sep, '/').split('/')

  # Drop the leading path components the two files have in common.
  while L1 != [] and L2 != [] and L1[0] == L2[0]:
    L1 = L1[1:]
    L2 = L2[1:]

  # One '../' for each directory left in cururl's path.
  return urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section

def parse_css(doc, url, config):
  """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links found in the CSS.

  Every link in the returned document is rewritten to point at the
  local copy, and post_css_transform() is applied to the result.
  """
  new_urls = []

  # NOTE(review): assumes htmldata.urlextract yields objects with a
  # writable .url attribute holding an absolute URL -- confirm against
  # the htmldata documentation.
  L = htmldata.urlextract(doc, url, 'text/css')
  for item in L:
    # Store url locally.
    u = item.url
    new_urls += [u]
    item.url = url_to_relative(u, url, config)

  newdoc = htmldata.urljoin(doc, L)
  newdoc = post_css_transform(newdoc, url, config)

  return (newdoc, new_urls)

def get_domain(u):
  """
  Get domain of URL.

  Returns the network location of u with any ':port' suffix removed,
  or '' when u has no network location.
  """
  ans = urlparse.urlparse(u)[1]
  if ':' in ans:
    ans = ans[:ans.index(':')]
  return ans

def should_follow(rooturl, url):
  """
  Returns boolean for whether url should be spidered.

  Given that 'url' was linked to from site 'rooturl', return whether
  'url' should be spidered as well.
  """
  # False if different domains.
  if get_domain(rooturl) != get_domain(url):
    return False

  # False if multiple query fields.
  if '&' in url:
    return False

  # Skip pages in the MediaWiki: and Special: namespaces.
  if 'MediaWiki:' in url or 'Special:' in url:
    return False

  return True

def parse_html(doc, url, config):
  """
  Returns (modified_doc, new_urls), where new_urls are absolute URLs for
  all links we want to spider in the HTML.

  Links accepted by should_follow() are rewritten to point at the local
  copy; all other links go through rewrite_external_url().  The result
  is finally passed to post_html_transform().
  """
  # Random markers, so they cannot clash with text already in the page.
  BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
  END_COMMENT_REPLACE   = '<ENDCOMMENT-' + str(random.random()) + '>'

  new_urls = []

  # Temporarily "get rid" of comments so htmldata will find the URLs
  # in the funky "<!--[if" HTML hackery for IE.
  doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
  doc = doc.replace('-->', END_COMMENT_REPLACE)

  # NOTE(review): assumes htmldata.urlextract yields objects with a
  # writable .url attribute holding an absolute URL -- confirm against
  # the htmldata documentation.
  L = htmldata.urlextract(doc, url, 'text/html')
  for item in L:
    u = item.url
    if should_follow(url, u):
      # Store url locally.
      new_urls += [u]
      item.url = url_to_relative(u, url, config)
    else:
      item.url = rewrite_external_url(item.url, config)

  newdoc = htmldata.urljoin(doc, L)
  newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
  newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
  newdoc = post_html_transform(newdoc, url, config)

  return (newdoc, new_urls)

def run(config, out=sys.stdout):
  """
  Code interface.

  Spiders the site starting at config.rooturl and saves every fetched
  document under config.outdir, rewriting HTML and CSS on the way.
  Progress and error messages are written to out.

  Calls sys.exit(1) when the root URL is on wikipedia.org, or when
  config.overwrite is false and the output directory or a target file
  already exists.  URLs that cannot be opened are reported and skipped.
  """
  if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
    out.write('Please do not use robots with the Wikipedia site.\n')
    out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
    out.write('your local installation.  See the Mediawiki site for more information.\n')
    sys.exit(1)

  # Number of files saved
  n = 0

  if not config.overwrite and os.path.exists(config.outdir):
    out.write('Error: Directory exists: ' + str(config.outdir) )
    sys.exit(1)

  complete = set()
  pending  = set([config.rooturl])

  while len(pending) > 0:
    url      = pending.pop()
    if url in complete:
      continue
    complete.add(url)

    try:
      f        = urllib2.urlopen(url)
    except urllib2.URLError:
      err = sys.exc_info()[1]
      # Only HTTP errors carry a status code.
      code = getattr(err, 'code', None)
      if code is not None:
        out.write(str(code) + ': ' + url + '\n\n')
      else:
        out.write('Error opening: ' + url + '\n\n')
      continue
    try:
      doc      = f.read()
      mimetype = f.info().gettype().split(' ')[0]
    finally:
      f.close()

    new_urls = []

    if mimetype == 'text/html':
      (doc, new_urls) = parse_html(doc, url, config)
    elif mimetype == 'text/css':
      (doc, new_urls) = parse_css(doc, url, config)

    # Enqueue URLs that we haven't yet spidered.
    for u in new_urls:
      if u not in complete:
        # Strip off any #section link.
        if '#' in u:
          u = u[:u.index('#')]
        pending.add(u)

    # Text documents are written in text mode, everything else as binary.
    if mimetype.startswith('text'):
      mode = 'w'
    else:
      mode = 'wb'

    # Save modified content to disk.
    filename = url_to_filename(url, config)

    # Make parent directory if it doesn't exist.
    try:
      os.makedirs(os.path.dirname(filename))
    except OSError:
      if sys.exc_info()[1].errno != errno.EEXIST:
        raise

    # Not really needed since we checked that the directory
    # outdir didn't exist at the top of run(), but let's double check.
    if os.path.exists(filename) and not config.overwrite:
      out.write('File already exists: ' + str(filename))
      sys.exit(1)

    f = open(filename, mode)
    try:
      f.write(doc)
    finally:
      f.close()

    out.write(url + '\n => ' + filename + '\n\n')
    n += 1

  out.write(str(n) + ' file(s) saved\n')

def usage():
  """
  Print command line options.
  """
  usage_str = """
  mw2html url outdir [options]

  Converts an entire Mediawiki site into static HTML.
  Tested only with Mediawiki 1.4beta6 Monobook output.
  WARNING: This is a recursive robot that ignores robots.txt.  Use with care.

    url                  - URL of mediawiki page to convert to static HTML.
    outdir               - Output directory.

    -f, --force          - Overwrite existing files in outdir.
    --no-flatten         - Do not flatten directory structure.
    --no-lower           - Retain original case for output filenames and dirs.
    --no-clean           - Do not clean up filenames (clean replaces
                           non-alphanumeric chars with _, renames math thumbs).
    --no-hack-skin       - Do not modify skin CSS and HTML for looks.
    --no-made-by         - Suppress "generated by" comment in HTML source.
    --no-move-href       - Disable <movehref> tag. [1]
    --no-remove-png      - Retain external link PNG icons.
    --no-remove-history  - Retain image history and links to information.
    -l, --left=a.html    - Paste HTML fragment file into left sidebar.
    -t, --top=a.html     - Paste HTML fragment file into top horiz bar.
    -b, --bottom=a.html  - Paste HTML fragment file into footer horiz bar.
    -i, --index=filename - Move given filename in outdir to index.html.

  Example Usage:
    mw2html http://127.0.0.1/mywiki/ out -f -i main_page.html -l sidebar.html

    Freezes wiki into 'out' directory, moves main_page.html => index.html,
    assumes sidebar.html is defined in the current directory.

  [1]. The <movehref> tag.
       Wiki syntax: <html><movehref href="a"></html>...<html></movehref></html>.
       When enabled, this tag will cause all href= attributes inside of it to be
       set to the given location.  This is useful for linking images.
       In MediaWiki, for the <html> tag to work, one needs to enable $wgRawHtml
       and $wgWhitelistEdit in LocalSettings.php.  A <movehref> tag with no href
       field will remove all links inside it.

  """

  # A single parenthesized argument prints the same on Python 2 and 3.
  print(textwrap.dedent(usage_str.strip('\n')))

def main():
  """
  Command line interface.

  Parses sys.argv, builds a Config and calls run().  Prints the usage
  text and exits with status 1 when the options cannot be parsed or
  when url and outdir are not both given.
  """
  try:
    (opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fl:t:b:i:',
                   ['force', 'no-flatten', 'no-lower', 'no-clean',
                    'no-hack-skin', 'no-made-by', 'left=',
                    'top=', 'bottom=', 'index=', 'no-move-href',
                    'no-remove-png', 'no-remove-history'])
  except getopt.GetoptError:
    usage()
    sys.exit(1)

  # Parse non-option arguments
  try:
    (rooturl, outdir) = args
  except ValueError:
    usage()
    sys.exit(1)
  config = Config(rooturl=rooturl, outdir=outdir)

  # Each --no-X switch clears the Config attribute it is mapped to.
  switch_off = {
    '--no-flatten':        'flatten',
    '--no-lower':          'lower',
    '--no-clean':          'clean',
    '--no-hack-skin':      'hack_skin',
    '--no-made-by':        'made_by',
    '--no-move-href':      'move_href',
    '--no-remove-png':     'remove_png',
    '--no-remove-history': 'remove_history',
  }

  # Parse option arguments
  for (opt, arg) in opts:
    if opt in ['-f', '--force']:
      config.overwrite      = True
    elif opt in switch_off:
      setattr(config, switch_off[opt], False)
    elif opt in ['-l', '--left']:
      config.sidebar        = os.path.abspath(arg)
    elif opt in ['-t', '--top']:
      # Accepted by the option parser but not supported yet.
      raise NotImplementedError
    elif opt in ['-b', '--bottom']:
      config.footer         = os.path.abspath(arg)
    elif opt in ['-i', '--index']:
      config.index          = arg

  # Run program
  run(config)

# Run the command line interface only when executed as a script.
if __name__ == '__main__':
  main()