LOBSTERTECHNOLOGIES
CSSPROPAGANDA

Source Code: lobsterblog.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    LobsterBlog
    ~~~~~~~~~~~

    HTML5 Technical Article Beautifier

      :Copyright: Copyright (c) 2010 J.A. Roberts Tunney
      :License: GPL v2 or later
      :Modified: 2010-08-14
      :URL: <http://lobstertech.com/lobsterblog/>

    LobsterBlog is for when plain old HTML is too much work and
    wordpress/django/etc. is more trouble than it's worth.

    Requires Python 2.6 or 2.7

    Invocation
    ----------
    ::
      python lobsterblog.py *.xml

    Dependencies
    ------------
    ::
      sudo apt-get install \
          texinfo tex-common dvipng \
          python-pygments python-lxml python-imaging \
          python-matplotlib python-numpy gnuplot-nox \
          graphviz python-sphinx dnotify rsync

"""

from __future__ import with_statement

import re
import os
import sys
import time
import codecs
import cPickle
import logging
import hashlib
import textwrap
import StringIO

import lxml.etree as etree


# Page skeleton for the highlighted "Source Code" pages written by
# source_to_html().  It is filled with ``%`` formatting from ``env`` and
# needs the ``file`` and ``source`` keys.  The ``<!--# include ... -->``
# comments look like server-side include directives -- TODO confirm which
# web server expands them.
source_page_template = textwrap.dedent(r"""
    <!DOCTYPE html>
    <html lang="en">
      <head>
        <!--# include file="design_head.html" -->
        <meta name="robots" content="noindex"/>
      </head>
      <body>
        <!--# include file="design_top.html" -->
        <article>
          <h1>Source Code: %(file)s</h1>
          <pre class="pig">%(source)s</pre>
        </article>
        <!--# include file="design_footer.html" -->
      </body>
    </html>
""").strip()

# Python program executed for each <matplotlib> tag: the tag's source runs
# after the numpy/pylab star imports, then the figure is saved to
# ``env['imgfile']`` (inserted with %r so it becomes a string literal).
matplotlib_template = textwrap.dedent(ur"""
    from numpy import *
    from pylab import *
    %(source)s
    savefig(%(imgfile)r, transparent=True)
""").strip()

# LaTeX document wrapped around the source of each <latex> tag before it is
# run through ``latex`` and ``dvipng``.
latex_template = textwrap.dedent(r"""
    \documentclass[16px,fleqn]{article}
    \usepackage[margin=0cm,nohead]{geometry}
    \usepackage[utf8]{inputenc}
    \usepackage{amsmath}
    \setlength{\mathindent}{0cm}
    \pagestyle{empty}
    \begin{document}
    %(source)s
    \end{document}
""").strip()

# Gnuplot script for each <gnuplot> tag; only the output file is preset, so
# the tag's own source has to choose the terminal.
gnuplot_template = textwrap.dedent(ur"""
    set output "%(imgfile)s"
    %(source)s
""").strip()

# GraphViz input for each <graphviz> tag: the tag's source, unchanged.
graphviz_template = textwrap.dedent(ur"""
    %(source)s
""").strip()

# Schema of the sqlite cache used by should_regenerate_image() and
# image_was_regenerated() to skip images whose source has not changed.
db_schema = r"""
    create table if not exists imghash (
        filename text not null primary key, -- relative image filename
        checksum text not null              -- md5 of code to make image
    );
"""

# Characters that esc() replaces, mapped to their HTML entities.
# '&' must become '&amp;'; it was previously mapped to '&gt;', which turned
# every ampersand in escaped source code into a greater-than sign.
html_entities = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
}

# Module-wide sqlite connection; stays None until db_connect() opens it.
db = None
# Shared mutable state describing the file/tag currently being processed.
# main() and the tag processors store keys such as 'file', 'slug', 'line',
# 'source', 'srcfile', 'imgfile' and 'extra_sources' in it, and most log
# messages are formatted straight from this dict.
env = {}
# lxml parser used for article XML: no DTD loading or validation, no network
# access, comments kept, ignorable whitespace dropped.
xmlparser = etree.XMLParser(
    load_dtd=False, no_network=True, dtd_validation=False,
    remove_comments=False, remove_blank_text=True)


class OhNo(Exception):
    """Internal LobsterBlog error.

    xml_to_html() catches this exception, logs it together with the
    current file name and abandons that document.
    """


class chdir(object):
    """Context manager for temporarily changing current directory

    The directory to return to is recorded when the ``with`` block is
    entered, not when the object is created, so an instance that is
    created early or used more than once always restores the directory
    that was current just before the block.

    :param path: Directory to switch to for the duration of the block.
    """
    def __init__(self, path):
        self.newpath = path
        # Still set here so the attribute exists before the first
        # ``with``; it is refreshed on every __enter__.
        self.oldpath = os.getcwd()

    def __enter__(self):
        self.oldpath = os.getcwd()
        os.chdir(self.newpath)
        return self

    def __exit__(self, type_, value, traceback):
        # Returns None, so an exception raised inside the block propagates.
        os.chdir(self.oldpath)


def forkcall(function):
    """Perform a long operation in the background

    Forks a child process that calls ``function()`` and sends the pickled
    result back through a pipe.

    :param function: Callable taking no arguments; its result must be
                     picklable.
    :return:         A ``waiter()`` callable.  Calling it blocks until the
                     child has finished and returns the child's result.
                     It raises ``Exception('child failed', pid, status)``
                     when the child exits with a non-zero wait status
                     (for example because ``function`` raised).  Call it
                     only once.
    """
    import traceback
    (readfd, writefd) = os.pipe()
    pread, pwrite = os.fdopen(readfd, 'rb'), os.fdopen(writefd, 'wb')
    # Flush first so the child does not inherit (and later re-emit) output
    # that is still sitting in the parent's buffers.
    sys.stdout.flush()
    sys.stderr.flush()
    pid = os.fork()
    if pid:
        pwrite.close()

        def waiter():
            # Drain the pipe *before* waiting: a result bigger than the
            # pipe buffer would otherwise block the child in write() while
            # the parent blocks in waitpid().
            try:
                payload = pread.read()
            finally:
                pread.close()
            returncode = os.waitpid(pid, 0)[1]
            if returncode != 0:
                raise Exception('child failed', pid, returncode)
            # The payload is produced by our own child, not external input.
            return cPickle.loads(payload)
        return waiter
    else:
        status = 1
        try:
            try:
                pread.close()
                # db_connect()
                cPickle.dump(function(), pwrite)
                pwrite.close()
                sys.stdout.flush()
                status = 0
            except Exception:
                traceback.print_exc()
                sys.stderr.flush()
        finally:
            # Never return into the caller's code in the child, and skip
            # the parent's exit handlers: leave the process immediately.
            os._exit(status)


def slurp(f, codec='utf8', errors='strict'):
    """Return full decoded contents of file object or filename

    :param f:      Object with a ``read()`` method, or a filename.
    :param codec:  Encoding used to decode bytes.
    :param errors: Decoder error policy (``'strict'``, ``'replace'``, ...).
    :return:       Decoded text.  Data that ``read()`` already returns as
                   text is passed through unchanged.
    """
    if hasattr(f, 'read'):
        res = f.read()
        # ``bytes`` is ``str`` on Python 2.6+; anything else is already
        # decoded text and must be returned as is (it used to fall through
        # and yield None).
        if isinstance(res, bytes):
            return res.decode(codec, errors)
        return res
    fp = codecs.open(f, 'r', codec, errors)
    try:
        return fp.read()
    finally:
        fp.close()


def slugify(text):
    """Reduce arbitrary text to a lowercase identifier

    Surrounding whitespace is dropped, every run of non-word characters
    becomes a single underscore, and underscores at either end are removed.
    """
    lowered = text.strip().lower()
    # After this substitution only word characters are left, so trimming
    # underscores from both ends is all the remaining clean-up needed.
    collapsed = re.sub(r'[^\w]+', '_', lowered)
    return collapsed.strip('_')


def highlight_html_filename(fn):
    """Name of the HTML page that shows the source code of `fn`

    Slashes in `fn` are flattened to underscores so that every page lives
    directly inside ``html/``.
    """
    flattened = '_'.join(fn.split('/'))
    return 'html/%s.html' % (flattened,)


def htmlpath(path, base='.', cwd=None, secure=True, absolute=False):
    r"""Transform filename from HTML document into system filename

    - Absolute paths are relative to the current working directory.
    - Relative paths are relative to directory of ``env['file']``.
    - Does not check if the file exists

    Examples::

      >>> htmlpath('/dog.png')
      'dog.png'
      >>> htmlpath('/media/img/dog.png')
      'media/img/dog.png'
      >>> htmlpath('love.html', base='/var/www')
      'love.html'
      >>> htmlpath('love.html', base='/var/www', absolute=True)
      '/var/www/love.html'
      >>> htmlpath('love.html', base='/var/www', cwd='stuff')
      'stuff/love.html'
      >>> htmlpath('love.html', base='/var/www', cwd='/var/www/zomg')
      'zomg/love.html'
      >>> htmlpath('../../etc/passwd', base='/var/www', secure=False)
      '../../etc/passwd'
      >>> htmlpath('../../etc/passwd', base='/var/www', secure=True)
      Traceback (most recent call last):
        ...
      ValueError: '../../etc/passwd' resolves outside '/var/www'
      >>> htmlpath('../www2/x', base='/var/www', cwd='.', secure=True)
      Traceback (most recent call last):
        ...
      ValueError: '../www2/x' resolves outside '/var/www'

    :param path:     Path taken from HTML document.  Never blank.
    :param base:     Base folder of website.  Defaults to ``os.getcwd()``.
                     Non-absolute paths are relative to ``os.getcwd()``.
    :param cwd:      Defaults to ``os.path.dirname(env['file'])``.
                     Non-absolute paths are relative to :param:`base`.
    :param secure:   Raise exception if result not inside :param:`base`.
    :param absolute: Return an absolute system path.
    :return:         Path relative to :param:`base`.  Never blank.
    :raises ValueError: If :param:`path` is blank, or if :param:`secure`
                     is set and the result lies outside :param:`base`.
    """
    from os.path import dirname, relpath, abspath, join
    if not path:
        raise ValueError("no path specified")
    if cwd is None:
        cwd = dirname(env.get('file', '.')) or '.'
    abase = abspath(base)
    if path.startswith('/'):
        ares = abspath(join(base, path[1:]))
    else:
        ares = abspath(join(base, cwd, path))
    if secure:
        # Compare whole path components: a bare startswith() would accept
        # '/var/www2/x' for base '/var/www'.  join(abase, '') appends the
        # separator and also copes with a base of '/'.
        inside = ares == abase or ares.startswith(join(abase, ''))
        if not inside:
            raise ValueError("%r resolves outside %r" % (path, abase))
    return ares if absolute else relpath(ares, base)


def guess_language(filename, source=None):
    r"""Guesses programming language for source code

    Will return file extension if possible, otherwise relying on
    shebang or ``-*-lang-*-`` comments.  Returns ``None`` if language
    could not be determined.

    When `source` is not given and the file name has no extension, the
    file named by `filename` is read (using its full path) if it exists.

    Examples::

      >>> guess_language('FOO.C')
      'c'
      >>> guess_language('lol/Makefile')
      'make'
      >>> guess_language('/usr/share/foo.tar.gz', '-*-perl-*-')
      'gz'
      >>> guess_language('foo', '') is None
      True
      >>> guess_language(None) is None
      True
      >>> guess_language(None, '-*-PERL-*-')
      'perl'
      >>> guess_language(None, '\n;; -*-lisp-mode-*-\n')
      'lisp'
      >>> guess_language(None, '#!/usr/bin/python')
      'python'
      >>> guess_language(None, '#!/usr/bin/python2.6')
      'python'
      >>> guess_language(None, '#!/usr/bin/python/') is None
      True
      >>> guess_language(None, '#!/usr/bin/env python\n')
      'python'
      >>> guess_language(None, '#!/usr/bin/env python2.6\n')
      'python'
      >>> guess_language(None, '#!/usr/bin/env python2.6\n# -*-perl-*-\n')
      'perl'
      >>> guess_language(None, '\n;; -*- coding: utf-8 -*-\n') is None
      True
      >>> guess_language(None, '\n;; -*-coding:big5-*-\n') is None
      True
      >>> guess_language(None, '\n#-*-coding:big5-*-\n#-*-perl-*-')
      'perl'

    :param filename: File name or path, or ``None``.
    :param source:   Source text, or ``None`` to read it from `filename`.
    :return:         Lowercase language name / extension, or ``None``.
    """
    if filename:
        # Only the last path component decides by name; the full path is
        # kept so that the file can still be found on disk further down.
        basename = filename.rsplit('/', 1)[-1]
        if basename == 'Makefile':
            return 'make'
        if '.' in basename:
            exten = basename.rsplit('.', 1)[-1].lower()
            return 'c' if exten == 'h' else exten
    if source is None:
        if not filename or not os.path.exists(filename):
            return None
        source = slurp(filename)
    for pat in guess_language.pats:
        mat = pat.search(source)
        if mat:
            return mat.group('lang').lower()
    return None


# Tried in order: an editor mode line such as ``-*-perl-*-`` (a
# ``coding: ...`` line does not match because of the colon), then a
# ``#!/.../env LANG`` shebang, then a plain ``#!/.../LANG`` shebang.
guess_language.pats = (
    re.compile(r'-\*-\s*(?P<lang>[a-z]+)[^:\s]*\s*-\*-', re.I | re.M),
    re.compile(r'^#!/\S+?/env (?P<lang>[a-z]+)', re.I | re.M),
    re.compile(r'^#!/\S+?/(?P<lang>[a-z]+)[^/]*(\s|$)', re.I | re.M),
)


def get_tag_source(tag):
    """Collect the source code belonging to a tag

    The contents of the file named by a ``src`` attribute (resolved with
    htmlpath()) come first.  The tag's own text, if any, follows after a
    newline, dedented and stripped.
    """
    pieces = []
    if 'src' in tag.attrib:
        filename = htmlpath(tag.attrib['src'])
        if filename:
            pieces.append(slurp(filename))
    if tag.text:
        # The separating newline is emitted even when there was no file.
        pieces.append('\n')
        pieces.append(textwrap.dedent(tag.text).strip())
    return ''.join(pieces)


def get_tag_text(tag):
    """Flatten an element into plain text

    Walks the element depth-first, collecting each element's text (or the
    ``alt`` attribute for ``<img>``) followed by its children and their
    tail text.  The pieces are joined with single spaces and the result is
    stripped.
    """
    pieces = []

    def collect(node):
        if node.tag == 'img':
            own = node.attrib.get('alt')
        else:
            own = node.text
        if own:
            pieces.append(own)
        for kid in node.getchildren():
            collect(kid)
            if kid.tail:
                pieces.append(kid.tail)

    collect(tag)
    return " ".join(pieces).strip()


def syntax_highlight(root):
    """Pygmentizes ``<pre class="pig lobsterblog-syntax--LANG">`` blocks

    Two class markers are understood on ``<pre>`` elements:

    - ``lobsterblog-dedent``: the block is flattened to plain text,
      dedented and stripped.  Child elements are dropped, because their
      text is already part of the flattened text.
    - ``lobsterblog-syntax--LANG``: the block is replaced by the Pygments
      HTML markup for language ``LANG``.

    If Pygments is not installed, or no lexer exists for ``LANG``, a
    warning is logged and the block is left alone.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    try:
        import pygments
        import pygments.lexers as pyglex
        import pygments.formatters as pygfmt
    except ImportError as exc:
        logging.warning('syntax highlighting: %r' % (exc))
        return root
    formatter = pygfmt.HtmlFormatter()

    def plain_text(tag):
        # with_tail=False keeps text that follows </pre> out of the code,
        # and decoding gives lxml/Pygments text instead of UTF-8 bytes
        # (lxml rejects non-ASCII byte strings assigned to ``.text``).
        raw = etree.tostring(tag, method="text", encoding="utf8",
                             with_tail=False)
        return textwrap.dedent(raw.decode('utf8')).strip()

    def drop_children(tag):
        # Iterate over a copy: removing children while iterating the
        # element itself can skip siblings.
        for child in list(tag):
            tag.remove(child)

    for tag in root.xpath('//pre'):
        env['line'] = tag.sourceline
        info = tag.attrib.get('class')
        if not info:
            continue
        if 'lobsterblog-dedent' in info:
            code = plain_text(tag)
            drop_children(tag)
            tag.text = code
        match = re.search(r'lobsterblog-syntax--(?P<lang>\S+)', info)
        if match:
            env['lang'] = match.group('lang')
            try:
                lexer = pyglex.get_lexer_by_name(env['lang'])
            except pyglex.ClassNotFound:
                logging.warning(
                    "%(file)s:%(line)d: Could not find Pygments lexer "
                    "for language %(lang)r.  Please see: "
                    "http://pygments.org/docs/lexers/" % env)
            else:
                high = pygments.highlight(plain_text(tag), lexer, formatter)
                pre = etree.fromstring(high).xpath('//pre')[0]
                drop_children(tag)
                # Tokens without a CSS class are emitted as bare text, so
                # the text directly inside Pygments' <pre> has to be kept.
                tag.text = pre.text or ''
                for child in list(pre):
                    tag.append(child)
    return root


def image_sizes(root):
    """Adds width/height attributes to ``<img/>`` if not present

    Images that already carry both attributes are skipped.  For the rest
    the ``src`` is resolved with htmlpath() and the file is measured with
    PIL.  Problems (PIL missing, bad ``src``, unreadable image) are logged
    as warnings and the tag is left unchanged.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    try:
        from PIL import Image
    except ImportError as exc:
        logging.warning('image sizes: %r' % (exc))
        return root
    for tag in root.xpath('//img'):
        env['line'] = tag.sourceline
        if 'width' in tag.attrib and 'height' in tag.attrib:
            continue
        try:
            path = htmlpath(tag.attrib.get('src'))
            # An explicit check instead of ``assert``, which is compiled
            # away when Python runs with -O.
            if not os.path.exists(path):
                raise IOError("file not found: " + path)
        except Exception as exc:
            logging.warning("%(file)s:%(line)d: bad img src: "
                            % env + str(exc))
            continue
        try:
            width, height = Image.open(path).size
        except Exception as exc:
            logging.warning("PIL.open(%r) failed: %s" % (path, exc))
            continue
        tag.attrib['width'] = str(width)
        tag.attrib['height'] = str(height)
        logging.info("Image %s dimensions are %d x %d"
                     % (path, width, height))
    return root


def _latex_render(name):
    """Build ``media/img/tex/<name>.png`` from ``env['source']``

    Writes ``<name>.tex`` and runs ``latex`` and ``dvipng`` with
    ``media/img/tex`` as the working directory.

    :return: True on success, False (after logging a warning) otherwise.
    """
    with chdir('media/img/tex'):
        f = codecs.open(name + '.tex', 'w', 'utf8')
        try:
            f.write(latex_template % env)
        finally:
            f.close()
        cmd = "latex %s.tex >/dev/null 2>&1" % (name)
        if os.system(cmd) != 0:
            # The log may be missing if latex could not start at all.
            logfile = name + '.log'
            detail = ''
            if os.path.exists(logfile):
                detail = slurp(logfile, errors='replace')
            logging.warning("Failed to run: " + cmd + "\n" + detail)
            return False
        cmd = ("dvipng -bg transparent -o %s.png %s.dvi "
               ">/dev/null 2>&1") % (name, name)
        if os.system(cmd) != 0:
            logging.warning("Failed to run: " + cmd)
            return False
        os.unlink(name + '.log')
        os.unlink(name + '.dvi')
    return True


def latex(root):
    """Replaces ``<latex>`` tags with generated images

    Each ``<latex>`` tag becomes ``<a><img/></a>``: the image is the
    rendered formula and the link points at the highlighted ``.tex``
    source.  The image is rebuilt only when should_regenerate_image()
    says so.  That check and the checksum update run from the site root,
    because ``env['imgfile']`` and the sqlite cache file are relative to
    it; only the latex/dvipng run changes into ``media/img/tex``.

    Attributes of the ``<latex>`` tag are copied to the ``<img>``, except
    ``src``, which names the LaTeX source file rather than the image.
    If rendering fails the tag is left in the document.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    n = 0
    for tag in root.xpath('//latex'):
        n = n + 1
        name = "%s.%d" % (env['slug'], n)
        env['line'] = tag.sourceline
        env['source'] = get_tag_source(tag)
        env['srcfile'] = "media/img/tex/%s.tex" % (name)
        env['imgfile'] = "media/img/tex/%s.png" % (name)
        if should_regenerate_image():
            logging.info("%(file)s:%(line)d: LaTeX "
                         "%(srcfile)s -> %(imgfile)s" % env)
            if not _latex_render(name):
                continue
            env['extra_sources'].append(env['srcfile'])
            image_was_regenerated()
        img = etree.fromstring('<img/>')
        img.attrib['src'] = '/' + env['imgfile']
        img.attrib['alt'] = 'LaTeX Figure #%d' % (n)
        img.attrib['title'] = 'LaTeX Figure #%d' % (n)
        for k, v in tag.attrib.items():
            if k != 'src':
                img.attrib[k] = v
        a = etree.fromstring('<a/>')
        a.tail = tag.tail
        a.attrib['href'] = os.path.relpath(
            highlight_html_filename(env['srcfile']), 'html')
        a.append(img)
        parent = tag.getparent()
        parent.replace(tag, a)
    return root


def python(root):
    """Executes ``<python>`` tags, replacing them with captured stdout

    - ``<python>``: the code is executed and whatever it prints becomes
      the replacement.
    - ``<py>``: the code is evaluated as an expression and its value
      becomes the replacement.

    A replacement that is not already an element is parsed as XML; if
    that fails it is wrapped in ``<span>`` and parsed again.  The tail
    text and the attributes of the original tag are carried over.
    Failures are logged and the tag is left in the document.

    SECURITY: the code comes straight from the document and runs with the
    full privileges of this process, inside this function's namespace.
    Only process documents you trust.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    for tag in root.xpath("//*[name()='python' or name()='py']"):
        env['line'] = tag.sourceline
        code = get_tag_source(tag).strip()
        if tag.tag == 'python':
            sys.stdout, oldstdout = StringIO.StringIO(), sys.stdout
            try:
                exec(code)
            except Exception:
                # Not a bare ``except``: Ctrl-C and sys.exit() must not be
                # swallowed.  stdout is still restored by ``finally``.
                logging.exception("python exec() failed:\n\n%s\n\n"
                                  "output:\n\n%s", code, sys.stdout.getvalue())
                continue
            finally:
                output = sys.stdout.getvalue()
                sys.stdout = oldstdout
        elif tag.tag == 'py':
            try:
                output = eval(code)
            except Exception:
                logging.exception("python eval(%r) failed", code)
                continue
        if not etree.iselement(output):
            try:
                output = etree.fromstring(output)
            except Exception:
                # Not a complete XML document; try it as inline content.
                try:
                    output = etree.fromstring('<span>%s</span>' % (output))
                except Exception as exc:
                    logging.error("bad xml output: %s\n\n%s", exc, output)
                    continue
        output.tail = tag.tail
        for k, v in tag.attrib.items():
            output.attrib[k] = v
        parent = tag.getparent()
        parent.replace(tag, output)
    return root


def toc(root):
    """Generates a table of contents as a bulleted list.

    This works on a per article basis by scanning ``<h2>`` through
    ``<h6>`` tags.  Headers containing a ``notoc="notoc"`` attribute
    will be ignored.  If header tags do not contain an ``id``
    attribute one will be added by slugifying the text within the
    header.

    Only the ``<toc>`` tags and headers *inside* an article are used for
    that article.  Every listed header also gets a pilcrow link to itself.
    Each ``<toc>`` tag is replaced by a ``<div>`` that keeps the tag's
    attributes, children and tail text and receives its own copy of the
    list.

    For example::

      <article>
        <h1>Open Source Is Happy</h1>
        <h2 notoc="notoc">Table of Contents</h2>
        <toc/> <!-- i get replaced with a bulleted list -->
        <h2>Section 1</h2>
        <h2>Section 2</h2>
          <h3>Section 2.1</h3>
            <h4>Section 2.1.1</h4>
            <h4>Section 2.1.2</h4>
          <h3>Section 2.2</h3>
        <h2>Section 3</h2>
        <h2>Section 4</h2>
      </article>

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    import copy
    for art in root.xpath('//article'):
        # The leading '.' matters: a query starting with '//' is evaluated
        # against the whole document even when called on an element.
        toctags = list(art.xpath('.//toc'))
        if not toctags:
            continue
        toc_list = etree.fromstring('<ul/>')
        ul_stack = [toc_list]
        last_li = toc_list
        lev = 2
        for tag in art.xpath(r'''
            .//*[name()='h2' or name()='h3' or name()='h4' or
                 name()='h5' or name()='h6']
            '''):
            env['line'] = tag.sourceline
            if 'notoc' in tag.attrib:
                del tag.attrib['notoc']
            else:
                # Take the text before the pilcrow link is appended.
                text = get_tag_text(tag)
                curlev = int(tag.tag[1])
                if 'id' not in tag.attrib:
                    tag.attrib['id'] = slugify(text)
                pilcrow = etree.fromstring('<a class="pilcrow">¶</a>')
                pilcrow.attrib['href'] = '#' + tag.attrib['id']
                tag.append(pilcrow)
                li = etree.fromstring('<li/>')
                a = etree.fromstring('<a/>')
                a.text = text
                a.attrib['href'] = '#' + tag.attrib['id']
                li.append(a)
                # Going deeper: open one nested list per level.
                while curlev > lev:
                    ul = etree.fromstring('<ul/>')
                    ul_stack.append(ul)
                    last_li.append(ul)
                    last_li = ul
                    lev += 1
                # Coming back up: close nested lists.
                while curlev < lev:
                    ul_stack.pop()
                    if ul_stack[-1].getchildren():
                        last_li = ul_stack[-1].getchildren()[-1]
                    else:
                        last_li = ul_stack[-1]
                    lev -= 1
                ul_stack[-1].append(li)
                last_li = li
        for tag in toctags:
            div = etree.fromstring('<div/>')
            for k, v in tag.attrib.items():
                div.attrib[k] = v
            for child in list(tag):
                div.append(child)
            # Text following the <toc> tag belongs to the replacement.
            div.tail = tag.tail
            # append() moves an element, so every placeholder needs its
            # own copy or only the last one would keep the list.
            div.append(copy.deepcopy(toc_list))
            tag.getparent().replace(tag, div)
    return root


def matplotlib(root):
    """Replaces ``<matplotlib>`` tags with generated images

    Each tag becomes ``<a><img/></a>``: the image is the saved figure and
    the link points at the highlighted Python source.  The figure is
    rebuilt only when should_regenerate_image() says so.

    The plotting code runs in a namespace of its own, so variables it
    defines (``n``, ``name``, ...) cannot overwrite this function's
    bookkeeping, and all open figures are closed first so that one graph
    is not drawn on top of the previous one.  A failing graph is logged
    and its tag is left in the document.

    Attributes of the ``<matplotlib>`` tag are copied to the ``<img>``,
    except ``src``, which names the source file rather than the image.

    SECURITY: the plotting code comes from the document and runs with the
    full privileges of this process.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    try:
        import numpy
        import matplotlib
    except ImportError as exc:
        logging.warning('graph matplotlib: %r' % (exc))
        return root
    n = 0
    for tag in root.xpath('//matplotlib'):
        n = n + 1
        name = "%s.%d" % (env['slug'], n)
        env['line'] = tag.sourceline
        env['source'] = get_tag_source(tag)
        env['srcfile'] = "media/img/graph/%s.py" % (name)
        env['imgfile'] = "media/img/graph/%s.png" % (name)
        if should_regenerate_image():
            logging.info("%(file)s:%(line)d: Matplotlib "
                         "%(srcfile)s -> %(imgfile)s" % env)
            code = matplotlib_template % env
            # Written before running, so a failing script can be inspected.
            f = codecs.open(env['srcfile'], 'w', 'utf8')
            try:
                f.write(code)
            finally:
                f.close()
            try:
                import matplotlib.pyplot as pyplot
                pyplot.close('all')
                exec(code, {'__name__': '__lobsterblog_plot__'})
            except Exception:
                logging.exception(
                    "%(file)s:%(line)d: matplotlib code failed" % env)
                continue
            env['extra_sources'].append(env['srcfile'])
            image_was_regenerated()
        img = etree.fromstring('<img/>')
        img.attrib['src'] = '/' + env['imgfile']
        img.attrib['alt'] = 'Graph #%d' % (n)
        img.attrib['title'] = 'Graph #%d' % (n)
        for k, v in tag.attrib.items():
            if k != 'src':
                img.attrib[k] = v
        a = etree.fromstring('<a/>')
        a.tail = tag.tail
        a.attrib['href'] = '/' + os.path.relpath(
            highlight_html_filename(env['srcfile']), 'html')
        a.append(img)
        parent = tag.getparent()
        parent.replace(tag, a)
    return root


def gnuplot(root):
    """Replaces ``<gnuplot>`` tags with generated images

    Each tag becomes ``<a><img/></a>``: the image is produced by running
    ``gnuplot`` on the generated script and the link points at the
    highlighted script.  The image is rebuilt only when
    should_regenerate_image() says so.  If gnuplot fails, a warning is
    logged and the tag is left in the document.

    Attributes of the ``<gnuplot>`` tag are copied to the ``<img>``,
    except ``src``, which names the script file rather than the image.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    n = 0
    for tag in root.xpath('//gnuplot'):
        n = n + 1
        name = "%s.%d" % (env['slug'], n)
        env['line'] = tag.sourceline
        env['source'] = get_tag_source(tag)
        env['srcfile'] = "media/img/plot/%s.gnuplot" % (name)
        env['imgfile'] = "media/img/plot/%s.png" % (name)
        if should_regenerate_image():
            logging.info(
                "%(file)s:%(line)d: Gnuplot %(srcfile)s -> %(imgfile)s" % env)
            f = codecs.open(env['srcfile'], 'w', 'utf8')
            try:
                f.write(gnuplot_template % env)
            finally:
                f.close()
            # srcfile is built from the slug (word characters only), so it
            # is safe to place on a shell command line.
            cmd = 'gnuplot ' + env['srcfile']
            if os.system(cmd) != 0:
                logging.warning("Failed to run: " + cmd)
                continue
            env['extra_sources'].append(env['srcfile'])
            image_was_regenerated()
        img = etree.fromstring('<img/>')
        img.attrib['src'] = '/' + env['imgfile']
        img.attrib['alt'] = 'Plot #%d' % (n)
        img.attrib['title'] = 'Plot #%d' % (n)
        for k, v in tag.attrib.items():
            if k != 'src':
                img.attrib[k] = v
        a = etree.fromstring('<a/>')
        a.tail = tag.tail
        a.attrib['href'] = '/' + os.path.relpath(
            highlight_html_filename(env['srcfile']), 'html')
        a.append(img)
        parent = tag.getparent()
        parent.replace(tag, a)
    return root


def graphviz(root):
    """Replaces ``<graphviz type="dot">`` tags with generated images

    ``type`` selects the layout program (``dot``, ``circo``, ``neato``,
    ``twopi`` or ``fdp``; default ``dot``).  Each tag becomes
    ``<a><img/></a>``: the image is the rendered graph and the link
    points at the highlighted source.  The image is rebuilt only when
    should_regenerate_image() says so.  An unknown type or a failing
    layout program is logged and the tag is left in the document.

    Attributes of the ``<graphviz>`` tag are copied to the ``<img>``,
    except ``src`` and ``type``, which are instructions for this function
    rather than image attributes.

    :param root: Root element of the document; modified in place.
    :return:     `root`.
    """
    types = ('dot', 'circo', 'neato', 'twopi', 'fdp')
    n = 0
    for tag in root.xpath("//graphviz"):
        n = n + 1
        # Set first, so the message below reports this tag's line.
        env['line'] = tag.sourceline
        program = tag.attrib.get('type', 'dot')
        if program not in types:
            logging.warning(
                "%(file)s:%(line)d: " % env +
                "Bad GraphViz Type %r.  Try: %s" % (program, ', '.join(types)))
            continue
        name = "%s.%d" % (env['slug'], n)
        env['source'] = get_tag_source(tag)
        env['srcfile'] = "media/img/graphviz/%s.dot" % (name)
        env['imgfile'] = "media/img/graphviz/%s.png" % (name)
        if should_regenerate_image():
            logging.info(
                "%(file)s:%(line)d: GraphViz %(srcfile)s -> %(imgfile)s" % env)
            f = codecs.open(env['srcfile'], 'w', 'utf8')
            try:
                f.write(graphviz_template % env)
            finally:
                f.close()
            # program is one of the fixed names above and both file names
            # are built from the slug, so the command line is safe.
            cmd = '%s -Tpng -o %s %s' % (
                program, env['imgfile'], env['srcfile'])
            if os.system(cmd) != 0:
                logging.warning("Failed to run: " + cmd)
                continue
            env['extra_sources'].append(env['srcfile'])
            image_was_regenerated()
        img = etree.fromstring('<img/>')
        img.attrib['src'] = '/' + env['imgfile']
        img.attrib['alt'] = 'Plot #%d' % (n)
        img.attrib['title'] = 'Plot #%d' % (n)
        for k, v in tag.attrib.items():
            if k not in ('src', 'type'):
                img.attrib[k] = v
        a = etree.fromstring('<a/>')
        a.tail = tag.tail
        a.attrib['href'] = '/' + os.path.relpath(
            highlight_html_filename(env['srcfile']), 'html')
        a.append(img)
        parent = tag.getparent()
        parent.replace(tag, a)
    return root


def head_tags(root):
    """Fill in ``<head>`` from the document body

    A document without ``<head>`` is returned untouched.  Otherwise a
    ``<title>`` is taken from the first ``<h1>`` when the head has none,
    and a ``<meta name="description">`` is taken from the first
    ``//article/summary/p``.  Runs of whitespace are collapsed to single
    spaces in both.
    """
    heads = root.xpath('//head')
    if not heads:
        return root
    head = heads[0]

    def squeeze(element):
        return re.sub(r'\s+', ' ', get_tag_text(element)).strip()

    if not root.xpath('//head/title'):
        headings = root.xpath('//h1')
        if headings:
            title = etree.fromstring('<title/>')
            title.text = squeeze(headings[0])
            head.append(title)
    summaries = root.xpath('//article/summary/p')
    if summaries:
        meta = etree.fromstring('<meta name="description"/>')
        meta.attrib['content'] = squeeze(summaries[0])
        head.append(meta)
    # Idea, not implemented:
    # <link rel="canonical" href="http://www.example.com/product.php?item=swedish-fish" />
    return root


def xml_to_html(fin, fout):
    """Transforms certain elements inside HTML5 document

    Parses `fin` as XML, runs every processing step over the tree and
    writes the result to `fout`.  A root ``<html>`` element gets an HTML5
    doctype in front; a root ``<frag>`` element is only a wrapper, so just
    its children are written.

    Nothing is written when the document cannot be parsed or a step
    raises OhNo; the problem is logged instead.

    :param fin:  File object to read the XML document from.
    :param fout: File object receiving the UTF-8 encoded result.
    """
    try:
        root = etree.XML(fin.read(), xmlparser)
    except etree.XMLSyntaxError as exc:
        logging.error("%s: %s" % (env['file'], exc))
        return
    try:
        # Published so that <py> code can reach the tree through xpath().
        env['root'] = root
        # Order matters: code is executed first, generated images exist
        # before they are measured, and the table of contents is built
        # from the final headers.
        root = python(root)
        root = latex(root)
        root = matplotlib(root)
        root = gnuplot(root)
        root = graphviz(root)
        root = image_sizes(root)
        root = syntax_highlight(root)
        root = toc(root)
        root = head_tags(root)
    except OhNo as exc:
        logging.error("%s: %s" % (env['file'], exc))
        return
    if root.tag == 'html':
        fout.write("<!DOCTYPE html>\n")
    tags = root if root.tag == 'frag' else [root]
    for tag in tags:
        fout.write(etree.tostring(tag, pretty_print=True, encoding="utf8"))


def source_to_html(fin, fout):
    """Turns a source code files into HTML5 documents

    The language is guessed from ``env['file']`` and the file contents
    (XML is highlighted as HTML).  When Pygments has no lexer for it, the
    source is shown escaped but unhighlighted.  The page is written to
    `fout` using ``source_page_template``.

    Nothing is written when Pygments is not installed; a warning is
    logged instead.

    :param fin:  File object or filename of the source code.
    :param fout: File object receiving the page.
    """
    try:
        import pygments
        import pygments.lexers as pyglex
        import pygments.formatters as pygfmt
    except ImportError as exc:
        logging.warning('source to html: %r' % (exc))
        return
    data = slurp(fin)
    lang = guess_language(env['file'], data)
    if lang == 'xml':
        lang = 'html'
    lexer = None
    if lang:
        try:
            lexer = pyglex.get_lexer_by_name(lang)
        except pyglex.ClassNotFound:
            pass
    if lexer is None:
        logging.warning("No pygments lexer '%s'" % (lang))
        env['source'] = esc(data)
    else:
        formatter = pygfmt.HtmlFormatter(nobackground=True)
        data = pygments.highlight(data, lexer, formatter)
        sl = []
        frag = etree.fromstring(data.encode('utf8'))
        for pre in frag.xpath('//pre'):
            # Tokens without a CSS class are emitted as bare text, so text
            # directly inside <pre> has to be kept (and re-escaped) too.
            if pre.text:
                sl.append(esc(pre.text))
            # Serialising a child also includes the text that follows it.
            for tag in pre:
                sl.append(etree.tostring(tag, encoding='utf8'))
        env['source'] = "".join(sl)
    fout.write(source_page_template % env)


def db_connect():
    """We store checksums in sqlite to avoid having to constantly
    regenerate a zillion image files.

    Opens ``.lobsterblog.sqlite`` in the current working directory, stores
    the connection in the module-level ``db`` and makes sure the
    ``imghash`` table exists.  A failure to create the table is logged as
    a warning; the connection is kept either way.
    """
    global db
    import sqlite3
    db = sqlite3.connect('.lobsterblog.sqlite')
    try:
        c = db.cursor()
        c.execute(db_schema)
        db.commit()
    except Exception as exc:
        logging.warning("failed to build sqlite schema: %s" % (exc))


def should_regenerate_image():
    """Tell whether the image for the current tag has to be rebuilt

    Reads ``env['imgfile']`` and ``env['source']``.  The image is up to
    date only if the file exists and the sqlite cache holds the MD5 of the
    current source for it.

    :return: True if the image must be (re)generated.  Also True when the
             cache cannot be queried; a warning is logged in that case.
    """
    if not os.path.exists(env['imgfile']):
        return True
    if not db:
        db_connect()
    try:
        source = env['source']
        # md5() needs bytes.  Hashing text directly fails for non-ASCII
        # sources, which made such images regenerate on every run.
        if not isinstance(source, bytes):
            source = source.encode('utf8')
        ck = hashlib.md5(source).hexdigest()
        q = "select count(*) from imghash where filename = ? and checksum = ?"
        hits = db.cursor().execute(q, [env['imgfile'], ck]).fetchone()[0]
        return not hits
    except Exception as exc:
        logging.warning("failed to check db: %s" % (exc))
        return True


def image_was_regenerated():
    """Record that the image for the current tag is now up to date

    Stores the MD5 of ``env['source']`` for ``env['imgfile']`` in the
    sqlite cache, replacing any earlier entry.  Failures are logged as
    warnings and otherwise ignored.
    """
    if not db:
        db_connect()
    try:
        source = env['source']
        # md5() needs bytes.  Hashing text directly fails for non-ASCII
        # sources, so their checksum was never stored.
        if not isinstance(source, bytes):
            source = source.encode('utf8')
        ck = hashlib.md5(source).hexdigest()
        c = db.cursor()
        c.execute("delete from imghash where filename = ?", [env['imgfile']])
        c.execute("insert into imghash values (?, ?)", [env['imgfile'], ck])
        db.commit()
    except Exception as exc:
        logging.warning("failed to update db: %s" % (exc))


def xpath(query, path=None):
    """This function helps ``<py>`` tags extract document content

    For example::

      <py> xpath('//article/summary')[0] </py>
      <py> xpath('//article/summary', '/article.xml')[0] </py>

    :param query: XPath expression.
    :param path:  XML file to query instead of the document currently
                  being processed (``env['root']``).
    :return:      Whatever the XPath query yields.
    """
    if path:
        # NOTE(review): `path` is opened as given; it is not mapped through
        # htmlpath(), so the '/article.xml' in the example above refers to
        # the filesystem root -- confirm whether that is intended.
        f = open(path, 'rb')
        try:
            data = f.read()
        finally:
            f.close()
        root = etree.XML(data, xmlparser)
    else:
        root = env['root']
    return root.xpath(query)


def esc(unsafe):
    """Escape text for inclusion in HTML

    Every character found in ``html_entities`` is replaced by its entity;
    everything else is copied unchanged.  The result is UTF-8 encoded.
    """
    escaped = ''.join([html_entities.get(ch, ch) for ch in unsafe])
    return escaped.encode('utf8')


def main(args):
    """Build the site from the files named on the command line

    1. Creates the output directories.
    2. Converts every ``.xml`` / ``.rss`` file with xml_to_html().
    3. Writes a highlighted "Source Code" page with source_to_html() for
       every input file and for every source file generated in step 2.

    Outputs go to ``html/`` and are rebuilt only when missing or older
    than their input.  When a conversion produces no output (it failed and
    logged the reason), no file is written: the previous page stays in
    place and the conversion is retried on the next run.

    :param args: Input filenames.
    """
    from os.path import relpath, exists, getmtime, isdir

    def outdated():
        return (not exists(env['newfile']) or
                getmtime(env['file']) > getmtime(env['newfile']))

    def build(convert):
        logging.info('Source Code %(file)s -> %(newfile)s' % env)
        ss = StringIO.StringIO()
        fin = open(env['file'])
        try:
            convert(fin, ss)
        finally:
            fin.close()
        html = ss.getvalue()
        if not html:
            return
        fout = open(env['newfile'], 'w')
        try:
            fout.write(html)
        finally:
            fout.close()

    # makedirs() creates the intermediate media/ and media/img/ as well.
    for folder in ('html', 'media/img/tex', 'media/img/plot',
                   'media/img/graph', 'media/img/graphviz'):
        if not isdir(folder):
            os.makedirs(folder)
    files = [relpath(fn) for fn in args]
    env['extra_sources'] = []
    for fn in [fn for fn in files if fn.endswith(('.xml', '.rss'))]:
        env['file'] = fn
        env['slug'] = slugify(fn)
        env['newfile'] = 'html/' + \
            fn.replace('.xml', '.html').replace('/', '_')
        if outdated():
            build(xml_to_html)
    # extra_sources was filled in by the image generators during the loop
    # above.
    for fn in set(files + env['extra_sources']):
        env['file'] = fn
        env['slug'] = slugify(fn)
        env['newfile'] = highlight_html_filename(fn)
        if outdated():
            build(source_to_html)


if __name__ == '__main__':
    # The module's doctests are run on every invocation, before any file
    # is processed.
    import doctest
    doctest.testmod()
    logging.basicConfig(level=logging.DEBUG)
    main(sys.argv[1:])
    # Disabled alternative: keep rebuilding every half second until
    # interrupted with Ctrl-C.
    # try:
    #     while True:
    #         main(sys.argv[1:])
    #         time.sleep(0.5)
    # except KeyboardInterrupt:
    #     pass