#!/usr/bin/env python # -*- coding: utf-8 -*- """ LobsterBlog ~~~~~~~~~~~ HTML5 Technical Article Beautifier :Copyright: Copyright (c) 2010 J.A. Roberts Tunney :License: GPL v2 or later :Modified: 2010-08-14 :URL: LobsterBlog is for when plain old HTML is too much work and wordpress/django/etc. is more trouble than it's worth. Requires Python 2.6 or 2.7 ChangeLog --------- - 2011-08-28: Pygments broke support for utf8 strings so we now give it a unicode string. This broke my unicode article for god knows how long and I feel really frustrated and embarassed right now :'( Invocation ---------- :: python lobsterblog.py *.xml Dependencies ------------ :: sudo apt-get install \ texinfo tex-common dvipng \ python-pygments python-lxml python-imaging \ python-matplotlib python-numpy gnuplot-nox \ graphviz python-sphinx dnotify rsync """ from __future__ import with_statement import re import os import sys import time import codecs import cPickle import logging import hashlib import textwrap import StringIO import lxml.etree as etree source_page_template = textwrap.dedent(r"""

Source Code: %(file)s

%(source)s
""").strip() matplotlib_template = textwrap.dedent(ur""" from numpy import * from pylab import * %(source)s savefig(%(imgfile)r, transparent=True) """).strip() latex_template = textwrap.dedent(r""" \documentclass[16px,fleqn]{article} \usepackage[margin=0cm,nohead]{geometry} \usepackage[utf8]{inputenc} \usepackage{amsmath} \setlength{\mathindent}{0cm} \pagestyle{empty} \begin{document} %(source)s \end{document} """).strip() gnuplot_template = textwrap.dedent(ur""" set output "%(imgfile)s" %(source)s """).strip() graphviz_template = textwrap.dedent(ur""" %(source)s """).strip() db_schema = r""" create table if not exists imghash ( filename text not null primary key, -- relative image filename checksum text not null -- md5 of code to make image ); """ html_entities = { '<': '<', '>': '>', '&': '>', } db = None env = {} xmlparser = etree.XMLParser( load_dtd=False, no_network=True, dtd_validation=False, remove_comments=False, remove_blank_text=True) class OhNo(Exception): """Generic class for internal errors """ class chdir(object): """Context manager for temporarily changing current directory """ def __init__(self, path): self.oldpath = os.getcwd() self.newpath = path def __enter__(self): os.chdir(self.newpath) def __exit__(self, type_, value, traceback): os.chdir(self.oldpath) def forkcall(function): """Perform a long operation in the background """ (readfd, writefd) = os.pipe() pread, pwrite = os.fdopen(readfd, 'rb'), os.fdopen(writefd, 'wb') pid = os.fork() if pid: pwrite.close() def waiter(): returncode = os.waitpid(pid, 0)[1] if returncode != 0: raise Exception('child failed', pid, returncode) return cPickle.load(pread) return waiter else: pread.close() # db_connect() result = function() cPickle.dump(result, pwrite) sys.exit(0) def slurp(f, codec='utf8', errors='strict'): """Return full decoded contents of file object or filename """ if hasattr(f, 'read'): res = f.read() if isinstance(res, str): return res.decode(codec) else: return codecs.open(f, 'r', codec, 
errors).read() def slugify(text): """Turn arbitrary strings into identifiers """ slug = text.strip().lower() slug = re.sub('[^\w]+', '_', slug) slug = re.sub('_+$', '', slug) slug = re.sub('^_+', '', slug) return slug def highlight_html_filename(fn): """Generate HTML filename for housing source code of `fn` """ return 'html/' + fn.replace('/', '_') + '.html' def htmlpath(path, base='.', cwd=None, secure=True, absolute=False): r"""Transform filename from HTML document into system filename - Absolute paths are relative to the current working directory. - Relative paths are relative to directory of ``env['file']``. - Does not check if the file exists Examples:: >>> htmlpath('/dog.png') 'dog.png' >>> htmlpath('/media/img/dog.png') 'media/img/dog.png' >>> htmlpath('love.html', base='/var/www') 'love.html' >>> htmlpath('love.html', base='/var/www', absolute=True) '/var/www/love.html' >>> htmlpath('love.html', base='/var/www', cwd='stuff') 'stuff/love.html' >>> htmlpath('love.html', base='/var/www', cwd='/var/www/zomg') 'zomg/love.html' >>> htmlpath('../../etc/passwd', base='/var/www', secure=False) '../../etc/passwd' >>> htmlpath('../../etc/passwd', base='/var/www', secure=True) Traceback (most recent call last): ... ValueError: '../../etc/passwd' resolves outside '/var/www' :param path: Path taken from HTML document. Never blank. :param base: Base folder of website. Defaults to ``os.getcwd()``. Non-absolute paths are relative to ``os.getcwd()``. :param cwd: Defaults to ``os.path.dirname(env['file'])``. Non-absolute paths are relative to :param:`base`. :param secure: Raise exception if result not inside :param:`base`. :param absolute: Return an absolute system path. :return: Path relative to :param:`base`. Never blank. """ from os.path import dirname, relpath, abspath, join if not path: raise ValueError("no path specified") if cwd is None: cwd = dirname(env.get('file', '.')) or '.' 
if path.startswith('/'): ares = abspath(join(base, path[1:])) else: ares = abspath(join(base, cwd, path)) if secure and not ares.startswith(abspath(base)): raise ValueError("%r resolves outside %r" % (path, abspath(base))) return ares if absolute else relpath(ares, base) def guess_language(filename, source=None): r"""Guesses programming language for source code Will return file extension if possible, otherwise relying on shebang or ``-*-lang-*-`` comments. Returns ``None`` if language could not be determined. Examples:: >>> guess_language('FOO.C') 'c' >>> guess_language('lol/Makefile') 'make' >>> guess_language('/usr/share/foo.tar.gz', '-*-perl-*-') 'gz' >>> guess_language('foo', '') is None True >>> guess_language(None, '-*-PERL-*-') 'perl' >>> guess_language(None, '\n;; -*-lisp-mode-*-\n') 'lisp' >>> guess_language(None, '#!/usr/bin/python') 'python' >>> guess_language(None, '#!/usr/bin/python2.6') 'python' >>> guess_language(None, '#!/usr/bin/python/') is None True >>> guess_language(None, '#!/usr/bin/env python\n') 'python' >>> guess_language(None, '#!/usr/bin/env python2.6\n') 'python' >>> guess_language(None, '#!/usr/bin/env python2.6\n# -*-perl-*-\n') 'perl' >>> guess_language(None, '\n;; -*- coding: utf-8 -*-\n') is None True >>> guess_language(None, '\n;; -*-coding:big5-*-\n') is None True >>> guess_language(None, '\n#-*-coding:big5-*-\n#-*-perl-*-') 'perl' """ if filename: if '/' in filename: filename = filename[filename.rindex('/') + 1:] if filename == 'Makefile': return 'make' if filename and '.' 
in filename: exten = filename[filename.rindex('.') + 1:].lower() if exten == 'h': return 'c' else: return exten else: if source is None: if os.path.exists(filename): source = slurp(filename) else: return None for pat in guess_language.pats: mat = pat.search(source) if mat: return mat.group('lang').lower() guess_language.pats = ( re.compile(r'-\*-\s*(?P[a-z]+)[^:\s]*\s*-\*-', re.I | re.M), re.compile(r'^#!/\S+?/env (?P[a-z]+)', re.I | re.M), re.compile(r'^#!/\S+?/(?P[a-z]+)[^/]*(\s|$)', re.I | re.M), ) def get_tag_source(tag): """Returns text directly inside a tag If tag contains a 'src' attribute, the contains of this file will be prepended to the source code contained within the tag (if any.) """ source = '' if 'src' in tag.attrib: filename = htmlpath(tag.attrib['src']) if filename: source += slurp(filename) if tag.text: source += '\n' + textwrap.dedent(tag.text).strip() return source def get_tag_text(tag): def do_tag(tag): if tag.tag == 'img': s = tag.attrib.get('alt') else: s = tag.text if s: yield s for child in tag.getchildren(): for data in do_tag(child): yield data if child.tail: yield child.tail return " ".join(list(do_tag(tag))).strip() def syntax_highlight(root): """Pygmentizes ``
`` blocks
    """
    try:
        import pygments
        import pygments.lexers as pyglex
        import pygments.formatters as pygfmt
    except ImportError, exc:
        logging.warning('syntax highlighting: %r' % (exc))
        return root
    formatter = pygfmt.HtmlFormatter()
    for tag in root.xpath('//pre'):
        env['line'] = tag.sourceline
        info = tag.attrib.get('class')
        if not info:
            continue
        if 'lobsterblog-dedent' in info:
            code = etree.tostring(tag, method="text", encoding="utf8")
            code = code.decode('utf8')
            code = textwrap.dedent(code).strip()
            tag.text = code
        match = re.search(r'lobsterblog-syntax--(?P\S+)', info)
        if match:
            env['lang'] = match.group('lang')
            try:
                lexer = pyglex.get_lexer_by_name(env['lang'])
            except pyglex.ClassNotFound:
                logging.warning(
                    "%(file)s:%(line)d: Could not find Pygments lexer "
                    "for language %(lang)r.  Please see: "
                    "http://pygments.org/docs/lexers/" % env)
            else:
                code = etree.tostring(tag, method="text", encoding="utf8")
                code = code.decode('utf8')
                code = textwrap.dedent(code).strip()
                high = pygments.highlight(code, lexer, formatter)
                for child in tag:
                    tag.remove(child)
                tag.text = ''
                xml = etree.fromstring(high)
                for child in xml.xpath('//pre')[0]:
                    tag.append(child)
    return root


def image_sizes(root):
    """Adds width/height attributes to ``<img>`` if not present

    Every ``<img>`` lacking a ``width`` or a ``height`` attribute has its
    ``src`` resolved with ``htmlpath()`` and opened with PIL; both
    attributes are then set from the image's pixel size.  Images whose
    ``src`` cannot be resolved, does not exist, or cannot be opened are
    logged and skipped.

    :param root: lxml element tree of the article.
    :return: ``root``, modified in place.  Returned untouched (with a
             warning) when PIL is not importable.
    """
    try:
        from PIL import Image
    except ImportError as exc:
        logging.warning('image sizes: %r', exc)
        return root
    for tag in root.xpath('//img'):
        env['line'] = tag.sourceline
        if 'width' in tag.attrib and 'height' in tag.attrib:
            continue
        try:
            path = htmlpath(tag.attrib.get('src'))
            # Explicit check rather than ``assert``, which is compiled
            # away under ``python -O``.
            if not os.path.exists(path):
                raise ValueError("file not found: " + path)
        except Exception as exc:
            logging.warning("%(file)s:%(line)d: bad img src: "
                            % env + str(exc))
            continue
        try:
            f = Image.open(path)
        except Exception as exc:
            logging.warning("PIL.open(%r) failed: %s" % (path, exc))
            continue
        width, height = f.size
        tag.attrib['width'] = str(width)
        tag.attrib['height'] = str(height)
        logging.info("Image %s dimensions are %d x %d"
                     % (path, width, height))
    return root


def latex(root):
    """Replaces ``<latex>`` tags with generated images

    Runs inside ``media/img/tex``.  For the n-th ``<latex>`` tag the source
    is written to ``<slug>.<n>.tex`` and rendered with ``latex`` followed by
    ``dvipng`` into ``<slug>.<n>.png`` whenever
    ``should_regenerate_image()`` says so.  The tag is then replaced by an
    ``<a>`` linking to the highlighted source page and wrapping an ``<img>``
    of the rendered figure.  Attributes on the original tag are copied onto
    the ``<img>``.  If either command fails a warning is logged and the tag
    is left in place.

    :param root: lxml element tree of the article.
    :return: ``root``, modified in place.
    """
    with chdir('media/img/tex'):
        for n, tag in enumerate(root.xpath('//latex'), 1):
            name = "%s.%d" % (env['slug'], n)
            env['line'] = tag.sourceline
            env['source'] = get_tag_source(tag)
            env['srcfile'] = "media/img/tex/%s.tex" % (name)
            env['imgfile'] = "media/img/tex/%s.png" % (name)
            if should_regenerate_image():
                logging.info("%(file)s:%(line)d: LaTeX "
                             "%(srcfile)s -> %(imgfile)s" % env)
                with codecs.open(name + '.tex', 'w', 'utf8') as f:
                    f.write(latex_template % env)
                cmd = "latex %s.tex >/dev/null 2>&1" % (name)
                if os.system(cmd) != 0:
                    # The log only exists if latex actually started.
                    logfile = name + '.log'
                    details = ''
                    if os.path.exists(logfile):
                        details = slurp(logfile)
                    logging.warning("Failed to run: " + cmd + "\n" +
                                    details)
                    continue
                cmd = ("dvipng -bg transparent -o %s.png %s.dvi "
                       ">/dev/null 2>&1") % (name, name)
                if os.system(cmd) != 0:
                    logging.warning("Failed to run: " + cmd)
                    continue
                os.unlink(name + '.log')
                os.unlink(name + '.dvi')
                env['extra_sources'].append(env['srcfile'])
                image_was_regenerated()
            # Build the elements directly; parsing an empty string raises.
            img = etree.Element('img')
            img.attrib['src'] = '/' + env['imgfile']
            img.attrib['alt'] = 'LaTeX Figure #%d' % (n)
            img.attrib['title'] = 'LaTeX Figure #%d' % (n)
            # Attributes written on the tag override the defaults above.
            for k, v in tag.attrib.items():
                img.attrib[k] = v
            a = etree.Element('a')
            a.tail = tag.tail
            a.attrib['href'] = os.path.relpath(
                highlight_html_filename(env['srcfile']), 'html')
            a.append(img)
            parent = tag.getparent()
            parent.replace(tag, a)
        return root


def python(root):
    """Executes ``<python>`` tags, replacing them with captured stdout

    ``<python>`` tags are run with ``exec`` and replaced by whatever they
    printed; ``<py>`` tags are evaluated with ``eval`` and replaced by the
    resulting value.  Output that is not already an element is parsed as
    XML, falling back to wrapping it in an element.  The original tag's
    attributes and tail text are carried over to the replacement.  Tags
    whose code raises, or whose output cannot be turned into an element,
    are logged and left in place.

    SECURITY: article code runs with the full privileges of this process.
    Only process documents you trust.

    :param root: lxml element tree of the article.
    :return: ``root``, modified in place.
    """
    for tag in root.xpath("//*[name()='python' or name()='py']"):
        env['line'] = tag.sourceline
        code = get_tag_source(tag).strip()
        if tag.tag == 'python':
            sys.stdout, oldstdout = StringIO.StringIO(), sys.stdout
            try:
                exec(code)
            except Exception:
                logging.exception("python exec() failed:\n\n%s\n\n"
                                  "output:\n\n%s", code, sys.stdout.getvalue())
                continue
            finally:
                # Runs on success, on failure and on ``continue`` alike.
                output = sys.stdout.getvalue()
                sys.stdout = oldstdout
        elif tag.tag == 'py':
            try:
                output = eval(code)
            except Exception:
                logging.exception("python eval(%r) failed", code)
                continue
        if not etree.iselement(output):
            try:
                try:
                    output = etree.fromstring(output)
                except Exception:
                    # Plain text / non-XML values need a wrapper element.
                    # NOTE(review): the wrapper markup was missing from this
                    # literal; <span> is assumed -- confirm the element.
                    output = etree.fromstring('<span>%s</span>' % (output))
            except Exception as exc:
                logging.error("bad xml output: %s\n\n%s", exc, output)
                continue
        output.tail = tag.tail
        for k, v in tag.attrib.items():
            output.attrib[k] = v
        parent = tag.getparent()
        parent.replace(tag, output)
    return root


def toc(root):
    """Generates a table of contents as a bulleted list.

    This works on a per article basis by scanning ``

`` through ``

`` tags. Headers containing a ``notoc="notoc"`` attribute will be ignored. If header tags do not contain an ``id`` attribute one will be added by slugifying the text within the header. For example::

Open Source Is Happy

Table of Contents

Section 1

Section 2

Section 2.1

Section 2.1.1

Section 2.1.2

Section 2.2

Section 3

Section 4

""" for art in root.xpath('//article'): toctags = list(art.xpath('//toc')) if not toctags: continue toc = etree.fromstring('