ezt.py

#!/usr/bin/env python
"""ezt.py -- EaZy Templating

For documentation, please see: http://code.google.com/p/ezt/wiki/Syntax
"""
#
# Copyright (C) 2001-2011 Greg Stein. All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
#
# This software is maintained by Greg and is available at:
#    http://code.google.com/p/ezt/
#

__author__ = 'Greg Stein'
__version__ = '1.0'
__license__ = 'BSD'

import re
from types import IntType, FloatType, LongType
import os
import urllib
import StringIO

#
# Formatting types
#
FORMAT_RAW = 'raw'
FORMAT_HTML = 'html'
FORMAT_XML = 'xml'
FORMAT_JS = 'js'
FORMAT_URL = 'url'

#
# This regular expression matches four alternatives:
#   expr: NEWLINE | DIRECTIVE | BRACKET | COMMENT
#   DIRECTIVE: '[' ITEM (whitespace ARG)* ']
#   ITEM: STRING | NAME
#   ARG: STRING | NAME | NUMBER
#   STRING: '"' (not-slash-or-dquote | '\' anychar)* '"'
#   NAME: (alpha | '_') (alphanum | '_' | '-' | '.')*
#   NUMBER: digit+
#   BRACKET: '[[]'
#   COMMENT: '[#' not-rbracket* ']'
#
# Note: the above BNR is a bit loose around ITEM/ARG/NAME/NUMBER. The
#       important point is that the first value in a directive must
#       start with '_' or an alpha character (no digits). This greatly
#       helps to avoid simple errors like '[0]' in templates.
#
# When used with the split() method, the return value will be composed of
# non-matching text and the three paren groups (NEWLINE, DIRECTIVE and
# BRACKET). Since the COMMENT matches are not placed into a group, they are
# considered a "splitting" value and simply dropped.
#
_item = r'(?:"(?:[^\\"]|\\.)*"|[A-Za-z_][-\w.]*)'
_arg = r'(?:"(?:[^\\"]|\\.)*"|[-\w.]+)'
_re_parse = re.compile(r'(\r?\n)|\[(%s(?: +%s)*)\]|(\[\[\])|\[#[^\]]*\]' %
                       (_item, _arg))

_re_args = re.compile(r'"(?:[^\\"]|\\.)*"|[-\w.]+')

# block commands and their argument counts
_block_cmd_specs = { 'if-index':2, 'for':1, 'is':2, 'define':1, 'format':1 }
_block_cmds = _block_cmd_specs.keys()

# two regular expressions for compressing whitespace. the first is used to
# compress any whitespace including a newline into a single newline. the
# second regex is used to compress runs of whitespace into a single space.
_re_newline = re.compile('[ \t\r\f\v]*\n\\s*')
_re_whitespace = re.compile(r'\s\s+')

# this regex is used to substitute arguments into a value. we split the value,
# replace the relevant pieces, and then put it all back together. splitting
# will produce a list of: TEXT ( splitter TEXT )*. splitter will be '%' or
# an integer.
_re_subst = re.compile('%(%|[0-9]+)')

class Template:

  def __init__(self, fname=None, compress_whitespace=1,
               base_format=FORMAT_RAW):
    self.compress_whitespace = compress_whitespace
    if fname:
      self.parse_file(fname, base_format)

  def parse_file(self, fname, base_format=FORMAT_RAW):
    "fname -> a string object with pathname of file containg an EZT template."

    self.parse(_FileReader(fname), base_format)

  def parse(self, text_or_reader, base_format=FORMAT_RAW):
    """Parse the template specified by text_or_reader.

    The argument should be a string containing the template, or it should
    specify a subclass of ezt.Reader which can read templates. The base
    format for printing values is given by base_format.
    """
    if not isinstance(text_or_reader, Reader):
      # assume the argument is a plain text string
      text_or_reader = _TextReader(text_or_reader)

    self.program = self._parse(text_or_reader,
                               base_printer=_parse_format(base_format))

  def generate(self, fp, data):
    if hasattr(data, '__getitem__') or callable(getattr(data, 'keys', None)):
      # a dictionary-like object was passed. convert it to an
      # attribute-based object.
      class _data_ob:
        def __init__(self, d):
          vars(self).update(d)
      data = _data_ob(data)

    ctx = _context()
    ctx.data = data
    ctx.for_index = { }
    ctx.defines = { }
    self._execute(self.program, fp, ctx)

  def _parse(self, reader, for_names=None, file_args=(), base_printer=None):
    """text -> string object containing the template.

    This is a private helper function doing the real work for method parse.
    It returns the parsed template as a 'program'.  This program is a sequence
    made out of strings or (function, argument) 2-tuples.

    Note: comment directives [# ...] are automatically dropped by _re_parse.
    """

    filename = reader.filename()
    # parse the template program into: (TEXT NEWLINE DIRECTIVE BRACKET)* TEXT
    parts = _re_parse.split(reader.text)

    program = [ ]
    stack = [ ]
    if not for_names:
      for_names = [ ]

    if base_printer is None:
      base_printer = ()
    printers = [ base_printer ]

    one_newline_copied = False
    line_number = 1
    for i in range(len(parts)):
      piece = parts[i]
      which = i % 4  # discriminate between: TEXT NEWLINE DIRECTIVE BRACKET
      if which == 0:
        # TEXT. append if non-empty.
        if piece:
          if self.compress_whitespace:
            piece = _re_whitespace.sub(' ', piece)
          program.append(piece)
          one_newline_copied = False
      elif which == 1:
        # NEWLINE. append unless compress_whitespace requested
        if piece:
          line_number += 1
          if self.compress_whitespace:
            if not one_newline_copied:
              program.append('\n')
              one_newline_copied = True
          else:
            program.append(piece)
      elif which == 3:
        # BRACKET directive. append '[' if present.
        if piece:
          program.append('[')
          one_newline_copied = False
      elif piece:
        # DIRECTIVE is present.
        one_newline_copied = False
        args = _re_args.findall(piece)
        cmd = args[0]
        if cmd == 'else':
          if len(args) > 1:
            raise ArgCountSyntaxError(str(args[1:]), filename, line_number)
          ### check: don't allow for 'for' cmd
          idx = stack[-1][1]
          true_section = program[idx:]
          del program[idx:]
          stack[-1][3] = true_section
        elif cmd == 'end':
          if len(args) > 1:
            raise ArgCountSyntaxError(str(args[1:]), filename, line_number)
          # note: true-section may be None
          try:
            cmd, idx, args, true_section, start_line_number = stack.pop()
          except IndexError:
            raise UnmatchedEndError(None, filename, line_number)
          else_section = program[idx:]
          if cmd == 'format':
            printers.pop()
          else:
            func = getattr(self, '_cmd_' + re.sub('-', '_', cmd))
            program[idx:] = [ (func, (args, true_section, else_section),
                               filename, line_number) ]
            if cmd == 'for':
              for_names.pop()
        elif cmd in _block_cmds:
          if len(args) > _block_cmd_specs[cmd] + 1:
            raise ArgCountSyntaxError(str(args[1:]), filename, line_number)
          ### this assumes arg1 is always a ref unless cmd is 'define'
          if cmd != 'define':
            args[1] = _prepare_ref(args[1], for_names, file_args)

          # handle arg2 for the 'is' command
          if cmd == 'is':
            args[2] = _prepare_ref(args[2], for_names, file_args)
          elif cmd == 'for':
            for_names.append(args[1][0])  # append the refname
          elif cmd == 'format':
            if args[1][0]:
              raise BadFormatConstantError(str(args[1:]), filename, line_number)
            printers.append(_parse_format(args[1][1]))

          # remember the cmd, current pos, args, and a section placeholder
          stack.append([cmd, len(program), args[1:], None, line_number])
        elif cmd == 'include' or cmd == 'insertfile':
          is_insertfile = (cmd == 'insertfile')
          # extra arguments are meaningless when using insertfile
          if is_insertfile and len(args) != 2:
            raise ArgCountSyntaxError(str(args), filename, line_number)
          if args[1][0] == '"':
            include_filename = args[1][1:-1]
            if is_insertfile:
              program.append(reader.read_other(include_filename).text)
            else:
              f_args = [ ]
              for arg in args[2:]:
                f_args.append(_prepare_ref(arg, for_names, file_args))
              program.extend(self._parse(reader.read_other(include_filename),
                                         for_names, f_args, printers[-1]))
          else:
            if len(args) != 2:
              raise ArgCountSyntaxError(str(args), filename, line_number)
            if is_insertfile:
              cmd = self._cmd_insertfile
            else:
              cmd = self._cmd_include
            program.append((cmd,
                            (_prepare_ref(args[1], for_names, file_args),
                             reader, printers[-1]), filename, line_number))
        elif cmd == 'if-any':
          f_args = [ ]
          for arg in args[1:]:
            f_args.append(_prepare_ref(arg, for_names, file_args))
          stack.append(['if-any', len(program), f_args, None, line_number])
        else:
          # implied PRINT command
          if len(args) > 1:
            f_args = [ ]
            for arg in args:
              f_args.append(_prepare_ref(arg, for_names, file_args))
            program.append((self._cmd_subst,
                            (printers[-1], f_args[0], f_args[1:]),
                            filename, line_number))
          else:
            valref = _prepare_ref(args[0], for_names, file_args)
            program.append((self._cmd_print, (printers[-1], valref),
                            filename, line_number))

    if stack:
      raise UnclosedBlocksError('Block opened at line %s' % stack[-1][4],
                                filename=filename)
    return program

  def _execute(self, program, fp, ctx):
    """This private helper function takes a 'program' sequence as created
    by the method '_parse' and executes it step by step.  strings are written
    to the file object 'fp' and functions are called.
    """
    for step in program:
      if isinstance(step, basestring):
        fp.write(step)
      else:
        method, method_args, filename, line_number = step
        method(method_args, fp, ctx, filename, line_number)

  def _cmd_print(self, (transforms, valref), fp, ctx, filename, line_number):
    value = _get_value(valref, ctx, filename, line_number)
    # if the value has a 'read' attribute, then it is a stream: copy it
    if hasattr(value, 'read'):
      while 1:
        chunk = value.read(16384)
        if not chunk:
          break
        for t in transforms:
          chunk = t(chunk)
        fp.write(chunk)
    else:
      for t in transforms:
        value = t(value)
      fp.write(value)

  def _cmd_subst(self, (transforms, valref, args), fp, ctx, filename,
                 line_number):
    fmt = _get_value(valref, ctx, filename, line_number)
    parts = _re_subst.split(fmt)
    for i in range(len(parts)):
      piece = parts[i]
      if i%2 == 1 and piece != '%':
        idx = int(piece)
        if idx < len(args):
          piece = _get_value(args[idx], ctx, filename, line_number)
        else:
          piece = '<undef>'
      for t in transforms:
        piece = t(piece)
      fp.write(piece)

  def _cmd_include(self, (valref, reader, printer), fp, ctx, filename,
                   line_number):
    fname = _get_value(valref, ctx, filename, line_number)
    ### note: we don't have the set of for_names to pass into this parse.
    ### I don't think there is anything to do but document it
    self._execute(self._parse(reader.read_other(fname), base_printer=printer),
                  fp, ctx)

  def _cmd_insertfile(self, (valref, reader, printer), fp, ctx, filename,
                      line_number):
    fname = _get_value(valref, ctx, filename, line_number)
    fp.write(reader.read_other(fname).text)

  def _cmd_if_any(self, args, fp, ctx, filename, line_number):
    "If any value is a non-empty string or non-empty list, then T else F."
    (valrefs, t_section, f_section) = args
    value = 0
    for valref in valrefs:
      if _get_value(valref, ctx, filename, line_number):
        value = 1
        break
    self._do_if(value, t_section, f_section, fp, ctx)

  def _cmd_if_index(self, args, fp, ctx, filename, line_number):
    ((valref, value), t_section, f_section) = args
    list, idx = ctx.for_index[valref[0]]
    if value == 'even':
      value = idx % 2 == 0
    elif value == 'odd':
      value = idx % 2 == 1
    elif value == 'first':
      value = idx == 0
    elif value == 'last':
      value = idx == len(list)-1
    else:
      value = idx == int(value)
    self._do_if(value, t_section, f_section, fp, ctx)

  def _cmd_is(self, args, fp, ctx, filename, line_number):
    ((left_ref, right_ref), t_section, f_section) = args
    right_value = _get_value(right_ref, ctx, filename, line_number)
    left_value = _get_value(left_ref, ctx, filename, line_number)
    value = left_value.lower() == right_value.lower()
    self._do_if(value, t_section, f_section, fp, ctx)

  def _do_if(self, value, t_section, f_section, fp, ctx):
    if t_section is None:
      t_section = f_section
      f_section = None
    if value:
      section = t_section
    else:
      section = f_section
    if section is not None:
      self._execute(section, fp, ctx)

  def _cmd_for(self, args, fp, ctx, filename, line_number):
    ((valref,), unused, section) = args
    list = _get_value(valref, ctx, filename, line_number)
    refname = valref[0]
    if isinstance(list, basestring):
      raise NeedSequenceError(refname, filename, line_number)
    ctx.for_index[refname] = idx = [ list, 0 ]
    for item in list:
      self._execute(section, fp, ctx)
      idx[1] = idx[1] + 1
    del ctx.for_index[refname]

  def _cmd_define(self, args, fp, ctx, filename, line_number):
    ((name,), unused, section) = args
    valfp = StringIO.StringIO()
    if section is not None:
      self._execute(section, valfp, ctx)
    ctx.defines[name] = valfp.getvalue()

def boolean(value):
  "Return a value suitable for [if-any bool_var] usage in a template."
  if value:
    return 'yes'
  return None


def _prepare_ref(refname, for_names, file_args):
  """refname -> a string containing a dotted identifier. example:"foo.bar.bang"
  for_names -> a list of active for sequences.

  Returns a `value reference', a 3-tuple made out of (refname, start, rest),
  for fast access later.
  """
  # is the reference a string constant?
  if refname[0] == '"':
    return None, refname[1:-1], None

  parts = refname.split('.')
  start = parts[0]
  rest = parts[1:]

  # if this is an include-argument, then just return the prepared ref
  if start[:3] == 'arg':
    try:
      idx = int(start[3:])
    except ValueError:
      pass
    else:
      if idx < len(file_args):
        orig_refname, start, more_rest = file_args[idx]
        if more_rest is None:
          # the include-argument was a string constant
          return None, start, None

        # prepend the argument's "rest" for our further processing
        rest[:0] = more_rest

        # rewrite the refname to ensure that any potential 'for' processing
        # has the correct name
        ### this can make it hard for debugging include files since we lose
        ### the 'argNNN' names
        if not rest:
          return start, start, [ ]
        refname = start + '.' + '.'.join(rest)

  if for_names:
    # From last to first part, check if this reference is part of a for loop
    for i in range(len(parts), 0, -1):
      name = '.'.join(parts[:i])
      if name in for_names:
        return refname, name, parts[i:]

  return refname, start, rest

def _get_value((refname, start, rest), ctx, filename, line_number):
  """(refname, start, rest) -> a prepared `value reference' (see above).
  ctx -> an execution context instance.

  Does a name space lookup within the template name space.  Active
  for blocks take precedence over data dictionary members with the
  same name.
  """
  if rest is None:
    # it was a string constant
    return start

  # get the starting object
  if ctx.for_index.has_key(start):
    list, idx = ctx.for_index[start]
    ob = list[idx]
  elif ctx.defines.has_key(start):
    ob = ctx.defines[start]
  elif hasattr(ctx.data, start):
    ob = getattr(ctx.data, start)
  else:
    raise UnknownReference(refname, filename, line_number)

  # walk the rest of the dotted reference
  for attr in rest:
    try:
      ob = getattr(ob, attr)
    except AttributeError:
      raise UnknownReference(refname, filename, line_number)

  # make sure we return a string instead of some various Python types
  if isinstance(ob, (IntType, FloatType, LongType)):
    return str(ob)
  if ob is None:
    return ''

  # string or a sequence
  return ob

def _replace(s, replace_map):
  for orig, repl in replace_map:
    s = s.replace(orig, repl)
  return s

REPLACE_JS_MAP = (
  ('\\', r'\\'), ('\t', r'\t'), ('\n', r'\n'), ('\r', r'\r'),
  ('"', r'\x22'), ('\'', r'\x27'), ('&', r'\x26'),
  ('<', r'\x3c'), ('>', r'\x3e'), ('=', r'\x3d'),
)

# Various unicode whitespace
REPLACE_JS_UNICODE_MAP = (
  (u'\u0085', r'\u0085'), (u'\u2028', r'\u2028'), (u'\u2029', r'\u2029'),
)

# Why not cgi.escape? It doesn't do single quotes which are occasionally
# used to contain HTML attributes and event handler definitions (unfortunately)
REPLACE_HTML_MAP = (
  ('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
  ('"', '&quot;'), ('\'', '&#39;'),
)

def _js_escape(s):
  s = _replace(s, REPLACE_JS_MAP)
  ### perhaps attempt to coerce the string to unicode and then replace?
  if isinstance(s, unicode):
    s = _replace(s, REPLACE_JS_UNICODE_MAP)
  return s

def _html_escape(s):
  return _replace(s, REPLACE_HTML_MAP)

def _url_escape(s):
  ### quote_plus barfs on non-ASCII characters. According to
  ### http://www.w3.org/International/O-URL-code.html URIs should be
  ### UTF-8 encoded first.
  if isinstance(s, unicode):
    s = s.encode('utf8')
  return urllib.quote_plus(s)

FORMATTERS = {
  FORMAT_RAW: None,
  FORMAT_HTML: _html_escape,
  FORMAT_XML: _html_escape,   ### use the same quoting as HTML for now
  FORMAT_JS: _js_escape,
  FORMAT_URL: _url_escape,
}

def _parse_format(format_string=FORMAT_RAW):
  format_funcs = []
  try:
    for fspec in format_string.split(','):
      format_func = FORMATTERS[fspec]
      if format_func is not None:
        format_funcs.append(format_func)
  except KeyError:
    raise UnknownFormatConstantError(format_string)
  return format_funcs

class _context:
  """A container for the execution context"""


class Reader:
  """Abstract class which allows EZT to detect Reader objects."""
  def filename(self):
    return '(%s does not provide filename() method)' % repr(self)

class _FileReader(Reader):
  """Reads templates from the filesystem."""
  def __init__(self, fname):
    self.text = open(fname, 'rb').read()
    self._dir = os.path.dirname(fname)
    self.fname = fname
  def read_other(self, relative):
    return _FileReader(os.path.join(self._dir, relative))
  def filename(self):
    return self.fname

class _TextReader(Reader):
  """'Reads' a template from provided text."""
  def __init__(self, text):
    self.text = text
  def read_other(self, relative):
    raise BaseUnavailableError()
  def filename(self):
    return '(text)'


class EZTException(Exception):
  """Parent class of all EZT exceptions."""
  def __init__(self, message=None, filename=None, line_number=None):
    self.message = message
    self.filename = filename
    self.line_number = line_number
  def __str__(self):
    ret = []
    if self.message is not None:
      ret.append(self.message)
    if self.filename is not None:
      ret.append('in file ' + str(self.filename))
    if self.line_number is not None:
      ret.append('at line ' + str(self.line_number))
    return ' '.join(ret)

class ArgCountSyntaxError(EZTException):
  """A bracket directive got the wrong number of arguments."""

class UnknownReference(EZTException):
  """The template references an object not contained in the data dictionary."""

class NeedSequenceError(EZTException):
  """The object dereferenced by the template is no sequence (tuple or list)."""

class UnclosedBlocksError(EZTException):
  """This error may be simply a missing [end]."""

class UnmatchedEndError(EZTException):
  """This error may be caused by a misspelled if directive."""

class BaseUnavailableError(EZTException):
  """Base location is unavailable, which disables includes."""

class BadFormatConstantError(EZTException):
  """Format specifiers must be string constants."""

class UnknownFormatConstantError(EZTException):
  """The format specifier is an unknown value."""


# --- standard test environment ---
def test_parse():
  assert _re_parse.split('[a]') == ['', '[a]', None, '']
  assert _re_parse.split('[a] [b]') == \
         ['', '[a]', None, ' ', '[b]', None, '']
  assert _re_parse.split('[a c] [b]') == \
         ['', '[a c]', None, ' ', '[b]', None, '']
  assert _re_parse.split('x [a] y [b] z') == \
         ['x ', '[a]', None, ' y ', '[b]', None, ' z']
  assert _re_parse.split('[a "b" c "d"]') == \
         ['', '[a "b" c "d"]', None, '']
  assert _re_parse.split(r'["a \"b[foo]" c.d f]') == \
         ['', '["a \\"b[foo]" c.d f]', None, '']

def _test(argv):
  import doctest, ezt
  verbose = "-v" in argv
  return doctest.testmod(ezt, verbose=verbose)

if __name__ == "__main__":
  # invoke unit test for this module:
  import sys
  sys.exit(_test(sys.argv)[0])