genChRanges.py

#!/usr/bin/env python3
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#
# 18 October 2003
# Modified to maintain binary compatibility with previous library versions
# by adding a suffix 'Q' ('quick') to the macro generated for the original,
# function, and adding generation of a function (with the original name) which
# instantiates the macro.
#

import sys
import time

#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges.  This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    ret = []
    pos = 0
    while pos < len(lst):
        try:            # index generates exception if not present
            s = lst[pos:].index(1)      # look for start of next range
        except:
            break                       # if no more, finished
        pos += s                        # pointer to start of possible range
        try:
            e = lst[pos:].index(0)      # look for end of range
            e += pos
        except:                         # if no end, set to end of list
            e = len(lst)
        ret.append((pos, e-1))          # append range tuple to list
        pos = e + 1                     # ready to check for next range
    return ret

sources = "chvalid.def"                 # input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

state = 0

try:
    defines = open("chvalid.def", "r")
except:
    print("Missing chvalid.def, aborting ...")
    sys.exit(1)

#
# The lines in the .def file have three types:-
#   name:   Defines a new function block
#   ur:     Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
#
# These lines are processed below.
#
for line in defines.readlines():
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = line.strip()
    if line == '':
        continue
    # split line into space-separated fields, then split on type
    try:
        fields = line.split(' ')
        #
        # name line:
        #   validate any previous function block already ended
        #   validate this function not already defined
        #   initialize an entry in the function dicitonary
        #       including a mask table with no values yet defined
        #
        if fields[0] == 'name':
            name = fields[1]
            if state != 0:
                print("'name' %s found before previous name" \
                      "completed" % (fields[1]))
                continue
            state = 1
            if name in Functs:
                print("name '%s' already present - may give" \
                      " wrong results" % (name))
            else:
                # dict entry with two list elements (chdata, rangedata)
                Functs[name] = [ [], [] ]
                for v in range(256):
                    Functs[name][0].append(0)
        #
        # end line:
        #   validate there was a preceding function name line
        #   set state to show no current function active
        #
        elif fields[0] == 'end':
            if state == 0:
                print("'end' found outside of function block")
                continue
            state = 0

        #
        # ur line:
        #   validate function has been defined
        #   process remaining fields on the line, which may be either
        #       individual unicode values or ranges of values
        #
        elif fields[0] == 'ur':
            if state != 1:
                raise Exception("'ur' found outside of 'name' block")
            for el in fields[1:]:
                pos = el.find('..')
                # pos <=0 means not a range, so must be individual value
                if pos <= 0:
                    # cheap handling of hex or decimal values
                    if el[0:2] == '0x':
                        value = int(el[2:],16)
                    elif el[0] == "'":
                        value = ord(el[1])
                    else:
                        value = int(el)
                    if ((value < 0) | (value > 0x1fffff)):
                        raise Exception('Illegal value (%s) in ch for'\
                                ' name %s' % (el,name))
                    # for ur we have only ranges (makes things simpler),
                    # so convert val to range
                    currange = (value, value)
                # pos > 0 means this is a range, so isolate/validate
                # the interval
                else:
                    # split the range into it's first-val, last-val
                    (first, last) = el.split("..")
                    # convert values from text into binary
                    if first[0:2] == '0x':
                        start = int(first[2:],16)
                    elif first[0] == "'":
                        start = ord(first[1])
                    else:
                        start = int(first)
                    if last[0:2] == '0x':
                        end = int(last[2:],16)
                    elif last[0] == "'":
                        end = ord(last[1])
                    else:
                        end = int(last)
                    if (start < 0) | (end > 0x1fffff) | (start > end):
                        raise Exception("Invalid range '%s'" % el)
                    currange = (start, end)
                # common path - 'currange' has the range, now take care of it
                # We split on single-byte values vs. multibyte
                if currange[1] < 0x100: # single-byte
                    for ch in range(currange[0],currange[1]+1):
                        # validate that value not previously defined
                        if Functs[name][0][ch]:
                            msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
                            raise Exception(msg)
                        Functs[name][0][ch] = 1
                else:                   # multi-byte
                    if currange in Functs[name][1]:
                        raise Exception("range already defined in" \
                                " function")
                    else:
                        Functs[name][1].append(currange)

    except:
        print("Failed to process line: %s" % (line))
        raise
#
# At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and'
# chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the above processing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#

#
# Generate the headings:
#
try:
    header = open("include/libxml/chvalid.h", "w")
except:
    print("Failed to open include/libxml/chvalid.h")
    sys.exit(1)

try:
    output = open("chvalid.c", "w")
except:
    print("Failed to open chvalid.c")
    sys.exit(1)

date = time.asctime(time.localtime(time.time()))

header.write(
"""/*
 * Summary: Unicode character range checking
 * Description: this module exports interfaces for the character
 *               range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * Author: William Brack <wbrack@mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#include <libxml/xmlversion.h>
#include <libxml/xmlstring.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short\tlow;
    unsigned short\thigh;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned int\tlow;
    unsigned int\thigh;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int\t\t\tnbShortRange;
    int\t\t\tnbLongRange;
    const xmlChSRange\t*shortRange;\t/* points to an array of ranges */
    const xmlChLRange\t*longRange;
};

/**
 * Range checking routine
 */
XMLPUBFUN int
\t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group);

""" % (date, sources));
output.write(
"""/*
 * chvalid.c:\tthis module implements the character range
 *\t\tvalidation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#define IN_LIBXML
#include "libxml.h"
#include <libxml/chvalid.h>

#include <stddef.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources));

#
# Now output the generated data.
# We try to produce the best execution times.  Tests have shown that validation
# with direct table lookup is, when there are a "small" number of valid items,
# still not as fast as a sequence of inline compares.  So, if the single-byte
# portion of a range has a "small" number of ranges, we output a macro for inline
# compares, otherwise we output a 256-byte table and a macro to use it.
#

fkeys = sorted(Functs.keys())

for f in fkeys:

# First we convert the specified single-byte values into a group of ranges.
# If the total number of such ranges is less than minTableSize, we generate
# an inline macro for direct comparisons; if greater, we generate a lookup
# table.
    if max(Functs[f][0]) > 0:   # only check if at least one entry
        rangeTable = makeRange(Functs[f][0])
        numRanges = len(rangeTable)
        if numRanges >= minTableSize:   # table is worthwhile
            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
            header.write("""
/**
 * %s_ch:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % f)
            header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

            # write the constant data to the code file
            output.write("const unsigned char %s_tab[256] = {\n" % f)
            pline = "   "
            for n in range(255):
                pline += " 0x%02x," % Functs[f][0][n]
                if len(pline) > 72:
                    output.write(pline + "\n")
                    pline = "   "
            output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])

        else:           # inline check is used
            # first another little optimisation - if space is present,
            # put it at the front of the list so it is checked first
            try:
                ix = rangeTable.remove((0x20, 0x20))
                rangeTable.insert(0, (0x20, 0x20))
            except:
                pass
            firstFlag = 1

            header.write("""
/**
 * %s_ch:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % f)
            # okay, I'm tired of the messy lineup - let's automate it!
            pline = "#define %s_ch(c)" % f
            # 'ntab' is number of tabs needed to position to col. 33 from name end
            ntab = 4 - (len(pline)) // 8
            if ntab < 0:
                ntab = 0
            just = ""
            for i in range(ntab):
                just += "\t"
            pline = pline + just + "("
            for rg in rangeTable:
                if not firstFlag:
                    pline += " || \\\n\t\t\t\t "
                else:
                    firstFlag = 0
                if rg[0] == rg[1]:              # single value - check equal
                    pline += "((c) == 0x%x)" % rg[0]
                else:                           # value range
                # since we are doing char, also change range ending in 0xff
                    if rg[1] != 0xff:
                        pline += "((0x%x <= (c)) &&" % rg[0]
                        pline += " ((c) <= 0x%x))" % rg[1]
                    else:
                        pline += " (0x%x <= (c))" % rg[0]
            pline += ")\n"
            header.write(pline)

    header.write("""
/**
 * %sQ:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % f)
    pline = "#define %sQ(c)" % f
    ntab = 4 - (len(pline)) // 8
    if ntab < 0:
        ntab = 0
    just = ""
    for i in range(ntab):
        just += "\t"
    header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
    if max(Functs[f][0]) > 0:
        header.write("%s_ch((c)) :" % f)
    else:
        header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
    numRanges = len(Functs[f][1])
    if numRanges == 0:
        header.write(" 0)\n\n")
    else:
        if numRanges >= minTableSize:
            header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
        else:           # if < minTableSize, generate inline code
            firstFlag = 1
            for rg in Functs[f][1]:
                if not firstFlag:
                    pline += " || \\\n\t\t\t\t "
                else:
                    firstFlag = 0
                    pline = "\\\n\t\t\t\t("
                if rg[0] == rg[1]:              # single value - check equal
                    pline += "((c) == 0x%x)" % rg[0]
                else:                           # value range
                    pline += "((0x%x <= (c)) &&" % rg[0]
                    pline += " ((c) <= 0x%x))" % rg[1]
            pline += "))\n\n"
            header.write(pline)


    if len(Functs[f][1]) > 0:
        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)


#
# Next we do the unicode ranges
#

for f in fkeys:
    if len(Functs[f][1]) > 0:   # only generate if unicode ranges present
        rangeTable = Functs[f][1]
        rangeTable.sort()       # ascending tuple sequence
        numShort = 0
        numLong  = 0
        for rg in rangeTable:
            if rg[1] < 0x10000: # if short value
                if numShort == 0:       # first occurrence
                    pline = "static const xmlChSRange %s_srng[] = {" % f
                else:
                    pline += ","
                numShort += 1
                if len(pline) > 60:
                    output.write(pline + "\n")
                    pline = "    "
                else:
                    pline += " "
                pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
            else:               # if long value
                if numLong == 0:        # first occurrence
                    if numShort > 0:    # if there were shorts, finish them off
                        output.write(pline + "};\n")
                    pline = "static const xmlChLRange %s_lrng[] = { " % f
                else:
                    pline += ", "
                numLong += 1
                if len(pline) > 60:
                    output.write(pline + "\n")
                    pline = "    "
                pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
        output.write(pline + "};\n")    # finish off last group

        pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
        if numShort > 0:
            pline += "%s_srng" % f
        else:
            pline += "(xmlChSRangePtr)0"
        if numLong > 0:
            pline += ", %s_lrng" % f
        else:
            pline += ", (xmlChLRangePtr)0"

        output.write(pline + "};\n\n")

output.write(
"""
/**
 * xmlCharInRange:
 * @val: character to be validated
 * @rptr: pointer to range to be used to validate
 *
 * Does a binary search of the range table to determine if char
 * is valid
 *
 * Returns: true if character valid, false otherwise
 */
int
xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
    int low, high, mid;
    const xmlChSRange *sptr;
    const xmlChLRange *lptr;

    if (rptr == NULL) return(0);
    if (val < 0x10000) {\t/* is val in 'short' or 'long'  array? */
\tif (rptr->nbShortRange == 0)
\t    return 0;
\tlow = 0;
\thigh = rptr->nbShortRange - 1;
\tsptr = rptr->shortRange;
\twhile (low <= high) {
\t    mid = (low + high) / 2;
\t    if ((unsigned short) val < sptr[mid].low) {
\t\thigh = mid - 1;
\t    } else {
\t\tif ((unsigned short) val > sptr[mid].high) {
\t\t    low = mid + 1;
\t\t} else {
\t\t    return 1;
\t\t}
\t    }
\t}
    } else {
\tif (rptr->nbLongRange == 0) {
\t    return 0;
\t}
\tlow = 0;
\thigh = rptr->nbLongRange - 1;
\tlptr = rptr->longRange;
\twhile (low <= high) {
\t    mid = (low + high) / 2;
\t    if (val < lptr[mid].low) {
\t\thigh = mid - 1;
\t    } else {
\t\tif (val > lptr[mid].high) {
\t\t    low = mid + 1;
\t\t} else {
\t\t    return 1;
\t\t}
\t    }
\t}
    }
    return 0;
}

""");

#
# finally, generate the ABI compatibility functions
#
for f in fkeys:
    output.write("""
/**
 * %s:
 * @ch:  character to validate
 *
 * This function is DEPRECATED.
""" % f);
    if max(Functs[f][0]) > 0:
        output.write(" * Use %s_ch or %sQ instead" % (f, f))
    else:
        output.write(" * Use %sQ instead" % f)
    output.write("""
 *
 * Returns true if argument valid, false otherwise
 */
""")
    output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
    header.write("XMLPUBFUN int\n\t\t%s(unsigned int ch);\n" % f);
#
# Run complete - write trailers and close the output files
#

header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
""")

header.close()

output.close()