Update the two Python maintenance scripts for Python 3.

This commit is contained in:
ph10 2014-06-03 16:26:20 +00:00
parent d94877dd75
commit ac013a4452
2 changed files with 104 additions and 95 deletions

View File

@ -1,8 +1,6 @@
#! /usr/bin/python
# Generate utt tables. Note: this script is written in Python 2 and is
# incompatible with Python 3. However, the 2to3 conversion script has been
# successfully tested on it.
# Generate utt tables. Note: this script has now been converted to Python 3.
# The source file pcre2_tables.c contains (amongst other things), a table that
# is indexed by script name. In order to reduce the number of relocations when
@ -22,6 +20,7 @@
# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
# First add the Unicode script and category names.
utt_table = zip(script_names, ['PT_SC'] * len(script_names))
utt_table += zip(category_names, ['PT_PC'] * len(category_names))
utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
# Now add our own specials.
@ -75,29 +74,29 @@ utt_table.sort()
# UTF-8 mode on EBCDIC platforms.
for utt in utt_table:
print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
for c in utt[0]:
if c == '_':
print 'STR_UNDERSCORE',
print('STR_UNDERSCORE', end=' ')
elif c == '&':
print 'STR_AMPERSAND',
print('STR_AMPERSAND', end=' ')
else:
print 'STR_%s' % c,;
print '"\\0"'
print('STR_%s' % c, end=' ');
print('"\\0"')
# Print the actual table, using the string names
print ''
print 'const char PRIV(utt_names)[] =';
print('')
print('const char PRIV(utt_names)[] =');
last = ''
for utt in utt_table:
if utt == utt_table[-1]:
last = ';'
print ' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
# This was how it was done before the EBCDIC-compatible modification.
# print ' "%s\\0"%s' % (utt[0], last)
print '\nconst ucp_type_table PRIV(utt)[] = {'
print('\nconst ucp_type_table PRIV(utt)[] = {')
offset = 0
last = ','
for utt in utt_table:
@ -108,6 +107,6 @@ for utt in utt_table:
value = 'ucp_' + utt[0]
if utt == utt_table[-1]:
last = ''
print ' { %3d, %s, %s }%s' % (offset, utt[1], value, last)
print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
offset += len(utt[0]) + 1
print '};'
print('};')

View File

@ -10,9 +10,10 @@
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
# data tables.
#
# The script should be run in the maint subdirectory, using the command
# The script has now been upgraded to Python 3 for PCRE2, and should be run in
# the maint subdirectory, using the command
#
# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
#
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
@ -42,6 +43,13 @@
# offsets into the table are added to the main output records. This new
# code scans CaseFolding.txt instead of UnicodeData.txt.
#
# Update for Python3:
# . Processed with 2to3, but that didn't fix everything
# . Changed string.strip to str.strip
# . Added encoding='utf-8' to the open() call
# . Inserted 'int' before block_size/ELEMS_PER_LINE because an int is
# required and, in Python 3, the result of the / division is a float
#
# The main tables generated by this script are used by macros defined in
# pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed.
@ -110,6 +118,7 @@
# final hole in the structure.
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
# 13-May-2014: Updated for PCRE2
# 03-June-2014: Updated for Python 3
##############################################################################
@ -133,11 +142,11 @@ def get_other_case(chardata):
# Read the whole table in memory
def read_table(file_name, get_value, default_value):
file = open(file_name, 'r')
file = open(file_name, 'r', encoding='utf-8')
table = [default_value] * MAX_UNICODE
for line in file:
line = re.sub(r'#.*', '', line)
chardata = map(string.strip, line.split(';'))
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
value = get_value(chardata)
@ -170,7 +179,7 @@ def get_type_size(table):
if minlimit <= minval and maxval <= maxlimit:
return type_size[num]
else:
raise OverflowError, "Too large to fit into C types"
raise OverflowError("Too large to fit into C types")
def get_tables_size(*tables):
total_size = 0
@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None):
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
if block_size:
s += ", block = %d" % block_size
print s + " */"
print(s + " */")
table = tuple(table)
if block_size is None:
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
mult = MAX_UNICODE / len(table)
for i in range(0, len(table), ELEMS_PER_LINE):
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
else:
if block_size > ELEMS_PER_LINE:
el = ELEMS_PER_LINE
@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None):
el = block_size
fmt = "%3d," * el + "\n"
if block_size > ELEMS_PER_LINE:
fmt = fmt * (block_size / ELEMS_PER_LINE)
fmt = fmt * int(block_size / ELEMS_PER_LINE)
for i in range(0, len(table), block_size):
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
print "};\n"
print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
print("};\n")
# Extract the unique combinations of properties into records
def combine_tables(*tables):
@ -241,7 +250,7 @@ def get_record_size_struct(records):
'types in this structure definition from pcre2_internal.h (the actual\n' + \
'field names will be different):\n\ntypedef struct {\n'
for i in range(len(records[0])):
record_slice = map(lambda record: record[i], records)
record_slice = [record[i] for record in records]
slice_type, slice_size = get_type_size(record_slice)
# add padding: round up to the next multiple of slice_size
size = (size + slice_size - 1) & -slice_size
@ -249,7 +258,7 @@ def get_record_size_struct(records):
structure += '%s property_%d;\n' % (slice_type, i)
# round up to the first item of the next structure in array
record_slice = map(lambda record: record[0], records)
record_slice = [record[0] for record in records]
slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size
@ -273,13 +282,14 @@ def test_record_size():
#print struct
def print_records(records, record_size):
print 'const ucd_record PRIV(ucd_records)[] = { ' + \
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
records = zip(records.keys(), records.values())
records.sort(None, lambda x: x[1])
print('const ucd_record PRIV(ucd_records)[] = { ' + \
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
records = list(zip(list(records.keys()), list(records.values())))
records.sort(key = lambda x: x[1])
for i, record in enumerate(records):
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
print '};\n'
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
print('};\n')
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@ -393,10 +403,10 @@ for s in sets:
table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case)
record_size, record_struct = get_record_size_struct(records.keys())
record_size, record_struct = get_record_size_struct(list(records.keys()))
# Find the optimum block size for the two-stage table
min_size = sys.maxint
min_size = sys.maxsize
for block_size in [2 ** i for i in range(5,10)]:
size = len(records) * record_size
stage1, stage2 = compress_table(table, block_size)
@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]:
min_stage1, min_stage2 = stage1, stage2
min_block_size = block_size
print "/* This module is generated by the maint/MultiStage2.py script."
print "Do not modify it by hand. Instead modify the script and run it"
print "to regenerate this code."
print
print "As well as being part of the PCRE2 library, this module is #included"
print "by the pcre2test program, which redefines the PRIV macro to change"
print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
print "with the library. At present, just one of these tables is actually"
print "needed. */"
print
print "#ifndef PCRE2_INCLUDED"
print
print "#ifdef HAVE_CONFIG_H"
print "#include \"config.h\""
print "#endif"
print
print "#include \"pcre2_internal.h\""
print
print "#endif /* PCRE2_INCLUDED */"
print
print "/* Unicode character database. */"
print "/* This file was autogenerated by the MultiStage2.py script. */"
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
print
print "/* The tables herein are needed only when UCP support is built,"
print "and in PCRE2 that happens automatically with UTF support."
print "This module should not be referenced otherwise, so"
print "it should not matter whether it is compiled or not. However"
print "a comment was received about space saving - maybe the guy linked"
print "all the modules rather than using a library - so we include a"
print "condition to cut out the tables when not needed. But don't leave"
print "a totally empty module because some compilers barf at that."
print "Instead, just supply small dummy tables. */"
print
print "#ifndef SUPPORT_UTF"
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
print "const uint8_t PRIV(ucd_stage1)[] = {0};"
print "const uint16_t PRIV(ucd_stage2)[] = {0};"
print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
print record_struct
print("/* This module is generated by the maint/MultiStage2.py script.")
print("Do not modify it by hand. Instead modify the script and run it")
print("to regenerate this code.")
print()
print("As well as being part of the PCRE2 library, this module is #included")
print("by the pcre2test program, which redefines the PRIV macro to change")
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
print("with the library. At present, just one of these tables is actually")
print("needed. */")
print()
print("#ifndef PCRE2_INCLUDED")
print()
print("#ifdef HAVE_CONFIG_H")
print("#include \"config.h\"")
print("#endif")
print()
print("#include \"pcre2_internal.h\"")
print()
print("#endif /* PCRE2_INCLUDED */")
print()
print("/* Unicode character database. */")
print("/* This file was autogenerated by the MultiStage2.py script. */")
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
print()
print("/* The tables herein are needed only when UCP support is built,")
print("and in PCRE2 that happens automatically with UTF support.")
print("This module should not be referenced otherwise, so")
print("it should not matter whether it is compiled or not. However")
print("a comment was received about space saving - maybe the guy linked")
print("all the modules rather than using a library - so we include a")
print("condition to cut out the tables when not needed. But don't leave")
print("a totally empty module because some compilers barf at that.")
print("Instead, just supply small dummy tables. */")
print()
print("#ifndef SUPPORT_UTF")
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
print("const uint8_t PRIV(ucd_stage1)[] = {0};")
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
print("#else")
print()
print(record_struct)
# --- Added by PH: output the table of caseless character sets ---
print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
print " NOTACHAR,"
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
print(" NOTACHAR,")
for s in sets:
s = sorted(s)
for x in s:
print ' 0x%04x,' % x,
print ' NOTACHAR,'
print '};'
print
print(' 0x%04x,' % x, end=' ')
print(' NOTACHAR,')
print('};')
print()
# ------
print "/* When #included in pcre2test, we don't need this large table. */"
print
print "#ifndef PCRE2_INCLUDED"
print
print("/* When #included in pcre2test, we don't need this large table. */")
print()
print("#ifndef PCRE2_INCLUDED")
print()
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
print "#endif"
print "#endif /* SUPPORT_UTF */"
print
print "#endif /* PCRE2_INCLUDED */"
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
print("#endif")
print("#endif /* SUPPORT_UTF */")
print()
print("#endif /* PCRE2_INCLUDED */")
"""