From ac013a445206a06216c2aac85787868fd79027df Mon Sep 17 00:00:00 2001 From: ph10 Date: Tue, 3 Jun 2014 16:26:20 +0000 Subject: [PATCH] Update the two Python maintenance scripts for Python 3. --- maint/GenerateUtt.py | 33 +++++---- maint/MultiStage2.py | 166 +++++++++++++++++++++++-------------------- 2 files changed, 104 insertions(+), 95 deletions(-) diff --git a/maint/GenerateUtt.py b/maint/GenerateUtt.py index c9a6a55..81ad20f 100755 --- a/maint/GenerateUtt.py +++ b/maint/GenerateUtt.py @@ -1,8 +1,6 @@ #! /usr/bin/python -# Generate utt tables. Note: this script is written in Python 2 and is -# incompatible with Python 3. However, the 2to3 conversion script has been -# successfully tested on it. +# Generate utt tables. Note: this script has now been converted to Python 3. # The source file pcre2_tables.c contains (amongst other things), a table that # is indexed by script name. In order to reduce the number of relocations when @@ -22,6 +20,7 @@ # necessary for Unicode 6.2.0 support. # Modfied by PH 26-February-2013 to add the Xuc special category. # Comment modified by PH 13-May-2014 to update to PCRE2 file names. +# Script updated to Python 3 by running it through the 2to3 converter. script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ @@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z'] # First add the Unicode script and category names. -utt_table = zip(script_names, ['PT_SC'] * len(script_names)) -utt_table += zip(category_names, ['PT_PC'] * len(category_names)) -utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names)) +utt_table = list(zip(script_names, ['PT_SC'] * len(script_names))) +utt_table += list(zip(category_names, ['PT_PC'] * len(category_names))) +utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names))) # Now add our own specials. @@ -75,29 +74,29 @@ utt_table.sort() # UTF-8 mode on EBCDIC platforms. for utt in utt_table: - print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), + print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ') for c in utt[0]: if c == '_': - print 'STR_UNDERSCORE', + print('STR_UNDERSCORE', end=' ') elif c == '&': - print 'STR_AMPERSAND', + print('STR_AMPERSAND', end=' ') else: - print 'STR_%s' % c,; - print '"\\0"' + print('STR_%s' % c, end=' '); + print('"\\0"') # Print the actual table, using the string names -print '' -print 'const char PRIV(utt_names)[] ='; +print('') +print('const char PRIV(utt_names)[] ='); last = '' for utt in utt_table: if utt == utt_table[-1]: last = ';' - print ' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last) + print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)) # This was how it was done before the EBCDIC-compatible modification. # print ' "%s\\0"%s' % (utt[0], last) -print '\nconst ucp_type_table PRIV(utt)[] = {' +print('\nconst ucp_type_table PRIV(utt)[] = {') offset = 0 last = ',' for utt in utt_table: @@ -108,6 +107,6 @@ for utt in utt_table: value = 'ucp_' + utt[0] if utt == utt_table[-1]: last = '' - print ' { %3d, %s, %s }%s' % (offset, utt[1], value, last) + print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last)) offset += len(utt[0]) + 1 -print '};' +print('};') diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py index bec081f..726fcb6 100755 --- a/maint/MultiStage2.py +++ b/maint/MultiStage2.py @@ -10,9 +10,10 @@ # generate the pcre_ucd.c file that contains a digested form of the Unicode # data tables. # -# The script should be run in the maint subdirectory, using the command +# The script has now been upgraded to Python 3 for PCRE2, and should be run in +# the maint subdirectory, using the command # -# [python2] ./MultiStage2.py >../src/pcre2_ucd.c +# [python3] ./MultiStage2.py >../src/pcre2_ucd.c # # It requires four Unicode data tables, DerivedGeneralCategory.txt, # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the @@ -42,6 +43,13 @@ # offsets into the table are added to the main output records. This new # code scans CaseFolding.txt instead of UnicodeData.txt. # +# Update for Python3: +# . Processed with 2to3, but that didn't fix everything +# . Changed string.strip to str.strip +# . Added encoding='utf-8' to the open() call +# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is +# required and the result of the division is a float +# # The main tables generated by this script are used by macros defined in # pcre2_internal.h. They look up Unicode character properties using short # sequences of code that contains no branches, which makes for greater speed. @@ -110,6 +118,7 @@ # final hole in the structure. # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 # 13-May-2014: Updated for PCRE2 +# 03-June-2014: Updated for Python 3 ############################################################################## @@ -133,11 +142,11 @@ def get_other_case(chardata): # Read the whole table in memory def read_table(file_name, get_value, default_value): - file = open(file_name, 'r') + file = open(file_name, 'r', encoding='utf-8') table = [default_value] * MAX_UNICODE for line in file: line = re.sub(r'#.*', '', line) - chardata = map(string.strip, line.split(';')) + chardata = list(map(str.strip, line.split(';'))) if len(chardata) <= 1: continue value = get_value(chardata) @@ -170,7 +179,7 @@ def get_type_size(table): if minlimit <= minval and maxval <= maxlimit: return type_size[num] else: - raise OverflowError, "Too large to fit into C types" + raise OverflowError("Too large to fit into C types") def get_tables_size(*tables): total_size = 0 @@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None): s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) if block_size: s += ", block = %d" % block_size - print s + " */" + print(s + " */") table = tuple(table) if block_size is None: fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */" mult = MAX_UNICODE / len(table) for i in range(0, len(table), ELEMS_PER_LINE): - print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)) + print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))) else: if block_size > ELEMS_PER_LINE: el = ELEMS_PER_LINE @@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None): el = block_size fmt = "%3d," * el + "\n" if block_size > ELEMS_PER_LINE: - fmt = fmt * (block_size / ELEMS_PER_LINE) + fmt = fmt * int(block_size / ELEMS_PER_LINE) for i in range(0, len(table), block_size): - print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]) - print "};\n" + print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])) + print("};\n") # Extract the unique combinations of properties into records def combine_tables(*tables): @@ -241,7 +250,7 @@ def get_record_size_struct(records): 'types in this structure definition from pcre2_internal.h (the actual\n' + \ 'field names will be different):\n\ntypedef struct {\n' for i in range(len(records[0])): - record_slice = map(lambda record: record[i], records) + record_slice = [record[i] for record in records] slice_type, slice_size = get_type_size(record_slice) # add padding: round up to the nearest power of slice_size size = (size + slice_size - 1) & -slice_size @@ -249,7 +258,7 @@ def get_record_size_struct(records): structure += '%s property_%d;\n' % (slice_type, i) # round up to the first item of the next structure in array - record_slice = map(lambda record: record[0], records) + record_slice = [record[0] for record in records] slice_type, slice_size = get_type_size(record_slice) size = (size + slice_size - 1) & -slice_size @@ -273,13 +282,14 @@ def test_record_size(): #print struct def print_records(records, record_size): - print 'const ucd_record PRIV(ucd_records)[] = { ' + \ - '/* %d bytes, record size %d */' % (len(records) * record_size, record_size) - records = zip(records.keys(), records.values()) - records.sort(None, lambda x: x[1]) + print('const ucd_record PRIV(ucd_records)[] = { ' + \ + '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)) + + records = list(zip(list(records.keys()), list(records.values()))) + records.sort(key = lambda x: x[1]) for i, record in enumerate(records): - print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)) - print '};\n' + print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))) + print('};\n') script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ @@ -393,10 +403,10 @@ for s in sets: table, records = combine_tables(script, category, break_props, caseless_offsets, other_case) -record_size, record_struct = get_record_size_struct(records.keys()) +record_size, record_struct = get_record_size_struct(list(records.keys())) # Find the optimum block size for the two-stage table -min_size = sys.maxint +min_size = sys.maxsize for block_size in [2 ** i for i in range(5,10)]: size = len(records) * record_size stage1, stage2 = compress_table(table, block_size) @@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]: min_stage1, min_stage2 = stage1, stage2 min_block_size = block_size -print "/* This module is generated by the maint/MultiStage2.py script." -print "Do not modify it by hand. Instead modify the script and run it" -print "to regenerate this code." -print -print "As well as being part of the PCRE2 library, this module is #included" -print "by the pcre2test program, which redefines the PRIV macro to change" -print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes" -print "with the library. At present, just one of these tables is actually" -print "needed. */" -print -print "#ifndef PCRE2_INCLUDED" -print -print "#ifdef HAVE_CONFIG_H" -print "#include \"config.h\"" -print "#endif" -print -print "#include \"pcre2_internal.h\"" -print -print "#endif /* PCRE2_INCLUDED */" -print -print "/* Unicode character database. */" -print "/* This file was autogenerated by the MultiStage2.py script. */" -print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size) -print -print "/* The tables herein are needed only when UCP support is built," -print "and in PCRE2 that happens automatically with UTF support." -print "This module should not be referenced otherwise, so" -print "it should not matter whether it is compiled or not. However" -print "a comment was received about space saving - maybe the guy linked" -print "all the modules rather than using a library - so we include a" -print "condition to cut out the tables when not needed. But don't leave" -print "a totally empty module because some compilers barf at that." -print "Instead, just supply small dummy tables. */" -print -print "#ifndef SUPPORT_UTF" -print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};" -print "const uint8_t PRIV(ucd_stage1)[] = {0};" -print "const uint16_t PRIV(ucd_stage2)[] = {0};" -print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};" -print "#else" -print -print record_struct +print("/* This module is generated by the maint/MultiStage2.py script.") +print("Do not modify it by hand. Instead modify the script and run it") +print("to regenerate this code.") +print() +print("As well as being part of the PCRE2 library, this module is #included") +print("by the pcre2test program, which redefines the PRIV macro to change") +print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes") +print("with the library. At present, just one of these tables is actually") +print("needed. */") +print() +print("#ifndef PCRE2_INCLUDED") +print() +print("#ifdef HAVE_CONFIG_H") +print("#include \"config.h\"") +print("#endif") +print() +print("#include \"pcre2_internal.h\"") +print() +print("#endif /* PCRE2_INCLUDED */") +print() +print("/* Unicode character database. */") +print("/* This file was autogenerated by the MultiStage2.py script. */") +print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)) +print() +print("/* The tables herein are needed only when UCP support is built,") +print("and in PCRE2 that happens automatically with UTF support.") +print("This module should not be referenced otherwise, so") +print("it should not matter whether it is compiled or not. However") +print("a comment was received about space saving - maybe the guy linked") +print("all the modules rather than using a library - so we include a") +print("condition to cut out the tables when not needed. But don't leave") +print("a totally empty module because some compilers barf at that.") +print("Instead, just supply small dummy tables. */") +print() +print("#ifndef SUPPORT_UTF") +print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};") +print("const uint8_t PRIV(ucd_stage1)[] = {0};") +print("const uint16_t PRIV(ucd_stage2)[] = {0};") +print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};") +print("#else") +print() +print(record_struct) # --- Added by PH: output the table of caseless character sets --- -print "const uint32_t PRIV(ucd_caseless_sets)[] = {" -print " NOTACHAR," +print("const uint32_t PRIV(ucd_caseless_sets)[] = {") +print(" NOTACHAR,") for s in sets: s = sorted(s) for x in s: - print ' 0x%04x,' % x, - print ' NOTACHAR,' -print '};' -print + print(' 0x%04x,' % x, end=' ') + print(' NOTACHAR,') +print('};') +print() # ------ -print "/* When #included in pcre2test, we don't need this large table. */" -print -print "#ifndef PCRE2_INCLUDED" -print +print("/* When #included in pcre2test, we don't need this large table. */") +print() +print("#ifndef PCRE2_INCLUDED") +print() print_records(records, record_size) print_table(min_stage1, 'PRIV(ucd_stage1)') print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) -print "#if UCD_BLOCK_SIZE != %d" % min_block_size -print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h" -print "#endif" -print "#endif /* SUPPORT_UTF */" -print -print "#endif /* PCRE2_INCLUDED */" +print("#if UCD_BLOCK_SIZE != %d" % min_block_size) +print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h") +print("#endif") +print("#endif /* SUPPORT_UTF */") +print() +print("#endif /* PCRE2_INCLUDED */") """