From 045c4fab85bd7bcc59010d663e23ea70dda3c7bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Slav=C3=ADk?= Date: Fri, 31 Dec 1999 00:33:41 +0000 Subject: [PATCH] added wxEncodingConverter - scripts for creating convertion tables git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@5156 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- misc/unictabl/README | 21 ++++++ misc/unictabl/mk_ctable.c | 80 ++++++++++++++++++++++ misc/unictabl/mk_encodings.sh | 122 ++++++++++++++++++++++++++++++++++ misc/unictabl/mk_fallbacks.sh | 74 +++++++++++++++++++++ misc/unictabl/regenerate.sh | 24 +++++++ 5 files changed, 321 insertions(+) create mode 100644 misc/unictabl/README create mode 100644 misc/unictabl/mk_ctable.c create mode 100755 misc/unictabl/mk_encodings.sh create mode 100755 misc/unictabl/mk_fallbacks.sh create mode 100755 misc/unictabl/regenerate.sh diff --git a/misc/unictabl/README b/misc/unictabl/README new file mode 100644 index 0000000000..9b3d57cc75 --- /dev/null +++ b/misc/unictabl/README @@ -0,0 +1,21 @@ + + Files in this directory are used to generate + src/common/unictabl.inc -- wxEncodingConverter helper tables that hold + information about charset-to-unicode and unicode-to-charset conversion + + These scripts will most probably not run under Windows, even with + Cygwin tools. You'll need a Unix machine + (tested only under Linux, so...) + + To add support for a new encoding, simply add the proper .TXT file from + ftp://ftp.unicode.org/MAPPINGS to the mappings directory. But **please** + make sure that the newly added file has only LF, not CR+LF, at the end of each line. + Also make sure it consists of 4 tab-separated columns (not spaces; + # and comment must be separated by one tab) + + After adding the file, run ./regenerate.sh and commit + src/common/unictabl.inc to CVS. 
+ + Or just send the mapping file to me (v.slavik@volny.cz) and I'll + take care of it myself ;-) + diff --git a/misc/unictabl/mk_ctable.c b/misc/unictabl/mk_ctable.c new file mode 100644 index 0000000000..65c96f87bb --- /dev/null +++ b/misc/unictabl/mk_ctable.c @@ -0,0 +1,80 @@ + +/* CVS-ID: $Id$ */ + +#include + +typedef struct { + unsigned char c; + unsigned short u; + } charsetItem; + + + +int cmpt(const void *i1, const void *i2) +{ + unsigned short u1 = ((charsetItem*)i1) -> u; + unsigned short u2 = ((charsetItem*)i2) -> u; + return (u1 - u2); +} + + + +int main(int argc, char *argv[]) +{ + unsigned enc, unic; + unsigned i; + charsetItem table[256]; + + for (i = 0; i < 256; i++) { table[i].c = i, table[i].u = 0; /* unknown */} + + while (!feof(stdin)) + { + scanf("%i\t%i\n", &enc, &unic); + table[enc].u = unic; + table[enc].c = enc; + if (enc < 128 && enc != unic) + fprintf(stderr, "7bit ASCII incompatibilit (%s): %i->%i\n", + argv[2], enc, unic); + } + + /* dump it: */ + + printf("\n\n" + "/* \n" + " * %s to Unicode recoding table \n" + " * based on file %s by Unicode Consortium\n" + " */\n\n" + "static wxUint16 encoding_table__%s[128] = {", + argv[2], argv[1], argv[2]); + + for (i = 128; i < 256; i++) + { + if (i % 8 == 0) + printf("\n "); + printf("0x%04X%c ", table[i].u, (i == 255) ? '\n' : ','); + } + printf("};\n"); + + qsort(table + 128, 128, sizeof(table[0]), cmpt); + + +/* + NO, WE DON'T NEED REVERSE TABLE, WE CAN BUILD IT AT RUNTIME + (won't take that much time, after all you don't init + conversion so often...) + + printf("\n" + "static wxUint16 encoding_table_rev__%s[128] = {", + argv[2]); + + for (i = 128; i < 256; i++) + { + if (i % 4 == 0) + printf("\n "); + printf("{c:0x%02X,u:0x%04X}%c ", table[i].c, table[i].u, (i == 255) ? 
'\n' : ','); + } + printf("};\n"); +*/ + + return 1; +} diff --git a/misc/unictabl/mk_encodings.sh b/misc/unictabl/mk_encodings.sh new file mode 100755 index 0000000000..f97abc6687 --- /dev/null +++ b/misc/unictabl/mk_encodings.sh @@ -0,0 +1,122 @@ +#!/bin/sh +# CVS-ID: $Id$ + + +echo " * compiling C source generator..." + +cc mk_ctable.c -o mk_ctable + +echo " * writing copyright info..." + +echo " + +/* + * This file is #included by encconv.cpp + * + * CVS-ID: \$Id\$ + * + * *** *** CAUTION! *** *** + * Do not modify this file by hand! It is generated by shell + * script \$(WXWIN)/misc/unictabl/regenerate + * + * Parts of this file are based on tables published by Unicode, Inc. + * Original tables are freely available at + * ftp://ftp.unicode.org/MAPPINGS + * + * Original copyright info as present in mapping tables follows: + * + * + * Copyright (c) 1991-1999 Unicode, Inc. All Rights reserved. + * + * This file is provided as-is by Unicode, Inc. (The Unicode Consortium). + * No claims are made as to fitness for any particular purpose. No + * warranties of any kind are expressed or implied. The recipient + * agrees to determine applicability of information provided. If this + * file has been provided on optical media by Unicode, Inc., the sole + * remedy for any claim will be exchange of defective media within 90 + * days of receipt. + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form for + * internal or external distribution as long as this notice remains + * attached. + */ + +" > unictabl.inc + + +echo " * creating C tables..." 
+
+all_encodings=""
+
+# Feed every mapping file through mk_ctable, appending the generated table
+# to unictabl.inc and remembering the encoding name for the reference table
+# written at the end of this script.
+for i in mappings/*.TXT ; do
+
+ # Name derived from the file name: drop the first 9 characters (the
+ # "mappings/" prefix of the glob), turn '-' into '_', strip ".TXT",
+ # then rename 8859_* to ISO8859_* and KOI8_R to KOI8.
+ enc=`echo $i | cut -c10- | tr - _ | sed 's/\.TXT//g' |\
+sed 's/8859_\(.*\)/ISO8859_\1/g
+s/KOI8_R/KOI8/g'`
+ # Only lines matching the sed address are passed on, reduced to their
+ # first two TAB-separated fields.
+ # NOTE(review): whitespace in this copy looks normalized -- confirm the
+ # exact pattern against the pristine script.
+ cat $i | sed -n '/^0x...0x..../p' | cut -f1,2 | \
+ ./mk_ctable $i $enc >> unictabl.inc
+ all_encodings="$all_encodings $enc"
+
+done
+
+# The generator binary built at the top of this script is no longer needed.
+rm -f mk_ctable
+
+
+echo " * adding fallback..."
+
+echo "
+
+
+
+
+/*
+ *
+ * Unicode to 7bit ASCII fallback
+ * (for use with wxRECODE_SUBSTITUTE recoding mode)
+ *
+ */
+
+static struct {
+ wxUint16 c /*code*/;
+ wxUint8 s /*inaccurate substitution*/;
+} encoding_unicode_fallback[] = {
+" >> unictabl.inc
+
+# One initializer per line of Fallbacks: field 1 is emitted as is, field 2
+# is cut down to its characters 1-2 and 5-6 (e.g. 0x0041 -> 0x41).
+# NOTE(review): cut -f splits on TAB, so Fallbacks lines presumably hold
+# "code<TAB>substitute" -- confirm against the script that writes them.
+cat Fallbacks | while read i ; do
+ code=`echo "$i" | cut -f1`
+ subs=`echo "$i" | cut -f2 | cut -c1-2,5-6`
+ echo " {$code, $subs}," >> unictabl.inc
+done
+# Terminating {0, 0} entry; the count below is the number of lines in
+# Fallbacks and therefore does not include it.
+echo " {0, 0}" >> unictabl.inc
+echo " };" >> unictabl.inc
+echo "
+static unsigned encoding_unicode_fallback_count = "`cat Fallbacks | wc -l`";" >> unictabl.inc
+
+
+echo " * adding reference table..."
+
+
+echo "
+
+
+
+/*
+ *
+ * Table of all supported encodings:
+ *
+ */
+
+static struct {
+ wxFontEncoding encoding; // encoding identifier
+ wxUint16 *table; // 8bit to unicode table
+} encodings_list[] = {
+" >> unictabl.inc
+
+# One row per encoding name collected in the mapping loop above.
+for i in $all_encodings ; do
+ echo " { wxFONTENCODING_$i, encoding_table__$i}," >> unictabl.inc
+done
+
+# Terminating row: its table pointer is NULL.
+echo " {wxFONTENCODING_MAX /*anything*/, NULL}" >> unictabl.inc
+echo " };" >> unictabl.inc
diff --git a/misc/unictabl/mk_fallbacks.sh b/misc/unictabl/mk_fallbacks.sh
new file mode 100755
index 0000000000..3543cccac5
--- /dev/null
+++ b/misc/unictabl/mk_fallbacks.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# CVS-ID: $Id$
+
+
+# add_fallback PATTERN CODE LABEL
+# For every line of _tmp3 that matches the grep PATTERN, appends the first
+# 6 characters of that line followed by CODE to _tmp5. LABEL is only used
+# in the progress message.
+# NOTE(review): the separator written between the two codes appears as a
+# single space in this copy; mk_encodings.sh reads the result with cut -f,
+# which splits on TAB -- confirm against the pristine script.
+add_fallback()
+{
+ echo " - for $3..."
+ cat _tmp3 | grep "$1" | while read i ; do
+ code=`echo $i | cut -c1-6`
+ echo "$code $2" >> _tmp5
+ done
+}
+
+
+echo " * getting list of needed unicode characters..."
+ +cat mappings/*.TXT | sed -n '/^0x../p' | \ + cut -f2,4 | sort | uniq | sed -n '/^0x/p' > _tmp1 +cat _tmp1 | cut -f1 | sort | uniq > _tmp2 + + +echo " * making unique list of unicode characters meanings..." + +rm -f _tmp3 +cat _tmp2 | while read i ; do + sed -n "/^$i/p" _tmp1 | (read t ; echo "$t" >> _tmp3) +done + +cp _tmp3 UnicodeChars + +echo " * creating one-byte fallback tables..." + +rm -f Fallbacks _tmp5 + +echo " - for latin capital letters..." + +cat _tmp3 | grep 'LATIN CAPITAL LETTER [A-Z]$' > _tmp6 +cat _tmp3 | grep 'LATIN CAPITAL LETTER [A-Z] WITH' >> _tmp6 +cat _tmp6 | sort +2 > _tmp4 + +cat _tmp4 | while read i ; do + code=`echo $i | cut -c1-6` + fallb=`echo $i | cut -c8-29` + cat _tmp4 | fgrep "$fallb" | cut -c1-6 | (read i ; + echo "$code $i" >> _tmp5) +done + + +echo " - for latin small letters..." + +cat _tmp3 | grep 'LATIN SMALL LETTER [A-Z]$' > _tmp6 +cat _tmp3 | grep 'LATIN SMALL LETTER [A-Z] WITH' >> _tmp6 +cat _tmp6 | sort +2 > _tmp4 + +cat _tmp4 | while read i ; do + code=`echo $i | cut -c1-6` + fallb=`echo $i | cut -c8-27` + cat _tmp4 | fgrep "$fallb" | cut -c1-6 | (read i ; + echo "$code $i" >> _tmp5) +done + + +add_fallback "DOUBLE .*QUOTATION MARK" "0x0022" "double quotations" +add_fallback "SINGLE .*QUOTATION MARK" "0x0027" "single quotations" +add_fallback "DASH" "0x002D" "dashes" + + + +echo " * removing infinite loops from fallback tables..." + +cat _tmp5 | grep -v '\(0x....\) \1' | sort > Fallbacks + +rm -f _tmp1 _tmp2 _tmp3 _tmp4 _tmp5 _tmp6 + diff --git a/misc/unictabl/regenerate.sh b/misc/unictabl/regenerate.sh new file mode 100755 index 0000000000..8e69d10304 --- /dev/null +++ b/misc/unictabl/regenerate.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# CVS-ID: $Id$ + +echo "" +echo "-----------------------------------" +echo " Refreshing tables, please wait..." +echo "-----------------------------------" +echo "" + +sh ./mk_fallbacks.sh +sh ./mk_encodings.sh + +echo " * removing temporary files..." 
+ +rm -f Fallbacks UnicodeChars + +echo " + * copying tables to src/common/unictabl.inc..." + +mv -f unictabl.inc ../../src/common/unictabl.inc + +echo " + DONE +"