added wxEncodingConverter - scripts for creating conversion tables

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@5156 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Václav Slavík 1999-12-31 00:33:41 +00:00
parent 2de89127de
commit 045c4fab85
5 changed files with 321 additions and 0 deletions

21
misc/unictabl/README Normal file
View File

@ -0,0 +1,21 @@
Files in this directory are used to generate
src/common/unictabl.inc -- wxEncodingConverter helper tables that hold
information about charset-to-unicode and unicode-to-charset conversion.
These scripts will most probably not run under Windows, even with
the Cygwin tools. You will need a Unix machine
(they have been tested only under Linux).
To add support for a new encoding, simply add the appropriate .TXT file from
ftp://ftp.unicode.org/MAPPINGS to the mappings directory. But **please**
make sure that the newly added file uses Unix line endings (LF only, not CR+LF).
Also make sure each entry consists of 4 tab-separated columns (tabs, not spaces;
the # and the comment must be separated by exactly one tab).
After adding the file, run ./regenerate.sh and commit
src/common/unictabl.inc to CVS.
Or just send the mapping file to me (v.slavik@volny.cz) and I'll
take care of it myself ;-)

80
misc/unictabl/mk_ctable.c Normal file
View File

@ -0,0 +1,80 @@
/* CVS-ID: $Id$ */
#include <stdio.h>
#include <stdlib.h>
/* One table entry: a charset byte paired with the Unicode value it maps to. */
typedef struct charsetItem
{
    unsigned char  c;   /* code of the character in the 8bit charset */
    unsigned short u;   /* corresponding Unicode value, 0 = unknown */
} charsetItem;
int cmpt(const void *i1, const void *i2)
{
unsigned short u1 = ((charsetItem*)i1) -> u;
unsigned short u2 = ((charsetItem*)i2) -> u;
return (u1 - u2);
}
/*
 * Generator for one charset-to-Unicode table of unictabl.inc.
 *
 * Reads "<charset code>\t<unicode code>" pairs from stdin (any notation
 * accepted by scanf's %i, e.g. 0x80) and prints to stdout a C array with
 * the Unicode values of charset codes 128..255.  Codes that never show up
 * in the input keep the value 0 ("unknown").
 *
 *   argv[1]  mapping file name; only quoted in the generated comment
 *   argv[2]  encoding name; used in the generated comment, in the name of
 *            the generated array and in diagnostics
 *
 * Diagnostics go to stderr.  Returns 0 on success and 2 when the two
 * arguments are missing.
 */
int main(int argc, char *argv[])
{
    int enc, unic;
    unsigned i;
    charsetItem table[256];

    if (argc < 3)
    {
        fprintf(stderr, "usage: %s <mapping file> <encoding name>\n",
                (argc > 0) ? argv[0] : "mk_ctable");
        return 2;
    }

    for (i = 0; i < 256; i++)
    {
        table[i].c = (unsigned char)i;
        table[i].u = 0; /* unknown */
    }

    /* Stop at end of input or at the first line that does not hold two
       numbers; testing feof() instead would use stale or uninitialised
       values after a failed conversion. */
    while (scanf("%i\t%i\n", &enc, &unic) == 2)
    {
        /* table[] has 256 entries of 16 bits each; anything else cannot
           be stored and must not be used as an index. */
        if (enc < 0 || enc > 255 || unic < 0 || unic > 0xFFFF)
        {
            fprintf(stderr, "ignoring out-of-range mapping (%s): %i->%i\n",
                    argv[2], enc, unic);
            continue;
        }

        table[enc].u = (unsigned short)unic;
        table[enc].c = (unsigned char)enc;
        if (enc < 128 && enc != unic)
            fprintf(stderr, "7bit ASCII incompatibilit (%s): %i->%i\n",
                    argv[2], enc, unic);
    }

    /* dump it: */
    printf("\n\n"
           "/* \n"
           " * %s to Unicode recoding table \n"
           " * based on file %s by Unicode Consortium\n"
           " */\n\n"
           "static wxUint16 encoding_table__%s[128] = {",
           argv[2], argv[1], argv[2]);

    for (i = 128; i < 256; i++)
    {
        if (i % 8 == 0)
            printf("\n ");
        printf("0x%04X%c ", (unsigned)table[i].u, (i == 255) ? '\n' : ',');
    }
    printf("};\n");

    /* Upper half sorted by Unicode value; only the disabled reverse table
       below would read the sorted order, the output above is not affected. */
    qsort(table + 128, 128, sizeof(table[0]), cmpt);

    /*
       NO, WE DON'T NEED REVERSE TABLE, WE CAN BUILD IT AT RUNTIME
       (won't take that much time, after all you don't init
       conversion so often...)

    printf("\n"
           "static wxUint16 encoding_table_rev__%s[128] = {",
           argv[2]);

    for (i = 128; i < 256; i++)
    {
        if (i % 4 == 0)
            printf("\n ");
        printf("{c:0x%02X,u:0x%04X}%c ", table[i].c, table[i].u, (i == 255) ? '\n' : ',');
    }
    printf("};\n");
    */

    return 0;
}

122
misc/unictabl/mk_encodings.sh Executable file
View File

@ -0,0 +1,122 @@
#!/bin/sh
# CVS-ID: $Id$
echo " * compiling C source generator..."
# Give up right away when the generator cannot be built: the table loop
# further down pipes every mapping file through ./mk_ctable, so carrying
# on would only produce an include file without any encoding table.
cc mk_ctable.c -o mk_ctable || {
    echo " ! cannot compile mk_ctable.c, giving up" >&2
    exit 1
}
echo " * writing copyright info..."
# Start unictabl.inc from scratch (">" truncates an older copy) with the
# "generated file" banner and the Unicode, Inc. notice.  The here-document
# is unquoted, so the backslash-escaped dollar signs are written as plain
# dollar signs.
cat > unictabl.inc <<EOF

/*
* This file is #included by encconv.cpp
*
* CVS-ID: \$Id\$
*
* *** *** CAUTION! *** ***
* Do not modify this file by hand! It is generated by shell
* script \$(WXWIN)/misc/unictabl/regenerate
*
* Parts of this file are based on tables published by Unicode, Inc.
* Original tables are freely available at
* ftp://ftp.unicode.org/MAPPINGS
*
* Original copyright info as present in mapping tables follows:
*
*
* Copyright (c) 1991-1999 Unicode, Inc. All Rights reserved.
*
* This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
* No claims are made as to fitness for any particular purpose. No
* warranties of any kind are expressed or implied. The recipient
* agrees to determine applicability of information provided. If this
* file has been provided on optical media by Unicode, Inc., the sole
* remedy for any claim will be exchange of defective media within 90
* days of receipt.
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form for
* internal or external distribution as long as this notice remains
* attached.
*/

EOF
echo " * creating C tables..."

# Names of all encodings a table was written for; consumed later when the
# list of supported encodings is emitted.
all_encodings=""

for mapping in mappings/*.TXT ; do
    # Encoding name = file name without the 9-character "mappings/" prefix
    # and without ".TXT", dashes turned into underscores, "8859_x" prefixed
    # with "ISO" and "KOI8_R" shortened to "KOI8".
    enc=`echo $mapping | cut -c10- | tr - _ | \
         sed -e 's/\.TXT//g' -e 's/8859_\(.*\)/ISO8859_\1/g' -e 's/KOI8_R/KOI8/g'`

    # Keep only the data lines and hand their first two columns to the
    # generator, which appends one C table to the include file.
    sed -n '/^0x...0x..../p' $mapping | cut -f1,2 | \
        ./mk_ctable $mapping $enc >> unictabl.inc

    all_encodings="$all_encodings $enc"
done

# The generator binary is only needed for the loop above.
rm -f mk_ctable
echo " * adding fallback..."
# Opening part of the fallback array definition.
cat >> unictabl.inc <<'EOF'

/*
*
* Unicode to 7bit ASCII fallback
* (for use with wxRECODE_SUBSTITUTE recoding mode)
*
*/
static struct {
wxUint16 c /*code*/;
wxUint8 s /*inaccurate substitution*/;
} encoding_unicode_fallback[] = {

EOF

# One initializer per line of the Fallbacks file: column 1 is copied as is,
# column 2 is reduced to its characters 1-2 and 5-6.
cat Fallbacks | while read entry ; do
    unicode=`echo "$entry" | cut -f1`
    ascii=`echo "$entry" | cut -f2 | cut -c1-2,5-6`
    echo " {$unicode, $ascii}," >> unictabl.inc
done

# Terminating entry and end of the array.
echo " {0, 0}" >> unictabl.inc
echo " };" >> unictabl.inc

# Number of lines in Fallbacks, i.e. of entries written by the loop above.
echo "
static unsigned encoding_unicode_fallback_count = "`cat Fallbacks | wc -l`";" >> unictabl.inc
echo " * adding reference table..."
# Opening part of the list that ties each encoding identifier to its table.
cat >> unictabl.inc <<'EOF'

/*
*
* Table of all supported encodings:
*
*/
static struct {
wxFontEncoding encoding; // encoding identifier
wxUint16 *table; // 8bit to unicode table
} encodings_list[] = {

EOF

# One entry per encoding collected in $all_encodings.
for name in $all_encodings ; do
    echo " { wxFONTENCODING_$name, encoding_table__$name}," >> unictabl.inc
done

# Terminating entry (NULL table) and end of the list.
echo " {wxFONTENCODING_MAX /*anything*/, NULL}" >> unictabl.inc
echo " };" >> unictabl.inc

74
misc/unictabl/mk_fallbacks.sh Executable file
View File

@ -0,0 +1,74 @@
#!/bin/sh
# CVS-ID: $Id$
# add_fallback PATTERN CODE LABEL
#
# For every line of _tmp3 that matches the grep pattern PATTERN, appends
# the first 6 characters of that line followed by CODE to _tmp5.
# LABEL is only used in the progress message.
add_fallback()
{
    echo " - for $3..."
    grep "$1" _tmp3 | while read entry ; do
        unicode=`echo $entry | cut -c1-6`
        echo "$unicode $2" >> _tmp5
    done
}
echo " * getting list of needed unicode characters..."
# _tmp1: distinct (column 2, column 4) pairs of all data lines of all
# mapping files, restricted to pairs starting with "0x".
cat mappings/*.TXT | sed -n '/^0x../p' | cut -f2,4 | \
    sort | uniq | sed -n '/^0x/p' > _tmp1
# _tmp2: the distinct values of the first column of _tmp1.
cut -f1 _tmp1 | sort | uniq > _tmp2

echo " * making unique list of unicode characters meanings..."
rm -f _tmp3
# _tmp3: for each value in _tmp2, only the first line of _tmp1 starting
# with it.
while read unicode ; do
    sed -n "/^$unicode/p" _tmp1 | (read first ; echo "$first" >> _tmp3)
done < _tmp2
cp _tmp3 UnicodeChars
echo " * creating one-byte fallback tables..."
rm -f Fallbacks _tmp5

# Both passes below work the same way: collect the plain letters and the
# "<letter> WITH ..." variants from _tmp3, sort them on the text starting at
# the third field, then pair every entry with the first entry of the sorted
# list whose line contains the same "LATIN ... LETTER x" text.
# NOTE(review): presumably the sort is what puts the plain letter ahead of
# its variants so that it becomes the fallback -- confirm.
#
# "sort -k 3" replaces the obsolete "sort +2" spelling of the same key, which
# current sort implementations take for a file name; "grep -F" replaces the
# obsolescent fgrep.

echo " - for latin capital letters..."
cat _tmp3 | grep 'LATIN CAPITAL LETTER [A-Z]$' > _tmp6
cat _tmp3 | grep 'LATIN CAPITAL LETTER [A-Z] WITH' >> _tmp6
cat _tmp6 | sort -k 3 > _tmp4
cat _tmp4 | while read i ; do
    # characters 1-6: the code; characters 8-29: "LATIN CAPITAL LETTER x"
    code=`echo $i | cut -c1-6`
    fallb=`echo $i | cut -c8-29`
    cat _tmp4 | grep -F "$fallb" | cut -c1-6 | (read i ;
        echo "$code $i" >> _tmp5)
done

echo " - for latin small letters..."
cat _tmp3 | grep 'LATIN SMALL LETTER [A-Z]$' > _tmp6
cat _tmp3 | grep 'LATIN SMALL LETTER [A-Z] WITH' >> _tmp6
cat _tmp6 | sort -k 3 > _tmp4
cat _tmp4 | while read i ; do
    # characters 1-6: the code; characters 8-27: "LATIN SMALL LETTER x"
    code=`echo $i | cut -c1-6`
    fallb=`echo $i | cut -c8-27`
    cat _tmp4 | grep -F "$fallb" | cut -c1-6 | (read i ;
        echo "$code $i" >> _tmp5)
done
# Quotation marks and dashes all fall back to one fixed ASCII character.
add_fallback "DOUBLE .*QUOTATION MARK" "0x0022" "double quotations"
add_fallback "SINGLE .*QUOTATION MARK" "0x0027" "single quotations"
add_fallback "DASH" "0x002D" "dashes"

echo " * removing infinite loops from fallback tables..."
# Drop the entries whose fallback is the character itself and write the
# remaining ones, sorted, to the final Fallbacks file.
grep -v '\(0x....\) \1' _tmp5 | sort > Fallbacks

# Intermediate files are no longer needed.
rm -f _tmp1 _tmp2 _tmp3 _tmp4 _tmp5 _tmp6

24
misc/unictabl/regenerate.sh Executable file
View File

@ -0,0 +1,24 @@
#!/bin/sh
# CVS-ID: $Id$
echo ""
echo "-----------------------------------"
echo " Refreshing tables, please wait..."
echo "-----------------------------------"
echo ""

# Both generators are looked up relative to the current directory.  Stop as
# soon as one of them cannot be run or reports a failure, so that nothing is
# removed and the table in src/common is not replaced after a broken run.
sh ./mk_fallbacks.sh || {
    echo " ! mk_fallbacks.sh failed, src/common/unictabl.inc left untouched" >&2
    exit 1
}
sh ./mk_encodings.sh || {
    echo " ! mk_encodings.sh failed, src/common/unictabl.inc left untouched" >&2
    exit 1
}

echo " * removing temporary files..."
rm -f Fallbacks UnicodeChars

echo "
* copying tables to src/common/unictabl.inc..."
mv -f unictabl.inc ../../src/common/unictabl.inc || exit 1

echo "
DONE
"