added wxEncodingConverter - scripts for creating conversion tables
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@5156 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
parent
2de89127de
commit
045c4fab85
21
misc/unictabl/README
Normal file
21
misc/unictabl/README
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
|
||||||
|
Files in this directory are used to generate
|
||||||
|
src/common/unictabl.inc -- wxEncodingConverter helper tables that hold
|
||||||
|
information about charset-to-unicode and unicode-to-charset conversion
|
||||||
|
|
||||||
|
These scripts will most probably not run under Windows, even with
|
||||||
|
Cygwin tools. You'll need a Unix machine
|
||||||
|
(tested only under Linux, so...)
|
||||||
|
|
||||||
|
To add support for a new encoding, simply add the proper .TXT file from
|
||||||
|
ftp://ftp.unicode.org/MAPPINGS to mappings directory. But **please**
|
||||||
|
make sure that the newly added file has only LF, not CR+LF, at the end of each line.
|
||||||
|
Also make sure it consists of 4 tab-separated columns (not spaces;
|
||||||
|
# and comment must be separated by one tab)
|
||||||
|
|
||||||
|
After adding the file, run ./regenerate.sh and commit
|
||||||
|
src/common/unictabl.inc to CVS.
|
||||||
|
|
||||||
|
Or just send the mapping file to me (v.slavik@volny.cz) and I'll
|
||||||
|
take care of it myself ;-)
|
||||||
|
|
80
misc/unictabl/mk_ctable.c
Normal file
80
misc/unictabl/mk_ctable.c
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
|
||||||
|
/* CVS-ID: $Id$ */
|
||||||
|
|
||||||
|
#include <stdio.h>
#include <stdlib.h>
|
||||||
|
|
||||||
|
/* One entry of the conversion table: an 8-bit charset code and the
   16-bit Unicode value it maps to. */
typedef struct {
    unsigned char  c;
    unsigned short u;
} charsetItem;


/* qsort()-style comparator: orders charsetItem entries by ascending
   Unicode value.  Returns <0, 0 or >0 (the difference of the two
   values; both operands are promoted to int before subtracting). */
int cmpt(const void *i1, const void *i2)
{
    const charsetItem *lhs = i1;
    const charsetItem *rhs = i2;

    return (int)lhs->u - (int)rhs->u;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
unsigned enc, unic;
|
||||||
|
unsigned i;
|
||||||
|
charsetItem table[256];
|
||||||
|
|
||||||
|
for (i = 0; i < 256; i++) { table[i].c = i, table[i].u = 0; /* unknown */}
|
||||||
|
|
||||||
|
while (!feof(stdin))
|
||||||
|
{
|
||||||
|
scanf("%i\t%i\n", &enc, &unic);
|
||||||
|
table[enc].u = unic;
|
||||||
|
table[enc].c = enc;
|
||||||
|
if (enc < 128 && enc != unic)
|
||||||
|
fprintf(stderr, "7bit ASCII incompatibilit (%s): %i->%i\n",
|
||||||
|
argv[2], enc, unic);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* dump it: */
|
||||||
|
|
||||||
|
printf("\n\n"
|
||||||
|
"/* \n"
|
||||||
|
" * %s to Unicode recoding table \n"
|
||||||
|
" * based on file %s by Unicode Consortium\n"
|
||||||
|
" */\n\n"
|
||||||
|
"static wxUint16 encoding_table__%s[128] = {",
|
||||||
|
argv[2], argv[1], argv[2]);
|
||||||
|
|
||||||
|
for (i = 128; i < 256; i++)
|
||||||
|
{
|
||||||
|
if (i % 8 == 0)
|
||||||
|
printf("\n ");
|
||||||
|
printf("0x%04X%c ", table[i].u, (i == 255) ? '\n' : ',');
|
||||||
|
}
|
||||||
|
printf("};\n");
|
||||||
|
|
||||||
|
qsort(table + 128, 128, sizeof(table[0]), cmpt);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
NO, WE DON'T NEED REVERSE TABLE, WE CAN BUILD IT AT RUNTIME
|
||||||
|
(won't take that much time, after all you don't init
|
||||||
|
conversion so often...)
|
||||||
|
|
||||||
|
printf("\n"
|
||||||
|
"static wxUint16 encoding_table_rev__%s[128] = {",
|
||||||
|
argv[2]);
|
||||||
|
|
||||||
|
for (i = 128; i < 256; i++)
|
||||||
|
{
|
||||||
|
if (i % 4 == 0)
|
||||||
|
printf("\n ");
|
||||||
|
printf("{c:0x%02X,u:0x%04X}%c ", table[i].c, table[i].u, (i == 255) ? '\n' : ',');
|
||||||
|
}
|
||||||
|
printf("};\n");
|
||||||
|
*/
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
122
misc/unictabl/mk_encodings.sh
Executable file
122
misc/unictabl/mk_encodings.sh
Executable file
@ -0,0 +1,122 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# CVS-ID: $Id$
|
||||||
|
|
||||||
|
|
||||||
|
echo " * compiling C source generator..."
|
||||||
|
|
||||||
|
# Build the table generator.  Stop here if the compiler fails: without
# ./mk_ctable the loop below would only produce a truncated unictabl.inc.
cc mk_ctable.c -o mk_ctable || {
    echo "mk_encodings.sh: cannot compile mk_ctable.c" >&2
    exit 1
}
|
||||||
|
|
||||||
|
echo " * writing copyright info..."
|
||||||
|
|
||||||
|
echo "
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is #included by encconv.cpp
|
||||||
|
*
|
||||||
|
* CVS-ID: \$Id\$
|
||||||
|
*
|
||||||
|
* *** *** CAUTION! *** ***
|
||||||
|
* Do not modify this file by hand! It is generated by shell
|
||||||
|
* script \$(WXWIN)/misc/unictabl/regenerate
|
||||||
|
*
|
||||||
|
* Parts of this file are based on tables published by Unicode, Inc.
|
||||||
|
* Original tables are freely available at
|
||||||
|
* ftp://ftp.unicode.org/MAPPINGS
|
||||||
|
*
|
||||||
|
* Original copyright info as present in mapping tables follows:
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Copyright (c) 1991-1999 Unicode, Inc. All Rights reserved.
|
||||||
|
*
|
||||||
|
* This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
|
||||||
|
* No claims are made as to fitness for any particular purpose. No
|
||||||
|
* warranties of any kind are expressed or implied. The recipient
|
||||||
|
* agrees to determine applicability of information provided. If this
|
||||||
|
* file has been provided on optical media by Unicode, Inc., the sole
|
||||||
|
* remedy for any claim will be exchange of defective media within 90
|
||||||
|
* days of receipt.
|
||||||
|
*
|
||||||
|
* Unicode, Inc. hereby grants the right to freely use the information
|
||||||
|
* supplied in this file in the creation of products supporting the
|
||||||
|
* Unicode Standard, and to make copies of this file in any form for
|
||||||
|
* internal or external distribution as long as this notice remains
|
||||||
|
* attached.
|
||||||
|
*/
|
||||||
|
|
||||||
|
" > unictabl.inc
|
||||||
|
|
||||||
|
|
||||||
|
echo " * creating C tables..."
|
||||||
|
|
||||||
|
all_encodings=""
|
||||||
|
|
||||||
|
for i in mappings/*.TXT ; do
|
||||||
|
|
||||||
|
enc=`echo $i | cut -c10- | tr - _ | sed 's/\.TXT//g' |\
|
||||||
|
sed 's/8859_\(.*\)/ISO8859_\1/g
|
||||||
|
s/KOI8_R/KOI8/g'`
|
||||||
|
cat $i | sed -n '/^0x...0x..../p' | cut -f1,2 | \
|
||||||
|
./mk_ctable $i $enc >> unictabl.inc
|
||||||
|
all_encodings="$all_encodings $enc"
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
rm -f mk_ctable
|
||||||
|
|
||||||
|
|
||||||
|
echo " * adding fallback..."
|
||||||
|
|
||||||
|
echo "
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Unicode to 7bit ASCII fallback
|
||||||
|
* (for use with wxRECODE_SUBSTITUTE recoding mode)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
static struct {
|
||||||
|
wxUint16 c /*code*/;
|
||||||
|
wxUint8 s /*inaccurate substitution*/;
|
||||||
|
} encoding_unicode_fallback[] = {
|
||||||
|
" >> unictabl.inc
|
||||||
|
|
||||||
|
cat Fallbacks | while read i ; do
|
||||||
|
code=`echo "$i" | cut -f1`
|
||||||
|
subs=`echo "$i" | cut -f2 | cut -c1-2,5-6`
|
||||||
|
echo " {$code, $subs}," >> unictabl.inc
|
||||||
|
done
|
||||||
|
echo " {0, 0}" >> unictabl.inc
|
||||||
|
echo " };" >> unictabl.inc
|
||||||
|
echo "
|
||||||
|
static unsigned encoding_unicode_fallback_count = "`cat Fallbacks | wc -l`";" >> unictabl.inc
|
||||||
|
|
||||||
|
|
||||||
|
echo " * adding reference table..."
|
||||||
|
|
||||||
|
|
||||||
|
echo "
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
*
|
||||||
|
* Table of all supported encodings:
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
static struct {
|
||||||
|
wxFontEncoding encoding; // encoding identifier
|
||||||
|
wxUint16 *table; // 8bit to unicode table
|
||||||
|
} encodings_list[] = {
|
||||||
|
" >> unictabl.inc
|
||||||
|
|
||||||
|
for i in $all_encodings ; do
|
||||||
|
echo " { wxFONTENCODING_$i, encoding_table__$i}," >> unictabl.inc
|
||||||
|
done
|
||||||
|
|
||||||
|
echo " {wxFONTENCODING_MAX /*anything*/, NULL}" >> unictabl.inc
|
||||||
|
echo " };" >> unictabl.inc
|
74
misc/unictabl/mk_fallbacks.sh
Executable file
74
misc/unictabl/mk_fallbacks.sh
Executable file
@ -0,0 +1,74 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# CVS-ID: $Id$
|
||||||
|
|
||||||
|
|
||||||
|
# add_fallback PATTERN CODE LABEL
#
# For every line of _tmp3 matching the grep PATTERN, appends
# "<unicode code><TAB>CODE" to _tmp5.  LABEL is only used in the
# progress message.
#   $1 - grep pattern matched against the _tmp3 lines
#   $2 - substitute code to record (e.g. 0x0022)
#   $3 - human-readable label for the progress message
add_fallback()
{
    echo " - for $3..."
    grep "$1" _tmp3 | while read i ; do
        # the first six characters of a line are the 0xXXXX code
        code=`echo "$i" | cut -c1-6`
        # The separator must be a tab: mk_encodings.sh splits the
        # resulting Fallbacks file with "cut -f".
        printf '%s\t%s\n' "$code" "$2" >> _tmp5
    done
}
|
||||||
|
|
||||||
|
|
||||||
|
echo " * getting list of needed unicode characters..."
|
||||||
|
|
||||||
|
cat mappings/*.TXT | sed -n '/^0x../p' | \
|
||||||
|
cut -f2,4 | sort | uniq | sed -n '/^0x/p' > _tmp1
|
||||||
|
cat _tmp1 | cut -f1 | sort | uniq > _tmp2
|
||||||
|
|
||||||
|
|
||||||
|
echo " * making unique list of unicode characters meanings..."
|
||||||
|
|
||||||
|
rm -f _tmp3
|
||||||
|
cat _tmp2 | while read i ; do
|
||||||
|
sed -n "/^$i/p" _tmp1 | (read t ; echo "$t" >> _tmp3)
|
||||||
|
done
|
||||||
|
|
||||||
|
cp _tmp3 UnicodeChars
|
||||||
|
|
||||||
|
echo " * creating one-byte fallback tables..."
|
||||||
|
|
||||||
|
rm -f Fallbacks _tmp5
|
||||||
|
|
||||||
|
echo " - for latin capital letters..."

# Collect plain capital letters and their "WITH ..." variants, then sort
# from the third blank-separated field on, so that each plain letter comes
# before its decorated variants.  "-k 3" is the POSIX spelling of the
# obsolete "+2" key syntax.
grep 'LATIN CAPITAL LETTER [A-Z]$' _tmp3 > _tmp6
grep 'LATIN CAPITAL LETTER [A-Z] WITH' _tmp3 >> _tmp6
sort -k 3 _tmp6 > _tmp4

while read i ; do
    # $i is deliberately unquoted: word splitting turns the tab after the
    # code into a single space, which the fixed column offsets rely on.
    code=`echo $i | cut -c1-6`
    # columns 8-29 hold "LATIN CAPITAL LETTER X"
    fallb=`echo $i | cut -c8-29`
    # the first matching line in _tmp4 supplies the fallback code
    base=`grep -F "$fallb" _tmp4 | cut -c1-6 | head -n 1`
    # tab-separated, as required by "cut -f" in mk_encodings.sh
    printf '%s\t%s\n' "$code" "$base" >> _tmp5
done < _tmp4
|
||||||
|
|
||||||
|
|
||||||
|
echo " - for latin small letters..."

# Collect plain small letters and their "WITH ..." variants, then sort
# from the third blank-separated field on, so that each plain letter comes
# before its decorated variants.  "-k 3" is the POSIX spelling of the
# obsolete "+2" key syntax.
grep 'LATIN SMALL LETTER [A-Z]$' _tmp3 > _tmp6
grep 'LATIN SMALL LETTER [A-Z] WITH' _tmp3 >> _tmp6
sort -k 3 _tmp6 > _tmp4

while read i ; do
    # $i is deliberately unquoted: word splitting turns the tab after the
    # code into a single space, which the fixed column offsets rely on.
    code=`echo $i | cut -c1-6`
    # columns 8-27 hold "LATIN SMALL LETTER X"
    fallb=`echo $i | cut -c8-27`
    # the first matching line in _tmp4 supplies the fallback code
    base=`grep -F "$fallb" _tmp4 | cut -c1-6 | head -n 1`
    # tab-separated, as required by "cut -f" in mk_encodings.sh
    printf '%s\t%s\n' "$code" "$base" >> _tmp5
done < _tmp4
|
||||||
|
|
||||||
|
|
||||||
|
add_fallback "DOUBLE .*QUOTATION MARK" "0x0022" "double quotations"
|
||||||
|
add_fallback "SINGLE .*QUOTATION MARK" "0x0027" "single quotations"
|
||||||
|
add_fallback "DASH" "0x002D" "dashes"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo " * removing infinite loops from fallback tables..."
|
||||||
|
|
||||||
|
cat _tmp5 | grep -v '\(0x....\) \1' | sort > Fallbacks
|
||||||
|
|
||||||
|
rm -f _tmp1 _tmp2 _tmp3 _tmp4 _tmp5 _tmp6
|
||||||
|
|
24
misc/unictabl/regenerate.sh
Executable file
24
misc/unictabl/regenerate.sh
Executable file
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# CVS-ID: $Id$
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "-----------------------------------"
|
||||||
|
echo " Refreshing tables, please wait..."
|
||||||
|
echo "-----------------------------------"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Run both generators; stop at the first failure so that an incomplete
# unictabl.inc is never moved into src/common below.
sh ./mk_fallbacks.sh || exit 1
sh ./mk_encodings.sh || exit 1
|
||||||
|
|
||||||
|
echo " * removing temporary files..."
|
||||||
|
|
||||||
|
rm -f Fallbacks UnicodeChars
|
||||||
|
|
||||||
|
echo "
|
||||||
|
* copying tables to src/common/unictabl.inc..."
|
||||||
|
|
||||||
|
mv -f unictabl.inc ../../src/common/unictabl.inc
|
||||||
|
|
||||||
|
echo "
|
||||||
|
DONE
|
||||||
|
"
|
Loading…
Reference in New Issue
Block a user