added wxEncodingConverter - scripts for creating conversion tables
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@5156 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
parent
2de89127de
commit
045c4fab85
21
misc/unictabl/README
Normal file
21
misc/unictabl/README
Normal file
@ -0,0 +1,21 @@
|
||||
|
||||
Files in this directory are used to generate
|
||||
src/common/unictabl.inc -- wxEncodingConverter helper tables that hold
|
||||
information about charset-to-unicode and unicode-to-charset conversion
|
||||
|
||||
These scripts will most probably not run under Windows, even with
|
||||
Cygwin tools. You'll need some Unix machine
|
||||
(tested only under Linux, so...)
|
||||
|
||||
To add support for a new encoding, simply add the proper .TXT file from
|
||||
ftp://ftp.unicode.org/MAPPINGS to the mappings directory. But **please**
|
||||
make sure that the newly added file has Unix line endings (LF only, not CR+LF).
|
||||
Also make sure it consists of 4 tab-separated columns (not spaces;
|
||||
# and comment must be separated by one tab)
|
||||
|
||||
After adding the file, run ./regenerate.sh and commit
|
||||
src/common/unictabl.inc to CVS.
|
||||
|
||||
Or just send the mapping file to me (v.slavik@volny.cz) and I'll
|
||||
take care of it myself ;-)
|
||||
|
80
misc/unictabl/mk_ctable.c
Normal file
80
misc/unictabl/mk_ctable.c
Normal file
@ -0,0 +1,80 @@
|
||||
|
||||
/* CVS-ID: $Id$ */
|
||||
|
||||
#include <stdio.h>
#include <stdlib.h>
|
||||
|
||||
/* One entry of the conversion table: a code of the 8-bit charset together
   with the Unicode value it maps to. */
typedef struct {
    unsigned char c;    /* code in the 8-bit charset */
    unsigned short u;   /* matching Unicode value, 0 while unknown */
} charsetItem;


/*
 * qsort() comparator that orders charsetItem entries by ascending
 * Unicode value (the charset code does not take part in the ordering).
 *
 * i1, i2: pointers to the two charsetItem entries to compare.
 * Returns a negative value, zero or a positive value when the first
 * entry's Unicode value is lower than, equal to or higher than the
 * second one's.
 */
int cmpt(const void *i1, const void *i2)
{
    const charsetItem *first = i1;
    const charsetItem *second = i2;

    /* Explicit three-way compare: subtracting the two values would yield
       the wrong sign on platforms where unsigned short promotes to
       unsigned int. */
    return (first->u > second->u) - (first->u < second->u);
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
unsigned enc, unic;
|
||||
unsigned i;
|
||||
charsetItem table[256];
|
||||
|
||||
for (i = 0; i < 256; i++) { table[i].c = i, table[i].u = 0; /* unknown */}
|
||||
|
||||
while (!feof(stdin))
|
||||
{
|
||||
scanf("%i\t%i\n", &enc, &unic);
|
||||
table[enc].u = unic;
|
||||
table[enc].c = enc;
|
||||
if (enc < 128 && enc != unic)
|
||||
fprintf(stderr, "7bit ASCII incompatibilit (%s): %i->%i\n",
|
||||
argv[2], enc, unic);
|
||||
}
|
||||
|
||||
/* dump it: */
|
||||
|
||||
printf("\n\n"
|
||||
"/* \n"
|
||||
" * %s to Unicode recoding table \n"
|
||||
" * based on file %s by Unicode Consortium\n"
|
||||
" */\n\n"
|
||||
"static wxUint16 encoding_table__%s[128] = {",
|
||||
argv[2], argv[1], argv[2]);
|
||||
|
||||
for (i = 128; i < 256; i++)
|
||||
{
|
||||
if (i % 8 == 0)
|
||||
printf("\n ");
|
||||
printf("0x%04X%c ", table[i].u, (i == 255) ? '\n' : ',');
|
||||
}
|
||||
printf("};\n");
|
||||
|
||||
qsort(table + 128, 128, sizeof(table[0]), cmpt);
|
||||
|
||||
|
||||
/*
|
||||
NO, WE DON'T NEED REVERSE TABLE, WE CAN BUILD IT AT RUNTIME
|
||||
(won't take that much time, after all you don't init
|
||||
conversion so often...)
|
||||
|
||||
printf("\n"
|
||||
"static wxUint16 encoding_table_rev__%s[128] = {",
|
||||
argv[2]);
|
||||
|
||||
for (i = 128; i < 256; i++)
|
||||
{
|
||||
if (i % 4 == 0)
|
||||
printf("\n ");
|
||||
printf("{c:0x%02X,u:0x%04X}%c ", table[i].c, table[i].u, (i == 255) ? '\n' : ',');
|
||||
}
|
||||
printf("};\n");
|
||||
*/
|
||||
|
||||
return 1;
|
||||
}
|
122
misc/unictabl/mk_encodings.sh
Executable file
122
misc/unictabl/mk_encodings.sh
Executable file
@ -0,0 +1,122 @@
|
||||
#!/bin/sh
# CVS-ID: $Id$
#
# Builds unictabl.inc in the current directory:
#   1. compiles mk_ctable.c into ./mk_ctable,
#   2. writes the header comment carrying the Unicode, Inc. notice,
#   3. appends one encoding_table__<ENC> per mappings/*.TXT file,
#   4. appends the Unicode to 7bit ASCII fallback table read from ./Fallbacks,
#   5. appends encodings_list[], the index of all generated tables.
#
# Must be started from the directory that holds mk_ctable.c, mappings/
# and the Fallbacks file.


echo " * compiling C source generator..."

# Without the generator every table would silently be missing.
cc mk_ctable.c -o mk_ctable || {
    echo "mk_encodings.sh: cannot compile mk_ctable.c" >&2
    exit 1
}

# The fallback table is read from this file further down.
[ -f Fallbacks ] || {
    echo "mk_encodings.sh: file Fallbacks not found" >&2
    rm -f mk_ctable
    exit 1
}

echo " * writing copyright info..."

# \$ keeps the shell from expanding the text and keeps the keyword
# literal, so it reaches the generated file unexpanded.
echo "

/*
* This file is #included by encconv.cpp
*
* CVS-ID: \$Id\$
*
* *** *** CAUTION! *** ***
* Do not modify this file by hand! It is generated by shell
* script \$(WXWIN)/misc/unictabl/regenerate
*
* Parts of this file are based on tables published by Unicode, Inc.
* Original tables are freely available at
* ftp://ftp.unicode.org/MAPPINGS
*
* Original copyright info as present in mapping tables follows:
*
*
* Copyright (c) 1991-1999 Unicode, Inc. All Rights reserved.
*
* This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
* No claims are made as to fitness for any particular purpose. No
* warranties of any kind are expressed or implied. The recipient
* agrees to determine applicability of information provided. If this
* file has been provided on optical media by Unicode, Inc., the sole
* remedy for any claim will be exchange of defective media within 90
* days of receipt.
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form for
* internal or external distribution as long as this notice remains
* attached.
*/

" > unictabl.inc


echo " * creating C tables..."

all_encodings=""

for i in mappings/*.TXT ; do

    # Derive the encoding name from the file name: drop the leading
    # "mappings/" (9 characters) and the .TXT suffix, turn '-' into '_',
    # prefix names containing 8859_ with ISO and shorten KOI8_R to KOI8.
    enc=`echo "$i" | cut -c10- | tr - _ | sed -e 's/\.TXT//g' -e 's/8859_\(.*\)/ISO8859_\1/g' -e 's/KOI8_R/KOI8/g'`

    # Only lines that start with two 0x... columns are mappings; the
    # generator gets the first two tab-separated columns of those.
    sed -n '/^0x...0x..../p' "$i" | cut -f1,2 | \
        ./mk_ctable "$i" "$enc" >> unictabl.inc

    all_encodings="$all_encodings $enc"

done

rm -f mk_ctable


echo " * adding fallback..."

echo "




/*
*
* Unicode to 7bit ASCII fallback
* (for use with wxRECODE_SUBSTITUTE recoding mode)
*
*/

static struct {
wxUint16 c /*code*/;
wxUint8 s /*inaccurate substitution*/;
} encoding_unicode_fallback[] = {
" >> unictabl.inc

# Fallbacks holds "<code><TAB><substitute>" lines; characters 1-2 and 5-6
# of the substitute (0xNNNN) form the one-byte literal 0xNN.
while read -r i ; do
    code=`echo "$i" | cut -f1`
    subs=`echo "$i" | cut -f2 | cut -c1-2,5-6`
    echo " {$code, $subs}," >> unictabl.inc
done < Fallbacks
echo " {0, 0}" >> unictabl.inc
echo " };" >> unictabl.inc
echo "
static unsigned encoding_unicode_fallback_count = "`wc -l < Fallbacks`";" >> unictabl.inc


echo " * adding reference table..."


echo "



/*
*
* Table of all supported encodings:
*
*/

static struct {
wxFontEncoding encoding; // encoding identifier
wxUint16 *table; // 8bit to unicode table
} encodings_list[] = {
" >> unictabl.inc

for i in $all_encodings ; do
    echo " { wxFONTENCODING_$i, encoding_table__$i}," >> unictabl.inc
done

echo " {wxFONTENCODING_MAX /*anything*/, NULL}" >> unictabl.inc
echo " };" >> unictabl.inc
|
74
misc/unictabl/mk_fallbacks.sh
Executable file
74
misc/unictabl/mk_fallbacks.sh
Executable file
@ -0,0 +1,74 @@
|
||||
#!/bin/sh
# CVS-ID: $Id$
#
# Builds two files in the current directory from mappings/*.TXT:
#   UnicodeChars - one "<unicode code><TAB><character name>" line per
#                  Unicode value used by any mapping file,
#   Fallbacks    - "<unicode code><TAB><substitute code>" lines, sorted.
#
# Uses _tmp1 ... _tmp6 as scratch files and removes them at the end.

# The column separator of every file written here is a tab; it is kept
# in a variable so that it cannot be mistaken for (or turned into) a blank.
TAB=`printf '\t'`


# add_fallback PATTERN SUBSTITUTE DESCRIPTION
#   Appends "<code><TAB>SUBSTITUTE" to _tmp5 for every line of _tmp3 that
#   matches the grep pattern PATTERN. DESCRIPTION is only printed.
add_fallback()
{
    echo " - for $3..."
    grep "$1" _tmp3 | while read -r i ; do
        code=`echo "$i" | cut -c1-6`
        echo "$code$TAB$2" >> _tmp5
    done
}


# letter_fallbacks KIND LAST_COLUMN
#   KIND is CAPITAL or SMALL. Collects the "LATIN <KIND> LETTER X" and
#   "LATIN <KIND> LETTER X WITH ..." lines of _tmp3 and maps each of them
#   to the first line sharing the same "LATIN <KIND> LETTER X" text, which
#   spans columns 8..LAST_COLUMN of a line.
letter_fallbacks()
{
    grep 'LATIN '"$1"' LETTER [A-Z]$' _tmp3 > _tmp6
    grep 'LATIN '"$1"' LETTER [A-Z] WITH' _tmp3 >> _tmp6
    # Key starts at the third blank-separated field, so within one letter
    # the bare letter sorts ahead of its "WITH ..." variants.
    sort -k3 _tmp6 > _tmp4

    while read -r i ; do
        code=`echo "$i" | cut -c1-6`
        fallb=`echo "$i" | cut -c8-"$2"`
        grep -F "$fallb" _tmp4 | cut -c1-6 | (read -r i ;
            echo "$code$TAB$i" >> _tmp5)
    done < _tmp4
}


echo " * getting list of needed unicode characters..."

# Columns 2 and 4 of every mapping line: Unicode value and character name.
# The second sed drops lines whose Unicode column is not a 0x... value.
cat mappings/*.TXT | sed -n '/^0x../p' | \
    cut -f2,4 | sort | uniq | sed -n '/^0x/p' > _tmp1
cut -f1 _tmp1 | sort | uniq > _tmp2


echo " * making unique list of unicode characters meanings..."

# Keep only the first name found for each Unicode value.
rm -f _tmp3
while read -r i ; do
    sed -n "/^$i/p" _tmp1 | (read -r t ; echo "$t" >> _tmp3)
done < _tmp2

cp _tmp3 UnicodeChars

echo " * creating one-byte fallback tables..."

rm -f Fallbacks _tmp5

echo " - for latin capital letters..."

# "LATIN CAPITAL LETTER X" is 22 characters long: columns 8-29.
letter_fallbacks CAPITAL 29


echo " - for latin small letters..."

# "LATIN SMALL LETTER X" is 20 characters long: columns 8-27.
letter_fallbacks SMALL 27


add_fallback "DOUBLE .*QUOTATION MARK" "0x0022" "double quotations"
add_fallback "SINGLE .*QUOTATION MARK" "0x0027" "single quotations"
add_fallback "DASH" "0x002D" "dashes"



echo " * removing infinite loops from fallback tables..."

# A character whose substitute is the character itself would loop.
grep -v "\(0x....\)$TAB\1" _tmp5 | sort > Fallbacks

rm -f _tmp1 _tmp2 _tmp3 _tmp4 _tmp5 _tmp6
|
||||
|
24
misc/unictabl/regenerate.sh
Executable file
24
misc/unictabl/regenerate.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/sh
# CVS-ID: $Id$
#
# Regenerates src/common/unictabl.inc: runs mk_fallbacks.sh and
# mk_encodings.sh, removes the intermediate files and moves the resulting
# unictabl.inc to ../../src/common/.
#
# Must be started from the directory that holds the two helper scripts.

echo ""
echo "-----------------------------------"
echo " Refreshing tables, please wait..."
echo "-----------------------------------"
echo ""

# Stop at the first failing step so that an incomplete unictabl.inc never
# replaces the one in src/common.
sh ./mk_fallbacks.sh || {
    echo "regenerate.sh: mk_fallbacks.sh failed" >&2
    exit 1
}
sh ./mk_encodings.sh || {
    echo "regenerate.sh: mk_encodings.sh failed" >&2
    exit 1
}

echo " * removing temporary files..."

rm -f Fallbacks UnicodeChars

echo "
* copying tables to src/common/unictabl.inc..."

mv -f unictabl.inc ../../src/common/unictabl.inc || {
    echo "regenerate.sh: cannot install unictabl.inc" >&2
    exit 1
}

echo "
DONE
"
|
Loading…
Reference in New Issue
Block a user