From 6203e0f8ca1081117e8fb4d5911b14cbb5927014 Mon Sep 17 00:00:00 2001
From: James Clark
Date: Tue, 17 Nov 1998 12:42:16 +0000
Subject: [PATCH] Redo auto-detection inline with UTF-16 ID

---
Review notes (text between the "---" line and the diff is not part of the
commit message):

 * getEncodingIndex: added the comma between "UTF-16BE" and "UTF-16LE".
   Without it the two adjacent string literals are concatenated, so
   encodingNames holds five entries and index UTF_16LE_ENC (5) has no name.
 * getEncodingIndex: the loop bound is cast to int so that the comparison
   with the int index is not a signed/unsigned comparison.
 * initScan: the UTF-8 BOM case now sets *nextTokPtr = ptr + 3 before
   returning XML_TOK_BOM, the same way the two UTF-16 BOM cases set
   ptr + 2.
 * The diffstat and the first xmltok.c hunk header account for the one
   added line; the blob ids on the "index" lines were not recomputed.
 * Indentation inside the diff lines was reconstructed; check context and
   removed lines against the tree before applying.

 expat/xmltok/xmltok.c    | 122 +++++++++++++++++++++++++++++++++++++++
 expat/xmltok/xmltok_ns.c | 111 +++++++----------------------------
 2 files changed, 144 insertions(+), 89 deletions(-)

diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c
index f85f8b80..c0e99e76 100755
--- a/expat/xmltok/xmltok.c
+++ b/expat/xmltok/xmltok.c
@@ -1142,6 +1142,128 @@ XmlInitUnknownEncoding(void *mem,
   return &(e->normal.enc);
 }
 
+/* If this enumeration is changed, getEncodingIndex and encodings
+must also be changed. */
+enum {
+  UNKNOWN_ENC = -1,
+  ISO_8859_1_ENC = 0,
+  US_ASCII_ENC,
+  UTF_8_ENC,
+  UTF_16_ENC,
+  UTF_16BE_ENC,
+  UTF_16LE_ENC,
+  /* must match encodingNames up to here */
+  NO_ENC
+};
+
+static
+int getEncodingIndex(const char *name)
+{
+  static const char *encodingNames[] = {
+    "ISO-8859-1",
+    "US-ASCII",
+    "UTF-8",
+    "UTF-16",
+    "UTF-16BE",
+    "UTF-16LE",
+  };
+  int i;
+  if (name == 0)
+    return NO_ENC;
+  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
+    if (streqci(name, encodingNames[i]))
+      return i;
+  return UNKNOWN_ENC;
+}
+
+/* For binary compatibility, we store the index of the encoding specified
+at initialization in the isUtf16 member. */
+
+#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
+
+/* This is what detects the encoding.
+encodingTable maps from encoding indices to encodings;
+INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
+state is XML_CONTENT_STATE if we're parsing an external text entity,
+and XML_PROLOG_STATE otherwise.
+*/
+
+
+static
+int initScan(const ENCODING **encodingTable,
+             const INIT_ENCODING *enc,
+             int state,
+             const char *ptr,
+             const char *end,
+             const char **nextTokPtr)
+{
+  const ENCODING **encPtr;
+
+  if (ptr == end)
+    return XML_TOK_NONE;
+  encPtr = enc->encPtr;
+  if (ptr + 1 == end) {
+    switch ((unsigned char)*ptr) {
+    case 0xFE:
+    case 0xFF:
+    case 0xEF: /* possibly first byte of UTF-8 BOM */
+      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
+          && state == XML_CONTENT_STATE)
+        break;
+      /* fall through */
+    case 0x00:
+    case 0x3C:
+      return XML_TOK_PARTIAL;
+    }
+  }
+  else {
+    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
+    case 0x003C:
+      if (INIT_ENC_INDEX(enc) == UTF_16LE_ENC
+          && state == XML_CONTENT_STATE)
+        break;
+      *encPtr = encodingTable[UTF_16BE_ENC];
+      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
+    case 0xFEFF:
+      *nextTokPtr = ptr + 2;
+      *encPtr = encodingTable[UTF_16BE_ENC];
+      return XML_TOK_BOM;
+    case 0x3C00:
+      if (INIT_ENC_INDEX(enc) == UTF_16BE_ENC
+          && state == XML_CONTENT_STATE)
+        break;
+      *encPtr = encodingTable[UTF_16LE_ENC];
+      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
+    case 0xFFFE:
+      *nextTokPtr = ptr + 2;
+      *encPtr = encodingTable[UTF_16LE_ENC];
+      return XML_TOK_BOM;
+    case 0xEFBB:
+      /* Maybe a UTF-8 BOM (EF BB BF) */
+      /* If there's an explicitly specified (external) encoding
+         of ISO-8859-1 or some flavour of UTF-16
+         and this is an external text entity,
+         don't look for the BOM,
+         because it might be a legal data. */
+      if (state == XML_CONTENT_STATE) {
+        int e = INIT_ENC_INDEX(enc);
+        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
+          break;
+      }
+      if (ptr + 2 == end)
+        return XML_TOK_PARTIAL;
+      if ((unsigned char)ptr[2] == 0xBF) {
+        *nextTokPtr = ptr + 3; /* step past the three BOM bytes */
+        *encPtr = encodingTable[UTF_8_ENC];
+        return XML_TOK_BOM;
+      }
+    }
+  }
+  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
+  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
+}
+
+
 #define NS(x) x
 #define ns(x) x
 #include "xmltok_ns.c"
diff --git a/expat/xmltok/xmltok_ns.c b/expat/xmltok/xmltok_ns.c
index f9ae5cbc..6accf12d 100755
--- a/expat/xmltok/xmltok_ns.c
+++ b/expat/xmltok/xmltok_ns.c
@@ -16,91 +16,36 @@ const ENCODING *NS(XmlGetUtf16InternalEncoding)()
 }
 
 static
-int NS(initScan)(const ENCODING *enc, int state, const char *ptr, const char *end,
-                 const char **nextTokPtr)
-{
-  const ENCODING **encPtr;
-
-  if (ptr == end)
-    return XML_TOK_NONE;
-  encPtr = ((const INIT_ENCODING *)enc)->encPtr;
-  if (ptr + 1 == end) {
-    switch ((unsigned char)*ptr) {
-    case 0xFE:
-    case 0xFF:
-    case 0x00:
-    case 0x3C:
-      return XML_TOK_PARTIAL;
-    }
-  }
-  else {
-    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
-    case 0x003C:
-      *encPtr = &ns(big2_encoding).enc;
-      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
-    case 0xFEFF:
-      *nextTokPtr = ptr + 2;
-      *encPtr = &ns(big2_encoding).enc;
-      return XML_TOK_BOM;
-    case 0x3C00:
-      *encPtr = &ns(little2_encoding).enc;
-      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
-    case 0xFFFE:
-      *nextTokPtr = ptr + 2;
-      *encPtr = &ns(little2_encoding).enc;
-      return XML_TOK_BOM;
-    }
-  }
-  *encPtr = (enc->minBytesPerChar == 2
-             ? &ns(big2_encoding).enc
-             : &ns(utf8_encoding).enc);
-  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
-}
-
+const ENCODING *NS(encodings)[] = {
+  &ns(latin1_encoding).enc,
+  &ns(ascii_encoding).enc,
+  &ns(utf8_encoding).enc,
+  &ns(big2_encoding).enc,
+  &ns(big2_encoding).enc,
+  &ns(little2_encoding).enc,
+  &ns(utf8_encoding).enc /* NO_ENC */
+};
 
 static
 int NS(initScanProlog)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr)
 {
-  return NS(initScan)(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr);
+  return NS(initScan)(NS(encodings), (const INIT_ENCODING *)enc, XML_PROLOG_STATE, ptr, end, nextTokPtr);
 }
 
 static
 int NS(initScanContent)(const ENCODING *enc, const char *ptr, const char *end,
-                       const char **nextTokPtr)
+                        const char **nextTokPtr)
 {
-  return NS(initScan)(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
+  return NS(initScan)(NS(encodings), (const INIT_ENCODING *)enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
 }
 
 int NS(XmlInitEncoding)(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
 {
-  if (name) {
-    if (streqci(name, "ISO-8859-1")) {
-      *encPtr = &ns(latin1_encoding).enc;
-      return 1;
-    }
-    if (streqci(name, "UTF-8")) {
-      *encPtr = &ns(utf8_encoding).enc;
-      return 1;
-    }
-    if (streqci(name, "US-ASCII")) {
-      *encPtr = &ns(ascii_encoding).enc;
-      return 1;
-    }
-    if (streqci(name, "UTF-16BE")) {
-      *encPtr = &ns(big2_encoding).enc;
-      return 1;
-    }
-    if (streqci(name, "UTF-16LE")) {
-      *encPtr = &ns(little2_encoding).enc;
-      return 1;
-    }
-    if (!streqci(name, "UTF-16"))
-      return 0;
-    p->initEnc.minBytesPerChar = 2;
-  }
-  else
-    p->initEnc.minBytesPerChar = 1;
+  int i = getEncodingIndex(name);
+  if (i == UNKNOWN_ENC)
+    return 0;
+  INIT_ENC_INDEX(p) = (char)i;
   p->initEnc.scanners[XML_PROLOG_STATE] = NS(initScanProlog);
   p->initEnc.scanners[XML_CONTENT_STATE] = NS(initScanContent);
   p->initEnc.updatePosition = initUpdatePosition;
@@ -109,7 +54,6 @@ int NS(XmlInitEncoding)(INIT_ENCODING *p, const ENCODING **encPtr, const char *n
   return 1;
 }
 
-
 static
 const ENCODING *NS(findEncoding)(const ENCODING *enc, const char *ptr, const char *end)
 {
@@ -121,23 +65,12 @@ const ENCODING *NS(findEncoding)(const ENCODING *enc, const char *ptr, const cha
   if (ptr != end)
     return 0;
   *p = 0;
-  for (i = 0; buf[i]; i++) {
-    if ('a' <= buf[i] && buf[i] <= 'z')
-      buf[i] += 'A' - 'a';
-  }
-  if (streqci(buf, "UTF-8"))
-    return &ns(utf8_encoding).enc;
-  if (streqci(buf, "ISO-8859-1"))
-    return &ns(latin1_encoding).enc;
-  if (streqci(buf, "US-ASCII"))
-    return &ns(ascii_encoding).enc;
-  if (streqci(buf, "UTF-16")) {
-    static const unsigned short n = 1;
-    if (enc->minBytesPerChar == 2)
-      return enc;
-    return &ns(big2_encoding).enc;
-  }
-  return 0;
+  if (streqci(buf, "UTF-16") && enc->minBytesPerChar == 2)
+    return enc;
+  i = getEncodingIndex(buf);
+  if (i == UNKNOWN_ENC)
+    return 0;
+  return NS(encodings)[i];
 }
 
 int NS(XmlParseXmlDecl)(int isGeneralTextEntity,