Refine auto-detection algorithm

This commit is contained in:
James Clark 1998-12-30 08:57:44 +00:00
parent cc0924046a
commit 39603886d3

View File

@ -1379,6 +1379,18 @@ int initScan(const ENCODING **encodingTable,
return XML_TOK_NONE;
encPtr = enc->encPtr;
if (ptr + 1 == end) {
/* only a single byte available for auto-detection */
/* a well-formed document entity must have more than one byte */
if (state != XML_CONTENT_STATE)
return XML_TOK_PARTIAL;
/* so we're parsing an external text entity... */
/* if UTF-16 was externally specified, then we need at least 2 bytes */
switch (INIT_ENC_INDEX(enc)) {
case UTF_16_ENC:
case UTF_16LE_ENC:
case UTF_16BE_ENC:
return XML_TOK_PARTIAL;
}
switch ((unsigned char)*ptr) {
case 0xFE:
case 0xFF:
@ -1394,12 +1406,6 @@ int initScan(const ENCODING **encodingTable,
}
else {
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
case 0x003C:
if (INIT_ENC_INDEX(enc) == UTF_16LE_ENC
&& state == XML_CONTENT_STATE)
break;
*encPtr = encodingTable[UTF_16BE_ENC];
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFEFF:
if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
&& state == XML_CONTENT_STATE)
@ -1407,8 +1413,10 @@ int initScan(const ENCODING **encodingTable,
*nextTokPtr = ptr + 2;
*encPtr = encodingTable[UTF_16BE_ENC];
return XML_TOK_BOM;
/* 00 3C is handled in the default case */
case 0x3C00:
if (INIT_ENC_INDEX(enc) == UTF_16BE_ENC
if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
|| INIT_ENC_INDEX(enc) == UTF_16_ENC)
&& state == XML_CONTENT_STATE)
break;
*encPtr = encodingTable[UTF_16LE_ENC];
@ -1438,6 +1446,33 @@ int initScan(const ENCODING **encodingTable,
*encPtr = encodingTable[UTF_8_ENC];
return XML_TOK_BOM;
}
break;
default:
if (ptr[0] == '\0') {
/* 0 isn't a legal data character. Furthermore a document entity can only
start with ASCII characters. So the only way this can fail to be big-endian
UTF-16 is if it's an external parsed general entity that's labelled as
UTF-16LE. */
if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
break;
*encPtr = encodingTable[UTF_16BE_ENC];
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
else if (ptr[1] == '\0') {
/* We could recover here in the case:
- parsing an external entity
- second byte is 0
- no externally specified encoding
- no encoding declaration
by assuming UTF-16LE. But we don't, because this would mean when
presented just with a single byte, we couldn't reliably determine
whether we needed further bytes. */
if (state == XML_CONTENT_STATE)
break;
*encPtr = encodingTable[UTF_16LE_ENC];
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
break;
}
}
*encPtr = encodingTable[INIT_ENC_INDEX(enc)];