Don't handle CDATA sections as a single token
This commit is contained in:
parent
7e71c8f5d5
commit
fa8b616b9c
@ -103,12 +103,15 @@ typedef enum XML_Error Processor(XML_Parser parser,
|
||||
|
||||
static Processor prologProcessor;
|
||||
static Processor contentProcessor;
|
||||
static Processor cdataSectionProcessor;
|
||||
static Processor epilogProcessor;
|
||||
static Processor errorProcessor;
|
||||
|
||||
static enum XML_Error
|
||||
doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc,
|
||||
const char *start, const char *end, const char **endPtr);
|
||||
static enum XML_Error
|
||||
doCdataSection(XML_Parser parser, const char **startPtr, const char *end, const char **nextPtr);
|
||||
static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s);
|
||||
static int
|
||||
defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue);
|
||||
@ -473,7 +476,8 @@ const char *XML_ErrorString(int code)
|
||||
"reference to external entity in attribute",
|
||||
"xml processing instruction not at start of external entity",
|
||||
"unknown encoding",
|
||||
"encoding specified in XML declaration is incorrect"
|
||||
"encoding specified in XML declaration is incorrect",
|
||||
"unclosed CDATA section",
|
||||
};
|
||||
if (code > 0 && code < sizeof(message)/sizeof(message[0]))
|
||||
return message[code];
|
||||
@ -744,15 +748,13 @@ doContent(XML_Parser parser,
|
||||
characterDataHandler(userData, &c, 1);
|
||||
}
|
||||
break;
|
||||
case XML_TOK_CDATA_SECTION:
|
||||
if (characterDataHandler) {
|
||||
const char *lim = next - enc->minBytesPerChar * 3;
|
||||
s += enc->minBytesPerChar * 9;
|
||||
do {
|
||||
char *dataPtr = dataBuf;
|
||||
XmlConvert(enc, XML_UTF8_ENCODING, &s, lim, &dataPtr, dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, dataPtr - dataBuf);
|
||||
} while (s != lim);
|
||||
case XML_TOK_CDATA_SECT_OPEN:
|
||||
{
|
||||
enum XML_Error result = doCdataSection(parser, &next, end, nextPtr);
|
||||
if (!next) {
|
||||
processor = cdataSectionProcessor;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case XML_TOK_TRAILING_RSQB:
|
||||
@ -883,6 +885,76 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc,
|
||||
return XML_ERROR_NONE;
|
||||
}
|
||||
|
||||
/* The idea here is to avoid using stack for each CDATA section when
|
||||
the whole file is parsed with one call. */
|
||||
|
||||
static
|
||||
enum XML_Error cdataSectionProcessor(XML_Parser parser,
|
||||
const char *start,
|
||||
const char *end,
|
||||
const char **endPtr)
|
||||
{
|
||||
enum XML_Error result = doCdataSection(parser, &start, end, endPtr);
|
||||
if (start) {
|
||||
processor = contentProcessor;
|
||||
return contentProcessor(parser, start, end, endPtr);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* startPtr gets set to non-null is the section is closed, and to null if
|
||||
the section is not yet closed. */
|
||||
|
||||
static
|
||||
enum XML_Error doCdataSection(XML_Parser parser,
|
||||
const char **startPtr,
|
||||
const char *end,
|
||||
const char **nextPtr)
|
||||
{
|
||||
const char *s = *startPtr;
|
||||
*startPtr = 0;
|
||||
for (;;) {
|
||||
const char *next;
|
||||
int tok = XmlCdataSectionTok(encoding, s, end, &next);
|
||||
switch (tok) {
|
||||
case XML_TOK_CDATA_SECT_CLOSE:
|
||||
*startPtr = next;
|
||||
return XML_ERROR_NONE;
|
||||
case XML_TOK_DATA_NEWLINE:
|
||||
if (characterDataHandler) {
|
||||
char c = '\n';
|
||||
characterDataHandler(userData, &c, 1);
|
||||
}
|
||||
break;
|
||||
case XML_TOK_DATA_CHARS:
|
||||
if (characterDataHandler) {
|
||||
do {
|
||||
char *dataPtr = dataBuf;
|
||||
XmlConvert(encoding, XML_UTF8_ENCODING, &s, next, &dataPtr, dataBufEnd);
|
||||
characterDataHandler(userData, dataBuf, dataPtr - dataBuf);
|
||||
} while (s != next);
|
||||
}
|
||||
break;
|
||||
case XML_TOK_INVALID:
|
||||
errorPtr = next;
|
||||
return XML_ERROR_INVALID_TOKEN;
|
||||
case XML_TOK_PARTIAL:
|
||||
case XML_TOK_NONE:
|
||||
if (nextPtr) {
|
||||
*nextPtr = s;
|
||||
return XML_ERROR_NONE;
|
||||
}
|
||||
errorPtr = s;
|
||||
return XML_ERROR_UNCLOSED_CDATA_SECTION;
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
s = next;
|
||||
}
|
||||
/* not reached */
|
||||
}
|
||||
|
||||
|
||||
static enum XML_Error
|
||||
prologProcessor(XML_Parser parser,
|
||||
const char *s,
|
||||
|
@ -110,7 +110,8 @@ enum XML_Error {
|
||||
XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
|
||||
XML_ERROR_MISPLACED_XML_PI,
|
||||
XML_ERROR_UNKNOWN_ENCODING,
|
||||
XML_ERROR_INCORRECT_ENCODING
|
||||
XML_ERROR_INCORRECT_ENCODING,
|
||||
XML_ERROR_UNCLOSED_CDATA_SECTION
|
||||
};
|
||||
|
||||
int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
|
||||
|
@ -23,7 +23,7 @@ Contributor(s):
|
||||
#include "nametab.h"
|
||||
|
||||
#define VTABLE1 \
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, \
|
||||
{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
|
||||
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
|
||||
PREFIX(sameName), \
|
||||
PREFIX(nameMatchesAscii), \
|
||||
|
@ -41,7 +41,7 @@ extern "C" {
|
||||
#define XML_TOK_INVALID 0
|
||||
|
||||
/* The following tokens are returned by XmlContentTok; some are also
|
||||
returned by XmlAttributeValueTok and XmlEntityTok */
|
||||
returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok */
|
||||
|
||||
#define XML_TOK_START_TAG_WITH_ATTS 1
|
||||
#define XML_TOK_START_TAG_NO_ATTS 2
|
||||
@ -50,7 +50,7 @@ extern "C" {
|
||||
#define XML_TOK_END_TAG 5
|
||||
#define XML_TOK_DATA_CHARS 6
|
||||
#define XML_TOK_DATA_NEWLINE 7
|
||||
#define XML_TOK_CDATA_SECTION 8
|
||||
#define XML_TOK_CDATA_SECT_OPEN 8
|
||||
#define XML_TOK_ENTITY_REF 9
|
||||
#define XML_TOK_CHAR_REF 10 /* numeric character reference */
|
||||
|
||||
@ -88,12 +88,16 @@ extern "C" {
|
||||
#define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
|
||||
#define XML_TOK_COMMA 38
|
||||
|
||||
/* The following tokens is returned only by XmlAttributeValueTok */
|
||||
/* The following token is returned only by XmlAttributeValueTok */
|
||||
#define XML_TOK_ATTRIBUTE_VALUE_S 39
|
||||
|
||||
#define XML_N_STATES 2
|
||||
/* The following token is returned only by XmlCdataSectionTok */
|
||||
#define XML_TOK_CDATA_SECT_CLOSE 40
|
||||
|
||||
#define XML_N_STATES 3
|
||||
#define XML_PROLOG_STATE 0
|
||||
#define XML_CONTENT_STATE 1
|
||||
#define XML_CDATA_SECTION_STATE 2
|
||||
|
||||
#define XML_N_LITERAL_TYPES 2
|
||||
#define XML_ATTRIBUTE_VALUE_LITERAL 0
|
||||
@ -189,6 +193,9 @@ literals, comments and processing instructions.
|
||||
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
/* This is used for performing a 2nd-level tokenization on
|
||||
the content of a literal that has already been returned by XmlTok. */
|
||||
|
||||
|
@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* ptr points to character following "<![" */
|
||||
|
||||
static
|
||||
int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
int i;
|
||||
/* CDATA[]]> */
|
||||
if (end - ptr < 9 * MINBPC)
|
||||
/* CDATA[ */
|
||||
if (end - ptr < 6 * MINBPC)
|
||||
return XML_TOK_PARTIAL;
|
||||
for (i = 0; i < 6; i++, ptr += MINBPC) {
|
||||
if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
|
||||
@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
end -= 2 * MINBPC;
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
INVALID_CASES(ptr, nextTokPtr)
|
||||
case BT_RSQB:
|
||||
if (CHAR_MATCHES(enc, ptr + MINBPC, ']')
|
||||
&& CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) {
|
||||
*nextTokPtr = ptr + 3 * MINBPC;
|
||||
return XML_TOK_CDATA_SECTION;
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_CDATA_SECT_OPEN;
|
||||
}
|
||||
/* fall through */
|
||||
|
||||
static
|
||||
int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
if (ptr == end)
|
||||
return XML_TOK_NONE;
|
||||
#if MINBPC > 1
|
||||
{
|
||||
size_t n = end - ptr;
|
||||
if (n & (MINBPC - 1)) {
|
||||
n &= ~(MINBPC - 1);
|
||||
if (n == 0)
|
||||
return XML_TOK_PARTIAL;
|
||||
end = ptr + n;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_RSQB:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (!CHAR_MATCHES(enc, ptr, ']'))
|
||||
break;
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (!CHAR_MATCHES(enc, ptr, '>')) {
|
||||
ptr -= MINBPC;
|
||||
break;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_CDATA_SECT_CLOSE;
|
||||
case BT_CR:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (BYTE_TYPE(enc, ptr) == BT_LF)
|
||||
ptr += MINBPC;
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_NEWLINE;
|
||||
case BT_LF:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_DATA_NEWLINE;
|
||||
INVALID_CASES(ptr, nextTokPtr)
|
||||
default:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
}
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
#define LEAD_CASE(n) \
|
||||
case BT_LEAD ## n: \
|
||||
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_DATA_CHARS; \
|
||||
} \
|
||||
ptr += n; \
|
||||
break;
|
||||
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
|
||||
#undef LEAD_CASE
|
||||
case BT_NONXML:
|
||||
case BT_MALFORM:
|
||||
case BT_TRAIL:
|
||||
case BT_CR:
|
||||
case BT_LF:
|
||||
case BT_RSQB:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_CHARS;
|
||||
default:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_CHARS;
|
||||
}
|
||||
|
||||
/* ptr points to character following "</" */
|
||||
|
Loading…
Reference in New Issue
Block a user