Don't handle CDATA sections as a single token

This commit is contained in:
James Clark 1998-05-02 05:31:15 +00:00
parent 7e71c8f5d5
commit fa8b616b9c
5 changed files with 176 additions and 33 deletions

View File

@ -103,12 +103,15 @@ typedef enum XML_Error Processor(XML_Parser parser,
static Processor prologProcessor; static Processor prologProcessor;
static Processor contentProcessor; static Processor contentProcessor;
static Processor cdataSectionProcessor;
static Processor epilogProcessor; static Processor epilogProcessor;
static Processor errorProcessor; static Processor errorProcessor;
static enum XML_Error static enum XML_Error
doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc, doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc,
const char *start, const char *end, const char **endPtr); const char *start, const char *end, const char **endPtr);
static enum XML_Error
doCdataSection(XML_Parser parser, const char **startPtr, const char *end, const char **nextPtr);
static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s); static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s);
static int static int
defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue); defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue);
@ -473,7 +476,8 @@ const char *XML_ErrorString(int code)
"reference to external entity in attribute", "reference to external entity in attribute",
"xml processing instruction not at start of external entity", "xml processing instruction not at start of external entity",
"unknown encoding", "unknown encoding",
"encoding specified in XML declaration is incorrect" "encoding specified in XML declaration is incorrect",
"unclosed CDATA section",
}; };
if (code > 0 && code < sizeof(message)/sizeof(message[0])) if (code > 0 && code < sizeof(message)/sizeof(message[0]))
return message[code]; return message[code];
@ -744,15 +748,13 @@ doContent(XML_Parser parser,
characterDataHandler(userData, &c, 1); characterDataHandler(userData, &c, 1);
} }
break; break;
case XML_TOK_CDATA_SECTION: case XML_TOK_CDATA_SECT_OPEN:
if (characterDataHandler) { {
const char *lim = next - enc->minBytesPerChar * 3; enum XML_Error result = doCdataSection(parser, &next, end, nextPtr);
s += enc->minBytesPerChar * 9; if (!next) {
do { processor = cdataSectionProcessor;
char *dataPtr = dataBuf; return result;
XmlConvert(enc, XML_UTF8_ENCODING, &s, lim, &dataPtr, dataBufEnd); }
characterDataHandler(userData, dataBuf, dataPtr - dataBuf);
} while (s != lim);
} }
break; break;
case XML_TOK_TRAILING_RSQB: case XML_TOK_TRAILING_RSQB:
@ -883,6 +885,76 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc,
return XML_ERROR_NONE; return XML_ERROR_NONE;
} }
/* Processor that continues scanning a CDATA section via doCdataSection.
   Having a dedicated processor for this is meant to avoid using stack
   for each CDATA section when the whole file is parsed with one call.

   If doCdataSection reports that the section has been closed (it hands
   back a non-null resume position), contentProcessor is reinstalled as
   the current processor and immediately run on the remaining input.
   Otherwise the status from doCdataSection is returned unchanged. */
static
enum XML_Error cdataSectionProcessor(XML_Parser parser,
                                     const char *start,
                                     const char *end,
                                     const char **endPtr)
{
  /* doCdataSection overwrites this: null while the section is still
     open, else the position just past the closing token. */
  const char *resume = start;
  enum XML_Error status = doCdataSection(parser, &resume, end, endPtr);

  if (!resume)
    return status;

  processor = contentProcessor;
  return contentProcessor(parser, resume, end, endPtr);
}
/* Tokenizes the inside of a CDATA section with XmlCdataSectionTok and
   reports its character data to characterDataHandler (when one is set).

   startPtr  in:  where scanning begins.
             out: non-null (the position just past the closing token) if
                  the section is closed; null if it is not yet closed.
   end       end of the available input.
   nextPtr   if non-null and the input runs out before the section is
             closed, *nextPtr receives the position of the first token
             that was not consumed and XML_ERROR_NONE is returned.
             If null, running out of input is reported as
             XML_ERROR_UNCLOSED_CDATA_SECTION with errorPtr set to that
             position.

   An invalid token yields XML_ERROR_INVALID_TOKEN with errorPtr set to
   the position the tokenizer reported. */
static
enum XML_Error doCdataSection(XML_Parser parser,
                              const char **startPtr,
                              const char *end,
                              const char **nextPtr)
{
  const char *cur = *startPtr;
  const char *tokEnd;

  /* Stay "not closed" until the terminator is actually seen. */
  *startPtr = 0;

  for (;; cur = tokEnd) {
    int tok = XmlCdataSectionTok(encoding, cur, end, &tokEnd);

    if (tok == XML_TOK_CDATA_SECT_CLOSE) {
      *startPtr = tokEnd;
      return XML_ERROR_NONE;
    }
    if (tok == XML_TOK_INVALID) {
      errorPtr = tokEnd;
      return XML_ERROR_INVALID_TOKEN;
    }
    if (tok == XML_TOK_PARTIAL || tok == XML_TOK_NONE) {
      /* Ran out of input before the section was closed. */
      if (!nextPtr) {
        errorPtr = cur;
        return XML_ERROR_UNCLOSED_CDATA_SECTION;
      }
      *nextPtr = cur;
      return XML_ERROR_NONE;
    }
    /* Only data tokens are expected past this point; any other token
       code is treated as an internal error. */
    if (tok != XML_TOK_DATA_NEWLINE && tok != XML_TOK_DATA_CHARS)
      abort();

    if (!characterDataHandler)
      continue;

    if (tok == XML_TOK_DATA_NEWLINE) {
      /* A newline token is always reported as a single '\n'. */
      char lf = '\n';
      characterDataHandler(userData, &lf, 1);
      continue;
    }

    /* XML_TOK_DATA_CHARS: convert into dataBuf and deliver one bufferful
       at a time; XmlConvert advances cur until it reaches tokEnd. */
    do {
      char *out = dataBuf;
      XmlConvert(encoding, XML_UTF8_ENCODING, &cur, tokEnd, &out, dataBufEnd);
      characterDataHandler(userData, dataBuf, out - dataBuf);
    } while (cur != tokEnd);
  }
  /* not reached */
}
static enum XML_Error static enum XML_Error
prologProcessor(XML_Parser parser, prologProcessor(XML_Parser parser,
const char *s, const char *s,

View File

@ -110,7 +110,8 @@ enum XML_Error {
XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF, XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
XML_ERROR_MISPLACED_XML_PI, XML_ERROR_MISPLACED_XML_PI,
XML_ERROR_UNKNOWN_ENCODING, XML_ERROR_UNKNOWN_ENCODING,
XML_ERROR_INCORRECT_ENCODING XML_ERROR_INCORRECT_ENCODING,
XML_ERROR_UNCLOSED_CDATA_SECTION
}; };
int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser); int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);

View File

@ -23,7 +23,7 @@ Contributor(s):
#include "nametab.h" #include "nametab.h"
#define VTABLE1 \ #define VTABLE1 \
{ PREFIX(prologTok), PREFIX(contentTok) }, \ { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
PREFIX(sameName), \ PREFIX(sameName), \
PREFIX(nameMatchesAscii), \ PREFIX(nameMatchesAscii), \

View File

@ -41,7 +41,7 @@ extern "C" {
#define XML_TOK_INVALID 0 #define XML_TOK_INVALID 0
/* The following tokens are returned by XmlContentTok; some are also /* The following tokens are returned by XmlContentTok; some are also
returned by XmlAttributeValueTok and XmlEntityTok */ returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok */
#define XML_TOK_START_TAG_WITH_ATTS 1 #define XML_TOK_START_TAG_WITH_ATTS 1
#define XML_TOK_START_TAG_NO_ATTS 2 #define XML_TOK_START_TAG_NO_ATTS 2
@ -50,7 +50,7 @@ extern "C" {
#define XML_TOK_END_TAG 5 #define XML_TOK_END_TAG 5
#define XML_TOK_DATA_CHARS 6 #define XML_TOK_DATA_CHARS 6
#define XML_TOK_DATA_NEWLINE 7 #define XML_TOK_DATA_NEWLINE 7
#define XML_TOK_CDATA_SECTION 8 #define XML_TOK_CDATA_SECT_OPEN 8
#define XML_TOK_ENTITY_REF 9 #define XML_TOK_ENTITY_REF 9
#define XML_TOK_CHAR_REF 10 /* numeric character reference */ #define XML_TOK_CHAR_REF 10 /* numeric character reference */
@ -88,12 +88,16 @@ extern "C" {
#define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */ #define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
#define XML_TOK_COMMA 38 #define XML_TOK_COMMA 38
/* The following tokens is returned only by XmlAttributeValueTok */ /* The following token is returned only by XmlAttributeValueTok */
#define XML_TOK_ATTRIBUTE_VALUE_S 39 #define XML_TOK_ATTRIBUTE_VALUE_S 39
#define XML_N_STATES 2 /* The following token is returned only by XmlCdataSectionTok */
#define XML_TOK_CDATA_SECT_CLOSE 40
#define XML_N_STATES 3
#define XML_PROLOG_STATE 0 #define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1 #define XML_CONTENT_STATE 1
#define XML_CDATA_SECTION_STATE 2
#define XML_N_LITERAL_TYPES 2 #define XML_N_LITERAL_TYPES 2
#define XML_ATTRIBUTE_VALUE_LITERAL 0 #define XML_ATTRIBUTE_VALUE_LITERAL 0
@ -189,6 +193,9 @@ literals, comments and processing instructions.
#define XmlContentTok(enc, ptr, end, nextTokPtr) \ #define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr) XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
/* This is used for performing a 2nd-level tokenization on /* This is used for performing a 2nd-level tokenization on
the content of a literal that has already been returned by XmlTok. */ the content of a literal that has already been returned by XmlTok. */

View File

@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_PARTIAL; return XML_TOK_PARTIAL;
} }
/* ptr points to character following "<![" */
static static
int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end, int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr) const char **nextTokPtr)
{ {
int i; int i;
/* CDATA[]]> */ /* CDATA[ */
if (end - ptr < 9 * MINBPC) if (end - ptr < 6 * MINBPC)
return XML_TOK_PARTIAL; return XML_TOK_PARTIAL;
for (i = 0; i < 6; i++, ptr += MINBPC) { for (i = 0; i < 6; i++, ptr += MINBPC) {
if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) { if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e
return XML_TOK_INVALID; return XML_TOK_INVALID;
} }
} }
end -= 2 * MINBPC; *nextTokPtr = ptr;
while (ptr != end) { return XML_TOK_CDATA_SECT_OPEN;
switch (BYTE_TYPE(enc, ptr)) { }
INVALID_CASES(ptr, nextTokPtr)
case BT_RSQB: static
if (CHAR_MATCHES(enc, ptr + MINBPC, ']') int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
&& CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) { const char **nextTokPtr)
*nextTokPtr = ptr + 3 * MINBPC; {
return XML_TOK_CDATA_SECTION; if (ptr == end)
} return XML_TOK_NONE;
/* fall through */ #if MINBPC > 1
default: {
ptr += MINBPC; size_t n = end - ptr;
if (n & (MINBPC - 1)) {
n &= ~(MINBPC - 1);
if (n == 0)
return XML_TOK_PARTIAL;
end = ptr + n;
} }
} }
return XML_TOK_PARTIAL; #endif
switch (BYTE_TYPE(enc, ptr)) {
case BT_RSQB:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
if (!CHAR_MATCHES(enc, ptr, ']'))
break;
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
if (!CHAR_MATCHES(enc, ptr, '>')) {
ptr -= MINBPC;
break;
}
*nextTokPtr = ptr + MINBPC;
return XML_TOK_CDATA_SECT_CLOSE;
case BT_CR:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
if (BYTE_TYPE(enc, ptr) == BT_LF)
ptr += MINBPC;
*nextTokPtr = ptr;
return XML_TOK_DATA_NEWLINE;
case BT_LF:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_DATA_NEWLINE;
INVALID_CASES(ptr, nextTokPtr)
default:
ptr += MINBPC;
break;
}
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
case BT_LEAD ## n: \
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_DATA_CHARS; \
} \
ptr += n; \
break;
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
case BT_NONXML:
case BT_MALFORM:
case BT_TRAIL:
case BT_CR:
case BT_LF:
case BT_RSQB:
*nextTokPtr = ptr;
return XML_TOK_DATA_CHARS;
default:
ptr += MINBPC;
break;
}
}
*nextTokPtr = ptr;
return XML_TOK_DATA_CHARS;
} }
/* ptr points to character following "</" */ /* ptr points to character following "</" */