Don't handle CDATA sections as a single token

This commit is contained in:
James Clark 1998-05-02 05:31:15 +00:00
parent 7e71c8f5d5
commit fa8b616b9c
5 changed files with 176 additions and 33 deletions

View File

@ -103,12 +103,15 @@ typedef enum XML_Error Processor(XML_Parser parser,
static Processor prologProcessor;
static Processor contentProcessor;
static Processor cdataSectionProcessor;
static Processor epilogProcessor;
static Processor errorProcessor;
static enum XML_Error
doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc,
const char *start, const char *end, const char **endPtr);
static enum XML_Error
doCdataSection(XML_Parser parser, const char **startPtr, const char *end, const char **nextPtr);
static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s);
static int
defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue);
@ -473,7 +476,8 @@ const char *XML_ErrorString(int code)
"reference to external entity in attribute",
"xml processing instruction not at start of external entity",
"unknown encoding",
"encoding specified in XML declaration is incorrect"
"encoding specified in XML declaration is incorrect",
"unclosed CDATA section",
};
if (code > 0 && code < sizeof(message)/sizeof(message[0]))
return message[code];
@ -744,15 +748,13 @@ doContent(XML_Parser parser,
characterDataHandler(userData, &c, 1);
}
break;
case XML_TOK_CDATA_SECTION:
if (characterDataHandler) {
const char *lim = next - enc->minBytesPerChar * 3;
s += enc->minBytesPerChar * 9;
do {
char *dataPtr = dataBuf;
XmlConvert(enc, XML_UTF8_ENCODING, &s, lim, &dataPtr, dataBufEnd);
characterDataHandler(userData, dataBuf, dataPtr - dataBuf);
} while (s != lim);
case XML_TOK_CDATA_SECT_OPEN:
{
enum XML_Error result = doCdataSection(parser, &next, end, nextPtr);
if (!next) {
processor = cdataSectionProcessor;
return result;
}
}
break;
case XML_TOK_TRAILING_RSQB:
@ -883,6 +885,76 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc,
return XML_ERROR_NONE;
}
/* Processor used while the parser is inside a CDATA section that has not
   been closed yet.  Handling this in its own processor, rather than by
   nesting, is intended to avoid using stack for each CDATA section when
   the whole file is parsed with one call. */
static
enum XML_Error cdataSectionProcessor(XML_Parser parser,
                                     const char *start,
                                     const char *end,
                                     const char **endPtr)
{
  const char *pos = start;
  enum XML_Error status = doCdataSection(parser, &pos, end, endPtr);

  /* doCdataSection leaves the position null unless it saw the end of the
     section; in that case just pass its status back to the caller. */
  if (!pos)
    return status;

  /* The section is closed: go back to ordinary content parsing and carry
     on from the position doCdataSection reported. */
  processor = contentProcessor;
  return contentProcessor(parser, pos, end, endPtr);
}
/* Tokenize the contents of a CDATA section, starting at *startPtr and
   stopping at end, passing character data to characterDataHandler when
   one is set.

   *startPtr is set to non-null (the end of the closing token) if the
   section is closed, and to null if the section is not yet closed.

   If the input runs out before the section is closed and nextPtr is
   non-null, *nextPtr is set to the start of the unconsumed token and
   XML_ERROR_NONE is returned.  If nextPtr is null, errorPtr is set to
   that position and XML_ERROR_UNCLOSED_CDATA_SECTION is returned. */
static
enum XML_Error doCdataSection(XML_Parser parser,
                              const char **startPtr,
                              const char *end,
                              const char **nextPtr)
{
  const char *cur = *startPtr;
  const char *tokEnd;

  /* Report "not closed" unless the closing token is actually seen. */
  *startPtr = 0;

  /* Each pass handles one token; falling out of the switch advances to
     the end of that token. */
  for (;; cur = tokEnd) {
    switch (XmlCdataSectionTok(encoding, cur, end, &tokEnd)) {
    case XML_TOK_NONE:
    case XML_TOK_PARTIAL:
      if (!nextPtr) {
        errorPtr = cur;
        return XML_ERROR_UNCLOSED_CDATA_SECTION;
      }
      *nextPtr = cur;
      return XML_ERROR_NONE;
    case XML_TOK_INVALID:
      errorPtr = tokEnd;
      return XML_ERROR_INVALID_TOKEN;
    case XML_TOK_CDATA_SECT_CLOSE:
      *startPtr = tokEnd;
      return XML_ERROR_NONE;
    case XML_TOK_DATA_NEWLINE:
      /* Every newline token is reported as a single '\n'. */
      if (characterDataHandler) {
        char newline = '\n';
        characterDataHandler(userData, &newline, 1);
      }
      break;
    case XML_TOK_DATA_CHARS:
      if (!characterDataHandler)
        break;
      /* Convert through dataBuf and report one buffer-load at a time
         until the whole token has been consumed. */
      do {
        char *out = dataBuf;
        XmlConvert(encoding, XML_UTF8_ENCODING, &cur, tokEnd, &out, dataBufEnd);
        characterDataHandler(userData, dataBuf, out - dataBuf);
      } while (cur != tokEnd);
      break;
    default:
      abort();
    }
  }
  /* not reached */
}
static enum XML_Error
prologProcessor(XML_Parser parser,
const char *s,

View File

@ -110,7 +110,8 @@ enum XML_Error {
XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
XML_ERROR_MISPLACED_XML_PI,
XML_ERROR_UNKNOWN_ENCODING,
XML_ERROR_INCORRECT_ENCODING
XML_ERROR_INCORRECT_ENCODING,
XML_ERROR_UNCLOSED_CDATA_SECTION
};
int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);

View File

@ -23,7 +23,7 @@ Contributor(s):
#include "nametab.h"
#define VTABLE1 \
{ PREFIX(prologTok), PREFIX(contentTok) }, \
{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
PREFIX(sameName), \
PREFIX(nameMatchesAscii), \

View File

@ -41,7 +41,7 @@ extern "C" {
#define XML_TOK_INVALID 0
/* The following tokens are returned by XmlContentTok; some are also
returned by XmlAttributeValueTok and XmlEntityTok */
returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok */
#define XML_TOK_START_TAG_WITH_ATTS 1
#define XML_TOK_START_TAG_NO_ATTS 2
@ -50,7 +50,7 @@ extern "C" {
#define XML_TOK_END_TAG 5
#define XML_TOK_DATA_CHARS 6
#define XML_TOK_DATA_NEWLINE 7
#define XML_TOK_CDATA_SECTION 8
#define XML_TOK_CDATA_SECT_OPEN 8
#define XML_TOK_ENTITY_REF 9
#define XML_TOK_CHAR_REF 10 /* numeric character reference */
@ -88,12 +88,16 @@ extern "C" {
#define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
#define XML_TOK_COMMA 38
/* The following tokens is returned only by XmlAttributeValueTok */
/* The following token is returned only by XmlAttributeValueTok */
#define XML_TOK_ATTRIBUTE_VALUE_S 39
#define XML_N_STATES 2
/* The following token is returned only by XmlCdataSectionTok */
#define XML_TOK_CDATA_SECT_CLOSE 40
#define XML_N_STATES 3
#define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1
#define XML_CDATA_SECTION_STATE 2
#define XML_N_LITERAL_TYPES 2
#define XML_ATTRIBUTE_VALUE_LITERAL 0
@ -189,6 +193,9 @@ literals, comments and processing instructions.
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
/* This is used for performing a 2nd-level tokenization on
the content of a literal that has already been returned by XmlTok. */

View File

@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_PARTIAL;
}
/* ptr points to character following "<![" */
static
int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
int i;
/* CDATA[]]> */
if (end - ptr < 9 * MINBPC)
/* CDATA[ */
if (end - ptr < 6 * MINBPC)
return XML_TOK_PARTIAL;
for (i = 0; i < 6; i++, ptr += MINBPC) {
if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e
return XML_TOK_INVALID;
}
}
end -= 2 * MINBPC;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
INVALID_CASES(ptr, nextTokPtr)
case BT_RSQB:
if (CHAR_MATCHES(enc, ptr + MINBPC, ']')
&& CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) {
*nextTokPtr = ptr + 3 * MINBPC;
return XML_TOK_CDATA_SECTION;
*nextTokPtr = ptr;
return XML_TOK_CDATA_SECT_OPEN;
}
/* fall through */
/* Tokenizer for the inside of a CDATA section (after the opening
   delimiter).  Scans one token starting at ptr, not reading past end.

   Returns:
     XML_TOK_NONE             ptr == end, nothing to scan
     XML_TOK_CDATA_SECT_CLOSE "]]>" found; *nextTokPtr is just past it
     XML_TOK_DATA_NEWLINE     CR, LF or CR LF; *nextTokPtr is just past it
     XML_TOK_DATA_CHARS       a run of data; *nextTokPtr is where the run
                              stops (before a CR, LF, ']' or a character
                              that cannot be accepted, or at end)
     XML_TOK_PARTIAL          more input is needed to decide (a trailing
                              ']', "]]" or CR, or less than one code unit)
   INVALID_CASES is defined outside this block; presumably it reports
   malformed leading characters -- confirm against its definition. */
static
int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                            const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
#if MINBPC > 1
  {
    /* Ignore a trailing fragment smaller than one code unit by rounding
       the length down (the mask assumes MINBPC is a power of two). */
    size_t n = end - ptr;
    if (n & (MINBPC - 1)) {
      n &= ~(MINBPC - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
#endif
  /* The first character decides whether this is a close delimiter, a
     newline, or the start of a run of ordinary data. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC;
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ']'))
      break;
    ptr += MINBPC;
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, '>')) {
      /* "]]" not followed by '>': step back so the second ']' is
         re-examined as the possible start of a close delimiter. */
      ptr -= MINBPC;
      break;
    }
    *nextTokPtr = ptr + MINBPC;
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC;
    /* Need one more character to know whether this is CR or CR LF. */
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC;
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC;
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC;
    break;
  }
  /* Extend the data run until something that must start a new token. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC;
      break;
    }
  }
  /* Ran out of input in the middle of a data run: report what was
     scanned as data instead of asking for more input, so the caller can
     deliver it now.  (A stray "return XML_TOK_PARTIAL;" ahead of these
     two statements used to make them unreachable.) */
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
/* ptr points to character following "</" */