From fa8b616b9c9b0a5a843c5c32e90501b083d9c128 Mon Sep 17 00:00:00 2001 From: James Clark Date: Sat, 2 May 1998 05:31:15 +0000 Subject: [PATCH] Don't handle CDATA sections as a single token --- expat/xmlparse/xmlparse.c | 92 ++++++++++++++++++++++++++++++++---- expat/xmlparse/xmlparse.h | 3 +- expat/xmltok/xmltok.c | 2 +- expat/xmltok/xmltok.h | 15 ++++-- expat/xmltok/xmltok_impl.c | 97 +++++++++++++++++++++++++++++++------- 5 files changed, 176 insertions(+), 33 deletions(-) diff --git a/expat/xmlparse/xmlparse.c b/expat/xmlparse/xmlparse.c index 3aa13d0e..74b0b688 100755 --- a/expat/xmlparse/xmlparse.c +++ b/expat/xmlparse/xmlparse.c @@ -103,12 +103,15 @@ typedef enum XML_Error Processor(XML_Parser parser, static Processor prologProcessor; static Processor contentProcessor; +static Processor cdataSectionProcessor; static Processor epilogProcessor; static Processor errorProcessor; static enum XML_Error doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc, const char *start, const char *end, const char **endPtr); +static enum XML_Error +doCdataSection(XML_Parser parser, const char **startPtr, const char *end, const char **nextPtr); static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s); static int defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue); @@ -473,7 +476,8 @@ const char *XML_ErrorString(int code) "reference to external entity in attribute", "xml processing instruction not at start of external entity", "unknown encoding", - "encoding specified in XML declaration is incorrect" + "encoding specified in XML declaration is incorrect", + "unclosed CDATA section", }; if (code > 0 && code < sizeof(message)/sizeof(message[0])) return message[code]; @@ -744,15 +748,13 @@ doContent(XML_Parser parser, characterDataHandler(userData, &c, 1); } break; - case XML_TOK_CDATA_SECTION: - if (characterDataHandler) { - const char *lim = next - enc->minBytesPerChar * 3; - s += 
enc->minBytesPerChar * 9; - do { - char *dataPtr = dataBuf; - XmlConvert(enc, XML_UTF8_ENCODING, &s, lim, &dataPtr, dataBufEnd); - characterDataHandler(userData, dataBuf, dataPtr - dataBuf); - } while (s != lim); + case XML_TOK_CDATA_SECT_OPEN: + { + enum XML_Error result = doCdataSection(parser, &next, end, nextPtr); + if (!next) { + processor = cdataSectionProcessor; + return result; + } } break; case XML_TOK_TRAILING_RSQB: @@ -883,6 +885,76 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc, return XML_ERROR_NONE; } +/* The idea here is to avoid using stack for each CDATA section when +the whole file is parsed with one call. */ + +static +enum XML_Error cdataSectionProcessor(XML_Parser parser, + const char *start, + const char *end, + const char **endPtr) +{ + enum XML_Error result = doCdataSection(parser, &start, end, endPtr); + if (start) { + processor = contentProcessor; + return contentProcessor(parser, start, end, endPtr); + } + return result; +} + +/* startPtr gets set to non-null if the section is closed, and to null if +the section is not yet closed. 
*/ + +static +enum XML_Error doCdataSection(XML_Parser parser, + const char **startPtr, + const char *end, + const char **nextPtr) +{ + const char *s = *startPtr; + *startPtr = 0; + for (;;) { + const char *next; + int tok = XmlCdataSectionTok(encoding, s, end, &next); + switch (tok) { + case XML_TOK_CDATA_SECT_CLOSE: + *startPtr = next; + return XML_ERROR_NONE; + case XML_TOK_DATA_NEWLINE: + if (characterDataHandler) { + char c = '\n'; + characterDataHandler(userData, &c, 1); + } + break; + case XML_TOK_DATA_CHARS: + if (characterDataHandler) { + do { + char *dataPtr = dataBuf; + XmlConvert(encoding, XML_UTF8_ENCODING, &s, next, &dataPtr, dataBufEnd); + characterDataHandler(userData, dataBuf, dataPtr - dataBuf); + } while (s != next); + } + break; + case XML_TOK_INVALID: + errorPtr = next; + return XML_ERROR_INVALID_TOKEN; + case XML_TOK_PARTIAL: + case XML_TOK_NONE: + if (nextPtr) { + *nextPtr = s; + return XML_ERROR_NONE; + } + errorPtr = s; + return XML_ERROR_UNCLOSED_CDATA_SECTION; + default: + abort(); + } + s = next; + } + /* not reached */ +} + + static enum XML_Error prologProcessor(XML_Parser parser, const char *s, diff --git a/expat/xmlparse/xmlparse.h b/expat/xmlparse/xmlparse.h index 216ec6d0..a1cea7fe 100755 --- a/expat/xmlparse/xmlparse.h +++ b/expat/xmlparse/xmlparse.h @@ -110,7 +110,8 @@ enum XML_Error { XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF, XML_ERROR_MISPLACED_XML_PI, XML_ERROR_UNKNOWN_ENCODING, - XML_ERROR_INCORRECT_ENCODING + XML_ERROR_INCORRECT_ENCODING, + XML_ERROR_UNCLOSED_CDATA_SECTION }; int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser); diff --git a/expat/xmltok/xmltok.c b/expat/xmltok/xmltok.c index aba5e55c..92089504 100755 --- a/expat/xmltok/xmltok.c +++ b/expat/xmltok/xmltok.c @@ -23,7 +23,7 @@ Contributor(s): #include "nametab.h" #define VTABLE1 \ - { PREFIX(prologTok), PREFIX(contentTok) }, \ + { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \ { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 
PREFIX(sameName), \ PREFIX(nameMatchesAscii), \ diff --git a/expat/xmltok/xmltok.h b/expat/xmltok/xmltok.h index 25a17396..61cd8286 100755 --- a/expat/xmltok/xmltok.h +++ b/expat/xmltok/xmltok.h @@ -41,7 +41,7 @@ extern "C" { #define XML_TOK_INVALID 0 /* The following tokens are returned by XmlContentTok; some are also - returned by XmlAttributeValueTok and XmlEntityTok */ + returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok */ #define XML_TOK_START_TAG_WITH_ATTS 1 #define XML_TOK_START_TAG_NO_ATTS 2 @@ -50,7 +50,7 @@ extern "C" { #define XML_TOK_END_TAG 5 #define XML_TOK_DATA_CHARS 6 #define XML_TOK_DATA_NEWLINE 7 -#define XML_TOK_CDATA_SECTION 8 +#define XML_TOK_CDATA_SECT_OPEN 8 #define XML_TOK_ENTITY_REF 9 #define XML_TOK_CHAR_REF 10 /* numeric character reference */ @@ -88,12 +88,16 @@ extern "C" { #define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */ #define XML_TOK_COMMA 38 - /* The following tokens is returned only by XmlAttributeValueTok */ +/* The following token is returned only by XmlAttributeValueTok */ #define XML_TOK_ATTRIBUTE_VALUE_S 39 -#define XML_N_STATES 2 +/* The following token is returned only by XmlCdataSectionTok */ +#define XML_TOK_CDATA_SECT_CLOSE 40 + +#define XML_N_STATES 3 #define XML_PROLOG_STATE 0 #define XML_CONTENT_STATE 1 +#define XML_CDATA_SECTION_STATE 2 #define XML_N_LITERAL_TYPES 2 #define XML_ATTRIBUTE_VALUE_LITERAL 0 @@ -189,6 +193,9 @@ literals, comments and processing instructions. #define XmlContentTok(enc, ptr, end, nextTokPtr) \ XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr) +#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \ + XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr) + /* This is used for performing a 2nd-level tokenization on the content of a literal that has already been returned by XmlTok. 
*/ diff --git a/expat/xmltok/xmltok_impl.c b/expat/xmltok/xmltok_impl.c index 2207a3df..db6cc1ab 100755 --- a/expat/xmltok/xmltok_impl.c +++ b/expat/xmltok/xmltok_impl.c @@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_PARTIAL; } -/* ptr points to character following " */ - if (end - ptr < 9 * MINBPC) + /* CDATA[ */ + if (end - ptr < 6 * MINBPC) return XML_TOK_PARTIAL; for (i = 0; i < 6; i++, ptr += MINBPC) { if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) { @@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e return XML_TOK_INVALID; } } - end -= 2 * MINBPC; - while (ptr != end) { - switch (BYTE_TYPE(enc, ptr)) { - INVALID_CASES(ptr, nextTokPtr) - case BT_RSQB: - if (CHAR_MATCHES(enc, ptr + MINBPC, ']') - && CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) { - *nextTokPtr = ptr + 3 * MINBPC; - return XML_TOK_CDATA_SECTION; - } - /* fall through */ - default: - ptr += MINBPC; + *nextTokPtr = ptr; + return XML_TOK_CDATA_SECT_OPEN; +} + +static +int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end, + const char **nextTokPtr) +{ + if (ptr == end) + return XML_TOK_NONE; +#if MINBPC > 1 + { + size_t n = end - ptr; + if (n & (MINBPC - 1)) { + n &= ~(MINBPC - 1); + if (n == 0) + return XML_TOK_PARTIAL; + end = ptr + n; } } - return XML_TOK_PARTIAL; +#endif + switch (BYTE_TYPE(enc, ptr)) { + case BT_RSQB: + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; + if (!CHAR_MATCHES(enc, ptr, ']')) + break; + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; + if (!CHAR_MATCHES(enc, ptr, '>')) { + ptr -= MINBPC; + break; + } + *nextTokPtr = ptr + MINBPC; + return XML_TOK_CDATA_SECT_CLOSE; + case BT_CR: + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; + if (BYTE_TYPE(enc, ptr) == BT_LF) + ptr += MINBPC; + *nextTokPtr = ptr; + return XML_TOK_DATA_NEWLINE; + case BT_LF: + *nextTokPtr = ptr + MINBPC; + return 
XML_TOK_DATA_NEWLINE; + INVALID_CASES(ptr, nextTokPtr) + default: + ptr += MINBPC; + break; + } + while (ptr != end) { + switch (BYTE_TYPE(enc, ptr)) { +#define LEAD_CASE(n) \ + case BT_LEAD ## n: \ + if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ + *nextTokPtr = ptr; \ + return XML_TOK_DATA_CHARS; \ + } \ + ptr += n; \ + break; + LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) +#undef LEAD_CASE + case BT_NONXML: + case BT_MALFORM: + case BT_TRAIL: + case BT_CR: + case BT_LF: + case BT_RSQB: + *nextTokPtr = ptr; + return XML_TOK_DATA_CHARS; + default: + ptr += MINBPC; + break; + } + } + *nextTokPtr = ptr; + return XML_TOK_DATA_CHARS; } /* ptr points to character following "