#include "xmltok.h"
#include "nametab.h"

/*
 * Leading initializers shared by every encoding object in this file.  Each
 * entry names a function through PREFIX(), so the macro expands to a
 * different set of functions depending on how PREFIX is defined at the point
 * of use.  (The functions themselves are presumably produced by including
 * xmltok_impl.c under that PREFIX -- that file is not visible here.)
 */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* Trailing initializers: the per-encoding encode and toUtf8 functions. */
#define VTABLE2 \
  PREFIX(encode), \
  { PREFIX(toUtf8) }

#define VTABLE VTABLE1, VTABLE2

/*
 * Test the naming bitmap for the 16-bit character whose high byte is hi and
 * low byte is lo.  The mask uses 1u so that selecting bit 31 does not shift
 * into the sign bit of a signed int.
 */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
		       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n; lengths other than 2 or 3 yield 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))

/* An ENCODING followed by a byte-type table indexed by unsigned byte value. */
struct normal_encoding {
  ENCODING enc;
  unsigned char type[256];
};

static int checkCharRefNumber(int);

#include "xmltok_impl.h"

/* minimum bytes per character */
#define MINBPC 1
#define BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
#define BYTE_TO_ASCII(enc, p) (*p)
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)

/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)

#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NMSTRT_CHAR

enum {
  /* cvalN is value of masked first byte of N byte sequence */
  cval1 = 0x00,
  cval2 = 0xc0,
  cval3 = 0xe0,
  cval4 = 0xf0,
  /* minN is minimum legal resulting value for N byte sequence */
  min2 = 0x80,
  min3 = 0x800,
  min4 = 0x10000
};

/*
 * Encode the code point c as UTF-8 into buf.
 * Returns the number of bytes written (1 to 4), or 0 when c is negative or
 * not below 0x110000.  buf must have room for the bytes written.
 */
static
int utf8_encode(const ENCODING *enc, int c, char *buf)
{
  if (c < 0)
    return 0;
  if (c < min2) {
    buf[0] = (char)(c | cval1);
    return 1;
  }
  if (c < min3) {
    buf[0] = (char)((c >> 6) | cval2);
    buf[1] = (char)((c & 0x3f) | 0x80);
    return 2;
  }
  if (c < min4) {
    buf[0] = (char)((c >> 12) | cval3);
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
    buf[2] = (char)((c & 0x3f) | 0x80);
    return 3;
  }
  if (c < 0x110000) {
    buf[0] = (char)((c >> 18) | cval4);
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
    buf[3] = (char)((c & 0x3f) | 0x80);
    /* Four bytes were written; this previously reported 3. */
    return 4;
  }
  return 0;
}

/*
 * Copy UTF-8 bytes from [*fromP, fromLim) to [*toP, toLim), advancing both
 * *fromP and *toP past what was copied.  When the output is smaller than the
 * input, the copy is shortened and trailing continuation bytes (10xxxxxx)
 * at the cut point are dropped.
 *
 * NOTE(review): the back-off loop stops as soon as the byte before the cut
 * is not a continuation byte, so a lead byte sitting right at the cut is
 * still copied without its continuation bytes -- confirm whether callers
 * rely on this.
 */
static
void utf8_toUtf8(const ENCODING *enc,
		 const char **fromP, const char *fromLim,
		 char **toP, const char *toLim)
{
  char *to;
  const char *from;
  if (fromLim - *fromP > toLim - *toP) {
    /* Avoid copying partial characters. */
    for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
      if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
	break;
  }
  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
    *to = *from;
  *fromP = from;
  *toP = to;
}

static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  }
};

static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  }
};

/*
 * Encode the code point c as a single Latin-1 byte.
 * Returns 1 on success, 0 when c is negative or above 0xFF.
 */
static
int latin1_encode(const ENCODING *enc, int c, char *buf)
{
  if (c < 0)
    return 0;
  if (c <= 0xFF) {
    buf[0] = (char)c;
    return 1;
  }
  return 0;
}

/*
 * Convert Latin-1 bytes from [*fromP, fromLim) to UTF-8 in [*toP, toLim).
 * Bytes with the high bit set become two-byte sequences.  Stops when the
 * input is exhausted or the next character does not fit; *fromP and *toP
 * are left just past the last character converted.
 */
static
void latin1_toUtf8(const ENCODING *enc,
		   const char **fromP, const char *fromLim,
		   char **toP, const char *toLim)
{
  for (;;) {
    unsigned char c;
    if (*fromP == fromLim)
      break;
    c = (unsigned char)**fromP;
    if (c & 0x80) {
      if (toLim - *toP < 2)
	break;
      *(*toP)++ = (char)((c >> 6) | cval2);
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
      /* Consume the input byte; without this the same byte was converted
         over and over until the output filled up. */
      (*fromP)++;
    }
    else {
      if (*toP == toLim)
	break;
      *(*toP)++ = *(*fromP)++;
    }
  }
}

static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_encode, { latin1_toUtf8 }, 1 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  }
};

#define latin1tab (latin1_encoding.type)

#undef PREFIX

/*
 * Classify a 16-bit code unit whose high byte is non-zero:
 * 0xD8-0xDB -> BT_LEAD4, 0xDC-0xDF -> BT_TRAIL, 0xFFFE/0xFFFF -> BT_NONXML,
 * anything else -> BT_NONASCII.
 */
static int unicode_byte_type(char hi, char lo)
{
  switch ((unsigned char)hi) {
  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
    return BT_LEAD4;
  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
    return BT_TRAIL;
  case 0xFF:
    switch ((unsigned char)lo) {
    case 0xFF:
    case 0xFE:
      return BT_NONXML;
    }
    break;
  }
  return BT_NONASCII;
}

/*
 * Defines PREFIX(encode): store charNum as UTF-16 using SET2 for byte order.
 * Returns 2 for a single code unit, 4 for a surrogate pair, and 0 when
 * charNum is negative or not below 0x110000.
 */
#define DEFINE_UTF16_ENCODE \
static \
int PREFIX(encode)(const ENCODING *enc, int charNum, char *buf) \
{ \
  if (charNum < 0) \
    return 0; \
  if (charNum < 0x10000) { \
    SET2(buf, charNum); \
    return 2; \
  } \
  if (charNum < 0x110000) { \
    charNum -= 0x10000; \
    SET2(buf, (charNum >> 10) + 0xD800); \
    SET2(buf + 2, (charNum & 0x3FF) + 0xDC00); \
    return 4; \
  } \
  return 0; \
}

/*
 * Defines PREFIX(toUtf8): convert UTF-16 code units from [*fromP, fromLim)
 * to UTF-8 in [*toP, toLim), using GET_LO/GET_HI for byte order.
 * Conversion stops when the input is exhausted, when the next character
 * does not fit in the output, or when a high surrogate has no following
 * code unit inside the input.  On every exit *fromP is set to the first
 * unconverted code unit and *toP points past the last byte written.
 *
 * Fixes relative to the previous definition:
 *  - *fromP is now updated (it was never written, so callers comparing it
 *    with the input end always saw unconsumed input);
 *  - code units 0x0080-0x00FF are emitted as two bytes instead of one raw
 *    byte;
 *  - surrogate pairs add the 0x10000 offset (carried in `plane`) when
 *    building the first two output bytes;
 *  - a high surrogate at the very end of the input no longer reads past
 *    fromLim.
 */
#define DEFINE_UTF16_TO_UTF8 \
static \
void PREFIX(toUtf8)(const ENCODING *enc, \
		    const char **fromP, const char *fromLim, \
		    char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = (char)lo; \
        break; \
      } \
      /* 0x80-0xFF need two bytes: fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = (char)((lo >> 6) | (hi << 2) | cval2); \
      *(*toP)++ = (char)((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4 || fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      /* Top four bits of the 20-bit offset, plus one for the 0x10000 */ \
      /* that UTF-16 subtracts before splitting into surrogates. */ \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = (char)((plane >> 2) | cval4); \
      *(*toP)++ = (char)(((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (char)(((lo & 0x3) << 4) \
	                 | ((GET_HI(from) & 0x3) << 2) \
		         | (lo2 >> 6) \
		         | 0x80); \
      *(*toP)++ = (char)((lo2 & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = (char)((hi >> 4) | cval3); \
      *(*toP)++ = (char)(((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = (char)((lo & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}

/* Little-endian UTF-16: low byte first. */
#define PREFIX(ident) little2_ ## ident
#define MINBPC 2
#define BYTE_TYPE(enc, p) \
 ((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define IS_NAME_CHAR(enc, p, n) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define IS_NMSTRT_CHAR(enc, p, n) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#include "xmltok_impl.c"

#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_ENCODE
DEFINE_UTF16_TO_UTF8

#undef SET2
#undef GET_LO
#undef GET_HI
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NMSTRT_CHAR

static const struct encoding little2_encoding = { VTABLE, 2 };

#undef PREFIX

/* Big-endian UTF-16: high byte first. */
#define PREFIX(ident) big2_ ## ident
#define MINBPC 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) \
 ((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define IS_NAME_CHAR(enc, p, n) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define IS_NMSTRT_CHAR(enc, p, n) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#include "xmltok_impl.c"

#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_ENCODE
DEFINE_UTF16_TO_UTF8

#undef SET2
#undef GET_LO
#undef GET_HI
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NMSTRT_CHAR

static const struct encoding big2_encoding = { VTABLE, 2 };

#undef PREFIX

/*
 * Scanner used before the document encoding is known.  Inspects the first
 * one or two bytes:
 *   - no input            -> XML_TOK_NONE
 *   - a single byte that could start a BOM or a UTF-16 '<'
 *                          -> XML_TOK_PARTIAL
 *   - 00 3C / 3C 00       -> switch to big/little-endian UTF-16 and tokenize
 *   - FE FF / FF FE       -> switch to big/little-endian UTF-16, consume the
 *                            two bytes and return XML_TOK_BOM
 *   - anything else       -> switch to UTF-8 and tokenize
 * The chosen encoding is stored through the INIT_ENCODING's encPtr.
 */
static
int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
	     const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = ((const INIT_ENCODING *)enc)->encPtr;
  if (ptr + 1 == end) {
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0x003C:
      *encPtr = &big2_encoding;
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFEFF:
      *nextTokPtr = ptr + 2;
      *encPtr = &big2_encoding;
      return XML_TOK_BOM;
    case 0x3C00:
      *encPtr = &little2_encoding;
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      *nextTokPtr = ptr + 2;
      *encPtr = &little2_encoding;
      return XML_TOK_BOM;
    }
  }
  *encPtr = &utf8_encoding.enc;
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}

/* initScan entry point for the prolog state. */
static
int initScanProlog(const ENCODING *enc, const char *ptr, const char *end,
		   const char **nextTokPtr)
{
  return initScan(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr);
}

/* initScan entry point for the content state. */
static
int initScanContent(const ENCODING *enc, const char *ptr, const char *end,
		    const char **nextTokPtr)
{
  return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
}

/* Position tracking for the initial encoding: delegates to the UTF-8
   implementation. */
static
void initUpdatePosition(const ENCODING *enc, const char *ptr,
			const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}

/* Return the internal encoding object for e, or 0 when e is not
   XML_UTF8_ENCODING. */
const ENCODING *XmlGetInternalEncoding(int e)
{
  switch (e) {
  case XML_UTF8_ENCODING:
    return &internal_utf8_encoding.enc;
  }
  return 0;
}

/*
 * Set up *p as an auto-detecting encoding and point *encPtr at it.  The
 * scanners installed here replace *encPtr with a concrete encoding once the
 * first bytes of input have been seen (see initScan).
 */
void XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr)
{
  p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
  p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
  p->initEnc.updatePosition = initUpdatePosition;
  p->initEnc.minBytesPerChar = 1;
  p->encPtr = encPtr;
  *encPtr = &(p->initEnc);
}

/*
 * Convert the character at ptr to UTF-8 with room for a single output byte.
 * Returns that byte, or -1 when nothing was produced (no input left, or the
 * character needs more than one byte).
 */
static
int toAscii(const ENCODING *enc, const char *ptr, const char *end)
{
  char buf[1];
  char *p = buf;
  XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + 1);
  if (p == buf)
    return -1;
  else
    return buf[0];
}

/* Return 1 when c is a space, carriage return, line feed or tab. */
static
int isSpace(int c)
{
  switch (c) {
  case ' ':
  case '\r':
  case '\n':
  case '\t':
    return 1;
  }
  return 0;
}

/* Return 1 if there's just optional white space
or there's an S followed by name=val.
On success with a name, *namePtr points at the name, *valPtr at the first
character inside the quotes, and *nextTokPtr just past the closing quote.
When only white space (or nothing) remains, *namePtr is set to 0.
On failure returns 0 with *nextTokPtr at the offending position. */
static
int parsePseudoAttribute(const ENCODING *enc,
			 const char *ptr,
			 const char *end,
			 const char **namePtr,
			 const char **valPtr,
			 const char **nextTokPtr)
{
  int c;
  char open;
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  /* A pseudo-attribute must be preceded by white space. */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' (optionally preceded by white space). */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == '=')
      break;
    if (isSpace(c)) {
      do {
	ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != '=') {
	*nextTokPtr = ptr;
	return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* An empty name is an error. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != '"' && c != '\'') {
    *nextTokPtr = ptr;
    return 0;
  }
  open = c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* The value may contain only ASCII letters, digits, '.', '-' and '_'. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (!('a' <= c && c <= 'z')
	&& !('A' <= c && c <= 'Z')
	&& !('0' <= c && c <= '9')
	&& c != '.'
	&& c != '-'
	&& c != '_') {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}

/* Return 1 when the NUL-terminated strings s1 and s2 are equal, else 0. */
static
int streq(const char *s1, const char *s2)
{
  for (; *s1 == *s2; s1++, s2++)
    if (!*s1)
      return 1;
  return 0;
}

/*
 * Map the encoding name in [ptr, end) (expressed in enc) to an encoding
 * object.  The name is converted to UTF-8, upper-cased, and compared with
 * "UTF-8", "ISO-8859-1" and "UTF-16".  For "UTF-16", enc itself is returned
 * when it already uses two bytes per character; otherwise the byte order of
 * the host decides.  Returns 0 for unknown names or names that do not fit
 * the local buffer.
 */
static
const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *end)
{
#define ENCODING_MAX 128
  char buf[ENCODING_MAX];
  char *p = buf;
  int i;
  XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + ENCODING_MAX - 1);
  /* Unconsumed input means the name was too long to convert. */
  if (ptr != end)
    return 0;
  *p = 0;
  for (i = 0; buf[i]; i++) {
    if ('a' <= buf[i] && buf[i] <= 'z')
      buf[i] += 'A' - 'a';
  }
  if (streq(buf, "UTF-8"))
    return &utf8_encoding.enc;
  if (streq(buf, "ISO-8859-1"))
    return &latin1_encoding.enc;
  if (streq(buf, "UTF-16")) {
    static const unsigned short n = 1;
    if (enc->minBytesPerChar == 2)
      return enc;
    /* First byte of the value 1 is non-zero only on a little-endian host. */
    if (*(const char *)&n)
      return &little2_encoding;
    else
      return &big2_encoding;
  }
  return 0;
}

/*
 * Parse the pseudo-attributes of an XML or text declaration located at
 * [ptr, end).  The first five characters and the last two are skipped
 * (presumably the "<?xml" and "?>" delimiters).
 *
 * Accepted order: version, encoding, standalone.  version may be omitted
 * only when isGeneralTextEntity is non-zero; standalone is rejected when
 * isGeneralTextEntity is non-zero.  Each non-null output pointer receives
 * the corresponding value when that pseudo-attribute is present.
 *
 * Returns 1 on success.  On failure returns 0 and sets *badPtr to the
 * offending position.
 */
int XmlParseXmlDecl(int isGeneralTextEntity,
		    const ENCODING *enc,
		    const char *ptr,
		    const char *end,
		    const char **badPtr,
		    const char **versionPtr,
		    const char **encodingName,
		    const ENCODING **encoding,
		    int *standalone)
{
  const char *val = 0;
  const char *name = 0;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, "version")) {
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  if (XmlNameMatchesAscii(enc, name, "encoding")) {
    /* An encoding name must start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so step back one character. */
    if (encoding)
      *encoding = findEncoding(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, "yes")) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, "no")) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}

/*
 * Validate a numeric character reference value.  Returns -1 for the
 * surrogate range 0xD800-0xDFFF, for values below 0x100 that the Latin-1
 * table marks BT_NONXML, and for 0xFFFE/0xFFFF; otherwise returns result
 * unchanged.
 */
static
int checkCharRefNumber(int result)
{
  switch (result >> 8) {
  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
    return -1;
  case 0:
    if (latin1_encoding.type[result] == BT_NONXML)
      return -1;
    break;
  case 0xFF:
    if (result == 0xFFFE || result == 0xFFFF)
      return -1;
    break;
  }
  return result;
}