#include #include #include "wfcheck.h" #include "hashtable.h" #include "xmltok.h" #include "xmlrole.h" typedef struct { const char *name; const char *textPtr; size_t textLen; const char *docTextPtr; const char *systemId; const char *publicId; const char *notation; char open; char wfInContent; char wfInAttribute; char magic; } ENTITY; #define INIT_BLOCK_SIZE 1024 typedef struct block { struct block *next; char s[1]; } BLOCK; typedef struct { BLOCK *blocks; const char *end; char *ptr; char *start; } STRING_POOL; typedef struct { HASH_TABLE generalEntities; STRING_POOL pool; int containsRef; int standalone; char *groupConnector; size_t groupSize; } DTD; typedef struct { DTD dtd; size_t stackSize; const char **startName; int attsSize; ATTRIBUTE *atts; } CONTEXT; static void poolInit(STRING_POOL *); static void poolDestroy(STRING_POOL *); static const char *poolAppend(STRING_POOL *pool, const ENCODING *enc, const char *ptr, const char *end); static const char *poolStoreString(STRING_POOL *pool, const ENCODING *enc, const char *ptr, const char *end); static int poolGrow(STRING_POOL *); static int dtdInit(DTD *); static void dtdDestroy(DTD *); static int contextInit(CONTEXT *); static void contextDestroy(CONTEXT *); #define poolStart(pool) ((pool)->start) #define poolDiscard(pool) ((pool)->ptr = (pool)->start) #define poolFinish(pool) ((pool)->start = (pool)->ptr) static enum WfCheckResult checkProlog(DTD *, const char *s, const char *end, const char **, const ENCODING **enc); static enum WfCheckResult checkContent(size_t level, CONTEXT *context, const ENCODING *enc, const char *s, const char *end, const char **badPtr); static enum WfCheckResult checkGeneralTextEntity(CONTEXT *context, const char *s, const char *end, const char **nextPtr, const ENCODING **enc); static enum WfCheckResult checkAttributeValue(DTD *, const ENCODING *, const char *, const char *, const char **); static enum WfCheckResult checkAttributeUniqueness(CONTEXT *context, const ENCODING *enc, int nAtts, const char **badPtr); static enum WfCheckResult checkParsedEntities(CONTEXT *context, const char **badPtr); static enum WfCheckResult storeEntity(DTD *dtd, const ENCODING *enc, const char *entityNamePtr, const char *entityNameEnd, const char *entityTextPtr, const char *entityTextEnd, const char **badPtr); enum WfCheckResult wfCheck(enum EntityType entityType, const char *s, size_t n, const char **badPtr, unsigned long *badLine, unsigned long *badCol) { CONTEXT context; const ENCODING *enc; const char *start = s; const char *end = s + n; const char *next = 0; enum WfCheckResult result; if (!contextInit(&context)) { contextDestroy(&context); return noMemory; } if (entityType == documentEntity) { result = checkProlog(&context.dtd, s, end, &next, &enc); s = next; if (!result) { result = checkParsedEntities(&context, &next); s = next; if (!result) { result = checkContent(0, &context, enc, s, end, &next); s = next; } } } else { result = checkGeneralTextEntity(&context, s, end, &next, &enc); s = next; } if (result && s) { POSITION pos; memset(&pos, 0, sizeof(POSITION)); XmlUpdatePosition(enc, start, s, &pos); *badPtr = s; *badLine = pos.lineNumber; *badCol = pos.columnNumber; } contextDestroy(&context); return result; } static int contextInit(CONTEXT *p) { p->stackSize = 1024; p->startName = malloc(p->stackSize * sizeof(char *)); p->attsSize = 1024; p->atts = malloc(p->attsSize * sizeof(ATTRIBUTE)); return dtdInit(&(p->dtd)) && p->atts && p->startName; } static void contextDestroy(CONTEXT *p) { dtdDestroy(&(p->dtd)); free((void *)p->startName); free((void *)p->atts); } static enum WfCheckResult checkContent(size_t level, CONTEXT *context, const ENCODING *enc, const char *s, const char *end, const char **badPtr) { size_t startLevel = level; const char *next; int tok = XmlContentTok(enc, s, end, &next); for (;;) { switch (tok) { case XML_TOK_TRAILING_CR: case XML_TOK_NONE: if (startLevel > 0) { if (level != startLevel) { *badPtr = s; return asyncEntity; } return wellFormed; } *badPtr = s; return noElements; case XML_TOK_INVALID: *badPtr = next; return invalidToken; case XML_TOK_PARTIAL: *badPtr = s; return unclosedToken; case XML_TOK_PARTIAL_CHAR: *badPtr = s; return partialChar; case XML_TOK_EMPTY_ELEMENT_NO_ATTS: break; case XML_TOK_ENTITY_REF: { const char *name = poolStoreString(&context->dtd.pool, enc, s + enc->minBytesPerChar, next - enc->minBytesPerChar); ENTITY *entity = (ENTITY *)lookup(&context->dtd.generalEntities, name, 0); poolDiscard(&context->dtd.pool); if (!entity) { if (!context->dtd.containsRef || context->dtd.standalone) { *badPtr = s; return undefinedEntity; } break; } if (entity->wfInContent) break; if (entity->open) { *badPtr = s; return recursiveEntityRef; } if (entity->notation) { *badPtr = s; return binaryEntityRef; } if (entity) { if (entity->textPtr) { enum WfCheckResult result; const ENCODING *internalEnc = XmlGetInternalEncoding(XML_UTF8_ENCODING); entity->open = 1; result = checkContent(level, context, internalEnc, entity->textPtr, entity->textPtr + entity->textLen, badPtr); entity->open = 0; if (result && *badPtr) { *badPtr = s; return result; } entity->wfInContent = 1; } } break; } case XML_TOK_START_TAG_NO_ATTS: if (level == context->stackSize) { context->startName = realloc((void *)context->startName, (context->stackSize *= 2) * sizeof(char *)); if (!context->startName) return noMemory; } context->startName[level++] = s + enc->minBytesPerChar; break; case XML_TOK_START_TAG_WITH_ATTS: if (level == context->stackSize) { context->startName = realloc((void *)context->startName, (context->stackSize *= 2) * sizeof(char *)); if (!context->startName) return noMemory; } context->startName[level++] = s + enc->minBytesPerChar; /* fall through */ case XML_TOK_EMPTY_ELEMENT_WITH_ATTS: { int i; int n = XmlGetAttributes(enc, s, context->attsSize, context->atts); if (n > context->attsSize) { context->attsSize = 2*n; context->atts = realloc((void *)context->atts, context->attsSize * sizeof(ATTRIBUTE)); if (!context->atts) return noMemory; XmlGetAttributes(enc, s, n, context->atts); } for (i = 0; i < n; i++) { if (context->atts[i].containsRef) { enum WfCheckResult result = checkAttributeValue(&context->dtd, enc, context->atts[i].valuePtr, context->atts[i].valueEnd, badPtr); if (result) return result; } } if (i > 1) { enum WfCheckResult result = checkAttributeUniqueness(context, enc, n, badPtr); if (result) return result; } } break; case XML_TOK_END_TAG: if (level == startLevel) { *badPtr = s; return asyncEntity; } --level; if (!XmlSameName(enc, context->startName[level], s + enc->minBytesPerChar * 2)) { *badPtr = s; return tagMismatch; } break; case XML_TOK_CHAR_REF: if (XmlCharRefNumber(enc, s) < 0) { *badPtr = s; return badCharRef; } break; case XML_TOK_XML_DECL: *badPtr = s; return misplacedXmlPi; } s = next; if (level == 0) { do { tok = XmlPrologTok(enc, s, end, &next); switch (tok) { case XML_TOK_TRAILING_CR: case XML_TOK_NONE: return wellFormed; case XML_TOK_PROLOG_S: case XML_TOK_COMMENT: case XML_TOK_PI: s = next; break; default: if (tok > 0) { *badPtr = s; return junkAfterDocElement; } break; } } while (tok > 0); } else tok = XmlContentTok(enc, s, end, &next); } /* not reached */ } static int attcmp(const void *p1, const void *p2) { const ATTRIBUTE *a1 = p1; const ATTRIBUTE *a2 = p2; size_t n1 = a1->valuePtr - a1->name; size_t n2 = a2->valuePtr - a2->name; if (n1 == n2) { int n = memcmp(a1->name, a2->name, n1); if (n) return n; /* Sort identical attribute names by position, so that we always report the first duplicate attribute. */ if (a1->name < a2->name) return -1; else if (a1->name > a2->name) return 1; else return 0; } else if (n1 < n2) return -1; else return 1; } /* Note that this trashes the attribute values. */ static enum WfCheckResult checkAttributeUniqueness(CONTEXT *context, const ENCODING *enc, int nAtts, const char **badPtr) { #define QSORT_MIN_ATTS 10 if (nAtts < QSORT_MIN_ATTS) { int i; for (i = 1; i < nAtts; i++) { int j; for (j = 0; j < i; j++) { if (XmlSameName(enc, context->atts[i].name, context->atts[j].name)) { *badPtr = context->atts[i].name; return duplicateAttribute; } } } } else { int i; const char *dup = 0; /* Store the end of the name in valuePtr */ for (i = 0; i < nAtts; i++) { ATTRIBUTE *a = context->atts + i; a->valuePtr = a->name + XmlNameLength(enc, a->name); } qsort(context->atts, nAtts, sizeof(ATTRIBUTE), attcmp); for (i = 1; i < nAtts; i++) { ATTRIBUTE *a = context->atts + i; if (XmlSameName(enc, a->name, a[-1].name)) { if (!dup || a->name < dup) dup = a->name; } } if (dup) { *badPtr = dup; return duplicateAttribute; } } return wellFormed; } static enum WfCheckResult checkProlog(DTD *dtd, const char *s, const char *end, const char **nextPtr, const ENCODING **enc) { const char *entityNamePtr, *entityNameEnd; PROLOG_STATE state; ENTITY *entity; INIT_ENCODING initEnc; XmlInitEncoding(&initEnc, enc); XmlPrologStateInit(&state); for (;;) { const char *next; int tok = XmlPrologTok(*enc, s, end, &next); switch (XmlTokenRole(&state, tok, s, next, *enc)) { case XML_ROLE_XML_DECL: { const char *encodingName = 0; const ENCODING *encoding = 0; const char *version; int standalone = -1; if (!XmlParseXmlDecl(0, *enc, s, next, nextPtr, &version, &encodingName, &encoding, &standalone)) return syntaxError; if (encoding) { if (encoding->minBytesPerChar != (*enc)->minBytesPerChar) { *nextPtr = encodingName; return incorrectEncoding; } *enc = encoding; } else if (encodingName) { *nextPtr = encodingName; return unknownEncoding; } if (standalone == 1) dtd->standalone = 1; break; } case XML_ROLE_DOCTYPE_SYSTEM_ID: dtd->containsRef = 1; break; case XML_ROLE_DOCTYPE_PUBLIC_ID: case XML_ROLE_ENTITY_PUBLIC_ID: case XML_ROLE_NOTATION_PUBLIC_ID: if (!XmlIsPublicId(*enc, s, next, nextPtr)) return syntaxError; break; case XML_ROLE_INSTANCE_START: *nextPtr = s; return wellFormed; case XML_ROLE_DEFAULT_ATTRIBUTE_VALUE: case XML_ROLE_FIXED_ATTRIBUTE_VALUE: { const char *tem = 0; enum WfCheckResult result = checkAttributeValue(dtd, *enc, s + (*enc)->minBytesPerChar, next - (*enc)->minBytesPerChar, &tem); if (result) { if (tem) *nextPtr = tem; return result; } break; } case XML_ROLE_ENTITY_VALUE: { enum WfCheckResult result = storeEntity(dtd, *enc, entityNamePtr, entityNameEnd, s, next, nextPtr); if (result != wellFormed) return result; } break; case XML_ROLE_ENTITY_SYSTEM_ID: if (entityNamePtr) { const char *name = poolStoreString(&dtd->pool, *enc, entityNamePtr, entityNameEnd); entity = (ENTITY *)lookup(&dtd->generalEntities, name, sizeof(ENTITY)); if (entity->name != name) { poolDiscard(&dtd->pool); entity = 0; } else { poolFinish(&dtd->pool); entity->systemId = poolStoreString(&dtd->pool, *enc, s + (*enc)->minBytesPerChar, next - (*enc)->minBytesPerChar); poolFinish(&dtd->pool); } } break; case XML_ROLE_ENTITY_NOTATION_NAME: if (entity) { entity->notation = poolStoreString(&dtd->pool, *enc, s, next); poolFinish(&dtd->pool); } break; case XML_ROLE_GENERAL_ENTITY_NAME: entityNamePtr = s; entityNameEnd = next; break; case XML_ROLE_PARAM_ENTITY_NAME: entityNamePtr = 0; entityNameEnd = 0; break; case XML_ROLE_ERROR: *nextPtr = s; switch (tok) { case XML_TOK_PARAM_ENTITY_REF: return paramEntityRef; case XML_TOK_INVALID: *nextPtr = next; return invalidToken; case XML_TOK_NONE: return noElements; case XML_TOK_PARTIAL: return unclosedToken; case XML_TOK_PARTIAL_CHAR: return partialChar; case XML_TOK_TRAILING_CR: *nextPtr = s + (*enc)->minBytesPerChar; return noElements; case XML_TOK_XML_DECL: return misplacedXmlPi; default: return syntaxError; } case XML_ROLE_GROUP_OPEN: if (state.level >= dtd->groupSize) { if (dtd->groupSize) dtd->groupConnector = realloc(dtd->groupConnector, dtd->groupSize *= 2); else dtd->groupConnector = malloc(dtd->groupSize = 32); if (!dtd->groupConnector) return noMemory; } dtd->groupConnector[state.level] = 0; break; case XML_ROLE_GROUP_SEQUENCE: if (dtd->groupConnector[state.level] == '|') { *nextPtr = s; return syntaxError; } dtd->groupConnector[state.level] = ','; break; case XML_ROLE_GROUP_CHOICE: if (dtd->groupConnector[state.level] == ',') { *nextPtr = s; return syntaxError; } dtd->groupConnector[state.level] = '|'; break; case XML_ROLE_NONE: if (tok == XML_TOK_PARAM_ENTITY_REF) dtd->containsRef = 1; break; } s = next; } /* not reached */ } static enum WfCheckResult checkParsedEntities(CONTEXT *context, const char **badPtr) { HASH_TABLE_ITER iter; hashTableIterInit(&iter, &context->dtd.generalEntities); for (;;) { ENTITY *entity = (ENTITY *)hashTableIterNext(&iter); if (!entity) break; if (entity->textPtr && !entity->wfInContent && !entity->magic) { enum WfCheckResult result; const ENCODING *internalEnc = XmlGetInternalEncoding(XML_UTF8_ENCODING); entity->open = 1; result = checkContent(1, context, internalEnc, entity->textPtr, entity->textPtr + entity->textLen, badPtr); entity->open = 0; if (result && *badPtr) { *badPtr = entity->docTextPtr; return result; } entity->wfInContent = 1; } } return wellFormed; } static enum WfCheckResult checkGeneralTextEntity(CONTEXT *context, const char *s, const char *end, const char **nextPtr, const ENCODING **enc) { INIT_ENCODING initEnc; const char *next; int tok; XmlInitEncoding(&initEnc, enc); tok = XmlContentTok(*enc, s, end, &next); if (tok == XML_TOK_BOM) { s = next; tok = XmlContentTok(*enc, s, end, &next); } if (tok == XML_TOK_XML_DECL) { const char *encodingName = 0; const ENCODING *encoding = 0; const char *version; if (!XmlParseXmlDecl(1, *enc, s, next, nextPtr, &version, &encodingName, &encoding, 0)) return syntaxError; if (encoding) { if (encoding->minBytesPerChar != (*enc)->minBytesPerChar) { *nextPtr = encodingName; return incorrectEncoding; } *enc = encoding; } else if (encodingName) { *nextPtr = encodingName; return unknownEncoding; } s = next; } context->dtd.containsRef = 1; return checkContent(1, context, *enc, s, end, nextPtr); } static enum WfCheckResult checkAttributeValue(DTD *dtd, const ENCODING *enc, const char *ptr, const char *end, const char **badPtr) { for (;;) { const char *next; int tok = XmlAttributeValueTok(enc, ptr, end, &next); switch (tok) { case XML_TOK_TRAILING_CR: case XML_TOK_NONE: return wellFormed; case XML_TOK_INVALID: *badPtr = next; return invalidToken; case XML_TOK_PARTIAL: *badPtr = ptr; return invalidToken; case XML_TOK_CHAR_REF: if (XmlCharRefNumber(enc, ptr) < 0) { *badPtr = ptr; return badCharRef; } break; case XML_TOK_DATA_CHARS: case XML_TOK_DATA_NEWLINE: break; case XML_TOK_ENTITY_REF: { const char *name = poolStoreString(&dtd->pool, enc, ptr + enc->minBytesPerChar, next - enc->minBytesPerChar); ENTITY *entity = (ENTITY *)lookup(&dtd->generalEntities, name, 0); poolDiscard(&dtd->pool); if (!entity) { if (!dtd->containsRef) { *badPtr = ptr; return undefinedEntity; } break; } if (entity->wfInAttribute) break; if (entity->open) { *badPtr = ptr; return recursiveEntityRef; } if (entity->notation) { *badPtr = ptr; return binaryEntityRef; } if (entity) { if (entity->textPtr) { enum WfCheckResult result; const ENCODING *internalEnc = XmlGetInternalEncoding(XML_UTF8_ENCODING); const char *textEnd = entity->textPtr + entity->textLen; entity->open = 1; result = checkAttributeValue(dtd, internalEnc, entity->textPtr, textEnd, badPtr); entity->open = 0; if (result && *badPtr) { *badPtr = ptr; return result; } entity->wfInAttribute = 1; } else { *badPtr = ptr; return attributeExternalEntityRef; } } break; } break; default: abort(); } ptr = next; } /* not reached */ } static void poolInit(STRING_POOL *pool) { pool->blocks = 0; pool->start = 0; pool->ptr = 0; pool->end = 0; } static void poolDestroy(STRING_POOL *pool) { BLOCK *p = pool->blocks; while (p) { BLOCK *tem = p->next; free(p); p = tem; } pool->blocks = 0; pool->ptr = 0; pool->start = 0; pool->end = 0; } static const char *poolAppend(STRING_POOL *pool, const ENCODING *enc, const char *ptr, const char *end) { for (;;) { XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &(pool->ptr), pool->end); if (ptr == end) break; if (!poolGrow(pool)) return 0; } return pool->start; } static const char *poolStoreString(STRING_POOL *pool, const ENCODING *enc, const char *ptr, const char *end) { if (!poolAppend(pool, enc, ptr, end)) return 0; if (pool->ptr == pool->end && !poolGrow(pool)) return 0; *(pool->ptr)++ = 0; return pool->start; } static int poolGrow(STRING_POOL *pool) { if (pool->blocks && pool->start == pool->blocks->s) { size_t blockSize = (pool->end - pool->start)*2; pool->blocks = realloc(pool->blocks, offsetof(BLOCK, s) + blockSize); if (!pool->blocks) return 0; pool->ptr = pool->blocks->s + (pool->ptr - pool->start); pool->start = pool->blocks->s; pool->end = pool->start + blockSize; } else { BLOCK *tem; size_t blockSize = pool->end - pool->start; if (blockSize < INIT_BLOCK_SIZE) blockSize = INIT_BLOCK_SIZE; else blockSize *= 2; tem = malloc(offsetof(BLOCK, s) + blockSize); if (!tem) return 0; tem->next = pool->blocks; pool->blocks = tem; memcpy(tem->s, pool->start, pool->ptr - pool->start); pool->ptr = tem->s + (pool->ptr - pool->start); pool->start = tem->s; pool->end = tem->s + blockSize; } return 1; } static int dtdInit(DTD *dtd) { static const char *names[] = { "lt", "amp", "gt", "quot", "apos" }; static const char chars[] = { '<', '&', '>', '"', '\'' }; int i; poolInit(&(dtd->pool)); hashTableInit(&(dtd->generalEntities)); for (i = 0; i < 5; i++) { ENTITY *entity = (ENTITY *)lookup(&(dtd->generalEntities), names[i], sizeof(ENTITY)); if (!entity) return 0; entity->textPtr = chars + i; entity->textLen = 1; entity->magic = 1; entity->wfInContent = 1; entity->wfInAttribute = 1; } dtd->containsRef = 0; dtd->groupSize = 0; dtd->groupConnector = 0; return 1; } static void dtdDestroy(DTD *dtd) { poolDestroy(&(dtd->pool)); hashTableDestroy(&(dtd->generalEntities)); free(dtd->groupConnector); } static enum WfCheckResult storeEntity(DTD *dtd, const ENCODING *enc, const char *entityNamePtr, const char *entityNameEnd, const char *entityTextPtr, const char *entityTextEnd, const char **badPtr) { ENTITY *entity; const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); STRING_POOL *pool = &(dtd->pool); if (entityNamePtr) { if (!poolStoreString(pool, enc, entityNamePtr, entityNameEnd)) return noMemory; entity = (ENTITY *)lookup(&(dtd->generalEntities), pool->start, sizeof(ENTITY)); if (entity->name != pool->start) { poolDiscard(pool); entityNamePtr = 0; } else poolFinish(pool); } entityTextPtr += enc->minBytesPerChar; entityTextEnd -= enc->minBytesPerChar; if (entityNamePtr) entity->docTextPtr = entityTextPtr; for (;;) { const char *next; int tok = XmlEntityValueTok(enc, entityTextPtr, entityTextEnd, &next); switch (tok) { case XML_TOK_PARAM_ENTITY_REF: *badPtr = entityTextPtr; return syntaxError; case XML_TOK_NONE: if (entityNamePtr) { entity->textPtr = pool->start; entity->textLen = pool->ptr - pool->start; poolFinish(pool); } else poolDiscard(pool); return wellFormed; case XML_TOK_ENTITY_REF: case XML_TOK_DATA_CHARS: if (!poolAppend(pool, enc, entityTextPtr, next)) return noMemory; break; case XML_TOK_TRAILING_CR: next = entityTextPtr + enc->minBytesPerChar; /* fall through */ case XML_TOK_DATA_NEWLINE: if (pool->end == pool->ptr && !poolGrow(pool)) return noMemory; *(pool->ptr)++ = '\n'; break; case XML_TOK_CHAR_REF: { char buf[XML_MAX_BYTES_PER_CHAR]; int i; int n = XmlCharRefNumber(enc, entityTextPtr); if (n < 0) { *badPtr = entityTextPtr; return badCharRef; } n = XmlEncode(utf8, n, buf); if (!n) { *badPtr = entityTextPtr; return badCharRef; } for (i = 0; i < n; i++) { if (pool->end == pool->ptr && !poolGrow(pool)) return noMemory; *(pool->ptr)++ = buf[i]; } } break; case XML_TOK_PARTIAL: *badPtr = entityTextPtr; return invalidToken; case XML_TOK_INVALID: *badPtr = next; return invalidToken; default: abort(); } entityTextPtr = next; } /* not reached */ }