Cover UTF-8 limit correction; some tests fail

Failing tests are:
[-] UTF-8 case  3: Expected movement by -1 chars, actually moved by  0 chars: "\xdf"
[-] UTF-8 case  4: Expected movement by  0 chars, actually moved by -1 chars: "\xdf\xbf"
[-] UTF-8 case  5: Expected movement by -1 chars, actually moved by  0 chars: "\xef"
[-] UTF-8 case  6: Expected movement by -2 chars, actually moved by -1 chars: "\xef\xbf"
[-] UTF-8 case  7: Expected movement by  0 chars, actually moved by -2 chars: "\xef\xbf\xbf"
[-] UTF-8 case  8: Expected movement by -1 chars, actually moved by  0 chars: "\xf7"
[-] UTF-8 case  9: Expected movement by -2 chars, actually moved by -1 chars: "\xf7\xbf"
[-] UTF-8 case 10: Expected movement by -3 chars, actually moved by -2 chars: "\xf7\xbf\xbf"
[-] UTF-8 case 11: Expected movement by  0 chars, actually moved by -3 chars: "\xf7\xbf\xbf\xbf"
This commit is contained in:
Sebastian Pipping 2016-05-20 22:20:19 +02:00
parent 525be92f78
commit be917d9f84
3 changed files with 82 additions and 1 deletions

View File

@ -79,3 +79,17 @@
# define UNUSED_P(p) UNUSED_ ## p # define UNUSED_P(p) UNUSED_ ## p
# endif # endif
#endif #endif
#ifdef __cplusplus
extern "C" {
#endif
/* Declared with C linkage so that C++ translation units can call it too.
 *
 * Parameters:
 *   from       - start of the input buffer.
 *   fromLimRef - in/out: address of the pointer marking the end (limit) of
 *                the input; the callee may store an adjusted limit here.
 *
 * NOTE(review): judging by the name and by the expectations of the
 * accompanying test, this moves *fromLimRef backwards so that the range
 * [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
 * sequence -- confirm against the definition.
 */
void
align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef);
#ifdef __cplusplus
}
#endif

View File

@ -329,7 +329,7 @@ enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
UTF8_cval4 = 0xf0 UTF8_cval4 = 0xf0
}; };
static void void
align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef) align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
{ {
const char * fromLim = *fromLimRef; const char * fromLim = *fromLimRef;

View File

@ -13,6 +13,10 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include <stddef.h> /* ptrdiff_t */
#ifndef __cplusplus
# include <stdbool.h>
#endif
#include "expat.h" #include "expat.h"
#include "chardata.h" #include "chardata.h"
@ -367,6 +371,68 @@ START_TEST(test_illegal_utf8)
} }
END_TEST END_TEST
/* Sample bytes used to build UTF-8 test inputs.
 *
 * These are examples, not masks: each macro is a one-byte string literal
 * holding a single representative byte, so adjacent macros concatenate into
 * a longer test string (e.g. UTF8_LEAD_2 UTF8_FOLLOW). The binary form of
 * each byte is given alongside; the high bits show which kind of byte it is
 * (0xxxxxxx single byte, 110xxxxx / 1110xxxx / 11110xxx lead byte of a 2-,
 * 3- or 4-byte sequence, 10xxxxxx continuation byte).
 */
#define UTF8_LEAD_1 "\x7f" /* 0b01111111 */
#define UTF8_LEAD_2 "\xdf" /* 0b11011111 */
#define UTF8_LEAD_3 "\xef" /* 0b11101111 */
#define UTF8_LEAD_4 "\xf7" /* 0b11110111 */
#define UTF8_FOLLOW "\xbf" /* 0b10111111 */
/* Checks that align_limit_to_full_utf8_characters() moves the end-of-input
 * limit by the expected amount for complete and truncated UTF-8 sequences.
 *
 * For every case the limit initially points at the terminating NUL of the
 * input string. After the call, the distance the limit moved (in units of
 * char; zero or negative) is compared with the expectation. All mismatches
 * are printed before the test is failed, so a single run reports every
 * broken case.
 */
START_TEST(test_utf8_auto_align)
{
    struct TestCase {
        ptrdiff_t expectedMovementInChars; /* expected (new limit - old limit) */
        const char * input;                /* NUL-terminated test bytes */
    };

    struct TestCase cases[] = {
        {00, ""},

        {00, UTF8_LEAD_1},

        {-1, UTF8_LEAD_2},
        {00, UTF8_LEAD_2 UTF8_FOLLOW},

        {-1, UTF8_LEAD_3},
        {-2, UTF8_LEAD_3 UTF8_FOLLOW},
        {00, UTF8_LEAD_3 UTF8_FOLLOW UTF8_FOLLOW},

        {-1, UTF8_LEAD_4},
        {-2, UTF8_LEAD_4 UTF8_FOLLOW},
        {-3, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW},
        {00, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW UTF8_FOLLOW},
    };

    size_t i = 0;
    bool success = true;
    for (; i < sizeof(cases) / sizeof(*cases); i++) {
        /* Computed once; reused for the limit and for the hex dump below. */
        const size_t inputLen = strlen(cases[i].input);
        const char * fromLim = cases[i].input + inputLen;
        const char * const fromLimInitially = fromLim;
        ptrdiff_t actualMovementInChars;

        align_limit_to_full_utf8_characters(cases[i].input, &fromLim);

        actualMovementInChars = (fromLim - fromLimInitially);
        if (actualMovementInChars != cases[i].expectedMovementInChars) {
            size_t j = 0;
            success = false;
            /* size_t and ptrdiff_t are not guaranteed to be (unsigned) long,
             * so cast explicitly to the types %lu / %ld require. */
            printf("[-] UTF-8 case %2lu: Expected movement by %2ld chars"
                   ", actually moved by %2ld chars: \"",
                   (unsigned long)(i + 1),
                   (long)cases[i].expectedMovementInChars,
                   (long)actualMovementInChars);
            /* Dump the input as \xNN escapes; go through unsigned char so
             * bytes >= 0x80 are not sign-extended. */
            for (; j < inputLen; j++) {
                printf("\\x%02x", (unsigned char)cases[i].input[j]);
            }
            printf("\"\n");
        }
    }

    if (! success) {
        fail("UTF-8 auto-alignment is not bullet-proof\n");
    }
}
END_TEST
START_TEST(test_utf16) START_TEST(test_utf16)
{ {
/* <?xml version="1.0" encoding="UTF-16"?> /* <?xml version="1.0" encoding="UTF-16"?>
@ -1543,6 +1609,7 @@ make_suite(void)
tcase_add_test(tc_basic, test_bom_utf16_be); tcase_add_test(tc_basic, test_bom_utf16_be);
tcase_add_test(tc_basic, test_bom_utf16_le); tcase_add_test(tc_basic, test_bom_utf16_le);
tcase_add_test(tc_basic, test_illegal_utf8); tcase_add_test(tc_basic, test_illegal_utf8);
tcase_add_test(tc_basic, test_utf8_auto_align);
tcase_add_test(tc_basic, test_utf16); tcase_add_test(tc_basic, test_utf16);
tcase_add_test(tc_basic, test_utf16_le_epilog_newline); tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
tcase_add_test(tc_basic, test_latin1_umlauts); tcase_add_test(tc_basic, test_latin1_umlauts);