Cover UTF-8 limit correction; some tests fail
Failing tests are:
[-] UTF-8 case 3: Expected movement by -1 chars, actually moved by 0 chars: "\xdf"
[-] UTF-8 case 4: Expected movement by 0 chars, actually moved by -1 chars: "\xdf\xbf"
[-] UTF-8 case 5: Expected movement by -1 chars, actually moved by 0 chars: "\xef"
[-] UTF-8 case 6: Expected movement by -2 chars, actually moved by -1 chars: "\xef\xbf"
[-] UTF-8 case 7: Expected movement by 0 chars, actually moved by -2 chars: "\xef\xbf\xbf"
[-] UTF-8 case 8: Expected movement by -1 chars, actually moved by 0 chars: "\xf7"
[-] UTF-8 case 9: Expected movement by -2 chars, actually moved by -1 chars: "\xf7\xbf"
[-] UTF-8 case 10: Expected movement by -3 chars, actually moved by -2 chars: "\xf7\xbf\xbf"
[-] UTF-8 case 11: Expected movement by 0 chars, actually moved by -3 chars: "\xf7\xbf\xbf\xbf"
This commit is contained in:
parent
525be92f78
commit
be917d9f84
@ -79,3 +79,17 @@
|
|||||||
# define UNUSED_P(p) UNUSED_ ## p
|
# define UNUSED_P(p) UNUSED_ ## p
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
@ -329,7 +329,7 @@ enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
|
|||||||
UTF8_cval4 = 0xf0
|
UTF8_cval4 = 0xf0
|
||||||
};
|
};
|
||||||
|
|
||||||
static void
|
void
|
||||||
align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
|
align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
|
||||||
{
|
{
|
||||||
const char * fromLim = *fromLimRef;
|
const char * fromLim = *fromLimRef;
|
||||||
|
@ -13,6 +13,10 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <stddef.h> /* ptrdiff_t */
|
||||||
|
#ifndef __cplusplus
|
||||||
|
# include <stdbool.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "expat.h"
|
#include "expat.h"
|
||||||
#include "chardata.h"
|
#include "chardata.h"
|
||||||
@ -367,6 +371,68 @@ START_TEST(test_illegal_utf8)
|
|||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
|
||||||
|
/* Examples, not masks: */
|
||||||
|
#define UTF8_LEAD_1 "\x7f" /* 0b01111111 */
|
||||||
|
#define UTF8_LEAD_2 "\xdf" /* 0b11011111 */
|
||||||
|
#define UTF8_LEAD_3 "\xef" /* 0b11101111 */
|
||||||
|
#define UTF8_LEAD_4 "\xf7" /* 0b11110111 */
|
||||||
|
#define UTF8_FOLLOW "\xbf" /* 0b10111111 */
|
||||||
|
|
||||||
|
START_TEST(test_utf8_auto_align)
|
||||||
|
{
|
||||||
|
struct TestCase {
|
||||||
|
ptrdiff_t expectedMovementInChars;
|
||||||
|
const char * input;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TestCase cases[] = {
|
||||||
|
{00, ""},
|
||||||
|
|
||||||
|
{00, UTF8_LEAD_1},
|
||||||
|
|
||||||
|
{-1, UTF8_LEAD_2},
|
||||||
|
{00, UTF8_LEAD_2 UTF8_FOLLOW},
|
||||||
|
|
||||||
|
{-1, UTF8_LEAD_3},
|
||||||
|
{-2, UTF8_LEAD_3 UTF8_FOLLOW},
|
||||||
|
{00, UTF8_LEAD_3 UTF8_FOLLOW UTF8_FOLLOW},
|
||||||
|
|
||||||
|
{-1, UTF8_LEAD_4},
|
||||||
|
{-2, UTF8_LEAD_4 UTF8_FOLLOW},
|
||||||
|
{-3, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW},
|
||||||
|
{00, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW UTF8_FOLLOW},
|
||||||
|
};
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
bool success = true;
|
||||||
|
for (; i < sizeof(cases) / sizeof(*cases); i++) {
|
||||||
|
const char * fromLim = cases[i].input + strlen(cases[i].input);
|
||||||
|
const char * const fromLimInitially = fromLim;
|
||||||
|
ptrdiff_t actualMovementInChars;
|
||||||
|
|
||||||
|
align_limit_to_full_utf8_characters(cases[i].input, &fromLim);
|
||||||
|
|
||||||
|
actualMovementInChars = (fromLim - fromLimInitially);
|
||||||
|
if (actualMovementInChars != cases[i].expectedMovementInChars) {
|
||||||
|
size_t j = 0;
|
||||||
|
success = false;
|
||||||
|
printf("[-] UTF-8 case %2lu: Expected movement by %2ld chars"
|
||||||
|
", actually moved by %2ld chars: \"",
|
||||||
|
i + 1, cases[i].expectedMovementInChars, actualMovementInChars);
|
||||||
|
for (; j < strlen(cases[i].input); j++) {
|
||||||
|
printf("\\x%02x", (unsigned char)cases[i].input[j]);
|
||||||
|
}
|
||||||
|
printf("\"\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (! success) {
|
||||||
|
fail("UTF-8 auto-alignment is not bullet-proof\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_utf16)
|
START_TEST(test_utf16)
|
||||||
{
|
{
|
||||||
/* <?xml version="1.0" encoding="UTF-16"?>
|
/* <?xml version="1.0" encoding="UTF-16"?>
|
||||||
@ -1543,6 +1609,7 @@ make_suite(void)
|
|||||||
tcase_add_test(tc_basic, test_bom_utf16_be);
|
tcase_add_test(tc_basic, test_bom_utf16_be);
|
||||||
tcase_add_test(tc_basic, test_bom_utf16_le);
|
tcase_add_test(tc_basic, test_bom_utf16_le);
|
||||||
tcase_add_test(tc_basic, test_illegal_utf8);
|
tcase_add_test(tc_basic, test_illegal_utf8);
|
||||||
|
tcase_add_test(tc_basic, test_utf8_auto_align);
|
||||||
tcase_add_test(tc_basic, test_utf16);
|
tcase_add_test(tc_basic, test_utf16);
|
||||||
tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
|
tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
|
||||||
tcase_add_test(tc_basic, test_latin1_umlauts);
|
tcase_add_test(tc_basic, test_latin1_umlauts);
|
||||||
|
Loading…
Reference in New Issue
Block a user