Fail to convert wide string with incomplete surrogates to UTF-8

Correctly fail if the wide string being converted is UTF-16 encoded (which can
only happen on platforms using 16-bit wchar_t, i.e. MSW) and ends in the
middle of a surrogate pair.

Note that other conversions still wrongly encode invalid wchar_t sequences,
such as 0xd800 not followed by anything. This will need to be fixed in the
future, but for now at least make it work for the most commonly used
conversion.

See #17070.
This commit is contained in:
Vadim Zeitlin 2015-11-12 02:39:36 +01:00
parent 6602eb3384
commit 048ba4b509
2 changed files with 35 additions and 6 deletions

View File

@ -1122,13 +1122,30 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
wxUint32 code;
#ifdef WC_UTF16
// cast is ok for WC_UTF16
if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
// Be careful here: decode_utf16() may need to read the next wchar_t
// but we might not have any left, so pass it a temporary buffer which
// always has 2 wide characters and take care to set its second element
// to 0, which is invalid as a second half of a surrogate, to ensure
// that we return an error when trying to convert a buffer ending with
// half of a surrogate.
wxUint16 tmp[2];
tmp[0] = wp[0];
tmp[1] = srcLen != 0 ? wp[1] : 0;
switch ( decode_utf16(tmp, code) )
{
// skip the next char too as we decoded a surrogate
wp++;
if ( srcLen != wxNO_LEN )
srcLen--;
case 1:
// Nothing special to do, just a character from BMP.
break;
case 2:
// skip the next char too as we decoded a surrogate
wp++;
if ( srcLen != wxNO_LEN )
srcLen--;
break;
case wxCONV_FAILED:
return wxCONV_FAILED;
}
#else // wchar_t is UTF-32
code = *wp & 0x7fffffff;

View File

@ -203,6 +203,12 @@ private:
void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }
// Test that converting strings with incomplete surrogates in them fails
// (surrogates are only used in UTF-16, i.e. when wchar_t is 16 bits).
#if SIZEOF_WCHAR_T == 2
void UTF8_fail_broken_surrogates();
#endif // SIZEOF_WCHAR_T == 2
// implementation for the utf-8 tests (see comments below)
void UTF8(const char *charSequence, const wchar_t *wideSequence);
void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
@ -461,6 +467,12 @@ void MBConvTestCase::UTF8Tests()
wxConvUTF8,
1
);
#if SIZEOF_WCHAR_T == 2
// Can't use \ud800 as it's an invalid Unicode character.
const wchar_t wc = 0xd800;
CPPUNIT_ASSERT_EQUAL(wxCONV_FAILED, wxConvUTF8.FromWChar(NULL, 0, &wc, 1));
#endif // SIZEOF_WCHAR_T == 2
}
void MBConvTestCase::UTF16LETests()