Fail to convert wide string with incomplete surrogates to UTF-8

Correctly fail if the wide string being converted is UTF-16 encoded (which can only happen on platforms using 16-bit wchar_t, i.e. MSW) and ends in the middle of a surrogate pair. Note that other conversions still wrongly encode invalid wchar_t sequences, such as 0xd800 not followed by anything; this will need to be fixed in the future, but for now at least make it work for the most commonly used conversion. See #17070.
2015-11-12 02:39:36 +01:00 · 2015-11-12 02:39:36 +01:00 · 048ba4b509
commit 048ba4b509
parent 6602eb3384
2 changed files with 35 additions and 6 deletions
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@ -1122,13 +1122,30 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,

        wxUint32 code;
 #ifdef WC_UTF16
-        // cast is ok for WC_UTF16
-        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
+        // Be careful here: decode_utf16() may need to read the next wchar_t
+        // but we might not have any left, so pass it a temporary buffer which
+        // always has 2 wide characters and take care to set its second element
+        // to 0, which is invalid as a second half of a surrogate, to ensure
+        // that we return an error when trying to convert a buffer ending with
+        // half of a surrogate.
+        wxUint16 tmp[2];
+        tmp[0] = wp[0];
+        tmp[1] = srcLen != 0 ? wp[1] : 0;
+        switch ( decode_utf16(tmp, code) )
        {
-            // skip the next char too as we decoded a surrogate
-            wp++;
-            if ( srcLen != wxNO_LEN )
-                srcLen--;
+            case 1:
+                // Nothing special to do, just a character from BMP.
+                break;
+
+            case 2:
+                // skip the next char too as we decoded a surrogate
+                wp++;
+                if ( srcLen != wxNO_LEN )
+                    srcLen--;
+                break;
+
+            case wxCONV_FAILED:
+                return wxCONV_FAILED;
        }
 #else // wchar_t is UTF-32
        code = *wp & 0x7fffffff;
--- a/tests/mbconv/mbconvtest.cpp
+++ b/tests/mbconv/mbconvtest.cpp
@ -203,6 +203,12 @@ private:
    void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
    void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }

+    // Test that converting strings with incomplete surrogates in them fails
+    // (surrogates are only used in UTF-16, i.e. when wchar_t is 16 bits).
+#if SIZEOF_WCHAR_T == 2
+    void UTF8_fail_broken_surrogates();
+#endif // SIZEOF_WCHAR_T == 2
+
    // implementation for the utf-8 tests (see comments below)
    void UTF8(const char *charSequence, const wchar_t *wideSequence);
    void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
@ -461,6 +467,12 @@ void MBConvTestCase::UTF8Tests()
        wxConvUTF8,
        1
        );
+
+#if SIZEOF_WCHAR_T == 2
+    // Can't use \ud800 as it's an invalid Unicode character.
+    const wchar_t wc = 0xd800;
+    CPPUNIT_ASSERT_EQUAL(wxCONV_FAILED, wxConvUTF8.FromWChar(NULL, 0, &wc, 1));
+#endif // SIZEOF_WCHAR_T == 2
 }

 void MBConvTestCase::UTF16LETests()