From f365b0712b522e2f5246922bae21d328a04b44aa Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Wed, 8 Nov 2017 23:27:13 +0100 Subject: [PATCH 1/7] Get rid of wxTextInputStream code dealing with wxEOT Having NextChar() returning wxEOT only for GetChar() to turn it back to NUL didn't make any sense, just return NUL directly and get rid of GetChar/NextChar() distinction. No real changes, just simplify the code. --- include/wx/txtstrm.h | 7 +++---- src/common/txtstrm.cpp | 27 ++++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 8fc43a176f..5218f44520 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -25,7 +25,8 @@ typedef wxTextOutputStream& (*__wxTextOutputManip)(wxTextOutputStream&); WXDLLIMPEXP_BASE wxTextOutputStream &endl( wxTextOutputStream &stream ); -#define wxEOT wxT('\4') // the End-Of-Text control code (used only inside wxTextInputStream) +// Obsolete constant defined only for compatibility, not used. +#define wxEOT wxT('\4') // If you're scanning through a file using wxTextInputStream, you should check for EOF _before_ // reading the next item (word / number), because otherwise the last item may get lost. @@ -58,7 +59,7 @@ public: double ReadDouble(); wxString ReadLine(); wxString ReadWord(); - wxChar GetChar() { wxChar c = NextChar(); return (wxChar)(c != wxEOT ? c : 0); } + wxChar GetChar(); wxString GetStringSeparators() const { return m_separators; } void SetStringSeparators(const wxString &c) { m_separators = c; } @@ -100,8 +101,6 @@ protected: bool EatEOL(const wxChar &c); void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues - // returns EOT (\4) if there is a stream error, or end of file - wxChar NextChar(); // this should be used instead of GetC() because of Unicode issues wxChar NextNonSeparators(); wxDECLARE_NO_COPY_CLASS(wxTextInputStream); diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 8c48cd6c77..86c19583b0 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -65,7 +65,7 @@ void wxTextInputStream::UngetLast() memset((void*)m_lastBytes, 0, 10); } -wxChar wxTextInputStream::NextChar() +wxChar wxTextInputStream::GetChar() { #if wxUSE_UNICODE #if SIZEOF_WCHAR_T == 2 @@ -87,7 +87,7 @@ wxChar wxTextInputStream::NextChar() m_lastBytes[inlen] = m_input.GetC(); if(m_input.LastRead() <= 0) - return wxEOT; + return 0; switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) ) { @@ -108,7 +108,7 @@ wxChar wxTextInputStream::NextChar() // them with an extra single byte, something fishy is going on // (except if we use UTF-16, see below) wxFAIL_MSG("unexpected decoding result"); - return wxEOT; + return 0; #if SIZEOF_WCHAR_T == 2 case 2: @@ -131,12 +131,12 @@ wxChar wxTextInputStream::NextChar() // there should be no encoding which requires more than nine bytes for one // character so something must be wrong with our conversion but we have no // way to signal it from here - return wxEOT; + return 0; #else m_lastBytes[0] = m_input.GetC(); if(m_input.LastRead() <= 0) - return wxEOT; + return 0; return m_lastBytes[0]; #endif @@ -147,8 +147,9 @@ wxChar wxTextInputStream::NextNonSeparators() { for (;;) { - wxChar c = NextChar(); - if (c == wxEOT) return (wxChar) 0; + wxChar c = GetChar(); + if (!c) + return c; if (c != wxT('\n') && c != wxT('\r') && @@ -164,8 +165,8 @@ bool wxTextInputStream::EatEOL(const wxChar &c) if (c == wxT('\r')) // eat on both Mac and DOS { - wxChar c2 = NextChar(); - if(c2 == wxEOT) return true; // end of stream reached, had enough :-) + wxChar c2 = GetChar(); + if (!c2) return true; // end of stream reached, had enough :-) if (c2 != wxT('\n')) UngetLast(); // Don't eat on Mac return true; @@ -261,8 +262,8 @@ wxString wxTextInputStream::ReadLine() while ( !m_input.Eof() ) { - wxChar c = NextChar(); - if(c == wxEOT) + wxChar c = GetChar(); + if (!c) break; if (EatEOL(c)) @@ -289,8 +290,8 @@ wxString wxTextInputStream::ReadWord() while ( !m_input.Eof() ) { - c = NextChar(); - if(c == wxEOT) + c = GetChar(); + if (!c) break; if (m_separators.Find(c) >= 0) From 666ff421bb19d7ed453477d66ac580de017b13ba Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 9 Nov 2017 21:56:07 +0100 Subject: [PATCH 2/7] Fix an out of bounds read in UTF-7 decoding code Calling wxMBConvUTF7::ToWChar(..., "+", 1) resulted in reading uninitialized memory as the decoding code didn't check that there were any bytes left when switching to the "shifted" mode. Fix this by explicitly checking for this and returning an error if nothing is left. --- src/common/strconv.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 11d1adbc54..51808a3078 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -699,6 +699,10 @@ size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen, // start of an encoded segment? if ( cc == '+' ) { + // Can't end with a plus sign. + if ( src == srcEnd ) + return wxCONV_FAILED; + if ( *src == '-' ) { // just the encoded plus sign, don't switch to shifted mode From 9d5ff447e1f8bc9424982dde4fc8ce5f8b0ed48a Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 9 Nov 2017 22:09:29 +0100 Subject: [PATCH 3/7] Add subsections to MBConvTestCase::NonBMPCharTests() This allows to immediately see which of the tests failed just looking at the logs instead of having to check the failure line number manually. No real changes. --- tests/mbconv/mbconvtest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/mbconv/mbconvtest.cpp b/tests/mbconv/mbconvtest.cpp index 5a0afa4e5b..7edee7bdd6 100644 --- a/tests/mbconv/mbconvtest.cpp +++ b/tests/mbconv/mbconvtest.cpp @@ -985,6 +985,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); TestEncoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); } + SECTION("wxMBConvUTF16LE") { char u16le[sizeof(u16)]; for (size_t i = 0; i < sizeof(u16)/2; ++i) { @@ -995,6 +996,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); TestEncoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); } + SECTION("wxMBConvUTF16BE") { char u16be[sizeof(u16)]; for (size_t i = 0; i < sizeof(u16)/2; ++i) { @@ -1005,6 +1007,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); TestEncoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); } + SECTION("wxMBConvUTF32LE") { char u32le[sizeof(u32)]; for (size_t i = 0; i < sizeof(u32)/4; ++i) { @@ -1017,6 +1020,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); TestEncoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); } + SECTION("wxMBConvUTF32BE") { char u32be[sizeof(u32)]; for (size_t i = 0; i < sizeof(u32)/4; ++i) { From 2ee199acac7185894323c4ec1d2391469ef5da9f Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 9 Nov 2017 23:45:29 +0100 Subject: [PATCH 4/7] Change decode_utf16() to take wxChar16 instead of wxUint16 Under Unix systems, this is the same thing, but under MSW, where sizeof(wchar_t) == 2, this allows to pass wchar_t pointers to this function without casts. It also makes it consistent with wxDecodeSurrogate() and allows to get rid of another ugly cast there. No real changes. --- src/common/strconv.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 51808a3078..857486766a 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -103,7 +103,7 @@ static size_t encode_utf16(wxUint32 input, wxUint16 *output) } } -static size_t decode_utf16(const wxUint16* input, wxUint32& output) +static size_t decode_utf16(const wxChar16* input, wxUint32& output) { if ((*input < 0xd800) || (*input > 0xdfff)) { @@ -130,8 +130,7 @@ static size_t decode_utf16(const wxUint16* input, wxUint32& output) static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc) { wxUint32 out; - const size_t - n = decode_utf16(reinterpret_cast(*pSrc), out); + const size_t n = decode_utf16(*pSrc, out); if ( n == wxCONV_FAILED ) *pSrc = NULL; else @@ -1107,7 +1106,7 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, // to 0, which is invalid as a second half of a surrogate, to ensure // that we return an error when trying to convert a buffer ending with // half of a surrogate. - wxUint16 tmp[2]; + wchar_t tmp[2]; tmp[0] = wp[0]; tmp[1] = srcLen != 0 ? wp[1] : 0; switch ( decode_utf16(tmp, code) ) @@ -1391,8 +1390,7 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, wxUint32 cc; #ifdef WC_UTF16 - // cast is ok for WC_UTF16 - size_t pa = decode_utf16((const wxUint16 *)psz, cc); + size_t pa = decode_utf16(psz, cc); // we could have consumed two input code units if we decoded a // surrogate, so adjust the input pointer and, if necessary, the length From d82e3d4429be9c8c05b58b5bef7f010280b3a22a Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 9 Nov 2017 22:37:12 +0100 Subject: [PATCH 5/7] Never read uninitialized memory when decoding UTF-16 again Pass length value to decode_utf16() and end pointer to wxDecodeSurrogate() to ensure that we never read beyond the end of the buffer when decoding UTF-16 when the last (complete) 16 bit value in the buffer is the first half of a surrogate. This had been previously partially addressed by ad hoc changes, e.g. f72aa7b1c96c8bbf745459b488716d6ac58f0a2a did it for wxMBConvUTF16swap, but the problem still remained for wxMBConvUTF16straight. Ensure that this bug is fixed everywhere now but making it impossible to even try decoding a surrogate without providing the buffer length. --- src/common/strconv.cpp | 79 +++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 857486766a..58a8363501 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -103,14 +103,15 @@ static size_t encode_utf16(wxUint32 input, wxUint16 *output) } } -static size_t decode_utf16(const wxChar16* input, wxUint32& output) +static size_t +decode_utf16(const wxChar16* input, size_t len, wxUint32& output) { if ((*input < 0xd800) || (*input > 0xdfff)) { output = *input; return 1; } - else if ((input[1] < 0xdc00) || (input[1] > 0xdfff)) + else if ( !len || (input[1] < 0xdc00) || (input[1] > 0xdfff) ) { output = *input; return wxCONV_FAILED; @@ -122,15 +123,16 @@ static size_t decode_utf16(const wxChar16* input, wxUint32& output) } } -// returns the next UTF-32 character from the wchar_t buffer and advances the -// pointer to the character after this one +// Returns the next UTF-32 character from the wchar_t buffer terminated by the +// "end" pointer (the caller must ensure that on input "*pSrc < end") and +// advances the pointer to the character after this one. // -// if an invalid character is found, *pSrc is set to NULL, the caller must -// check for this -static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc) +// If an invalid or incomplete character is found, *pSrc is set to NULL, the +// caller must check for this. +static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end) { wxUint32 out; - const size_t n = decode_utf16(*pSrc, out); + const size_t n = decode_utf16(*pSrc, end - *pSrc - 1, out); if ( n == wxCONV_FAILED ) *pSrc = NULL; else @@ -1100,16 +1102,7 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, wxUint32 code; #ifdef WC_UTF16 - // Be careful here: decode_utf16() may need to read the next wchar_t - // but we might not have any left, so pass it a temporary buffer which - // always has 2 wide characters and take care to set its second element - // to 0, which is invalid as a second half of a surrogate, to ensure - // that we return an error when trying to convert a buffer ending with - // half of a surrogate. - wchar_t tmp[2]; - tmp[0] = wp[0]; - tmp[1] = srcLen != 0 ? wp[1] : 0; - switch ( decode_utf16(tmp, code) ) + switch ( decode_utf16(wp, srcLen, code) ) { case 1: // Nothing special to do, just a character from BMP. @@ -1390,13 +1383,20 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, wxUint32 cc; #ifdef WC_UTF16 - size_t pa = decode_utf16(psz, cc); + switch ( decode_utf16(psz++, srcLen, cc) ) + { + case 1: + break; - // we could have consumed two input code units if we decoded a - // surrogate, so adjust the input pointer and, if necessary, the length - psz += (pa == wxCONV_FAILED) ? 1 : pa; - if ( pa == 2 && !isNulTerminated ) - srcLen--; + case 2: + psz++; + if ( !isNulTerminated ) + srcLen--; + break; + + case wxCONV_FAILED: + return wxCONV_FAILED; + } #else cc = (*psz++) & 0x7fffffff; #endif @@ -1625,7 +1625,7 @@ wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, const wxUint16 *inBuff = reinterpret_cast(src); for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) { - const wxUint32 ch = wxDecodeSurrogate(&inBuff); + const wxUint32 ch = wxDecodeSurrogate(&inBuff, inEnd); if ( !inBuff ) return wxCONV_FAILED; @@ -1698,26 +1698,33 @@ wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, wxUint32 ch; wxUint16 tmp[2]; + size_t len; tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); if ( ++inBuff < inEnd ) { // Normal case, we have a next character to decode. + len = 1; tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); } else // End of input. { - // Setting the second character to 0 ensures we correctly return - // wxCONV_FAILED if the first one is the first half of a surrogate - // as the second half can't be 0 in this case. - tmp[1] = 0; + // Setting the length to 0 ensures we correctly return wxCONV_FAILED + // if the first one is the first half of a surrogate. + len = 0; } - const size_t numChars = decode_utf16(tmp, ch); - if ( numChars == wxCONV_FAILED ) - return wxCONV_FAILED; + switch ( decode_utf16(tmp, len, ch) ) + { + case 1: + break; - if ( numChars == 2 ) - inBuff++; + case 2: + inBuff++; + break; + + case wxCONV_FAILED: + return wxCONV_FAILED; + } outLen++; @@ -1863,7 +1870,7 @@ wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen, size_t outLen = 0; for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) { - const wxUint32 ch = wxDecodeSurrogate(&src); + const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd); if ( !src ) return wxCONV_FAILED; @@ -1932,7 +1939,7 @@ wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, size_t outLen = 0; for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) { - const wxUint32 ch = wxDecodeSurrogate(&src); + const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd); if ( !src ) return wxCONV_FAILED; From 46ea3cb8c0afc36d3a0fd33e14a483c91367f03e Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 9 Nov 2017 23:24:43 +0100 Subject: [PATCH 6/7] Refactor: merge decode_utf16() into wxDecodeSurrogate() No real changes, but just get rid of two functions doing the same thing but using (semantically) different API, this was just too confusing. Change all the code to use wxDecodeSurrogate() that encapsulates decoding the surrogate and advancing the input pointer as needed and so is less error-prone. More generally, change the code to use end pointers instead of decrementing the length to check for the end condition: this is more clear, simpler and probably even more efficient. --- src/common/strconv.cpp | 142 ++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 88 deletions(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 58a8363501..dcdf7ba76d 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -103,26 +103,6 @@ static size_t encode_utf16(wxUint32 input, wxUint16 *output) } } -static size_t -decode_utf16(const wxChar16* input, size_t len, wxUint32& output) -{ - if ((*input < 0xd800) || (*input > 0xdfff)) - { - output = *input; - return 1; - } - else if ( !len || (input[1] < 0xdc00) || (input[1] > 0xdfff) ) - { - output = *input; - return wxCONV_FAILED; - } - else - { - output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00); - return 2; - } -} - // Returns the next UTF-32 character from the wchar_t buffer terminated by the // "end" pointer (the caller must ensure that on input "*pSrc < end") and // advances the pointer to the character after this one. @@ -131,14 +111,36 @@ decode_utf16(const wxChar16* input, size_t len, wxUint32& output) // caller must check for this. static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end) { - wxUint32 out; - const size_t n = decode_utf16(*pSrc, end - *pSrc - 1, out); - if ( n == wxCONV_FAILED ) - *pSrc = NULL; - else - *pSrc += n; + const wxChar16*& src = *pSrc; - return out; + // Is this a BMP character? + const wxUint16 u = *src++; + if ((u < 0xd800) || (u > 0xdfff)) + { + // Yes, just return it. + return u; + } + + // No, we have the first half of a surrogate, check if we also have the + // second half (notice that this check does nothing if end == NULL, as it + // is allowed to be, and this is correct). + if ( src == end ) + { + // No, we don't because this is the end of input. + src = NULL; + return 0; + } + + const wxUint16 u2 = *src++; + if ( (u2 < 0xdc00) || (u2 > 0xdfff) ) + { + // No, it's not in the low surrogate range. + src = NULL; + return 0; + } + + // Yes, decode it and return the corresponding Unicode character. + return ((u - 0xd7c0) << 10) + (u2 - 0xdc00); } // ---------------------------------------------------------------------------- @@ -1075,9 +1077,10 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, char *out = dstLen ? dst : NULL; size_t written = 0; - for ( const wchar_t *wp = src; ; wp++ ) + const wchar_t* const end = srcLen == wxNO_LEN ? NULL : src + srcLen; + for ( const wchar_t *wp = src; ; ) { - if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) ) + if ( end ? wp == end : !*wp ) { // all done successfully, just add the trailing NULL if we are not // using explicit length @@ -1097,29 +1100,13 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, return written; } - if ( srcLen != wxNO_LEN ) - srcLen--; - wxUint32 code; #ifdef WC_UTF16 - switch ( decode_utf16(wp, srcLen, code) ) - { - case 1: - // Nothing special to do, just a character from BMP. - break; - - case 2: - // skip the next char too as we decoded a surrogate - wp++; - if ( srcLen != wxNO_LEN ) - srcLen--; - break; - - case wxCONV_FAILED: - return wxCONV_FAILED; - } + code = wxDecodeSurrogate(&wp, end); + if ( !wp ) + return wxCONV_FAILED; #else // wchar_t is UTF-32 - code = *wp & 0x7fffffff; + code = *wp++ & 0x7fffffff; #endif unsigned len; @@ -1377,26 +1364,15 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, // The length can be either given explicitly or computed implicitly for the // NUL-terminated strings. - const bool isNulTerminated = srcLen == wxNO_LEN; - while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n))) + const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen; + while ((end ? psz < end : *psz) && ((!buf) || (len < n))) { wxUint32 cc; #ifdef WC_UTF16 - switch ( decode_utf16(psz++, srcLen, cc) ) - { - case 1: - break; - - case 2: - psz++; - if ( !isNulTerminated ) - srcLen--; - break; - - case wxCONV_FAILED: - return wxCONV_FAILED; - } + cc = wxDecodeSurrogate(&psz, end); + if ( !psz ) + return wxCONV_FAILED; #else cc = (*psz++) & 0x7fffffff; #endif @@ -1457,7 +1433,7 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, } } - if ( isNulTerminated ) + if ( !end ) { // Add the trailing NUL in this case if we have a large enough buffer. if ( buf && (len < n) ) @@ -1695,36 +1671,26 @@ wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, const wxUint16 *inBuff = reinterpret_cast(src); for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) { - wxUint32 ch; wxUint16 tmp[2]; + const wxUint16* tmpEnd = tmp; - size_t len; tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); - if ( ++inBuff < inEnd ) + tmpEnd++; + + if ( inBuff + 1 < inEnd ) { // Normal case, we have a next character to decode. - len = 1; - tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); - } - else // End of input. - { - // Setting the length to 0 ensures we correctly return wxCONV_FAILED - // if the first one is the first half of a surrogate. - len = 0; + tmp[1] = wxUINT16_SWAP_ALWAYS(inBuff[1]); + tmpEnd++; } - switch ( decode_utf16(tmp, len, ch) ) - { - case 1: - break; + const wxUint16* p = tmp; + const wxUint32 ch = wxDecodeSurrogate(&p, tmpEnd); + if ( !p ) + return wxCONV_FAILED; - case 2: - inBuff++; - break; - - case wxCONV_FAILED: - return wxCONV_FAILED; - } + // Move the real pointer by the same amount as "p" was updated by. + inBuff += p - tmp; outLen++; From 4502e7563be3930841352750fbff5f5ea515d892 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 9 Nov 2017 02:02:54 +0100 Subject: [PATCH 7/7] Fix wxTextInputStream for input starting with BOM-like bytes Contrary to what a comment in wxTextInputStream::GetChar() said, it is actually possible to get more than one wide character from a call to wxMBConv::ToWChar(len+1) even if a previous call to ToWChar(len) failed to decode anything at all. This happens with wxConvAuto because it keeps returning an error while it doesn't have enough data to determine if the input contains a BOM or not, but then returns all the characters examined so far at once if it turns out that there was no BOM, after all. The simplest case in which this created problems was just input starting with a NUL byte as it as this could be a start of UTF-32BE BOM. The fix consists in keeping all the bytes read but not yet decoded in the m_lastBytes buffer and retrying to decode them during the next GetChar() call. This implies keeping track of how much valid data is there in m_lastBytes exactly, as we can't discard the already decoded data immediately, but need to keep it in the buffer too, in order to allow implementing UngetLast(). Incidentally, UngetLast() was totally broken for UTF-16/32 input (containing NUL bytes in the middle of the characters) before and this change fixes this as a side effect. Also add test cases for previously failing inputs. --- include/wx/txtstrm.h | 17 +++++- src/common/txtstrm.cpp | 93 +++++++++++++++++++++++--------- tests/streams/textstreamtest.cpp | 36 +++++++++++++ 3 files changed, 120 insertions(+), 26 deletions(-) diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 5218f44520..0bcf10e2cd 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -84,7 +84,22 @@ public: protected: wxInputStream &m_input; wxString m_separators; - char m_lastBytes[10]; // stores the bytes that were read for the last character + + // Data possibly (see m_validXXX) read from the stream but not decoded yet. + // This is necessary because GetChar() may only return a single character + // but we may get more than one character when decoding raw input bytes. + char m_lastBytes[10]; + + // The bytes [0, m_validEnd) of m_lastBytes contain the bytes read by the + // last GetChar() call (this interval may be empty if GetChar() hasn't been + // called yet). The bytes [0, m_validBegin) have been already decoded and + // returned to caller or stored in m_lastWChar in the particularly + // egregious case of decoding a non-BMP character when using UTF-16 for + // wchar_t. Finally, the bytes [m_validBegin, m_validEnd) remain to be + // decoded and returned during the next call (again, this interval can, and + // usually will, be empty too if m_validBegin == m_validEnd). + size_t m_validBegin, + m_validEnd; #if wxUSE_UNICODE wxMBConv *m_conv; diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 86c19583b0..d174e4e13f 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -35,7 +35,8 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxMBConv& conv) : m_input(s), m_separators(sep), m_conv(conv.Clone()) { - memset((void*)m_lastBytes, 0, 10); + m_validBegin = + m_validEnd = 0; #if SIZEOF_WCHAR_T == 2 m_lastWChar = 0; @@ -45,7 +46,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) : m_input(s), m_separators(sep) { - memset((void*)m_lastBytes, 0, 10); + m_validBegin = + m_validEnd = 0; + + m_lastBytes[0] = 0; } #endif @@ -58,11 +62,13 @@ wxTextInputStream::~wxTextInputStream() void wxTextInputStream::UngetLast() { - size_t byteCount = 0; - while(m_lastBytes[byteCount]) // pseudo ANSI strlen (even for Unicode!) - byteCount++; - m_input.Ungetch(m_lastBytes, byteCount); - memset((void*)m_lastBytes, 0, 10); + if ( m_validEnd ) + { + m_input.Ungetch(m_lastBytes, m_validEnd); + + m_validBegin = + m_validEnd = 0; + } } wxChar wxTextInputStream::GetChar() @@ -77,17 +83,37 @@ wxChar wxTextInputStream::GetChar() m_lastWChar = 0; return wc; } -#endif // !SWIG_ONLY_SCRIPT_API +#endif // SIZEOF_WCHAR_T - wxChar wbuf[2]; - memset((void*)m_lastBytes, 0, 10); - for(size_t inlen = 0; inlen < 9; inlen++) + // If we have any non-decoded bytes left from the last call, shift them to + // be at the beginning of the buffer. + if ( m_validBegin < m_validEnd ) { - // actually read the next character - m_lastBytes[inlen] = m_input.GetC(); + m_validEnd -= m_validBegin; + memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd); + } + else // All bytes were already decoded and consumed. + { + m_validEnd = 0; + } - if(m_input.LastRead() <= 0) - return 0; + // We may need to decode up to 4 characters if we have input starting with + // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will + // only succeed when 4 bytes are read -- and will yield 4 wide characters. + wxChar wbuf[4]; + for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++) + { + if ( inlen >= m_validEnd ) + { + // actually read the next character + m_lastBytes[inlen] = m_input.GetC(); + + if(m_input.LastRead() <= 0) + return 0; + + m_validEnd++; + } + //else: Retry decoding what we already have in the buffer. switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) ) { @@ -103,12 +129,17 @@ wxChar wxTextInputStream::GetChar() break; default: - // if we couldn't decode a single character during the last - // loop iteration we shouldn't be able to decode 2 or more of - // them with an extra single byte, something fishy is going on - // (except if we use UTF-16, see below) - wxFAIL_MSG("unexpected decoding result"); - return 0; + // If we couldn't decode a single character during the last + // loop iteration, but decoded more than one of them with just + // one extra byte, the only explanation is that we were using a + // wxConvAuto conversion recognizing the initial BOM and that + // it couldn't detect the presence or absence of BOM so far, + // but now finally has enough data to see that there is none. + // As we must have fallen back to Latin-1 in this case, return + // just the first byte and keep the other ones for the next + // time. + m_validBegin = 1; + return wbuf[0]; #if SIZEOF_WCHAR_T == 2 case 2: @@ -118,25 +149,37 @@ wxChar wxTextInputStream::GetChar() // remember the second one for the next call, as there is no // way to fit both of them into a single wxChar in this case. m_lastWChar = wbuf[1]; -#endif // !SWIG_ONLY_SCRIPT_API +#endif // SIZEOF_WCHAR_T == 2 wxFALLTHROUGH; case 1: + m_validBegin = inlen + 1; // we finally decoded a character return wbuf[0]; } } - // there should be no encoding which requires more than nine bytes for one - // character so something must be wrong with our conversion but we have no - // way to signal it from here + // There should be no encoding which requires more than 10 bytes to decode + // at least one character (the most actually seems to be 7: 3 for the + // initial BOM, which is ignored, and 4 for the longest possible encoding + // of a Unicode character in UTF-8), so something must be wrong with our + // conversion but we have no way to signal it from here and just return 0 + // as if we reached the end of the stream. + m_validBegin = 0; + m_validEnd = sizeof(m_lastBytes); + return 0; #else m_lastBytes[0] = m_input.GetC(); if(m_input.LastRead() <= 0) + { + m_validEnd = 0; return 0; + } + + m_validEnd = 1; return m_lastBytes[0]; #endif diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp index 0975e7526e..edb6eaa8a2 100644 --- a/tests/streams/textstreamtest.cpp +++ b/tests/streams/textstreamtest.cpp @@ -290,4 +290,40 @@ void TextStreamTestCase::TestInput(const wxMBConv& conv, CPPUNIT_ASSERT_EQUAL( 0, memcmp(txtWchar, temp.wc_str(), sizeof(txtWchar)) ); } +TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]") +{ + // This is the simplest possible test that used to trigger assertion in + // wxTextInputStream::GetChar(). + SECTION("starts-with-nul") + { + const wxUint8 buf[] = { 0x00, 0x01, }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0x01 ); + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } + + // This exercises a problematic path in GetChar() as the first 3 bytes of + // this stream look like the start of UTF-32BE BOM, but this is not + // actually a BOM because the 4th byte is 0xFE and not 0xFF, so the stream + // should decode the buffer as Latin-1 once it gets there. + SECTION("almost-UTF-32-BOM") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xFE, 0xFE, 0x01 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0xFE ); + REQUIRE( tis.GetChar() == 0xFE ); + REQUIRE( tis.GetChar() == 0x01 ); + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } +} + #endif // wxUSE_UNICODE