diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 8fc43a176f..0bcf10e2cd 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -25,7 +25,8 @@ typedef wxTextOutputStream& (*__wxTextOutputManip)(wxTextOutputStream&); WXDLLIMPEXP_BASE wxTextOutputStream &endl( wxTextOutputStream &stream ); -#define wxEOT wxT('\4') // the End-Of-Text control code (used only inside wxTextInputStream) +// Obsolete constant defined only for compatibility, not used. +#define wxEOT wxT('\4') // If you're scanning through a file using wxTextInputStream, you should check for EOF _before_ // reading the next item (word / number), because otherwise the last item may get lost. @@ -58,7 +59,7 @@ public: double ReadDouble(); wxString ReadLine(); wxString ReadWord(); - wxChar GetChar() { wxChar c = NextChar(); return (wxChar)(c != wxEOT ? c : 0); } + wxChar GetChar(); wxString GetStringSeparators() const { return m_separators; } void SetStringSeparators(const wxString &c) { m_separators = c; } @@ -83,7 +84,22 @@ public: protected: wxInputStream &m_input; wxString m_separators; - char m_lastBytes[10]; // stores the bytes that were read for the last character + + // Data possibly (see m_validXXX) read from the stream but not decoded yet. + // This is necessary because GetChar() may only return a single character + // but we may get more than one character when decoding raw input bytes. + char m_lastBytes[10]; + + // The bytes [0, m_validEnd) of m_lastBytes contain the bytes read by the + // last GetChar() call (this interval may be empty if GetChar() hasn't been + // called yet). The bytes [0, m_validBegin) have been already decoded and + // returned to caller or stored in m_lastWChar in the particularly + // egregious case of decoding a non-BMP character when using UTF-16 for + // wchar_t. Finally, the bytes [m_validBegin, m_validEnd) remain to be + // decoded and returned during the next call (again, this interval can, and + // usually will, be empty too if m_validBegin == m_validEnd). + size_t m_validBegin, + m_validEnd; #if wxUSE_UNICODE wxMBConv *m_conv; @@ -100,8 +116,6 @@ protected: bool EatEOL(const wxChar &c); void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues - // returns EOT (\4) if there is a stream error, or end of file - wxChar NextChar(); // this should be used instead of GetC() because of Unicode issues wxChar NextNonSeparators(); wxDECLARE_NO_COPY_CLASS(wxTextInputStream); diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 11d1adbc54..dcdf7ba76d 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -103,41 +103,44 @@ static size_t encode_utf16(wxUint32 input, wxUint16 *output) } } -static size_t decode_utf16(const wxUint16* input, wxUint32& output) -{ - if ((*input < 0xd800) || (*input > 0xdfff)) - { - output = *input; - return 1; - } - else if ((input[1] < 0xdc00) || (input[1] > 0xdfff)) - { - output = *input; - return wxCONV_FAILED; - } - else - { - output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00); - return 2; - } -} - -// returns the next UTF-32 character from the wchar_t buffer and advances the -// pointer to the character after this one +// Returns the next UTF-32 character from the wchar_t buffer terminated by the +// "end" pointer (the caller must ensure that on input "*pSrc < end") and +// advances the pointer to the character after this one. // -// if an invalid character is found, *pSrc is set to NULL, the caller must -// check for this -static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc) +// If an invalid or incomplete character is found, *pSrc is set to NULL, the +// caller must check for this. +static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end) { - wxUint32 out; - const size_t - n = decode_utf16(reinterpret_cast(*pSrc), out); - if ( n == wxCONV_FAILED ) - *pSrc = NULL; - else - *pSrc += n; + const wxChar16*& src = *pSrc; - return out; + // Is this a BMP character? + const wxUint16 u = *src++; + if ((u < 0xd800) || (u > 0xdfff)) + { + // Yes, just return it. + return u; + } + + // No, we have the first half of a surrogate, check if we also have the + // second half (notice that this check does nothing if end == NULL, as it + // is allowed to be, and this is correct). + if ( src == end ) + { + // No, we don't because this is the end of input. + src = NULL; + return 0; + } + + const wxUint16 u2 = *src++; + if ( (u2 < 0xdc00) || (u2 > 0xdfff) ) + { + // No, it's not in the low surrogate range. + src = NULL; + return 0; + } + + // Yes, decode it and return the corresponding Unicode character. + return ((u - 0xd7c0) << 10) + (u2 - 0xdc00); } // ---------------------------------------------------------------------------- @@ -699,6 +702,10 @@ size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen, // start of an encoded segment? if ( cc == '+' ) { + // Can't end with a plus sign. + if ( src == srcEnd ) + return wxCONV_FAILED; + if ( *src == '-' ) { // just the encoded plus sign, don't switch to shifted mode @@ -1070,9 +1077,10 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, char *out = dstLen ? dst : NULL; size_t written = 0; - for ( const wchar_t *wp = src; ; wp++ ) + const wchar_t* const end = srcLen == wxNO_LEN ? NULL : src + srcLen; + for ( const wchar_t *wp = src; ; ) { - if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) ) + if ( end ? wp == end : !*wp ) { // all done successfully, just add the trailing NULL if we are not // using explicit length @@ -1092,38 +1100,13 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, return written; } - if ( srcLen != wxNO_LEN ) - srcLen--; - wxUint32 code; #ifdef WC_UTF16 - // Be careful here: decode_utf16() may need to read the next wchar_t - // but we might not have any left, so pass it a temporary buffer which - // always has 2 wide characters and take care to set its second element - // to 0, which is invalid as a second half of a surrogate, to ensure - // that we return an error when trying to convert a buffer ending with - // half of a surrogate. - wxUint16 tmp[2]; - tmp[0] = wp[0]; - tmp[1] = srcLen != 0 ? wp[1] : 0; - switch ( decode_utf16(tmp, code) ) - { - case 1: - // Nothing special to do, just a character from BMP. - break; - - case 2: - // skip the next char too as we decoded a surrogate - wp++; - if ( srcLen != wxNO_LEN ) - srcLen--; - break; - - case wxCONV_FAILED: - return wxCONV_FAILED; - } + code = wxDecodeSurrogate(&wp, end); + if ( !wp ) + return wxCONV_FAILED; #else // wchar_t is UTF-32 - code = *wp & 0x7fffffff; + code = *wp++ & 0x7fffffff; #endif unsigned len; @@ -1381,20 +1364,15 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, // The length can be either given explicitly or computed implicitly for the // NUL-terminated strings. - const bool isNulTerminated = srcLen == wxNO_LEN; - while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n))) + const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen; + while ((end ? psz < end : *psz) && ((!buf) || (len < n))) { wxUint32 cc; #ifdef WC_UTF16 - // cast is ok for WC_UTF16 - size_t pa = decode_utf16((const wxUint16 *)psz, cc); - - // we could have consumed two input code units if we decoded a - // surrogate, so adjust the input pointer and, if necessary, the length - psz += (pa == wxCONV_FAILED) ? 1 : pa; - if ( pa == 2 && !isNulTerminated ) - srcLen--; + cc = wxDecodeSurrogate(&psz, end); + if ( !psz ) + return wxCONV_FAILED; #else cc = (*psz++) & 0x7fffffff; #endif @@ -1455,7 +1433,7 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, } } - if ( isNulTerminated ) + if ( !end ) { // Add the trailing NUL in this case if we have a large enough buffer. if ( buf && (len < n) ) @@ -1623,7 +1601,7 @@ wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, const wxUint16 *inBuff = reinterpret_cast(src); for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) { - const wxUint32 ch = wxDecodeSurrogate(&inBuff); + const wxUint32 ch = wxDecodeSurrogate(&inBuff, inEnd); if ( !inBuff ) return wxCONV_FAILED; @@ -1693,29 +1671,26 @@ wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, const wxUint16 *inBuff = reinterpret_cast(src); for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) { - wxUint32 ch; wxUint16 tmp[2]; + const wxUint16* tmpEnd = tmp; tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); - if ( ++inBuff < inEnd ) + tmpEnd++; + + if ( inBuff + 1 < inEnd ) { // Normal case, we have a next character to decode. - tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); - } - else // End of input. - { - // Setting the second character to 0 ensures we correctly return - // wxCONV_FAILED if the first one is the first half of a surrogate - // as the second half can't be 0 in this case. - tmp[1] = 0; + tmp[1] = wxUINT16_SWAP_ALWAYS(inBuff[1]); + tmpEnd++; } - const size_t numChars = decode_utf16(tmp, ch); - if ( numChars == wxCONV_FAILED ) + const wxUint16* p = tmp; + const wxUint32 ch = wxDecodeSurrogate(&p, tmpEnd); + if ( !p ) return wxCONV_FAILED; - if ( numChars == 2 ) - inBuff++; + // Move the real pointer by the same amount as "p" was updated by. + inBuff += p - tmp; outLen++; @@ -1861,7 +1836,7 @@ wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen, size_t outLen = 0; for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) { - const wxUint32 ch = wxDecodeSurrogate(&src); + const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd); if ( !src ) return wxCONV_FAILED; @@ -1930,7 +1905,7 @@ wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, size_t outLen = 0; for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) { - const wxUint32 ch = wxDecodeSurrogate(&src); + const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd); if ( !src ) return wxCONV_FAILED; diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 8c48cd6c77..d174e4e13f 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -35,7 +35,8 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxMBConv& conv) : m_input(s), m_separators(sep), m_conv(conv.Clone()) { - memset((void*)m_lastBytes, 0, 10); + m_validBegin = + m_validEnd = 0; #if SIZEOF_WCHAR_T == 2 m_lastWChar = 0; @@ -45,7 +46,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) : m_input(s), m_separators(sep) { - memset((void*)m_lastBytes, 0, 10); + m_validBegin = + m_validEnd = 0; + + m_lastBytes[0] = 0; } #endif @@ -58,14 +62,16 @@ wxTextInputStream::~wxTextInputStream() void wxTextInputStream::UngetLast() { - size_t byteCount = 0; - while(m_lastBytes[byteCount]) // pseudo ANSI strlen (even for Unicode!) - byteCount++; - m_input.Ungetch(m_lastBytes, byteCount); - memset((void*)m_lastBytes, 0, 10); + if ( m_validEnd ) + { + m_input.Ungetch(m_lastBytes, m_validEnd); + + m_validBegin = + m_validEnd = 0; + } } -wxChar wxTextInputStream::NextChar() +wxChar wxTextInputStream::GetChar() { #if wxUSE_UNICODE #if SIZEOF_WCHAR_T == 2 @@ -77,17 +83,37 @@ wxChar wxTextInputStream::NextChar() m_lastWChar = 0; return wc; } -#endif // !SWIG_ONLY_SCRIPT_API +#endif // SIZEOF_WCHAR_T - wxChar wbuf[2]; - memset((void*)m_lastBytes, 0, 10); - for(size_t inlen = 0; inlen < 9; inlen++) + // If we have any non-decoded bytes left from the last call, shift them to + // be at the beginning of the buffer. + if ( m_validBegin < m_validEnd ) { - // actually read the next character - m_lastBytes[inlen] = m_input.GetC(); + m_validEnd -= m_validBegin; + memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd); + } + else // All bytes were already decoded and consumed. + { + m_validEnd = 0; + } - if(m_input.LastRead() <= 0) - return wxEOT; + // We may need to decode up to 4 characters if we have input starting with + // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will + // only succeed when 4 bytes are read -- and will yield 4 wide characters. + wxChar wbuf[4]; + for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++) + { + if ( inlen >= m_validEnd ) + { + // actually read the next character + m_lastBytes[inlen] = m_input.GetC(); + + if(m_input.LastRead() <= 0) + return 0; + + m_validEnd++; + } + //else: Retry decoding what we already have in the buffer. switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) ) { @@ -103,12 +129,17 @@ wxChar wxTextInputStream::NextChar() break; default: - // if we couldn't decode a single character during the last - // loop iteration we shouldn't be able to decode 2 or more of - // them with an extra single byte, something fishy is going on - // (except if we use UTF-16, see below) - wxFAIL_MSG("unexpected decoding result"); - return wxEOT; + // If we couldn't decode a single character during the last + // loop iteration, but decoded more than one of them with just + // one extra byte, the only explanation is that we were using a + // wxConvAuto conversion recognizing the initial BOM and that + // it couldn't detect the presence or absence of BOM so far, + // but now finally has enough data to see that there is none. + // As we must have fallen back to Latin-1 in this case, return + // just the first byte and keep the other ones for the next + // time. + m_validBegin = 1; + return wbuf[0]; #if SIZEOF_WCHAR_T == 2 case 2: @@ -118,25 +149,37 @@ wxChar wxTextInputStream::NextChar() // remember the second one for the next call, as there is no // way to fit both of them into a single wxChar in this case. m_lastWChar = wbuf[1]; -#endif // !SWIG_ONLY_SCRIPT_API +#endif // SIZEOF_WCHAR_T == 2 wxFALLTHROUGH; case 1: + m_validBegin = inlen + 1; // we finally decoded a character return wbuf[0]; } } - // there should be no encoding which requires more than nine bytes for one - // character so something must be wrong with our conversion but we have no - // way to signal it from here - return wxEOT; + // There should be no encoding which requires more than 10 bytes to decode + // at least one character (the most actually seems to be 7: 3 for the + // initial BOM, which is ignored, and 4 for the longest possible encoding + // of a Unicode character in UTF-8), so something must be wrong with our + // conversion but we have no way to signal it from here and just return 0 + // as if we reached the end of the stream. + m_validBegin = 0; + m_validEnd = sizeof(m_lastBytes); + + return 0; #else m_lastBytes[0] = m_input.GetC(); if(m_input.LastRead() <= 0) - return wxEOT; + { + m_validEnd = 0; + return 0; + } + + m_validEnd = 1; return m_lastBytes[0]; #endif @@ -147,8 +190,9 @@ wxChar wxTextInputStream::NextNonSeparators() { for (;;) { - wxChar c = NextChar(); - if (c == wxEOT) return (wxChar) 0; + wxChar c = GetChar(); + if (!c) + return c; if (c != wxT('\n') && c != wxT('\r') && @@ -164,8 +208,8 @@ bool wxTextInputStream::EatEOL(const wxChar &c) if (c == wxT('\r')) // eat on both Mac and DOS { - wxChar c2 = NextChar(); - if(c2 == wxEOT) return true; // end of stream reached, had enough :-) + wxChar c2 = GetChar(); + if (!c2) return true; // end of stream reached, had enough :-) if (c2 != wxT('\n')) UngetLast(); // Don't eat on Mac return true; @@ -261,8 +305,8 @@ wxString wxTextInputStream::ReadLine() while ( !m_input.Eof() ) { - wxChar c = NextChar(); - if(c == wxEOT) + wxChar c = GetChar(); + if (!c) break; if (EatEOL(c)) @@ -289,8 +333,8 @@ wxString wxTextInputStream::ReadWord() while ( !m_input.Eof() ) { - c = NextChar(); - if(c == wxEOT) + c = GetChar(); + if (!c) break; if (m_separators.Find(c) >= 0) diff --git a/tests/mbconv/mbconvtest.cpp b/tests/mbconv/mbconvtest.cpp index 5a0afa4e5b..7edee7bdd6 100644 --- a/tests/mbconv/mbconvtest.cpp +++ b/tests/mbconv/mbconvtest.cpp @@ -985,6 +985,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); TestEncoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); } + SECTION("wxMBConvUTF16LE") { char u16le[sizeof(u16)]; for (size_t i = 0; i < sizeof(u16)/2; ++i) { @@ -995,6 +996,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); TestEncoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); } + SECTION("wxMBConvUTF16BE") { char u16be[sizeof(u16)]; for (size_t i = 0; i < sizeof(u16)/2; ++i) { @@ -1005,6 +1007,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); TestEncoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); } + SECTION("wxMBConvUTF32LE") { char u32le[sizeof(u32)]; for (size_t i = 0; i < sizeof(u32)/4; ++i) { @@ -1017,6 +1020,7 @@ void MBConvTestCase::NonBMPCharTests() TestDecoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); TestEncoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); } + SECTION("wxMBConvUTF32BE") { char u32be[sizeof(u32)]; for (size_t i = 0; i < sizeof(u32)/4; ++i) { diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp index 0975e7526e..edb6eaa8a2 100644 --- a/tests/streams/textstreamtest.cpp +++ b/tests/streams/textstreamtest.cpp @@ -290,4 +290,40 @@ void TextStreamTestCase::TestInput(const wxMBConv& conv, CPPUNIT_ASSERT_EQUAL( 0, memcmp(txtWchar, temp.wc_str(), sizeof(txtWchar)) ); } +TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]") +{ + // This is the simplest possible test that used to trigger assertion in + // wxTextInputStream::GetChar(). + SECTION("starts-with-nul") + { + const wxUint8 buf[] = { 0x00, 0x01, }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0x01 ); + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } + + // This exercises a problematic path in GetChar() as the first 3 bytes of + // this stream look like the start of UTF-32BE BOM, but this is not + // actually a BOM because the 4th byte is 0xFE and not 0xFF, so the stream + // should decode the buffer as Latin-1 once it gets there. + SECTION("almost-UTF-32-BOM") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xFE, 0xFE, 0x01 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0xFE ); + REQUIRE( tis.GetChar() == 0xFE ); + REQUIRE( tis.GetChar() == 0x01 ); + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } +} + #endif // wxUSE_UNICODE