From af83769bd097b1416d8837ca0b30bfaa7e3bb21f Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sun, 21 Apr 2019 20:04:52 +0200 Subject: [PATCH 1/4] Add wxMBConv::GetMaxCharLen() This is not used yet, but will be needed soon in order to determine whether we have sufficiently many to decode them. --- include/wx/strconv.h | 20 ++++++++++++++++++++ interface/wx/strconv.h | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/include/wx/strconv.h b/include/wx/strconv.h index 207a235605..d344a5a911 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -32,6 +32,8 @@ class WXDLLIMPEXP_FWD_BASE wxString; // don't let the fact that the existing classes implement MB2WC/WC2MB() instead // confuse you. // +// For many encodings you must override GetMaxCharLen(). +// // You also have to implement Clone() to allow copying the conversions // polymorphically. // @@ -118,6 +120,10 @@ public: wxWCharBuffer cWX2WC(const char *psz) const { return cMB2WC(psz); } #endif // Unicode/ANSI + // return the maximum number of bytes that can be required to encode a + // single character in this encoding, e.g. 4 for UTF-8 + virtual size_t GetMaxCharLen() const { return 1; } + // this function is used in the implementation of cMB2WC() to distinguish // between the following cases: // @@ -254,6 +260,8 @@ public: virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF7; } private: @@ -341,6 +349,8 @@ public: virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvStrictUTF8(); } // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't @@ -365,6 +375,8 @@ public: virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF8(m_options); } // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't @@ -405,6 +417,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16LE; } }; @@ -419,6 +432,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16BE; } }; @@ -451,6 +465,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32LE; } }; @@ -465,6 +480,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32BE; } }; @@ -566,6 +582,10 @@ public: FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + // Use the value for UTF-8 here to make sure we try to decode up to 4 bytes + // as UTF-8 before giving up. + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxWhateverWorksConv(); diff --git a/interface/wx/strconv.h b/interface/wx/strconv.h index 00dfb359e6..7a2055495f 100644 --- a/interface/wx/strconv.h +++ b/interface/wx/strconv.h @@ -48,6 +48,26 @@ public: */ virtual wxMBConv* Clone() const = 0; + /** + This function must be overridden in the derived classes to return the + maximum length, in bytes, of a single Unicode character representation + in this encoding. + + As a consequence, the conversion object must be able to decode any + valid sequence of bytes in the corresponding encoding if it's at least + that many bytes long, but may fail if it is shorter. For example, for + UTF-8 the maximum character length is 4, as 3 bytes or less may be + insufficient to represent a Unicode character in UTF-8, but 4 are + always enough. + + For compatibility reasons, this method is not pure virtual and returns + 1 by default in the base class, however it should be always overridden + in the derived classes. + + @since 3.1.3 + */ + virtual size_t GetMaxCharLen() const; + /** This function returns 1 for most of the multibyte encodings in which the string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for From 731b3a804f3ea39ac7a6ce98d2ca260c9040c9e6 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sun, 21 Apr 2019 20:05:37 +0200 Subject: [PATCH 2/4] Return error from wxConvAuto if not given enough bytes Instead of falling back on Latin-1 if we fail to decode the input as UTF-8, check if we have enough bytes for the latter and just return an error if we don't. This ensures that wxTextInputStream::GetChar() and similar code will retry with a longer byte sequence, allowing wxConvAuto to be used for decoding UTF-8 contents on the fly, which didn't work before. See #14720. --- src/common/convauto.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 6a5fba4ecb..a9fe15cb2d 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -309,6 +309,13 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) { + // we may need more bytes before we can decode the input, don't switch + // to the fall-back conversion in this case as it would prevent us from + // decoding UTF-8 input when fed it byte by byte, as done by + // wxTextInputStream, for example + if ( srcLen < m_conv->GetMaxCharLen() ) + return wxCONV_FAILED; + // if the conversion failed but we didn't really detect anything and // simply tried UTF-8 by default, retry it using the fall-back if ( m_encDefault != wxFONTENCODING_MAX ) From 7d3687f515cb0f22ca1a34237547bc66780ebef8 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sun, 21 Apr 2019 20:09:13 +0200 Subject: [PATCH 3/4] Restore wxConvAuto in wxTextInputStream used by wxExecute() This reverts commit a05ae051d86bad67ba03620cfb6871b29e618e85 and allows to correctly decode UTF-8 output of child processes again. Also add a (disabled by default) test allowing to verify that this does work now. Closes #14720, #18382. --- src/common/utilscmn.cpp | 9 +-------- tests/exec/exec.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/common/utilscmn.cpp b/src/common/utilscmn.cpp index b1f9ffed46..2206f3d895 100644 --- a/src/common/utilscmn.cpp +++ b/src/common/utilscmn.cpp @@ -613,14 +613,7 @@ static bool ReadAll(wxInputStream *is, wxArrayString& output) // the stream could be already at EOF or in wxSTREAM_BROKEN_PIPE state is->Reset(); - // Notice that wxTextInputStream doesn't work correctly with wxConvAuto - // currently, see #14720, so use the current locale conversion explicitly - // under assumption that any external program should be using it too. - wxTextInputStream tis(*is, " \t" -#if wxUSE_UNICODE - , wxConvLibc -#endif - ); + wxTextInputStream tis(*is); for ( ;; ) { diff --git a/tests/exec/exec.cpp b/tests/exec/exec.cpp index f350376043..bb82e4c059 100644 --- a/tests/exec/exec.cpp +++ b/tests/exec/exec.cpp @@ -516,3 +516,28 @@ void ExecTestCase::TestOverlappedSyncExecute() CPPUNIT_ASSERT_EQUAL( SLEEP_END_STRING, longSleepOutput.Last() ); #endif // !__WINDOWS__ } + +#ifdef __UNIX__ + +// This test is disabled by default because it must be run in French locale, +// i.e. with explicit LC_ALL=fr_FR.UTF-8 and only works with GNU ls, which +// produces the expected output. +TEST_CASE("wxExecute::RedirectUTF8", "[exec][unicode][.]") +{ + wxArrayString output; + REQUIRE( wxExecute("/bin/ls --version", output) == 0 ); + + for ( size_t n = 0; n < output.size(); ++n ) + { + // It seems unlikely that this part of the output will change for GNU + // ls, so check for its presence as a sign that the program output was + // decoded correctly. + if ( output[n].find(wxString::FromUTF8("vous \xc3\xaates libre")) != wxString::npos ) + return; + } + + INFO("output was:\n" << wxJoin(output, '\n')); + FAIL("Expected output fragment not found."); +} + +#endif // __UNIX__ From f28224dfad81d9b40ad221f78c103605af5fe517 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sun, 21 Apr 2019 20:10:49 +0200 Subject: [PATCH 4/4] Avoid infinite loops when decoding fails in wxTextInputStream wxTextInputStream::ReadLine() must set stream error to indicate to the caller that it failed to decode its input, otherwise there is no way to know about the error and the caller risks to keep calling ReadLine() forever, as it happened in ReadAll() helper used by wxExecute(), for example. See #18382. --- src/common/txtstrm.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 814efb62b0..c38f7c29ab 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -303,12 +303,24 @@ wxString wxTextInputStream::ReadLine() { wxString line; - while ( !m_input.Eof() ) + for ( ;; ) { wxChar c = GetChar(); - if (!c) + if ( m_input.Eof() ) break; + if (!c) + { + // If we failed to get a character and the stream is not at EOF, it + // can only mean that decoding the stream contents using our + // conversion object failed. In this case, we must signal an error + // at the stream level, as otherwise the code using this function + // would never know that something went wrong and would continue + // calling it again and again, resulting in an infinite loop. + m_input.Reset(wxSTREAM_READ_ERROR); + break; + } + if (EatEOL(c)) break;