Merge branch 'utf8-text-stream'

Really fix reading from UTF-8 text streams. Closes #14720. See https://github.com/wxWidgets/wxWidgets/pull/1304
2019-05-10 01:35:55 +02:00 · 2019-05-10 01:35:55 +02:00 · 04689e9727
commit 04689e9727
parent 998097b3a4 f28224dfad
6 changed files with 87 additions and 10 deletions
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@ -32,6 +32,8 @@ class WXDLLIMPEXP_FWD_BASE wxString;
 // don't let the fact that the existing classes implement MB2WC/WC2MB() instead
 // confuse you.
 //
+// For many encodings you must override GetMaxCharLen().
+//
 // You also have to implement Clone() to allow copying the conversions
 // polymorphically.
 //
@ -118,6 +120,10 @@ public:
    wxWCharBuffer cWX2WC(const char *psz) const { return cMB2WC(psz); }
 #endif // Unicode/ANSI

+    // return the maximum number of bytes that can be required to encode a
+    // single character in this encoding, e.g. 4 for UTF-8
+    virtual size_t GetMaxCharLen() const { return 1; }
+
    // this function is used in the implementation of cMB2WC() to distinguish
    // between the following cases:
    //
@ -254,6 +260,8 @@ public:
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;

+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
+
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF7; }

 private:
@ -341,6 +349,8 @@ public:
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;

+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
+
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvStrictUTF8(); }

    // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
@ -365,6 +375,8 @@ public:
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;

+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
+
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF8(m_options); }

    // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
@ -405,6 +417,7 @@ public:
                           const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16LE; }
 };

@ -419,6 +432,7 @@ public:
                           const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16BE; }
 };

@ -451,6 +465,7 @@ public:
                           const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32LE; }
 };

@ -465,6 +480,7 @@ public:
                           const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32BE; }
 };

@ -566,6 +582,10 @@ public:
    FromWChar(char *dst, size_t dstLen,
              const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;

+    // Use the value for UTF-8 here to make sure we try to decode up to 4 bytes
+    // as UTF-8 before giving up.
+    virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
+
    virtual wxMBConv *Clone() const wxOVERRIDE
    {
        return new wxWhateverWorksConv();
--- a/interface/wx/strconv.h
+++ b/interface/wx/strconv.h
@ -48,6 +48,26 @@ public:
    */
    virtual wxMBConv* Clone() const = 0;

+    /**
+        This function must be overridden in the derived classes to return the
+        maximum length, in bytes, of a single Unicode character representation
+        in this encoding.
+
+        As a consequence, the conversion object must be able to decode any
+        valid sequence of bytes in the corresponding encoding if it's at least
+        that many bytes long, but may fail if it is shorter. For example, for
+        UTF-8 the maximum character length is 4, as 3 bytes or less may be
+        insufficient to represent a Unicode character in UTF-8, but 4 are
+        always enough.
+
+        For compatibility reasons, this method is not pure virtual and returns
+        1 by default in the base class, however it should be always overridden
+        in the derived classes.
+
+        @since 3.1.3
+     */
+    virtual size_t GetMaxCharLen() const;
+
    /**
        This function returns 1 for most of the multibyte encodings in which the
        string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@ -309,6 +309,13 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
    size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
    if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
    {
+        // we may need more bytes before we can decode the input, don't switch
+        // to the fall-back conversion in this case as it would prevent us from
+        // decoding UTF-8 input when fed it byte by byte, as done by
+        // wxTextInputStream, for example
+        if ( srcLen < m_conv->GetMaxCharLen() )
+            return wxCONV_FAILED;
+
        // if the conversion failed but we didn't really detect anything and
        // simply tried UTF-8 by default, retry it using the fall-back
        if ( m_encDefault != wxFONTENCODING_MAX )
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@ -303,12 +303,24 @@ wxString wxTextInputStream::ReadLine()
 {
    wxString line;

-    while ( !m_input.Eof() )
+    for ( ;; )
    {
        wxChar c = GetChar();
-        if (!c)
+        if ( m_input.Eof() )
            break;

+        if (!c)
+        {
+            // If we failed to get a character and the stream is not at EOF, it
+            // can only mean that decoding the stream contents using our
+            // conversion object failed. In this case, we must signal an error
+            // at the stream level, as otherwise the code using this function
+            // would never know that something went wrong and would continue
+            // calling it again and again, resulting in an infinite loop.
+            m_input.Reset(wxSTREAM_READ_ERROR);
+            break;
+        }
+
        if (EatEOL(c))
            break;

--- a/src/common/utilscmn.cpp
+++ b/src/common/utilscmn.cpp
@ -613,14 +613,7 @@ static bool ReadAll(wxInputStream *is, wxArrayString& output)
    // the stream could be already at EOF or in wxSTREAM_BROKEN_PIPE state
    is->Reset();

-    // Notice that wxTextInputStream doesn't work correctly with wxConvAuto
-    // currently, see #14720, so use the current locale conversion explicitly
-    // under assumption that any external program should be using it too.
-    wxTextInputStream tis(*is, " \t"
-#if wxUSE_UNICODE
-                                    , wxConvLibc
-#endif
-                                                );
+    wxTextInputStream tis(*is);

    for ( ;; )
    {
--- a/tests/exec/exec.cpp
+++ b/tests/exec/exec.cpp
@ -516,3 +516,28 @@ void ExecTestCase::TestOverlappedSyncExecute()
    CPPUNIT_ASSERT_EQUAL( SLEEP_END_STRING, longSleepOutput.Last() );
 #endif // !__WINDOWS__
 }
+
+#ifdef __UNIX__
+
+// This test is disabled by default because it must be run in French locale,
+// i.e. with explicit LC_ALL=fr_FR.UTF-8 and only works with GNU ls, which
+// produces the expected output.
+TEST_CASE("wxExecute::RedirectUTF8", "[exec][unicode][.]")
+{
+    wxArrayString output;
+    REQUIRE( wxExecute("/bin/ls --version", output) == 0 );
+
+    for ( size_t n = 0; n < output.size(); ++n )
+    {
+        // It seems unlikely that this part of the output will change for GNU
+        // ls, so check for its presence as a sign that the program output was
+        // decoded correctly.
+        if ( output[n].find(wxString::FromUTF8("vous \xc3\xaates libre")) != wxString::npos )
+            return;
+    }
+
+    INFO("output was:\n" << wxJoin(output, '\n'));
+    FAIL("Expected output fragment not found.");
+}
+
+#endif // __UNIX__