From 314630945a6f47ad195fa9d825ab6fa99626dc69 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sat, 13 Feb 2016 03:59:43 +0100 Subject: [PATCH] Fix wxURI::Unescape() to work with Unicode strings Such strings are not really URIs, as they should have been encoded if they were, but we can obtain them from e.g. wxFileSystem::FindFirst(), so handle them correctly here as it's simpler than checking all the places where Unescape() is called. Add a unit test checking that decoding a URI containing both Unicode and percent-encoded Unicode characters works correctly. --- docs/changes.txt | 1 + include/wx/uri.h | 5 ----- src/common/uri.cpp | 54 ++++++++++++++++----------------------------- tests/uris/uris.cpp | 9 ++++++++ 4 files changed, 29 insertions(+), 40 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index e6b20ccd1c..d16735c81a 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -63,6 +63,7 @@ All: - Add UTF-8 and ZIP 64 support to wxZip{Input,Output}Stream (Tobias Taschner). - Upgrade libpng to 1.6.21 fixing several security bugs (Paul Kulchenko). +- Fix handling of Unicode file names in wxFileSystem::FindFirst(). - Add wxStandardPaths::GetUserDir() (Tobias Taschner). - Allow calling wxItemContainer::Add() and similar with std::vector<> argument. - Add "%z" support to printf()-like functions like wxString::Format() (RIVDSL). 
diff --git a/include/wx/uri.h b/include/wx/uri.h index 11bbd770f6..828428e954 100644 --- a/include/wx/uri.h +++ b/include/wx/uri.h @@ -137,11 +137,6 @@ protected: static bool ParseIPv6address(const char*& uri); static bool ParseIPvFuture(const char*& uri); - // should be called with i pointing to '%', returns the encoded character - // following it or -1 if invalid and advances i past it (so that it points - // to the last character consumed on return) - static int DecodeEscape(wxString::const_iterator& i); - // append next character pointer to by p to the string in an escaped form // and advance p past it // diff --git a/src/common/uri.cpp b/src/common/uri.cpp index f5011f737c..60a2128679 100644 --- a/src/common/uri.cpp +++ b/src/common/uri.cpp @@ -100,38 +100,32 @@ int wxURI::CharToHex(char c) return -1; } -int wxURI::DecodeEscape(wxString::const_iterator& i) -{ - int hi = CharToHex(*++i); - if ( hi == -1 ) - return -1; - - int lo = CharToHex(*++i); - if ( lo == -1 ) - return -1; - - return (hi << 4) | lo; -} - /* static */ wxString wxURI::Unescape(const wxString& uri) { + // URIs can contain escaped 8-bit characters that have to be decoded using + // UTF-8 (see RFC 3986), however in our (probably broken...) case we can + // also end up with not escaped Unicode characters in the URI string which + // can't be decoded as UTF-8. So what we do here is to encode all Unicode + // characters as UTF-8 only to decode them back below. This is obviously + // inefficient but there doesn't seem to be anything else to do, other than + // not allowing to mix Unicode characters with escapes in the first place, + // but this seems to be done in a lot of places, unfortunately. 
+ const wxScopedCharBuffer& uriU8(uri.utf8_str()); + const size_t len = uriU8.length(); + // the unescaped version can't be longer than the original one - wxCharBuffer buf(uri.length()); + wxCharBuffer buf(uriU8.length()); char *p = buf.data(); - for ( wxString::const_iterator i = uri.begin(); i != uri.end(); ++i, ++p ) + const char* const end = uriU8.data() + len; + for ( const char* s = uriU8.data(); s != end; ++s, ++p ) { - char c = *i; - if ( c == '%' ) + char c = *s; + if ( c == '%' && s < end - 2 && IsHex(s[1]) && IsHex(s[2]) ) { - int n = wxURI::DecodeEscape(i); - if ( n == -1 ) - return wxString(); - - wxASSERT_MSG( n >= 0 && n <= 0xff, "unexpected character value" ); - - c = static_cast(n); + c = (CharToHex(s[1]) << 4) | CharToHex(s[2]); + s += 2; } *p = c; @@ -139,17 +133,7 @@ wxString wxURI::Unescape(const wxString& uri) *p = '\0'; - // by default assume that the URI is in UTF-8, this is the most common - // practice - wxString s = wxString::FromUTF8(buf); - if ( s.empty() ) - { - // if it isn't, use latin-1 as a fallback -- at least this always - // succeeds - s = wxCSConv(wxFONTENCODING_ISO8859_1).cMB2WC(buf); - } - - return s; + return wxString::FromUTF8(buf); } void wxURI::AppendNextEscaped(wxString& s, const char *& p) diff --git a/tests/uris/uris.cpp b/tests/uris/uris.cpp index 65eede5a02..7b88306b58 100644 --- a/tests/uris/uris.cpp +++ b/tests/uris/uris.cpp @@ -338,6 +338,15 @@ void URITestCase::Unescaping() "\xD1\x87\xD0\xB8\xD1\x81\xD0\xBB\xD0\xBE" ), unescaped ); + + escaped = L"file://\u043C\u043E\u0439%5C%d1%84%d0%b0%d0%b9%d0%bb"; + unescaped = wxURI::Unescape(escaped); + + CPPUNIT_ASSERT_EQUAL + ( + L"file://\u043C\u043E\u0439\\\u0444\u0430\u0439\u043B", + unescaped + ); #endif // wxUSE_UNICODE }