Fix recognizing locales using UTF-8 charset

Do not assume that the C locale uses UTF-8: this is simply not true, and none of the CRT functions handle UTF-8 correctly with this locale. Do recognize locales that explicitly specify the UTF-8 charset as using UTF-8. On most Unix systems (including Linux), not recognizing them didn't really matter because nl_langinfo() was used there, but it matters a lot with MSVC under MSW: its CRT supports UTF-8 now, yet the UTF-8 functions were not being used there -- do use them now. (cherry picked from commit a1d289fe3ea74aa1c713e0f02f5fd5f83810af58)
2023-03-28 15:49:00 +01:00 · 2023-03-28 15:49:00 +01:00 · 2c2d9fd909
commit 2c2d9fd909
parent 759331e839
1 changed files with 29 additions and 19 deletions
--- a/src/common/wxcrt.cpp
+++ b/src/common/wxcrt.cpp
@ -1065,6 +1065,19 @@ char *strdup(const char *s)
 bool wxLocaleIsUtf8 = false; // the safer setting if not known
 #endif

+static bool wxIsCharsetUtf8(const char* charset)
+{
+    if ( strcmp(charset, "UTF-8") == 0 ||
+         strcmp(charset, "utf-8") == 0 ||
+         strcmp(charset, "UTF8") == 0 ||
+         strcmp(charset, "utf8") == 0 )
+    {
+        return true;
+    }
+
+    return false;
+}
+
 static bool wxIsLocaleUtf8()
 {
    // NB: we intentionally don't use wxLocale::GetSystemEncodingName(),
@ -1075,31 +1088,28 @@ static bool wxIsLocaleUtf8()
    // GNU libc provides current character set this way (this conforms to
    // Unix98)
    const char *charset = nl_langinfo(CODESET);
-    if ( charset )
-    {
-        // "UTF-8" is used by modern glibc versions, but test other variants
-        // as well, just in case:
-        if ( strcmp(charset, "UTF-8") == 0 ||
-             strcmp(charset, "utf-8") == 0 ||
-             strcmp(charset, "UTF8") == 0 ||
-             strcmp(charset, "utf8") == 0 )
-        {
+    if ( charset && wxIsCharsetUtf8(charset) )
        return true;
-        }
-    }
 #endif // HAVE_LANGINFO_H

+    // check LC_CTYPE string: this also works with (sufficiently recent) MSVC
+    // and on any other system without nl_langinfo()
+    const char *lc_ctype = setlocale(LC_CTYPE, NULL);
+    if ( lc_ctype )
+    {
        // check if we're running under the "C" locale: it is 7bit subset
        // of UTF-8, so it can be safely used with the UTF-8 build:
-    const char *lc_ctype = setlocale(LC_CTYPE, NULL);
-    if ( lc_ctype &&
-         (strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
-    {
+        if ( (strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
+            return true;
+
+        // any other locale can also use UTF-8 encoding if it's explicitly
+        // specified
+        const char* charset = strrchr(lc_ctype, '.');
+        if ( charset && wxIsCharsetUtf8(charset + 1) )
            return true;
    }

-    // we don't know what charset libc is using, so assume the worst
-    // to be safe:
+    // by default assume that we don't use UTF-8
    return false;
 }