Fix recognizing locales using UTF-8 charset

Do not assume that C locale uses UTF-8, as this is simply not true and
none of the CRT functions handle UTF-8 correctly with this locale.

Do recognize locales explicitly using UTF-8 charset as being in UTF-8.
On most Unix systems (including Linux), not doing this didn't really
matter, because nl_langinfo() is used there. It does matter a lot with
MSVC under MSW: its CRT supports UTF-8 now, but the UTF-8 functions
were not being used there -- do use them now.

(cherry picked from commit a1d289fe3ea74aa1c713e0f02f5fd5f83810af58)
This commit is contained in:
Vadim Zeitlin 2023-03-28 15:49:00 +01:00
parent 759331e839
commit 2c2d9fd909

View File

@ -1065,6 +1065,19 @@ char *strdup(const char *s)
bool wxLocaleIsUtf8 = false; // the safer setting if not known
#endif
// Check whether the given charset name denotes UTF-8.
//
// The name is compared ignoring (ASCII) case and with an optional hyphen
// before the digit, so "UTF-8", "utf-8", "UTF8", "utf8" as well as mixed
// case variants such as "Utf-8" are all recognized. The whole string must
// match: names with anything after the '8' (e.g. "UTF-80") are rejected.
//
// Returns false for a null or empty string.
static bool wxIsCharsetUtf8(const char* charset)
{
    if ( !charset )
        return false;

    // Fold case by hand rather than using locale-dependent functions: the
    // result must not depend on the very locale being examined.
    const char* const prefix = "utf";
    for ( int i = 0; prefix[i] != '\0'; ++i )
    {
        char c = charset[i];
        if ( c >= 'A' && c <= 'Z' )
            c = (char)(c - 'A' + 'a');

        // This also stops at the terminating NUL of a too short string, so
        // we never read beyond the end of the input.
        if ( c != prefix[i] )
            return false;
    }

    const char* rest = charset + 3;

    // The hyphen is optional.
    if ( *rest == '-' )
        ++rest;

    return rest[0] == '8' && rest[1] == '\0';
}
static bool wxIsLocaleUtf8()
{
// NB: we intentionally don't use wxLocale::GetSystemEncodingName(),
@ -1075,31 +1088,28 @@ static bool wxIsLocaleUtf8()
// GNU libc provides current character set this way (this conforms to
// Unix98)
const char *charset = nl_langinfo(CODESET);
if ( charset )
{
// "UTF-8" is used by modern glibc versions, but test other variants
// as well, just in case:
if ( strcmp(charset, "UTF-8") == 0 ||
strcmp(charset, "utf-8") == 0 ||
strcmp(charset, "UTF8") == 0 ||
strcmp(charset, "utf8") == 0 )
{
if ( charset && wxIsCharsetUtf8(charset) )
return true;
}
}
#endif // HAVE_LANGINFO_H
// check LC_CTYPE string: this also works with (sufficiently recent) MSVC
// and on any other system without nl_langinfo()
const char *lc_ctype = setlocale(LC_CTYPE, NULL);
if ( lc_ctype )
{
// check if we're running under the "C" locale: it is 7bit subset
// of UTF-8, so it can be safely used with the UTF-8 build:
const char *lc_ctype = setlocale(LC_CTYPE, NULL);
if ( lc_ctype &&
(strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
{
if ( (strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
return true;
// any other locale can also use UTF-8 encoding if it's explicitly
// specified
const char* charset = strrchr(lc_ctype, '.');
if ( charset && wxIsCharsetUtf8(charset + 1) )
return true;
}
// we don't know what charset libc is using, so assume the worst
// to be safe:
// by default assume that we don't use UTF-8
return false;
}