From 16b22128bfc7fe86603f67ef163507c05034ebeb Mon Sep 17 00:00:00 2001 From: Weiyi Wang Date: Sat, 22 Sep 2018 00:45:50 -0400 Subject: [PATCH 1/3] string_util: remove ShiftJIS/CP1252 conversion function We always use Unicode internally. Any dirty work of conversion with other codecs should be handled by the frontend framework (Qt). Furthermore, ShiftJIS/CP1252 are not special (they are not code sets used by the 3DS, or by any guest/host dependencies we have), so there is no reason to specifically include them. --- src/common/string_util.cpp | 19 ------------------- src/common/string_util.h | 3 --- 2 files changed, 22 deletions(-) diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 8bbfb60a3..b269923b3 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -257,14 +257,6 @@ std::wstring UTF8ToUTF16W(const std::string& input) { return CPToUTF16(CP_UTF8, input); } -std::string SHIFTJISToUTF8(const std::string& input) { - return UTF16ToUTF8(CPToUTF16(932, input)); -} - -std::string CP1252ToUTF8(const std::string& input) { - return UTF16ToUTF8(CPToUTF16(1252, input)); -} - #else template @@ -364,17 +356,6 @@ std::string UTF16ToUTF8(const std::u16string& input) { return CodeToUTF8("UTF-16LE", input); } -std::string CP1252ToUTF8(const std::string& input) { - // return CodeToUTF8("CP1252//TRANSLIT", input); - // return CodeToUTF8("CP1252//IGNORE", input); - return CodeToUTF8("CP1252", input); -} - -std::string SHIFTJISToUTF8(const std::string& input) { - // return CodeToUTF8("CP932", input); - return CodeToUTF8("SJIS", input); -} - #endif std::string StringFromFixedZeroTerminatedBuffer(const char* buffer, std::size_t max_len) { diff --git a/src/common/string_util.h b/src/common/string_util.h index c3a0504ff..20a0ceb19 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -70,9 +70,6 @@ std::string ReplaceAll(std::string result, const std::string& src, const std::st std::string UTF16ToUTF8(const std::u16string& input); 
std::u16string UTF8ToUTF16(const std::string& input); -std::string CP1252ToUTF8(const std::string& str); -std::string SHIFTJISToUTF8(const std::string& str); - #ifdef _WIN32 std::string UTF16ToUTF8(const std::wstring& input); std::wstring UTF8ToUTF16W(const std::string& str); From e1765e7c5cfc7509224eef2584edc8ebfd322834 Mon Sep 17 00:00:00 2001 From: Weiyi Wang Date: Sat, 22 Sep 2018 00:52:38 -0400 Subject: [PATCH 2/3] string_util: remove TString conversion for windows First of all, they are fundamentally broken. As our convention is that std::string is always UTF-8, these functions assume that the multi-byte character version of TString (std::string) from Windows is also in UTF-8, which is almost always wrong. We are not going to make a multi-byte character build, and even if we do, this dirty work should be handled by the frontend framework early. --- src/common/file_util.h | 2 +- src/common/string_util.h | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/src/common/file_util.h b/src/common/file_util.h index 4b04edb3e..878fbfedc 100644 --- a/src/common/file_util.h +++ b/src/common/file_util.h @@ -264,7 +264,7 @@ private: template void OpenFStream(T& fstream, const std::string& filename, std::ios_base::openmode openmode) { #ifdef _MSC_VER - fstream.open(Common::UTF8ToTStr(filename).c_str(), openmode); + fstream.open(Common::UTF8ToUTF16W(filename).c_str(), openmode); #else fstream.open(filename.c_str(), openmode); #endif diff --git a/src/common/string_util.h b/src/common/string_util.h index 20a0ceb19..091735d5f 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -74,24 +74,6 @@ std::u16string UTF8ToUTF16(const std::string& input); std::string UTF16ToUTF8(const std::wstring& input); std::wstring UTF8ToUTF16W(const std::string& str); -#ifdef _UNICODE -inline std::string TStrToUTF8(const std::wstring& str) { - return UTF16ToUTF8(str); -} - -inline std::wstring UTF8ToTStr(const std::string& str) { - return 
UTF8ToUTF16W(str); -} -#else -inline std::string TStrToUTF8(const std::string& str) { - return str; -} - -inline std::string UTF8ToTStr(const std::string& str) { - return str; -} -#endif - #endif /** From e087cb9a28886b46b389008438a3e65d68c26ffc Mon Sep 17 00:00:00 2001 From: Weiyi Wang Date: Sat, 22 Sep 2018 01:03:47 -0400 Subject: [PATCH 3/3] string_util: unify UTF8<->UTF16 conversion to codecvt --- CMakeLists.txt | 8 --- src/common/string_util.cpp | 115 ++----------------------------------- 2 files changed, 6 insertions(+), 117 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2797d182e..a8b693bd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -281,14 +281,6 @@ ELSEIF (CMAKE_SYSTEM_NAME MATCHES "^(Linux|kFreeBSD|GNU|SunOS)$") set(PLATFORM_LIBRARIES rt) ENDIF (APPLE) -# MINGW: GCC does not support codecvt, so use iconv instead -if (UNIX OR MINGW) - find_library(ICONV_LIBRARY NAMES iconv) - if (ICONV_LIBRARY) - list(APPEND PLATFORM_LIBRARIES ${ICONV_LIBRARY}) - endif() -endif() - # Setup a custom clang-format target (if clang-format can be found) that will run # against all the src files. This should be used before making a pull request. 
# ======================================================================= diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index b269923b3..c556d5040 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -13,11 +14,7 @@ #include "common/string_util.h" #ifdef _WIN32 -#include #include -#include "common/common_funcs.h" -#else -#include #endif namespace Common { @@ -191,11 +188,9 @@ std::string ReplaceAll(std::string result, const std::string& src, const std::st return result; } -#ifdef _WIN32 - std::string UTF16ToUTF8(const std::u16string& input) { -#if _MSC_VER >= 1900 - // Workaround for missing char16_t/char32_t instantiations in MSVC2015 +#ifdef _MSC_VER + // Workaround for missing char16_t/char32_t instantiations in MSVC2017 std::wstring_convert, __int16> convert; std::basic_string<__int16> tmp_buffer(input.cbegin(), input.cend()); return convert.to_bytes(tmp_buffer); @@ -206,8 +201,8 @@ std::string UTF16ToUTF8(const std::u16string& input) { } std::u16string UTF8ToUTF16(const std::string& input) { -#if _MSC_VER >= 1900 - // Workaround for missing char16_t/char32_t instantiations in MSVC2015 +#ifdef _MSC_VER + // Workaround for missing char16_t/char32_t instantiations in MSVC2017 std::wstring_convert, __int16> convert; auto tmp_buffer = convert.from_bytes(input); return std::u16string(tmp_buffer.cbegin(), tmp_buffer.cend()); @@ -217,6 +212,7 @@ std::u16string UTF8ToUTF16(const std::string& input) { #endif } +#ifdef _WIN32 static std::wstring CPToUTF16(u32 code_page, const std::string& input) { const auto size = MultiByteToWideChar(code_page, 0, input.data(), static_cast(input.size()), nullptr, 0); @@ -257,105 +253,6 @@ std::wstring UTF8ToUTF16W(const std::string& input) { return CPToUTF16(CP_UTF8, input); } -#else - -template -static std::string CodeToUTF8(const char* fromcode, const std::basic_string& input) { - iconv_t const conv_desc = 
iconv_open("UTF-8", fromcode); - if ((iconv_t)(-1) == conv_desc) { - LOG_ERROR(Common, "Iconv initialization failure [{}]: {}", fromcode, strerror(errno)); - iconv_close(conv_desc); - return {}; - } - - const std::size_t in_bytes = sizeof(T) * input.size(); - // Multiply by 4, which is the max number of bytes to encode a codepoint - const std::size_t out_buffer_size = 4 * in_bytes; - - std::string out_buffer(out_buffer_size, '\0'); - - auto src_buffer = &input[0]; - std::size_t src_bytes = in_bytes; - auto dst_buffer = &out_buffer[0]; - std::size_t dst_bytes = out_buffer.size(); - - while (0 != src_bytes) { - std::size_t const iconv_result = - iconv(conv_desc, (char**)(&src_buffer), &src_bytes, &dst_buffer, &dst_bytes); - - if (static_cast(-1) == iconv_result) { - if (EILSEQ == errno || EINVAL == errno) { - // Try to skip the bad character - if (0 != src_bytes) { - --src_bytes; - ++src_buffer; - } - } else { - LOG_ERROR(Common, "iconv failure [{}]: {}", fromcode, strerror(errno)); - break; - } - } - } - - std::string result; - out_buffer.resize(out_buffer_size - dst_bytes); - out_buffer.swap(result); - - iconv_close(conv_desc); - - return result; -} - -std::u16string UTF8ToUTF16(const std::string& input) { - iconv_t const conv_desc = iconv_open("UTF-16LE", "UTF-8"); - if ((iconv_t)(-1) == conv_desc) { - LOG_ERROR(Common, "Iconv initialization failure [UTF-8]: {}", strerror(errno)); - iconv_close(conv_desc); - return {}; - } - - const std::size_t in_bytes = sizeof(char) * input.size(); - // Multiply by 4, which is the max number of bytes to encode a codepoint - const std::size_t out_buffer_size = 4 * sizeof(char16_t) * in_bytes; - - std::u16string out_buffer(out_buffer_size, char16_t{}); - - char* src_buffer = const_cast(&input[0]); - std::size_t src_bytes = in_bytes; - char* dst_buffer = (char*)(&out_buffer[0]); - std::size_t dst_bytes = out_buffer.size(); - - while (0 != src_bytes) { - std::size_t const iconv_result = - iconv(conv_desc, &src_buffer, &src_bytes, 
&dst_buffer, &dst_bytes); - - if (static_cast(-1) == iconv_result) { - if (EILSEQ == errno || EINVAL == errno) { - // Try to skip the bad character - if (0 != src_bytes) { - --src_bytes; - ++src_buffer; - } - } else { - LOG_ERROR(Common, "iconv failure [UTF-8]: {}", strerror(errno)); - break; - } - } - } - - std::u16string result; - out_buffer.resize(out_buffer_size - dst_bytes); - out_buffer.swap(result); - - iconv_close(conv_desc); - - return result; -} - -std::string UTF16ToUTF8(const std::u16string& input) { - return CodeToUTF8("UTF-16LE", input); -} - #endif std::string StringFromFixedZeroTerminatedBuffer(const char* buffer, std::size_t max_len) {