diff --git a/luprex/cpp/drv/drvutil.cpp b/luprex/cpp/drv/drvutil.cpp
index 18544805..50c8fee2 100644
--- a/luprex/cpp/drv/drvutil.cpp
+++ b/luprex/cpp/drv/drvutil.cpp
@@ -109,6 +109,45 @@ static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
     }
 }
 
+// Read one codepoint from the front of a UTF16 string view.
+//
+// Consumes the code units it decodes. Returns the codepoint, or -1 if the
+// source is empty, or -2 for an invalid sequence (an unpaired surrogate).
+// An unpaired high surrogate is consumed, but the unit that follows it is
+// left in place so that the next call can decode it.
+//
+static int32_t read_codepoint_utf16(std::u16string_view &source) {
+    if (source.empty()) return -1;
+
+    // Index the view directly rather than reading through a uint16_t
+    // pointer: char16_t is unsigned, so the conversion is value-preserving.
+    int32_t word0 = static_cast<int32_t>(source[0]);
+    source.remove_prefix(1);
+
+    if (word0 < 0xD800) {
+        return word0;
+    } else if (word0 < 0xDC00) {
+        // High surrogate: a low surrogate must follow.
+        if (source.empty()) {
+            return -2;
+        }
+        int32_t word1 = static_cast<int32_t>(source[0]);
+        if ((word1 < 0xDC00)||(word1 > 0xDFFF)) {
+            return -2;
+        }
+        int32_t part1 = word0 & 0x3FF;
+        int32_t part2 = word1 & 0x3FF;
+        int32_t result = ((part1 << 10) | part2) + 0x10000;
+        source.remove_prefix(1);
+        return result;
+    } else if (word0 < 0xE000) {
+        // Low surrogate with no preceding high surrogate.
+        return -2;
+    } else {
+        return word0;
+    }
+}
+
 static int32_t read_codepoint_utf8(std::string_view &source) {
     size_t size = source.size();
     if (size == 0) return -1;
@@ -218,6 +257,25 @@ std::u16string utf8_to_ucs2(std::string_view s, int *consumed) {
     return result.substr(0, len);
 }
 
+// Convert a UTF16 string to a UTF8 string.
+//
+// Invalid sequences (unpaired surrogates) are silently dropped.
+//
+std::string utf16_to_utf8(std::u16string_view s) {
+    // Size the buffer at four bytes per UTF16 code unit, then trim to len.
+    std::string result(s.size() * 4, 0);
+    int len = 0;
+    while (true) {
+        int codepoint = read_codepoint_utf16(s);
+        if (codepoint == -1) break;
+        if (codepoint < 0) continue;
+        len += buffer_codepoint_utf8(codepoint, &result[len]);
+    }
+    // Shrink in place instead of copying the prefix out with substr().
+    result.resize(len);
+    return result;
+}
+
 static std::vector parse_control_lst(std::string_view ctrl) {
     std::vector result;
     while (!ctrl.empty()) {
diff --git a/luprex/cpp/drv/drvutil.hpp b/luprex/cpp/drv/drvutil.hpp
index c59b0d27..c670c238 100644
--- a/luprex/cpp/drv/drvutil.hpp
+++ b/luprex/cpp/drv/drvutil.hpp
@@ -55,7 +55,7 @@ bool is_single_wchar_t(char32_t c);
 //
 std::string utf32_to_utf8(const std::u32string &cps);
 
-// Convert a UTF8 string to a codepoint string.
+// Convert a UTF8 string to a UTF32 string.
 //
 // If the UTF8 string contains invalid sequences, they're silently dropped.
 // Some of the bytes may not be consumed, if the source ends with an unfinished
@@ -73,6 +73,12 @@ std::u32string utf8_to_utf32(std::string_view source, int *consumed);
 //
 std::u16string utf8_to_ucs2(std::string_view source, int *consumed);
 
+// Convert a UTF16 string to a UTF8 string.
+//
+// If the UTF16 string contains invalid sequences, they're silently dropped.
+//
+std::string utf16_to_utf8(std::u16string_view source);
+
 // Get a system error message, in an OS-independent manner.
 //
 // These versions of strerror is thread-safe, and it never fails