From b98bf337241aaf03c2287e447488ca9849a41a12 Mon Sep 17 00:00:00 2001 From: jyelon Date: Tue, 30 May 2023 23:35:54 -0400 Subject: [PATCH] Rename some unicode functions and document better --- luprex/cpp/drv/driver-linux.cpp | 4 ++-- luprex/cpp/drv/driver.cpp | 6 +++--- luprex/cpp/drv/drvutil.cpp | 4 ++-- luprex/cpp/drv/drvutil.hpp | 10 ++++++++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/luprex/cpp/drv/driver-linux.cpp b/luprex/cpp/drv/driver-linux.cpp index 0b43e456..23cbba86 100644 --- a/luprex/cpp/drv/driver-linux.cpp +++ b/luprex/cpp/drv/driver-linux.cpp @@ -221,7 +221,7 @@ static int socket_poll(struct pollfd *pollvec, int pollcount, int mstimeout, std // Write unicode onto the console. static void console_write(const std::u32string &cps) { - std::string utf8 = drvutil::to_utf8(cps); + std::string utf8 = drvutil::utf32_to_utf8(cps); write(1, utf8.c_str(), utf8.size()); } @@ -231,7 +231,7 @@ static std::u32string console_read() { int nread = read(0, buffer, 512); if (nread > 0) { std::string_view s(buffer, nread); - result = drvutil::from_utf8(s, nullptr); + result = drvutil::utf8_to_utf32(s, nullptr); } return result; } diff --git a/luprex/cpp/drv/driver.cpp b/luprex/cpp/drv/driver.cpp index 61ca74a0..6a9705b4 100644 --- a/luprex/cpp/drv/driver.cpp +++ b/luprex/cpp/drv/driver.cpp @@ -206,7 +206,7 @@ class Driver { if (ndata > DRV_SHORTSTRING_SIZE) ndata = DRV_SHORTSTRING_SIZE; std::string_view src(data, ndata); int consumed; - std::u32string cps = drvutil::from_utf8(src, &consumed); + std::u32string cps = drvutil::utf8_to_utf32(src, &consumed); readline_device_.print(cps); engw.play_sent_outgoing(&engw, 0, consumed); } @@ -217,7 +217,7 @@ class Driver { uint32_t promptlen; const char *promptdata; engw.get_console_prompt(&engw, &promptlen, &promptdata); - std::u32string prompt = drvutil::from_utf8(std::string_view(promptdata, promptlen), nullptr); + std::u32string prompt = drvutil::utf8_to_utf32(std::string_view(promptdata, promptlen), nullptr); readline_device_.set_prompt(prompt); while (true) { std::u32string cps = console_read(); @@ -226,7 +226,7 @@ class Driver { for (char32_t c : cps) { std::u32string line = readline_device_.putcode(c); if (!line.empty()) { - std::string utf8 = drvutil::to_utf8(line); + std::string utf8 = drvutil::utf32_to_utf8(line); engw.play_recv_incoming(&engw, 0, utf8.size(), utf8.c_str()); } } diff --git a/luprex/cpp/drv/drvutil.cpp b/luprex/cpp/drv/drvutil.cpp index 7ab53739..18544805 100644 --- a/luprex/cpp/drv/drvutil.cpp +++ b/luprex/cpp/drv/drvutil.cpp @@ -163,7 +163,7 @@ static int32_t read_codepoint_utf8(std::string_view &source) { return codepoint; } -std::string to_utf8(const std::u32string &s) { +std::string utf32_to_utf8(const std::u32string &s) { std::string result(s.size() * 4, 0); char *buffer = &result[0]; int len = 0; @@ -174,7 +174,7 @@ std::string to_utf8(const std::u32string &s) { return result.substr(0, len); } -std::u32string from_utf8(std::string_view s, int *consumed) { +std::u32string utf8_to_utf32(std::string_view s, int *consumed) { std::string_view rest = s; std::u32string result(s.size(), 0); int len = 0; diff --git a/luprex/cpp/drv/drvutil.hpp b/luprex/cpp/drv/drvutil.hpp index 5ef8484b..c59b0d27 100644 --- a/luprex/cpp/drv/drvutil.hpp +++ b/luprex/cpp/drv/drvutil.hpp @@ -53,7 +53,7 @@ bool is_single_wchar_t(char32_t c); // Convert a codepoint string into a UTF8-string. // If the codepoint string contains invalid codepoints, they're silently dropped. // -std::string to_utf8(const std::u32string &cps); +std::string utf32_to_utf8(const std::u32string &cps); // Convert a UTF8 string to a codepoint string. // @@ -61,10 +61,16 @@ std::string to_utf8(const std::u32string &cps); // Some of the bytes may not be consumed, if the source ends with an unfinished // utf-8 sequence. Returns the Codepoint string and the number of bytes consumed. // -std::u32string from_utf8(std::string_view source, int *consumed); +std::u32string utf8_to_utf32(std::string_view source, int *consumed); // Convert a UTF8 string to a UCS-2 string. // +// If the UTF8 string contains invalid sequences, they're silently dropped. +// Some of the bytes may not be consumed, if the source ends with an unfinished +// utf-8 sequence. Returns the UCS-2 string and the number of bytes consumed. +// Of course, UCS-2 can't represent all of unicode, so this is lossy. +// Any character that can't be represented is replaced with a box. +// std::u16string utf8_to_ucs2(std::string_view source, int *consumed); // Get a system error message, in an OS-independent manner.