Rename some Unicode conversion functions and document them better

2023-05-30 23:35:54 -04:00
parent 54125c9c8c
commit b98bf33724
4 changed files with 15 additions and 9 deletions
--- a/luprex/cpp/drv/driver-linux.cpp
+++ b/luprex/cpp/drv/driver-linux.cpp
@@ -221,7 +221,7 @@ static int socket_poll(struct pollfd *pollvec, int pollcount, int mstimeout, std

 // Write unicode onto the console.
 static void console_write(const std::u32string &cps) {
-    std::string utf8 = drvutil::to_utf8(cps);
+    std::string utf8 = drvutil::utf32_to_utf8(cps);
    write(1, utf8.c_str(), utf8.size());
 }

@@ -231,7 +231,7 @@ static std::u32string console_read() {
    int nread = read(0, buffer, 512);
    if (nread > 0) {
        std::string_view s(buffer, nread);
-        result = drvutil::from_utf8(s, nullptr);
+        result = drvutil::utf8_to_utf32(s, nullptr);
    }
    return result;
 }
--- a/luprex/cpp/drv/driver.cpp
+++ b/luprex/cpp/drv/driver.cpp
@@ -206,7 +206,7 @@ class Driver {
            if (ndata > DRV_SHORTSTRING_SIZE) ndata = DRV_SHORTSTRING_SIZE;
            std::string_view src(data, ndata);
            int consumed;
-            std::u32string cps = drvutil::from_utf8(src, &consumed);
+            std::u32string cps = drvutil::utf8_to_utf32(src, &consumed);
            readline_device_.print(cps);
            engw.play_sent_outgoing(&engw, 0, consumed);
        }
@@ -217,7 +217,7 @@ class Driver {
        uint32_t promptlen;
        const char *promptdata;
        engw.get_console_prompt(&engw, &promptlen, &promptdata);
-        std::u32string prompt = drvutil::from_utf8(std::string_view(promptdata, promptlen), nullptr);
+        std::u32string prompt = drvutil::utf8_to_utf32(std::string_view(promptdata, promptlen), nullptr);
        readline_device_.set_prompt(prompt);
        while (true) {
            std::u32string cps = console_read();
@@ -226,7 +226,7 @@ class Driver {
            for (char32_t c : cps) {
                std::u32string line = readline_device_.putcode(c);
                if (!line.empty()) {
-                    std::string utf8 = drvutil::to_utf8(line);
+                    std::string utf8 = drvutil::utf32_to_utf8(line);
                    engw.play_recv_incoming(&engw, 0, utf8.size(), utf8.c_str());
                }
            }
--- a/luprex/cpp/drv/drvutil.cpp
+++ b/luprex/cpp/drv/drvutil.cpp
@@ -163,7 +163,7 @@ static int32_t read_codepoint_utf8(std::string_view &source) {
    return codepoint;
 }

-std::string to_utf8(const std::u32string &s) {
+std::string utf32_to_utf8(const std::u32string &s) {
    std::string result(s.size() * 4, 0);
    char *buffer = &result[0];
    int len = 0;
@@ -174,7 +174,7 @@ std::string to_utf8(const std::u32string &s) {
    return result.substr(0, len);
 }

-std::u32string from_utf8(std::string_view s, int *consumed) {
+std::u32string utf8_to_utf32(std::string_view s, int *consumed) {
    std::string_view rest = s;
    std::u32string result(s.size(), 0);
    int len = 0;
--- a/luprex/cpp/drv/drvutil.hpp
+++ b/luprex/cpp/drv/drvutil.hpp
@@ -53,7 +53,7 @@ bool is_single_wchar_t(char32_t c);
 // Convert a codepoint string into a UTF8-string.
 // If the codepoint string contains invalid codepoints, they're silently dropped.
 //
-std::string to_utf8(const std::u32string &cps);
+std::string utf32_to_utf8(const std::u32string &cps);

 // Convert a UTF8 string to a codepoint string.
 // 
@@ -61,10 +61,16 @@ std::string to_utf8(const std::u32string &cps);
 // Some of the bytes may not be consumed, if the source ends with an unfinished
 // utf-8 sequence.  Returns the Codepoint string and the number of bytes consumed.
 //
-std::u32string from_utf8(std::string_view source, int *consumed);
+std::u32string utf8_to_utf32(std::string_view source, int *consumed);

 // Convert a UTF8 string to a UCS-2 string.
 //
+// If the UTF-8 string contains invalid sequences, they're silently dropped.
+// Trailing bytes are left unconsumed if the source ends with an unfinished
+// UTF-8 sequence.  Returns the UCS-2 string and the number of bytes consumed.
+// UCS-2 can't represent code points above U+FFFF, so this conversion is lossy:
+// any character that can't be represented is replaced with a box.
+//
 std::u16string utf8_to_ucs2(std::string_view source, int *consumed);

 // Get a system error message, in an OS-independent manner.