Lots of work on unicode support

2023-05-19 00:23:23 -04:00
parent a25213d259
commit 7e25be10a4
10 changed files with 249 additions and 228 deletions
--- a/luprex/cpp/core/json.cpp
+++ b/luprex/cpp/core/json.cpp
@@ -136,10 +136,10 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
    std::string_view str(s, len);
    oss << '"';
    if (sv::valid_utf8(str) && !sv::has_prefix(str, "")) {
-        // Output the string in the straightforward way,
-        // using traditional json escaping.
-        for (char c : str) {
-            switch (c) {
+        while (!str.empty()) {
+            int32_t cp = sv::read_codepoint_utf8(str);
+            assert(cp >= 0);
+            switch (cp) {
                case '\\': oss << "\\\\"; break;
                case '"' : oss << "\\\""; break;
                case '\b': oss << "\\b"; break;
@@ -148,10 +148,11 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
                case '\n': oss << "\\n"; break;
                case '\t': oss << "\\t"; break;
                default: {
-                    if (c < 32) {
-                        oss << "\\u" << util::hex16.val(c);
+                    if (cp < 32) {
+                        oss << "\\u" << util::hex16.val(cp);
                    } else {
-                        oss << c;
+                        bool ok = util::write_codepoint_utf8(cp, &oss);
+                        assert(ok);
                    }
                }
            }
--- a/luprex/cpp/core/util.cpp
+++ b/luprex/cpp/core/util.cpp
@@ -298,7 +298,7 @@ int32_t read_ascii_char(string_view &source) {
    return result;
 }

-int32_t read_codepoint_utf8(string_view &source) {
+int32_t read_codepoint_utf8(std::string_view &source) {
    size_t size = source.size();
    if (size == 0) return -1;

@@ -322,9 +322,8 @@ int32_t read_codepoint_utf8(string_view &source) {
        codepoint = (bytes[0] & 0x07);
        seqlen = 4;
    } else {
-        // Bad character. Drop a byte and return invalid CP.
-        source.remove_prefix(1);
-        return 1;
+        // Bad character. return invalid CP.
+        return -2;
    }

    if (seqlen > size) {
@@ -333,9 +332,8 @@ int32_t read_codepoint_utf8(string_view &source) {

    for (size_t i = 1; i < seqlen; ++i) {
        if ((bytes[i] & 0xC0) != 0x80) {
-            // Bad character. Drop a byte and return invalid CP.
-            source.remove_prefix(1);
-            return 1;
+            // Bad character. return invalid CP.
+            return -2;
        }
        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
    }
@@ -346,17 +344,15 @@ int32_t read_codepoint_utf8(string_view &source) {
        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
-        // Bad character. Drop a byte and return invalid CP.
-        source.remove_prefix(1);
-        return 1;
+        // Bad character. return invalid CP.
+        return -2;
    }

    source.remove_prefix(seqlen);
    return codepoint;
 }

-bool valid_utf8(string_view s)
-{
+bool valid_utf8(string_view s) {
    while (!s.empty()) {
        int32_t codepoint = read_codepoint_utf8(s);
        if (codepoint < 0) return false;
@@ -403,27 +399,32 @@ void quote_string(const eng::string &s, std::ostream *os) {
    }
    bool usesinglequote = (!anysq)||(anydq);
    (*os) << (usesinglequote ? '\'' : '"');
-    for (char c : s) {
-        if (c >= 32) {
-            if (c == '"') {
-                (*os) << (usesinglequote ? "\"" : "\\\"");
-            } else if (c == '\'') {
-                (*os) << (usesinglequote ? "\\'" : "'");
-            } else if (c == '\\') {
-                (*os) << "\\\\"; 
-            } else {
-                (*os) << c;
-            }
-        } else {
-            unsigned int value = ((unsigned char)c);
-            switch (c) {
+    std::string_view str(s);
+    while (!str.empty()) {
+        unsigned char c0 = (unsigned char)(str[0]);
+        int cp = sv::read_codepoint_utf8(str);
+        if (cp < 0) {
+            (*os) << "\\" << dec.width(3).fill('0').val(c0);
+            str.remove_prefix(1);
+        } else if (cp < 32) {
+            c0 = ((unsigned char)cp);
+            switch (c0) {
            case '\n': (*os) << "\\n"; break;
            case '\t': (*os) << "\\t"; break;
            case '\r': (*os) << "\\r"; break;
+            case '\b': (*os) << "\\b"; break;
            default:
-                (*os) << "\\" << dec.width(3).fill('0').val(value);
+                (*os) << "\\" << dec.width(3).fill('0').val(c0);
                break;
            }
+        } else if (cp == '"') {
+            (*os) << (usesinglequote ? "\"" : "\\\"");
+        } else if (cp == '\'') {
+            (*os) << (usesinglequote ? "\\'" : "'");
+        } else if (cp == '\\') {
+            (*os) << "\\\\"; 
+        } else {
+            write_codepoint_utf8(cp, os);
        }
    }
    (*os) << (usesinglequote ? '\'' : '"');
@@ -656,50 +657,52 @@ eng::string toupper(eng::string input) {
    return input;
 }

+// Encode one codepoint as UTF-8 into buffer, which must have room for 4 bytes.
+// No NUL terminator is written.  Returns the number of bytes stored, or 0 if
+// the codepoint is a surrogate or lies above U+10FFFF.
-static void buffer_codepoint_utf8(int32_t scp, char *buffer) {
+static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
    uint32_t cp = (uint32_t)scp;
    unsigned char *c = (unsigned char *)buffer;
-    if (cp <= 0x7F) {
+    if (cp <= 0x7F) {
        c[0] = cp;
-        c[1] = 0;
+        return 1;
    }
    else if (cp <= 0x7FF) {
        c[0] = (cp>>6)+192;
        c[1] = (cp&63)+128;
-        c[2] = 0;
+        return 2;
    }
    else if (cp <= 0xFFFF) {
-        if (0xd800 <= cp && cp <= 0xdfff) {
-            c[0] = 0;
-        } else {
-            c[0] = (cp>>12)+224;
-            c[1] = ((cp>>6)&63)+128;
-            c[2] = (cp&63)+128;
-            c[3] = 0;
+        if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
+            return 0;
        }
+        c[0] = (cp>>12)+224;
+        c[1] = ((cp>>6)&63)+128;
+        c[2] = (cp&63)+128;
+        return 3;
    }
    else if (cp <= 0x10FFFF) {
        c[0] = (cp>>18)+240;
        c[1] = ((cp>>12)&63)+128;
        c[2] = ((cp>>6)&63)+128;
        c[3] = (cp&63)+128;
-        c[4] = 0;
+        return 4;
    } else {
-        c[0] = 0;
+        return 0;
    }
 }

 eng::string get_codepoint_utf8(uint32_t cp) {
-    char buffer[5];
-    buffer_codepoint_utf8(cp, buffer);
-    return eng::string(buffer);
+    char buffer[4];
+    int len = buffer_codepoint_utf8(cp, buffer);
+    return eng::string(buffer, len);
 }

 bool write_codepoint_utf8(int32_t cp, std::ostream *s) {
-    char buffer[5];
-    buffer_codepoint_utf8(cp, buffer);
-    (*s) << buffer;
-    return buffer[0] != 0;
+    char buffer[4];
+    int len = buffer_codepoint_utf8(cp, buffer);
+    (*s) << std::string_view(buffer, len);
+    return (len > 0);
 }

 double distance_squared(double x1, double y1, double x2, double y2) {
--- a/luprex/cpp/core/util.hpp
+++ b/luprex/cpp/core/util.hpp
@@ -181,8 +181,14 @@ int32_t read_ascii_char(string_view &source);

 // Read a UTF8 codepoint from a string_view.
 //
-// If the next thing in the string_view isn't a valid
-// codepoint, returns -1 and doesn't update the view.
+// If the string_view is empty, returns -1 and doesn't update
+// the string_view.
+//
+// If the string_view contains an unfinished but possibly valid
+// codepoint, returns -1 and doesn't update the string_view.
+//
+// If the next thing in the string_view is an invalid codepoint,
+// returns -2 and doesn't update the string_view.
 //
 int32_t read_codepoint_utf8(string_view &source);

--- a/luprex/cpp/drv/driver-linux.cpp
+++ b/luprex/cpp/drv/driver-linux.cpp
@@ -219,18 +219,18 @@ static int socket_poll(struct pollfd *pollvec, int pollcount, int mstimeout, std
 }

 // Write unicode onto the console.
-static void console_write(const CodepointString &cps) {
-    std::string utf8 = ReadlineDevice::to_utf8(cps);
+static void console_write(const std::u32string &cps) {
+    std::string utf8 = drvutil::to_utf8(cps);
    write(1, utf8.c_str(), utf8.size());
 }

-static CodepointString console_read() {
-    CodepointString result;
+static std::u32string console_read() {
+    std::u32string result;
    char buffer[512];
    int nread = read(0, buffer, 512);
    if (nread > 0) {
        std::string_view s(buffer, nread);
-        result = ReadlineDevice::from_utf8(s, nullptr);
+        result = drvutil::from_utf8(s, nullptr);
    }
    return result;
 }
--- a/luprex/cpp/drv/driver-windows.cpp
+++ b/luprex/cpp/drv/driver-windows.cpp
@@ -230,14 +230,15 @@ static void init_winsock() {
    }
 }

-static void console_write(const CodepointString &cps) {
+
+static void console_write(const std::u32string &cps) {
    if (cps.size() == 0) return;
-    // Convert to wstring.
-    // Any character outside the range 0xFFFF is replaced with a box.
+    // Convert to wstring.   Any character not representable as a single wchar_t
+    // is replaced with a box.  It's not ideal, but it's pretty good.
    std::wstring ws(cps.size(), 0);
    for (int i = 0; i < int(cps.size()); i++) {
        char32_t c = cps[i];
-        if ((c >= 0)&&(c <= 0xFFFF)) ws[i] = (wchar_t)c;
+        if (drvutil::is_single_wchar_t(c)) ws[i] = (wchar_t)c;
        else ws[i] = 0x2610;
    }
    HANDLE hstdout = GetStdHandle(STD_OUTPUT_HANDLE);
@@ -253,7 +254,7 @@ static void console_write(const CodepointString &cps) {
    }
 }

-static CodepointString console_read() {
+static std::u32string console_read() {
    HANDLE hstdin = GetStdHandle(STD_INPUT_HANDLE);
    assert(hstdin != INVALID_HANDLE_VALUE);
    INPUT_RECORD inrecords[512];
@@ -262,7 +263,7 @@ static CodepointString console_read() {
        if (int(nevents) > 0) {
            if (int(nevents) > 512) nevents = 512;
            ReadConsoleInputW(hstdin, inrecords, nevents, &nread);
-            CodepointString result(nread, 0);
+            std::u32string result(nread, 0);
            int len = 0;
            for (int i = 0; i < int(nread); i++) {
                const INPUT_RECORD &inr = inrecords[i];
@@ -274,7 +275,7 @@ static CodepointString console_read() {
            return result.substr(0, len);
        }
    }
-    return CodepointString();
+    return std::u32string();
 }

 static void ssl_load_certificate_authorities(SSL_CTX *ctx) {
--- a/luprex/cpp/drv/driver.cpp
+++ b/luprex/cpp/drv/driver.cpp
@@ -206,7 +206,7 @@ class Driver {
            if (ndata > DRV_SHORTSTRING_SIZE) ndata = DRV_SHORTSTRING_SIZE;
            std::string_view src(data, ndata);
            int consumed;
-            CodepointString cps = ReadlineDevice::from_utf8(src, &consumed);
+            std::u32string cps = drvutil::from_utf8(src, &consumed);
            readline_device_.print(cps);
            engw.play_sent_outgoing(&engw, 0, consumed);
        }
@@ -217,16 +217,16 @@ class Driver {
        uint32_t promptlen;
        const char *promptdata;
        engw.get_console_prompt(&engw, &promptlen, &promptdata);
-        CodepointString prompt = ReadlineDevice::from_utf8(std::string_view(promptdata, promptlen), nullptr);
+        std::u32string prompt = drvutil::from_utf8(std::string_view(promptdata, promptlen), nullptr);
        readline_device_.set_prompt(prompt);
        while (true) {
-            CodepointString cps = console_read();
+            std::u32string cps = console_read();
            if (cps.size() == 0) break;
            read_console_recently_ = true;
            for (char32_t c : cps) {
-                CodepointString line = readline_device_.putcode(c);
+                std::u32string line = readline_device_.putcode(c);
                if (!line.empty()) {
-                    std::string utf8 = ReadlineDevice::to_utf8(line);
+                    std::string utf8 = drvutil::to_utf8(line);
                    engw.play_recv_incoming(&engw, 0, utf8.size(), utf8.c_str());
                }
            }
--- a/luprex/cpp/drv/drvutil.cpp
+++ b/luprex/cpp/drv/drvutil.cpp
@@ -77,6 +77,131 @@ void split_target(std::string_view target, std::string &cert, std::string &host,
    port = std::string(split[2]);
 }

+bool is_single_wchar_t(char32_t c) {
+    // Surrogate values (U+D800..U+DFFF) are not characters on their own.
+    if ((c >= 0xD800) && (c <= 0xDFFF)) return false;
+    return (c <= 0xFFFF);  // char32_t is unsigned; no lower bound needed.
+}
+
+// Encode one codepoint as UTF-8 into buffer, which must have room for 4 bytes.
+// No NUL terminator is written.  Returns the number of bytes stored, or 0 if
+// the codepoint is a surrogate or lies above U+10FFFF.
+static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
+    uint32_t cp = (uint32_t)scp;
+    unsigned char *c = (unsigned char *)buffer;
+    if (cp <= 0x7F) {
+        c[0] = cp;
+        return 1;
+    }
+    else if (cp <= 0x7FF) {
+        c[0] = (cp>>6)+192;
+        c[1] = (cp&63)+128;
+        return 2;
+    }
+    else if (cp <= 0xFFFF) {
+        if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
+            return 0;
+        }
+        c[0] = (cp>>12)+224;
+        c[1] = ((cp>>6)&63)+128;
+        c[2] = (cp&63)+128;
+        return 3;
+    }
+    else if (cp <= 0x10FFFF) {
+        c[0] = (cp>>18)+240;
+        c[1] = ((cp>>12)&63)+128;
+        c[2] = ((cp>>6)&63)+128;
+        c[3] = (cp&63)+128;
+        return 4;
+    } else {
+        return 0;
+    }
+}
+
+static int32_t read_codepoint_utf8(std::string_view &source) {
+    size_t size = source.size();
+    if (size == 0) return -1;
+
+    const unsigned char *bytes = (const unsigned char *)source.data();
+    int codepoint;
+    size_t seqlen;
+    if ((bytes[0] & 0x80) == 0x00) {
+        // U+0000 to U+007F
+        codepoint = (bytes[0] & 0x7F);
+        seqlen = 1;
+    } else if ((bytes[0] & 0xE0) == 0xC0) {
+        // U+0080 to U+07FF
+        codepoint = (bytes[0] & 0x1F);
+        seqlen = 2;
+    } else if ((bytes[0] & 0xF0) == 0xE0) {
+        // U+0800 to U+FFFF
+        codepoint = (bytes[0] & 0x0F);
+        seqlen = 3;
+    } else if ((bytes[0] & 0xF8) == 0xF0) {
+        // U+10000 to U+10FFFF
+        codepoint = (bytes[0] & 0x07);
+        seqlen = 4;
+    } else {
+        // Not a UTF-8 lead byte: report -2 (invalid) and consume nothing.
+        return -2;
+    }
+
+    if (seqlen > size) {
+        return -1;
+    }
+
+    for (size_t i = 1; i < seqlen; ++i) {
+        if ((bytes[i] & 0xC0) != 0x80) {
+            // Not a continuation byte: report -2 and consume nothing.
+            return -2;
+        }
+        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
+    }
+
+    if ((codepoint > 0x10FFFF) ||
+        ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
+        ((codepoint <= 0x007F) && (seqlen != 1)) ||
+        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
+        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
+        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
+        // Out of range, surrogate, or overlong form: report -2 (invalid).
+        return -2;
+    }
+
+    source.remove_prefix(seqlen);
+    return codepoint;
+}
+
+std::string to_utf8(const std::u32string &s) {
+    // Worst case is 4 bytes per codepoint; invalid codepoints contribute 0.
+    std::string result(s.size() * 4, 0);
+    int len = 0;
+    for (char32_t c : s) {
+        len += buffer_codepoint_utf8(c, result.data() + len);
+    }
+    result.resize(len);  // Shrink in place instead of copying via substr().
+    return result;
+}
+
+// Decode s into codepoints; see drvutil.hpp for the full contract.
+std::u32string from_utf8(std::string_view s, int *consumed) {
+    std::string_view rest = s;
+    std::u32string result;
+    result.reserve(s.size());  // At most one codepoint per input byte.
+    while (true) {
+        int32_t c = read_codepoint_utf8(rest);
+        if (c == -1) {
+            break;  // Input exhausted, or it ends in an unfinished sequence.
+        } else if (c < 0) {
+            rest.remove_prefix(1);  // Invalid byte: drop it and resynchronize.
+        } else {
+            result.push_back((char32_t)c);
+        }
+    }
+    if (consumed != nullptr) {
+        *consumed = (int)(s.size() - rest.size());
+    }
+    return result;
+}

 static std::vector<std::string> parse_control_lst(std::string_view ctrl) {
    std::vector<std::string> result;
--- a/luprex/cpp/drv/drvutil.hpp
+++ b/luprex/cpp/drv/drvutil.hpp
@@ -46,6 +46,23 @@ std::string package_lua_source(const std::filesystem::path &base, std::ostream *
 //
 void split_target(std::string_view target, std::string &cert, std::string &host, std::string &port);

+// Return true if the unicode codepoint can be converted to a single 16-bit wchar_t.
+//
+bool is_single_wchar_t(char32_t c);
+
+// Convert a codepoint string into a UTF8-string.
+// If the codepoint string contains invalid codepoints, they're silently dropped.
+//
+std::string to_utf8(const std::u32string &cps);
+
+// Convert a UTF8 string to a codepoint string.
+//
+// Invalid bytes are silently dropped.  If the source ends with an unfinished
+// utf-8 sequence, those trailing bytes are left unconsumed.  The number of
+// bytes consumed is stored in *consumed, unless consumed is nullptr.
+//
+std::u32string from_utf8(std::string_view source, int *consumed);
+
 // Get a system error message, in an OS-independent manner.
 //
 // These versions of strerror is thread-safe, and it never fails
--- a/luprex/cpp/drv/readline.cpp
+++ b/luprex/cpp/drv/readline.cpp
@@ -2,8 +2,8 @@

 #define MAXLINE 512

-static CodepointString n_backspaces(int n) {
-    CodepointString result(3 * n, 0);
+static std::u32string n_backspaces(int n) {
+    std::u32string result(3 * n, 0);
    for (int i = 0; i < n; i++) {
        result[i*3 + 0] = '\b';
        result[i*3 + 1] = ' ';
@@ -12,7 +12,7 @@ static CodepointString n_backspaces(int n) {
    return result;
 }

-static int common_prefix_length(const CodepointString &a, const CodepointString &b) {
+static int common_prefix_length(const std::u32string &a, const std::u32string &b) {
    int minlen = std::min(a.size(), b.size());
    for (int i = 0; i < minlen; i++) {
        if (a[i] != b[i]) return i;
@@ -20,104 +20,11 @@ static int common_prefix_length(const CodepointString &a, const CodepointString
    return minlen;
 }

-static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
-    uint32_t cp = (uint32_t)scp;
-    unsigned char *c = (unsigned char *)buffer;
-    if (cp < 0) {
-        return 0;
-    }
-    else if (cp <= 0x7F) {
-        c[0] = cp;
-        return 1;
-    }
-    else if (cp <= 0x7FF) {
-        c[0] = (cp>>6)+192;
-        c[1] = (cp&63)+128;
-        return 2;
-    }
-    else if (cp <= 0xFFFF) {
-        c[0] = (cp>>12)+224;
-        c[1] = ((cp>>6)&63)+128;
-        c[2] = (cp&63)+128;
-        return 3;
-    }
-    else if (cp <= 0x10FFFF) {
-        c[0] = (cp>>18)+240;
-        c[1] = ((cp>>12)&63)+128;
-        c[2] = ((cp>>6)&63)+128;
-        c[3] = (cp&63)+128;
-        return 4;
-    } else {
-        return 0;
-    }
-}
-
-static int32_t read_codepoint_utf8(std::string_view &source) {
-    size_t size = source.size();
-    if (size == 0) return -1;
-
-    const unsigned char *bytes = (const unsigned char *)source.data();
-    int codepoint;
-    size_t seqlen;
-    if ((bytes[0] & 0x80) == 0x00) {
-        // U+0000 to U+007F
-        codepoint = (bytes[0] & 0x7F);
-        seqlen = 1;
-    } else if ((bytes[0] & 0xE0) == 0xC0) {
-        // U+0080 to U+07FF
-        codepoint = (bytes[0] & 0x1F);
-        seqlen = 2;
-    } else if ((bytes[0] & 0xF0) == 0xE0) {
-        // U+0800 to U+FFFF
-        codepoint = (bytes[0] & 0x0F);
-        seqlen = 3;
-    } else if ((bytes[0] & 0xF8) == 0xF0) {
-        // U+10000 to U+10FFFF
-        codepoint = (bytes[0] & 0x07);
-        seqlen = 4;
-    } else {
-        // Bad character. Drop a byte and return invalid CP.
-        source.remove_prefix(1);
-        return -2;
-    }
-
-    if (seqlen > size) {
-        return -1;
-    }
-
-    for (size_t i = 1; i < seqlen; ++i) {
-        if ((bytes[i] & 0xC0) != 0x80) {
-            // Bad character. Drop a byte and return invalid CP.
-            source.remove_prefix(1);
-            return -2;
-        }
-        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
-    }
-
-    if ((codepoint > 0x10FFFF) ||
-        ((codepoint <= 0x007F) && (seqlen != 1)) ||
-        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
-        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
-        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
-        // Bad character. Drop a byte and return invalid CP.
-        source.remove_prefix(1);
-        return -2;
-    }
-
-    source.remove_prefix(seqlen);
-    return codepoint;
-}
-
-ReadlineDevice::ReadlineDevice() {
-    desired_prompt_ = CodepointString(1, '>');
-}
-
-
 void ReadlineDevice::set_print_callback(print_callback cb) {
    print_cb_ = cb;
 }

-void ReadlineDevice::set_prompt(const CodepointString &prompt) {
+void ReadlineDevice::set_prompt(const std::u32string &prompt) {
    desired_prompt_ = prompt;
    echo_command();
 }
@@ -152,24 +59,24 @@ void ReadlineDevice::echo_command() {
    }

    // Echo the new part.
-    CodepointString newpart = desired_command_.substr(current_command_.size());
+    std::u32string newpart = desired_command_.substr(current_command_.size());
    if (!newpart.empty()) {
        print_cb_(newpart);
        current_command_ = desired_command_;
    }
 }

-CodepointString ReadlineDevice::putcode(char32_t c) {
+std::u32string ReadlineDevice::putcode(char32_t c) {
    if ((c == '\n') && (readline_lastc_ == '\r')) {
        // Ignore newline immediately after carriage return.
        // Otherwise, crlf produces two newlines.
-        return CodepointString();
+        return std::u32string();
    } else if ((c == '\r') || (c == '\n')) {
-        CodepointString white(1, ' ');
-        CodepointString newline(1, '\n');
+        std::u32string white(1, ' ');
+        std::u32string newline(1, '\n');
        echo_command();
        print_cb_(white + newline);
-        CodepointString result = desired_command_ + newline;
+        std::u32string result = desired_command_ + newline;
        desired_command_.clear();
        current_prompt_.clear();
        current_command_.clear();
@@ -181,20 +88,20 @@ CodepointString ReadlineDevice::putcode(char32_t c) {
            desired_command_ = desired_command_.substr(0, len-1);
        }
        echo_command();
-        return CodepointString();
+        return std::u32string();
    } else if ((c >= 32)&&(c <= 0x10FFFF)) {
        int len = desired_command_.size();
        if (len < MAXLINE) {
            desired_command_ = desired_command_ + c;
        }
        echo_command();
-        return CodepointString();
+        return std::u32string();
    }
    readline_lastc_ = c;
-    return CodepointString();
+    return std::u32string();
 }

-void ReadlineDevice::print(const CodepointString &s) {
+void ReadlineDevice::print(const std::u32string &s) {
    if (!s.empty()) {
        erase_command();
        print_cb_(s);
@@ -202,30 +109,3 @@ void ReadlineDevice::print(const CodepointString &s) {
    }
 }

-std::string ReadlineDevice::to_utf8(const CodepointString &s) {
-    std::string result(s.size() * 4, 0);
-    char *buffer = &result[0];
-    int len = 0;
-    for (char32_t c : s) {
-        int clen = buffer_codepoint_utf8(c, buffer + len);
-        len += clen;
-    }
-    return result.substr(0, len);
-}
-
-CodepointString ReadlineDevice::from_utf8(std::string_view s, int *consumed) {
-    std::string_view rest = s;
-    CodepointString result(s.size(), 0);
-    int len = 0;
-    while (true) {
-        int32_t c = read_codepoint_utf8(rest);
-        if (c == -1) break; // EOF reached;
-        if (c == -2) continue; // Filter out bad UTF8 but continue.
-        result[len++] = (char32_t)c;
-    }
-    if (consumed != nullptr) {
-        *consumed = s.size() - rest.size();
-    }
-    return result.substr(0, len);
-}
-
--- a/luprex/cpp/drv/readline.hpp
+++ b/luprex/cpp/drv/readline.hpp
@@ -4,19 +4,19 @@

 #include <string>
 #include <string_view>
+#include "drvutil.hpp"

-using CodepointString = std::basic_string<char32_t>;

 class ReadlineDevice {
 public:
-    using print_callback = void (*)(const CodepointString &text);
+    using print_callback = void (*)(const std::u32string &text);

 private:
    print_callback print_cb_;
-    CodepointString desired_command_;
-    CodepointString current_command_;
-    CodepointString desired_prompt_;
-    CodepointString current_prompt_;
+    std::u32string desired_command_;
+    std::u32string current_command_;
+    std::u32string desired_prompt_;
+    std::u32string current_prompt_;
    char32_t readline_lastc_;

    void erase_command();
@@ -24,31 +24,19 @@ private:


 public:
-    ReadlineDevice();
-
    // The callback must be set before using the readline device.
    void set_print_callback(print_callback cb);
    
    // change the prompt.
-    void set_prompt(const CodepointString &prompt);
+    void set_prompt(const std::u32string &prompt);

    // Use this to print anything on the console.
-    void print(const CodepointString &cps);
+    void print(const std::u32string &cps);

    // Whenever the user types a character, call 'putcode'. If the code is
    // newline, this returns the line of text that was entered, including the
    // newline.  Otherwise returns empty string. Backspace is handled here.
-    CodepointString putcode(char32_t codepoint);
-
-    // This can be used to convert a codepoint string into a
-    // UTF8-string.
-    static std::string to_utf8(const CodepointString &cps);
-
-    // This can be used to convert UTF8 to a codepoint string.
-    // Some of the bytes may not be consumed, if the source contains
-    // a partial utf-8 sequence.  Returns the Codepoint string and the
-    // number of bytes consumed.
-    static CodepointString from_utf8(std::string_view source, int *consumed);
+    std::u32string putcode(char32_t codepoint);
 };