From 850b4aa43b6461caea460f6876d47239428e2df0 Mon Sep 17 00:00:00 2001
From: jyelon <jyelon@gmail.com>
Date: Wed, 14 Jan 2026 12:30:44 -0500
Subject: [PATCH] More refactors to prepare for doc-search, including moving
 unicode support into ext.

---
 Content/Luprex/SimpleColorMaterial.uasset |   3 +
 Integration.code-workspace.tpl.json       |   2 +-
 luprex/cpp/core/source.cpp                |  18 ++
 luprex/cpp/core/source.hpp                |   7 +
 luprex/cpp/core/util.cpp                  |  87 ++-----
 luprex/cpp/core/util.hpp                  |  12 +-
 luprex/cpp/drv/drvutil.cpp                | 188 +--------------
 luprex/cpp/drv/drvutil.hpp                |   1 +
 luprex/cpp/wrap/wrap-string.hpp           |   2 +
 luprex/ext/unicode-stuff.hpp              | 268 ++++++++++++++++++++++
 10 files changed, 334 insertions(+), 254 deletions(-)
 create mode 100644 Content/Luprex/SimpleColorMaterial.uasset
 create mode 100644 luprex/ext/unicode-stuff.hpp
diff --git a/Content/Luprex/SimpleColorMaterial.uasset b/Content/Luprex/SimpleColorMaterial.uasset
new file mode 100644
index 00000000..83f05f87
--- /dev/null
+++ b/Content/Luprex/SimpleColorMaterial.uasset
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a8703f0afcf0e908ccb7934b5527b202d7b863099c99e371edf3823cfc709ba
+size 11196
diff --git a/Integration.code-workspace.tpl.json b/Integration.code-workspace.tpl.json
index 2953c07e..a8650da6 100644
--- a/Integration.code-workspace.tpl.json
+++ b/Integration.code-workspace.tpl.json
@@ -47,7 +47,7 @@
           "--header-insertion=never"
         ],
         "C_Cpp.autocomplete": "disabled",
-        "search.useIgnoreFiles": false
+        "search.useIgnoreFiles": true
     },
     "extensions": {
         "recommendations": [
diff --git a/luprex/cpp/core/source.cpp b/luprex/cpp/core/source.cpp
index a6f3114d..73fb0590 100644
--- a/luprex/cpp/core/source.cpp
+++ b/luprex/cpp/core/source.cpp
@@ -545,6 +545,24 @@ void SourceDB::register_lua_builtins() {
     lua_close(L);
 }
 
+
+
+util::StringVec SourceDB::search_docs(const eng::string &substring) {
+    // Hits are collected here first: the key is a function name and
+    // the value is the documentation line to report for it.
+    eng::map<eng::string, eng::string> hits;
+
+    // Search the built-in functions.  Not wired up yet, so nothing is found.
+    // for (const LuaFunctionReg *reg = LuaFunctionReg::All; reg != nullptr; reg=reg->next()) {
+    // }
+
+    util::StringVec lines;
+    for (const auto &[name, doc] : hits) {
+        lines.push_back(doc);
+    }
+    return lines;
+}
+
 eng::string SourceDB::function_docs(const LuaCoreStack &LS, LuaSlot fn) {
     lua_State *L = LS.state();
     if (LS.iscfunction(fn)) {
diff --git a/luprex/cpp/core/source.hpp b/luprex/cpp/core/source.hpp
index d3ba462a..41f80a60 100644
--- a/luprex/cpp/core/source.hpp
+++ b/luprex/cpp/core/source.hpp
@@ -204,6 +204,13 @@ public:
     //
     eng::string function_docs(const LuaCoreStack &LS, LuaSlot slot);
 
+    // Search the documentation.
+    //
+    // Search all the documentation for the specified substring.
+    // Each element of the returned vector describes a different match.
+    //
+    util::StringVec search_docs(const eng::string &substring);
+
     // Serialize and unserialize a source vector.
     //
     static void serialize_source(const util::LuaSourceVec &sv, StreamBuffer *sb);
diff --git a/luprex/cpp/core/util.cpp b/luprex/cpp/core/util.cpp
index c1098f5a..fc6e1c9b 100644
--- a/luprex/cpp/core/util.cpp
+++ b/luprex/cpp/core/util.cpp
@@ -4,6 +4,8 @@
 #include "fast-float.hpp"
 #include "luastack.hpp"
 
+#include "../../ext/unicode-stuff.hpp"
+
 #include <algorithm>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -13,7 +15,6 @@
 #include <cmath>
 #include <charconv>
 
-
 namespace sv {
 
 bool case_insensitive_eq(string_view s1, string_view s2) {
@@ -336,73 +337,16 @@ int32_t read_ascii_char(string_view &source) {
     return result;
 }
 
-int32_t read_codepoint_utf8(std::string_view &source) {
-    size_t size = source.size();
-    if (size == 0) return -1;
-
-    const unsigned char *bytes = (const unsigned char *)source.data();
-    int codepoint;
-    size_t seqlen;
-    if ((bytes[0] & 0x80) == 0x00) {
-        // U+0000 to U+007F
-        codepoint = (bytes[0] & 0x7F);
-        seqlen = 1;
-    } else if ((bytes[0] & 0xE0) == 0xC0) {
-        // U+0080 to U+07FF
-        codepoint = (bytes[0] & 0x1F);
-        seqlen = 2;
-    } else if ((bytes[0] & 0xF0) == 0xE0) {
-        // U+0800 to U+FFFF
-        codepoint = (bytes[0] & 0x0F);
-        seqlen = 3;
-    } else if ((bytes[0] & 0xF8) == 0xF0) {
-        // U+10000 to U+10FFFF
-        codepoint = (bytes[0] & 0x07);
-        seqlen = 4;
-    } else {
-        // Bad character. return invalid CP.
-        return -2;
-    }
-
-    if (seqlen > size) {
-        return -1;
-    }
-
-    for (size_t i = 1; i < seqlen; ++i) {
-        if ((bytes[i] & 0xC0) != 0x80) {
-            // Bad character. return invalid CP.
-            return -2;
-        }
-        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
-    }
-
-    if ((codepoint > 0x10FFFF) ||
-        ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
-        ((codepoint <= 0x007F) && (seqlen != 1)) ||
-        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
-        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
-        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
-        // Bad character. return invalid CP.
-        return -2;
-    }
-
-    source.remove_prefix(seqlen);
-    return codepoint;
-}
-
-bool valid_utf8(string_view s) {
-    while (!s.empty()) {
-        int32_t codepoint = read_codepoint_utf8(s);
-        if (codepoint < 0) return false;
-    }
-    return true;
-}
-
 bool valid_number(string_view s, bool plus, bool minus, bool dec, bool exp) {
     read_number(s, plus, minus, dec, exp);
     return s.empty();
 }
 
+using UC = UnicodeStuff<eng::string, eng::u16string, eng::u32string>;
+// These forward to the shared implementation in ext/unicode-stuff.hpp.
+int32_t read_codepoint_utf8(string_view &source) { return UC::read_codepoint_utf8(source); }
+bool valid_utf8(string_view s) { return UC::valid_utf8(s); }
+
 } // namespace sv
 
 
@@ -989,6 +933,23 @@ LuaDefine(unittests_util, "", "some unit tests") {
     LuaAssert(L, read_number_x("-123e+5x", true, true, true, true) == "-123e+5");
     LuaAssert(L, read_number_x("-123e+x", true, true, true, true) == "");
     
+    // Test read_codepoint_utf8.
+    std::string_view str("𝞮ὥπq");
+    LuaAssert(L, str.size() == 10);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1D7AE); // 4-byte char
+    LuaAssert(L, str.size() == 6);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1F65); // 3-byte char
+    LuaAssert(L, str.size() == 3);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x3C0); // 2-byte char
+    LuaAssert(L, str.size() == 1);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x71); // 1-byte char
+    LuaAssert(L, str.size() == 0);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == -1); // EOF
+
+    // Test read_codepoint_utf8 on an invalid unicode sequence.
+    std::string_view strbad("\xC0\xC0");
+    LuaAssert(L, sv::read_codepoint_utf8(strbad) == -2);
+
     return 0;
 }
 
diff --git a/luprex/cpp/core/util.hpp b/luprex/cpp/core/util.hpp
index abb7551d..439a9130 100644
--- a/luprex/cpp/core/util.hpp
+++ b/luprex/cpp/core/util.hpp
@@ -196,18 +196,14 @@ int32_t read_ascii_char(string_view &source);
 
 // Read a UTF8 codepoint from a string_view.
 //
-// If the string_view is empty, returns -1 and doesn't update
-// the string_view.
-//
-// If the string_view contains an unfinished but possibly valid
-// codepoint, returns -1 and doesn't update the string_view.
-//
-// If the next thing in the string_view is an invalid codepoint,
-// returns -2 and doesn't update the string_view.
+// See documentation in unicode-stuff.hpp
 //
 int32_t read_codepoint_utf8(string_view &source);
 
 // Return true if the string is valid utf-8.
+//
+// See documentation in unicode-stuff.hpp
+//
 bool valid_utf8(string_view s);
 
 // Return true if the number conforms to the spec.
diff --git a/luprex/cpp/drv/drvutil.cpp b/luprex/cpp/drv/drvutil.cpp
index 663cfadd..4fdbd3d1 100644
--- a/luprex/cpp/drv/drvutil.cpp
+++ b/luprex/cpp/drv/drvutil.cpp
@@ -7,6 +7,7 @@
 #include <fstream>
 #include <iostream>
 #include <filesystem>
+#include "../../ext/unicode-stuff.hpp"
 
 namespace drvutil {
 
@@ -74,189 +75,12 @@ bool is_single_wchar_t(char32_t c) {
     return false;
 }
 
-static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
-    uint32_t cp = (uint32_t)scp;
-    unsigned char *c = (unsigned char *)buffer;
-    if (cp < 0) {
-        return 0;
-    }
-    else if (cp <= 0x7F) {
-        c[0] = cp;
-        return 1;
-    }
-    else if (cp <= 0x7FF) {
-        c[0] = (cp>>6)+192;
-        c[1] = (cp&63)+128;
-        return 2;
-    }
-    else if (cp <= 0xFFFF) {
-        if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
-            return 0;
-        }
-        c[0] = (cp>>12)+224;
-        c[1] = ((cp>>6)&63)+128;
-        c[2] = (cp&63)+128;
-        return 3;
-    }
-    else if (cp <= 0x10FFFF) {
-        c[0] = (cp>>18)+240;
-        c[1] = ((cp>>12)&63)+128;
-        c[2] = ((cp>>6)&63)+128;
-        c[3] = (cp&63)+128;
-        return 4;
-    } else {
-        return 0;
-    }
-}
+using UC = UnicodeStuff<std::string, std::u16string, std::u32string>;
 
-static int32_t read_codepoint_utf16(std::u16string_view &source) {
-    if (source.empty()) return -1;
-    
-    int32_t word0 = ((const uint16_t *)source.data())[0];
-    source.remove_prefix(1);
-
-    if (word0 < 0xD800) {
-        return word0;
-    } else if (word0 < 0xDC00) {
-        if (source.empty()) {
-            return -2;
-        }
-        int32_t word1 = ((const uint16_t *)source.data())[0];
-        if ((word1 < 0xDC00)||(word1 > 0xDFFF)) {
-            return -2;
-        }
-        int32_t part1 = word0 & 0x3FF;
-        int32_t part2 = word1 & 0x3FF;
-        int32_t result = ((part1 << 10) | part2) + 0x10000;
-        source.remove_prefix(1);
-        return result;
-    } else if (word0 < 0xE000) {
-        return -2;
-    } else {
-        return word0;
-    }
-}
-
-static int32_t read_codepoint_utf8(std::string_view &source) {
-    size_t size = source.size();
-    if (size == 0) return -1;
-
-    const unsigned char *bytes = (const unsigned char *)source.data();
-    int codepoint;
-    size_t seqlen;
-    if ((bytes[0] & 0x80) == 0x00) {
-        // U+0000 to U+007F
-        codepoint = (bytes[0] & 0x7F);
-        seqlen = 1;
-    } else if ((bytes[0] & 0xE0) == 0xC0) {
-        // U+0080 to U+07FF
-        codepoint = (bytes[0] & 0x1F);
-        seqlen = 2;
-    } else if ((bytes[0] & 0xF0) == 0xE0) {
-        // U+0800 to U+FFFF
-        codepoint = (bytes[0] & 0x0F);
-        seqlen = 3;
-    } else if ((bytes[0] & 0xF8) == 0xF0) {
-        // U+10000 to U+10FFFF
-        codepoint = (bytes[0] & 0x07);
-        seqlen = 4;
-    } else {
-        // Bad character. return invalid CP.
-        return -2;
-    }
-
-    if (seqlen > size) {
-        return -1;
-    }
-
-    for (size_t i = 1; i < seqlen; ++i) {
-        if ((bytes[i] & 0xC0) != 0x80) {
-            // Bad character. return invalid CP.
-            return -2;
-        }
-        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
-    }
-
-    if ((codepoint > 0x10FFFF) ||
-        ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
-        ((codepoint <= 0x007F) && (seqlen != 1)) ||
-        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
-        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
-        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
-        // Bad character. return invalid CP.
-        return -2;
-    }
-
-    source.remove_prefix(seqlen);
-    return codepoint;
-}
-
-std::string utf32_to_utf8(const std::u32string &s) {
-    std::string result(s.size() * 4, 0);
-    char *buffer = &result[0];
-    int len = 0;
-    for (char32_t c : s) {
-        int clen = buffer_codepoint_utf8(c, buffer + len);
-        len += clen;
-    }
-    return result.substr(0, len);
-}
-
-std::u32string utf8_to_utf32(std::string_view s, int *consumed) {
-    std::string_view rest = s;
-    std::u32string result(s.size(), 0);
-    int len = 0;
-    while (true) {
-        int32_t c = read_codepoint_utf8(rest);
-        if (c == -1) {
-            break; // EOF reached;
-        } else if (c < 0) {
-            rest.remove_prefix(1);
-        } else {
-            result[len++] = (char32_t)c;
-        }
-    }
-    if (consumed != nullptr) {
-        *consumed = s.size() - rest.size();
-    }
-    return result.substr(0, len);
-}
-
-std::u16string utf8_to_ucs2(std::string_view s, int *consumed) {
-    std::string_view rest = s;
-    std::u16string result(s.size(), 0);
-    int len = 0;
-    while (true) {
-        int32_t c = read_codepoint_utf8(rest);
-        if (c == -1) {
-            break; // EOF reached;
-        } else if (c < 0) {
-            rest.remove_prefix(1);
-        } else if ((c >= 0xD800) && (c <= 0xDFFF)) {
-            result[len++] = 0x2610;
-        } else if (c > 0xFFFF) {
-            result[len++] = 0x2610;
-        } else {
-            result[len++] = (char16_t)c;
-        }
-    }
-    if (consumed != nullptr) {
-        *consumed = s.size() - rest.size();
-    }
-    return result.substr(0, len);
-}
-
-std::string utf16_to_utf8(std::u16string_view s) {
-    std::string result(s.size() * 4, 0);
-    int len = 0;
-    while (true) {
-        int codepoint = read_codepoint_utf16(s);
-        if (codepoint == -1) break;
-        if (codepoint < 0) continue;
-        len += buffer_codepoint_utf8(codepoint, &result[len]);
-    }
-    return result.substr(0, len);
-}
+std::string utf32_to_utf8(const std::u32string &s) { return UC::utf32_to_utf8(s); } // invalid codepoints are dropped
+std::u32string utf8_to_utf32(std::string_view s, int *consumed) { return UC::utf8_to_utf32(s, consumed); } // consumed may be nullptr
+std::u16string utf8_to_ucs2(std::string_view s, int *consumed) { return UC::utf8_to_ucs2(s, consumed); } // lossy: non-BMP becomes U+2610
+std::string utf16_to_utf8(std::u16string_view s) { return UC::utf16_to_utf8(s); } // invalid sequences are dropped
 
 static std::vector<std::string> parse_control_lst(std::string_view ctrl) {
     std::vector<std::string> result;
diff --git a/luprex/cpp/drv/drvutil.hpp b/luprex/cpp/drv/drvutil.hpp
index afc89c0f..bccac4b4 100644
--- a/luprex/cpp/drv/drvutil.hpp
+++ b/luprex/cpp/drv/drvutil.hpp
@@ -59,6 +59,7 @@ void split_target(std::string_view target, std::string &cert, std::string &host,
 bool is_single_wchar_t(char32_t c);
 
 // Convert a codepoint string into a UTF8-string.
+//
 // If the codepoint string contains invalid codepoints, they're silently dropped.
 //
 std::string utf32_to_utf8(const std::u32string &cps);
diff --git a/luprex/cpp/wrap/wrap-string.hpp b/luprex/cpp/wrap/wrap-string.hpp
index df54cc62..f6f6f237 100644
--- a/luprex/cpp/wrap/wrap-string.hpp
+++ b/luprex/cpp/wrap/wrap-string.hpp
@@ -8,6 +8,8 @@ namespace eng {
 template<class C, class T=std::char_traits<C>>
 using basic_string = std::basic_string<C, T, eng::allocator<C>>;
 using string = basic_string<char>;
+using u32string = basic_string<char32_t>;
+using u16string = basic_string<char16_t>;
 } // namespace eng
 
 #endif // WRAP_STRING_HPP
diff --git a/luprex/ext/unicode-stuff.hpp b/luprex/ext/unicode-stuff.hpp
new file mode 100644
index 00000000..eb706361
--- /dev/null
+++ b/luprex/ext/unicode-stuff.hpp
@@ -0,0 +1,268 @@
+// This file implements unicode encoding conversions.
+//
+// Unicode conversions aren't that complicated.  It is possible
+// to implement them in a few hundred lines of code.  Most unicode
+// libraries are much larger because they also implement many
+// other pieces of functionality.  I don't need anything but
+// conversions.  So I implemented my own tiny library.
+//
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+// Unicode encoding conversions, parameterized on the string types to return
+// so that callers can supply strings that use their own allocator.
+template <class U8STR, class U16STR, class U32STR>
+class UnicodeStuff
+{
+public:
+    using u8string = U8STR;
+    using u16string = U16STR;
+    using u32string = U32STR;
+
+    // Convert a single UTF32 codepoint into a UTF8 string.
+    //
+    // The bytes are stored in a preallocated buffer, which needs room for
+    // up to 4 bytes; no terminator is written.  The number of bytes stored
+    // is returned.  If it returns 0, nothing was stored: the value is a
+    // surrogate or is above U+10FFFF, so it's not a valid codepoint.
+    //
+    static int codepoint_to_utf8(char32_t scp, char *buffer) {
+        const uint32_t cp = static_cast<uint32_t>(scp);
+        unsigned char *c = reinterpret_cast<unsigned char *>(buffer);
+        if (cp <= 0x7F) {
+            c[0] = static_cast<unsigned char>(cp);
+            return 1;
+        }
+        else if (cp <= 0x7FF) {
+            c[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
+            c[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
+            return 2;
+        }
+        else if (cp <= 0xFFFF) {
+            if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
+                return 0; // UTF-16 surrogates are not encodable codepoints.
+            }
+            c[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
+            c[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
+            c[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
+            return 3;
+        }
+        else if (cp <= 0x10FFFF) {
+            c[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
+            c[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
+            c[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
+            c[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
+            return 4;
+        } else {
+            return 0;
+        }
+    }
+
+    // Read a single codepoint from a UTF16 string.
+    //
+    // Returns -1 if the string is empty.  Returns -2 if the string starts
+    // with an unpaired surrogate; the offending code unit is still removed
+    // from the view, so callers can simply call again.  On success, the
+    // code units that made up the codepoint are removed from the view.
+    //
+    static int32_t read_codepoint_utf16(std::u16string_view &source) {
+        if (source.empty()) return -1;
+
+        const int32_t word0 = static_cast<int32_t>(source[0]);
+        source.remove_prefix(1);
+
+        if (word0 < 0xD800) {
+            return word0;
+        } else if (word0 < 0xDC00) {
+            // High surrogate: only valid when a low surrogate follows.
+            if (source.empty()) {
+                return -2;
+            }
+            const int32_t word1 = static_cast<int32_t>(source[0]);
+            if ((word1 < 0xDC00)||(word1 > 0xDFFF)) {
+                return -2;
+            }
+            const int32_t result = (((word0 & 0x3FF) << 10) | (word1 & 0x3FF)) + 0x10000;
+            source.remove_prefix(1);
+            return result;
+        } else if (word0 < 0xE000) {
+            return -2;
+        } else {
+            return word0;
+        }
+    }
+
+    // Read a single codepoint from a UTF8 string.
+    //
+    // If the string_view starts with a valid codepoint, the codepoint
+    // is removed from the string_view and is returned.
+    //
+    // If the string_view is empty, or starts with an unfinished but
+    // possibly valid codepoint, returns -1.
+    //
+    // If the string_view starts with an invalid sequence, returns -2.
+    // Invalid means: a bad lead or continuation byte, an overlong
+    // encoding, a surrogate, or a value above U+10FFFF.
+    //
+    // In both failure cases the string_view is left untouched.
+    //
+    static int32_t read_codepoint_utf8(std::string_view &source) {
+        const size_t size = source.size();
+        if (size == 0) return -1;
+
+        const unsigned char *bytes = reinterpret_cast<const unsigned char *>(source.data());
+        int32_t codepoint;
+        size_t seqlen;
+        if ((bytes[0] & 0x80) == 0x00) {
+            // U+0000 to U+007F
+            codepoint = (bytes[0] & 0x7F);
+            seqlen = 1;
+        } else if ((bytes[0] & 0xE0) == 0xC0) {
+            // U+0080 to U+07FF
+            if (size < 2) return -1;
+            if ((bytes[1] & 0xC0) != 0x80) return -2;
+            codepoint = ((bytes[0] & 0x1F) << 6) | (bytes[1] & 0x3F);
+            if (codepoint < 0x80) return -2; // overlong encoding
+            seqlen = 2;
+        } else if ((bytes[0] & 0xF0) == 0xE0) {
+            // U+0800 to U+FFFF
+            if (size < 3) return -1;
+            if ((bytes[1] & 0xC0) != 0x80) return -2;
+            if ((bytes[2] & 0xC0) != 0x80) return -2;
+            codepoint = ((bytes[0] & 0x0F) << 6) | (bytes[1] & 0x3F);
+            codepoint = (codepoint << 6) | (bytes[2] & 0x3F);
+            if (codepoint < 0x800) return -2; // overlong encoding
+            seqlen = 3;
+        } else if ((bytes[0] & 0xF8) == 0xF0) {
+            // U+10000 to U+10FFFF
+            if (size < 4) return -1;
+            if ((bytes[1] & 0xC0) != 0x80) return -2;
+            if ((bytes[2] & 0xC0) != 0x80) return -2;
+            if ((bytes[3] & 0xC0) != 0x80) return -2;
+            codepoint = ((bytes[0] & 0x07) << 6) | (bytes[1] & 0x3F);
+            codepoint = (codepoint << 6) | (bytes[2] & 0x3F);
+            codepoint = (codepoint << 6) | (bytes[3] & 0x3F);
+            if (codepoint < 0x10000) return -2; // overlong encoding
+            if (codepoint >= 0x110000) return -2;
+            seqlen = 4;
+        } else {
+            return -2;
+        }
+
+        if ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) {
+            return -2;
+        }
+
+        source.remove_prefix(seqlen);
+        return codepoint;
+    }
+
+    // Convert a codepoint string into a UTF8-string.
+    //
+    // If the codepoint string contains invalid codepoints, they're silently dropped.
+    //
+    static u8string utf32_to_utf8(const u32string &s) {
+        // Worst case is 4 bytes per codepoint; trimmed to the real size below.
+        u8string result(s.size() * 4, 0);
+        size_t len = 0;
+        for (char32_t c : s) {
+            len += codepoint_to_utf8(c, &result[len]);
+        }
+        result.resize(len);
+        return result;
+    }
+
+    // Convert a UTF8 string to a UTF32 string.
+    //
+    // If the UTF8 string contains invalid sequences, they're silently dropped.
+    // Some of the bytes may not be consumed, if the source ends with an unfinished
+    // utf-8 sequence.  Returns the Codepoint string and the number of bytes consumed.
+    // You may pass nullptr for consumed if you don't care how many bytes were
+    // consumed.
+    //
+    static u32string utf8_to_utf32(std::string_view s, int *consumed) {
+        std::string_view rest = s;
+        u32string result(s.size(), 0); // At most one codepoint per byte.
+        size_t len = 0;
+        while (true) {
+            const int32_t c = read_codepoint_utf8(rest);
+            if (c == -1) {
+                break; // Nothing left, or only an unfinished sequence.
+            } else if (c < 0) {
+                rest.remove_prefix(1); // Skip one bad byte and try again.
+            } else {
+                result[len++] = static_cast<char32_t>(c);
+            }
+        }
+        if (consumed != nullptr) {
+            *consumed = static_cast<int>(s.size() - rest.size());
+        }
+        result.resize(len);
+        return result;
+    }
+
+    // Convert a UTF8 string to a UCS-2 string.
+    //
+    // If the UTF8 string contains invalid sequences, they're silently dropped.
+    // Some of the bytes may not be consumed, if the source ends with an unfinished
+    // utf-8 sequence.  Returns the UCS-2 string and the number of bytes consumed.
+    // You may pass nullptr for consumed if you don't care.
+    // Of course, UCS-2 can't represent all of unicode, so this is lossy.
+    // Any character that can't be represented is replaced with a box.
+    //
+    static u16string utf8_to_ucs2(std::string_view s, int *consumed) {
+        std::string_view rest = s;
+        u16string result(s.size(), 0); // At most one code unit per byte.
+        size_t len = 0;
+        while (true) {
+            const int32_t c = read_codepoint_utf8(rest);
+            if (c == -1) {
+                break; // Nothing left, or only an unfinished sequence.
+            } else if (c < 0) {
+                rest.remove_prefix(1); // Skip one bad byte and try again.
+            } else if (c > 0xFFFF) {
+                result[len++] = 0x2610; // U+2610 BALLOT BOX replaces non-BMP.
+            } else {
+                // Surrogates never get here: read_codepoint_utf8 rejects them.
+                result[len++] = static_cast<char16_t>(c);
+            }
+        }
+        if (consumed != nullptr) {
+            *consumed = static_cast<int>(s.size() - rest.size());
+        }
+        result.resize(len);
+        return result;
+    }
+
+    // Convert a UTF16 string to a UTF8 string.
+    //
+    // This also works for ucs2 strings.  If the UTF16 string
+    // contains invalid sequences, they're silently dropped.
+    //
+    static u8string utf16_to_utf8(std::u16string_view s) {
+        u8string result(s.size() * 4, 0);
+        size_t len = 0;
+        while (true) {
+            const int32_t codepoint = read_codepoint_utf16(s);
+            if (codepoint == -1) break;
+            if (codepoint < 0) continue; // The bad code unit was already consumed.
+            len += codepoint_to_utf8(static_cast<char32_t>(codepoint), &result[len]);
+        }
+        result.resize(len);
+        return result;
+    }
+
+    // Check if UTF8 is valid.  A string that ends with an unfinished
+    // sequence is reported as invalid.
+    //
+    static bool valid_utf8(std::string_view s) {
+        while (!s.empty()) {
+            if (read_codepoint_utf8(s) < 0) return false;
+        }
+        return true;
+    }
+};
\ No newline at end of file