More refactors to prepare for doc-search, including moving Unicode support into ext.

2026-01-14 12:30:44 -05:00
parent 4e374294b6
commit 850b4aa43b
10 changed files with 334 additions and 254 deletions
--- a/luprex/cpp/core/util.cpp
+++ b/luprex/cpp/core/util.cpp
@@ -4,6 +4,8 @@
 #include "fast-float.hpp"
 #include "luastack.hpp"

+#include "../../ext/unicode-stuff.hpp"
+
 #include <algorithm>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -13,7 +15,6 @@
 #include <cmath>
 #include <charconv>

-
 namespace sv {

 bool case_insensitive_eq(string_view s1, string_view s2) {
@@ -336,73 +337,16 @@ int32_t read_ascii_char(string_view &source) {
    return result;
 }

-int32_t read_codepoint_utf8(std::string_view &source) {
-    size_t size = source.size();
-    if (size == 0) return -1;
-
-    const unsigned char *bytes = (const unsigned char *)source.data();
-    int codepoint;
-    size_t seqlen;
-    if ((bytes[0] & 0x80) == 0x00) {
-        // U+0000 to U+007F
-        codepoint = (bytes[0] & 0x7F);
-        seqlen = 1;
-    } else if ((bytes[0] & 0xE0) == 0xC0) {
-        // U+0080 to U+07FF
-        codepoint = (bytes[0] & 0x1F);
-        seqlen = 2;
-    } else if ((bytes[0] & 0xF0) == 0xE0) {
-        // U+0800 to U+FFFF
-        codepoint = (bytes[0] & 0x0F);
-        seqlen = 3;
-    } else if ((bytes[0] & 0xF8) == 0xF0) {
-        // U+10000 to U+10FFFF
-        codepoint = (bytes[0] & 0x07);
-        seqlen = 4;
-    } else {
-        // Bad character. return invalid CP.
-        return -2;
-    }
-
-    if (seqlen > size) {
-        return -1;
-    }
-
-    for (size_t i = 1; i < seqlen; ++i) {
-        if ((bytes[i] & 0xC0) != 0x80) {
-            // Bad character. return invalid CP.
-            return -2;
-        }
-        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
-    }
-
-    if ((codepoint > 0x10FFFF) ||
-        ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
-        ((codepoint <= 0x007F) && (seqlen != 1)) ||
-        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
-        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
-        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
-        // Bad character. return invalid CP.
-        return -2;
-    }
-
-    source.remove_prefix(seqlen);
-    return codepoint;
-}
-
-bool valid_utf8(string_view s) {
-    while (!s.empty()) {
-        int32_t codepoint = read_codepoint_utf8(s);
-        if (codepoint < 0) return false;
-    }
-    return true;
-}
-
 bool valid_number(string_view s, bool plus, bool minus, bool dec, bool exp) {
    read_number(s, plus, minus, dec, exp);
    return s.empty();
 }

+using UC = UnicodeStuff<eng::string, eng::u16string, eng::u32string>; // Shorthand for UnicodeStuff bound to the eng string types; presumably declared in ext/unicode-stuff.hpp (verify).
+
+int32_t read_codepoint_utf8(string_view &source) { return UC::read_codepoint_utf8(source); } // Thin forwarder to UC. Per the unittests_util assertions: yields the decoded codepoint and consumes its bytes from `source`, -1 on empty input, -2 on a malformed sequence.
+bool valid_utf8(string_view s) { return UC::valid_utf8(s); } // Thin forwarder to UC::valid_utf8; `s` is taken by value, so the caller's view is not modified.
+
 } // namespace sv


@@ -989,6 +933,23 @@ LuaDefine(unittests_util, "", "some unit tests") {
    LuaAssert(L, read_number_x("-123e+5x", true, true, true, true) == "-123e+5");
    LuaAssert(L, read_number_x("-123e+x", true, true, true, true) == "");
    
+    // Test read_codepoint_utf8.
+    std::string_view str("𝞮ὥπq");
+    LuaAssert(L, str.size() == 10);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1D7AE); // 4-byte char
+    LuaAssert(L, str.size() == 6);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1F65); // 3-byte char
+    LuaAssert(L, str.size() == 3);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x3C0); // 2-byte char
+    LuaAssert(L, str.size() == 1);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == 0x71); // 1-byte char
+    LuaAssert(L, str.size() == 0);
+    LuaAssert(L, sv::read_codepoint_utf8(str) == -1); // EOF
+
+    // Test read_codepoint_utf8 on an invalid UTF-8 sequence.
+    std::string_view strbad("\xC0\xC0");
+    LuaAssert(L, sv::read_codepoint_utf8(strbad) == -2);
+
    return 0;
 }