Lots of work on Unicode support

2023-05-19 00:23:23 -04:00
parent a25213d259
commit 7e25be10a4
10 changed files with 249 additions and 228 deletions
--- a/luprex/cpp/core/json.cpp
+++ b/luprex/cpp/core/json.cpp
@@ -136,10 +136,10 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
    std::string_view str(s, len);
    oss << '"';
    if (sv::valid_utf8(str) && !sv::has_prefix(str, "")) {
-        // Output the string in the straightforward way,
-        // using traditional json escaping.
-        for (char c : str) {
-            switch (c) {
+        while (!str.empty()) {
+            int32_t cp = sv::read_codepoint_utf8(str);
+            assert(cp >= 0);
+            switch (cp) {
                case '\\': oss << "\\\\"; break;
                case '"' : oss << "\\\""; break;
                case '\b': oss << "\\b"; break;
@@ -148,10 +148,11 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
                case '\n': oss << "\\n"; break;
                case '\t': oss << "\\t"; break;
                default: {
-                    if (c < 32) {
-                        oss << "\\u" << util::hex16.val(c);
+                    if (cp < 32) {
+                        oss << "\\u" << util::hex16.val(cp);
                    } else {
-                        oss << c;
+                        bool ok = util::write_codepoint_utf8(cp, &oss);
+                        assert(ok);
                    }
                }
            }
--- a/luprex/cpp/core/util.cpp
+++ b/luprex/cpp/core/util.cpp
@@ -298,7 +298,7 @@ int32_t read_ascii_char(string_view &source) {
    return result;
 }

-int32_t read_codepoint_utf8(string_view &source) {
+int32_t read_codepoint_utf8(std::string_view &source) {
    size_t size = source.size();
    if (size == 0) return -1;

@@ -322,9 +322,8 @@ int32_t read_codepoint_utf8(string_view &source) {
        codepoint = (bytes[0] & 0x07);
        seqlen = 4;
    } else {
-        // Bad character. Drop a byte and return invalid CP.
-        source.remove_prefix(1);
-        return 1;
+        // Bad character. Return -2 (invalid) without consuming any input.
+        return -2;
    }

    if (seqlen > size) {
@@ -333,9 +332,8 @@ int32_t read_codepoint_utf8(string_view &source) {

    for (size_t i = 1; i < seqlen; ++i) {
        if ((bytes[i] & 0xC0) != 0x80) {
-            // Bad character. Drop a byte and return invalid CP.
-            source.remove_prefix(1);
-            return 1;
+            // Bad character. Return -2 (invalid) without consuming any input.
+            return -2;
        }
        codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
    }
@@ -346,17 +344,15 @@ int32_t read_codepoint_utf8(string_view &source) {
        ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
        ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
        ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
-        // Bad character. Drop a byte and return invalid CP.
-        source.remove_prefix(1);
-        return 1;
+        // Bad character. Return -2 (invalid) without consuming any input.
+        return -2;
    }

    source.remove_prefix(seqlen);
    return codepoint;
 }

-bool valid_utf8(string_view s)
-{
+bool valid_utf8(string_view s) {
    while (!s.empty()) {
        int32_t codepoint = read_codepoint_utf8(s);
        if (codepoint < 0) return false;
@@ -403,27 +399,32 @@ void quote_string(const eng::string &s, std::ostream *os) {
    }
    bool usesinglequote = (!anysq)||(anydq);
    (*os) << (usesinglequote ? '\'' : '"');
-    for (char c : s) {
-        if (c >= 32) {
-            if (c == '"') {
-                (*os) << (usesinglequote ? "\"" : "\\\"");
-            } else if (c == '\'') {
-                (*os) << (usesinglequote ? "\\'" : "'");
-            } else if (c == '\\') {
-                (*os) << "\\\\"; 
-            } else {
-                (*os) << c;
-            }
-        } else {
-            unsigned int value = ((unsigned char)c);
-            switch (c) {
+    std::string_view str(s);
+    while (!str.empty()) {
+        unsigned char c0 = (unsigned char)(str[0]);
+        int cp = sv::read_codepoint_utf8(str);
+        if (cp < 0) {
+            (*os) << "\\" << dec.width(3).fill('0').val(c0);
+            str.remove_prefix(1);
+        } else if (cp < 32) {
+            c0 = ((unsigned char)cp);
+            switch (c0) {
            case '\n': (*os) << "\\n"; break;
            case '\t': (*os) << "\\t"; break;
            case '\r': (*os) << "\\r"; break;
+            case '\b': (*os) << "\\b"; break;
            default:
-                (*os) << "\\" << dec.width(3).fill('0').val(value);
+                (*os) << "\\" << dec.width(3).fill('0').val(c0);
                break;
            }
+        } else if (cp == '"') {
+            (*os) << (usesinglequote ? "\"" : "\\\"");
+        } else if (cp == '\'') {
+            (*os) << (usesinglequote ? "\\'" : "'");
+        } else if (cp == '\\') {
+            (*os) << "\\\\"; 
+        } else {
+            write_codepoint_utf8(cp, os);
        }
    }
    (*os) << (usesinglequote ? '\'' : '"');
@@ -656,50 +657,52 @@ eng::string toupper(eng::string input) {
    return input;
 }

-static void buffer_codepoint_utf8(int32_t scp, char *buffer) {
+static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
    uint32_t cp = (uint32_t)scp;
    unsigned char *c = (unsigned char *)buffer;
-    if (cp <= 0x7F) {
+    if (cp < 0) {
+        return 0;
+    }
+    else if (cp <= 0x7F) {
        c[0] = cp;
-        c[1] = 0;
+        return 1;
    }
    else if (cp <= 0x7FF) {
        c[0] = (cp>>6)+192;
        c[1] = (cp&63)+128;
-        c[2] = 0;
+        return 2;
    }
    else if (cp <= 0xFFFF) {
-        if (0xd800 <= cp && cp <= 0xdfff) {
-            c[0] = 0;
-        } else {
-            c[0] = (cp>>12)+224;
-            c[1] = ((cp>>6)&63)+128;
-            c[2] = (cp&63)+128;
-            c[3] = 0;
+        if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
+            return 0;
        }
+        c[0] = (cp>>12)+224;
+        c[1] = ((cp>>6)&63)+128;
+        c[2] = (cp&63)+128;
+        return 3;
    }
    else if (cp <= 0x10FFFF) {
        c[0] = (cp>>18)+240;
        c[1] = ((cp>>12)&63)+128;
        c[2] = ((cp>>6)&63)+128;
        c[3] = (cp&63)+128;
-        c[4] = 0;
+        return 4;
    } else {
-        c[0] = 0;
+        return 0;
    }
 }

 eng::string get_codepoint_utf8(uint32_t cp) {
-    char buffer[5];
-    buffer_codepoint_utf8(cp, buffer);
-    return eng::string(buffer);
+    char buffer[4];
+    int len = buffer_codepoint_utf8(cp, buffer);
+    return eng::string(buffer, len);
 }

 bool write_codepoint_utf8(int32_t cp, std::ostream *s) {
-    char buffer[5];
-    buffer_codepoint_utf8(cp, buffer);
-    (*s) << buffer;
-    return buffer[0] != 0;
+    char buffer[4];
+    int len = buffer_codepoint_utf8(cp, buffer);
+    (*s) << std::string_view(buffer, len);
+    return (len > 0);
 }

 double distance_squared(double x1, double y1, double x2, double y2) {
--- a/luprex/cpp/core/util.hpp
+++ b/luprex/cpp/core/util.hpp
@@ -181,8 +181,14 @@ int32_t read_ascii_char(string_view &source);

 // Read a UTF8 codepoint from a string_view.
 //
-// If the next thing in the string_view isn't a valid
-// codepoint, returns -1 and doesn't update the view.
+// If the string_view is empty, returns -1 and doesn't update
+// the string_view.
+//
+// If the string_view contains an unfinished but possibly valid
+// codepoint, returns -1 and doesn't update the string_view.
+//
+// If the next bytes are not valid UTF-8 (bad lead or continuation byte, or
+// wrong sequence length), returns -2 and doesn't update the string_view.
 //
 int32_t read_codepoint_utf8(string_view &source);