From 850b4aa43b6461caea460f6876d47239428e2df0 Mon Sep 17 00:00:00 2001 From: jyelon Date: Wed, 14 Jan 2026 12:30:44 -0500 Subject: [PATCH] More refactors to prepare for doc-search, including moving unicode support into ext. --- Content/Luprex/SimpleColorMaterial.uasset | 3 + Integration.code-workspace.tpl.json | 2 +- luprex/cpp/core/source.cpp | 18 ++ luprex/cpp/core/source.hpp | 7 + luprex/cpp/core/util.cpp | 87 ++----- luprex/cpp/core/util.hpp | 12 +- luprex/cpp/drv/drvutil.cpp | 188 +-------------- luprex/cpp/drv/drvutil.hpp | 1 + luprex/cpp/wrap/wrap-string.hpp | 2 + luprex/ext/unicode-stuff.hpp | 268 ++++++++++++++++++++++ 10 files changed, 334 insertions(+), 254 deletions(-) create mode 100644 Content/Luprex/SimpleColorMaterial.uasset create mode 100644 luprex/ext/unicode-stuff.hpp diff --git a/Content/Luprex/SimpleColorMaterial.uasset b/Content/Luprex/SimpleColorMaterial.uasset new file mode 100644 index 00000000..83f05f87 --- /dev/null +++ b/Content/Luprex/SimpleColorMaterial.uasset @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a8703f0afcf0e908ccb7934b5527b202d7b863099c99e371edf3823cfc709ba +size 11196 diff --git a/Integration.code-workspace.tpl.json b/Integration.code-workspace.tpl.json index 2953c07e..a8650da6 100644 --- a/Integration.code-workspace.tpl.json +++ b/Integration.code-workspace.tpl.json @@ -47,7 +47,7 @@ "--header-insertion=never" ], "C_Cpp.autocomplete": "disabled", - "search.useIgnoreFiles": false + "search.useIgnoreFiles": true }, "extensions": { "recommendations": [ diff --git a/luprex/cpp/core/source.cpp b/luprex/cpp/core/source.cpp index a6f3114d..73fb0590 100644 --- a/luprex/cpp/core/source.cpp +++ b/luprex/cpp/core/source.cpp @@ -545,6 +545,24 @@ void SourceDB::register_lua_builtins() { lua_close(L); } + + +util::StringVec SourceDB::search_docs(const eng::string &substring) { + // This map will hold the results. It maps function name + // to a documentation line. 
+ eng::map results; + + // Search the built-in functions. + // for (const LuaFunctionReg *reg = LuaFunctionReg::All; reg != nullptr; reg=reg->next()) { + // } + + util::StringVec resultvec; + for (const auto &pair : results) { + resultvec.push_back(pair.second); + } + return resultvec; +} + eng::string SourceDB::function_docs(const LuaCoreStack &LS, LuaSlot fn) { lua_State *L = LS.state(); if (LS.iscfunction(fn)) { diff --git a/luprex/cpp/core/source.hpp b/luprex/cpp/core/source.hpp index d3ba462a..41f80a60 100644 --- a/luprex/cpp/core/source.hpp +++ b/luprex/cpp/core/source.hpp @@ -204,6 +204,13 @@ public: // eng::string function_docs(const LuaCoreStack &LS, LuaSlot slot); + // Search the documentation. + // + // Search all the documentation for the specified substring. + // In the result, each line points to a different result. + // + util::StringVec search_docs(const eng::string &substring); + // Serialize and unserialize a source vector. // static void serialize_source(const util::LuaSourceVec &sv, StreamBuffer *sb); diff --git a/luprex/cpp/core/util.cpp b/luprex/cpp/core/util.cpp index c1098f5a..fc6e1c9b 100644 --- a/luprex/cpp/core/util.cpp +++ b/luprex/cpp/core/util.cpp @@ -4,6 +4,8 @@ #include "fast-float.hpp" #include "luastack.hpp" +#include "../../ext/unicode-stuff.hpp" + #include #include #include @@ -13,7 +15,6 @@ #include #include - namespace sv { bool case_insensitive_eq(string_view s1, string_view s2) { @@ -336,73 +337,16 @@ int32_t read_ascii_char(string_view &source) { return result; } -int32_t read_codepoint_utf8(std::string_view &source) { - size_t size = source.size(); - if (size == 0) return -1; - - const unsigned char *bytes = (const unsigned char *)source.data(); - int codepoint; - size_t seqlen; - if ((bytes[0] & 0x80) == 0x00) { - // U+0000 to U+007F - codepoint = (bytes[0] & 0x7F); - seqlen = 1; - } else if ((bytes[0] & 0xE0) == 0xC0) { - // U+0080 to U+07FF - codepoint = (bytes[0] & 0x1F); - seqlen = 2; - } else if ((bytes[0] & 0xF0) == 
0xE0) { - // U+0800 to U+FFFF - codepoint = (bytes[0] & 0x0F); - seqlen = 3; - } else if ((bytes[0] & 0xF8) == 0xF0) { - // U+10000 to U+10FFFF - codepoint = (bytes[0] & 0x07); - seqlen = 4; - } else { - // Bad character. return invalid CP. - return -2; - } - - if (seqlen > size) { - return -1; - } - - for (size_t i = 1; i < seqlen; ++i) { - if ((bytes[i] & 0xC0) != 0x80) { - // Bad character. return invalid CP. - return -2; - } - codepoint = (codepoint << 6) | (bytes[i] & 0x3F); - } - - if ((codepoint > 0x10FFFF) || - ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) || - ((codepoint <= 0x007F) && (seqlen != 1)) || - ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) || - ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) || - ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) { - // Bad character. return invalid CP. - return -2; - } - - source.remove_prefix(seqlen); - return codepoint; -} - -bool valid_utf8(string_view s) { - while (!s.empty()) { - int32_t codepoint = read_codepoint_utf8(s); - if (codepoint < 0) return false; - } - return true; -} - bool valid_number(string_view s, bool plus, bool minus, bool dec, bool exp) { read_number(s, plus, minus, dec, exp); return s.empty(); } +using UC = UnicodeStuff; + +int32_t read_codepoint_utf8(string_view &source) { return UC::read_codepoint_utf8(source); } +bool valid_utf8(string_view s) { return UC::valid_utf8(s); } + } // namespace sv @@ -989,6 +933,23 @@ LuaDefine(unittests_util, "", "some unit tests") { LuaAssert(L, read_number_x("-123e+5x", true, true, true, true) == "-123e+5"); LuaAssert(L, read_number_x("-123e+x", true, true, true, true) == ""); + // Test read_codepoint_utf8. 
+ std::string_view str("𝞮ὥπq"); + LuaAssert(L, str.size() == 10); + LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1D7AE); // 4-byte char + LuaAssert(L, str.size() == 6); + LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1F65); // 3-byte char + LuaAssert(L, str.size() == 3); + LuaAssert(L, sv::read_codepoint_utf8(str) == 0x3C0); // 2-byte char + LuaAssert(L, str.size() == 1); + LuaAssert(L, sv::read_codepoint_utf8(str) == 0x71); // 1-byte char + LuaAssert(L, str.size() == 0); + LuaAssert(L, sv::read_codepoint_utf8(str) == -1); // EOF + + // Test read_codepoint_utf8 on an invalid unicode sequence. + std::string_view strbad("\xC0\xC0"); + LuaAssert(L, sv::read_codepoint_utf8(strbad) == -2); + return 0; } diff --git a/luprex/cpp/core/util.hpp b/luprex/cpp/core/util.hpp index abb7551d..439a9130 100644 --- a/luprex/cpp/core/util.hpp +++ b/luprex/cpp/core/util.hpp @@ -196,18 +196,14 @@ int32_t read_ascii_char(string_view &source); // Read a UTF8 codepoint from a string_view. // -// If the string_view is empty, returns -1 and doesn't update -// the string_view. -// -// If the string_view contains an unfinished but possibly valid -// codepoint, returns -1 and doesn't update the string_view. -// -// If the next thing in the string_view is an invalid codepoint, -// returns -2 and doesn't update the string_view. +// See documentation in unicode-stuff.hpp // int32_t read_codepoint_utf8(string_view &source); // Return true if the string is valid utf-8. +// +// See documentation in unicode-stuff.hpp +// bool valid_utf8(string_view s); // Return true if the number conforms to the spec. 
diff --git a/luprex/cpp/drv/drvutil.cpp b/luprex/cpp/drv/drvutil.cpp index 663cfadd..4fdbd3d1 100644 --- a/luprex/cpp/drv/drvutil.cpp +++ b/luprex/cpp/drv/drvutil.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "../../ext/unicode-stuff.hpp" namespace drvutil { @@ -74,189 +75,12 @@ bool is_single_wchar_t(char32_t c) { return false; } -static int buffer_codepoint_utf8(char32_t scp, char *buffer) { - uint32_t cp = (uint32_t)scp; - unsigned char *c = (unsigned char *)buffer; - if (cp < 0) { - return 0; - } - else if (cp <= 0x7F) { - c[0] = cp; - return 1; - } - else if (cp <= 0x7FF) { - c[0] = (cp>>6)+192; - c[1] = (cp&63)+128; - return 2; - } - else if (cp <= 0xFFFF) { - if ((cp >= 0xD800) && (cp <= 0xDFFF)) { - return 0; - } - c[0] = (cp>>12)+224; - c[1] = ((cp>>6)&63)+128; - c[2] = (cp&63)+128; - return 3; - } - else if (cp <= 0x10FFFF) { - c[0] = (cp>>18)+240; - c[1] = ((cp>>12)&63)+128; - c[2] = ((cp>>6)&63)+128; - c[3] = (cp&63)+128; - return 4; - } else { - return 0; - } -} +using UC = UnicodeStuff; -static int32_t read_codepoint_utf16(std::u16string_view &source) { - if (source.empty()) return -1; - - int32_t word0 = ((const uint16_t *)source.data())[0]; - source.remove_prefix(1); - - if (word0 < 0xD800) { - return word0; - } else if (word0 < 0xDC00) { - if (source.empty()) { - return -2; - } - int32_t word1 = ((const uint16_t *)source.data())[0]; - if ((word1 < 0xDC00)||(word1 > 0xDFFF)) { - return -2; - } - int32_t part1 = word0 & 0x3FF; - int32_t part2 = word1 & 0x3FF; - int32_t result = ((part1 << 10) | part2) + 0x10000; - source.remove_prefix(1); - return result; - } else if (word0 < 0xE000) { - return -2; - } else { - return word0; - } -} - -static int32_t read_codepoint_utf8(std::string_view &source) { - size_t size = source.size(); - if (size == 0) return -1; - - const unsigned char *bytes = (const unsigned char *)source.data(); - int codepoint; - size_t seqlen; - if ((bytes[0] & 0x80) == 0x00) { - // U+0000 to U+007F - codepoint = (bytes[0] & 
0x7F); - seqlen = 1; - } else if ((bytes[0] & 0xE0) == 0xC0) { - // U+0080 to U+07FF - codepoint = (bytes[0] & 0x1F); - seqlen = 2; - } else if ((bytes[0] & 0xF0) == 0xE0) { - // U+0800 to U+FFFF - codepoint = (bytes[0] & 0x0F); - seqlen = 3; - } else if ((bytes[0] & 0xF8) == 0xF0) { - // U+10000 to U+10FFFF - codepoint = (bytes[0] & 0x07); - seqlen = 4; - } else { - // Bad character. return invalid CP. - return -2; - } - - if (seqlen > size) { - return -1; - } - - for (size_t i = 1; i < seqlen; ++i) { - if ((bytes[i] & 0xC0) != 0x80) { - // Bad character. return invalid CP. - return -2; - } - codepoint = (codepoint << 6) | (bytes[i] & 0x3F); - } - - if ((codepoint > 0x10FFFF) || - ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) || - ((codepoint <= 0x007F) && (seqlen != 1)) || - ((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) || - ((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) || - ((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) { - // Bad character. return invalid CP. 
- return -2; - } - - source.remove_prefix(seqlen); - return codepoint; -} - -std::string utf32_to_utf8(const std::u32string &s) { - std::string result(s.size() * 4, 0); - char *buffer = &result[0]; - int len = 0; - for (char32_t c : s) { - int clen = buffer_codepoint_utf8(c, buffer + len); - len += clen; - } - return result.substr(0, len); -} - -std::u32string utf8_to_utf32(std::string_view s, int *consumed) { - std::string_view rest = s; - std::u32string result(s.size(), 0); - int len = 0; - while (true) { - int32_t c = read_codepoint_utf8(rest); - if (c == -1) { - break; // EOF reached; - } else if (c < 0) { - rest.remove_prefix(1); - } else { - result[len++] = (char32_t)c; - } - } - if (consumed != nullptr) { - *consumed = s.size() - rest.size(); - } - return result.substr(0, len); -} - -std::u16string utf8_to_ucs2(std::string_view s, int *consumed) { - std::string_view rest = s; - std::u16string result(s.size(), 0); - int len = 0; - while (true) { - int32_t c = read_codepoint_utf8(rest); - if (c == -1) { - break; // EOF reached; - } else if (c < 0) { - rest.remove_prefix(1); - } else if ((c >= 0xD800) && (c <= 0xDFFF)) { - result[len++] = 0x2610; - } else if (c > 0xFFFF) { - result[len++] = 0x2610; - } else { - result[len++] = (char16_t)c; - } - } - if (consumed != nullptr) { - *consumed = s.size() - rest.size(); - } - return result.substr(0, len); -} - -std::string utf16_to_utf8(std::u16string_view s) { - std::string result(s.size() * 4, 0); - int len = 0; - while (true) { - int codepoint = read_codepoint_utf16(s); - if (codepoint == -1) break; - if (codepoint < 0) continue; - len += buffer_codepoint_utf8(codepoint, &result[len]); - } - return result.substr(0, len); -} +std::string utf32_to_utf8(const std::u32string &s) { return UC::utf32_to_utf8(s); } +std::u32string utf8_to_utf32(std::string_view s, int *consumed) { return UC::utf8_to_utf32(s, consumed); } +std::u16string utf8_to_ucs2(std::string_view s, int *consumed) { return UC::utf8_to_ucs2(s, consumed); 
// This file implements unicode encoding conversions.
//
// Unicode conversions aren't that complicated. It is possible
// to implement them in a few hundred lines of code. Most unicode
// libraries are much larger because they also implement many
// other pieces of functionality. I don't need anything but
// conversions. So I implemented my own tiny library.
//

#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>

// Stateless collection of UTF-8 / UTF-16 / UTF-32 conversion routines.
//
// The class is parameterised on the owning string types that the
// conversion functions return, so the same code can produce strings
// that use different allocators. Inputs are taken as standard
// string views wherever possible.
//
// NOTE(review): the template parameter list was reconstructed from the
// member aliases below; confirm the parameter order against the callers.
//
template <class U8STR, class U16STR, class U32STR>
class UnicodeStuff
{
public:
    using u8string = U8STR;
    using u16string = U16STR;
    using u32string = U32STR;

    // Convert a single UTF32 codepoint into a UTF8 sequence.
    //
    // The bytes are written to a preallocated buffer, which must have
    // room for at least 4 bytes. No terminator is written. Returns the
    // number of bytes written (1 to 4). Returns 0, and writes nothing,
    // if the value is a surrogate (U+D800..U+DFFF) or is above U+10FFFF.
    //
    static int codepoint_to_utf8(char32_t scp, char *buffer) {
        const uint32_t cp = static_cast<uint32_t>(scp);
        unsigned char *out = reinterpret_cast<unsigned char *>(buffer);
        if (cp <= 0x7F) {
            out[0] = static_cast<unsigned char>(cp);
            return 1;
        }
        if (cp <= 0x7FF) {
            out[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
            out[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 2;
        }
        if (cp <= 0xFFFF) {
            // Surrogates are not codepoints; they only exist in UTF16.
            if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
                return 0;
            }
            out[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
            out[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            out[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 3;
        }
        if (cp <= 0x10FFFF) {
            out[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
            out[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
            out[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            out[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 4;
        }
        return 0;
    }

    // Read a single codepoint from a UTF16 string.
    //
    // Returns -1 if the string is empty. Returns -2 if the string
    // starts with an invalid sequence (a lone low surrogate, or a high
    // surrogate that is not followed by a low surrogate).
    //
    // On success the code unit(s) of the codepoint are removed from the
    // view. On -2 only the offending first code unit is removed, so a
    // caller can simply keep reading to skip bad input.
    //
    // The return type is signed so that the -1 / -2 sentinels can be
    // tested with `< 0`, the same as read_codepoint_utf8.
    //
    static int32_t read_codepoint_utf16(std::u16string_view &source) {
        if (source.empty()) return -1;

        const int32_t word0 = static_cast<uint16_t>(source[0]);
        source.remove_prefix(1);

        // Anything outside the surrogate range stands for itself.
        if ((word0 < 0xD800) || (word0 > 0xDFFF)) {
            return word0;
        }
        // A low surrogate cannot start a pair.
        if (word0 >= 0xDC00) {
            return -2;
        }
        // High surrogate: it must be followed by a low surrogate.
        if (source.empty()) {
            return -2;
        }
        const int32_t word1 = static_cast<uint16_t>(source[0]);
        if ((word1 < 0xDC00) || (word1 > 0xDFFF)) {
            return -2;
        }
        source.remove_prefix(1);
        return (((word0 & 0x3FF) << 10) | (word1 & 0x3FF)) + 0x10000;
    }

    // Read a single codepoint from a UTF8 string.
    //
    // If the string_view starts with a valid codepoint, the codepoint
    // is removed from the string_view and is returned.
    //
    // If the string_view is empty, returns -1.
    //
    // If the string_view is shorter than the sequence length announced
    // by its first byte, returns -1 (unfinished, more input is needed).
    //
    // If the string_view starts with a finished but invalid sequence,
    // returns -2. Invalid means: a bad lead byte, a bad continuation
    // byte, an overlong encoding, a surrogate, or a value above U+10FFFF.
    //
    // The string_view is left untouched whenever a negative value is
    // returned.
    //
    static int32_t read_codepoint_utf8(std::string_view &source) {
        const size_t size = source.size();
        if (size == 0) return -1;

        const unsigned char *bytes =
            reinterpret_cast<const unsigned char *>(source.data());
        const unsigned char lead = bytes[0];

        size_t seqlen;      // Sequence length announced by the lead byte.
        int32_t codepoint;  // Payload bits gathered so far.
        int32_t minimum;    // Smallest value that needs this many bytes.
        if ((lead & 0x80) == 0x00) {
            // U+0000 to U+007F
            seqlen = 1;
            codepoint = lead;
            minimum = 0x0000;
        } else if ((lead & 0xE0) == 0xC0) {
            // U+0080 to U+07FF
            seqlen = 2;
            codepoint = lead & 0x1F;
            minimum = 0x0080;
        } else if ((lead & 0xF0) == 0xE0) {
            // U+0800 to U+FFFF
            seqlen = 3;
            codepoint = lead & 0x0F;
            minimum = 0x0800;
        } else if ((lead & 0xF8) == 0xF0) {
            // U+10000 to U+10FFFF
            seqlen = 4;
            codepoint = lead & 0x07;
            minimum = 0x10000;
        } else {
            // Stray continuation byte, or a lead byte that UTF8 forbids.
            return -2;
        }

        if (size < seqlen) {
            return -1;
        }

        for (size_t i = 1; i < seqlen; ++i) {
            if ((bytes[i] & 0xC0) != 0x80) {
                return -2;
            }
            codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
        }

        // Reject overlong encodings: every codepoint has exactly one
        // legal encoding, the shortest one. Accepting longer forms lets
        // the same character slip past byte-level comparisons.
        if (codepoint < minimum) {
            return -2;
        }
        if (codepoint > 0x10FFFF) {
            return -2;
        }
        if ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) {
            return -2;
        }

        source.remove_prefix(seqlen);
        return codepoint;
    }

    // Convert a codepoint string into a UTF8-string.
    //
    // If the codepoint string contains invalid codepoints, they're
    // silently dropped.
    //
    static u8string utf32_to_utf8(const u32string &s) {
        // Worst case is 4 bytes per codepoint; trimmed afterwards.
        u8string result(s.size() * 4, 0);
        size_t len = 0;
        for (const char32_t c : s) {
            len += static_cast<size_t>(codepoint_to_utf8(c, &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UTF32 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently
    // dropped one byte at a time. Some of the bytes may not be consumed,
    // if the source ends with an unfinished utf-8 sequence. Returns the
    // codepoint string, and stores the number of bytes consumed in
    // *consumed. You may pass nullptr for consumed if you don't care how
    // many bytes were consumed.
    //
    static u32string utf8_to_utf32(std::string_view s, int *consumed = nullptr) {
        std::string_view rest = s;
        u32string result;
        result.reserve(s.size());
        for (;;) {
            const int32_t c = read_codepoint_utf8(rest);
            if (c == -1) {
                break;  // End of input, or an unfinished trailing sequence.
            }
            if (c < 0) {
                rest.remove_prefix(1);  // Skip one bad byte and resync.
                continue;
            }
            result.push_back(static_cast<char32_t>(c));
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        return result;
    }

    // Convert a UTF8 string to a UCS-2 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently
    // dropped one byte at a time. Some of the bytes may not be consumed,
    // if the source ends with an unfinished utf-8 sequence. Returns the
    // UCS-2 string, and stores the number of bytes consumed in *consumed
    // (which may be nullptr). Of course, UCS-2 can't represent all of
    // unicode, so this is lossy. Any character that can't be represented
    // is replaced with a box (U+2610).
    //
    static u16string utf8_to_ucs2(std::string_view s, int *consumed = nullptr) {
        const char16_t box = 0x2610;
        std::string_view rest = s;
        u16string result;
        result.reserve(s.size());
        for (;;) {
            const int32_t c = read_codepoint_utf8(rest);
            if (c == -1) {
                break;  // End of input, or an unfinished trailing sequence.
            }
            if (c < 0) {
                rest.remove_prefix(1);  // Skip one bad byte and resync.
                continue;
            }
            const bool representable =
                (c <= 0xFFFF) && !((c >= 0xD800) && (c <= 0xDFFF));
            result.push_back(representable ? static_cast<char16_t>(c) : box);
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        return result;
    }

    // Convert a UTF16 string to a UTF8 string.
    //
    // This also works for ucs2 strings. If the UTF16 string
    // contains invalid sequences, they're silently dropped.
    //
    static u8string utf16_to_utf8(std::u16string_view s) {
        // Worst case is under 4 bytes per code unit; trimmed afterwards.
        u8string result(s.size() * 4, 0);
        size_t len = 0;
        for (;;) {
            const int32_t codepoint = read_codepoint_utf16(s);
            if (codepoint == -1) break;   // End of input.
            if (codepoint < 0) continue;  // Bad unit, already consumed.
            len += static_cast<size_t>(
                codepoint_to_utf8(static_cast<char32_t>(codepoint), &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Check if UTF8 is valid.
    //
    // Returns false for any invalid sequence, and also for a string
    // that ends in the middle of a sequence.
    //
    static bool valid_utf8(std::string_view s) {
        while (!s.empty()) {
            if (read_codepoint_utf8(s) < 0) return false;
        }
        return true;
    }
};