More refactors to prepare for doc-search, including moving unicode support into ext.

This commit is contained in:
2026-01-14 12:30:44 -05:00
parent 4e374294b6
commit 850b4aa43b
10 changed files with 334 additions and 254 deletions

View File

@@ -545,6 +545,24 @@ void SourceDB::register_lua_builtins() {
lua_close(L);
}
util::StringVec SourceDB::search_docs(const eng::string &substring) {
    // Matches are collected here, keyed by function name, with the
    // documentation line for that function as the value.
    //
    // NOTE: nothing populates this map yet (the built-in scan below is
    // still commented out), so the function currently returns an empty
    // vector and does not look at 'substring'.
    eng::map<eng::string, eng::string> matches;

    // Search the built-in functions.
    // for (const LuaFunctionReg *reg = LuaFunctionReg::All; reg != nullptr; reg=reg->next()) {
    // }

    // Flatten the map into the result vector: one documentation line
    // per entry, in the map's iteration order.
    util::StringVec lines;
    for (auto it = matches.begin(); it != matches.end(); ++it) {
        lines.push_back(it->second);
    }
    return lines;
}
eng::string SourceDB::function_docs(const LuaCoreStack &LS, LuaSlot fn) {
lua_State *L = LS.state();
if (LS.iscfunction(fn)) {

View File

@@ -204,6 +204,13 @@ public:
//
eng::string function_docs(const LuaCoreStack &LS, LuaSlot slot);
// Search the documentation.
//
// Search all the documentation for the specified substring.
// Each entry in the returned vector is the documentation line
// for one matching function.
//
util::StringVec search_docs(const eng::string &substring);
// Serialize and unserialize a source vector.
//
static void serialize_source(const util::LuaSourceVec &sv, StreamBuffer *sb);

View File

@@ -4,6 +4,8 @@
#include "fast-float.hpp"
#include "luastack.hpp"
#include "../../ext/unicode-stuff.hpp"
#include <algorithm>
#include <sys/types.h>
#include <sys/stat.h>
@@ -13,7 +15,6 @@
#include <cmath>
#include <charconv>
namespace sv {
bool case_insensitive_eq(string_view s1, string_view s2) {
@@ -336,73 +337,16 @@ int32_t read_ascii_char(string_view &source) {
return result;
}
// Decode one UTF-8 codepoint from the front of 'source'.
//
// On success, returns the codepoint and removes its bytes from the
// front of 'source'.
//
// Returns -1, leaving 'source' untouched, if 'source' is empty or holds
// fewer bytes than the lead byte calls for.
//
// Returns -2, leaving 'source' untouched, if the lead byte is not a
// legal UTF-8 lead byte, a continuation byte is malformed, or the
// decoded value is overlong, a surrogate, or above U+10FFFF.
//
int32_t read_codepoint_utf8(std::string_view &source) {
    if (source.empty()) return -1;

    const auto *data = reinterpret_cast<const unsigned char *>(source.data());
    const unsigned char lead = data[0];

    // Classify the lead byte: sequence length, payload bits carried by
    // the lead byte, and the smallest codepoint that legitimately needs
    // a sequence of that length.
    size_t length = 0;
    int32_t value = 0;
    int32_t smallest = 0;
    if (lead < 0x80) {
        // 0xxxxxxx : U+0000 to U+007F
        length = 1;
        value = lead;
        smallest = 0x0000;
    } else if ((lead >> 5) == 0x06) {
        // 110xxxxx : U+0080 to U+07FF
        length = 2;
        value = lead & 0x1F;
        smallest = 0x0080;
    } else if ((lead >> 4) == 0x0E) {
        // 1110xxxx : U+0800 to U+FFFF
        length = 3;
        value = lead & 0x0F;
        smallest = 0x0800;
    } else if ((lead >> 3) == 0x1E) {
        // 11110xxx : U+10000 to U+10FFFF
        length = 4;
        value = lead & 0x07;
        smallest = 0x10000;
    } else {
        // Stray continuation byte or an illegal lead byte.
        return -2;
    }

    // Not enough bytes yet. This is reported before the continuation
    // bytes are inspected, so a truncated sequence always yields -1.
    if (source.size() < length) return -1;

    // Fold in the continuation bytes; each must look like 10xxxxxx.
    for (size_t k = 1; k < length; ++k) {
        const unsigned char cont = data[k];
        if ((cont >> 6) != 0x02) return -2;
        value = (value << 6) | (cont & 0x3F);
    }

    // A value below 'smallest' was encoded with more bytes than it
    // needs (overlong form). Surrogates and values past U+10FFFF are
    // not valid codepoints.
    const bool overlong = value < smallest;
    const bool surrogate = (value >= 0xD800) && (value <= 0xDFFF);
    const bool too_large = value > 0x10FFFF;
    if (overlong || surrogate || too_large) return -2;

    source.remove_prefix(length);
    return value;
}
// Return true if 's' decodes as UTF-8 from start to finish.
//
// Codepoints are consumed one at a time with read_codepoint_utf8; any
// negative result (truncated or invalid sequence) makes the whole
// string invalid. An empty string is valid.
//
bool valid_utf8(string_view s) {
    for (;;) {
        if (s.empty()) return true;
        if (read_codepoint_utf8(s) < 0) return false;
    }
}
// Return true if read_number, called with the given flags, leaves
// nothing behind in 's'.
//
// The flags are passed straight through to read_number; its return
// value is ignored and only the remaining length of 's' is examined.
// (Presumably read_number strips the number it recognizes from the
// front of 's' -- confirm against its definition.)
//
bool valid_number(string_view s, bool plus, bool minus, bool dec, bool exp) {
    read_number(s, plus, minus, dec, exp);
    const bool nothing_left = (s.size() == 0);
    return nothing_left;
}
// The unicode helpers from ext, instantiated with the engine's
// 8-bit, 16-bit and 32-bit string types.
using UC = UnicodeStuff<eng::string, eng::u16string, eng::u32string>;

// Thin wrapper: hands 'source' to UnicodeStuff::read_codepoint_utf8
// and returns its result unchanged.
int32_t read_codepoint_utf8(string_view &source) {
    return UC::read_codepoint_utf8(source);
}

// Thin wrapper: hands 's' to UnicodeStuff::valid_utf8 and returns its
// result unchanged.
bool valid_utf8(string_view s) {
    return UC::valid_utf8(s);
}
} // namespace sv
@@ -989,6 +933,23 @@ LuaDefine(unittests_util, "", "some unit tests") {
LuaAssert(L, read_number_x("-123e+5x", true, true, true, true) == "-123e+5");
LuaAssert(L, read_number_x("-123e+x", true, true, true, true) == "");
// Test read_codepoint_utf8.
std::string_view str("𝞮ὥπq");
LuaAssert(L, str.size() == 10);
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1D7AE); // 4-byte char
LuaAssert(L, str.size() == 6);
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1F65); // 3-byte char
LuaAssert(L, str.size() == 3);
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x3C0); // 2-byte char
LuaAssert(L, str.size() == 1);
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x71); // 1-byte char
LuaAssert(L, str.size() == 0);
LuaAssert(L, sv::read_codepoint_utf8(str) == -1); // EOF
// Test read_codepoint_utf8 on an invalid unicode sequence.
std::string_view strbad("\xC0\xC0");
LuaAssert(L, sv::read_codepoint_utf8(strbad) == -2);
return 0;
}

View File

@@ -196,18 +196,14 @@ int32_t read_ascii_char(string_view &source);
// Read a UTF8 codepoint from a string_view.
//
// If the string_view is empty, returns -1 and doesn't update
// the string_view.
//
// If the string_view contains an unfinished but possibly valid
// codepoint, returns -1 and doesn't update the string_view.
//
// If the next thing in the string_view is an invalid codepoint,
// returns -2 and doesn't update the string_view.
// See documentation in unicode-stuff.hpp
//
int32_t read_codepoint_utf8(string_view &source);
// Return true if the string is valid utf-8.
//
// See documentation in unicode-stuff.hpp
//
bool valid_utf8(string_view s);
// Return true if the number conforms to the spec.