diff --git a/CLAUDE.md b/CLAUDE.md index 3a92b1b6..b6dd34c6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -101,6 +101,7 @@ Do not use git to make changes (commit, push, branch, etc.). Read-only git comma ## Workflow - When the user gives a direct command, execute it. But when proposing changes on your own initiative, describe the plan and get approval before editing files. +- If an instruction ends with an ellipsis (`...`), the user has more to say. Wait for the next message before acting. ## Coding Conventions diff --git a/Docs/A-Summary-of-our-Lua-Patches.md b/Docs/A-Summary-of-our-Lua-Patches.md index 1998aa92..11528b35 100644 --- a/Docs/A-Summary-of-our-Lua-Patches.md +++ b/Docs/A-Summary-of-our-Lua-Patches.md @@ -252,7 +252,7 @@ Update 2: I don't remember using userdata objects at all. I am not sure that Upd ## Token Literal Syntax Patch Tokens are lightuserdata values encoding short alphanumeric -strings as base37 numbers (see `Tokens-A-New-Lua-Type.md`). +strings as base38 numbers (see `Tokens-A-New-Lua-Type.md`). This patch adds a literal syntax to the Lua parser so that tokens can be written directly in Lua source code using the `@` prefix: diff --git a/luprex/cpp/core/luastack.cpp b/luprex/cpp/core/luastack.cpp index 975a6474..3b6615a4 100644 --- a/luprex/cpp/core/luastack.cpp +++ b/luprex/cpp/core/luastack.cpp @@ -47,16 +47,13 @@ LuaConstantReg *LuaConstantReg::All; eng::string LuaToken::str() const { + static const char encoding[] = + "\0_0123456789abcdefghijklmnopqrstuvwxyz"; uint64_t n = (uint64_t)value; char buffer[13] = {}; for (int i = 11; i >= 0; i--) { - int d = n % 37; - n /= 37; - if (d >= 1 && d <= 10) { - buffer[i] = '0' + (d - 1); - } else if (d >= 11 && d <= 36) { - buffer[i] = 'a' + (d - 11); - } + buffer[i] = encoding[n % 38]; + n /= 38; } return eng::string(buffer); } @@ -974,6 +971,9 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") { LuaAssertStrEq(L, LuaToken("a0").str(), "a0"); LuaAssertStrEq(L, LuaToken("0a").str(), "0a"); LuaAssertStrEq(L, LuaToken("000000000000").str(), "000000000000"); + LuaAssertStrEq(L, LuaToken("foo_bar").str(), "foo_bar"); + LuaAssertStrEq(L, LuaToken("_").str(), "_"); + LuaAssertStrEq(L, LuaToken("a_b").str(), "a_b"); // Test that empty/invalid strings produce the empty token. LuaAssert(L, LuaToken(std::string_view("")).empty()); @@ -990,6 +990,10 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") { LuaAssert(L, LuaToken("hello").value > LuaToken("hell").value); LuaAssert(L, LuaToken("a0").value > LuaToken("a").value); LuaAssert(L, LuaToken("a").value != LuaToken("a0").value); + LuaAssert(L, LuaToken("0").value > LuaToken("_").value); + LuaAssert(L, LuaToken("a").value > LuaToken("9").value); + LuaAssert(L, LuaToken("a_b").value > LuaToken("a_a").value); + LuaAssert(L, LuaToken("foo_bar").value != LuaToken("foobar").value); return 0; } diff --git a/luprex/cpp/core/luastack.hpp b/luprex/cpp/core/luastack.hpp index ab86b67c..5910d640 100644 --- a/luprex/cpp/core/luastack.hpp +++ b/luprex/cpp/core/luastack.hpp @@ -261,9 +261,10 @@ // we have a json null. // // So that finally brings me to what a "token" is. A token is a lightuserdata -// containing a short string encoded as a fixed-width base37 number. Tokens -// may only contain the characters a-z and 0-9, and can be up to 12 characters -// long (since 37^12 fits in 64 bits). In effect, it's a short string, but it's +// containing a short string encoded as a fixed-width base38 number. Tokens +// may only contain the characters a-z, 0-9, and underscore, and can be up to +// 12 characters long (since 38^12 fits in 64 bits). In effect, it's a short +// string, but it's // a string that's distinguishable from a normal lua string. It doesn't have // the same type as a lua string (it shows up as a lightuserdata). // The purpose of tokens is to represent special unique values, like json null. @@ -271,7 +272,7 @@ // To make working with tokens easy, I've created a C++ struct 'LuaToken'. // It stores an int64. You can construct a LuaToken in two different ways: // -// LuaToken(0x3D5E30BCAF2EF663) +// LuaToken(0x559D0F68151CB900) // LuaToken("null") // // Those are equivalent. The second form is just as fast as the first, @@ -408,12 +409,14 @@ enum LuaTableType { struct LuaToken { private: - // Encode a token string as a fixed-width base37 number. - // Each character is mapped to a digit 1-36 (0 means "no character"), - // and the result is: CH0*37^11 + CH1*37^10 + ... + CH11*37^0. + // Encode a token string as a fixed-width base38 number. + // Each character is mapped to a digit 1-37 (0 means "no character"), + // and the result is: CH0*38^11 + CH1*38^10 + ... + CH11*38^0. // This fixed-width encoding ensures that numeric ordering matches // lexicographic ordering of the original strings. // + // Digit mapping: _ → 1, 0-9 → 2-11, a-z → 12-37. + // // WARNING: The Lua lexer in llex.c contains a duplicate of this // encoding logic (in the '@' token literal case). If you change // the encoding here, you must update llex.c to match. @@ -428,20 +431,22 @@ private: for (int i = 0; i < int(str.size()); i++) { char c = str[i]; uint64_t digit = 0; - if ((c >= '0') && (c <= '9')) { - digit = uint64_t(c - '0') + 1; + if (c == '_') { + digit = 1; + } else if ((c >= '0') && (c <= '9')) { + digit = uint64_t(c - '0') + 2; } else if ((c >= 'a') && (c <= 'z')) { - digit = uint64_t(c - 'a') + 11; + digit = uint64_t(c - 'a') + 12; } else if ((c >= 'A') && (c <= 'Z')) { - digit = uint64_t(c - 'A') + 11; + digit = uint64_t(c - 'A') + 12; } else { return 0; } - result = result * 37 + digit; + result = result * 38 + digit; } // Pad remaining positions with zeros (no character). for (int i = int(str.size()); i < 12; i++) { - result = result * 37; + result = result * 38; } return result; } diff --git a/luprex/ext/eris-master/src/llex.c b/luprex/ext/eris-master/src/llex.c index 28cef65f..97df479a 100644 --- a/luprex/ext/eris-master/src/llex.c +++ b/luprex/ext/eris-master/src/llex.c @@ -493,18 +493,19 @@ static int llex (LexState *ls, SemInfo *seminfo) { while (1) { char c = (char)ls->current; size_t digit; - if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 1; - else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 11; - else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 11; + if (c == '_') digit = 1; + else if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 2; + else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 12; + else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 12; else break; - tokval = tokval * 37 + digit; + tokval = tokval * 38 + digit; toklen++; save_and_next(ls); } - if (toklen == 0 || toklen > 12 || ls->current == '_') + if (toklen == 0 || toklen > 12) lexerror(ls, "invalid token literal", TK_TOKEN); /* Pad to fixed width of 12 digits. */ - for (int i = toklen; i < 12; i++) tokval *= 37; + for (int i = toklen; i < 12; i++) tokval *= 38; seminfo->p = (void *)tokval; return TK_TOKEN; }