Add _ to the set of characters allowed in tokens

This commit is contained in:
2026-02-19 23:35:38 -05:00
parent 3f2f3416c6
commit d79ecef1fe
5 changed files with 38 additions and 27 deletions

View File

@@ -101,6 +101,7 @@ Do not use git to make changes (commit, push, branch, etc.). Read-only git comma
## Workflow ## Workflow
- When the user gives a direct command, execute it. But when proposing changes on your own initiative, describe the plan and get approval before editing files. - When the user gives a direct command, execute it. But when proposing changes on your own initiative, describe the plan and get approval before editing files.
- If an instruction ends with an ellipsis (`...`), the user has more to say. Wait for the next message before acting.
## Coding Conventions ## Coding Conventions

View File

@@ -252,7 +252,7 @@ Update 2: I don't remember using userdata objects at all. I am not sure that Upd
## Token Literal Syntax Patch ## Token Literal Syntax Patch
Tokens are lightuserdata values encoding short alphanumeric Tokens are lightuserdata values encoding short strings of
strings as base37 numbers (see `Tokens-A-New-Lua-Type.md`). a-z, 0-9, and `_` as base38 numbers (see `Tokens-A-New-Lua-Type.md`).
This patch adds a literal syntax to the Lua parser so that This patch adds a literal syntax to the Lua parser so that
tokens can be written directly in Lua source code using the tokens can be written directly in Lua source code using the
`@` prefix: `@` prefix:

View File

@@ -47,16 +47,13 @@ LuaConstantReg *LuaConstantReg::All;
eng::string LuaToken::str() const { eng::string LuaToken::str() const {
static const char encoding[] =
"\0_0123456789abcdefghijklmnopqrstuvwxyz";
uint64_t n = (uint64_t)value; uint64_t n = (uint64_t)value;
char buffer[13] = {}; char buffer[13] = {};
for (int i = 11; i >= 0; i--) { for (int i = 11; i >= 0; i--) {
int d = n % 37; buffer[i] = encoding[n % 38];
n /= 37; n /= 38;
if (d >= 1 && d <= 10) {
buffer[i] = '0' + (d - 1);
} else if (d >= 11 && d <= 36) {
buffer[i] = 'a' + (d - 11);
}
} }
return eng::string(buffer); return eng::string(buffer);
} }
@@ -974,6 +971,9 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") {
LuaAssertStrEq(L, LuaToken("a0").str(), "a0"); LuaAssertStrEq(L, LuaToken("a0").str(), "a0");
LuaAssertStrEq(L, LuaToken("0a").str(), "0a"); LuaAssertStrEq(L, LuaToken("0a").str(), "0a");
LuaAssertStrEq(L, LuaToken("000000000000").str(), "000000000000"); LuaAssertStrEq(L, LuaToken("000000000000").str(), "000000000000");
LuaAssertStrEq(L, LuaToken("foo_bar").str(), "foo_bar");
LuaAssertStrEq(L, LuaToken("_").str(), "_");
LuaAssertStrEq(L, LuaToken("a_b").str(), "a_b");
// Test that empty/invalid strings produce the empty token. // Test that empty/invalid strings produce the empty token.
LuaAssert(L, LuaToken(std::string_view("")).empty()); LuaAssert(L, LuaToken(std::string_view("")).empty());
@@ -990,6 +990,10 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") {
LuaAssert(L, LuaToken("hello").value > LuaToken("hell").value); LuaAssert(L, LuaToken("hello").value > LuaToken("hell").value);
LuaAssert(L, LuaToken("a0").value > LuaToken("a").value); LuaAssert(L, LuaToken("a0").value > LuaToken("a").value);
LuaAssert(L, LuaToken("a").value != LuaToken("a0").value); LuaAssert(L, LuaToken("a").value != LuaToken("a0").value);
LuaAssert(L, LuaToken("0").value > LuaToken("_").value);
LuaAssert(L, LuaToken("a").value > LuaToken("9").value);
LuaAssert(L, LuaToken("a_b").value > LuaToken("a_a").value);
LuaAssert(L, LuaToken("foo_bar").value != LuaToken("foobar").value);
return 0; return 0;
} }

View File

@@ -261,9 +261,10 @@
// we have a json null. // we have a json null.
// //
// So that finally brings me to what a "token" is. A token is a lightuserdata // So that finally brings me to what a "token" is. A token is a lightuserdata
// containing a short string encoded as a fixed-width base37 number. Tokens // containing a short string encoded as a fixed-width base38 number. Tokens
// may only contain the characters a-z and 0-9, and can be up to 12 characters // may only contain the characters a-z, 0-9, and underscore, and can be up to
// long (since 37^12 fits in 64 bits). In effect, it's a short string, but it's // 12 characters long (since 38^12 fits in 64 bits). In effect, it's a short
// string, but it's
// a string that's distinguishable from a normal lua string. It doesn't have // a string that's distinguishable from a normal lua string. It doesn't have
// the same type as a lua string (it shows up as a lightuserdata). // the same type as a lua string (it shows up as a lightuserdata).
// The purpose of tokens is to represent special unique values, like json null. // The purpose of tokens is to represent special unique values, like json null.
@@ -271,7 +272,7 @@
// To make working with tokens easy, I've created a C++ struct 'LuaToken'. // To make working with tokens easy, I've created a C++ struct 'LuaToken'.
// It stores an int64. You can construct a LuaToken in two different ways: // It stores an int64. You can construct a LuaToken in two different ways:
// //
// LuaToken(0x3D5E30BCAF2EF663) // LuaToken(0x559D0F68151CB900)
// LuaToken("null") // LuaToken("null")
// //
// Those are equivalent. The second form is just as fast as the first, // Those are equivalent. The second form is just as fast as the first,
@@ -408,12 +409,14 @@ enum LuaTableType {
struct LuaToken { struct LuaToken {
private: private:
// Encode a token string as a fixed-width base37 number. // Encode a token string as a fixed-width base38 number.
// Each character is mapped to a digit 1-36 (0 means "no character"), // Each character is mapped to a digit 1-37 (0 means "no character"),
// and the result is: CH0*37^11 + CH1*37^10 + ... + CH11*37^0. // and the result is: CH0*38^11 + CH1*38^10 + ... + CH11*38^0.
// This fixed-width encoding ensures that numeric ordering matches // This fixed-width encoding ensures that numeric ordering matches
// lexicographic ordering of the original strings. // lexicographic ordering in the token alphabet (_ < 0-9 < a-z, which differs from ASCII).
// //
// Digit mapping: _ → 1, 0-9 → 2-11, a-z → 12-37.
//
// WARNING: The Lua lexer in llex.c contains a duplicate of this // WARNING: The Lua lexer in llex.c contains a duplicate of this
// encoding logic (in the '@' token literal case). If you change // encoding logic (in the '@' token literal case). If you change
// the encoding here, you must update llex.c to match. // the encoding here, you must update llex.c to match.
@@ -428,20 +431,22 @@ private:
for (int i = 0; i < int(str.size()); i++) { for (int i = 0; i < int(str.size()); i++) {
char c = str[i]; char c = str[i];
uint64_t digit = 0; uint64_t digit = 0;
if ((c >= '0') && (c <= '9')) { if (c == '_') {
digit = uint64_t(c - '0') + 1; digit = 1;
} else if ((c >= '0') && (c <= '9')) {
digit = uint64_t(c - '0') + 2;
} else if ((c >= 'a') && (c <= 'z')) { } else if ((c >= 'a') && (c <= 'z')) {
digit = uint64_t(c - 'a') + 11; digit = uint64_t(c - 'a') + 12;
} else if ((c >= 'A') && (c <= 'Z')) { } else if ((c >= 'A') && (c <= 'Z')) {
digit = uint64_t(c - 'A') + 11; digit = uint64_t(c - 'A') + 12;
} else { } else {
return 0; return 0;
} }
result = result * 37 + digit; result = result * 38 + digit;
} }
// Pad remaining positions with zeros (no character). // Pad remaining positions with zeros (no character).
for (int i = int(str.size()); i < 12; i++) { for (int i = int(str.size()); i < 12; i++) {
result = result * 37; result = result * 38;
} }
return result; return result;
} }

View File

@@ -493,18 +493,19 @@ static int llex (LexState *ls, SemInfo *seminfo) {
while (1) { while (1) {
char c = (char)ls->current; char c = (char)ls->current;
size_t digit; size_t digit;
if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 1; if (c == '_') digit = 1;
else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 11; else if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 2;
else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 11; else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 12;
else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 12;
else break; else break;
tokval = tokval * 37 + digit; tokval = tokval * 38 + digit;
toklen++; toklen++;
save_and_next(ls); save_and_next(ls);
} }
if (toklen == 0 || toklen > 12 || ls->current == '_') if (toklen == 0 || toklen > 12)
lexerror(ls, "invalid token literal", TK_TOKEN); lexerror(ls, "invalid token literal", TK_TOKEN);
/* Pad to fixed width of 12 digits. */ /* Pad to fixed width of 12 digits. */
for (int i = toklen; i < 12; i++) tokval *= 37; for (int i = toklen; i < 12; i++) tokval *= 38;
seminfo->p = (void *)tokval; seminfo->p = (void *)tokval;
return TK_TOKEN; return TK_TOKEN;
} }