Add _ to the set of characters allowed in tokens

This commit is contained in:
2026-02-19 23:35:38 -05:00
parent 3f2f3416c6
commit d79ecef1fe
5 changed files with 38 additions and 27 deletions

View File

@@ -101,6 +101,7 @@ Do not use git to make changes (commit, push, branch, etc.). Read-only git comma
## Workflow
- When the user gives a direct command, execute it. But when proposing changes on your own initiative, describe the plan and get approval before editing files.
- If an instruction ends with an ellipsis (`...`), the user has more to say. Wait for the next message before acting.
## Coding Conventions

View File

@@ -252,7 +252,7 @@ Update 2: I don't remember using userdata objects at all. I am not sure that Upd
## Token Literal Syntax Patch
Tokens are lightuserdata values encoding short `[a-z0-9_]`
strings as base37 numbers (see `Tokens-A-New-Lua-Type.md`).
strings as base38 numbers (see `Tokens-A-New-Lua-Type.md`).
This patch adds a literal syntax to the Lua parser so that
tokens can be written directly in Lua source code using the
`@` prefix:

View File

@@ -47,16 +47,13 @@ LuaConstantReg *LuaConstantReg::All;
eng::string LuaToken::str() const {
static const char encoding[] =
"\0_0123456789abcdefghijklmnopqrstuvwxyz";
uint64_t n = (uint64_t)value;
char buffer[13] = {};
for (int i = 11; i >= 0; i--) {
int d = n % 37;
n /= 37;
if (d >= 1 && d <= 10) {
buffer[i] = '0' + (d - 1);
} else if (d >= 11 && d <= 36) {
buffer[i] = 'a' + (d - 11);
}
buffer[i] = encoding[n % 38];
n /= 38;
}
return eng::string(buffer);
}
@@ -974,6 +971,9 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") {
LuaAssertStrEq(L, LuaToken("a0").str(), "a0");
LuaAssertStrEq(L, LuaToken("0a").str(), "0a");
LuaAssertStrEq(L, LuaToken("000000000000").str(), "000000000000");
LuaAssertStrEq(L, LuaToken("foo_bar").str(), "foo_bar");
LuaAssertStrEq(L, LuaToken("_").str(), "_");
LuaAssertStrEq(L, LuaToken("a_b").str(), "a_b");
// Test that empty/invalid strings produce the empty token.
LuaAssert(L, LuaToken(std::string_view("")).empty());
@@ -990,6 +990,10 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") {
LuaAssert(L, LuaToken("hello").value > LuaToken("hell").value);
LuaAssert(L, LuaToken("a0").value > LuaToken("a").value);
LuaAssert(L, LuaToken("a").value != LuaToken("a0").value);
LuaAssert(L, LuaToken("0").value > LuaToken("_").value);
LuaAssert(L, LuaToken("a").value > LuaToken("9").value);
LuaAssert(L, LuaToken("a_b").value > LuaToken("a_a").value);
LuaAssert(L, LuaToken("foo_bar").value != LuaToken("foobar").value);
return 0;
}

View File

@@ -261,9 +261,10 @@
// we have a json null.
//
// So that finally brings me to what a "token" is. A token is a lightuserdata
// containing a short string encoded as a fixed-width base37 number. Tokens
// may only contain the characters a-z and 0-9, and can be up to 12 characters
// long (since 37^12 fits in 64 bits). In effect, it's a short string, but it's
// containing a short string encoded as a fixed-width base38 number. Tokens
// may only contain the characters a-z, 0-9, and underscore, and can be up to
// 12 characters long (since 38^12 fits in 64 bits). In effect, it's a short
// string, but it's a string that's distinguishable from a normal lua string.
// It doesn't have the same type as a lua string (it shows up as a
// lightuserdata).
// The purpose of tokens is to represent special unique values, like json null.
@@ -271,7 +272,7 @@
// To make working with tokens easy, I've created a C++ struct 'LuaToken'.
// It stores an int64. You can construct a LuaToken in two different ways:
//
// LuaToken(0x3D5E30BCAF2EF663)
// LuaToken(0x559D0F68151CB900)
// LuaToken("null")
//
// Those are equivalent. The second form is just as fast as the first,
@@ -408,12 +409,14 @@ enum LuaTableType {
struct LuaToken {
private:
// Encode a token string as a fixed-width base37 number.
// Each character is mapped to a digit 1-36 (0 means "no character"),
// and the result is: CH0*37^11 + CH1*37^10 + ... + CH11*37^0.
// Encode a token string as a fixed-width base38 number.
// Each character is mapped to a digit 1-37 (0 means "no character"),
// and the result is: CH0*38^11 + CH1*38^10 + ... + CH11*38^0.
// This fixed-width encoding ensures that numeric ordering matches
// lexicographic ordering of the original strings, using the character
// order _ < 0-9 < a-z (not ASCII order, where '_' sorts after the digits).
//
// Digit mapping: _ → 1, 0-9 → 2-11, a-z → 12-37.
//
// WARNING: The Lua lexer in llex.c contains a duplicate of this
// encoding logic (in the '@' token literal case). If you change
// the encoding here, you must update llex.c to match.
@@ -428,20 +431,22 @@ private:
for (int i = 0; i < int(str.size()); i++) {
char c = str[i];
uint64_t digit = 0;
if ((c >= '0') && (c <= '9')) {
digit = uint64_t(c - '0') + 1;
if (c == '_') {
digit = 1;
} else if ((c >= '0') && (c <= '9')) {
digit = uint64_t(c - '0') + 2;
} else if ((c >= 'a') && (c <= 'z')) {
digit = uint64_t(c - 'a') + 11;
digit = uint64_t(c - 'a') + 12;
} else if ((c >= 'A') && (c <= 'Z')) {
digit = uint64_t(c - 'A') + 11;
digit = uint64_t(c - 'A') + 12;
} else {
return 0;
}
result = result * 37 + digit;
result = result * 38 + digit;
}
// Pad remaining positions with zeros (no character).
for (int i = int(str.size()); i < 12; i++) {
result = result * 37;
result = result * 38;
}
return result;
}

View File

@@ -493,18 +493,19 @@ static int llex (LexState *ls, SemInfo *seminfo) {
while (1) {
char c = (char)ls->current;
size_t digit;
if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 1;
else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 11;
else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 11;
if (c == '_') digit = 1;
else if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 2;
else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 12;
else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 12;
else break;
tokval = tokval * 37 + digit;
tokval = tokval * 38 + digit;
toklen++;
save_and_next(ls);
}
if (toklen == 0 || toklen > 12 || ls->current == '_')
if (toklen == 0 || toklen > 12)
lexerror(ls, "invalid token literal", TK_TOKEN);
/* Pad to fixed width of 12 digits. */
for (int i = toklen; i < 12; i++) tokval *= 37;
for (int i = toklen; i < 12; i++) tokval *= 38;
seminfo->p = (void *)tokval;
return TK_TOKEN;
}