Add _ to the set of characters allowed in tokens

This commit is contained in:
2026-02-19 23:35:38 -05:00
parent 3f2f3416c6
commit d79ecef1fe
5 changed files with 38 additions and 27 deletions

View File

@@ -261,9 +261,10 @@
// we have a json null.
//
// So that finally brings me to what a "token" is. A token is a lightuserdata
// containing a short string encoded as a fixed-width base37 number. Tokens
// may only contain the characters a-z and 0-9, and can be up to 12 characters
// long (since 37^12 fits in 64 bits). In effect, it's a short string, but it's
// containing a short string encoded as a fixed-width base38 number. Tokens
// may only contain the characters a-z, 0-9, and underscore, and can be up to
// 12 characters long (since 38^12 fits in 64 bits). In effect, it's a short
// string, but it's a string that's distinguishable from a normal lua string.
// It doesn't have the same type as a lua string (it shows up as a
// lightuserdata).
// The purpose of tokens is to represent special unique values, like json null.
@@ -271,7 +272,7 @@
// To make working with tokens easy, I've created a C++ struct 'LuaToken'.
// It stores an int64. You can construct a LuaToken in two different ways:
//
// LuaToken(0x3D5E30BCAF2EF663)
// LuaToken(0x559D0F68151CB900)
// LuaToken("null")
//
// Those are equivalent. The second form is just as fast as the first,
@@ -408,12 +409,14 @@ enum LuaTableType {
struct LuaToken {
private:
// Encode a token string as a fixed-width base37 number.
// Each character is mapped to a digit 1-36 (0 means "no character"),
// and the result is: CH0*37^11 + CH1*37^10 + ... + CH11*37^0.
// Encode a token string as a fixed-width base38 number.
// Each character is mapped to a digit 1-37 (0 means "no character"),
// and the result is: CH0*38^11 + CH1*38^10 + ... + CH11*38^0.
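// This fixed-width encoding ensures that numeric ordering matches
// lexicographic ordering of the original strings under the token's own
// digit order (see the digit mapping below). Note that this is not plain
// ASCII order: '_' sorts before the digits here, whereas in ASCII it sorts
// after them, and uppercase letters encode the same as lowercase.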
//
// Digit mapping: _ → 1, 0-9 → 2-11, a-z → 12-37.
//
// WARNING: The Lua lexer in llex.c contains a duplicate of this
// encoding logic (in the '@' token literal case). If you change
// the encoding here, you must update llex.c to match.
@@ -428,20 +431,22 @@ private:
for (int i = 0; i < int(str.size()); i++) {
char c = str[i];
uint64_t digit = 0;
if ((c >= '0') && (c <= '9')) {
digit = uint64_t(c - '0') + 1;
if (c == '_') {
digit = 1;
} else if ((c >= '0') && (c <= '9')) {
digit = uint64_t(c - '0') + 2;
} else if ((c >= 'a') && (c <= 'z')) {
digit = uint64_t(c - 'a') + 11;
digit = uint64_t(c - 'a') + 12;
} else if ((c >= 'A') && (c <= 'Z')) {
digit = uint64_t(c - 'A') + 11;
digit = uint64_t(c - 'A') + 12;
} else {
return 0;
}
result = result * 37 + digit;
result = result * 38 + digit;
}
// Pad remaining positions with zeros (no character).
for (int i = int(str.size()); i < 12; i++) {
result = result * 37;
result = result * 38;
}
return result;
}