Add _ to the set of characters allowed in tokens

2026-02-19 23:35:38 -05:00
parent 3f2f3416c6
commit d79ecef1fe
5 changed files with 38 additions and 27 deletions
--- a/luprex/cpp/core/luastack.hpp
+++ b/luprex/cpp/core/luastack.hpp
@@ -261,9 +261,10 @@
 // we have a json null.
 //
 // So that finally brings me to what a "token" is.  A token is a lightuserdata
-// containing a short string encoded as a fixed-width base37 number.  Tokens
-// may only contain the characters a-z and 0-9, and can be up to 12 characters
-// long (since 37^12 fits in 64 bits).  In effect, it's a short string, but it's
+// containing a short string encoded as a fixed-width base38 number.  Tokens
+// may only contain the characters a-z, 0-9, and underscore, and can be up to
+// 12 characters long (since 38^12 fits in 64 bits).  In effect, it's a short
+// string, but it's
 // a string that's distinguishable from a normal lua string.  It doesn't have
 // the same type as a lua string (it shows up as a lightuserdata).
 // The purpose of tokens is to represent special unique values, like json null.
@@ -271,7 +272,7 @@
 // To make working with tokens easy, I've created a C++ struct 'LuaToken'.
 // It stores an int64.  You can construct a LuaToken in two different ways:
 //
-//   LuaToken(0x3D5E30BCAF2EF663)
+//   LuaToken(0x559D0F68151CB900)
 //   LuaToken("null")
 //
 // Those are equivalent.  The second form is just as fast as the first,
@@ -408,12 +409,14 @@ enum LuaTableType {

 struct LuaToken {
 private:
-    // Encode a token string as a fixed-width base37 number.
-    // Each character is mapped to a digit 1-36 (0 means "no character"),
-    // and the result is: CH0*37^11 + CH1*37^10 + ... + CH11*37^0.
+    // Encode a token string as a fixed-width base38 number.
+    // Each character is mapped to a digit 1-37 (0 means "no character"),
+    // and the result is: CH0*38^11 + CH1*38^10 + ... + CH11*38^0.
    // This fixed-width encoding ensures that numeric ordering matches
    // lexicographic ordering of the original strings.
    //
+    // Digit mapping: _ → 1, 0-9 → 2-11, a-z/A-Z → 12-37 (_ < 0-9, unlike ASCII).
+    //
    // WARNING: The Lua lexer in llex.c contains a duplicate of this
    // encoding logic (in the '@' token literal case). If you change
    // the encoding here, you must update llex.c to match.
@@ -428,20 +431,22 @@ private:
        for (int i = 0; i < int(str.size()); i++) {
            char c = str[i];
            uint64_t digit = 0;
-            if ((c >= '0') && (c <= '9')) {
-                digit = uint64_t(c - '0') + 1;
+            if (c == '_') {
+                digit = 1;
+            } else if ((c >= '0') && (c <= '9')) {
+                digit = uint64_t(c - '0') + 2;
            } else if ((c >= 'a') && (c <= 'z')) {
-                digit = uint64_t(c - 'a') + 11;
+                digit = uint64_t(c - 'a') + 12;
            } else if ((c >= 'A') && (c <= 'Z')) {
-                digit = uint64_t(c - 'A') + 11;
+                digit = uint64_t(c - 'A') + 12;
            } else {
                return 0;
            }
-            result = result * 37 + digit;
+            result = result * 38 + digit;
        }
        // Pad remaining positions with zeros (no character).
        for (int i = int(str.size()); i < 12; i++) {
-            result = result * 37;
+            result = result * 38;
        }
        return result;
    }