Add _ to the set of characters allowed in tokens

2026-02-19 23:35:38 -05:00
parent 3f2f3416c6
commit d79ecef1fe
5 changed files with 38 additions and 27 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -101,6 +101,7 @@ Do not use git to make changes (commit, push, branch, etc.). Read-only git comma
 ## Workflow

 - When the user gives a direct command, execute it. But when proposing changes on your own initiative, describe the plan and get approval before editing files.
+- If an instruction ends with an ellipsis (`...`), the user has more to say. Wait for the next message before acting.

 ## Coding Conventions

--- a/Docs/A-Summary-of-our-Lua-Patches.md
+++ b/Docs/A-Summary-of-our-Lua-Patches.md
@@ -252,7 +252,7 @@ Update 2: I don't remember using userdata objects at all. I am not sure that Upd
 ## Token Literal Syntax Patch

 Tokens are lightuserdata values encoding short alphanumeric
-strings as base37 numbers (see `Tokens-A-New-Lua-Type.md`).
+strings (a-z, 0-9, and `_`) as base38 numbers (see `Tokens-A-New-Lua-Type.md`).
 This patch adds a literal syntax to the Lua parser so that
 tokens can be written directly in Lua source code using the
 `@` prefix:
--- a/luprex/cpp/core/luastack.cpp
+++ b/luprex/cpp/core/luastack.cpp
@@ -47,16 +47,13 @@ LuaConstantReg *LuaConstantReg::All;


 eng::string LuaToken::str() const {
+    static const char encoding[] =
+        "\0_0123456789abcdefghijklmnopqrstuvwxyz";
    uint64_t n = (uint64_t)value;
    char buffer[13] = {};
    for (int i = 11; i >= 0; i--) {
-        int d = n % 37;
-        n /= 37;
-        if (d >= 1 && d <= 10) {
-            buffer[i] = '0' + (d - 1);
-        } else if (d >= 11 && d <= 36) {
-            buffer[i] = 'a' + (d - 11);
-        }
+        buffer[i] = encoding[n % 38];
+        n /= 38;
    }
    return eng::string(buffer);
 }
@@ -974,6 +971,9 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") {
    LuaAssertStrEq(L, LuaToken("a0").str(), "a0");
    LuaAssertStrEq(L, LuaToken("0a").str(), "0a");
    LuaAssertStrEq(L, LuaToken("000000000000").str(), "000000000000");
+    LuaAssertStrEq(L, LuaToken("foo_bar").str(), "foo_bar");
+    LuaAssertStrEq(L, LuaToken("_").str(), "_");
+    LuaAssertStrEq(L, LuaToken("a_b").str(), "a_b");

    // Test that empty/invalid strings produce the empty token.
    LuaAssert(L, LuaToken(std::string_view("")).empty());
@@ -990,6 +990,10 @@ LuaDefine(unittests_token, "", "Unit tests for LuaToken encoding") {
    LuaAssert(L, LuaToken("hello").value > LuaToken("hell").value);
    LuaAssert(L, LuaToken("a0").value > LuaToken("a").value);
    LuaAssert(L, LuaToken("a").value != LuaToken("a0").value);
+    LuaAssert(L, LuaToken("0").value > LuaToken("_").value);
+    LuaAssert(L, LuaToken("a").value > LuaToken("9").value);
+    LuaAssert(L, LuaToken("a_b").value > LuaToken("a_a").value);
+    LuaAssert(L, LuaToken("foo_bar").value != LuaToken("foobar").value);

    return 0;
 }
--- a/luprex/cpp/core/luastack.hpp
+++ b/luprex/cpp/core/luastack.hpp
@@ -261,9 +261,10 @@
 // we have a json null.
 //
 // So that finally brings me to what a "token" is.  A token is a lightuserdata
-// containing a short string encoded as a fixed-width base37 number.  Tokens
-// may only contain the characters a-z and 0-9, and can be up to 12 characters
-// long (since 37^12 fits in 64 bits).  In effect, it's a short string, but it's
+// containing a short string encoded as a fixed-width base38 number.  Tokens
+// may only contain the characters a-z, 0-9, and underscore, and can be up to
+// 12 characters long (since 38^12 fits in 64 bits).  In effect, it's a short
+// string, but it's
 // a string that's distinguishable from a normal lua string.  It doesn't have
 // the same type as a lua string (it shows up as a lightuserdata).
 // The purpose of tokens is to represent special unique values, like json null.
@@ -271,7 +272,7 @@
 // To make working with tokens easy, I've created a C++ struct 'LuaToken'.
 // It stores an int64.  You can construct a LuaToken in two different ways:
 //
-//   LuaToken(0x3D5E30BCAF2EF663)
+//   LuaToken(0x559D0F68151CB900)
 //   LuaToken("null")
 //
 // Those are equivalent.  The second form is just as fast as the first,
@@ -408,12 +409,14 @@ enum LuaTableType {

 struct LuaToken {
 private:
-    // Encode a token string as a fixed-width base37 number.
-    // Each character is mapped to a digit 1-36 (0 means "no character"),
-    // and the result is: CH0*37^11 + CH1*37^10 + ... + CH11*37^0.
+    // Encode a token string as a fixed-width base38 number.
+    // Each character is mapped to a digit 1-37 (0 means "no character"),
+    // and the result is: CH0*38^11 + CH1*38^10 + ... + CH11*38^0.
    // This fixed-width encoding ensures that numeric ordering matches
    // lexicographic ordering of the original strings.
    //
+    // Digit mapping: _ → 1, 0-9 → 2-11, a-z → 12-37 (A-Z encodes the same as a-z).
+    //
    // WARNING: The Lua lexer in llex.c contains a duplicate of this
    // encoding logic (in the '@' token literal case). If you change
    // the encoding here, you must update llex.c to match.
@@ -428,20 +431,22 @@ private:
        for (int i = 0; i < int(str.size()); i++) {
            char c = str[i];
            uint64_t digit = 0;
-            if ((c >= '0') && (c <= '9')) {
-                digit = uint64_t(c - '0') + 1;
+            if (c == '_') {
+                digit = 1;
+            } else if ((c >= '0') && (c <= '9')) {
+                digit = uint64_t(c - '0') + 2;
            } else if ((c >= 'a') && (c <= 'z')) {
-                digit = uint64_t(c - 'a') + 11;
+                digit = uint64_t(c - 'a') + 12;
            } else if ((c >= 'A') && (c <= 'Z')) {
-                digit = uint64_t(c - 'A') + 11;
+                digit = uint64_t(c - 'A') + 12;
            } else {
                return 0;
            }
-            result = result * 37 + digit;
+            result = result * 38 + digit;
        }
        // Pad remaining positions with zeros (no character).
        for (int i = int(str.size()); i < 12; i++) {
-            result = result * 37;
+            result = result * 38;
        }
        return result;
    }
--- a/luprex/ext/eris-master/src/llex.c
+++ b/luprex/ext/eris-master/src/llex.c
@@ -493,18 +493,19 @@ static int llex (LexState *ls, SemInfo *seminfo) {
        while (1) {
          char c = (char)ls->current;
          size_t digit;
-          if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 1;
-          else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 11;
-          else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 11;
+          if (c == '_') digit = 1;
+          else if (c >= '0' && c <= '9') digit = (size_t)(c - '0') + 2;
+          else if (c >= 'a' && c <= 'z') digit = (size_t)(c - 'a') + 12;
+          else if (c >= 'A' && c <= 'Z') digit = (size_t)(c - 'A') + 12;
          else break;
-          tokval = tokval * 37 + digit;
+          tokval = tokval * 38 + digit;
          toklen++;
          save_and_next(ls);
        }
-        if (toklen == 0 || toklen > 12 || ls->current == '_')
+        if (toklen == 0 || toklen > 12)
          lexerror(ls, "invalid token literal", TK_TOKEN);
        /* Pad to fixed width of 12 digits. */
-        for (int i = toklen; i < 12; i++) tokval *= 37;
+        for (int i = toklen; i < 12; i++) tokval *= 38;
        seminfo->p = (void *)tokval;
        return TK_TOKEN;
      }