Tokenizer is now done, we also have the new InternalizeID and ExternalizeID

This commit is contained in:
2026-03-28 19:29:15 -04:00
parent 5aef356199
commit 88fa260c9d
10 changed files with 4883 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
#pragma once
#include "CoreMinimal.h"
#include "WingServer.h"
#include "WingHandler.h"
#include "WingTokenizer.h"
#include "Test_Sanitizer.generated.h"
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
UCLASS()
class UWing_Test_Sanitizer : public UObject, public IWingHandler
{
	GENERATED_BODY()

public:
	// Raw (internal) identifier text that Handle() converts to its external form.
	UPROPERTY(meta=(Description="The string to sanitize"))
	FString Input;

	// Human-readable summary of what this handler does.
	virtual FString GetDescription() const override
	{
		return TEXT("Test the sanitizer by sanitizing a string and printing the result.");
	}

	// Externalizes Input and prints the resulting string on a line of its own.
	virtual void Handle() override
	{
		const FString External = WingTokenizer::ExternalizeID(Input);
		UWingServer::Printf(TEXT("%s\n"), *External);
	}
};

View File

@@ -0,0 +1,33 @@
#pragma once
#include "CoreMinimal.h"
#include "WingServer.h"
#include "WingHandler.h"
#include "WingTokenizer.h"
#include "Test_Tokenizer.generated.h"
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
UCLASS()
class UWing_Test_Tokenizer : public UObject, public IWingHandler
{
	GENERATED_BODY()

public:
	// The line of text that Handle() feeds to the tokenizer.
	UPROPERTY(meta=(Description="The string to tokenize"))
	FString Input;

	// Human-readable summary of what this handler does.
	virtual FString GetDescription() const override
	{
		return TEXT("Test the tokenizer by tokenizing a string and printing the result.");
	}

	// Tokenizes Input and dumps the error (if any) and every token.
	virtual void Handle() override
	{
		const WingTokenizer Tokenized(Input);
		Tokenized.PrintEverything();
	}
};

View File

@@ -0,0 +1,41 @@
#pragma once
#include "CoreMinimal.h"
#include "WingServer.h"
#include "WingHandler.h"
#include "WingTokenizer.h"
#include "Test_Unsanitize.generated.h"
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
UCLASS()
class UWing_Test_Unsanitize : public UObject, public IWingHandler
{
	GENERATED_BODY()

public:
	// External (escaped) identifier that Handle() converts back to internal form.
	UPROPERTY(meta=(Description="The sanitized identifier to unsanitize"))
	FString Input;

	// Human-readable summary of what this handler does.
	virtual FString GetDescription() const override
	{
		return TEXT("Test the unsanitizer by unsanitizing a string and printing the result.");
	}

	// Internalizes Input, then prints the error message and/or the result,
	// whichever of the two came back non-empty.
	virtual void Handle() override
	{
		FString Problem;
		const FString Internal = WingTokenizer::TryInternalizeID(Input, Problem);

		const bool bHasProblem = !Problem.IsEmpty();
		if (bHasProblem)
		{
			UWingServer::Printf(TEXT("Error: %s\n"), *Problem);
		}

		const bool bHasResult = !Internal.IsEmpty();
		if (bHasResult)
		{
			UWingServer::Printf(TEXT("Result: %s\n"), *Internal);
		}
	}
};

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,302 @@
#include "WingTokenizer.h"
#include "WingServer.h"
// Marks every character in String as belonging to Category. The lookup
// table is grown on demand so that the largest character in String is a
// valid index; newly created slots start out as Cat::Other.
void WingCharacterClasses::Assign(Cat Category, FStringView String)
{
	const int32 PreviousCount = CharCategory.Num();

	// One past the largest character code that appears in String.
	int32 RequiredCount = 0;
	for (const TCHAR Ch : String)
	{
		if (Ch >= RequiredCount)
		{
			RequiredCount = Ch + 1;
		}
	}

	// Grow the table if needed and give the fresh tail its default category.
	if (RequiredCount > PreviousCount)
	{
		CharCategory.SetNum(RequiredCount);
		for (int Slot = PreviousCount; Slot < RequiredCount; ++Slot)
		{
			CharCategory[Slot] = Cat::Other;
		}
	}

	// Finally stamp the requested category onto each listed character.
	for (const TCHAR Ch : String)
	{
		CharCategory[Ch] = Category;
	}
}
// Builds the character-category table in three passes. Later passes
// overwrite earlier ones, so the order of the passes matters:
//   1. every whitelisted character becomes Cat::Identifier,
//   2. the characters in PunctuationString become Cat::Punctuation,
//   3. codes 0..31 and 0x7F become Cat::Control.
// Anything not touched by these passes stays (or defaults to) Cat::Other.
WingCharacterClasses::WingCharacterClasses()
{
// This is the set of printable, visible, non-whitespace characters that
// appear in most ubuntu default fonts. I initially map all of these as
// 'Identifier' characters, but later I swap some of them over to punctuation.
// NOTE: the list deliberately omits the space character, and ends with
// the escaped double-quote and backslash.
Assign(Cat::Identifier, TEXT(
"!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefgh"
"ijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐ"
"ÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖ"
"ėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜ"
"ŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢ"
"ƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƾƿǀǁǂǃǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǦǧǨǩǪǫǬǭǮǯǰǴǵǸǹǼ"
"ǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȷʒʼˆˇˉ˘˙˚˛˜˝΄΅ΆΈΉΊ"
"ΌΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώЀЁЂЃЄ"
"ЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъ"
"ыьэюяѐёђѓєѕіїјљњћќѝўџѲҐґҒғҔҕҖҗҘҙҚқҢңҤҥҪҫҬҭҮүҰұҲҳҺһӀӁӂӃӄӇӈӋӌӏӐӑӒӓӔӕӖӗӘә"
"ӚӛӜӝӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹẀẁẂẃẄẅỲỳ–—―‘’‚“”„†‡•…‰‹›⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆"
"₇₈₉€₹№™Ω⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞∂∆∏∑−∕∙√∞∫≈≠≤≥◊fifl\"\\"
));
// All the punctuation marks that we do anything interesting with.
// These were marked Identifier by the call above; this call overrides them.
Assign(Cat::Punctuation, PunctuationString);
// Control characters.
// Direct indexing is safe here: the whitelist above contains characters
// well beyond 0x7F, so the table already covers every index written below.
for (int i = 0; i < 32; i++) CharCategory[i] = Cat::Control;
CharCategory[0x7F] = Cat::Control;
}
// The single shared table instance consulted by GetCat(). Its constructor
// runs during static initialization of this translation unit.
// NOTE(review): GetCat() calls made from other translation units' static
// initializers may run before this object is constructed — confirm none exist.
WingCharacterClasses WingCharacterClasses::TheSet;
// Appends one token to the Tokens array.
//   Type       - the token type: Identifier, RestOfLine, or a punctuation character.
//   InternalID - the token's text, taken by value so callers can hand over
//                a temporary; it is moved (not copied) all the way into the array.
void WingTokenizer::Add(TCHAR Type, FString InternalID)
{
	Token T;
	T.Type = Type;
	T.InternalID = MoveTemp(InternalID);
	// Move the finished token into the array; adding it by lvalue would
	// copy the string that was just moved into it.
	Tokens.Add(MoveTemp(T));
}
// Parses Digits as a hexadecimal number (upper- or lowercase digits) and
// returns it as a TCHAR. On failure -- no digits, a non-hex character, or
// a value above 0xFFFF -- sets Error and returns zero.
TCHAR WingTokenizer::FromHex(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1)
	{
		Error = "Empty hex escape sequence";
		return 0;
	}

	int32 Accumulated = 0;
	for (const TCHAR Digit : Digits)
	{
		// Work out the numeric value of this digit, or bail out.
		int32 Nibble = 0;
		if (Digit >= '0' && Digit <= '9')
		{
			Nibble = Digit - '0';
		}
		else if (Digit >= 'a' && Digit <= 'f')
		{
			Nibble = Digit - 'a' + 10;
		}
		else if (Digit >= 'A' && Digit <= 'F')
		{
			Nibble = Digit - 'A' + 10;
		}
		else
		{
			Error = "Invalid hex digit in escape sequence";
			return 0;
		}

		// Checking the range after every digit keeps the accumulator
		// from ever overflowing, however long the digit string is.
		Accumulated = Accumulated * 16 + Nibble;
		if (Accumulated > 0xFFFF)
		{
			Error = "Escape sequence value out of range";
			return 0;
		}
	}
	return (TCHAR)Accumulated;
}
// Parses Digits as a decimal number and returns it as a TCHAR. On
// failure -- no digits, a non-digit character, or a value above 0xFFFF --
// sets Error and returns zero.
TCHAR WingTokenizer::FromDecimal(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1)
	{
		Error = "Empty decimal escape sequence";
		return 0;
	}

	int32 Accumulated = 0;
	for (const TCHAR Digit : Digits)
	{
		const bool bIsDigit = (Digit >= '0') && (Digit <= '9');
		if (!bIsDigit)
		{
			Error = "Invalid decimal digit in escape sequence";
			return 0;
		}

		// Checking the range after every digit keeps the accumulator
		// from ever overflowing, however long the digit string is.
		Accumulated = Accumulated * 10 + (Digit - '0');
		if (Accumulated > 0xFFFF)
		{
			Error = "Escape sequence value out of range";
			return 0;
		}
	}
	return (TCHAR)Accumulated;
}
// Decodes one ampersand escape sequence at the front of Rest and returns
// the character it denotes. Three forms are understood:
//   &#NNN;   decimal character code
//   &#xHH;   hexadecimal character code (x or X)
//   &name;   named entity, looked up in WingEntityList
//
// Rest  - must begin with the '&' that introduces the escape (the caller
//         checks this). On success the whole escape, including the
//         semicolon, is removed from the front of Rest. On failure Rest is
//         left untouched.
// Error - if already set on entry, nothing is done and zero is returned.
//         On failure it receives a message describing the problem.
//
// Returns the decoded character, or zero on failure. Because zero is the
// failure value, an escape that decodes to the NUL character is rejected
// rather than returned.
TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return 0;

	// Search for the semicolon.
	int32 SemiPos;
	if (!Rest.FindChar(';', SemiPos))
	{
		Error = TEXT("Ampersand escape sequence doesn't end in semicolon");
		return 0;
	}
	// The shortest accepted sequence is '&' plus two characters plus ';',
	// which also guarantees that Rest[1] and Rest[2] below are in range.
	if (SemiPos < 3)
	{
		Error = TEXT("Ampersand escape sequence too short");
		return 0;
	}

	TCHAR Result = 0;
	if (Rest[1] == '#')
	{
		// Numeric form: digits run from just after "&#" (or "&#x") to the semicolon.
		if ((Rest[2] == 'x') || (Rest[2] == 'X'))
			Result = FromHex(Rest.Mid(3, SemiPos - 3), Error);
		else
			Result = FromDecimal(Rest.Mid(2, SemiPos - 2), Error);

		// Conversion failed: report it without consuming any input, the
		// same way the unknown-entity path below does.
		if (!Error.IsEmpty()) return 0;

		// A successfully parsed zero would be indistinguishable from the
		// error return value and would put a NUL inside the identifier.
		if (Result == 0)
		{
			Error = TEXT("Escape sequence may not encode the NUL character");
			return 0;
		}
	}
	else
	{
		// Named form: everything between '&' and ';' is the entity name.
		FString Name(Rest.Mid(1, SemiPos - 1));
		Result = WingEntityList::GetChar(Name);
		if (Result == 0)
		{
			Error = FString::Printf(TEXT("Unknown HTML entity: &%s;"), *Name);
			return 0;
		}
	}

	// Success: consume the escape, semicolon included.
	Rest = Rest.RightChop(SemiPos + 1);
	return Result;
}
// Consumes one external identifier from the front of Rest and returns
// its decoded (internal) form: periods become spaces, ampersand escapes
// become the characters they name, and everything else is copied as-is.
// Scanning stops at the first space, punctuation mark, or control
// character, which is left in Rest for the caller to deal with.
//
// If Error is already set on entry, or an escape fails to decode, the
// result is an empty string (with Error describing the failure).
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty())
	{
		return FString();
	}

	TStringBuilder<512> Decoded;
	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		const TCHAR Next = Rest[0];

		// A raw space always terminates the identifier.
		if (Next == ' ')
		{
			break;
		}

		// Escapes consume a variable amount of input on their own.
		if (Next == '&')
		{
			const TCHAR Unescaped = TokenizeEscapeSequence(Rest, Error);
			Decoded.AppendChar(Unescaped);
			continue;
		}

		// Everything else consumes exactly one character.
		TCHAR ToAppend = Next;
		if (Next == '.')
		{
			// A period is the external spelling of a space.
			ToAppend = ' ';
		}
		else
		{
			// We accept other characters in case the LLM sends unicode
			// that isn't on the whitelist. This is intentional.
			const Cat Category = WingCharacterClasses::GetCat(Next);
			if ((Category != Cat::Identifier) && (Category != Cat::Other))
			{
				break;
			}
		}
		Decoded.AppendChar(ToAppend);
		Rest = Rest.RightChop(1);
	}

	if (!Error.IsEmpty())
	{
		return FString();
	}

	// We deliberately do not produce an error message for empty identifiers,
	// because we can't generate a good message here. We leave it to others
	// to deal with that case.
	return Decoded.ToString();
}
// Tokenizes one line of input into the Tokens array.
//
// Scanning rules, applied to the front of the remaining input:
//   - spaces and tabs are skipped;
//   - '=' ends tokenization: everything after it becomes a single
//     RestOfLine token, verbatim;
//   - '.' and '&' start an identifier (they are the external spellings of
//     a space and of an escape sequence);
//   - any other punctuation character becomes a one-character token whose
//     type is the character itself;
//   - a control character is an error;
//   - anything else starts an identifier.
//
// On error, Error holds the message and Tokens is emptied.
//
// The parameter shares its name with the Input member, so the member is
// set through the initializer list; inside the body, `Input` names the
// parameter.
WingTokenizer::WingTokenizer(const FString& Input)
	: Input(Input)
{
	FStringView Rest(Input);
	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		TCHAR Ch = Rest[0];
		if ((Ch == ' ') || (Ch == '\t'))
		{
			Rest = Rest.RightChop(1);
			continue;
		}
		if (Ch == '=')
		{
			Add(RestOfLine, FString(Rest.RightChop(1)));
			break;
		}
		// These two are classified as punctuation, but inside an
		// identifier they have a meaning of their own, so they must be
		// checked before the category lookup.
		if ((Ch == '.') || (Ch == '&'))
		{
			Add(Identifier, TokenizeIdentifier(Rest, Error));
			continue;
		}
		Cat Category = WingCharacterClasses::GetCat(Ch);
		if (Category == Cat::Punctuation)
		{
			Add(Ch, FString());
			Rest = Rest.RightChop(1);
			continue;
		}
		if (Category == Cat::Control)
		{
			Error = TEXT("Control characters in input, not allowed");
			break;
		}
		Add(Identifier, TokenizeIdentifier(Rest, Error));
		continue;
	}
	// Never hand back a partial token list alongside an error.
	if (!Error.IsEmpty()) Tokens.Empty();
}
void WingTokenizer::PrintEverything() const
{
if (!Error.IsEmpty())
{
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
}
for (const Token& T : Tokens)
{
TStringBuilder<512> ExtraStr;
for (TCHAR Ch : T.InternalID)
{
if (Ch >= 0x20 && Ch <= 0x7E)
{
ExtraStr.AppendChar(Ch);
ExtraStr.AppendChar(' ');
}
else
{
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch);
}
}
if (T.Type >= 0x20 && T.Type <= 0x7E)
UWingServer::Printf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr);
else
UWingServer::Printf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr);
}
}
// Converts an internal identifier to its external spelling: spaces become
// periods, whitelisted identifier characters pass through unchanged, and
// every other character is written as an ampersand escape -- by entity
// name when WingEntityList knows one, otherwise by decimal code.
FString WingTokenizer::ExternalizeID(const FString &S)
{
	TStringBuilder<512> External;
	for (const TCHAR Ch : S)
	{
		if (Ch == ' ')
		{
			External.AppendChar('.');
			continue;
		}
		if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier)
		{
			External.AppendChar(Ch);
			continue;
		}

		// Needs escaping: "&name;" if there is a name, "&#NNN;" if not.
		const FStringView EntityName = WingEntityList::GetName(Ch);
		External.AppendChar('&');
		if (!EntityName.IsEmpty())
		{
			External.Append(EntityName);
		}
		else
		{
			External.AppendChar('#');
			External.Appendf(TEXT("%d"), (int32)Ch);
		}
		External.AppendChar(';');
	}
	return External.ToString();
}
// Converts an external identifier to its internal form. The whole of S
// must be a single, non-empty identifier. On any failure, Error receives
// a message and the result is an empty string; on success Error is empty.
FString WingTokenizer::TryInternalizeID(const FString &S, FString &Error)
{
	FStringView Remaining(S);
	Error.Empty();
	FString Internal = TokenizeIdentifier(Remaining, Error);

	// The identifier tokenizer itself failed: add context to its message.
	if (!Error.IsEmpty())
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *S, *Error);
		return FString();
	}

	// Everything was consumed: the only thing left to reject is an empty id.
	if (Remaining.IsEmpty())
	{
		if (Internal.IsEmpty())
		{
			Error = TEXT("ERROR: Empty identifiers are not allowed");
			return FString();
		}
		return Internal;
	}

	// The tokenizer stopped early. Look at the character it stopped on
	// and produce the most informative message we can.
	const TCHAR Offender = Remaining[0];
	if (Offender == ' ')
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, spaces must be escaped"), *S);
		return FString();
	}
	switch (WingCharacterClasses::GetCat(Offender))
	{
	case Cat::Punctuation:
		Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, these marks must be escaped: %s"),
			*S, WingCharacterClasses::PunctuationString);
		break;
	case Cat::Control:
		Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *S);
		break;
	default:
		Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *S);
		break;
	}
	return FString();
}
// Like TryInternalizeID, but reports failures itself: the error message
// is printed and the manual section on identifier sanitization is
// suggested. Returns whatever TryInternalizeID returned.
FString WingTokenizer::CheckInternalizeID(const FString &S)
{
	FString Problem;
	FString Internal = TryInternalizeID(S, Problem);
	if (Problem.IsEmpty())
	{
		return Internal;
	}

	UWingServer::Printf(TEXT("%s\n"), *Problem);
	UWingServer::SuggestManual(WingManual::Section::IdentifierSanitization);
	return Internal;
}

View File

@@ -65,6 +65,9 @@ FString WingUtils::SanitizeName(const FString &InName)
if (c == ' ') c=L'·';
if (c == '<') c=L'';
if (c == '>') c=L'';
if (c == '(') c=L'';
if (c == ')') c=L'';
if (c == '=') c=L'';
if (c == ',') c=L'';
Name[Dst++] = c;
}
@@ -84,6 +87,9 @@ FString WingUtils::UnsanitizeName(const FString &InName)
if (c == L'·') c=' ';
if (c == L'') c='<';
if (c == L'') c='>';
if (c == L'') c='(';
if (c == L'') c=')';
if (c == L'') c='=';
if (c == L'') c=',';
Name[Dst++] = c;
}

View File

@@ -0,0 +1,169 @@
#pragma once
#include "CoreMinimal.h"
// -----------------------------------------------------------------
//
// WingCharacterClasses
//
// We recognize these disjoint classes of characters:
//
// Punctuation. A small hardwired list of punctuation marks
// that we want to escape, specifically \"'(),.:;<=>&
// These particular punctuation marks were chosen because they
// either need to be escaped for json's sake, or for our
// parser's sake. Any other punctuation is just classified as
// an identifier character.
//
// Identifier characters. A whitelist of about a thousand
// ascii and unicode characters that can be used directly in
// identifiers without any kind of escaping. To get on the list,
// you need the following: to not be one of the punctuation marks listed above,
// to be printable and visible and not whitespace, and to be
// easily rendered by all of the default ubuntu fonts.
//
// Control Characters. Ascii control characters, including DEL.
//
// Other Characters. Anything else.
//
// -----------------------------------------------------------------
struct WingCharacterClasses
{
	// The disjoint categories a character can fall into.
	enum class Cat : uint8
	{
		Identifier,
		Punctuation,
		Control,
		Other,
	};

	// Looks up the category of a character. Anything that falls outside
	// the table is Cat::Other.
	static Cat GetCat(TCHAR Ch)
	{
		const int32 Index = (int32)Ch;
		const TArray<Cat> &Table = TheSet.CharCategory;
		return Table.IsValidIndex(Index) ? Table[Index] : Cat::Other;
	}

	// The punctuation marks that must be escaped inside identifiers.
	static constexpr const TCHAR *PunctuationString = TEXT("\\\"'(),.:;<=>&");

private:
	// Category of each character, indexed by character code.
	TArray<Cat> CharCategory;

	// Fills in the table; only the shared instance below is ever built.
	WingCharacterClasses();

	// Sets the category of every character in String, growing the table as needed.
	void Assign(Cat Category, FStringView String);

	// The one shared table consulted by GetCat().
	static WingCharacterClasses TheSet;
};
// -----------------------------------------------------------------
//
// The HTML Entity List.
//
// When escaping identifiers, we use HTML escapes like &lt;
// These work well because they have no conflict with the json
// parser (MCP protocol is json), they are also easy to deal
// with in the tokenizer, and the LLM is already familiar with
// that kind of escaping. The names stored in this table do not
// include the ampersand or the semicolon.
//
// This class doesn't handle hex character codes, this is just
// a lookup table from character to name and back.
//
// -----------------------------------------------------------------
struct WingEntityList
{
	// Returns the character for an entity name (given without the
	// ampersand or semicolon), or zero if the name is not in the table.
	static TCHAR GetChar(const FString &Name)
	{
		if (const TCHAR *Found = TheList.NameToChar.Find(Name))
		{
			return *Found;
		}
		return 0;
	}

	// Returns the entity name for a character (without the ampersand or
	// semicolon), or an empty view if the character has no name.
	static FStringView GetName(TCHAR Ch)
	{
		if (const FString *Found = TheList.CharToName.Find(Ch))
		{
			return *Found;
		}
		return FStringView();
	}

private:
	// The two directions of the lookup table.
	TMap<TCHAR, FString> CharToName;
	TMap<FString, TCHAR> NameToChar;

	// One row of the raw data the table is built from.
	struct Raw { const char *Name; TCHAR Codepoint; };
	WingEntityList(std::initializer_list<Raw> Data);

	// The one shared table consulted by GetChar() and GetName().
	static WingEntityList TheList;
};
// -----------------------------------------------------------------
//
// WingTokenizer
//
// Splits one line of input into tokens, and converts identifiers
// between their external (escaped) and internal (raw) spellings.
//
// -----------------------------------------------------------------
struct WingTokenizer
{
	using Cat = WingCharacterClasses::Cat;

	// Token type codes for the two token kinds that are not punctuation.
	// These are compile-time constants shared by all instances, so they
	// cost no per-object storage and leave the struct copy-assignable.
	static constexpr TCHAR Identifier = 'i';
	static constexpr TCHAR RestOfLine = 'r';

	// A token has a token type which can be Identifier,
	// RestOfLine, or a single-character punctuation mark.
	// The InternalID field contains the result of converting
	// the token from an external ID to an internal ID.
	struct Token
	{
		TCHAR Type;
		FString InternalID;
	};

	// The string that we tokenized.
	FString Input;

	// If the tokenization failed, an error message.
	FString Error;

	// The result, an array of tokens.
	TArray<Token> Tokens;

	// Tokenize a line of input. The tokens are stored in
	// the token array. If there's an error, the error is
	// stored in the error field, and the token array is
	// cleared. For identifier tokens, InternalID holds the
	// identifier already converted to its internal form; for
	// a RestOfLine token it holds the text after the '='
	// verbatim; for punctuation tokens it is empty.
	WingTokenizer(const FString& Input);

	// Convert an internal ID into an external ID.
	// Spaces are converted to periods. Any other
	// non-identifier character is HTML escaped.
	static FString ExternalizeID(const FString &S);

	// Convert an external ID into an internal ID.
	// Periods are converted back to spaces. HTML escapes
	// are converted back to raw characters. This could
	// fail, for example, if the external name contains an
	// invalid HTML escape. If it does, returns empty
	// string and sets the error message.
	static FString TryInternalizeID(const FString &S, FString &Error);

	// Calls TryInternalizeID. If this generates an
	// error, prints the error message, suggests the manual
	// entry on identifier sanitization, and returns empty
	// string.
	static FString CheckInternalizeID(const FString &S);

	// Print all tokens to the log for debugging.
	void PrintEverything() const;

private:
	// Add a token to the token array.
	void Add(TCHAR Type, FString InternalID);

	// Convert numbers to TCHAR. If there's an error, set the error
	// message and return zero.
	static TCHAR FromHex(FStringView Digits, FString &Error);
	static TCHAR FromDecimal(FStringView Digits, FString &Error);

	// Tokenize an escape sequence. Attempts to consume a valid escape
	// sequence from rest, and return the character indicated. On error,
	// sets the error message and returns zero.
	static TCHAR TokenizeEscapeSequence(FStringView &Rest, FString &Error);

	// Tokenize an identifier. Attempts to consume a valid identifier
	// from rest, and return the identifier. On error, sets the error
	// message and returns empty string.
	static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
};