Files
integration/Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h

176 lines
5.5 KiB
C++

#pragma once
#include "CoreMinimal.h"
// -----------------------------------------------------------------
//
// WingCharacterClasses
//
// We recognize these disjoint classes of characters:
//
// Punctuation. A small hardwired list of punctuation marks
// that we want to escape, specifically \"'(),.:;<=>&
// These particular punctuation marks were chosen because they
// either need to be escaped for json's sake, or for our
// parser's sake. Any other punctuation is just classified as
// an identifier character.
//
// Identifier characters. A whitelist of about a thousand
// ascii and unicode characters that can be used directly in
// identifiers without any kind of escaping. To get on the list,
// you need the following: to not be an ascii punctuation mark,
// to be printable and visible and not whitespace, and to be
// easily rendered by all of the default ubuntu fonts.
//
// Control Characters. Ascii control characters, including DEL.
//
// Other Characters. Anything else.
//
// -----------------------------------------------------------------
struct WingCharacterClasses
{
    // The category a character can be assigned to.
    enum class Cat : uint8
    {
        Identifier,
        Punctuation,
        Control,
        Other,
    };

    // Look up the category of a character. A character whose
    // code lies outside the category table is reported as Other.
    static Cat GetCat(TCHAR Ch)
    {
        const int32 Index = static_cast<int32>(Ch);
        const bool bInTable = (Index >= 0) && (Index < TheSet.CharCategory.Num());
        return bInTable ? TheSet.CharCategory[Index] : Cat::Other;
    }

    // The hardwired punctuation marks, as one string: \"'(),.:;<=>&
    static constexpr const TCHAR *PunctuationString = TEXT("\\\"'(),.:;<=>&");

private:
    // Category table, indexed by character code.
    TArray<Cat> CharCategory;

    // Construction is private; lookups go through the shared
    // instance TheSet.
    WingCharacterClasses();

    // Defined out of line. Presumably records Category for every
    // character in String -- confirm against the implementation.
    void Assign(Cat Category, FStringView String);

    // The shared instance consulted by GetCat.
    static WingCharacterClasses TheSet;
};
// -----------------------------------------------------------------
//
// The HTML Entity List.
//
// When escaping identifiers, we use HTML escapes like &lt;
// These work well because they have no conflict with the json
// parser (MCP protocol is json), they are also easy to deal
// with in the tokenizer, and the LLM is already familiar with
// that kind of escaping. The names stored in this table do not
// include the ampersand or the semicolon.
//
// This class doesn't handle hex character codes; it is just
// a lookup table from character to name and back.
//
// -----------------------------------------------------------------
struct WingEntityList
{
    // Map an entity name (without the ampersand or semicolon) to
    // its character. Returns 0 when the name is not in the table.
    static TCHAR GetChar(const FString &Name)
    {
        if (const TCHAR *Found = TheList.NameToChar.Find(Name))
        {
            return *Found;
        }
        return 0;
    }

    // Map a character to its entity name. Returns an empty view
    // when the character has no entry. A non-empty view refers
    // to the string stored in the table.
    static FStringView GetName(TCHAR Ch)
    {
        if (const FString *Found = TheList.CharToName.Find(Ch))
        {
            return *Found;
        }
        return FStringView();
    }

private:
    // The two directions of the lookup table.
    TMap<TCHAR, FString> CharToName;
    TMap<FString, TCHAR> NameToChar;

    // One row of the raw data handed to the constructor.
    struct Raw { const char *Name; TCHAR Codepoint; };

    // Defined out of line. Presumably fills both maps from
    // Data -- confirm against the implementation.
    WingEntityList(std::initializer_list<Raw> Data);

    // The shared instance consulted by GetChar and GetName.
    static WingEntityList TheList;
};
// -----------------------------------------------------------------
//
// WingTokenizer
//
// Tokenizes one line of input into identifiers, punctuation
// marks, and rest-of-line tokens, and provides the conversions
// between internal IDs and external (escaped) IDs.
//
// -----------------------------------------------------------------
struct WingTokenizer
{
    using Cat = WingCharacterClasses::Cat;

    // Token type tags for the two non-punctuation token kinds.
    // These are class-level constants rather than per-instance
    // members, so a tokenizer stays assignable and the constants
    // can be named as WingTokenizer::Identifier without an object.
    static constexpr TCHAR Identifier = TEXT('i');
    static constexpr TCHAR RestOfLine = TEXT('r');

    // A token has a token type which can be Identifier,
    // RestOfLine, or a single-character punctuation mark.
    // The InternalID field contains the result of converting
    // the token from an external ID to an internal ID.
    struct Token
    {
        TCHAR Type;
        FString InternalID;
    };

    // The string that we tokenized.
    FString Input;

    // If the tokenization failed, an error message.
    FString Error;

    // The result, an array of tokens.
    TArray<Token> Tokens;

    // Tokenize a line of input. The tokens are stored in
    // the token array. If there's an error, the error is
    // stored in the error field, and the token array is
    // cleared. If the tokens contain identifiers, each
    // identifier token's InternalID holds the identifier
    // converted from its external form to its internal form.
    WingTokenizer(const FString& Input);

    // Convert an internal ID into an external ID.
    // Spaces are converted to periods. Any other
    // non-identifier character is HTML escaped.
    static FString ExternalizeID(const FString &InternalID);

    // Return true if the internal ID would convert
    // to a readable, easy-to-understand external ID without
    // HTML escape sequences.
    static bool WouldExternalizeReadably(const FString &InternalID);

    // Convert an external ID into an internal ID.
    // Periods are converted back to spaces. HTML escapes
    // are converted back to raw characters. This could
    // fail, for example, if the external name contains an
    // invalid HTML escape. If it does, returns empty
    // string and sets the error message.
    static FString TryInternalizeID(const FString &ExternalID, FString &Error);

    // Simplify an ID. This removes any non-identifier
    // characters from the ID. Be careful! This could
    // remove the whole identifier! So obviously this
    // should only be used in certain rare contexts where
    // that's OK.
    static FString SimplifyID(const FString &ID);

    // Print all tokens into a string builder for debugging.
    void PrintEverything(FStringBuilderBase &Out) const;

private:
    // Add a token to the token array.
    void Add(TCHAR Type, FString InternalID);

    // Convert numbers to TCHAR. If there's an error, set the error
    // message and return zero.
    static TCHAR FromHex(FStringView Digits, FString &Error);
    static TCHAR FromDecimal(FStringView Digits, FString &Error);

    // Tokenize an escape sequence. Attempts to consume a valid escape
    // sequence from rest, and return the character indicated. On error,
    // sets the error message and returns zero.
    static TCHAR TokenizeEscapeSequence(FStringView &Rest, FString &Error);

    // Tokenize an identifier. Attempts to consume a valid identifier
    // from rest, and return the identifier. On error, sets the error
    // message and returns empty string.
    static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
};