176 lines
5.5 KiB
C++
176 lines
5.5 KiB
C++
#pragma once
|
|
|
|
#include "CoreMinimal.h"
|
|
|
|
// -----------------------------------------------------------------
|
|
//
|
|
// WingCharacterClasses
|
|
//
|
|
// We recognize these disjoint classes of characters:
|
|
//
|
|
// Punctuation. A small hardwired list of punctuation marks
|
|
// that we want to escape, specifically \"'(),.:;<=>&
|
|
// These particular punctuation marks were chosen because they
|
|
// either need to be escaped for json's sake, or for our
|
|
// parser's sake. Any other punctuation is just classified as
|
|
// an identifier character.
|
|
//
|
|
// Identifier characters. A whitelist of about a thousand
|
|
// ascii and unicode characters that can be used directly in
|
|
// identifiers without any kind of escaping. To get on the list,
|
|
// you need the following: to not be an ascii punctuation mark,
|
|
// to be printable and visible and not whitespace, and to be
|
|
// easily rendered by all of the default ubuntu fonts.
|
|
//
|
|
// Control Characters. Ascii control characters, including DEL.
|
|
//
|
|
// Other Characters. Anything else.
|
|
//
|
|
// -----------------------------------------------------------------
|
|
|
|
struct WingCharacterClasses
|
|
{
|
|
enum class Cat : uint8
|
|
{
|
|
Identifier,
|
|
Punctuation,
|
|
Control,
|
|
Other,
|
|
};
|
|
|
|
static Cat GetCat(TCHAR Ch)
|
|
{
|
|
int32 Cp = (int32)Ch;
|
|
if (Cp < 0 || Cp >= TheSet.CharCategory.Num()) return Cat::Other;
|
|
return TheSet.CharCategory[Cp];
|
|
}
|
|
|
|
static constexpr const TCHAR *PunctuationString = TEXT("\\\"'(),.:;<=>&");
|
|
|
|
private:
|
|
TArray<Cat> CharCategory;
|
|
WingCharacterClasses();
|
|
void Assign(Cat Category, FStringView String);
|
|
static WingCharacterClasses TheSet;
|
|
};
|
|
|
|
// -----------------------------------------------------------------
|
|
//
|
|
// The HTML Entity List.
|
|
//
|
|
// When escaping identifiers, we use HTML escapes like &lt;
|
|
// These work well because they have no conflict with the json
|
|
// parser (MCP protocol is json), they are also easy to deal
|
|
// with in the tokenizer, and the LLM is already familiar with
|
|
// that kind of escaping. The names stored in this table do not
|
|
// include the ampersand or the semicolon.
|
|
//
|
|
// This class doesn't handle hex character codes, this is just
|
|
// a lookup table from character to name and back.
|
|
//
|
|
// -----------------------------------------------------------------
|
|
|
|
struct WingEntityList
|
|
{
|
|
static TCHAR GetChar(const FString &Name)
|
|
{
|
|
TCHAR *Result = TheList.NameToChar.Find(Name);
|
|
if (Result == nullptr) return 0;
|
|
return *Result;
|
|
}
|
|
|
|
static FStringView GetName(TCHAR Ch)
|
|
{
|
|
FString *Result = TheList.CharToName.Find(Ch);
|
|
if (Result == nullptr) return FStringView();
|
|
return *Result;
|
|
}
|
|
|
|
private:
|
|
TMap<TCHAR, FString> CharToName;
|
|
TMap<FString, TCHAR> NameToChar;
|
|
struct Raw { const char *Name; TCHAR Codepoint; };
|
|
WingEntityList(std::initializer_list<Raw> Data);
|
|
static WingEntityList TheList;
|
|
};
|
|
|
|
|
|
struct WingTokenizer
|
|
{
|
|
using Cat = WingCharacterClasses::Cat;
|
|
const TCHAR Identifier = 'i';
|
|
const TCHAR RestOfLine = 'r';
|
|
|
|
// A token has a token type which can be Identifier,
|
|
// RestOfLine, or a single-character punctuation mark.
|
|
// The InternalID field contains the result of converting
|
|
// the token from an external ID to an internal ID.
|
|
struct Token
|
|
{
|
|
TCHAR Type;
|
|
FString InternalID;
|
|
};
|
|
|
|
// The string that we tokenized.
|
|
FString Input;
|
|
|
|
// If the tokenization failed, an error message.
|
|
FString Error;
|
|
|
|
// The result, an array of tokens.
|
|
TArray<Token> Tokens;
|
|
|
|
// Tokenize a line of input. The tokens are stored in
|
|
// the token array. If there's an error, the error is
|
|
// stored in the error field, and the token array is
|
|
// cleared. If the tokens contain identifiers,
|
|
WingTokenizer(const FString& Input);
|
|
|
|
// Convert an internal ID into an external ID.
|
|
// Spaces are converted to periods. Any other
|
|
// non-identifier character is HTML escaped.
|
|
static FString ExternalizeID(const FString &InternalID);
|
|
|
|
// Return true if the internal ID would convert
|
|
// to a readable, easy-to-understand external ID without
|
|
// HTML escape sequences.
|
|
static bool WouldExternalizeReadably(const FString &InternalID);
|
|
|
|
// Convert an external ID into an internal ID.
|
|
// Periods are converted back to spaces. HTML escapes
|
|
// are converted back to raw characters. This could
|
|
// fail, for example, if the external name contains an
|
|
// invalid HTML escape. If it does, returns empty
|
|
// string and sets the error message.
|
|
static FString TryInternalizeID(const FString &ExternalID, FString &Error);
|
|
|
|
// Simplify an ID. This removes any non-identifier
|
|
// characters from the ID. Be careful! This could
|
|
// remove the whole identifier! So obviously this
|
|
// should only be used in certain rare contexts where
|
|
// that's OK.
|
|
static FString SimplifyID(const FString &ID);
|
|
|
|
// Print all tokens into a string builder for debugging.
|
|
void PrintEverything(FStringBuilderBase &Out) const;
|
|
|
|
private:
|
|
// Add a token to the token array.
|
|
void Add(TCHAR Type, FString InternalID);
|
|
|
|
// Convert numbers to TCHAR. If there's an error, set the error
|
|
// message and return zero.
|
|
static TCHAR FromHex(FStringView Digits, FString &Error);
|
|
static TCHAR FromDecimal(FStringView Digits, FString &Error);
|
|
|
|
// Tokenize an escape sequence. Attempts to consume a valid escape
|
|
// sequence from rest, and return the character indicated. On error,
|
|
// sets the error message and returns zero.
|
|
static TCHAR TokenizeEscapeSequence(FStringView &Rest, FString &Error);
|
|
|
|
// Tokenize an identifier. Attempts to consume a valid identifier
|
|
// from rest, and return the identifier. On error, sets the error
|
|
// message and returns empty string.
|
|
static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
|
|
};
|