Tokenizer is now done, we also have the new InternalizeID and ExternalizeID
This commit is contained in:
32
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h
Normal file
32
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include "CoreMinimal.h"
|
||||
#include "WingServer.h"
|
||||
#include "WingHandler.h"
|
||||
#include "WingTokenizer.h"
|
||||
#include "Test_Sanitizer.generated.h"
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ---------------------------------------------------------------------------
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
UCLASS()
|
||||
class UWing_Test_Sanitizer : public UObject, public IWingHandler
|
||||
{
|
||||
GENERATED_BODY()
|
||||
|
||||
public:
|
||||
UPROPERTY(meta=(Description="The string to sanitize"))
|
||||
FString Input;
|
||||
|
||||
virtual FString GetDescription() const override
|
||||
{
|
||||
return TEXT("Test the sanitizer by sanitizing a string and printing the result.");
|
||||
}
|
||||
|
||||
virtual void Handle() override
|
||||
{
|
||||
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input));
|
||||
}
|
||||
};
|
||||
33
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Tokenizer.h
Normal file
33
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Tokenizer.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#pragma once
|
||||
|
||||
#include "CoreMinimal.h"
|
||||
#include "WingServer.h"
|
||||
#include "WingHandler.h"
|
||||
#include "WingTokenizer.h"
|
||||
#include "Test_Tokenizer.generated.h"
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ---------------------------------------------------------------------------
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
UCLASS()
|
||||
class UWing_Test_Tokenizer : public UObject, public IWingHandler
|
||||
{
|
||||
GENERATED_BODY()
|
||||
|
||||
public:
|
||||
UPROPERTY(meta=(Description="The string to tokenize"))
|
||||
FString Input;
|
||||
|
||||
virtual FString GetDescription() const override
|
||||
{
|
||||
return TEXT("Test the tokenizer by tokenizing a string and printing the result.");
|
||||
}
|
||||
|
||||
virtual void Handle() override
|
||||
{
|
||||
WingTokenizer T(Input);
|
||||
T.PrintEverything();
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include "CoreMinimal.h"
|
||||
#include "WingServer.h"
|
||||
#include "WingHandler.h"
|
||||
#include "WingTokenizer.h"
|
||||
#include "Test_Unsanitize.generated.h"
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ---------------------------------------------------------------------------
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
UCLASS()
|
||||
class UWing_Test_Unsanitize : public UObject, public IWingHandler
|
||||
{
|
||||
GENERATED_BODY()
|
||||
|
||||
public:
|
||||
UPROPERTY(meta=(Description="The sanitized identifier to unsanitize"))
|
||||
FString Input;
|
||||
|
||||
virtual FString GetDescription() const override
|
||||
{
|
||||
return TEXT("Test the unsanitizer by unsanitizing a string and printing the result.");
|
||||
}
|
||||
|
||||
virtual void Handle() override
|
||||
{
|
||||
FString Error;
|
||||
FString Result = WingTokenizer::TryInternalizeID(Input, Error);
|
||||
if (!Error.IsEmpty())
|
||||
{
|
||||
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
||||
}
|
||||
if (!Result.IsEmpty())
|
||||
{
|
||||
UWingServer::Printf(TEXT("Result: %s\n"), *Result);
|
||||
}
|
||||
}
|
||||
};
|
||||
1917
Plugins/UEWingman/Source/UEWingman/Private/WingEntities.cpp
Normal file
1917
Plugins/UEWingman/Source/UEWingman/Private/WingEntities.cpp
Normal file
File diff suppressed because it is too large
Load Diff
302
Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp
Normal file
302
Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp
Normal file
@@ -0,0 +1,302 @@
|
||||
#include "WingTokenizer.h"
|
||||
#include "WingServer.h"
|
||||
|
||||
|
||||
// Mark every character in String as belonging to Category, growing the
// lookup table first if any character falls beyond its current end.
// Slots created by the growth start out as Cat::Other.
void WingCharacterClasses::Assign(Cat Category, FStringView String)
{
	// Smallest table size that can be indexed by every character in String.
	int32 RequiredSize = 0;
	for (TCHAR Ch : String)
	{
		if (Ch >= RequiredSize)
		{
			RequiredSize = Ch + 1;
		}
	}

	const int32 PreviousSize = CharCategory.Num();
	if (RequiredSize > PreviousSize)
	{
		CharCategory.SetNum(RequiredSize);
		for (int Index = PreviousSize; Index < RequiredSize; ++Index)
		{
			CharCategory[Index] = Cat::Other;
		}
	}

	for (TCHAR Ch : String)
	{
		CharCategory[Ch] = Category;
	}
}
|
||||
|
||||
// Builds the character category table in three passes: whitelist
// everything as Identifier, reclassify the punctuation marks, then
// mark the ASCII control range.
WingCharacterClasses::WingCharacterClasses()
{
	// Pass 1: the printable, visible, non-whitespace characters that
	// appear in most ubuntu default fonts. All of them start out as
	// 'Identifier' characters; some are switched to punctuation below.
	Assign(Cat::Identifier, TEXT(
		"!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefgh"
		"ijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐ"
		"ÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖ"
		"ėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜ"
		"ŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢ"
		"ƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƾƿǀǁǂǃǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǦǧǨǩǪǫǬǭǮǯǰǴǵǸǹǼ"
		"ǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȷʒʼˆˇˉ˘˙˚˛˜˝΄΅ΆΈΉΊ"
		"ΌΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώЀЁЂЃЄ"
		"ЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъ"
		"ыьэюяѐёђѓєѕіїјљњћќѝўџѲҐґҒғҔҕҖҗҘҙҚқҢңҤҥҪҫҬҭҮүҰұҲҳҺһӀӁӂӃӄӇӈӋӌӏӐӑӒӓӔӕӖӗӘә"
		"ӚӛӜӝӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹẀẁẂẃẄẅỲỳ–—―‘’‚“”„†‡•…‰‹›⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆"
		"₇₈₉€₹№™Ω⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞∂∆∏∑−∕∙√∞∫≈≠≤≥◊fifl\"\\"
	));

	// Pass 2: all the punctuation marks that we do anything interesting with.
	Assign(Cat::Punctuation, PunctuationString);

	// Pass 3: control characters (0x00-0x1F and DEL). The table already
	// covers these indices because pass 1 assigned characters above 0x7F.
	for (int Code = 0; Code < 32; ++Code)
	{
		CharCategory[Code] = Cat::Control;
	}
	CharCategory[0x7F] = Cat::Control;
}
|
||||
|
||||
WingCharacterClasses WingCharacterClasses::TheSet;
|
||||
|
||||
// Append a token of the given type to the token array.
//
// Type        - Identifier, RestOfLine, or a punctuation character.
// InternalID  - decoded text carried by the token (may be empty);
//               taken by value so callers can hand over a temporary.
void WingTokenizer::Add(TCHAR Type, FString InternalID)
{
	Token NewToken;
	NewToken.Type = Type;
	NewToken.InternalID = MoveTemp(InternalID);

	// Move the finished token into the array rather than copying it,
	// so the string buffer is not duplicated.
	Tokens.Add(MoveTemp(NewToken));
}
|
||||
|
||||
// Parse a run of hexadecimal digits (either case) into a character code.
//
// Digits - the digits only, without any "&#x" prefix or ";" suffix.
// Error  - receives a message if Digits is empty, contains a non-hex
//          character, or encodes a value above 0xFFFF.
//
// Returns the decoded character, or 0 when an error was reported.
TCHAR WingTokenizer::FromHex(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1) { Error = TEXT("Empty hex escape sequence"); return 0; }

	int32 Value = 0;
	for (TCHAR Ch : Digits)
	{
		int32 Digit = 0;
		if (Ch >= '0' && Ch <= '9') Digit = Ch - '0';
		else if (Ch >= 'a' && Ch <= 'f') Digit = Ch - 'a' + 10;
		else if (Ch >= 'A' && Ch <= 'F') Digit = Ch - 'A' + 10;
		else { Error = TEXT("Invalid hex digit in escape sequence"); return 0; }

		Value = Value * 16 + Digit;

		// Checked on every digit, so Value can never overflow int32.
		if (Value > 0xFFFF) { Error = TEXT("Escape sequence value out of range"); return 0; }
	}
	return (TCHAR)Value;
}
|
||||
|
||||
// Parse a run of decimal digits into a character code.
//
// Digits - the text to parse; every character must be '0'..'9'.
// Error  - receives a message if Digits is empty, contains a
//          non-digit character, or encodes a value above 0xFFFF.
//
// Returns the decoded character, or 0 when an error was reported.
TCHAR WingTokenizer::FromDecimal(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1) { Error = TEXT("Empty decimal escape sequence"); return 0; }

	int32 Value = 0;
	for (TCHAR Ch : Digits)
	{
		if (Ch >= '0' && Ch <= '9') Value = Value * 10 + (Ch - '0');
		else { Error = TEXT("Invalid decimal digit in escape sequence"); return 0; }

		// Checked on every digit, so Value can never overflow int32.
		if (Value > 0xFFFF) { Error = TEXT("Escape sequence value out of range"); return 0; }
	}
	return (TCHAR)Value;
}
|
||||
|
||||
// Decode one ampersand escape sequence at the front of Rest.
//
// Three forms are recognized: "&#xHHHH;" (hex), "&#DDDD;" (decimal) and
// "&name;" (looked up through WingEntityList). The caller is expected to
// have checked that Rest starts with '&'; that character is not
// re-validated here.
//
// Rest  - on success, advanced past the terminating semicolon. On
//         failure it is left untouched.
// Error - if already set on entry, nothing is done. Otherwise it
//         receives a message describing any failure.
//
// Returns the decoded character, or 0 on error. A sequence that decodes
// to character code 0 is rejected: 0 is the error sentinel, and a NUL
// would terminate the identifier early once the builder is turned into
// a string.
TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return 0;

	// Search for the semicolon.
	int32 SemiPos = INDEX_NONE;
	if (!Rest.FindChar(';', SemiPos))
	{
		Error = TEXT("Ampersand escape sequence doesn't end in semicolon");
		return 0;
	}

	// Need at least "&", two payload characters, and ";". This also
	// makes the Rest[1] / Rest[2] accesses below safe.
	if (SemiPos < 3)
	{
		Error = TEXT("Ampersand escape sequence too short");
		return 0;
	}

	TCHAR Result = 0;
	if (Rest[1] == '#')
	{
		if ((Rest[2] == 'x') || (Rest[2] == 'X'))
			Result = FromHex(Rest.Mid(3, SemiPos - 3), Error);
		else
			Result = FromDecimal(Rest.Mid(2, SemiPos - 2), Error);

		// Do not consume input for a malformed numeric escape.
		if (!Error.IsEmpty()) return 0;

		if (Result == 0)
		{
			Error = TEXT("Escape sequence value out of range");
			return 0;
		}
	}
	else
	{
		FString Name(Rest.Mid(1, SemiPos - 1));
		Result = WingEntityList::GetChar(Name);
		if (Result == 0)
		{
			Error = FString::Printf(TEXT("Unknown HTML entity: &%s;"), *Name);
			return 0;
		}
	}

	Rest = Rest.RightChop(SemiPos + 1);
	return Result;
}
|
||||
|
||||
// Consume an external identifier from the front of Rest and return its
// internal (decoded) form.
//
// Decoding rules: '.' becomes a space, an '&' escape sequence becomes
// the character it names, and Identifier / Other characters are copied
// through. A literal space, or any other category, ends the identifier
// without being consumed.
//
// If Error is set on entry, or an escape sequence fails to decode, the
// result is an empty string. An empty result with no error is possible
// and is left for the caller to diagnose.
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty())
	{
		return FString();
	}

	TStringBuilder<512> Buffer;
	while (Error.IsEmpty() && !Rest.IsEmpty())
	{
		const TCHAR Next = Rest[0];

		if (Next == '&')
		{
			// The escape decoder advances Rest itself.
			Buffer.AppendChar(TokenizeEscapeSequence(Rest, Error));
		}
		else if (Next == '.')
		{
			Buffer.AppendChar(' ');
			Rest = Rest.RightChop(1);
		}
		else
		{
			if (Next == ' ')
			{
				break;
			}

			// We accept 'Other' characters in case the LLM sends unicode
			// that isn't on the whitelist. This is intentional.
			const Cat Category = WingCharacterClasses::GetCat(Next);
			if ((Category != Cat::Identifier) && (Category != Cat::Other))
			{
				break;
			}

			Buffer.AppendChar(Next);
			Rest = Rest.RightChop(1);
		}
	}

	if (!Error.IsEmpty())
	{
		return FString();
	}

	// We deliberately do not produce an error message for empty
	// identifiers, because we can't generate a good message here. We
	// leave it to others to deal with that case.
	return Buffer.ToString();
}
|
||||
|
||||
// Tokenize one line of input.
//
// Spaces and tabs separate tokens. An '=' turns everything after it into
// a single RestOfLine token and ends tokenization. '.', '&', Identifier
// and Other characters start an Identifier token. Any remaining
// punctuation mark becomes a token whose type is the mark itself.
// Control characters are an error.
//
// On error, Error holds the message and Tokens is emptied.
WingTokenizer::WingTokenizer(const FString& Input)
	// The parameter shadows the member of the same name, so the member
	// must be initialized explicitly or it stays empty.
	: Input(Input)
{
	FStringView Rest(Input);
	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		TCHAR Ch = Rest[0];

		// Whitespace between tokens.
		if ((Ch == ' ') || (Ch == '\t'))
		{
			Rest = Rest.RightChop(1);
			continue;
		}

		// '=' captures the remainder of the line verbatim.
		if (Ch == '=')
		{
			Add(RestOfLine, FString(Rest.RightChop(1)));
			break;
		}

		// '.' and '&' are classified as punctuation but are legal
		// inside an external identifier (escaped space / escape
		// sequence), so they must be tested before the category check.
		if ((Ch == '.') || (Ch == '&'))
		{
			Add(Identifier, TokenizeIdentifier(Rest, Error));
			continue;
		}

		Cat Category = WingCharacterClasses::GetCat(Ch);
		if (Category == Cat::Punctuation)
		{
			Add(Ch, FString());
			Rest = Rest.RightChop(1);
			continue;
		}
		if (Category == Cat::Control)
		{
			Error = TEXT("Control characters in input, not allowed");
			break;
		}

		// Identifier or Other: TokenizeIdentifier consumes at least
		// this character, so the loop always makes progress.
		Add(Identifier, TokenizeIdentifier(Rest, Error));
		continue;
	}

	if (!Error.IsEmpty()) Tokens.Empty();
}
|
||||
|
||||
void WingTokenizer::PrintEverything() const
|
||||
{
|
||||
if (!Error.IsEmpty())
|
||||
{
|
||||
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
||||
}
|
||||
for (const Token& T : Tokens)
|
||||
{
|
||||
TStringBuilder<512> ExtraStr;
|
||||
for (TCHAR Ch : T.InternalID)
|
||||
{
|
||||
if (Ch >= 0x20 && Ch <= 0x7E)
|
||||
{
|
||||
ExtraStr.AppendChar(Ch);
|
||||
ExtraStr.AppendChar(' ');
|
||||
}
|
||||
else
|
||||
{
|
||||
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch);
|
||||
}
|
||||
}
|
||||
if (T.Type >= 0x20 && T.Type <= 0x7E)
|
||||
UWingServer::Printf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr);
|
||||
else
|
||||
UWingServer::Printf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr);
|
||||
}
|
||||
}
|
||||
|
||||
// Convert an internal ID into its external form.
//
// Spaces become '.', characters classified as Identifier are copied
// unchanged, and everything else is written as an ampersand escape:
// "&name;" when WingEntityList has a name for the character, otherwise
// "&#NNN;" with the decimal character code.
FString WingTokenizer::ExternalizeID(const FString &S)
{
	TStringBuilder<512> Out;
	for (TCHAR Ch : S)
	{
		if (Ch == ' ')
		{
			Out.AppendChar('.');
			continue;
		}

		if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier)
		{
			Out.AppendChar(Ch);
			continue;
		}

		// Needs escaping: prefer a named entity, fall back to decimal.
		const FStringView EntityName = WingEntityList::GetName(Ch);
		Out.AppendChar('&');
		if (!EntityName.IsEmpty())
		{
			Out.Append(EntityName);
		}
		else
		{
			Out.AppendChar('#');
			Out.Appendf(TEXT("%d"), (int32)Ch);
		}
		Out.AppendChar(';');
	}
	return Out.ToString();
}
|
||||
|
||||
// Convert an external ID into an internal ID.
//
// S     - the complete external identifier; it must be consumed in full.
// Error - cleared on entry; receives a message on failure.
//
// Returns the decoded identifier, or an empty string on failure. Failure
// cases: a bad escape sequence, a character that ends the identifier
// before the end of S, or an empty identifier.
FString WingTokenizer::TryInternalizeID(const FString &S, FString &Error)
{
	FStringView Remaining(S);
	Error.Empty();
	FString Internal = TokenizeIdentifier(Remaining, Error);

	// The tokenizer itself reported a problem: annotate it with context.
	if (!Error.IsEmpty())
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *S, *Error);
		return FString();
	}

	// Everything was consumed: the only remaining failure is empty input.
	if (Remaining.IsEmpty())
	{
		if (Internal.IsEmpty())
		{
			Error = TEXT("ERROR: Empty identifiers are not allowed");
			return FString();
		}
		return Internal;
	}

	// The tokenizer stopped early. Inspect the character it stopped on
	// to produce the most informative message we can.
	const TCHAR Offender = Remaining[0];
	const Cat Category = WingCharacterClasses::GetCat(Offender);
	if (Offender == ' ')
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, spaces must be escaped"), *S);
	}
	else if (Category == Cat::Punctuation)
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, these marks must be escaped: %s"),
			*S, WingCharacterClasses::PunctuationString);
	}
	else if (Category == Cat::Control)
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *S);
	}
	else
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *S);
	}
	return FString();
}
|
||||
|
||||
// Wrapper around TryInternalizeID that reports failures itself: on error
// it prints the message and suggests the manual section on identifier
// sanitization. Returns whatever TryInternalizeID returned.
FString WingTokenizer::CheckInternalizeID(const FString &S)
{
	FString Problem;
	FString Internal = TryInternalizeID(S, Problem);

	if (Problem.IsEmpty())
	{
		return Internal;
	}

	UWingServer::Printf(TEXT("%s\n"), *Problem);
	UWingServer::SuggestManual(WingManual::Section::IdentifierSanitization);
	return Internal;
}
|
||||
@@ -65,6 +65,9 @@ FString WingUtils::SanitizeName(const FString &InName)
|
||||
if (c == ' ') c=L'·';
|
||||
if (c == '<') c=L'◁';
|
||||
if (c == '>') c=L'▷';
|
||||
if (c == '(') c=L'❨';
|
||||
if (c == ')') c=L'❩';
|
||||
if (c == '=') c=L'≡';
|
||||
if (c == ',') c=L'▾';
|
||||
Name[Dst++] = c;
|
||||
}
|
||||
@@ -84,6 +87,9 @@ FString WingUtils::UnsanitizeName(const FString &InName)
|
||||
if (c == L'·') c=' ';
|
||||
if (c == L'◁') c='<';
|
||||
if (c == L'▷') c='>';
|
||||
if (c == L'❨') c='(';
|
||||
if (c == L'❩') c=')';
|
||||
if (c == L'≡') c='=';
|
||||
if (c == L'▾') c=',';
|
||||
Name[Dst++] = c;
|
||||
}
|
||||
|
||||
169
Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h
Normal file
169
Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h
Normal file
@@ -0,0 +1,169 @@
|
||||
#pragma once
|
||||
|
||||
#include "CoreMinimal.h"
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// WingCharacterClasses
|
||||
//
|
||||
// We recognize these disjoint classes of characters:
|
||||
//
|
||||
// Punctuation. A small hardwired list of punctuation marks
|
||||
// that we want to escape, specifically \"'(),.:;<=>&
|
||||
// These particular punctuation marks were chosen because they
|
||||
// either need to be escaped for json's sake, or for our
|
||||
// parser's sake. Any other punctuation is just classified as
|
||||
// an identifier character.
|
||||
//
|
||||
// Identifier characters. A whitelist of about a thousand
|
||||
// ascii and unicode characters that can be used directly in
|
||||
// identifiers without any kind of escaping. To get on the list,
|
||||
// you need the following: to not be an ascii punctuation mark,
|
||||
// to be printable and visible and not whitespace, and to be
|
||||
// easily rendered by all of the default ubuntu fonts.
|
||||
//
|
||||
// Control Characters. Ascii control characters, including DEL.
|
||||
//
|
||||
// Other Characters. Anything else.
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
struct WingCharacterClasses
|
||||
{
|
||||
enum class Cat : uint8
|
||||
{
|
||||
Identifier,
|
||||
Punctuation,
|
||||
Control,
|
||||
Other,
|
||||
};
|
||||
|
||||
static Cat GetCat(TCHAR Ch)
|
||||
{
|
||||
int32 Cp = (int32)Ch;
|
||||
if (Cp < 0 || Cp >= TheSet.CharCategory.Num()) return Cat::Other;
|
||||
return TheSet.CharCategory[Cp];
|
||||
}
|
||||
|
||||
static constexpr const TCHAR *PunctuationString = TEXT("\\\"'(),.:;<=>&");
|
||||
|
||||
private:
|
||||
TArray<Cat> CharCategory;
|
||||
WingCharacterClasses();
|
||||
void Assign(Cat Category, FStringView String);
|
||||
static WingCharacterClasses TheSet;
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
// The HTML Entity List.
|
||||
//
|
||||
// When escaping identifiers, we use HTML escapes like &lt;
|
||||
// These work well because they have no conflict with the json
|
||||
// parser (MCP protocol is json), they are also easy to deal
|
||||
// with in the tokenizer, and the LLM is already familiar with
|
||||
// that kind of escaping. The names stored in this table do not
|
||||
// include the ampersand or the semicolon.
|
||||
//
|
||||
// This class doesn't handle hex character codes, this is just
|
||||
// a lookup table from character to name and back.
|
||||
//
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
struct WingEntityList
|
||||
{
|
||||
static TCHAR GetChar(const FString &Name)
|
||||
{
|
||||
TCHAR *Result = TheList.NameToChar.Find(Name);
|
||||
if (Result == nullptr) return 0;
|
||||
return *Result;
|
||||
}
|
||||
|
||||
static FStringView GetName(TCHAR Ch)
|
||||
{
|
||||
FString *Result = TheList.CharToName.Find(Ch);
|
||||
if (Result == nullptr) return FStringView();
|
||||
return *Result;
|
||||
}
|
||||
|
||||
private:
|
||||
TMap<TCHAR, FString> CharToName;
|
||||
TMap<FString, TCHAR> NameToChar;
|
||||
struct Raw { const char *Name; TCHAR Codepoint; };
|
||||
WingEntityList(std::initializer_list<Raw> Data);
|
||||
static WingEntityList TheList;
|
||||
};
|
||||
|
||||
|
||||
struct WingTokenizer
|
||||
{
|
||||
using Cat = WingCharacterClasses::Cat;
|
||||
const TCHAR Identifier = 'i';
|
||||
const TCHAR RestOfLine = 'r';
|
||||
|
||||
// A token has a token type which can be Identifier,
|
||||
// RestOfLine, or a single-character punctuation mark.
|
||||
// The InternalID field contains the result of converting
|
||||
// the token from an external ID to an internal ID.
|
||||
struct Token
|
||||
{
|
||||
TCHAR Type;
|
||||
FString InternalID;
|
||||
};
|
||||
|
||||
// The string that we tokenized.
|
||||
FString Input;
|
||||
|
||||
// If the tokenization failed, an error message.
|
||||
FString Error;
|
||||
|
||||
// The result, an array of tokens.
|
||||
TArray<Token> Tokens;
|
||||
|
||||
// Tokenize a line of input. The tokens are stored in
|
||||
// the token array. If there's an error, the error is
|
||||
// stored in the error field, and the token array is
|
||||
// cleared. If the tokens contain identifiers,
|
||||
WingTokenizer(const FString& Input);
|
||||
|
||||
// Convert an internal ID into an external ID.
|
||||
// Spaces are converted to periods. Any other
|
||||
// non-identifier character is HTML escaped.
|
||||
static FString ExternalizeID(const FString &S);
|
||||
|
||||
// Convert an external ID into an internal ID.
|
||||
// Periods are converted back to spaces. HTML escapes
|
||||
// are converted back to raw characters. This could
|
||||
// fail, for example, if the external name contains an
|
||||
// invalid HTML escape. If it does, returns empty
|
||||
// string and sets the error message.
|
||||
static FString TryInternalizeID(const FString &S, FString &Error);
|
||||
|
||||
// Calls TryInternalizeName. If this generates an
|
||||
// error, prints the error message, suggests the manual
|
||||
// entry on identifier sanitization, and returns empty
|
||||
// string.
|
||||
static FString CheckInternalizeID(const FString &S);
|
||||
|
||||
// Print all tokens to the log for debugging.
|
||||
void PrintEverything() const;
|
||||
|
||||
private:
|
||||
// Add a token to the token array.
|
||||
void Add(TCHAR Type, FString InternalID);
|
||||
|
||||
// Convert numbers to TCHAR. If there's an error, set the error
|
||||
// message and return zero.
|
||||
static TCHAR FromHex(FStringView Digits, FString &Error);
|
||||
static TCHAR FromDecimal(FStringView Digits, FString &Error);
|
||||
|
||||
// Tokenize an escape sequence. Attempts to consume a valid escape
|
||||
// sequence from rest, and return the character indicated. On error,
|
||||
// sets the error message and returns zero.
|
||||
static TCHAR TokenizeEscapeSequence(FStringView &Rest, FString &Error);
|
||||
|
||||
// Tokenize an identifier. Attempts to consume a valid identifier
|
||||
// from rest, and return the identifier. On error, sets the error
|
||||
// message and returns empty string.
|
||||
static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
|
||||
};
|
||||
Reference in New Issue
Block a user