// File: integration/Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp
// (390 lines, 12 KiB, C++)

#include "WingTokenizer.h"
// Tags every character that occurs in String with the given Category.
//
// CharCategory is indexed directly by character code, so the table is first
// grown (if needed) until it can be indexed by the largest character in
// String. Slots created by that growth start out as Cat::Other.
void WingCharacterClasses::Assign(Cat Category, FStringView String)
{
	// One past the highest character code present in String.
	int32 RequiredSize = 0;
	for (TCHAR Ch : String)
	{
		if (Ch >= RequiredSize) RequiredSize = Ch + 1;
	}

	const int32 PreviousSize = CharCategory.Num();
	if (RequiredSize > PreviousSize)
	{
		CharCategory.SetNum(RequiredSize);
		for (int32 Index = PreviousSize; Index < RequiredSize; ++Index)
		{
			CharCategory[Index] = Cat::Other;
		}
	}

	// The table is now large enough for every character in String.
	for (TCHAR Ch : String)
	{
		CharCategory[Ch] = Category;
	}
}
// Builds the character-category table in three passes:
//   1. every character in the whitelist literal below becomes Cat::Identifier,
//   2. every character in PunctuationString is re-tagged Cat::Punctuation,
//   3. codes 0..31 and 0x7F are tagged Cat::Control.
// Any slot not touched by these passes keeps the Cat::Other value that
// Assign() gives to newly created table entries.
WingCharacterClasses::WingCharacterClasses()
{
// This is the set of printable, visible, non-whitespace characters that
// appear in most ubuntu default fonts. I initially map all of these as
// 'Identifier' characters, but later I swap some of them over to punctuation.
// NOTE(review): the literal is runtime data; do not re-encode or normalize it.
Assign(Cat::Identifier, TEXT(
"!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefgh"
"ijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐ"
"ÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖ"
"ėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜ"
"ŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢ"
"ƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƾƿǀǁǂǃǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǦǧǨǩǪǫǬǭǮǯǰǴǵǸǹǼ"
"ǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȷʒʼˆˇˉ˘˙˚˛˜˝΄΅ΆΈΉΊ"
"ΌΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώЀЁЂЃЄ"
"ЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъ"
"ыьэюяѐёђѓєѕіїјљњћќѝўџѲҐґҒғҔҕҖҗҘҙҚқҢңҤҥҪҫҬҭҮүҰұҲҳҺһӀӁӂӃӄӇӈӋӌӏӐӑӒӓӔӕӖӗӘә"
"ӚӛӜӝӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹẀẁẂẃẄẅỲỳ–—―‘’‚“”„†‡•…‰‹›⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆"
"₇₈₉€₹№™Ω⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞∂∆∏∑−∕∙√∞∫≈≠≤≥◊fifl\"\\"
));
// All the punctuation marks that we do anything interesting with.
// This runs after the whitelist pass, so it overrides Cat::Identifier for
// any character that appears in both strings.
Assign(Cat::Punctuation, PunctuationString);
// Control characters.
// These direct writes rely on the Assign() calls above having already grown
// the table past index 0x7F (the whitelist contains characters above 0x7F).
for (int i = 0; i < 32; i++) CharCategory[i] = Cat::Control;
CharCategory[0x7F] = Cat::Control;
}
// Out-of-class definition of the static TheSet instance. It is
// default-constructed, so the constructor above populates its table.
WingCharacterClasses WingCharacterClasses::TheSet;
// Appends a token whose payload is an FName.
//
// Type   - token type code stored in the token.
// Name   - value stored in the token's Name field (Rest is left defaulted).
// Before - view whose start marks where the token's source text begins.
// After  - view whose start marks where the token's source text ends.
void WingTokenizer::Add(TCHAR Type, FName Name, FStringView Before, FStringView After)
{
	// The source span is the text between the two view start pointers.
	const TCHAR* SourceBegin = Before.GetData();
	const auto SourceLength = After.GetData() - SourceBegin;

	Token NewToken;
	NewToken.Type = Type;
	NewToken.Name = Name;
	NewToken.Source = FStringView(SourceBegin, SourceLength);
	Tokens.Add(NewToken);
}
// Appends a token whose payload is a string view.
//
// Type   - token type code stored in the token.
// Rest   - view stored in the token's Rest field (Name is left defaulted).
// Before - view whose start marks where the token's source text begins.
// After  - view whose start marks where the token's source text ends.
void WingTokenizer::Add(TCHAR Type, FStringView Rest, FStringView Before, FStringView After)
{
	// The source span is the text between the two view start pointers.
	const TCHAR* SourceBegin = Before.GetData();
	const auto SourceLength = After.GetData() - SourceBegin;

	Token NewToken;
	NewToken.Type = Type;
	NewToken.Rest = Rest;
	NewToken.Source = FStringView(SourceBegin, SourceLength);
	Tokens.Add(NewToken);
}
// Converts a run of hexadecimal digits (either case) to a character code.
//
// Digits - the digits only, without any prefix or terminator.
// Error  - receives a message when conversion fails.
// Returns the decoded character, or 0 with Error set when Digits is empty,
// contains a non-hex character, or the value exceeds 0xFFFF.
TCHAR WingTokenizer::FromHex(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1)
	{
		Error = "Empty hex escape sequence";
		return 0;
	}

	int32 Accumulated = 0;
	for (TCHAR Ch : Digits)
	{
		// Translate one hex digit into its numeric value.
		int32 Nibble = 0;
		if (Ch >= '0' && Ch <= '9')
		{
			Nibble = Ch - '0';
		}
		else if (Ch >= 'a' && Ch <= 'f')
		{
			Nibble = Ch - 'a' + 10;
		}
		else if (Ch >= 'A' && Ch <= 'F')
		{
			Nibble = Ch - 'A' + 10;
		}
		else
		{
			Error = "Invalid hex digit in escape sequence";
			return 0;
		}

		Accumulated = Accumulated * 16 + Nibble;

		// Checking after every digit keeps the accumulator from overflowing.
		if (Accumulated > 0xFFFF)
		{
			Error = "Escape sequence value out of range";
			return 0;
		}
	}
	return static_cast<TCHAR>(Accumulated);
}
// Converts a run of decimal digits to a character code.
//
// Digits - the digits only, without any prefix or terminator.
// Error  - receives a message when conversion fails.
// Returns the decoded character, or 0 with Error set when Digits is empty,
// contains a non-digit character, or the value exceeds 0xFFFF.
TCHAR WingTokenizer::FromDecimal(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1)
	{
		Error = "Empty decimal escape sequence";
		return 0;
	}

	int32 Accumulated = 0;
	for (TCHAR Ch : Digits)
	{
		const bool bIsDigit = (Ch >= '0') && (Ch <= '9');
		if (!bIsDigit)
		{
			Error = "Invalid decimal digit in escape sequence";
			return 0;
		}

		Accumulated = Accumulated * 10 + (Ch - '0');

		// Checking after every digit keeps the accumulator from overflowing.
		if (Accumulated > 0xFFFF)
		{
			Error = "Escape sequence value out of range";
			return 0;
		}
	}
	return static_cast<TCHAR>(Accumulated);
}
// Decodes one ampersand escape sequence at the front of Rest.
//
// Rest[0] is treated as the introducing character and is not inspected.
// The sequence runs up to the first ';' found in Rest. Three forms are
// recognized: "#x"/"#X" followed by hex digits, "#" followed by decimal
// digits, and a name looked up through WingEntityList::GetChar.
//
// Rest  - on entry, the text starting at the escape; advanced past the ';'
//         unless the function returns early.
// Error - if already non-empty on entry the function does nothing and
//         returns 0; otherwise it receives a message on failure.
// Returns the decoded character, or 0 when an error is reported.
TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return 0;

	// Locate the terminating semicolon.
	int32 SemicolonIndex;
	const bool bFoundSemicolon = Rest.FindChar(';', SemicolonIndex);
	if (!bFoundSemicolon)
	{
		Error = "Ampersand escape sequence doesn't end in semicolon";
		return 0;
	}
	// Fewer than two characters between the introducer and the ';'.
	if (SemicolonIndex < 3)
	{
		Error = "Ampersand escape sequence too short";
		return 0;
	}

	TCHAR Decoded = 0;
	const bool bIsNumeric = (Rest[1] == '#');
	if (!bIsNumeric)
	{
		// Named entity: everything between the introducer and the ';'.
		FString EntityName(Rest.Mid(1, SemicolonIndex - 1));
		Decoded = WingEntityList::GetChar(EntityName);
		if (Decoded == 0)
		{
			Error = FString::Printf(TEXT("Unknown HTML entity: &%s;"), *EntityName);
			return 0;
		}
	}
	else if ((Rest[2] == 'x') || (Rest[2] == 'X'))
	{
		Decoded = FromHex(Rest.Mid(3, SemicolonIndex - 3), Error);
	}
	else
	{
		Decoded = FromDecimal(Rest.Mid(2, SemicolonIndex - 2), Error);
	}

	// Consume the whole sequence, including the semicolon. This also happens
	// when the numeric helpers reported an error; callers check Error.
	Rest = Rest.RightChop(SemicolonIndex + 1);
	return Decoded;
}
// Splits off the leading part of Rest up to (not including) the first comma,
// or all of Rest when it contains no comma.
//
// Rest  - advanced past the returned text; a comma, if any, stays in Rest.
// Error - if already non-empty the function does nothing and returns an
//         empty view. This function never sets it.
FStringView WingTokenizer::TokenizeAssetName(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return FStringView();

	// Index of the first comma, or Rest.Len() when there is none.
	int SplitAt = 0;
	for (; SplitAt < Rest.Len(); ++SplitAt)
	{
		if (Rest[SplitAt] == ',') break;
	}

	FStringView Asset = Rest.SubStr(0, SplitAt);
	Rest = Rest.RightChop(SplitAt);
	return Asset;
}
// Reads an identifier from the front of Rest and returns it as an FName.
//
// Scanning stops at a literal space, at the end of input, on an error, or at
// a character whose category is neither Identifier nor Other. Within the
// identifier, '.' is stored as a space and '&' starts an escape sequence that
// is decoded by TokenizeEscapeSequence.
//
// Rest  - advanced past everything that was consumed.
// Error - if already non-empty the function does nothing; otherwise it
//         receives a message on failure.
// Returns the name, or FName() when an error is reported. An empty name is
// NOT treated as an error here: this function lacks the context for a good
// message, so that check is deliberately left to the caller.
FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return FName();

	TCHAR NameChars[NAME_SIZE];
	int Count = 0;

	// Stores one character while room remains. Once the buffer is full,
	// further characters are dropped and the overflow is reported below.
	auto Push = [&NameChars, &Count](TCHAR Ch)
	{
		if (Count < NAME_SIZE) NameChars[Count++] = Ch;
	};

	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		const TCHAR Ch = Rest[0];
		if (Ch == ' ') break;

		if (Ch == '.')
		{
			Push(' ');
			Rest = Rest.RightChop(1);
		}
		else if (Ch == '&')
		{
			// The escape decoder advances Rest itself.
			const TCHAR Decoded = TokenizeEscapeSequence(Rest, Error);
			Push(Decoded);
		}
		else
		{
			// We accept 'Other' characters in case the LLM sends unicode
			// that isn't on the whitelist. This is intentional.
			const Cat Category = WingCharacterClasses::GetCat(Ch);
			const bool bAccepted = (Category == Cat::Identifier) || (Category == Cat::Other);
			if (!bAccepted) break;
			Push(Ch);
			Rest = Rest.RightChop(1);
		}
	}

	if (!Error.IsEmpty()) return FName();

	// The buffer has just enough room for the longest FName plus the
	// required null terminator. A completely full buffer leaves no room
	// for the null, which means the name is too long.
	if (Count == NAME_SIZE)
	{
		Error = "FName too long";
		return FName();
	}

	NameChars[Count] = 0;
	return FName(Count, NameChars);
}
// Tokenizes In into the Tokens array.
//
// The text is copied into Input first; every token's views point into that
// copy. Scanning stops at the end of input, at the first error, or after a
// RestOfLine token. If an error occurred, all tokens produced so far are
// discarded. In every case two sentinel tokens of type 0 are appended and
// Next is set to the first token.
WingTokenizer::WingTokenizer(const FString& In)
{
	Input = In;
	FStringView Rest(Input);

	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		const TCHAR Ch = Rest[0];

		// Spaces and tabs between tokens are skipped.
		const bool bIsBlank = (Ch == ' ') || (Ch == '\t');
		if (bIsBlank)
		{
			Rest = Rest.RightChop(1);
			continue;
		}

		// Where this token starts, so Add() can record its source span.
		const FStringView TokenStart = Rest;

		if (Ch == '=')
		{
			// Everything after '=' becomes a single RestOfLine token.
			const FStringView Body = Rest.RightChop(1);
			Rest = Rest.RightChop(Rest.Len());
			Add(RestOfLine, Body, TokenStart, Rest);
			break;
		}

		if (Ch == '/')
		{
			const FStringView Asset = TokenizeAssetName(Rest, Error);
			Add(AssetName, Asset, TokenStart, Rest);
			continue;
		}

		// '.' and '&' always start an identifier, so they skip the category
		// lookup. Everything else is classified first.
		const bool bForcedIdentifier = (Ch == '.') || (Ch == '&');
		if (!bForcedIdentifier)
		{
			const Cat Category = WingCharacterClasses::GetCat(Ch);
			if (Category == Cat::Control)
			{
				Error = "Control characters in input, not allowed";
				break;
			}
			if (Category == Cat::Punctuation)
			{
				// A punctuation mark is a one-character token typed by the mark itself.
				Rest = Rest.RightChop(1);
				Add(Ch, FString(), TokenStart, Rest);
				continue;
			}
		}

		// TokenizeIdentifier advances Rest, so it must complete before Rest
		// is passed to Add() as the end of the token.
		const FName Id = TokenizeIdentifier(Rest, Error);
		Add(Identifier, Id, TokenStart, Rest);
	}

	if (!Error.IsEmpty()) Tokens.Empty();

	// Two sentinels means we can safely do lookahead 2 without risk.
	Rest = Rest.LeftChop(Rest.Len());
	Add(0, FName(), Rest, Rest);
	Add(0, FName(), Rest, Rest);
	Next = Tokens.GetData();
}
// Records the current token position under Name. An existing entry with the
// same name is overwritten; otherwise a new entry is appended.
void WingTokenizer::SaveCursor(FName Name)
{
	// Position of Next expressed as an index into Tokens.
	int Position = Next - Tokens.GetData();

	for (auto &Entry : SavedCursor)
	{
		if (Entry.Key == Name)
		{
			Entry.Value = Position;
			return;
		}
	}
	SavedCursor.Emplace(Name, Position);
}
// Returns the source text covered by the tokens from a saved cursor up to
// the current position plus Extra tokens.
//
// SavePos - name of a cursor stored by SaveCursor; when no entry matches,
//           the range starts at token 0.
// Extra   - number of tokens added to the current position for the
//           (exclusive) end of the range. The end is clamped to lie between
//           the start and Tokens.Num().
// Returns an empty string when the range contains no tokens.
FString WingTokenizer::GetRange(FName SavePos, int Extra) const
{
	int First = 0;
	for (auto &Entry : SavedCursor)
	{
		if (Entry.Key == SavePos) First = Entry.Value;
	}

	int Limit = (Next - Tokens.GetData()) + Extra;
	Limit = FMath::Clamp(Limit, First, Tokens.Num());
	if (First >= Limit) return FString();

	// The text runs from the start of the first token's source to the end
	// of the last token's source.
	const Token& FirstToken = Tokens[First];
	const Token& LastToken = Tokens[Limit - 1];
	const TCHAR* TextBegin = FirstToken.Source.GetData();
	const TCHAR* TextEnd = LastToken.Source.GetData() + LastToken.Source.Len();
	return FString(TextEnd - TextBegin, TextBegin);
}
// Writes a dump of the tokenizer state to Out: the error message (if any),
// then one line per token, sentinels included.
//
// Each line starts with the token's type character. Identifier tokens are
// followed by their name one character at a time: printable ASCII as-is,
// anything else as a four-digit hex code, each followed by a space.
// RestOfLine and AssetName tokens are followed by their text in brackets.
void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const
{
	if (!Error.IsEmpty())
	{
		Out.Appendf(TEXT("Error: %s\n"), *Error);
	}

	for (const Token& Tok : Tokens)
	{
		Out.Appendf(TEXT("Token '%c': "), Tok.Type);

		if (Tok.Type == Identifier)
		{
			const FString NameText = Tok.Name.ToString();
			for (TCHAR Ch : NameText)
			{
				const bool bPrintableAscii = (Ch >= 0x20) && (Ch <= 0x7E);
				if (!bPrintableAscii)
				{
					Out.Appendf(TEXT("%04X "), (int32)Ch);
					continue;
				}
				Out.AppendChar(Ch);
				Out.AppendChar(' ');
			}
		}

		const bool bHasText = (Tok.Type == RestOfLine) || (Tok.Type == AssetName);
		if (bHasText)
		{
			Out.Appendf(TEXT("[%s]"), *FString(Tok.Rest));
		}

		Out.AppendChar('\n');
	}
}
// Converts an internal name into its external, escaped text form.
//
// Spaces become '.', characters in the Identifier category are copied
// unchanged, and every other character is written as an ampersand escape:
// "&name;" when WingEntityList::GetName knows a name for it, otherwise
// "&#N;" with N the decimal character code.
FString WingTokenizer::ExternalizeID(FName InternalID)
{
	TStringBuilder<512> Escaped;

	TCHAR Buffer[FName::StringBufferSize];
	int32 Len = InternalID.ToString(Buffer);

	for (TCHAR Ch : FStringView(Buffer, Len))
	{
		if (Ch == ' ')
		{
			Escaped.AppendChar('.');
			continue;
		}
		if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier)
		{
			Escaped.AppendChar(Ch);
			continue;
		}

		// Everything else needs an escape sequence.
		Escaped.AppendChar('&');
		FStringView EntityName = WingEntityList::GetName(Ch);
		if (!EntityName.IsEmpty())
		{
			Escaped.Append(EntityName);
		}
		else
		{
			Escaped.AppendChar('#');
			Escaped.Appendf(TEXT("%d"), (int32)Ch);
		}
		Escaped.AppendChar(';');
	}
	return Escaped.ToString();
}
// Reports whether InternalID consists only of spaces and characters in the
// Identifier category, i.e. whether ExternalizeID would need no escape
// sequences for it. Returns false for a None name.
bool WingTokenizer::WouldExternalizeReadably(FName InternalID)
{
	if (InternalID.IsNone()) return false;

	TCHAR Buffer[FName::StringBufferSize];
	int32 Len = InternalID.ToString(Buffer);

	for (int32 Index = 0; Index < Len; ++Index)
	{
		const TCHAR Ch = Buffer[Index];
		const bool bPlain = (Ch == ' ') || (WingCharacterClasses::GetCat(Ch) == Cat::Identifier);
		if (!bPlain) return false;
	}
	return true;
}
// Parses an externally written identifier into an internal FName.
//
// ExternalID - the text to parse; the whole string must form one identifier.
// Error      - cleared on entry; receives a message on failure.
// Returns the parsed name, or FName() with Error set when the tokenizer
// reports an error, when input is left over after the identifier, or when
// the resulting name is None.
FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
{
	FStringView Remaining(ExternalID);
	Error.Empty();
	FName Parsed = TokenizeIdentifier(Remaining, Error);

	// A tokenizer error is passed on, annotated with the offending id.
	if (!Error.IsEmpty())
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
		return FName();
	}

	// If the identifier tokenizer stopped before consuming the whole input,
	// report why, as informatively as possible, based on the first
	// unconsumed character.
	if (!Remaining.IsEmpty())
	{
		const TCHAR Offender = Remaining[0];
		const Cat Category = WingCharacterClasses::GetCat(Offender);
		if (Offender == ' ')
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, spaces must be escaped"), *ExternalID);
		}
		else if (Category == Cat::Punctuation)
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, these marks must be escaped: %s"),
				*ExternalID, WingCharacterClasses::PunctuationString);
		}
		else if (Category == Cat::Control)
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
		}
		else
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
		}
		return FName();
	}

	// One last error case: empty input, which parses to a None name.
	if (Parsed.IsNone())
	{
		Error = TEXT("ERROR: Empty identifiers are not allowed");
		return FName();
	}
	return Parsed;
}
// Returns a copy of ID keeping only the characters whose category is
// Identifier; every other character is dropped.
FString WingTokenizer::SimplifyID(const FString &ID)
{
	TStringBuilder<512> Kept;
	const int32 Count = ID.Len();
	for (int32 Index = 0; Index < Count; ++Index)
	{
		const TCHAR Ch = ID[Index];
		if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) continue;
		Kept.AppendChar(Ch);
	}
	return Kept.ToString();
}