// Source listing metadata: 390 lines, 12 KiB, C++
#include "WingTokenizer.h"
|
|
|
|
|
|
// Marks every character of String as belonging to Category. The lookup
// table is grown first if any character lies beyond its current end;
// slots created by that growth start out as Cat::Other.
void WingCharacterClasses::Assign(Cat Category, FStringView String)
{
	// One past the largest character value that appears in String.
	int32 Needed = 0;
	for (TCHAR Ch : String)
	{
		if (Ch >= Needed) Needed = Ch + 1;
	}

	const int32 Existing = CharCategory.Num();
	if (Needed > Existing)
	{
		CharCategory.SetNum(Needed);
		for (int Slot = Existing; Slot < Needed; ++Slot)
		{
			CharCategory[Slot] = Cat::Other;
		}
	}

	// The table is now guaranteed to cover every character we index with.
	for (TCHAR Ch : String)
	{
		CharCategory[Ch] = Category;
	}
}
|
|
|
|
// Builds the character classification table in three passes: a broad
// whitelist of identifier characters, then the punctuation marks, then
// the control characters. Later passes overwrite earlier ones.
WingCharacterClasses::WingCharacterClasses()
{
	// Printable, visible, non-whitespace characters found in most default
	// Ubuntu fonts. All of them start out as 'Identifier' characters; some
	// are switched over to punctuation by the next Assign call.
	Assign(Cat::Identifier, TEXT(
		"!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefgh"
		"ijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐ"
		"ÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖ"
		"ėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜ"
		"ŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢ"
		"ƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƾƿǀǁǂǃǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǦǧǨǩǪǫǬǭǮǯǰǴǵǸǹǼ"
		"ǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȷʒʼˆˇˉ˘˙˚˛˜˝΄΅ΆΈΉΊ"
		"ΌΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώЀЁЂЃЄ"
		"ЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъ"
		"ыьэюяѐёђѓєѕіїјљњћќѝўџѲҐґҒғҔҕҖҗҘҙҚқҢңҤҥҪҫҬҭҮүҰұҲҳҺһӀӁӂӃӄӇӈӋӌӏӐӑӒӓӔӕӖӗӘә"
		"ӚӛӜӝӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹẀẁẂẃẄẅỲỳ–—―‘’‚“”„†‡•…‰‹›⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆"
		"₇₈₉€₹№™Ω⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞∂∆∏∑−∕∙√∞∫≈≠≤≥◊fifl\"\\"
	));

	// Every punctuation mark that we do anything interesting with.
	Assign(Cat::Punctuation, PunctuationString);

	// Control characters: everything below 0x20, plus 0x7F (DEL). The
	// Assign calls above already grew the table past these indices.
	for (int Code = 0; Code < 32; ++Code)
	{
		CharCategory[Code] = Cat::Control;
	}
	CharCategory[0x7F] = Cat::Control;
}
|
|
|
|
// Out-of-class definition of the static TheSet member. It is
// default-constructed, so the WingCharacterClasses constructor fills in
// its classification table.
WingCharacterClasses WingCharacterClasses::TheSet;
|
|
|
|
// Appends a token that carries an FName payload. The token's Source is
// the slice of text that begins where Before begins and ends where
// After begins.
void WingTokenizer::Add(TCHAR Type, FName Name, FStringView Before, FStringView After)
{
	const TCHAR* const Begin = Before.GetData();

	Token NewToken;
	NewToken.Type = Type;
	NewToken.Name = Name;
	NewToken.Source = FStringView(Begin, After.GetData() - Begin);
	Tokens.Add(NewToken);
}
|
|
|
|
// Appends a token that carries a string-view payload (stored in the
// token's Rest field). The token's Source is the slice of text that
// begins where Before begins and ends where After begins.
void WingTokenizer::Add(TCHAR Type, FStringView Rest, FStringView Before, FStringView After)
{
	const TCHAR* const Begin = Before.GetData();

	Token NewToken;
	NewToken.Type = Type;
	NewToken.Rest = Rest;
	NewToken.Source = FStringView(Begin, After.GetData() - Begin);
	Tokens.Add(NewToken);
}
|
|
|
|
// Parses Digits as a hexadecimal number (upper- or lower-case digits)
// and returns it as a TCHAR.
//
// On failure a message is written to Error and 0 is returned. Failure
// cases: Digits is empty, contains a non-hex character, or encodes a
// value larger than 0xFFFF.
TCHAR WingTokenizer::FromHex(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1) { Error = TEXT("Empty hex escape sequence"); return 0; }
	int32 Value = 0;
	for (TCHAR Ch : Digits)
	{
		if (Ch >= '0' && Ch <= '9') Value = Value * 16 + (Ch - '0');
		else if (Ch >= 'a' && Ch <= 'f') Value = Value * 16 + (Ch - 'a' + 10);
		else if (Ch >= 'A' && Ch <= 'F') Value = Value * 16 + (Ch - 'A' + 10);
		else { Error = TEXT("Invalid hex digit in escape sequence"); return 0; }
		// Checked after every digit, so Value can never grow far enough
		// to overflow the int32 accumulator.
		if (Value > 0xFFFF) { Error = TEXT("Escape sequence value out of range"); return 0; }
	}
	return static_cast<TCHAR>(Value);
}
|
|
|
|
// Parses Digits as a decimal number and returns it as a TCHAR.
//
// On failure a message is written to Error and 0 is returned. Failure
// cases: Digits is empty, contains a character other than '0'-'9', or
// encodes a value larger than 0xFFFF.
TCHAR WingTokenizer::FromDecimal(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1) { Error = TEXT("Empty decimal escape sequence"); return 0; }
	int32 Value = 0;
	for (TCHAR Ch : Digits)
	{
		if (Ch >= '0' && Ch <= '9') Value = Value * 10 + (Ch - '0');
		else { Error = TEXT("Invalid decimal digit in escape sequence"); return 0; }
		// Checked after every digit, so Value can never grow far enough
		// to overflow the int32 accumulator.
		if (Value > 0xFFFF) { Error = TEXT("Escape sequence value out of range"); return 0; }
	}
	return static_cast<TCHAR>(Value);
}
|
|
|
|
// Decodes one ampersand escape sequence at the front of Rest and returns
// the character it stands for. Rest[0] is taken to be the '&' itself
// (it is never inspected here). Three spellings are understood:
//
//   &#xHHHH;   hexadecimal character code ('x' or 'X')
//   &#DDDD;    decimal character code
//   &name;     named entity, looked up through WingEntityList::GetChar
//
// Does nothing and returns 0 if Error is already set. On failure, Error
// is filled in and 0 is returned. Rest is advanced past the terminating
// semicolon on success and also when the numeric digits fail to parse;
// it is left untouched when the semicolon is missing, the sequence is
// too short, or the entity name is unknown.
TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return 0;

	// Search for the semicolon.
	int32 SemiPos;
	if (!Rest.FindChar(';', SemiPos))
	{
		Error = TEXT("Ampersand escape sequence doesn't end in semicolon");
		return 0;
	}

	// Require at least two characters between '&' and ';'. This also
	// guarantees that Rest[1] and Rest[2] below are in range.
	if (SemiPos < 3)
	{
		Error = TEXT("Ampersand escape sequence too short");
		return 0;
	}

	TCHAR Result = 0;
	if (Rest[1] == '#')
	{
		// Numeric form: skip "&#x" (3 chars) or "&#" (2 chars).
		if ((Rest[2] == 'x') || (Rest[2] == 'X'))
			Result = FromHex(Rest.Mid(3, SemiPos - 3), Error);
		else
			Result = FromDecimal(Rest.Mid(2, SemiPos - 2), Error);
	}
	else
	{
		// Named form: everything between '&' and ';' is the entity name.
		FString Name(Rest.Mid(1, SemiPos - 1));
		Result = WingEntityList::GetChar(Name);
		if (Result == 0)
		{
			Error = FString::Printf(TEXT("Unknown HTML entity: &%s;"), *Name);
			return 0;
		}
	}

	Rest = Rest.RightChop(SemiPos + 1);
	return Result;
}
|
|
|
|
// Splits off everything in Rest up to (but not including) the first
// comma, or all of Rest when it has no comma. The split-off part is
// returned and Rest is advanced to the comma / end of input.
// Does nothing and returns an empty view if Error is already set.
FStringView WingTokenizer::TokenizeAssetName(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return FStringView();

	int32 CommaAt;
	if (!Rest.FindChar(',', CommaAt))
	{
		CommaAt = Rest.Len();
	}

	const FStringView Name = Rest.SubStr(0, CommaAt);
	Rest = Rest.RightChop(CommaAt);
	return Name;
}
|
|
|
|
// Reads one identifier from the front of Rest and returns it as an FName,
// advancing Rest past the characters consumed.
//
// Translation rules applied while reading:
//   - '.' becomes a space in the resulting name.
//   - '&' starts an escape sequence, decoded by TokenizeEscapeSequence.
//   - Characters classified as Identifier or Other are copied verbatim.
//   - A literal space, or a character of any other category, ends the
//     identifier without being consumed.
//
// Does nothing and returns FName() if Error is already set. On failure
// (bad escape sequence, or name too long) Error is set and FName() is
// returned.
//
// Empty names are deliberately NOT rejected here: this function lacks
// the context to produce a good error message, so that check is left
// to the caller.
FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return FName();

	// Once the buffer is full, further characters are still consumed but
	// no longer stored; the overflow is reported after the loop.
	TCHAR Buffer[NAME_SIZE];
	int Len = 0;
	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		TCHAR Ch = Rest[0];
		if (Ch == ' ') break;
		if (Ch == '.')
		{
			if (Len < NAME_SIZE) Buffer[Len++] = ' ';
			Rest = Rest.RightChop(1);
			continue;
		}
		if (Ch == '&')
		{
			// TokenizeEscapeSequence advances Rest itself. If it fails,
			// Error is set and the loop condition ends the scan.
			TCHAR Decoded = TokenizeEscapeSequence(Rest, Error);
			if (Len < NAME_SIZE) Buffer[Len++] = Decoded;
			continue;
		}
		Cat Category = WingCharacterClasses::GetCat(Ch);
		if ((Category == Cat::Identifier) || (Category == Cat::Other))
		{
			// We accept other characters in case the LLM sends unicode
			// that isn't on the whitelist. This is intentional.
			if (Len < NAME_SIZE) Buffer[Len++] = Ch;
			Rest = Rest.RightChop(1);
		}
		else break;
	}
	if (!Error.IsEmpty()) return FName();

	// The buffer has just enough room to hold the longest FName,
	// plus the required null terminator. If we filled the whole
	// buffer, leaving no room for the null, it means the name
	// is too long.
	if (Len == NAME_SIZE)
	{
		Error = TEXT("FName too long");
		return FName();
	}
	Buffer[Len] = 0;
	return FName(Len, Buffer);
}
|
|
|
|
// Tokenizes In. The text is copied into the Input member and all token
// views point into that copy.
//
// Dispatch on the first character of each token:
//   ' ' / '\t'   skipped
//   '='          everything after it becomes one RestOfLine token, and
//                tokenizing stops
//   '/'          asset name, running up to the next comma
//   '.' / '&'    identifier (these two can only start an identifier)
//   punctuation  single-character token whose Type is the character
//   control      error; tokenizing stops
//   otherwise    identifier
//
// If any step sets Error, all tokens produced so far are discarded.
// Two zero-typed sentinel tokens are always appended at the end, and
// Next is pointed at the first token.
WingTokenizer::WingTokenizer(const FString& In)
{
	Input = In;
	FStringView Rest(Input);
	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		TCHAR Ch = Rest[0];
		if ((Ch == ' ') || (Ch == '\t'))
		{
			Rest = Rest.RightChop(1);
			continue;
		}

		// Remember where this token starts so Add can compute its Source.
		FStringView Before = Rest;
		if (Ch == '=')
		{
			FStringView Body = Rest.RightChop(1);
			Rest = Rest.RightChop(Rest.Len());
			Add(RestOfLine, Body, Before, Rest);
			break;
		}
		if (Ch == '/')
		{
			FStringView Asset = TokenizeAssetName(Rest, Error);
			Add(AssetName, Asset, Before, Rest);
			continue;
		}
		if ((Ch == '.') || (Ch == '&'))
		{
			FName Id = TokenizeIdentifier(Rest, Error);
			Add(Identifier, Id, Before, Rest);
			continue;
		}
		Cat Category = WingCharacterClasses::GetCat(Ch);
		if (Category == Cat::Punctuation)
		{
			Rest = Rest.RightChop(1);
			Add(Ch, FString(), Before, Rest);
			continue;
		}
		if (Category == Cat::Control)
		{
			Error = TEXT("Control characters in input, not allowed");
			break;
		}
		FName Id = TokenizeIdentifier(Rest, Error);
		Add(Identifier, Id, Before, Rest);
		continue;
	}
	if (!Error.IsEmpty()) Tokens.Empty();

	// Two sentinels means we can safely do lookahead 2 without risk.
	// Their Source is an empty view at the position where scanning stopped.
	Rest = Rest.LeftChop(Rest.Len());
	Add(0, FName(), Rest, Rest);
	Add(0, FName(), Rest, Rest);
	Next = Tokens.GetData();
}
|
|
|
|
// Records the current token position (the index of Next within Tokens)
// under Name. An existing entry with the same name is overwritten;
// otherwise a new entry is appended.
void WingTokenizer::SaveCursor(FName Name)
{
	const int Position = Next - Tokens.GetData();
	for (auto &Entry : SavedCursor)
	{
		if (Entry.Key == Name)
		{
			Entry.Value = Position;
			return;
		}
	}
	SavedCursor.Emplace(Name, Position);
}
|
|
|
|
// Returns the source text spanned by the tokens from the cursor saved
// under SavePos (token 0 if no such cursor exists) up to the current
// position plus Extra tokens. The upper bound is clamped to lie between
// the start index and the token count; an empty range yields an empty
// string.
FString WingTokenizer::GetRange(FName SavePos, int Extra) const
{
	int First = 0;
	for (auto &Entry : SavedCursor)
	{
		if (Entry.Key == SavePos) First = Entry.Value;
	}

	int Limit = (Next - Tokens.GetData()) + Extra;
	Limit = FMath::Clamp(Limit, First, Tokens.Num());
	if (First >= Limit) return FString();

	// Text runs from the start of the first token to the end of the last.
	const Token& Head = Tokens[First];
	const Token& Tail = Tokens[Limit - 1];
	const TCHAR* Start = Head.Source.GetData();
	const TCHAR* End = Tail.Source.GetData() + Tail.Source.Len();
	return FString(End - Start, Start);
}
|
|
|
|
// Writes a dump of the tokenizer state to Out: the error message (if
// any), then one line per token. Identifier tokens list their name
// character by character, with anything outside 0x20..0x7E shown as a
// four-digit hex code. RestOfLine and AssetName tokens show their text
// in square brackets.
void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const
{
	if (!Error.IsEmpty())
	{
		Out.Appendf(TEXT("Error: %s\n"), *Error);
	}

	for (const Token& Tok : Tokens)
	{
		Out.Appendf(TEXT("Token '%c': "), Tok.Type);

		if (Tok.Type == Identifier)
		{
			const FString NameText = Tok.Name.ToString();
			for (TCHAR Ch : NameText)
			{
				const bool bPrintableAscii = (Ch >= 0x20 && Ch <= 0x7E);
				if (!bPrintableAscii)
				{
					Out.Appendf(TEXT("%04X "), (int32)Ch);
					continue;
				}
				Out.AppendChar(Ch);
				Out.AppendChar(' ');
			}
		}

		if ((Tok.Type == RestOfLine) || (Tok.Type == AssetName))
		{
			Out.Appendf(TEXT("[%s]"), *FString(Tok.Rest));
		}

		Out.AppendChar('\n');
	}
}
|
|
|
|
// Converts an FName into its external textual spelling:
//   - a space is written as '.'
//   - a character classified as Identifier is written as-is
//   - anything else is written as an ampersand escape: "&name;" when
//     WingEntityList::GetName knows a name for it, "&#<decimal>;" if not.
FString WingTokenizer::ExternalizeID(FName InternalID)
{
	TStringBuilder<512> Result;
	TCHAR Buffer[FName::StringBufferSize];
	int32 Len = InternalID.ToString(Buffer);

	for (TCHAR Ch : FStringView(Buffer, Len))
	{
		if (Ch == ' ')
		{
			Result.AppendChar('.');
			continue;
		}
		if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier)
		{
			Result.AppendChar(Ch);
			continue;
		}

		// Needs escaping: prefer a named entity, fall back to a number.
		Result.AppendChar('&');
		FStringView EntityName = WingEntityList::GetName(Ch);
		if (!EntityName.IsEmpty())
		{
			Result.Append(EntityName);
		}
		else
		{
			Result.AppendChar('#');
			Result.Appendf(TEXT("%d"), (int32)Ch);
		}
		Result.AppendChar(';');
	}
	return Result.ToString();
}
|
|
|
|
// Returns true when InternalID is not None and consists solely of
// spaces and characters classified as Identifier -- that is, when no
// character of the name would need an ampersand escape.
bool WingTokenizer::WouldExternalizeReadably(FName InternalID)
{
	if (InternalID.IsNone()) return false;

	TCHAR Buffer[FName::StringBufferSize];
	int32 Len = InternalID.ToString(Buffer);

	for (TCHAR Ch : FStringView(Buffer, Len))
	{
		const bool bPlain = (Ch == ' ') || (WingCharacterClasses::GetCat(Ch) == Cat::Identifier);
		if (!bPlain) return false;
	}
	return true;
}
|
|
|
|
// Parses ExternalID as a single identifier and returns the resulting
// FName. Error is cleared on entry. The whole string must be consumed by
// the identifier; on any failure Error receives a descriptive message
// and FName() is returned.
FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
{
	FStringView Unparsed(ExternalID);
	Error.Empty();
	FName InternalID = TokenizeIdentifier(Unparsed, Error);

	// The identifier tokenizer reported a problem: add context to it.
	if (!Error.IsEmpty())
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
		return FName();
	}

	// Leftover input means the tokenizer stopped at a character it could
	// not accept. Pick the most informative message for that character.
	if (!Unparsed.IsEmpty())
	{
		const TCHAR Offender = Unparsed[0];
		Cat Category = WingCharacterClasses::GetCat(Offender);
		if (Offender == ' ')
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, spaces must be escaped"), *ExternalID);
		}
		else if (Category == Cat::Punctuation)
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, these marks must be escaped: %s"),
				*ExternalID, WingCharacterClasses::PunctuationString);
		}
		else if (Category == Cat::Control)
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
		}
		else
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
		}
		return FName();
	}

	// Final failure mode: nothing was parsed at all.
	if (InternalID.IsNone())
	{
		Error = TEXT("ERROR: Empty identifiers are not allowed");
		return FName();
	}

	return InternalID;
}
|
|
|
|
|
|
// Returns a copy of ID containing only the characters classified as
// Identifier; every other character is dropped.
FString WingTokenizer::SimplifyID(const FString &ID)
{
	TStringBuilder<512> Kept;
	for (TCHAR Ch : ID)
	{
		if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) continue;
		Kept.AppendChar(Ch);
	}
	return Kept.ToString();
}
|