Tokenizer is now done, we also have the new InternalizeID and ExternalizeID
This commit is contained in:
32
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h
Normal file
32
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "CoreMinimal.h"
|
||||||
|
#include "WingServer.h"
|
||||||
|
#include "WingHandler.h"
|
||||||
|
#include "WingTokenizer.h"
|
||||||
|
#include "Test_Sanitizer.generated.h"
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
UCLASS()
|
||||||
|
class UWing_Test_Sanitizer : public UObject, public IWingHandler
|
||||||
|
{
|
||||||
|
GENERATED_BODY()
|
||||||
|
|
||||||
|
public:
|
||||||
|
UPROPERTY(meta=(Description="The string to sanitize"))
|
||||||
|
FString Input;
|
||||||
|
|
||||||
|
virtual FString GetDescription() const override
|
||||||
|
{
|
||||||
|
return TEXT("Test the sanitizer by sanitizing a string and printing the result.");
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void Handle() override
|
||||||
|
{
|
||||||
|
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input));
|
||||||
|
}
|
||||||
|
};
|
||||||
33
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Tokenizer.h
Normal file
33
Plugins/UEWingman/Source/UEWingman/Handlers/Test_Tokenizer.h
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "CoreMinimal.h"
|
||||||
|
#include "WingServer.h"
|
||||||
|
#include "WingHandler.h"
|
||||||
|
#include "WingTokenizer.h"
|
||||||
|
#include "Test_Tokenizer.generated.h"
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
UCLASS()
|
||||||
|
class UWing_Test_Tokenizer : public UObject, public IWingHandler
|
||||||
|
{
|
||||||
|
GENERATED_BODY()
|
||||||
|
|
||||||
|
public:
|
||||||
|
UPROPERTY(meta=(Description="The string to tokenize"))
|
||||||
|
FString Input;
|
||||||
|
|
||||||
|
virtual FString GetDescription() const override
|
||||||
|
{
|
||||||
|
return TEXT("Test the tokenizer by tokenizing a string and printing the result.");
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void Handle() override
|
||||||
|
{
|
||||||
|
WingTokenizer T(Input);
|
||||||
|
T.PrintEverything();
|
||||||
|
}
|
||||||
|
};
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "CoreMinimal.h"
|
||||||
|
#include "WingServer.h"
|
||||||
|
#include "WingHandler.h"
|
||||||
|
#include "WingTokenizer.h"
|
||||||
|
#include "Test_Unsanitize.generated.h"
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
UCLASS()
|
||||||
|
class UWing_Test_Unsanitize : public UObject, public IWingHandler
|
||||||
|
{
|
||||||
|
GENERATED_BODY()
|
||||||
|
|
||||||
|
public:
|
||||||
|
UPROPERTY(meta=(Description="The sanitized identifier to unsanitize"))
|
||||||
|
FString Input;
|
||||||
|
|
||||||
|
virtual FString GetDescription() const override
|
||||||
|
{
|
||||||
|
return TEXT("Test the unsanitizer by unsanitizing a string and printing the result.");
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void Handle() override
|
||||||
|
{
|
||||||
|
FString Error;
|
||||||
|
FString Result = WingTokenizer::TryInternalizeID(Input, Error);
|
||||||
|
if (!Error.IsEmpty())
|
||||||
|
{
|
||||||
|
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
||||||
|
}
|
||||||
|
if (!Result.IsEmpty())
|
||||||
|
{
|
||||||
|
UWingServer::Printf(TEXT("Result: %s\n"), *Result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
1917
Plugins/UEWingman/Source/UEWingman/Private/WingEntities.cpp
Normal file
1917
Plugins/UEWingman/Source/UEWingman/Private/WingEntities.cpp
Normal file
File diff suppressed because it is too large
Load Diff
302
Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp
Normal file
302
Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp
Normal file
@@ -0,0 +1,302 @@
|
|||||||
|
#include "WingTokenizer.h"
|
||||||
|
#include "WingServer.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Record Category for every character in String, growing the lookup
// table first if any of those characters lies beyond its current end.
void WingCharacterClasses::Assign(Cat Category, FStringView String)
{
	// Work out how large the table must be to index the largest character.
	int32 RequiredNum = 0;
	for (const TCHAR Ch : String)
	{
		if (Ch >= RequiredNum)
		{
			RequiredNum = Ch + 1;
		}
	}

	// Grow when needed. Slots created by the growth start out as 'Other'.
	const int32 PreviousNum = CharCategory.Num();
	if (PreviousNum < RequiredNum)
	{
		CharCategory.SetNum(RequiredNum);
		for (int32 Index = PreviousNum; Index < RequiredNum; ++Index)
		{
			CharCategory[Index] = Cat::Other;
		}
	}

	// The table is now large enough for every character in String.
	for (const TCHAR Ch : String)
	{
		CharCategory[Ch] = Category;
	}
}
|
||||||
|
|
||||||
|
// Build the character category table in three passes: whitelist, then
// punctuation, then control characters. Later passes overwrite earlier ones.
WingCharacterClasses::WingCharacterClasses()
{
	// Pass 1: the whitelist. These are the printable, visible,
	// non-whitespace characters that appear in most ubuntu default fonts.
	// Everything here starts out as an 'Identifier' character; some entries
	// are reclassified as punctuation by the next pass.
	Assign(Cat::Identifier, TEXT(
		"!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefgh"
		"ijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐ"
		"ÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖ"
		"ėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜ"
		"ŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢ"
		"ƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƾƿǀǁǂǃǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǦǧǨǩǪǫǬǭǮǯǰǴǵǸǹǼ"
		"ǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȷʒʼˆˇˉ˘˙˚˛˜˝΄΅ΆΈΉΊ"
		"ΌΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώЀЁЂЃЄ"
		"ЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъ"
		"ыьэюяѐёђѓєѕіїјљњћќѝўџѲҐґҒғҔҕҖҗҘҙҚқҢңҤҥҪҫҬҭҮүҰұҲҳҺһӀӁӂӃӄӇӈӋӌӏӐӑӒӓӔӕӖӗӘә"
		"ӚӛӜӝӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹẀẁẂẃẄẅỲỳ–—―‘’‚“”„†‡•…‰‹›⁰⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆"
		"₇₈₉€₹№™Ω⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞∂∆∏∑−∕∙√∞∫≈≠≤≥◊fifl\"\\"
	));

	// Pass 2: every punctuation mark that we do anything interesting with.
	Assign(Cat::Punctuation, PunctuationString);

	// Pass 3: control characters, i.e. codes 0 through 31 plus DEL (0x7F).
	for (int32 Code = 0; Code < 32; ++Code)
	{
		CharCategory[Code] = Cat::Control;
	}
	CharCategory[0x7F] = Cat::Control;
}
|
||||||
|
|
||||||
|
WingCharacterClasses WingCharacterClasses::TheSet;
|
||||||
|
|
||||||
|
// Append a token with the given type and decoded text to the Tokens array.
//
// InternalID is taken by value so that callers can hand over a temporary;
// it is moved into the token, and the token is in turn moved into the
// array, so the string's storage is never copied along the way.
void WingTokenizer::Add(TCHAR Type, FString InternalID)
{
	Token NewToken;
	NewToken.Type = Type;
	NewToken.InternalID = MoveTemp(InternalID);
	Tokens.Add(MoveTemp(NewToken));
}
|
||||||
|
|
||||||
|
// Interpret Digits as a hexadecimal number and return it as a TCHAR.
// Upper- and lower-case digits are both accepted. On failure (no digits,
// a non-hex character, or a value above 0xFFFF) Error is set and zero is
// returned.
TCHAR WingTokenizer::FromHex(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1)
	{
		Error = "Empty hex escape sequence";
		return 0;
	}

	int32 Accumulated = 0;
	for (int32 Index = 0; Index < Digits.Len(); ++Index)
	{
		const TCHAR Digit = Digits[Index];

		// Translate one digit into its numeric value, 0 through 15.
		int32 Nibble = 0;
		if (Digit >= '0' && Digit <= '9')
		{
			Nibble = Digit - '0';
		}
		else if (Digit >= 'a' && Digit <= 'f')
		{
			Nibble = Digit - 'a' + 10;
		}
		else if (Digit >= 'A' && Digit <= 'F')
		{
			Nibble = Digit - 'A' + 10;
		}
		else
		{
			Error = "Invalid hex digit in escape sequence";
			return 0;
		}

		// Checking the range after every digit keeps the accumulator small
		// no matter how many digits follow.
		Accumulated = Accumulated * 16 + Nibble;
		if (Accumulated > 0xFFFF)
		{
			Error = "Escape sequence value out of range";
			return 0;
		}
	}
	return (TCHAR)Accumulated;
}
|
||||||
|
|
||||||
|
// Interpret Digits as a decimal number and return it as a TCHAR.
// On failure (no digits, a non-digit character, or a value above 0xFFFF)
// Error is set and zero is returned.
TCHAR WingTokenizer::FromDecimal(FStringView Digits, FString &Error)
{
	if (Digits.Len() < 1)
	{
		Error = "Empty decimal escape sequence";
		return 0;
	}

	int32 Accumulated = 0;
	for (int32 Index = 0; Index < Digits.Len(); ++Index)
	{
		const TCHAR Digit = Digits[Index];
		const bool bIsDigit = (Digit >= '0') && (Digit <= '9');
		if (!bIsDigit)
		{
			Error = "Invalid decimal digit in escape sequence";
			return 0;
		}

		// Checking the range after every digit keeps the accumulator small
		// no matter how many digits follow.
		Accumulated = Accumulated * 10 + (Digit - '0');
		if (Accumulated > 0xFFFF)
		{
			Error = "Escape sequence value out of range";
			return 0;
		}
	}
	return (TCHAR)Accumulated;
}
|
||||||
|
|
||||||
|
// Decode one ampersand escape at the front of Rest.
//
// Callers invoke this with Rest positioned on the '&'. Three spellings are
// recognised:
//     &#NNN;              decimal character code
//     &#xHHH; / &#XHHH;   hexadecimal character code
//     &name;              named entity, looked up in WingEntityList
//
// On success the whole escape (through the semicolon) is removed from Rest
// and the decoded character is returned. On failure Error is set, Rest is
// left untouched, and zero is returned. If Error is already set on entry,
// the function does nothing and returns zero.
TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty()) return 0;

	// The escape extends up to the first semicolon.
	int32 SemiPos;
	if (!Rest.FindChar(';', SemiPos))
	{
		Error = TEXT("Ampersand escape sequence doesn't end in semicolon");
		return 0;
	}

	// The semicolon must sit at index 3 or later, which leaves room for
	// '&' plus at least two more characters (as in "&lt;" or "&#9;").
	if (SemiPos < 3)
	{
		Error = TEXT("Ampersand escape sequence too short");
		return 0;
	}

	TCHAR Result = 0;
	if (Rest[1] == '#')
	{
		// Numeric form. The digits start after "&#x" (hex) or "&#" (decimal).
		const bool bIsHex = (Rest[2] == 'x') || (Rest[2] == 'X');
		if (bIsHex)
			Result = FromHex(Rest.Mid(3, SemiPos - 3), Error);
		else
			Result = FromDecimal(Rest.Mid(2, SemiPos - 2), Error);

		// A malformed number fails the same way an unknown entity does:
		// report the error without consuming any input.
		if (!Error.IsEmpty()) return 0;

		// Zero is this function's failure value, and accepting it would
		// put a NUL inside the decoded identifier, so "&#0;" is rejected.
		if (Result == 0)
		{
			Error = TEXT("Escape sequence must not encode the NUL character");
			return 0;
		}
	}
	else
	{
		// Named form. The name is everything between '&' and ';'.
		FString Name(Rest.Mid(1, SemiPos - 1));
		Result = WingEntityList::GetChar(Name);
		if (Result == 0)
		{
			Error = FString::Printf(TEXT("Unknown HTML entity: &%s;"), *Name);
			return 0;
		}
	}

	// Success: consume the escape, including its semicolon.
	Rest = Rest.RightChop(SemiPos + 1);
	return Result;
}
|
||||||
|
|
||||||
|
// Consume one identifier from the front of Rest and return it in decoded
// (internal) form: periods become spaces and ampersand escapes become the
// characters they name. Scanning stops at a space or at any character
// that is neither 'Identifier' nor 'Other'. If Error is set, on entry or
// while decoding, an empty string is returned.
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
	if (!Error.IsEmpty())
	{
		return FString();
	}

	TStringBuilder<512> Decoded;
	bool bReachedEnd = false;
	while (!bReachedEnd && !Rest.IsEmpty() && Error.IsEmpty())
	{
		const TCHAR Next = Rest[0];
		if (Next == ' ')
		{
			// An unescaped space terminates the identifier.
			bReachedEnd = true;
		}
		else if (Next == '.')
		{
			// A period is the external spelling of a space.
			Decoded.AppendChar(' ');
			Rest = Rest.RightChop(1);
		}
		else if (Next == '&')
		{
			// The escape decoder advances Rest itself, or sets Error.
			Decoded.AppendChar(TokenizeEscapeSequence(Rest, Error));
		}
		else
		{
			const Cat Category = WingCharacterClasses::GetCat(Next);
			if ((Category != Cat::Identifier) && (Category != Cat::Other))
			{
				bReachedEnd = true;
			}
			else
			{
				// 'Other' characters are accepted on purpose, in case the
				// LLM sends unicode that isn't on the whitelist.
				Decoded.AppendChar(Next);
				Rest = Rest.RightChop(1);
			}
		}
	}

	if (!Error.IsEmpty())
	{
		return FString();
	}

	// An empty identifier is deliberately not reported as an error here,
	// because no good message can be generated at this level. Callers are
	// left to deal with that case.
	return Decoded.ToString();
}
|
||||||
|
|
||||||
|
// Tokenize one line of input.
//
// The line is remembered in the Input member and split into tokens:
//   - spaces and tabs separate tokens and are otherwise ignored;
//   - '=' produces a RestOfLine token holding everything after it, and
//     ends tokenization;
//   - '.' and '&' always begin an identifier (they are how spaces and
//     escapes are written inside identifiers);
//   - any other punctuation mark becomes a token whose type is the mark;
//   - control characters are an error;
//   - everything else begins an identifier.
//
// If anything goes wrong, Error holds the message and Tokens is emptied.
WingTokenizer::WingTokenizer(const FString& Input)
	: Input(Input)   // keep the original line; the parameter hides the member
{
	FStringView Rest(Input);
	while (!Rest.IsEmpty() && Error.IsEmpty())
	{
		TCHAR Ch = Rest[0];

		// Whitespace between tokens.
		if ((Ch == ' ') || (Ch == '\t'))
		{
			Rest = Rest.RightChop(1);
			continue;
		}

		// '=' swallows the remainder of the line verbatim.
		if (Ch == '=')
		{
			Add(RestOfLine, FString(Rest.RightChop(1)));
			break;
		}

		// These two are in the punctuation list but start identifiers, so
		// they have to be checked before the generic punctuation case.
		if ((Ch == '.') || (Ch == '&'))
		{
			Add(Identifier, TokenizeIdentifier(Rest, Error));
			continue;
		}

		Cat Category = WingCharacterClasses::GetCat(Ch);
		if (Category == Cat::Punctuation)
		{
			Add(Ch, FString());
			Rest = Rest.RightChop(1);
			continue;
		}
		if (Category == Cat::Control)
		{
			Error = TEXT("Control characters in input, not allowed");
			break;
		}

		// 'Identifier' and 'Other' characters both begin an identifier.
		Add(Identifier, TokenizeIdentifier(Rest, Error));
	}

	// Never expose a partial token list alongside an error.
	if (!Error.IsEmpty()) Tokens.Empty();
}
|
||||||
|
|
||||||
|
void WingTokenizer::PrintEverything() const
|
||||||
|
{
|
||||||
|
if (!Error.IsEmpty())
|
||||||
|
{
|
||||||
|
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
||||||
|
}
|
||||||
|
for (const Token& T : Tokens)
|
||||||
|
{
|
||||||
|
TStringBuilder<512> ExtraStr;
|
||||||
|
for (TCHAR Ch : T.InternalID)
|
||||||
|
{
|
||||||
|
if (Ch >= 0x20 && Ch <= 0x7E)
|
||||||
|
{
|
||||||
|
ExtraStr.AppendChar(Ch);
|
||||||
|
ExtraStr.AppendChar(' ');
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (T.Type >= 0x20 && T.Type <= 0x7E)
|
||||||
|
UWingServer::Printf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr);
|
||||||
|
else
|
||||||
|
UWingServer::Printf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert an internal ID into its external spelling: spaces become
// periods, 'Identifier' characters pass through unchanged, and every
// other character is written as an ampersand escape. The escape uses the
// entity name when WingEntityList has one and the decimal character code
// otherwise.
FString WingTokenizer::ExternalizeID(const FString &S)
{
	TStringBuilder<512> External;
	for (const TCHAR Ch : S)
	{
		if (Ch == ' ')
		{
			External.AppendChar('.');
			continue;
		}
		if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier)
		{
			External.AppendChar(Ch);
			continue;
		}

		// Everything else is escaped as "&name;" or "&#code;".
		const FStringView EntityName = WingEntityList::GetName(Ch);
		External.AppendChar('&');
		if (!EntityName.IsEmpty())
		{
			External.Append(EntityName);
		}
		else
		{
			External.AppendChar('#');
			External.Appendf(TEXT("%d"), (int32)Ch);
		}
		External.AppendChar(';');
	}
	return External.ToString();
}
|
||||||
|
|
||||||
|
// Convert an external ID into an internal ID. The whole of S must form a
// single, non-empty identifier. On success the decoded identifier is
// returned and Error is empty. On failure an empty string is returned and
// Error describes the problem.
FString WingTokenizer::TryInternalizeID(const FString &S, FString &Error)
{
	Error.Empty();

	FStringView Remaining(S);
	FString Decoded = TokenizeIdentifier(Remaining, Error);

	// A decoding error: wrap the message with the id that caused it.
	if (!Error.IsEmpty())
	{
		Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *S, *Error);
		return FString();
	}

	// Leftover input means the identifier scanner stopped early. The
	// character it stopped on selects the most informative message.
	if (!Remaining.IsEmpty())
	{
		const TCHAR Offender = Remaining[0];
		if (Offender == ' ')
		{
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, spaces must be escaped"), *S);
			return FString();
		}

		switch (WingCharacterClasses::GetCat(Offender))
		{
		case Cat::Punctuation:
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, these marks must be escaped: %s"),
				*S, WingCharacterClasses::PunctuationString);
			break;

		case Cat::Control:
			Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *S);
			break;

		default:
			Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *S);
			break;
		}
		return FString();
	}

	// The remaining failure: nothing was decoded at all.
	if (Decoded.IsEmpty())
	{
		Error = TEXT("ERROR: Empty identifiers are not allowed");
		return FString();
	}

	return Decoded;
}
|
||||||
|
|
||||||
|
// Wrapper around TryInternalizeID that reports failures itself: the error
// message is printed, and the manual section on identifier sanitization
// is suggested. Returns whatever TryInternalizeID returned.
FString WingTokenizer::CheckInternalizeID(const FString &S)
{
	FString Error;
	FString Internal = TryInternalizeID(S, Error);

	const bool bFailed = !Error.IsEmpty();
	if (bFailed)
	{
		UWingServer::Printf(TEXT("%s\n"), *Error);
		UWingServer::SuggestManual(WingManual::Section::IdentifierSanitization);
	}
	return Internal;
}
|
||||||
@@ -65,6 +65,9 @@ FString WingUtils::SanitizeName(const FString &InName)
|
|||||||
if (c == ' ') c=L'·';
|
if (c == ' ') c=L'·';
|
||||||
if (c == '<') c=L'◁';
|
if (c == '<') c=L'◁';
|
||||||
if (c == '>') c=L'▷';
|
if (c == '>') c=L'▷';
|
||||||
|
if (c == '(') c=L'❨';
|
||||||
|
if (c == ')') c=L'❩';
|
||||||
|
if (c == '=') c=L'≡';
|
||||||
if (c == ',') c=L'▾';
|
if (c == ',') c=L'▾';
|
||||||
Name[Dst++] = c;
|
Name[Dst++] = c;
|
||||||
}
|
}
|
||||||
@@ -84,6 +87,9 @@ FString WingUtils::UnsanitizeName(const FString &InName)
|
|||||||
if (c == L'·') c=' ';
|
if (c == L'·') c=' ';
|
||||||
if (c == L'◁') c='<';
|
if (c == L'◁') c='<';
|
||||||
if (c == L'▷') c='>';
|
if (c == L'▷') c='>';
|
||||||
|
if (c == L'❨') c='(';
|
||||||
|
if (c == L'❩') c=')';
|
||||||
|
if (c == L'≡') c='=';
|
||||||
if (c == L'▾') c=',';
|
if (c == L'▾') c=',';
|
||||||
Name[Dst++] = c;
|
Name[Dst++] = c;
|
||||||
}
|
}
|
||||||
|
|||||||
169
Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h
Normal file
169
Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "CoreMinimal.h"
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// WingCharacterClasses
|
||||||
|
//
|
||||||
|
// We recognize these disjoint classes of characters:
|
||||||
|
//
|
||||||
|
// Punctuation. A small hardwired list of punctuation marks
|
||||||
|
// that we want to escape, specifically \"'(),.:;<=>&
|
||||||
|
// These particular punctuation marks were chosen because they
|
||||||
|
// either need to be escaped for json's sake, or for our
|
||||||
|
// parser's sake. Any other punctuation is just classified as
|
||||||
|
// an identifier character.
|
||||||
|
//
|
||||||
|
// Identifier characters. A whitelist of about a thousand
|
||||||
|
// ascii and unicode characters that can be used directly in
|
||||||
|
// identifiers without any kind of escaping. To get on the list,
|
||||||
|
// you need the following: to not be one of the punctuation marks above,
|
||||||
|
// to be printable and visible and not whitespace, and to be
|
||||||
|
// easily rendered by all of the default ubuntu fonts.
|
||||||
|
//
|
||||||
|
// Control Characters. Ascii control characters, including DEL.
|
||||||
|
//
|
||||||
|
// Other Characters. Anything else.
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------
|
||||||
|
|
||||||
|
struct WingCharacterClasses
|
||||||
|
{
|
||||||
|
enum class Cat : uint8
|
||||||
|
{
|
||||||
|
Identifier,
|
||||||
|
Punctuation,
|
||||||
|
Control,
|
||||||
|
Other,
|
||||||
|
};
|
||||||
|
|
||||||
|
static Cat GetCat(TCHAR Ch)
|
||||||
|
{
|
||||||
|
int32 Cp = (int32)Ch;
|
||||||
|
if (Cp < 0 || Cp >= TheSet.CharCategory.Num()) return Cat::Other;
|
||||||
|
return TheSet.CharCategory[Cp];
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr const TCHAR *PunctuationString = TEXT("\\\"'(),.:;<=>&");
|
||||||
|
|
||||||
|
private:
|
||||||
|
TArray<Cat> CharCategory;
|
||||||
|
WingCharacterClasses();
|
||||||
|
void Assign(Cat Category, FStringView String);
|
||||||
|
static WingCharacterClasses TheSet;
|
||||||
|
};
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// The HTML Entity List.
|
||||||
|
//
|
||||||
|
// When escaping identifiers, we use HTML escapes like &lt;
|
||||||
|
// These work well because they have no conflict with the json
|
||||||
|
// parser (MCP protocol is json), they are also easy to deal
|
||||||
|
// with in the tokenizer, and the LLM is already familiar with
|
||||||
|
// that kind of escaping. The names stored in this table do not
|
||||||
|
// include the ampersand or the semicolon.
|
||||||
|
//
|
||||||
|
// This class doesn't handle hex character codes, this is just
|
||||||
|
// a lookup table from character to name and back.
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------
|
||||||
|
|
||||||
|
struct WingEntityList
|
||||||
|
{
|
||||||
|
static TCHAR GetChar(const FString &Name)
|
||||||
|
{
|
||||||
|
TCHAR *Result = TheList.NameToChar.Find(Name);
|
||||||
|
if (Result == nullptr) return 0;
|
||||||
|
return *Result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FStringView GetName(TCHAR Ch)
|
||||||
|
{
|
||||||
|
FString *Result = TheList.CharToName.Find(Ch);
|
||||||
|
if (Result == nullptr) return FStringView();
|
||||||
|
return *Result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
TMap<TCHAR, FString> CharToName;
|
||||||
|
TMap<FString, TCHAR> NameToChar;
|
||||||
|
struct Raw { const char *Name; TCHAR Codepoint; };
|
||||||
|
WingEntityList(std::initializer_list<Raw> Data);
|
||||||
|
static WingEntityList TheList;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct WingTokenizer
|
||||||
|
{
|
||||||
|
using Cat = WingCharacterClasses::Cat;
|
||||||
|
const TCHAR Identifier = 'i';
|
||||||
|
const TCHAR RestOfLine = 'r';
|
||||||
|
|
||||||
|
// A token has a token type which can be Identifier,
|
||||||
|
// RestOfLine, or a single-character punctuation mark.
|
||||||
|
// The InternalID field contains the result of converting
|
||||||
|
// the token from an external ID to an internal ID.
|
||||||
|
struct Token
|
||||||
|
{
|
||||||
|
TCHAR Type;
|
||||||
|
FString InternalID;
|
||||||
|
};
|
||||||
|
|
||||||
|
// The string that we tokenized.
|
||||||
|
FString Input;
|
||||||
|
|
||||||
|
// If the tokenization failed, an error message.
|
||||||
|
FString Error;
|
||||||
|
|
||||||
|
// The result, an array of tokens.
|
||||||
|
TArray<Token> Tokens;
|
||||||
|
|
||||||
|
// Tokenize a line of input. The tokens are stored in
|
||||||
|
// the token array. If there's an error, the error is
|
||||||
|
// stored in the error field, and the token array is
|
||||||
|
// cleared. If the tokens contain identifiers,
|
||||||
|
WingTokenizer(const FString& Input);
|
||||||
|
|
||||||
|
// Convert an internal ID into an external ID.
|
||||||
|
// Spaces are converted to periods. Any other
|
||||||
|
// non-identifier character is HTML escaped.
|
||||||
|
static FString ExternalizeID(const FString &S);
|
||||||
|
|
||||||
|
// Convert an external ID into an internal ID.
|
||||||
|
// Periods are converted back to spaces. HTML escapes
|
||||||
|
// are converted back to raw characters. This could
|
||||||
|
// fail, for example, if the external name contains an
|
||||||
|
// invalid HTML escape. If it does, returns empty
|
||||||
|
// string and sets the error message.
|
||||||
|
static FString TryInternalizeID(const FString &S, FString &Error);
|
||||||
|
|
||||||
|
// Calls TryInternalizeName. If this generates an
|
||||||
|
// error, prints the error message, suggests the manual
|
||||||
|
// entry on identifier sanitization, and returns empty
|
||||||
|
// string.
|
||||||
|
static FString CheckInternalizeID(const FString &S);
|
||||||
|
|
||||||
|
// Print all tokens to the log for debugging.
|
||||||
|
void PrintEverything() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Add a token to the token array.
|
||||||
|
void Add(TCHAR Type, FString InternalID);
|
||||||
|
|
||||||
|
// Convert numbers to TCHAR. If there's an error, set the error
|
||||||
|
// message and return zero.
|
||||||
|
static TCHAR FromHex(FStringView Digits, FString &Error);
|
||||||
|
static TCHAR FromDecimal(FStringView Digits, FString &Error);
|
||||||
|
|
||||||
|
// Tokenize an escape sequence. Attempts to consume a valid escape
|
||||||
|
// sequence from rest, and return the character indicated. On error,
|
||||||
|
// sets the error message and returns zero.
|
||||||
|
static TCHAR TokenizeEscapeSequence(FStringView &Rest, FString &Error);
|
||||||
|
|
||||||
|
// Tokenize an identifier. Attempts to consume a valid identifier
|
||||||
|
// from rest, and return the identifier. On error, sets the error
|
||||||
|
// message and returns empty string.
|
||||||
|
static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
|
||||||
|
};
|
||||||
2233
entities.json
Normal file
2233
entities.json
Normal file
File diff suppressed because it is too large
Load Diff
96
tools/font-glyphs.py
Executable file
96
tools/font-glyphs.py
Executable file
@@ -0,0 +1,96 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Report which Unicode code points have vector outlines in ALL of the given font files.
|
||||||
|
|
||||||
|
Usage: python3 font-glyphs.py font1.ttf font2.ttf ...
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import unicodedata
|
||||||
|
from fontTools.ttLib import TTFont
|
||||||
|
from fontTools.pens.statisticsPen import StatisticsPen
|
||||||
|
|
||||||
|
|
||||||
|
def get_vector_codepoints(path):
    """Return the set of code points that have actual vector outlines in the font.

    A code point counts when the glyph it maps to has a non-zero contour
    count in the TrueType ``glyf`` table, or draws a non-zero area from its
    CFF charstring. A font without a usable cmap yields an empty set and a
    warning on stderr.

    The font file is closed on every exit path, including the early return
    and any exception raised while inspecting glyphs.
    """
    font = TTFont(path)
    try:
        cmap = font.getBestCmap()
        if cmap is None:
            print(f"WARNING: {path} has no cmap table", file=sys.stderr)
            return set()

        glyf = font.get("glyf")  # TrueType outlines
        cff = font.get("CFF ")   # CFF outlines

        # The glyph set does not change per glyph, so fetch it once
        # instead of once per code point.
        glyph_set = font.getGlyphSet() if cff is not None else None

        result = set()
        for codepoint, glyph_name in cmap.items():
            has_outline = False
            if glyf is not None:
                g = glyf.get(glyph_name)
                if g is not None and g.numberOfContours != 0:
                    has_outline = True
            if cff is not None:
                # CFF fonts store outlines in charstrings.
                try:
                    cs = cff.cff.topDictIndex[0].CharStrings[glyph_name]
                    pen = StatisticsPen(glyphset=glyph_set)
                    cs.draw(pen)
                    if pen.area != 0:
                        has_outline = True
                except (KeyError, AttributeError):
                    # No charstring for this glyph: treat it as having no outline.
                    pass
            if has_outline:
                result.add(codepoint)

        return result
    finally:
        font.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print a C++ string literal of the characters common to all given fonts.

    Font paths are read from the command line. With no paths, a usage
    message is printed to stderr and the process exits with status 1.

    All progress and summary lines go to stderr. stdout carries only the
    generated C++, so it can be redirected straight into a source file.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} font1.ttf [font2.ttf ...]", file=sys.stderr)
        sys.exit(1)

    paths = sys.argv[1:]

    # Process each font and intersect.
    common = None
    for path in paths:
        cps = get_vector_codepoints(path)
        print(f"{len(cps):6d} glyphs {path}", file=sys.stderr)
        if common is None:
            common = cps
        else:
            common &= cps

    if len(paths) > 1:
        print(f"{len(common):6d} glyphs common to all {len(paths)} fonts", file=sys.stderr)

    # Build the character list, excluding quote and backslash: both would
    # need escaping inside the emitted string literal.
    chars = []
    for cp in sorted(common):
        if cp == ord('"') or cp == ord('\\'):
            continue
        chars.append(chr(cp))

    # Emit C++ file.
    print("// Auto-generated by tools/font-glyphs.py — do not edit by hand.")
    print(f"// {len(chars)} characters common to all {len(paths)} font(s).")
    print()
    print("const TCHAR *CommonChars = TEXT(")

    # Break into lines of 70 characters for readability.
    for start in range(0, len(chars), 70):
        line = "".join(chars[start:start + 70])
        print(f'\t"{line}"')

    print(");")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
54
tools/gen-entities.py
Normal file
54
tools/gen-entities.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Read entities.json (WHATWG HTML named character references) and generate
|
||||||
|
WingEntities.cpp with a lookup table of { "name", codepoint } rows.
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Only entries whose key ends with ';' (skip legacy semicolon-less forms).
|
||||||
|
- Only entries with exactly one codepoint.
|
||||||
|
- Codepoint must be <= 0xFFFF (Unreal uses 16-bit TCHAR).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json, os
|
||||||
|
|
||||||
|
# Locate the input and output files relative to this script, so the tool
# works no matter which directory it is run from.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(script_dir)
input_path = os.path.join(project_dir, "entities.json")
output_path = os.path.join(project_dir,
    "Plugins", "UEWingman", "Source", "UEWingman", "Private", "WingEntities.cpp")

# Read with an explicit encoding rather than the platform default.
with open(input_path, encoding="utf-8") as f:
    data = json.load(f)

# Collect (name, codepoint) rows, applying the rules from the module docstring.
rows = []
for key, val in sorted(data.items()):
    if not key.endswith(";"):
        continue
    cps = val["codepoints"]
    if len(cps) != 1:
        continue
    cp = cps[0]
    if cp > 0xFFFF:
        continue
    # Strip leading '&' and trailing ';'
    name = key[1:-1]
    rows.append((name, cp))

# The generated header comment contains a non-ASCII dash, so the output is
# written as UTF-8 explicitly instead of in the platform default encoding.
with open(output_path, "w", encoding="utf-8") as f:
    f.write("// Auto-generated by tools/gen-entities.py — do not edit by hand.\n")
    f.write("// Source: WHATWG HTML named character references (entities.json)\n\n")
    f.write('#include "WingTokenizer.h"\n\n\n')

    # The constructor that loads the rows into both lookup maps.
    f.write("WingEntityList::WingEntityList(std::initializer_list<Raw> Data)\n")
    f.write("{\n")
    f.write("\tfor (const Raw& Entry : Data)\n")
    f.write("\t{\n")
    f.write('\t\tFString XName((const ANSICHAR*)Entry.Name);\n')
    f.write("\t\tCharToName.Add(Entry.Codepoint, XName);\n")
    f.write("\t\tNameToChar.Add(XName, Entry.Codepoint);\n")
    f.write("\t}\n")
    f.write("}\n\n")

    # The table itself, one row per entity.
    f.write("WingEntityList WingEntityList::TheList({\n")
    for name, cp in rows:
        f.write(f'\t{{ "{name}", {cp} }},\n')
    f.write("});\n")

print(f"Generated {len(rows)} entities -> {output_path}")
|
||||||
Reference in New Issue
Block a user