More work on tokenizer
This commit is contained in:
@@ -27,6 +27,6 @@ public:
|
||||
|
||||
virtual void Handle() override
|
||||
{
|
||||
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input));
|
||||
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(FName(Input)));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -28,14 +28,14 @@ public:
|
||||
virtual void Handle() override
|
||||
{
|
||||
FString Error;
|
||||
FString Result = WingTokenizer::TryInternalizeID(Input, Error);
|
||||
FName Result = WingTokenizer::TryInternalizeID(Input, Error);
|
||||
if (!Error.IsEmpty())
|
||||
{
|
||||
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
||||
}
|
||||
if (!Result.IsEmpty())
|
||||
if (!Result.IsNone())
|
||||
{
|
||||
UWingServer::Printf(TEXT("Result: %s\n"), *Result);
|
||||
UWingServer::Printf(TEXT("Result: %s\n"), *Result.ToString());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -44,11 +44,19 @@ WingCharacterClasses::WingCharacterClasses()
|
||||
|
||||
WingCharacterClasses WingCharacterClasses::TheSet;
|
||||
|
||||
void WingTokenizer::Add(TCHAR Type, FString InternalID)
|
||||
void WingTokenizer::Add(TCHAR Type, FName InternalID)
|
||||
{
|
||||
Token T;
|
||||
T.Type = Type;
|
||||
T.InternalID = MoveTemp(InternalID);
|
||||
T.InternalID = InternalID;
|
||||
Tokens.Add(T);
|
||||
}
|
||||
|
||||
void WingTokenizer::Add(TCHAR Type, FStringView Rest)
|
||||
{
|
||||
Token T;
|
||||
T.Type = Type;
|
||||
T.Rest = Rest;
|
||||
Tokens.Add(T);
|
||||
}
|
||||
|
||||
@@ -117,23 +125,25 @@ TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
|
||||
return Result;
|
||||
}
|
||||
|
||||
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
|
||||
FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
|
||||
{
|
||||
if (!Error.IsEmpty()) return FString();
|
||||
TStringBuilder<512> Decoded;
|
||||
if (!Error.IsEmpty()) return FName();
|
||||
TCHAR Buffer[NAME_SIZE];
|
||||
int Len = 0;
|
||||
while (!Rest.IsEmpty() && Error.IsEmpty())
|
||||
{
|
||||
TCHAR Ch = Rest[0];
|
||||
if (Ch == ' ') break;
|
||||
if (Ch == '.')
|
||||
{
|
||||
Decoded.AppendChar(' ');
|
||||
// A period in an external ID stands for a space in the internal ID,
// so store the space, not the period itself.
if (Len < NAME_SIZE) Buffer[Len++] = ' ';
|
||||
Rest = Rest.RightChop(1);
|
||||
continue;
|
||||
}
|
||||
if (Ch == '&')
|
||||
{
|
||||
Decoded.AppendChar(TokenizeEscapeSequence(Rest, Error));
|
||||
TCHAR Decoded = TokenizeEscapeSequence(Rest, Error);
|
||||
if (Len < NAME_SIZE) Buffer[Len++] = Decoded;
|
||||
continue;
|
||||
}
|
||||
Cat Category = WingCharacterClasses::GetCat(Ch);
|
||||
@@ -141,20 +151,31 @@ FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
|
||||
{
|
||||
// We accept other characters in case the LLM sends unicode
|
||||
// that isn't on the whitelist. This is intentional.
|
||||
Decoded.AppendChar(Ch);
|
||||
if (Len < NAME_SIZE) Buffer[Len++] = Ch;
|
||||
Rest = Rest.RightChop(1);
|
||||
}
|
||||
else break;
|
||||
}
|
||||
if (!Error.IsEmpty()) return FString();
|
||||
// We deliberately do not produce an error message for empty identifiers,
|
||||
// because we can't generate a good message here. We leave it to others
|
||||
// to deal with that case.
|
||||
return Decoded.ToString();
|
||||
if (!Error.IsEmpty()) return FName();
|
||||
// The buffer has just enough room to hold the longest FName,
|
||||
// plus the required null terminator. If we filled the whole
|
||||
// buffer, leaving no room for the null, it means the name
|
||||
// is too long.
|
||||
if (Len == NAME_SIZE)
|
||||
{
|
||||
Error = "FName too long";
|
||||
return FName();
|
||||
}
|
||||
Buffer[Len] = 0;
|
||||
return FName(Len, Buffer);
|
||||
// Note about code above: we deliberately do not check for empty
|
||||
// names here, because we don't have the context to generate a good
|
||||
// error message. So instead, we leave it to the caller.
|
||||
}
|
||||
|
||||
WingTokenizer::WingTokenizer(const FString& Input)
|
||||
WingTokenizer::WingTokenizer(const FString& In)
|
||||
{
|
||||
Input = In;
|
||||
FStringView Rest(Input);
|
||||
while (!Rest.IsEmpty() && Error.IsEmpty())
|
||||
{
|
||||
@@ -166,7 +187,7 @@ WingTokenizer::WingTokenizer(const FString& Input)
|
||||
}
|
||||
if (Ch == '=')
|
||||
{
|
||||
Add(RestOfLine, FString(Rest.RightChop(1)));
|
||||
Add(RestOfLine, Rest.RightChop(1));
|
||||
break;
|
||||
}
|
||||
if ((Ch == '.') || (Ch == '&'))
|
||||
@@ -200,30 +221,36 @@ void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const
|
||||
}
|
||||
for (const Token& T : Tokens)
|
||||
{
|
||||
TStringBuilder<512> ExtraStr;
|
||||
for (TCHAR Ch : T.InternalID)
|
||||
Out.Appendf(TEXT("Token '%c': "), T.Type);
|
||||
if (T.Type == Identifier)
|
||||
{
|
||||
if (Ch >= 0x20 && Ch <= 0x7E)
|
||||
for (TCHAR Ch : T.InternalID.ToString())
|
||||
{
|
||||
ExtraStr.AppendChar(Ch);
|
||||
ExtraStr.AppendChar(' ');
|
||||
}
|
||||
else
|
||||
{
|
||||
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch);
|
||||
if (Ch >= 0x20 && Ch <= 0x7E)
|
||||
{
|
||||
Out.AppendChar(Ch);
|
||||
Out.AppendChar(' ');
|
||||
}
|
||||
else
|
||||
{
|
||||
Out.Appendf(TEXT("%04X "), (int32)Ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (T.Type >= 0x20 && T.Type <= 0x7E)
|
||||
Out.Appendf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr);
|
||||
else
|
||||
Out.Appendf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr);
|
||||
if (T.Type == RestOfLine)
|
||||
{
|
||||
Out.Appendf(TEXT("[%s]"), *FString(T.Rest));
|
||||
}
|
||||
Out.AppendChar('\n');
|
||||
}
|
||||
}
|
||||
|
||||
FString WingTokenizer::ExternalizeID(const FString &InternalID)
|
||||
FString WingTokenizer::ExternalizeID(FName InternalID)
|
||||
{
|
||||
TStringBuilder<512> Result;
|
||||
for (TCHAR Ch : InternalID)
|
||||
TCHAR Buffer[FName::StringBufferSize];
|
||||
int32 Len = InternalID.ToString(Buffer);
|
||||
for (TCHAR Ch : FStringView(Buffer, Len))
|
||||
{
|
||||
if (Ch == ' ') Result.AppendChar('.');
|
||||
else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch);
|
||||
@@ -246,10 +273,12 @@ FString WingTokenizer::ExternalizeID(const FString &InternalID)
|
||||
return Result.ToString();
|
||||
}
|
||||
|
||||
bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
|
||||
bool WingTokenizer::WouldExternalizeReadably(FName InternalID)
|
||||
{
|
||||
if (InternalID.IsEmpty()) return false;
|
||||
for (TCHAR Ch : InternalID)
|
||||
if (InternalID.IsNone()) return false;
|
||||
TCHAR Buffer[FName::StringBufferSize];
|
||||
int32 Len = InternalID.ToString(Buffer);
|
||||
for (TCHAR Ch : FStringView(Buffer, Len))
|
||||
{
|
||||
if (Ch == ' ') continue;
|
||||
if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false;
|
||||
@@ -257,16 +286,16 @@ bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
|
||||
return true;
|
||||
}
|
||||
|
||||
FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
|
||||
FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
|
||||
{
|
||||
FStringView Input(ExternalID);
|
||||
Error.Empty();
|
||||
FString InternalID = TokenizeIdentifier(Input, Error);
|
||||
FName InternalID = TokenizeIdentifier(Input, Error);
|
||||
// If there's already an error, annotate with context
|
||||
if (!Error.IsEmpty())
|
||||
{
|
||||
Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
|
||||
return FString();
|
||||
return FName();
|
||||
}
|
||||
// If the identifier tokenizer stops before consuming the whole
|
||||
// input, then we need to generate an error message. We do our best
|
||||
@@ -288,13 +317,13 @@ FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Erro
|
||||
Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
|
||||
}
|
||||
else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
|
||||
return FString();
|
||||
return FName();
|
||||
}
|
||||
// One last error case: empty input
|
||||
if (InternalID.IsEmpty())
|
||||
if (InternalID.IsNone())
|
||||
{
|
||||
Error = TEXT("ERROR: Empty identifiers are not allowed");
|
||||
return FString();
|
||||
return FName();
|
||||
}
|
||||
return InternalID;
|
||||
}
|
||||
|
||||
@@ -59,25 +59,25 @@ FName WingUtils::GetFName(const FWingProperty &Prop) { return Prop.Prop->GetFNam
|
||||
|
||||
FString WingUtils::ExternalizeID(FName Name)
|
||||
{
|
||||
return WingTokenizer::ExternalizeID(Name.ToString());
|
||||
return WingTokenizer::ExternalizeID(Name);
|
||||
}
|
||||
|
||||
FName WingUtils::CheckInternalizeID(const FString &ExternalID)
|
||||
{
|
||||
FString Error;
|
||||
FString InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
|
||||
FName InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
|
||||
if (!Error.IsEmpty())
|
||||
{
|
||||
UWingServer::Printf(TEXT("%s\n"), *Error);
|
||||
UWingServer::SuggestManual(WingManual::Section::EscapeSequences);
|
||||
}
|
||||
return FName(InternalID);
|
||||
return InternalID;
|
||||
}
|
||||
|
||||
FName WingUtils::CheckProposedName(const FString &ExternalID)
|
||||
{
|
||||
FName InternalID = CheckInternalizeID(ExternalID);
|
||||
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID.ToString()))
|
||||
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID))
|
||||
{
|
||||
UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"),
|
||||
*ExternalID);
|
||||
|
||||
@@ -105,10 +105,12 @@ struct WingTokenizer
|
||||
// RestOfLine, or a single-character punctuation mark.
|
||||
// The InternalID field contains the result of converting
|
||||
// the token from an external ID to an internal ID.
|
||||
// Rest is only populated if it's a rest-of-line token.
|
||||
struct Token
|
||||
{
|
||||
TCHAR Type;
|
||||
FString InternalID;
|
||||
FName InternalID;
|
||||
FStringView Rest;
|
||||
};
|
||||
|
||||
// The string that we tokenized.
|
||||
@@ -129,12 +131,12 @@ struct WingTokenizer
|
||||
// Convert an internal ID into an external ID.
|
||||
// Spaces are converted to periods. Any other
|
||||
// non-identifier character is HTML escaped.
|
||||
static FString ExternalizeID(const FString &InternalID);
|
||||
static FString ExternalizeID(FName InternalID);
|
||||
|
||||
// Return true if the internal ID would convert
|
||||
// to a readable, easy-to-understand external ID without
|
||||
// HTML escape sequences.
|
||||
static bool WouldExternalizeReadably(const FString &InternalID);
|
||||
static bool WouldExternalizeReadably(FName InternalID);
|
||||
|
||||
// Convert an external ID into an internal ID.
|
||||
// Periods are converted back to spaces. HTML escapes
|
||||
@@ -142,7 +144,7 @@ struct WingTokenizer
|
||||
// fail, for example, if the external name contains an
|
||||
// invalid HTML escape. If it does, returns a None
|
||||
// FName and sets the error message.
|
||||
static FString TryInternalizeID(const FString &ExternalID, FString &Error);
|
||||
static FName TryInternalizeID(const FString &ExternalID, FString &Error);
|
||||
|
||||
// Simplify an ID. This removes any non-identifier
|
||||
// characters from the ID. Be careful! This could
|
||||
@@ -156,7 +158,8 @@ struct WingTokenizer
|
||||
|
||||
private:
|
||||
// Add a token to the token array.
|
||||
void Add(TCHAR Type, FString InternalID);
|
||||
void Add(TCHAR Type, FName InternalID);
|
||||
void Add(TCHAR Type, FStringView Rest);
|
||||
|
||||
// Convert numbers to TCHAR. If there's an error, set the error
|
||||
// message and return zero.
|
||||
@@ -171,5 +174,5 @@ private:
|
||||
// Tokenize an identifier. Attempts to consume a valid identifier
|
||||
// from rest, and return the identifier. On error, sets the error
|
||||
// message and returns a None FName.
|
||||
static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
|
||||
static FName TokenizeIdentifier(FStringView &Rest, FString &Error);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user