More work on tokenizer
This commit is contained in:
@@ -27,6 +27,6 @@ public:
|
|||||||
|
|
||||||
virtual void Handle() override
|
virtual void Handle() override
|
||||||
{
|
{
|
||||||
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input));
|
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(FName(Input)));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -28,14 +28,14 @@ public:
|
|||||||
virtual void Handle() override
|
virtual void Handle() override
|
||||||
{
|
{
|
||||||
FString Error;
|
FString Error;
|
||||||
FString Result = WingTokenizer::TryInternalizeID(Input, Error);
|
FName Result = WingTokenizer::TryInternalizeID(Input, Error);
|
||||||
if (!Error.IsEmpty())
|
if (!Error.IsEmpty())
|
||||||
{
|
{
|
||||||
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
|
||||||
}
|
}
|
||||||
if (!Result.IsEmpty())
|
if (!Result.IsNone())
|
||||||
{
|
{
|
||||||
UWingServer::Printf(TEXT("Result: %s\n"), *Result);
|
UWingServer::Printf(TEXT("Result: %s\n"), *Result.ToString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -44,11 +44,19 @@ WingCharacterClasses::WingCharacterClasses()
|
|||||||
|
|
||||||
WingCharacterClasses WingCharacterClasses::TheSet;
|
WingCharacterClasses WingCharacterClasses::TheSet;
|
||||||
|
|
||||||
void WingTokenizer::Add(TCHAR Type, FString InternalID)
|
void WingTokenizer::Add(TCHAR Type, FName InternalID)
|
||||||
{
|
{
|
||||||
Token T;
|
Token T;
|
||||||
T.Type = Type;
|
T.Type = Type;
|
||||||
T.InternalID = MoveTemp(InternalID);
|
T.InternalID = InternalID;
|
||||||
|
Tokens.Add(T);
|
||||||
|
}
|
||||||
|
|
||||||
|
void WingTokenizer::Add(TCHAR Type, FStringView Rest)
|
||||||
|
{
|
||||||
|
Token T;
|
||||||
|
T.Type = Type;
|
||||||
|
T.Rest = Rest;
|
||||||
Tokens.Add(T);
|
Tokens.Add(T);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,23 +125,25 @@ TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
|
|||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
|
FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
|
||||||
{
|
{
|
||||||
if (!Error.IsEmpty()) return FString();
|
if (!Error.IsEmpty()) return FName();
|
||||||
TStringBuilder<512> Decoded;
|
TCHAR Buffer[NAME_SIZE];
|
||||||
|
int Len = 0;
|
||||||
while (!Rest.IsEmpty() && Error.IsEmpty())
|
while (!Rest.IsEmpty() && Error.IsEmpty())
|
||||||
{
|
{
|
||||||
TCHAR Ch = Rest[0];
|
TCHAR Ch = Rest[0];
|
||||||
if (Ch == ' ') break;
|
if (Ch == ' ') break;
|
||||||
if (Ch == '.')
|
if (Ch == '.')
|
||||||
{
|
{
|
||||||
Decoded.AppendChar(' ');
|
// External ids spell a space as '.', so store the decoded space, not the raw period.
if (Len < NAME_SIZE) Buffer[Len++] = ' ';
|
||||||
Rest = Rest.RightChop(1);
|
Rest = Rest.RightChop(1);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (Ch == '&')
|
if (Ch == '&')
|
||||||
{
|
{
|
||||||
Decoded.AppendChar(TokenizeEscapeSequence(Rest, Error));
|
TCHAR Decoded = TokenizeEscapeSequence(Rest, Error);
|
||||||
|
if (Len < NAME_SIZE) Buffer[Len++] = Decoded;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
Cat Category = WingCharacterClasses::GetCat(Ch);
|
Cat Category = WingCharacterClasses::GetCat(Ch);
|
||||||
@@ -141,20 +151,31 @@ FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
|
|||||||
{
|
{
|
||||||
// We accept other characters in case the LLM sends unicode
|
// We accept other characters in case the LLM sends unicode
|
||||||
// that isn't on the whitelist. This is intentional.
|
// that isn't on the whitelist. This is intentional.
|
||||||
Decoded.AppendChar(Ch);
|
if (Len < NAME_SIZE) Buffer[Len++] = Ch;
|
||||||
Rest = Rest.RightChop(1);
|
Rest = Rest.RightChop(1);
|
||||||
}
|
}
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
if (!Error.IsEmpty()) return FString();
|
if (!Error.IsEmpty()) return FName();
|
||||||
// We deliberately do not produce an error message for empty identifiers,
|
// The buffer has just enough room to hold the longest FName,
|
||||||
// because we can't generate a good message here. We leave it to others
|
// plus the required null terminator. If we filled the whole
|
||||||
// to deal with that case.
|
// buffer, leaving no room for the null, it means the name
|
||||||
return Decoded.ToString();
|
// is too long.
|
||||||
|
if (Len == NAME_SIZE)
|
||||||
|
{
|
||||||
|
Error = "FName too long";
|
||||||
|
return FName();
|
||||||
|
}
|
||||||
|
Buffer[Len] = 0;
|
||||||
|
return FName(Len, Buffer);
|
||||||
|
// Note about code above: we deliberately do not check for empty
|
||||||
|
// names here, because we don't have the context to generate a good
|
||||||
|
// error message. So instead, we leave it to the caller.
|
||||||
}
|
}
|
||||||
|
|
||||||
WingTokenizer::WingTokenizer(const FString& Input)
|
WingTokenizer::WingTokenizer(const FString& In)
|
||||||
{
|
{
|
||||||
|
Input = In;
|
||||||
FStringView Rest(Input);
|
FStringView Rest(Input);
|
||||||
while (!Rest.IsEmpty() && Error.IsEmpty())
|
while (!Rest.IsEmpty() && Error.IsEmpty())
|
||||||
{
|
{
|
||||||
@@ -166,7 +187,7 @@ WingTokenizer::WingTokenizer(const FString& Input)
|
|||||||
}
|
}
|
||||||
if (Ch == '=')
|
if (Ch == '=')
|
||||||
{
|
{
|
||||||
Add(RestOfLine, FString(Rest.RightChop(1)));
|
Add(RestOfLine, Rest.RightChop(1));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if ((Ch == '.') || (Ch == '&'))
|
if ((Ch == '.') || (Ch == '&'))
|
||||||
@@ -200,30 +221,36 @@ void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const
|
|||||||
}
|
}
|
||||||
for (const Token& T : Tokens)
|
for (const Token& T : Tokens)
|
||||||
{
|
{
|
||||||
TStringBuilder<512> ExtraStr;
|
Out.Appendf(TEXT("Token '%c': "), T.Type);
|
||||||
for (TCHAR Ch : T.InternalID)
|
if (T.Type == Identifier)
|
||||||
{
|
{
|
||||||
if (Ch >= 0x20 && Ch <= 0x7E)
|
for (TCHAR Ch : T.InternalID.ToString())
|
||||||
{
|
{
|
||||||
ExtraStr.AppendChar(Ch);
|
if (Ch >= 0x20 && Ch <= 0x7E)
|
||||||
ExtraStr.AppendChar(' ');
|
{
|
||||||
}
|
Out.AppendChar(Ch);
|
||||||
else
|
Out.AppendChar(' ');
|
||||||
{
|
}
|
||||||
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch);
|
else
|
||||||
|
{
|
||||||
|
Out.Appendf(TEXT("%04X "), (int32)Ch);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (T.Type >= 0x20 && T.Type <= 0x7E)
|
if (T.Type == RestOfLine)
|
||||||
Out.Appendf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr);
|
{
|
||||||
else
|
Out.Appendf(TEXT("[%s]"), *FString(T.Rest));
|
||||||
Out.Appendf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr);
|
}
|
||||||
|
Out.AppendChar('\n');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FString WingTokenizer::ExternalizeID(const FString &InternalID)
|
FString WingTokenizer::ExternalizeID(FName InternalID)
|
||||||
{
|
{
|
||||||
TStringBuilder<512> Result;
|
TStringBuilder<512> Result;
|
||||||
for (TCHAR Ch : InternalID)
|
TCHAR Buffer[FName::StringBufferSize];
|
||||||
|
int32 Len = InternalID.ToString(Buffer);
|
||||||
|
for (TCHAR Ch : FStringView(Buffer, Len))
|
||||||
{
|
{
|
||||||
if (Ch == ' ') Result.AppendChar('.');
|
if (Ch == ' ') Result.AppendChar('.');
|
||||||
else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch);
|
else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch);
|
||||||
@@ -246,10 +273,12 @@ FString WingTokenizer::ExternalizeID(const FString &InternalID)
|
|||||||
return Result.ToString();
|
return Result.ToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
|
bool WingTokenizer::WouldExternalizeReadably(FName InternalID)
|
||||||
{
|
{
|
||||||
if (InternalID.IsEmpty()) return false;
|
if (InternalID.IsNone()) return false;
|
||||||
for (TCHAR Ch : InternalID)
|
TCHAR Buffer[FName::StringBufferSize];
|
||||||
|
int32 Len = InternalID.ToString(Buffer);
|
||||||
|
for (TCHAR Ch : FStringView(Buffer, Len))
|
||||||
{
|
{
|
||||||
if (Ch == ' ') continue;
|
if (Ch == ' ') continue;
|
||||||
if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false;
|
if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false;
|
||||||
@@ -257,16 +286,16 @@ bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
|
FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
|
||||||
{
|
{
|
||||||
FStringView Input(ExternalID);
|
FStringView Input(ExternalID);
|
||||||
Error.Empty();
|
Error.Empty();
|
||||||
FString InternalID = TokenizeIdentifier(Input, Error);
|
FName InternalID = TokenizeIdentifier(Input, Error);
|
||||||
// If there's already an error, annotate with context
|
// If there's already an error, annotate with context
|
||||||
if (!Error.IsEmpty())
|
if (!Error.IsEmpty())
|
||||||
{
|
{
|
||||||
Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
|
Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
|
||||||
return FString();
|
return FName();
|
||||||
}
|
}
|
||||||
// If the identifier tokenizer stops before consuming the whole
|
// If the identifier tokenizer stops before consuming the whole
|
||||||
// input, then we need to generate an error message. We do our best
|
// input, then we need to generate an error message. We do our best
|
||||||
@@ -288,13 +317,13 @@ FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Erro
|
|||||||
Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
|
Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
|
||||||
}
|
}
|
||||||
else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
|
else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
|
||||||
return FString();
|
return FName();
|
||||||
}
|
}
|
||||||
// One last error case: empty input
|
// One last error case: empty input
|
||||||
if (InternalID.IsEmpty())
|
if (InternalID.IsNone())
|
||||||
{
|
{
|
||||||
Error = TEXT("ERROR: Empty identifiers are not allowed");
|
Error = TEXT("ERROR: Empty identifiers are not allowed");
|
||||||
return FString();
|
return FName();
|
||||||
}
|
}
|
||||||
return InternalID;
|
return InternalID;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,25 +59,25 @@ FName WingUtils::GetFName(const FWingProperty &Prop) { return Prop.Prop->GetFNam
|
|||||||
|
|
||||||
FString WingUtils::ExternalizeID(FName Name)
|
FString WingUtils::ExternalizeID(FName Name)
|
||||||
{
|
{
|
||||||
return WingTokenizer::ExternalizeID(Name.ToString());
|
return WingTokenizer::ExternalizeID(Name);
|
||||||
}
|
}
|
||||||
|
|
||||||
FName WingUtils::CheckInternalizeID(const FString &ExternalID)
|
FName WingUtils::CheckInternalizeID(const FString &ExternalID)
|
||||||
{
|
{
|
||||||
FString Error;
|
FString Error;
|
||||||
FString InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
|
FName InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
|
||||||
if (!Error.IsEmpty())
|
if (!Error.IsEmpty())
|
||||||
{
|
{
|
||||||
UWingServer::Printf(TEXT("%s\n"), *Error);
|
UWingServer::Printf(TEXT("%s\n"), *Error);
|
||||||
UWingServer::SuggestManual(WingManual::Section::EscapeSequences);
|
UWingServer::SuggestManual(WingManual::Section::EscapeSequences);
|
||||||
}
|
}
|
||||||
return FName(InternalID);
|
return InternalID;
|
||||||
}
|
}
|
||||||
|
|
||||||
FName WingUtils::CheckProposedName(const FString &ExternalID)
|
FName WingUtils::CheckProposedName(const FString &ExternalID)
|
||||||
{
|
{
|
||||||
FName InternalID = CheckInternalizeID(ExternalID);
|
FName InternalID = CheckInternalizeID(ExternalID);
|
||||||
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID.ToString()))
|
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID))
|
||||||
{
|
{
|
||||||
UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"),
|
UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"),
|
||||||
*ExternalID);
|
*ExternalID);
|
||||||
|
|||||||
@@ -105,10 +105,12 @@ struct WingTokenizer
|
|||||||
// RestOfLine, or a single-character punctuation mark.
|
// RestOfLine, or a single-character punctuation mark.
|
||||||
// The InternalID field contains the result of converting
|
// The InternalID field contains the result of converting
|
||||||
// the token from an external ID to an internal ID.
|
// the token from an external ID to an internal ID.
|
||||||
|
// Rest is only populated if it's a rest-of-line token.
|
||||||
struct Token
|
struct Token
|
||||||
{
|
{
|
||||||
TCHAR Type;
|
TCHAR Type;
|
||||||
FString InternalID;
|
FName InternalID;
|
||||||
|
FStringView Rest;
|
||||||
};
|
};
|
||||||
|
|
||||||
// The string that we tokenized.
|
// The string that we tokenized.
|
||||||
@@ -129,12 +131,12 @@ struct WingTokenizer
|
|||||||
// Convert an internal ID into an external ID.
|
// Convert an internal ID into an external ID.
|
||||||
// Spaces are converted to periods. Any other
|
// Spaces are converted to periods. Any other
|
||||||
// non-identifier character is HTML escaped.
|
// non-identifier character is HTML escaped.
|
||||||
static FString ExternalizeID(const FString &InternalID);
|
static FString ExternalizeID(FName InternalID);
|
||||||
|
|
||||||
// Return true if the internal ID would convert
|
// Return true if the internal ID would convert
|
||||||
// to a readable, easy-to-understand external ID without
|
// to a readable, easy-to-understand external ID without
|
||||||
// HTML escape sequences.
|
// HTML escape sequences.
|
||||||
static bool WouldExternalizeReadably(const FString &InternalID);
|
static bool WouldExternalizeReadably(FName InternalID);
|
||||||
|
|
||||||
// Convert an external ID into an internal ID.
|
// Convert an external ID into an internal ID.
|
||||||
// Periods are converted back to spaces. HTML escapes
|
// Periods are converted back to spaces. HTML escapes
|
||||||
@@ -142,7 +144,7 @@ struct WingTokenizer
|
|||||||
// fail, for example, if the external name contains an
|
// fail, for example, if the external name contains an
|
||||||
// invalid HTML escape. If it does, returns empty
|
// invalid HTML escape. If it does, returns empty
|
||||||
// string and sets the error message.
|
// FName (None) and sets the error message.
|
||||||
static FString TryInternalizeID(const FString &ExternalID, FString &Error);
|
static FName TryInternalizeID(const FString &ExternalID, FString &Error);
|
||||||
|
|
||||||
// Simplify an ID. This removes any non-identifier
|
// Simplify an ID. This removes any non-identifier
|
||||||
// characters from the ID. Be careful! This could
|
// characters from the ID. Be careful! This could
|
||||||
@@ -156,7 +158,8 @@ struct WingTokenizer
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Add a token to the token array.
|
// Add a token to the token array.
|
||||||
void Add(TCHAR Type, FString InternalID);
|
void Add(TCHAR Type, FName InternalID);
|
||||||
|
void Add(TCHAR Type, FStringView Rest);
|
||||||
|
|
||||||
// Convert numbers to TCHAR. If there's an error, set the error
|
// Convert numbers to TCHAR. If there's an error, set the error
|
||||||
// message and return zero.
|
// message and return zero.
|
||||||
@@ -171,5 +174,5 @@ private:
|
|||||||
// Tokenize an identifier. Attempts to consume a valid identifier
|
// Tokenize an identifier. Attempts to consume a valid identifier
|
||||||
// from rest, and return the identifier. On error, sets the error
|
// from rest, and return the identifier. On error, sets the error
|
||||||
// message and returns empty string.
|
// message and returns an empty FName (None).
|
||||||
static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
|
static FName TokenizeIdentifier(FStringView &Rest, FString &Error);
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user