More work on tokenizer

This commit is contained in:
2026-03-30 00:57:28 -04:00
parent 6041641c74
commit 339495ae3b
5 changed files with 85 additions and 53 deletions

View File

@@ -27,6 +27,6 @@ public:
virtual void Handle() override virtual void Handle() override
{ {
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input)); UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(FName(Input)));
} }
}; };

View File

@@ -28,14 +28,14 @@ public:
virtual void Handle() override virtual void Handle() override
{ {
FString Error; FString Error;
FString Result = WingTokenizer::TryInternalizeID(Input, Error); FName Result = WingTokenizer::TryInternalizeID(Input, Error);
if (!Error.IsEmpty()) if (!Error.IsEmpty())
{ {
UWingServer::Printf(TEXT("Error: %s\n"), *Error); UWingServer::Printf(TEXT("Error: %s\n"), *Error);
} }
if (!Result.IsEmpty()) if (!Result.IsNone())
{ {
UWingServer::Printf(TEXT("Result: %s\n"), *Result); UWingServer::Printf(TEXT("Result: %s\n"), *Result.ToString());
} }
} }
}; };

View File

@@ -44,11 +44,19 @@ WingCharacterClasses::WingCharacterClasses()
WingCharacterClasses WingCharacterClasses::TheSet; WingCharacterClasses WingCharacterClasses::TheSet;
void WingTokenizer::Add(TCHAR Type, FString InternalID) void WingTokenizer::Add(TCHAR Type, FName InternalID)
{ {
Token T; Token T;
T.Type = Type; T.Type = Type;
T.InternalID = MoveTemp(InternalID); T.InternalID = InternalID;
Tokens.Add(T);
}
void WingTokenizer::Add(TCHAR Type, FStringView Rest)
{
Token T;
T.Type = Type;
T.Rest = Rest;
Tokens.Add(T); Tokens.Add(T);
} }
@@ -117,23 +125,25 @@ TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
return Result; return Result;
} }
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error) FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{ {
if (!Error.IsEmpty()) return FString(); if (!Error.IsEmpty()) return FName();
TStringBuilder<512> Decoded; TCHAR Buffer[NAME_SIZE];
int Len = 0;
while (!Rest.IsEmpty() && Error.IsEmpty()) while (!Rest.IsEmpty() && Error.IsEmpty())
{ {
TCHAR Ch = Rest[0]; TCHAR Ch = Rest[0];
if (Ch == ' ') break; if (Ch == ' ') break;
if (Ch == '.') if (Ch == '.')
{ {
Decoded.AppendChar(' '); if (Len < NAME_SIZE) Buffer[Len++] = Ch;
Rest = Rest.RightChop(1); Rest = Rest.RightChop(1);
continue; continue;
} }
if (Ch == '&') if (Ch == '&')
{ {
Decoded.AppendChar(TokenizeEscapeSequence(Rest, Error)); TCHAR Decoded = TokenizeEscapeSequence(Rest, Error);
if (Len < NAME_SIZE) Buffer[Len++] = Decoded;
continue; continue;
} }
Cat Category = WingCharacterClasses::GetCat(Ch); Cat Category = WingCharacterClasses::GetCat(Ch);
@@ -141,20 +151,31 @@ FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{ {
// We accept other characters in case the LLM sends unicode // We accept other characters in case the LLM sends unicode
// that isn't on the whitelist. This is intentional. // that isn't on the whitelist. This is intentional.
Decoded.AppendChar(Ch); if (Len < NAME_SIZE) Buffer[Len++] = Ch;
Rest = Rest.RightChop(1); Rest = Rest.RightChop(1);
} }
else break; else break;
} }
if (!Error.IsEmpty()) return FString(); if (!Error.IsEmpty()) return FName();
// We deliberately do not produce an error message for empty identifiers, // The buffer has just enough room to hold the longest FName,
// because we can't generate a good message here. We leave it to others // plus the required null terminator. If we filled the whole
// to deal with that case. // buffer, leaving no room for the null, it means the name
return Decoded.ToString(); // is too long.
if (Len == NAME_SIZE)
{
Error = "FName too long";
return FName();
}
Buffer[Len] = 0;
return FName(Len, Buffer);
// Note about code above: we deliberately do not check for empty
// names here, because we don't have the context to generate a good
// error message. So instead, we leave it to the caller.
} }
WingTokenizer::WingTokenizer(const FString& Input) WingTokenizer::WingTokenizer(const FString& In)
{ {
Input = In;
FStringView Rest(Input); FStringView Rest(Input);
while (!Rest.IsEmpty() && Error.IsEmpty()) while (!Rest.IsEmpty() && Error.IsEmpty())
{ {
@@ -166,7 +187,7 @@ WingTokenizer::WingTokenizer(const FString& Input)
} }
if (Ch == '=') if (Ch == '=')
{ {
Add(RestOfLine, FString(Rest.RightChop(1))); Add(RestOfLine, Rest.RightChop(1));
break; break;
} }
if ((Ch == '.') || (Ch == '&')) if ((Ch == '.') || (Ch == '&'))
@@ -200,30 +221,36 @@ void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const
} }
for (const Token& T : Tokens) for (const Token& T : Tokens)
{ {
TStringBuilder<512> ExtraStr; Out.Appendf(TEXT("Token '%c': "), T.Type);
for (TCHAR Ch : T.InternalID) if (T.Type == Identifier)
{ {
if (Ch >= 0x20 && Ch <= 0x7E) for (TCHAR Ch : T.InternalID.ToString())
{ {
ExtraStr.AppendChar(Ch); if (Ch >= 0x20 && Ch <= 0x7E)
ExtraStr.AppendChar(' '); {
} Out.AppendChar(Ch);
else Out.AppendChar(' ');
{ }
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch); else
{
Out.Appendf(TEXT("%04X "), (int32)Ch);
}
} }
} }
if (T.Type >= 0x20 && T.Type <= 0x7E) if (T.Type == RestOfLine)
Out.Appendf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr); {
else Out.Appendf(TEXT("[%s]"), *FString(T.Rest));
Out.Appendf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr); }
Out.AppendChar('\n');
} }
} }
FString WingTokenizer::ExternalizeID(const FString &InternalID) FString WingTokenizer::ExternalizeID(FName InternalID)
{ {
TStringBuilder<512> Result; TStringBuilder<512> Result;
for (TCHAR Ch : InternalID) TCHAR Buffer[FName::StringBufferSize];
int32 Len = InternalID.ToString(Buffer);
for (TCHAR Ch : FStringView(Buffer, Len))
{ {
if (Ch == ' ') Result.AppendChar('.'); if (Ch == ' ') Result.AppendChar('.');
else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch); else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch);
@@ -246,10 +273,12 @@ FString WingTokenizer::ExternalizeID(const FString &InternalID)
return Result.ToString(); return Result.ToString();
} }
bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID) bool WingTokenizer::WouldExternalizeReadably(FName InternalID)
{ {
if (InternalID.IsEmpty()) return false; if (InternalID.IsNone()) return false;
for (TCHAR Ch : InternalID) TCHAR Buffer[FName::StringBufferSize];
int32 Len = InternalID.ToString(Buffer);
for (TCHAR Ch : FStringView(Buffer, Len))
{ {
if (Ch == ' ') continue; if (Ch == ' ') continue;
if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false; if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false;
@@ -257,16 +286,16 @@ bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
return true; return true;
} }
FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error) FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
{ {
FStringView Input(ExternalID); FStringView Input(ExternalID);
Error.Empty(); Error.Empty();
FString InternalID = TokenizeIdentifier(Input, Error); FName InternalID = TokenizeIdentifier(Input, Error);
// If there's already an error, annotate with context // If there's already an error, annotate with context
if (!Error.IsEmpty()) if (!Error.IsEmpty())
{ {
Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error); Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
return FString(); return FName();
} }
// If the identifier tokenizer stops before consuming the whole // If the identifier tokenizer stops before consuming the whole
// input, then we need to generate an error message. We do our best // input, then we need to generate an error message. We do our best
@@ -288,13 +317,13 @@ FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Erro
Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID); Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
} }
else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID); else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
return FString(); return FName();
} }
// One last error case: empty input // One last error case: empty input
if (InternalID.IsEmpty()) if (InternalID.IsNone())
{ {
Error = TEXT("ERROR: Empty identifiers are not allowed"); Error = TEXT("ERROR: Empty identifiers are not allowed");
return FString(); return FName();
} }
return InternalID; return InternalID;
} }

View File

@@ -59,25 +59,25 @@ FName WingUtils::GetFName(const FWingProperty &Prop) { return Prop.Prop->GetFNam
FString WingUtils::ExternalizeID(FName Name) FString WingUtils::ExternalizeID(FName Name)
{ {
return WingTokenizer::ExternalizeID(Name.ToString()); return WingTokenizer::ExternalizeID(Name);
} }
FName WingUtils::CheckInternalizeID(const FString &ExternalID) FName WingUtils::CheckInternalizeID(const FString &ExternalID)
{ {
FString Error; FString Error;
FString InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error); FName InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
if (!Error.IsEmpty()) if (!Error.IsEmpty())
{ {
UWingServer::Printf(TEXT("%s\n"), *Error); UWingServer::Printf(TEXT("%s\n"), *Error);
UWingServer::SuggestManual(WingManual::Section::EscapeSequences); UWingServer::SuggestManual(WingManual::Section::EscapeSequences);
} }
return FName(InternalID); return InternalID;
} }
FName WingUtils::CheckProposedName(const FString &ExternalID) FName WingUtils::CheckProposedName(const FString &ExternalID)
{ {
FName InternalID = CheckInternalizeID(ExternalID); FName InternalID = CheckInternalizeID(ExternalID);
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID.ToString())) if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID))
{ {
UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"), UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"),
*ExternalID); *ExternalID);

View File

@@ -105,10 +105,12 @@ struct WingTokenizer
// RestOfLine, or a single-character punctuation mark. // RestOfLine, or a single-character punctuation mark.
// The InternalID field contains the result of converting // The InternalID field contains the result of converting
// the token from an external ID to an internal ID. // the token from an external ID to an internal ID.
// Rest is only populated if it's a rest-of-line token.
struct Token struct Token
{ {
TCHAR Type; TCHAR Type;
FString InternalID; FName InternalID;
FStringView Rest;
}; };
// The string that we tokenized. // The string that we tokenized.
@@ -129,12 +131,12 @@ struct WingTokenizer
// Convert an internal ID into an external ID. // Convert an internal ID into an external ID.
// Spaces are converted to periods. Any other // Spaces are converted to periods. Any other
// non-identifier character is HTML escaped. // non-identifier character is HTML escaped.
static FString ExternalizeID(const FString &InternalID); static FString ExternalizeID(FName InternalID);
// Return true if the internal ID would convert // Return true if the internal ID would convert
// to a readable, easy-to-understand external ID without // to a readable, easy-to-understand external ID without
// HTML escape sequences. // HTML escape sequences.
static bool WouldExternalizeReadably(const FString &InternalID); static bool WouldExternalizeReadably(FName InternalID);
// Convert an external ID into an internal ID. // Convert an external ID into an internal ID.
// Periods are converted back to spaces. HTML escapes // Periods are converted back to spaces. HTML escapes
@@ -142,7 +144,7 @@ struct WingTokenizer
// fail, for example, if the external name contains an // fail, for example, if the external name contains an
// invalid HTML escape. If it does, returns empty // invalid HTML escape. If it does, returns empty
// string and sets the error message. // string and sets the error message.
static FString TryInternalizeID(const FString &ExternalID, FString &Error); static FName TryInternalizeID(const FString &ExternalID, FString &Error);
// Simplify an ID. This removes any non-identifier // Simplify an ID. This removes any non-identifier
// characters from the ID. Be careful! This could // characters from the ID. Be careful! This could
@@ -156,7 +158,8 @@ struct WingTokenizer
private: private:
// Add a token to the token array. // Add a token to the token array.
void Add(TCHAR Type, FString InternalID); void Add(TCHAR Type, FName InternalID);
void Add(TCHAR Type, FStringView Rest);
// Convert numbers to TCHAR. If there's an error, set the error // Convert numbers to TCHAR. If there's an error, set the error
// message and return zero. // message and return zero.
@@ -171,5 +174,5 @@ private:
// Tokenize an identifier. Attempts to consume a valid identifier // Tokenize an identifier. Attempts to consume a valid identifier
// from rest, and return the identifier. On error, sets the error // from rest, and return the identifier. On error, sets the error
// message and returns empty string. // message and returns empty string.
static FString TokenizeIdentifier(FStringView &Rest, FString &Error); static FName TokenizeIdentifier(FStringView &Rest, FString &Error);
}; };