From 339495ae3b6d2f0d77fae1d78a6dbaed02fe704c Mon Sep 17 00:00:00 2001 From: jyelon Date: Mon, 30 Mar 2026 00:57:28 -0400 Subject: [PATCH] More work on tokenizer --- .../UEWingman/Handlers/Test_Sanitizer.h | 2 +- .../UEWingman/Handlers/Test_Unsanitize.h | 6 +- .../UEWingman/Private/WingTokenizer.cpp | 107 +++++++++++------- .../Source/UEWingman/Private/WingUtils.cpp | 8 +- .../Source/UEWingman/Public/WingTokenizer.h | 15 ++- 5 files changed, 85 insertions(+), 53 deletions(-) diff --git a/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h b/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h index 5fe04776..8a434326 100644 --- a/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h +++ b/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Sanitizer.h @@ -27,6 +27,6 @@ public: virtual void Handle() override { - UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input)); + UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(FName(Input))); } }; diff --git a/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Unsanitize.h b/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Unsanitize.h index ce77e8d3..791fb9d6 100644 --- a/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Unsanitize.h +++ b/Plugins/UEWingman/Source/UEWingman/Handlers/Test_Unsanitize.h @@ -28,14 +28,14 @@ public: virtual void Handle() override { FString Error; - FString Result = WingTokenizer::TryInternalizeID(Input, Error); + FName Result = WingTokenizer::TryInternalizeID(Input, Error); if (!Error.IsEmpty()) { UWingServer::Printf(TEXT("Error: %s\n"), *Error); } - if (!Result.IsEmpty()) + if (!Result.IsNone()) { - UWingServer::Printf(TEXT("Result: %s\n"), *Result); + UWingServer::Printf(TEXT("Result: %s\n"), *Result.ToString()); } } }; diff --git a/Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp b/Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp index c16af66d..d0c19491 100644 --- 
a/Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp +++ b/Plugins/UEWingman/Source/UEWingman/Private/WingTokenizer.cpp @@ -44,11 +44,19 @@ WingCharacterClasses::WingCharacterClasses() WingCharacterClasses WingCharacterClasses::TheSet; -void WingTokenizer::Add(TCHAR Type, FString InternalID) +void WingTokenizer::Add(TCHAR Type, FName InternalID) { Token T; T.Type = Type; - T.InternalID = MoveTemp(InternalID); + T.InternalID = InternalID; + Tokens.Add(T); +} + +void WingTokenizer::Add(TCHAR Type, FStringView Rest) +{ + Token T; + T.Type = Type; + T.Rest = Rest; Tokens.Add(T); } @@ -117,23 +125,25 @@ TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error) return Result; } -FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error) +FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error) { - if (!Error.IsEmpty()) return FString(); - TStringBuilder<512> Decoded; + if (!Error.IsEmpty()) return FName(); + TCHAR Buffer[NAME_SIZE]; + int Len = 0; while (!Rest.IsEmpty() && Error.IsEmpty()) { TCHAR Ch = Rest[0]; if (Ch == ' ') break; if (Ch == '.') { - Decoded.AppendChar(' '); + if (Len < NAME_SIZE) Buffer[Len++] = ' '; Rest = Rest.RightChop(1); continue; } if (Ch == '&') { - Decoded.AppendChar(TokenizeEscapeSequence(Rest, Error)); + TCHAR Decoded = TokenizeEscapeSequence(Rest, Error); + if (Len < NAME_SIZE) Buffer[Len++] = Decoded; continue; } Cat Category = WingCharacterClasses::GetCat(Ch); @@ -141,20 +151,31 @@ FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error) { // We accept other characters in case the LLM sends unicode // that isn't on the whitelist. This is intentional. - Decoded.AppendChar(Ch); + if (Len < NAME_SIZE) Buffer[Len++] = Ch; Rest = Rest.RightChop(1); } else break; } - if (!Error.IsEmpty()) return FString(); - // We deliberately do not produce an error message for empty identifiers, - // because we can't generate a good message here. 
We leave it to others - // to deal with that case. - return Decoded.ToString(); + if (!Error.IsEmpty()) return FName(); + // The buffer has just enough room to hold the longest FName, + // plus the required null terminator. If we filled the whole + // buffer, leaving no room for the null, it means the name + // is too long. + if (Len == NAME_SIZE) + { + Error = TEXT("FName too long"); + return FName(); + } + Buffer[Len] = 0; + return FName(Len, Buffer); + // Note about code above: we deliberately do not check for empty + // names here, because we don't have the context to generate a good + // error message. So instead, we leave it to the caller. } -WingTokenizer::WingTokenizer(const FString& Input) +WingTokenizer::WingTokenizer(const FString& In) { + Input = In; FStringView Rest(Input); while (!Rest.IsEmpty() && Error.IsEmpty()) { @@ -166,7 +187,7 @@ WingTokenizer::WingTokenizer(const FString& Input) } if (Ch == '=') { - Add(RestOfLine, FString(Rest.RightChop(1))); + Add(RestOfLine, Rest.RightChop(1)); break; } if ((Ch == '.') || (Ch == '&')) @@ -200,30 +221,36 @@ void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const } for (const Token& T : Tokens) { - TStringBuilder<512> ExtraStr; - for (TCHAR Ch : T.InternalID) + Out.Appendf(TEXT("Token '%c': "), T.Type); + if (T.Type == Identifier) { - if (Ch >= 0x20 && Ch <= 0x7E) + for (TCHAR Ch : T.InternalID.ToString()) { - ExtraStr.AppendChar(Ch); - ExtraStr.AppendChar(' '); - } - else - { - ExtraStr.Appendf(TEXT("%04X "), (int32)Ch); + if (Ch >= 0x20 && Ch <= 0x7E) + { + Out.AppendChar(Ch); + Out.AppendChar(' '); + } + else + { + Out.Appendf(TEXT("%04X "), (int32)Ch); + } } } - if (T.Type >= 0x20 && T.Type <= 0x7E) - Out.Appendf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr); - else - Out.Appendf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr); + if (T.Type == RestOfLine) + { + Out.Appendf(TEXT("[%s]"), *FString(T.Rest)); + } + Out.AppendChar('\n'); } } -FString WingTokenizer::ExternalizeID(const FString 
&InternalID) +FString WingTokenizer::ExternalizeID(FName InternalID) { TStringBuilder<512> Result; - for (TCHAR Ch : InternalID) + TCHAR Buffer[FName::StringBufferSize]; + int32 Len = InternalID.ToString(Buffer); + for (TCHAR Ch : FStringView(Buffer, Len)) { if (Ch == ' ') Result.AppendChar('.'); else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch); @@ -246,10 +273,12 @@ FString WingTokenizer::ExternalizeID(const FString &InternalID) return Result.ToString(); } -bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID) +bool WingTokenizer::WouldExternalizeReadably(FName InternalID) { - if (InternalID.IsEmpty()) return false; - for (TCHAR Ch : InternalID) + if (InternalID.IsNone()) return false; + TCHAR Buffer[FName::StringBufferSize]; + int32 Len = InternalID.ToString(Buffer); + for (TCHAR Ch : FStringView(Buffer, Len)) { if (Ch == ' ') continue; if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false; @@ -257,16 +286,16 @@ bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID) return true; } -FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error) +FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error) { FStringView Input(ExternalID); Error.Empty(); - FString InternalID = TokenizeIdentifier(Input, Error); + FName InternalID = TokenizeIdentifier(Input, Error); // If there's already an error, annotate with context if (!Error.IsEmpty()) { Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error); - return FString(); + return FName(); } // If the identifier tokenizer stops before consuming the whole // input, then we need to generate an error message. 
We do our best @@ -288,13 +317,13 @@ FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Erro Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID); } else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID); - return FString(); + return FName(); } // One last error case: empty input - if (InternalID.IsEmpty()) + if (InternalID.IsNone()) { Error = TEXT("ERROR: Empty identifiers are not allowed"); - return FString(); + return FName(); } return InternalID; } diff --git a/Plugins/UEWingman/Source/UEWingman/Private/WingUtils.cpp b/Plugins/UEWingman/Source/UEWingman/Private/WingUtils.cpp index 4e3f4057..ff0882ec 100644 --- a/Plugins/UEWingman/Source/UEWingman/Private/WingUtils.cpp +++ b/Plugins/UEWingman/Source/UEWingman/Private/WingUtils.cpp @@ -59,25 +59,25 @@ FName WingUtils::GetFName(const FWingProperty &Prop) { return Prop.Prop->GetFNam FString WingUtils::ExternalizeID(FName Name) { - return WingTokenizer::ExternalizeID(Name.ToString()); + return WingTokenizer::ExternalizeID(Name); } FName WingUtils::CheckInternalizeID(const FString &ExternalID) { FString Error; - FString InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error); + FName InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error); if (!Error.IsEmpty()) { UWingServer::Printf(TEXT("%s\n"), *Error); UWingServer::SuggestManual(WingManual::Section::EscapeSequences); } - return FName(InternalID); + return InternalID; } FName WingUtils::CheckProposedName(const FString &ExternalID) { FName InternalID = CheckInternalizeID(ExternalID); - if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID.ToString())) + if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID)) { UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"), *ExternalID); diff --git 
a/Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h b/Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h index 3202d0f7..bbc6814d 100644 --- a/Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h +++ b/Plugins/UEWingman/Source/UEWingman/Public/WingTokenizer.h @@ -105,10 +105,12 @@ struct WingTokenizer // RestOfLine, or a single-character punctuation mark. // The InternalID field contains the result of converting // the token from an external ID to an internal ID. + // Rest is only populated if it's a rest-of-line token. struct Token { TCHAR Type; - FString InternalID; + FName InternalID; + FStringView Rest; }; // The string that we tokenized. @@ -129,12 +131,12 @@ struct WingTokenizer // Convert an internal ID into an external ID. // Spaces are converted to periods. Any other // non-identifier character is HTML escaped. - static FString ExternalizeID(const FString &InternalID); + static FString ExternalizeID(FName InternalID); // Return true if the internal ID would convert // to a readable, easy-to-understand external ID without // HTML escape sequences. - static bool WouldExternalizeReadably(const FString &InternalID); + static bool WouldExternalizeReadably(FName InternalID); // Convert an external ID into an internal ID. // Periods are converted back to spaces. HTML escapes @@ -142,7 +144,7 @@ struct WingTokenizer // fail, for example, if the external name contains an - // invalid HTML escape. If it does, returns empty - // string and sets the error message. + // invalid HTML escape. If it does, returns NAME_None + // and sets the error message. - static FString TryInternalizeID(const FString &ExternalID, FString &Error); + static FName TryInternalizeID(const FString &ExternalID, FString &Error); // Simplify an ID. This removes any non-identifier // characters from the ID. Be careful! This could @@ -156,7 +158,8 @@ struct WingTokenizer private: // Add a token to the token array. 
- void Add(TCHAR Type, FString InternalID); + void Add(TCHAR Type, FName InternalID); + void Add(TCHAR Type, FStringView Rest); // Convert numbers to TCHAR. 
If there's an error, set the error // message and return zero. @@ -171,5 +174,5 @@ private: // Tokenize an identifier. Attempts to consume a valid identifier // from rest, and return the identifier. On error, sets the error - // message and returns empty string. + // message and returns NAME_None. - static FString TokenizeIdentifier(FStringView &Rest, FString &Error); + static FName TokenizeIdentifier(FStringView &Rest, FString &Error); };