More work on tokenizer

This commit is contained in:
2026-03-30 00:57:28 -04:00
parent 6041641c74
commit 339495ae3b
5 changed files with 85 additions and 53 deletions

View File

@@ -27,6 +27,6 @@ public:
// Prints the external (escaped) form of Input followed by a newline.
// WingTokenizer::ExternalizeID takes an FName, so Input is wrapped in
// an FName before the call.
virtual void Handle() override
{
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(Input));
UWingServer::Printf(TEXT("%s\n"), *WingTokenizer::ExternalizeID(FName(Input)));
}
};

View File

@@ -28,14 +28,14 @@ public:
// Runs WingTokenizer::TryInternalizeID on Input and reports the outcome:
// an "Error: ..." line when an error message was produced, and a
// "Result: ..." line when the returned name is not None. The two checks
// are independent of each other.
virtual void Handle() override
{
FString Error;
FString Result = WingTokenizer::TryInternalizeID(Input, Error);
FName Result = WingTokenizer::TryInternalizeID(Input, Error);
if (!Error.IsEmpty())
{
UWingServer::Printf(TEXT("Error: %s\n"), *Error);
}
if (!Result.IsEmpty())
if (!Result.IsNone())
{
UWingServer::Printf(TEXT("Result: %s\n"), *Result);
// FName has no implicit string conversion for %s, so format its ToString().
UWingServer::Printf(TEXT("Result: %s\n"), *Result.ToString());
}
}
};

View File

@@ -44,11 +44,19 @@ WingCharacterClasses::WingCharacterClasses()
WingCharacterClasses WingCharacterClasses::TheSet;
// Appends a token of the given type that carries an internal ID.
// The token's Rest view is not set here and stays default-constructed.
void WingTokenizer::Add(TCHAR Type, FString InternalID)
void WingTokenizer::Add(TCHAR Type, FName InternalID)
{
Token T;
T.Type = Type;
T.InternalID = MoveTemp(InternalID);
T.InternalID = InternalID;
Tokens.Add(T);
}
// Appends a token of the given type that carries a view of trailing text.
// The token's InternalID is not set here and stays default-constructed.
// Only the view is stored (the characters are not copied), so the token
// is usable only while the viewed string remains valid.
void WingTokenizer::Add(TCHAR Type, FStringView Rest)
{
Token T;
T.Type = Type;
T.Rest = Rest;
Tokens.Add(T);
}
@@ -117,23 +125,25 @@ TCHAR WingTokenizer::TokenizeEscapeSequence(FStringView &Rest, FString &Error)
return Result;
}
// Consumes an identifier from the front of Rest and returns it as an FName.
// Scanning stops at a space, at end of input, or at the first character
// that the category check rejects. A '.' is decoded to a space, and '&'
// starts an escape sequence decoded by TokenizeEscapeSequence.
// Returns FName() when Error is already set on entry, when decoding sets
// Error, or when the decoded name does not fit in NAME_SIZE (in which case
// Error is set here). An empty identifier is NOT reported as an error.
FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
FName WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
if (!Error.IsEmpty()) return FString();
TStringBuilder<512> Decoded;
if (!Error.IsEmpty()) return FName();
TCHAR Buffer[NAME_SIZE];
int Len = 0;
while (!Rest.IsEmpty() && Error.IsEmpty())
{
TCHAR Ch = Rest[0];
if (Ch == ' ') break;
if (Ch == '.')
{
Decoded.AppendChar(' ');
// A period in an external ID stands for a space in the internal ID,
// so store ' ' rather than the '.' itself. This mirrors ExternalizeID,
// which maps ' ' back to '.'.
if (Len < NAME_SIZE) Buffer[Len++] = ' ';
Rest = Rest.RightChop(1);
continue;
}
if (Ch == '&')
{
Decoded.AppendChar(TokenizeEscapeSequence(Rest, Error));
TCHAR Decoded = TokenizeEscapeSequence(Rest, Error);
if (Len < NAME_SIZE) Buffer[Len++] = Decoded;
continue;
}
Cat Category = WingCharacterClasses::GetCat(Ch);
@@ -141,20 +151,31 @@ FString WingTokenizer::TokenizeIdentifier(FStringView &Rest, FString &Error)
{
// We accept other characters in case the LLM sends unicode
// that isn't on the whitelist. This is intentional.
Decoded.AppendChar(Ch);
if (Len < NAME_SIZE) Buffer[Len++] = Ch;
Rest = Rest.RightChop(1);
}
else break;
}
if (!Error.IsEmpty()) return FString();
// We deliberately do not produce an error message for empty identifiers,
// because we can't generate a good message here. We leave it to others
// to deal with that case.
return Decoded.ToString();
if (!Error.IsEmpty()) return FName();
// The buffer has just enough room to hold the longest FName,
// plus the required null terminator. If we filled the whole
// buffer, leaving no room for the null, it means the name
// is too long.
if (Len == NAME_SIZE)
{
// Use TEXT() like every other message in this file so the literal
// is a TCHAR string rather than a narrow one converted at runtime.
Error = TEXT("FName too long");
return FName();
}
Buffer[Len] = 0;
return FName(Len, Buffer);
// Note about code above: we deliberately do not check for empty
// names here, because we don't have the context to generate a good
// error message. So instead, we leave it to the caller.
}
WingTokenizer::WingTokenizer(const FString& Input)
WingTokenizer::WingTokenizer(const FString& In)
{
Input = In;
FStringView Rest(Input);
while (!Rest.IsEmpty() && Error.IsEmpty())
{
@@ -166,7 +187,7 @@ WingTokenizer::WingTokenizer(const FString& Input)
}
if (Ch == '=')
{
Add(RestOfLine, FString(Rest.RightChop(1)));
Add(RestOfLine, Rest.RightChop(1));
break;
}
if ((Ch == '.') || (Ch == '&'))
@@ -200,30 +221,36 @@ void WingTokenizer::PrintEverything(FStringBuilderBase &Out) const
}
for (const Token& T : Tokens)
{
TStringBuilder<512> ExtraStr;
for (TCHAR Ch : T.InternalID)
Out.Appendf(TEXT("Token '%c': "), T.Type);
if (T.Type == Identifier)
{
if (Ch >= 0x20 && Ch <= 0x7E)
for (TCHAR Ch : T.InternalID.ToString())
{
ExtraStr.AppendChar(Ch);
ExtraStr.AppendChar(' ');
}
else
{
ExtraStr.Appendf(TEXT("%04X "), (int32)Ch);
if (Ch >= 0x20 && Ch <= 0x7E)
{
Out.AppendChar(Ch);
Out.AppendChar(' ');
}
else
{
Out.Appendf(TEXT("%04X "), (int32)Ch);
}
}
}
if (T.Type >= 0x20 && T.Type <= 0x7E)
Out.Appendf(TEXT("Token '%c': %s\n"), T.Type, *ExtraStr);
else
Out.Appendf(TEXT("Token %04X: %s\n"), (int32)T.Type, *ExtraStr);
if (T.Type == RestOfLine)
{
Out.Appendf(TEXT("[%s]"), *FString(T.Rest));
}
Out.AppendChar('\n');
}
}
FString WingTokenizer::ExternalizeID(const FString &InternalID)
FString WingTokenizer::ExternalizeID(FName InternalID)
{
TStringBuilder<512> Result;
for (TCHAR Ch : InternalID)
TCHAR Buffer[FName::StringBufferSize];
int32 Len = InternalID.ToString(Buffer);
for (TCHAR Ch : FStringView(Buffer, Len))
{
if (Ch == ' ') Result.AppendChar('.');
else if (WingCharacterClasses::GetCat(Ch) == Cat::Identifier) Result.AppendChar(Ch);
@@ -246,10 +273,12 @@ FString WingTokenizer::ExternalizeID(const FString &InternalID)
return Result.ToString();
}
bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
bool WingTokenizer::WouldExternalizeReadably(FName InternalID)
{
if (InternalID.IsEmpty()) return false;
for (TCHAR Ch : InternalID)
if (InternalID.IsNone()) return false;
TCHAR Buffer[FName::StringBufferSize];
int32 Len = InternalID.ToString(Buffer);
for (TCHAR Ch : FStringView(Buffer, Len))
{
if (Ch == ' ') continue;
if (WingCharacterClasses::GetCat(Ch) != Cat::Identifier) return false;
@@ -257,16 +286,16 @@ bool WingTokenizer::WouldExternalizeReadably(const FString &InternalID)
return true;
}
FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
FName WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Error)
{
FStringView Input(ExternalID);
Error.Empty();
FString InternalID = TokenizeIdentifier(Input, Error);
FName InternalID = TokenizeIdentifier(Input, Error);
// If there's already an error, annotate with context
if (!Error.IsEmpty())
{
Error = FString::Printf(TEXT("ERROR parsing id %s: %s"), *ExternalID, *Error);
return FString();
return FName();
}
// If the identifier tokenizer stops before consuming the whole
// input, then we need to generate an error message. We do our best
@@ -288,13 +317,13 @@ FString WingTokenizer::TryInternalizeID(const FString &ExternalID, FString &Erro
Error = FString::Printf(TEXT("ERROR parsing id %s: in ids, control characters must be escaped"), *ExternalID);
}
else Error = FString::Printf(TEXT("ERROR parsing id %s: unparseable character in id"), *ExternalID);
return FString();
return FName();
}
// One last error case: empty input
if (InternalID.IsEmpty())
if (InternalID.IsNone())
{
Error = TEXT("ERROR: Empty identifiers are not allowed");
return FString();
return FName();
}
return InternalID;
}

View File

@@ -59,25 +59,25 @@ FName WingUtils::GetFName(const FWingProperty &Prop) { return Prop.Prop->GetFNam
// Thin forwarder: produces the external ID text for an internal FName
// by delegating to WingTokenizer::ExternalizeID.
FString WingUtils::ExternalizeID(FName Name)
{
return WingTokenizer::ExternalizeID(Name.ToString());
return WingTokenizer::ExternalizeID(Name);
}
// Converts an external ID to an internal FName via
// WingTokenizer::TryInternalizeID. When that call produces an error
// message, the message is printed and the EscapeSequences manual section
// is suggested. The tokenizer's result is returned in either case; callers
// in this file detect failure by checking the result with IsNone().
FName WingUtils::CheckInternalizeID(const FString &ExternalID)
{
FString Error;
FString InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
FName InternalID = WingTokenizer::TryInternalizeID(ExternalID, Error);
if (!Error.IsEmpty())
{
UWingServer::Printf(TEXT("%s\n"), *Error);
UWingServer::SuggestManual(WingManual::Section::EscapeSequences);
}
return FName(InternalID);
return InternalID;
}
FName WingUtils::CheckProposedName(const FString &ExternalID)
{
FName InternalID = CheckInternalizeID(ExternalID);
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID.ToString()))
if (!InternalID.IsNone() && !WingTokenizer::WouldExternalizeReadably(InternalID))
{
UWingServer::Printf(TEXT("ERROR: id %s would not be a readable id, may not create item with this name"),
*ExternalID);

View File

@@ -105,10 +105,12 @@ struct WingTokenizer
// RestOfLine, or a single-character punctuation mark.
// The InternalID field contains the result of converting
// the token from an external ID to an internal ID.
// Rest is only populated if it's a rest-of-line token.
struct Token
{
// Token kind; the tokenizer code compares it against Identifier and RestOfLine.
TCHAR Type;
FString InternalID;
// Decoded internal ID. Left default-constructed when the token is added
// with only a Rest view.
FName InternalID;
// Non-owning view of the text for a rest-of-line token.
// NOTE(review): this appears to view the tokenizer's own Input string, so
// it is presumably valid only while that tokenizer (and its Input) is
// alive and has not been copied or moved — confirm against callers.
FStringView Rest;
};
// The string that we tokenized.
@@ -129,12 +131,12 @@ struct WingTokenizer
// Convert an internal ID into an external ID.
// Spaces are converted to periods. Any other
// non-identifier character is HTML escaped.
static FString ExternalizeID(const FString &InternalID);
static FString ExternalizeID(FName InternalID);
// Return true if the internal ID would convert
// to a readable, easy-to-understand external ID without
// HTML escape sequences.
static bool WouldExternalizeReadably(const FString &InternalID);
static bool WouldExternalizeReadably(FName InternalID);
// Convert an external ID into an internal ID.
// Periods are converted back to spaces. HTML escapes
@@ -142,7 +144,7 @@ struct WingTokenizer
// fail, for example, if the external name contains an
// invalid HTML escape. If it does, returns NAME_None
// (a default-constructed FName) and sets the error message.
static FString TryInternalizeID(const FString &ExternalID, FString &Error);
static FName TryInternalizeID(const FString &ExternalID, FString &Error);
// Simplify an ID. This removes any non-identifier
// characters from the ID. Be careful! This could
@@ -156,7 +158,8 @@ struct WingTokenizer
private:
// Add a token to the token array.
void Add(TCHAR Type, FString InternalID);
void Add(TCHAR Type, FName InternalID);
void Add(TCHAR Type, FStringView Rest);
// Convert numbers to TCHAR. If there's an error, set the error
// message and return zero.
@@ -171,5 +174,5 @@ private:
// Tokenize an identifier. Attempts to consume a valid identifier
// from rest, and return the identifier. On error (including a name
// too long to fit in NAME_SIZE), sets the error message and returns
// NAME_None (a default-constructed FName).
static FString TokenizeIdentifier(FStringView &Rest, FString &Error);
static FName TokenizeIdentifier(FStringView &Rest, FString &Error);
};