Tokenizer is now done; we also have the new InternalizeID and ExternalizeID.

This commit is contained in:
2026-03-28 19:29:15 -04:00
parent 5aef356199
commit 88fa260c9d
10 changed files with 4883 additions and 0 deletions

96
tools/font-glyphs.py Executable file
View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
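Report which Unicode code points have vector outlines in ALL of the given font files,
and print them to stdout as a C++ `CommonChars` string literal (the double quote and
backslash characters are omitted).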
Usage: python3 font-glyphs.py font1.ttf font2.ttf ...
"""
import sys
import unicodedata
from fontTools.ttLib import TTFont
from fontTools.pens.statisticsPen import StatisticsPen
def get_vector_codepoints(path):
    """Return the set of code points that have actual vector outlines in the font.

    A code point counts when the glyph its cmap entry maps to either has a
    non-zero contour count in the ``glyf`` table, or has a charstring in the
    ``CFF `` table whose drawn outline encloses a non-zero area.

    Args:
        path: Path of the font file to open with ``TTFont``.

    Returns:
        A set of integer code points. The set is empty (and a warning is
        written to stderr) when the font has no usable cmap.
    """
    font = TTFont(path)
    # try/finally so the font is closed on every exit path, including the
    # early return below and any exception raised while inspecting glyphs.
    try:
        cmap = font.getBestCmap()
        if cmap is None:
            print(f"WARNING: {path} has no cmap table", file=sys.stderr)
            return set()

        glyf = font.get("glyf")  # TrueType outlines
        cff = font.get("CFF ")   # CFF outlines

        # The glyph set is the same for every glyph, so build it once rather
        # than once per cmap entry. It is only needed for the CFF path.
        glyph_set = font.getGlyphSet() if cff is not None else None

        result = set()
        for codepoint, glyph_name in cmap.items():
            has_outline = False

            if glyf is not None:
                g = glyf.get(glyph_name)
                # A contour count of zero means the glyph draws nothing.
                if g is not None and g.numberOfContours != 0:
                    has_outline = True

            if cff is not None:
                # CFF fonts store outlines in charstrings.
                try:
                    cs = cff.cff.topDictIndex[0].CharStrings[glyph_name]
                    pen = StatisticsPen(glyphset=glyph_set)
                    cs.draw(pen)
                    if pen.area != 0:
                        has_outline = True
                except (KeyError, AttributeError):
                    # Best effort: a glyph that is missing from the charstrings
                    # (or cannot be inspected) is treated as having no outline.
                    pass

            if has_outline:
                result.add(codepoint)

        return result
    finally:
        font.close()
def main():
    """Intersect the outlined code points of all fonts and emit a C++ literal.

    Reads font paths from ``sys.argv[1:]``. With no paths, prints a usage
    message to stderr and exits with status 1. Otherwise the generated C++
    source (a ``CommonChars`` definition) is printed to stdout, and all
    progress/diagnostic lines are printed to stderr so that redirecting
    stdout yields a clean source file.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} font1.ttf [font2.ttf ...]", file=sys.stderr)
        sys.exit(1)

    paths = sys.argv[1:]

    # Process each font and intersect.
    common = None
    for path in paths:
        cps = get_vector_codepoints(path)
        # Diagnostics belong on stderr: stdout carries the generated C++.
        print(f"{len(cps):6d} glyphs {path}", file=sys.stderr)
        common = cps if common is None else common & cps

    if len(paths) > 1:
        print(
            f"{len(common):6d} glyphs common to all {len(paths)} fonts",
            file=sys.stderr,
        )

    # Build the character string, excluding quote and backslash, which would
    # need escaping inside the emitted string literal.
    excluded = {ord('"'), ord("\\")}
    chars = [chr(cp) for cp in sorted(common) if cp not in excluded]

    # Emit C++ file.
    print("// Auto-generated by tools/font-glyphs.py — do not edit by hand.")
    print(f"// {len(chars)} characters common to all {len(paths)} font(s).")
    print()
    print("const TCHAR *CommonChars = TEXT(")

    # Break into lines of 70 chars for readability; the last line holds the
    # remainder.
    text = "".join(chars)
    for start in range(0, len(text), 70):
        print(f'\t"{text[start:start + 70]}"')

    print(");")


if __name__ == "__main__":
    main()

54
tools/gen-entities.py Normal file
View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Read entities.json (WHATWG HTML named character references) and generate
WingEntities.cpp with a lookup table of { "name", codepoint } rows.
Rules:
- Only entries whose key ends with ';' (skip legacy semicolon-less forms).
- Only entries with exactly one codepoint.
- Codepoint must be <= 0xFFFF (Unreal uses 16-bit TCHAR).
"""
import json, os
# Paths are resolved relative to the project root, taken to be the parent of
# the directory that contains this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(script_dir)
input_path = os.path.join(project_dir, "entities.json")
output_path = os.path.join(
    project_dir,
    "Plugins", "UEWingman", "Source", "UEWingman", "Private", "WingEntities.cpp",
)

# Read with an explicit encoding so the result does not depend on the locale.
with open(input_path, encoding="utf-8") as f:
    data = json.load(f)

# Collect (name, codepoint) rows, sorted by entity key.
rows = []
for key, val in sorted(data.items()):
    # Skip legacy forms that lack the terminating semicolon.
    if not key.endswith(";"):
        continue
    cps = val["codepoints"]
    # Only single-codepoint entities fit a { name, codepoint } row.
    if len(cps) != 1:
        continue
    cp = cps[0]
    # Codepoints above 0xFFFF do not fit the 16-bit limit stated in the rules.
    if cp > 0xFFFF:
        continue
    # Strip leading '&' and trailing ';'
    name = key[1:-1]
    rows.append((name, cp))

# Fixed C++ text that precedes the generated table rows.
preamble = (
    "// Auto-generated by tools/gen-entities.py — do not edit by hand.\n"
    "// Source: WHATWG HTML named character references (entities.json)\n\n"
    '#include "WingTokenizer.h"\n\n\n'
    "WingEntityList::WingEntityList(std::initializer_list<Raw> Data)\n"
    "{\n"
    "\tfor (const Raw& Entry : Data)\n"
    "\t{\n"
    "\t\tFString XName((const ANSICHAR*)Entry.Name);\n"
    "\t\tCharToName.Add(Entry.Codepoint, XName);\n"
    "\t\tNameToChar.Add(XName, Entry.Codepoint);\n"
    "\t}\n"
    "}\n\n"
    "WingEntityList WingEntityList::TheList({\n"
)

# The header comment contains a non-ASCII dash, so write UTF-8 explicitly;
# otherwise the encoding of the generated file depends on the locale.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(preamble)
    f.writelines(f'\t{{ "{name}", {cp} }},\n' for name, cp in rows)
    f.write("});\n")

print(f"Generated {len(rows)} entities -> {output_path}")