Lots of work on unicode support

This commit is contained in:
2023-05-19 00:23:23 -04:00
parent a25213d259
commit 7e25be10a4
10 changed files with 249 additions and 228 deletions

View File

@@ -136,10 +136,10 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
std::string_view str(s, len);
oss << '"';
if (sv::valid_utf8(str) && !sv::has_prefix(str, "")) {
// Output the string in the straightforward way,
// using traditional json escaping.
for (char c : str) {
switch (c) {
while (!str.empty()) {
int32_t cp = sv::read_codepoint_utf8(str);
assert(cp >= 0);
switch (cp) {
case '\\': oss << "\\\\"; break;
case '"' : oss << "\\\""; break;
case '\b': oss << "\\b"; break;
@@ -148,10 +148,11 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
case '\n': oss << "\\n"; break;
case '\t': oss << "\\t"; break;
default: {
if (c < 32) {
oss << "\\u" << util::hex16.val(c);
if (cp < 32) {
oss << "\\u" << util::hex16.val(cp);
} else {
oss << c;
bool ok = util::write_codepoint_utf8(cp, &oss);
assert(ok);
}
}
}

View File

@@ -298,7 +298,7 @@ int32_t read_ascii_char(string_view &source) {
return result;
}
int32_t read_codepoint_utf8(string_view &source) {
int32_t read_codepoint_utf8(std::string_view &source) {
size_t size = source.size();
if (size == 0) return -1;
@@ -322,9 +322,8 @@ int32_t read_codepoint_utf8(string_view &source) {
codepoint = (bytes[0] & 0x07);
seqlen = 4;
} else {
// Bad character. Drop a byte and return invalid CP.
source.remove_prefix(1);
return 1;
// Bad character. return invalid CP.
return -2;
}
if (seqlen > size) {
@@ -333,9 +332,8 @@ int32_t read_codepoint_utf8(string_view &source) {
for (size_t i = 1; i < seqlen; ++i) {
if ((bytes[i] & 0xC0) != 0x80) {
// Bad character. Drop a byte and return invalid CP.
source.remove_prefix(1);
return 1;
// Bad character. return invalid CP.
return -2;
}
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
}
@@ -346,17 +344,15 @@ int32_t read_codepoint_utf8(string_view &source) {
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
// Bad character. Drop a byte and return invalid CP.
source.remove_prefix(1);
return 1;
// Bad character. return invalid CP.
return -2;
}
source.remove_prefix(seqlen);
return codepoint;
}
bool valid_utf8(string_view s)
{
bool valid_utf8(string_view s) {
while (!s.empty()) {
int32_t codepoint = read_codepoint_utf8(s);
if (codepoint < 0) return false;
@@ -403,27 +399,32 @@ void quote_string(const eng::string &s, std::ostream *os) {
}
bool usesinglequote = (!anysq)||(anydq);
(*os) << (usesinglequote ? '\'' : '"');
for (char c : s) {
if (c >= 32) {
if (c == '"') {
(*os) << (usesinglequote ? "\"" : "\\\"");
} else if (c == '\'') {
(*os) << (usesinglequote ? "\\'" : "'");
} else if (c == '\\') {
(*os) << "\\\\";
} else {
(*os) << c;
}
} else {
unsigned int value = ((unsigned char)c);
switch (c) {
std::string_view str(s);
while (!str.empty()) {
unsigned char c0 = (unsigned char)(str[0]);
int cp = sv::read_codepoint_utf8(str);
if (cp < 0) {
(*os) << "\\" << dec.width(3).fill('0').val(c0);
str.remove_prefix(1);
} else if (cp < 32) {
c0 = ((unsigned char)cp);
switch (c0) {
case '\n': (*os) << "\\n"; break;
case '\t': (*os) << "\\t"; break;
case '\r': (*os) << "\\r"; break;
case '\b': (*os) << "\\b"; break;
default:
(*os) << "\\" << dec.width(3).fill('0').val(value);
(*os) << "\\" << dec.width(3).fill('0').val(c0);
break;
}
} else if (cp == '"') {
(*os) << (usesinglequote ? "\"" : "\\\"");
} else if (cp == '\'') {
(*os) << (usesinglequote ? "\\'" : "'");
} else if (cp == '\\') {
(*os) << "\\\\";
} else {
write_codepoint_utf8(cp, os);
}
}
(*os) << (usesinglequote ? '\'' : '"');
@@ -656,50 +657,52 @@ eng::string toupper(eng::string input) {
return input;
}
// Encode a single Unicode codepoint as UTF-8.
//
// Stores the encoded bytes at the start of `buffer` and returns how many
// bytes were stored (1 to 4). No terminating NUL is written, so `buffer`
// only needs room for 4 bytes and the caller must use the returned length
// rather than treating `buffer` as a C string.
//
// Returns 0 and leaves `buffer` untouched when `scp` cannot be encoded:
//   - UTF-16 surrogates (U+D800..U+DFFF),
//   - anything above U+10FFFF.
// A negative value passed in from a signed caller converts to a char32_t
// far above U+10FFFF, so it also lands in the "return 0" case; no separate
// sign test is needed (and one on the unsigned value would never fire).
static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
    const uint32_t cp = static_cast<uint32_t>(scp);
    unsigned char *c = reinterpret_cast<unsigned char *>(buffer);

    if (cp <= 0x7F) {
        // 0xxxxxxx
        c[0] = static_cast<unsigned char>(cp);
        return 1;
    }
    if (cp <= 0x7FF) {
        // 110xxxxx 10xxxxxx
        c[0] = static_cast<unsigned char>((cp >> 6) + 192);
        c[1] = static_cast<unsigned char>((cp & 63) + 128);
        return 2;
    }
    if (cp <= 0xFFFF) {
        // Surrogates are reserved for UTF-16 and have no UTF-8 encoding.
        if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
            return 0;
        }
        // 1110xxxx 10xxxxxx 10xxxxxx
        c[0] = static_cast<unsigned char>((cp >> 12) + 224);
        c[1] = static_cast<unsigned char>(((cp >> 6) & 63) + 128);
        c[2] = static_cast<unsigned char>((cp & 63) + 128);
        return 3;
    }
    if (cp <= 0x10FFFF) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        c[0] = static_cast<unsigned char>((cp >> 18) + 240);
        c[1] = static_cast<unsigned char>(((cp >> 12) & 63) + 128);
        c[2] = static_cast<unsigned char>(((cp >> 6) & 63) + 128);
        c[3] = static_cast<unsigned char>((cp & 63) + 128);
        return 4;
    }
    // Beyond the Unicode range (this includes negative inputs, see above).
    return 0;
}
// Return the UTF-8 encoding of codepoint `cp` as a string.
//
// The string is built from the bytes and the byte count produced by
// buffer_codepoint_utf8, so the buffer does not need to be NUL-terminated.
// When buffer_codepoint_utf8 reports 0 bytes for `cp`, the string is
// constructed from zero bytes.
// NOTE(review): assumes eng::string has a std::string-style
// (const char *, length) constructor — confirm against its definition.
eng::string get_codepoint_utf8(uint32_t cp) {
    char buffer[4];
    const int len = buffer_codepoint_utf8(cp, buffer);
    return eng::string(buffer, len);
}
// Write the UTF-8 encoding of codepoint `cp` to the stream `*s`.
//
// Returns true when buffer_codepoint_utf8 produced at least one byte for
// `cp`, false when it produced none (in which case an empty view is
// inserted into the stream).
//
// The bytes are inserted as a length-delimited string_view rather than as
// a C string: the encoder does not NUL-terminate, and a NUL byte is itself
// a legitimate one-byte encoding that a C-string insert would drop.
bool write_codepoint_utf8(int32_t cp, std::ostream *s) {
    char buffer[4];
    const int len = buffer_codepoint_utf8(cp, buffer);
    (*s) << std::string_view(buffer, len);
    return (len > 0);
}
double distance_squared(double x1, double y1, double x2, double y2) {

View File

@@ -181,8 +181,14 @@ int32_t read_ascii_char(string_view &source);
// Read one UTF-8 encoded codepoint from the front of a string_view.
//
// On success, returns the decoded codepoint (>= 0) and removes the
// bytes that encoded it from the front of the string_view.
//
// If the string_view is empty, returns -1 and doesn't update
// the string_view.
//
// If the string_view contains an unfinished but possibly valid
// codepoint, returns -1 and doesn't update the string_view.
//
// If the next thing in the string_view is an invalid codepoint
// (bad lead byte, bad continuation byte, or an overlong encoding),
// returns -2 and doesn't update the string_view.
//
// Because nothing is consumed on failure, a caller that wants to
// continue past bad input must remove the offending byte itself.
//
int32_t read_codepoint_utf8(string_view &source);