Lots of work on unicode support
This commit is contained in:
@@ -136,10 +136,10 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
|
|||||||
std::string_view str(s, len);
|
std::string_view str(s, len);
|
||||||
oss << '"';
|
oss << '"';
|
||||||
if (sv::valid_utf8(str) && !sv::has_prefix(str, "")) {
|
if (sv::valid_utf8(str) && !sv::has_prefix(str, "")) {
|
||||||
// Output the string in the straightforward way,
|
while (!str.empty()) {
|
||||||
// using traditional json escaping.
|
int32_t cp = sv::read_codepoint_utf8(str);
|
||||||
for (char c : str) {
|
assert(cp >= 0);
|
||||||
switch (c) {
|
switch (cp) {
|
||||||
case '\\': oss << "\\\\"; break;
|
case '\\': oss << "\\\\"; break;
|
||||||
case '"' : oss << "\\\""; break;
|
case '"' : oss << "\\\""; break;
|
||||||
case '\b': oss << "\\b"; break;
|
case '\b': oss << "\\b"; break;
|
||||||
@@ -148,10 +148,11 @@ static bool encode_string(lua_State *L, eng::ostringstream &oss) {
|
|||||||
case '\n': oss << "\\n"; break;
|
case '\n': oss << "\\n"; break;
|
||||||
case '\t': oss << "\\t"; break;
|
case '\t': oss << "\\t"; break;
|
||||||
default: {
|
default: {
|
||||||
if (c < 32) {
|
if (cp < 32) {
|
||||||
oss << "\\u" << util::hex16.val(c);
|
oss << "\\u" << util::hex16.val(cp);
|
||||||
} else {
|
} else {
|
||||||
oss << c;
|
bool ok = util::write_codepoint_utf8(cp, &oss);
|
||||||
|
assert(ok);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -298,7 +298,7 @@ int32_t read_ascii_char(string_view &source) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t read_codepoint_utf8(string_view &source) {
|
int32_t read_codepoint_utf8(std::string_view &source) {
|
||||||
size_t size = source.size();
|
size_t size = source.size();
|
||||||
if (size == 0) return -1;
|
if (size == 0) return -1;
|
||||||
|
|
||||||
@@ -322,9 +322,8 @@ int32_t read_codepoint_utf8(string_view &source) {
|
|||||||
codepoint = (bytes[0] & 0x07);
|
codepoint = (bytes[0] & 0x07);
|
||||||
seqlen = 4;
|
seqlen = 4;
|
||||||
} else {
|
} else {
|
||||||
// Bad character. Drop a byte and return invalid CP.
|
// Bad character. return invalid CP.
|
||||||
source.remove_prefix(1);
|
return -2;
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (seqlen > size) {
|
if (seqlen > size) {
|
||||||
@@ -333,9 +332,8 @@ int32_t read_codepoint_utf8(string_view &source) {
|
|||||||
|
|
||||||
for (size_t i = 1; i < seqlen; ++i) {
|
for (size_t i = 1; i < seqlen; ++i) {
|
||||||
if ((bytes[i] & 0xC0) != 0x80) {
|
if ((bytes[i] & 0xC0) != 0x80) {
|
||||||
// Bad character. Drop a byte and return invalid CP.
|
// Bad character. return invalid CP.
|
||||||
source.remove_prefix(1);
|
return -2;
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
|
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
|
||||||
}
|
}
|
||||||
@@ -346,17 +344,15 @@ int32_t read_codepoint_utf8(string_view &source) {
|
|||||||
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
|
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
|
||||||
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
|
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
|
||||||
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
|
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
|
||||||
// Bad character. Drop a byte and return invalid CP.
|
// Bad character. return invalid CP.
|
||||||
source.remove_prefix(1);
|
return -2;
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
source.remove_prefix(seqlen);
|
source.remove_prefix(seqlen);
|
||||||
return codepoint;
|
return codepoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool valid_utf8(string_view s)
|
bool valid_utf8(string_view s) {
|
||||||
{
|
|
||||||
while (!s.empty()) {
|
while (!s.empty()) {
|
||||||
int32_t codepoint = read_codepoint_utf8(s);
|
int32_t codepoint = read_codepoint_utf8(s);
|
||||||
if (codepoint < 0) return false;
|
if (codepoint < 0) return false;
|
||||||
@@ -403,27 +399,32 @@ void quote_string(const eng::string &s, std::ostream *os) {
|
|||||||
}
|
}
|
||||||
bool usesinglequote = (!anysq)||(anydq);
|
bool usesinglequote = (!anysq)||(anydq);
|
||||||
(*os) << (usesinglequote ? '\'' : '"');
|
(*os) << (usesinglequote ? '\'' : '"');
|
||||||
for (char c : s) {
|
std::string_view str(s);
|
||||||
if (c >= 32) {
|
while (!str.empty()) {
|
||||||
if (c == '"') {
|
unsigned char c0 = (unsigned char)(str[0]);
|
||||||
(*os) << (usesinglequote ? "\"" : "\\\"");
|
int cp = sv::read_codepoint_utf8(str);
|
||||||
} else if (c == '\'') {
|
if (cp < 0) {
|
||||||
(*os) << (usesinglequote ? "\\'" : "'");
|
(*os) << "\\" << dec.width(3).fill('0').val(c0);
|
||||||
} else if (c == '\\') {
|
str.remove_prefix(1);
|
||||||
(*os) << "\\\\";
|
} else if (cp < 32) {
|
||||||
} else {
|
c0 = ((unsigned char)cp);
|
||||||
(*os) << c;
|
switch (c0) {
|
||||||
}
|
|
||||||
} else {
|
|
||||||
unsigned int value = ((unsigned char)c);
|
|
||||||
switch (c) {
|
|
||||||
case '\n': (*os) << "\\n"; break;
|
case '\n': (*os) << "\\n"; break;
|
||||||
case '\t': (*os) << "\\t"; break;
|
case '\t': (*os) << "\\t"; break;
|
||||||
case '\r': (*os) << "\\r"; break;
|
case '\r': (*os) << "\\r"; break;
|
||||||
|
case '\b': (*os) << "\\b"; break;
|
||||||
default:
|
default:
|
||||||
(*os) << "\\" << dec.width(3).fill('0').val(value);
|
(*os) << "\\" << dec.width(3).fill('0').val(c0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
} else if (cp == '"') {
|
||||||
|
(*os) << (usesinglequote ? "\"" : "\\\"");
|
||||||
|
} else if (cp == '\'') {
|
||||||
|
(*os) << (usesinglequote ? "\\'" : "'");
|
||||||
|
} else if (cp == '\\') {
|
||||||
|
(*os) << "\\\\";
|
||||||
|
} else {
|
||||||
|
write_codepoint_utf8(cp, os);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
(*os) << (usesinglequote ? '\'' : '"');
|
(*os) << (usesinglequote ? '\'' : '"');
|
||||||
@@ -656,50 +657,52 @@ eng::string toupper(eng::string input) {
|
|||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void buffer_codepoint_utf8(int32_t scp, char *buffer) {
|
static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
|
||||||
uint32_t cp = (uint32_t)scp;
|
uint32_t cp = (uint32_t)scp;
|
||||||
unsigned char *c = (unsigned char *)buffer;
|
unsigned char *c = (unsigned char *)buffer;
|
||||||
if (cp <= 0x7F) {
|
if (cp < 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else if (cp <= 0x7F) {
|
||||||
c[0] = cp;
|
c[0] = cp;
|
||||||
c[1] = 0;
|
return 1;
|
||||||
}
|
}
|
||||||
else if (cp <= 0x7FF) {
|
else if (cp <= 0x7FF) {
|
||||||
c[0] = (cp>>6)+192;
|
c[0] = (cp>>6)+192;
|
||||||
c[1] = (cp&63)+128;
|
c[1] = (cp&63)+128;
|
||||||
c[2] = 0;
|
return 2;
|
||||||
}
|
}
|
||||||
else if (cp <= 0xFFFF) {
|
else if (cp <= 0xFFFF) {
|
||||||
if (0xd800 <= cp && cp <= 0xdfff) {
|
if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
|
||||||
c[0] = 0;
|
return 0;
|
||||||
} else {
|
|
||||||
c[0] = (cp>>12)+224;
|
|
||||||
c[1] = ((cp>>6)&63)+128;
|
|
||||||
c[2] = (cp&63)+128;
|
|
||||||
c[3] = 0;
|
|
||||||
}
|
}
|
||||||
|
c[0] = (cp>>12)+224;
|
||||||
|
c[1] = ((cp>>6)&63)+128;
|
||||||
|
c[2] = (cp&63)+128;
|
||||||
|
return 3;
|
||||||
}
|
}
|
||||||
else if (cp <= 0x10FFFF) {
|
else if (cp <= 0x10FFFF) {
|
||||||
c[0] = (cp>>18)+240;
|
c[0] = (cp>>18)+240;
|
||||||
c[1] = ((cp>>12)&63)+128;
|
c[1] = ((cp>>12)&63)+128;
|
||||||
c[2] = ((cp>>6)&63)+128;
|
c[2] = ((cp>>6)&63)+128;
|
||||||
c[3] = (cp&63)+128;
|
c[3] = (cp&63)+128;
|
||||||
c[4] = 0;
|
return 4;
|
||||||
} else {
|
} else {
|
||||||
c[0] = 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eng::string get_codepoint_utf8(uint32_t cp) {
|
eng::string get_codepoint_utf8(uint32_t cp) {
|
||||||
char buffer[5];
|
char buffer[4];
|
||||||
buffer_codepoint_utf8(cp, buffer);
|
int len = buffer_codepoint_utf8(cp, buffer);
|
||||||
return eng::string(buffer);
|
return eng::string(buffer, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool write_codepoint_utf8(int32_t cp, std::ostream *s) {
|
bool write_codepoint_utf8(int32_t cp, std::ostream *s) {
|
||||||
char buffer[5];
|
char buffer[4];
|
||||||
buffer_codepoint_utf8(cp, buffer);
|
int len = buffer_codepoint_utf8(cp, buffer);
|
||||||
(*s) << buffer;
|
(*s) << std::string_view(buffer, len);
|
||||||
return buffer[0] != 0;
|
return (len > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
double distance_squared(double x1, double y1, double x2, double y2) {
|
double distance_squared(double x1, double y1, double x2, double y2) {
|
||||||
|
|||||||
@@ -181,8 +181,14 @@ int32_t read_ascii_char(string_view &source);
|
|||||||
|
|
||||||
// Read a UTF8 codepoint from a string_view.
|
// Read a UTF8 codepoint from a string_view.
|
||||||
//
|
//
|
||||||
// If the next thing in the string_view isn't a valid
|
// If the string_view is empty, returns -1 and doesn't update
|
||||||
// codepoint, returns -1 and doesn't update the view.
|
// the string_view.
|
||||||
|
//
|
||||||
|
// If the string_view contains an unfinished but possibly valid
|
||||||
|
// codepoint, returns -1 and doesn't update the string_view.
|
||||||
|
//
|
||||||
|
// If the next thing in the string_view is an invalid codepoint,
|
||||||
|
// returns -2 and doesn't update the string_view.
|
||||||
//
|
//
|
||||||
int32_t read_codepoint_utf8(string_view &source);
|
int32_t read_codepoint_utf8(string_view &source);
|
||||||
|
|
||||||
|
|||||||
@@ -219,18 +219,18 @@ static int socket_poll(struct pollfd *pollvec, int pollcount, int mstimeout, std
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Write unicode onto the console.
|
// Write unicode onto the console.
|
||||||
static void console_write(const CodepointString &cps) {
|
static void console_write(const std::u32string &cps) {
|
||||||
std::string utf8 = ReadlineDevice::to_utf8(cps);
|
std::string utf8 = drvutil::to_utf8(cps);
|
||||||
write(1, utf8.c_str(), utf8.size());
|
write(1, utf8.c_str(), utf8.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
static CodepointString console_read() {
|
static std::u32string console_read() {
|
||||||
CodepointString result;
|
std::u32string result;
|
||||||
char buffer[512];
|
char buffer[512];
|
||||||
int nread = read(0, buffer, 512);
|
int nread = read(0, buffer, 512);
|
||||||
if (nread > 0) {
|
if (nread > 0) {
|
||||||
std::string_view s(buffer, nread);
|
std::string_view s(buffer, nread);
|
||||||
result = ReadlineDevice::from_utf8(s, nullptr);
|
result = drvutil::from_utf8(s, nullptr);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -230,14 +230,15 @@ static void init_winsock() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void console_write(const CodepointString &cps) {
|
|
||||||
|
static void console_write(const std::u32string &cps) {
|
||||||
if (cps.size() == 0) return;
|
if (cps.size() == 0) return;
|
||||||
// Convert to wstring.
|
// Convert to wstring. Any character not representable as a single wchar_t
|
||||||
// Any character outside the range 0xFFFF is replaced with a box.
|
// is replaced with a box. It's not ideal, but it's pretty good.
|
||||||
std::wstring ws(cps.size(), 0);
|
std::wstring ws(cps.size(), 0);
|
||||||
for (int i = 0; i < int(cps.size()); i++) {
|
for (int i = 0; i < int(cps.size()); i++) {
|
||||||
char32_t c = cps[i];
|
char32_t c = cps[i];
|
||||||
if ((c >= 0)&&(c <= 0xFFFF)) ws[i] = (wchar_t)c;
|
if (drvutil::is_single_wchar_t(c)) ws[i] = (wchar_t)c;
|
||||||
else ws[i] = 0x2610;
|
else ws[i] = 0x2610;
|
||||||
}
|
}
|
||||||
HANDLE hstdout = GetStdHandle(STD_OUTPUT_HANDLE);
|
HANDLE hstdout = GetStdHandle(STD_OUTPUT_HANDLE);
|
||||||
@@ -253,7 +254,7 @@ static void console_write(const CodepointString &cps) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static CodepointString console_read() {
|
static std::u32string console_read() {
|
||||||
HANDLE hstdin = GetStdHandle(STD_INPUT_HANDLE);
|
HANDLE hstdin = GetStdHandle(STD_INPUT_HANDLE);
|
||||||
assert(hstdin != INVALID_HANDLE_VALUE);
|
assert(hstdin != INVALID_HANDLE_VALUE);
|
||||||
INPUT_RECORD inrecords[512];
|
INPUT_RECORD inrecords[512];
|
||||||
@@ -262,7 +263,7 @@ static CodepointString console_read() {
|
|||||||
if (int(nevents) > 0) {
|
if (int(nevents) > 0) {
|
||||||
if (int(nevents) > 512) nevents = 512;
|
if (int(nevents) > 512) nevents = 512;
|
||||||
ReadConsoleInputW(hstdin, inrecords, nevents, &nread);
|
ReadConsoleInputW(hstdin, inrecords, nevents, &nread);
|
||||||
CodepointString result(nread, 0);
|
std::u32string result(nread, 0);
|
||||||
int len = 0;
|
int len = 0;
|
||||||
for (int i = 0; i < int(nread); i++) {
|
for (int i = 0; i < int(nread); i++) {
|
||||||
const INPUT_RECORD &inr = inrecords[i];
|
const INPUT_RECORD &inr = inrecords[i];
|
||||||
@@ -274,7 +275,7 @@ static CodepointString console_read() {
|
|||||||
return result.substr(0, len);
|
return result.substr(0, len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return CodepointString();
|
return std::u32string();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ssl_load_certificate_authorities(SSL_CTX *ctx) {
|
static void ssl_load_certificate_authorities(SSL_CTX *ctx) {
|
||||||
|
|||||||
@@ -206,7 +206,7 @@ class Driver {
|
|||||||
if (ndata > DRV_SHORTSTRING_SIZE) ndata = DRV_SHORTSTRING_SIZE;
|
if (ndata > DRV_SHORTSTRING_SIZE) ndata = DRV_SHORTSTRING_SIZE;
|
||||||
std::string_view src(data, ndata);
|
std::string_view src(data, ndata);
|
||||||
int consumed;
|
int consumed;
|
||||||
CodepointString cps = ReadlineDevice::from_utf8(src, &consumed);
|
std::u32string cps = drvutil::from_utf8(src, &consumed);
|
||||||
readline_device_.print(cps);
|
readline_device_.print(cps);
|
||||||
engw.play_sent_outgoing(&engw, 0, consumed);
|
engw.play_sent_outgoing(&engw, 0, consumed);
|
||||||
}
|
}
|
||||||
@@ -217,16 +217,16 @@ class Driver {
|
|||||||
uint32_t promptlen;
|
uint32_t promptlen;
|
||||||
const char *promptdata;
|
const char *promptdata;
|
||||||
engw.get_console_prompt(&engw, &promptlen, &promptdata);
|
engw.get_console_prompt(&engw, &promptlen, &promptdata);
|
||||||
CodepointString prompt = ReadlineDevice::from_utf8(std::string_view(promptdata, promptlen), nullptr);
|
std::u32string prompt = drvutil::from_utf8(std::string_view(promptdata, promptlen), nullptr);
|
||||||
readline_device_.set_prompt(prompt);
|
readline_device_.set_prompt(prompt);
|
||||||
while (true) {
|
while (true) {
|
||||||
CodepointString cps = console_read();
|
std::u32string cps = console_read();
|
||||||
if (cps.size() == 0) break;
|
if (cps.size() == 0) break;
|
||||||
read_console_recently_ = true;
|
read_console_recently_ = true;
|
||||||
for (char32_t c : cps) {
|
for (char32_t c : cps) {
|
||||||
CodepointString line = readline_device_.putcode(c);
|
std::u32string line = readline_device_.putcode(c);
|
||||||
if (!line.empty()) {
|
if (!line.empty()) {
|
||||||
std::string utf8 = ReadlineDevice::to_utf8(line);
|
std::string utf8 = drvutil::to_utf8(line);
|
||||||
engw.play_recv_incoming(&engw, 0, utf8.size(), utf8.c_str());
|
engw.play_recv_incoming(&engw, 0, utf8.size(), utf8.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -77,6 +77,131 @@ void split_target(std::string_view target, std::string &cert, std::string &host,
|
|||||||
port = std::string(split[2]);
|
port = std::string(split[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool is_single_wchar_t(char32_t c) {
|
||||||
|
if ((c >= 0xD800) && (c <= 0xDFFF)) return false;
|
||||||
|
if ((c >= 0) && (c <= 0xFFFF)) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
|
||||||
|
uint32_t cp = (uint32_t)scp;
|
||||||
|
unsigned char *c = (unsigned char *)buffer;
|
||||||
|
if (cp < 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else if (cp <= 0x7F) {
|
||||||
|
c[0] = cp;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else if (cp <= 0x7FF) {
|
||||||
|
c[0] = (cp>>6)+192;
|
||||||
|
c[1] = (cp&63)+128;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
else if (cp <= 0xFFFF) {
|
||||||
|
if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
c[0] = (cp>>12)+224;
|
||||||
|
c[1] = ((cp>>6)&63)+128;
|
||||||
|
c[2] = (cp&63)+128;
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
else if (cp <= 0x10FFFF) {
|
||||||
|
c[0] = (cp>>18)+240;
|
||||||
|
c[1] = ((cp>>12)&63)+128;
|
||||||
|
c[2] = ((cp>>6)&63)+128;
|
||||||
|
c[3] = (cp&63)+128;
|
||||||
|
return 4;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int32_t read_codepoint_utf8(std::string_view &source) {
|
||||||
|
size_t size = source.size();
|
||||||
|
if (size == 0) return -1;
|
||||||
|
|
||||||
|
const unsigned char *bytes = (const unsigned char *)source.data();
|
||||||
|
int codepoint;
|
||||||
|
size_t seqlen;
|
||||||
|
if ((bytes[0] & 0x80) == 0x00) {
|
||||||
|
// U+0000 to U+007F
|
||||||
|
codepoint = (bytes[0] & 0x7F);
|
||||||
|
seqlen = 1;
|
||||||
|
} else if ((bytes[0] & 0xE0) == 0xC0) {
|
||||||
|
// U+0080 to U+07FF
|
||||||
|
codepoint = (bytes[0] & 0x1F);
|
||||||
|
seqlen = 2;
|
||||||
|
} else if ((bytes[0] & 0xF0) == 0xE0) {
|
||||||
|
// U+0800 to U+FFFF
|
||||||
|
codepoint = (bytes[0] & 0x0F);
|
||||||
|
seqlen = 3;
|
||||||
|
} else if ((bytes[0] & 0xF8) == 0xF0) {
|
||||||
|
// U+10000 to U+10FFFF
|
||||||
|
codepoint = (bytes[0] & 0x07);
|
||||||
|
seqlen = 4;
|
||||||
|
} else {
|
||||||
|
// Bad character. return invalid CP.
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seqlen > size) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 1; i < seqlen; ++i) {
|
||||||
|
if ((bytes[i] & 0xC0) != 0x80) {
|
||||||
|
// Bad character. return invalid CP.
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((codepoint > 0x10FFFF) ||
|
||||||
|
((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
|
||||||
|
((codepoint <= 0x007F) && (seqlen != 1)) ||
|
||||||
|
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
|
||||||
|
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
|
||||||
|
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
|
||||||
|
// Bad character. return invalid CP.
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
|
||||||
|
source.remove_prefix(seqlen);
|
||||||
|
return codepoint;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string to_utf8(const std::u32string &s) {
|
||||||
|
std::string result(s.size() * 4, 0);
|
||||||
|
char *buffer = &result[0];
|
||||||
|
int len = 0;
|
||||||
|
for (char32_t c : s) {
|
||||||
|
int clen = buffer_codepoint_utf8(c, buffer + len);
|
||||||
|
len += clen;
|
||||||
|
}
|
||||||
|
return result.substr(0, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::u32string from_utf8(std::string_view s, int *consumed) {
|
||||||
|
std::string_view rest = s;
|
||||||
|
std::u32string result(s.size(), 0);
|
||||||
|
int len = 0;
|
||||||
|
while (true) {
|
||||||
|
int32_t c = read_codepoint_utf8(rest);
|
||||||
|
if (c == -1) {
|
||||||
|
break; // EOF reached;
|
||||||
|
} else if (c < 0) {
|
||||||
|
rest.remove_prefix(1);
|
||||||
|
} else {
|
||||||
|
result[len++] = (char32_t)c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (consumed != nullptr) {
|
||||||
|
*consumed = s.size() - rest.size();
|
||||||
|
}
|
||||||
|
return result.substr(0, len);
|
||||||
|
}
|
||||||
|
|
||||||
static std::vector<std::string> parse_control_lst(std::string_view ctrl) {
|
static std::vector<std::string> parse_control_lst(std::string_view ctrl) {
|
||||||
std::vector<std::string> result;
|
std::vector<std::string> result;
|
||||||
|
|||||||
@@ -46,6 +46,23 @@ std::string package_lua_source(const std::filesystem::path &base, std::ostream *
|
|||||||
//
|
//
|
||||||
void split_target(std::string_view target, std::string &cert, std::string &host, std::string &port);
|
void split_target(std::string_view target, std::string &cert, std::string &host, std::string &port);
|
||||||
|
|
||||||
|
// Return true if the unicode codepoint can be converted to a single 16-bit wchar_t.
|
||||||
|
//
|
||||||
|
bool is_single_wchar_t(char32_t c);
|
||||||
|
|
||||||
|
// Convert a codepoint string into a UTF8-string.
|
||||||
|
// If the codepoint string contains invalid codepoints, they're silently dropped.
|
||||||
|
//
|
||||||
|
std::string to_utf8(const std::u32string &cps);
|
||||||
|
|
||||||
|
// Convert a UTF8 string to a codepoint string.
|
||||||
|
//
|
||||||
|
// If the UTF8 string contains invalid sequences, they're silently dropped.
|
||||||
|
// Some of the bytes may not be consumed, if the source ends with an unfinished
|
||||||
|
// utf-8 sequence. Returns the Codepoint string and the number of bytes consumed.
|
||||||
|
//
|
||||||
|
std::u32string from_utf8(std::string_view source, int *consumed);
|
||||||
|
|
||||||
// Get a system error message, in an OS-independent manner.
|
// Get a system error message, in an OS-independent manner.
|
||||||
//
|
//
|
||||||
// These versions of strerror is thread-safe, and it never fails
|
// These versions of strerror is thread-safe, and it never fails
|
||||||
|
|||||||
@@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
#define MAXLINE 512
|
#define MAXLINE 512
|
||||||
|
|
||||||
static CodepointString n_backspaces(int n) {
|
static std::u32string n_backspaces(int n) {
|
||||||
CodepointString result(3 * n, 0);
|
std::u32string result(3 * n, 0);
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
result[i*3 + 0] = '\b';
|
result[i*3 + 0] = '\b';
|
||||||
result[i*3 + 1] = ' ';
|
result[i*3 + 1] = ' ';
|
||||||
@@ -12,7 +12,7 @@ static CodepointString n_backspaces(int n) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int common_prefix_length(const CodepointString &a, const CodepointString &b) {
|
static int common_prefix_length(const std::u32string &a, const std::u32string &b) {
|
||||||
int minlen = std::min(a.size(), b.size());
|
int minlen = std::min(a.size(), b.size());
|
||||||
for (int i = 0; i < minlen; i++) {
|
for (int i = 0; i < minlen; i++) {
|
||||||
if (a[i] != b[i]) return i;
|
if (a[i] != b[i]) return i;
|
||||||
@@ -20,104 +20,11 @@ static int common_prefix_length(const CodepointString &a, const CodepointString
|
|||||||
return minlen;
|
return minlen;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
|
|
||||||
uint32_t cp = (uint32_t)scp;
|
|
||||||
unsigned char *c = (unsigned char *)buffer;
|
|
||||||
if (cp < 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else if (cp <= 0x7F) {
|
|
||||||
c[0] = cp;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
else if (cp <= 0x7FF) {
|
|
||||||
c[0] = (cp>>6)+192;
|
|
||||||
c[1] = (cp&63)+128;
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
else if (cp <= 0xFFFF) {
|
|
||||||
c[0] = (cp>>12)+224;
|
|
||||||
c[1] = ((cp>>6)&63)+128;
|
|
||||||
c[2] = (cp&63)+128;
|
|
||||||
return 3;
|
|
||||||
}
|
|
||||||
else if (cp <= 0x10FFFF) {
|
|
||||||
c[0] = (cp>>18)+240;
|
|
||||||
c[1] = ((cp>>12)&63)+128;
|
|
||||||
c[2] = ((cp>>6)&63)+128;
|
|
||||||
c[3] = (cp&63)+128;
|
|
||||||
return 4;
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int32_t read_codepoint_utf8(std::string_view &source) {
|
|
||||||
size_t size = source.size();
|
|
||||||
if (size == 0) return -1;
|
|
||||||
|
|
||||||
const unsigned char *bytes = (const unsigned char *)source.data();
|
|
||||||
int codepoint;
|
|
||||||
size_t seqlen;
|
|
||||||
if ((bytes[0] & 0x80) == 0x00) {
|
|
||||||
// U+0000 to U+007F
|
|
||||||
codepoint = (bytes[0] & 0x7F);
|
|
||||||
seqlen = 1;
|
|
||||||
} else if ((bytes[0] & 0xE0) == 0xC0) {
|
|
||||||
// U+0080 to U+07FF
|
|
||||||
codepoint = (bytes[0] & 0x1F);
|
|
||||||
seqlen = 2;
|
|
||||||
} else if ((bytes[0] & 0xF0) == 0xE0) {
|
|
||||||
// U+0800 to U+FFFF
|
|
||||||
codepoint = (bytes[0] & 0x0F);
|
|
||||||
seqlen = 3;
|
|
||||||
} else if ((bytes[0] & 0xF8) == 0xF0) {
|
|
||||||
// U+10000 to U+10FFFF
|
|
||||||
codepoint = (bytes[0] & 0x07);
|
|
||||||
seqlen = 4;
|
|
||||||
} else {
|
|
||||||
// Bad character. Drop a byte and return invalid CP.
|
|
||||||
source.remove_prefix(1);
|
|
||||||
return -2;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (seqlen > size) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 1; i < seqlen; ++i) {
|
|
||||||
if ((bytes[i] & 0xC0) != 0x80) {
|
|
||||||
// Bad character. Drop a byte and return invalid CP.
|
|
||||||
source.remove_prefix(1);
|
|
||||||
return -2;
|
|
||||||
}
|
|
||||||
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((codepoint > 0x10FFFF) ||
|
|
||||||
((codepoint <= 0x007F) && (seqlen != 1)) ||
|
|
||||||
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
|
|
||||||
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
|
|
||||||
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
|
|
||||||
// Bad character. Drop a byte and return invalid CP.
|
|
||||||
source.remove_prefix(1);
|
|
||||||
return -2;
|
|
||||||
}
|
|
||||||
|
|
||||||
source.remove_prefix(seqlen);
|
|
||||||
return codepoint;
|
|
||||||
}
|
|
||||||
|
|
||||||
ReadlineDevice::ReadlineDevice() {
|
|
||||||
desired_prompt_ = CodepointString(1, '>');
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void ReadlineDevice::set_print_callback(print_callback cb) {
|
void ReadlineDevice::set_print_callback(print_callback cb) {
|
||||||
print_cb_ = cb;
|
print_cb_ = cb;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ReadlineDevice::set_prompt(const CodepointString &prompt) {
|
void ReadlineDevice::set_prompt(const std::u32string &prompt) {
|
||||||
desired_prompt_ = prompt;
|
desired_prompt_ = prompt;
|
||||||
echo_command();
|
echo_command();
|
||||||
}
|
}
|
||||||
@@ -152,24 +59,24 @@ void ReadlineDevice::echo_command() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Echo the new part.
|
// Echo the new part.
|
||||||
CodepointString newpart = desired_command_.substr(current_command_.size());
|
std::u32string newpart = desired_command_.substr(current_command_.size());
|
||||||
if (!newpart.empty()) {
|
if (!newpart.empty()) {
|
||||||
print_cb_(newpart);
|
print_cb_(newpart);
|
||||||
current_command_ = desired_command_;
|
current_command_ = desired_command_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CodepointString ReadlineDevice::putcode(char32_t c) {
|
std::u32string ReadlineDevice::putcode(char32_t c) {
|
||||||
if ((c == '\n') && (readline_lastc_ == '\r')) {
|
if ((c == '\n') && (readline_lastc_ == '\r')) {
|
||||||
// Ignore newline immediately after carriage return.
|
// Ignore newline immediately after carriage return.
|
||||||
// Otherwise, crlf produces two newlines.
|
// Otherwise, crlf produces two newlines.
|
||||||
return CodepointString();
|
return std::u32string();
|
||||||
} else if ((c == '\r') || (c == '\n')) {
|
} else if ((c == '\r') || (c == '\n')) {
|
||||||
CodepointString white(1, ' ');
|
std::u32string white(1, ' ');
|
||||||
CodepointString newline(1, '\n');
|
std::u32string newline(1, '\n');
|
||||||
echo_command();
|
echo_command();
|
||||||
print_cb_(white + newline);
|
print_cb_(white + newline);
|
||||||
CodepointString result = desired_command_ + newline;
|
std::u32string result = desired_command_ + newline;
|
||||||
desired_command_.clear();
|
desired_command_.clear();
|
||||||
current_prompt_.clear();
|
current_prompt_.clear();
|
||||||
current_command_.clear();
|
current_command_.clear();
|
||||||
@@ -181,20 +88,20 @@ CodepointString ReadlineDevice::putcode(char32_t c) {
|
|||||||
desired_command_ = desired_command_.substr(0, len-1);
|
desired_command_ = desired_command_.substr(0, len-1);
|
||||||
}
|
}
|
||||||
echo_command();
|
echo_command();
|
||||||
return CodepointString();
|
return std::u32string();
|
||||||
} else if ((c >= 32)&&(c <= 0x10FFFF)) {
|
} else if ((c >= 32)&&(c <= 0x10FFFF)) {
|
||||||
int len = desired_command_.size();
|
int len = desired_command_.size();
|
||||||
if (len < MAXLINE) {
|
if (len < MAXLINE) {
|
||||||
desired_command_ = desired_command_ + c;
|
desired_command_ = desired_command_ + c;
|
||||||
}
|
}
|
||||||
echo_command();
|
echo_command();
|
||||||
return CodepointString();
|
return std::u32string();
|
||||||
}
|
}
|
||||||
readline_lastc_ = c;
|
readline_lastc_ = c;
|
||||||
return CodepointString();
|
return std::u32string();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ReadlineDevice::print(const CodepointString &s) {
|
void ReadlineDevice::print(const std::u32string &s) {
|
||||||
if (!s.empty()) {
|
if (!s.empty()) {
|
||||||
erase_command();
|
erase_command();
|
||||||
print_cb_(s);
|
print_cb_(s);
|
||||||
@@ -202,30 +109,3 @@ void ReadlineDevice::print(const CodepointString &s) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ReadlineDevice::to_utf8(const CodepointString &s) {
|
|
||||||
std::string result(s.size() * 4, 0);
|
|
||||||
char *buffer = &result[0];
|
|
||||||
int len = 0;
|
|
||||||
for (char32_t c : s) {
|
|
||||||
int clen = buffer_codepoint_utf8(c, buffer + len);
|
|
||||||
len += clen;
|
|
||||||
}
|
|
||||||
return result.substr(0, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
CodepointString ReadlineDevice::from_utf8(std::string_view s, int *consumed) {
|
|
||||||
std::string_view rest = s;
|
|
||||||
CodepointString result(s.size(), 0);
|
|
||||||
int len = 0;
|
|
||||||
while (true) {
|
|
||||||
int32_t c = read_codepoint_utf8(rest);
|
|
||||||
if (c == -1) break; // EOF reached;
|
|
||||||
if (c == -2) continue; // Filter out bad UTF8 but continue.
|
|
||||||
result[len++] = (char32_t)c;
|
|
||||||
}
|
|
||||||
if (consumed != nullptr) {
|
|
||||||
*consumed = s.size() - rest.size();
|
|
||||||
}
|
|
||||||
return result.substr(0, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,19 +4,19 @@
|
|||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
#include "drvutil.hpp"
|
||||||
|
|
||||||
using CodepointString = std::basic_string<char32_t>;
|
|
||||||
|
|
||||||
class ReadlineDevice {
|
class ReadlineDevice {
|
||||||
public:
|
public:
|
||||||
using print_callback = void (*)(const CodepointString &text);
|
using print_callback = void (*)(const std::u32string &text);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
print_callback print_cb_;
|
print_callback print_cb_;
|
||||||
CodepointString desired_command_;
|
std::u32string desired_command_;
|
||||||
CodepointString current_command_;
|
std::u32string current_command_;
|
||||||
CodepointString desired_prompt_;
|
std::u32string desired_prompt_;
|
||||||
CodepointString current_prompt_;
|
std::u32string current_prompt_;
|
||||||
char32_t readline_lastc_;
|
char32_t readline_lastc_;
|
||||||
|
|
||||||
void erase_command();
|
void erase_command();
|
||||||
@@ -24,31 +24,19 @@ private:
|
|||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ReadlineDevice();
|
|
||||||
|
|
||||||
// The callback must be set before using the readline device.
|
// The callback must be set before using the readline device.
|
||||||
void set_print_callback(print_callback cb);
|
void set_print_callback(print_callback cb);
|
||||||
|
|
||||||
// change the prompt.
|
// change the prompt.
|
||||||
void set_prompt(const CodepointString &prompt);
|
void set_prompt(const std::u32string &prompt);
|
||||||
|
|
||||||
// Use this to print anything on the console.
|
// Use this to print anything on the console.
|
||||||
void print(const CodepointString &cps);
|
void print(const std::u32string &cps);
|
||||||
|
|
||||||
// Whenever the user types a character, call 'putcode'. If the code is
|
// Whenever the user types a character, call 'putcode'. If the code is
|
||||||
// newline, this returns the line of text that was entered, including the
|
// newline, this returns the line of text that was entered, including the
|
||||||
// newline. Otherwise returns empty string. Backspace is handled here.
|
// newline. Otherwise returns empty string. Backspace is handled here.
|
||||||
CodepointString putcode(char32_t codepoint);
|
std::u32string putcode(char32_t codepoint);
|
||||||
|
|
||||||
// This can be used to convert a codepoint string into a
|
|
||||||
// UTF8-string.
|
|
||||||
static std::string to_utf8(const CodepointString &cps);
|
|
||||||
|
|
||||||
// This can be used to convert UTF8 to a codepoint string.
|
|
||||||
// Some of the bytes may not be consumed, if the source contains
|
|
||||||
// a partial utf-8 sequence. Returns the Codepoint string and the
|
|
||||||
// number of bytes consumed.
|
|
||||||
static CodepointString from_utf8(std::string_view source, int *consumed);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user