More refactors to prepare for doc-search, including moving unicode support into ext.
This commit is contained in:
BIN
Content/Luprex/SimpleColorMaterial.uasset
LFS
Normal file
BIN
Content/Luprex/SimpleColorMaterial.uasset
LFS
Normal file
Binary file not shown.
@@ -47,7 +47,7 @@
|
||||
"--header-insertion=never"
|
||||
],
|
||||
"C_Cpp.autocomplete": "disabled",
|
||||
"search.useIgnoreFiles": false
|
||||
"search.useIgnoreFiles": true
|
||||
},
|
||||
"extensions": {
|
||||
"recommendations": [
|
||||
|
||||
@@ -545,6 +545,24 @@ void SourceDB::register_lua_builtins() {
|
||||
lua_close(L);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Search the documentation for `substring`.
//
// Matches are gathered in a map keyed by function name, whose value is
// the documentation line to report for that function. The map's values
// are then copied into the returned vector in the map's iteration
// order. The scan over the built-in functions is still a commented-out
// placeholder, so at present nothing is ever added to the map and the
// returned vector is empty.
//
util::StringVec SourceDB::search_docs(const eng::string &substring) {
    // function name -> documentation line.
    eng::map<eng::string, eng::string> matches;

    // Search the built-in functions.
    // for (const LuaFunctionReg *reg = LuaFunctionReg::All; reg != nullptr; reg=reg->next()) {
    // }

    // Flatten the map: only the documentation lines are returned.
    util::StringVec lines;
    for (auto it = matches.begin(); it != matches.end(); ++it) {
        lines.push_back(it->second);
    }
    return lines;
}
|
||||
|
||||
eng::string SourceDB::function_docs(const LuaCoreStack &LS, LuaSlot fn) {
|
||||
lua_State *L = LS.state();
|
||||
if (LS.iscfunction(fn)) {
|
||||
|
||||
@@ -204,6 +204,13 @@ public:
|
||||
//
|
||||
eng::string function_docs(const LuaCoreStack &LS, LuaSlot slot);
|
||||
|
||||
// Search the documentation.
|
||||
//
|
||||
// Search all the documentation for the specified substring.
|
||||
// In the result, each line points to a different result.
|
||||
//
|
||||
util::StringVec search_docs(const eng::string &substring);
|
||||
|
||||
// Serialize and unserialize a source vector.
|
||||
//
|
||||
static void serialize_source(const util::LuaSourceVec &sv, StreamBuffer *sb);
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include "fast-float.hpp"
|
||||
#include "luastack.hpp"
|
||||
|
||||
#include "../../ext/unicode-stuff.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
@@ -13,7 +15,6 @@
|
||||
#include <cmath>
|
||||
#include <charconv>
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
bool case_insensitive_eq(string_view s1, string_view s2) {
|
||||
@@ -336,73 +337,16 @@ int32_t read_ascii_char(string_view &source) {
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t read_codepoint_utf8(std::string_view &source) {
|
||||
size_t size = source.size();
|
||||
if (size == 0) return -1;
|
||||
|
||||
const unsigned char *bytes = (const unsigned char *)source.data();
|
||||
int codepoint;
|
||||
size_t seqlen;
|
||||
if ((bytes[0] & 0x80) == 0x00) {
|
||||
// U+0000 to U+007F
|
||||
codepoint = (bytes[0] & 0x7F);
|
||||
seqlen = 1;
|
||||
} else if ((bytes[0] & 0xE0) == 0xC0) {
|
||||
// U+0080 to U+07FF
|
||||
codepoint = (bytes[0] & 0x1F);
|
||||
seqlen = 2;
|
||||
} else if ((bytes[0] & 0xF0) == 0xE0) {
|
||||
// U+0800 to U+FFFF
|
||||
codepoint = (bytes[0] & 0x0F);
|
||||
seqlen = 3;
|
||||
} else if ((bytes[0] & 0xF8) == 0xF0) {
|
||||
// U+10000 to U+10FFFF
|
||||
codepoint = (bytes[0] & 0x07);
|
||||
seqlen = 4;
|
||||
} else {
|
||||
// Bad character. return invalid CP.
|
||||
return -2;
|
||||
}
|
||||
|
||||
if (seqlen > size) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < seqlen; ++i) {
|
||||
if ((bytes[i] & 0xC0) != 0x80) {
|
||||
// Bad character. return invalid CP.
|
||||
return -2;
|
||||
}
|
||||
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
|
||||
}
|
||||
|
||||
if ((codepoint > 0x10FFFF) ||
|
||||
((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
|
||||
((codepoint <= 0x007F) && (seqlen != 1)) ||
|
||||
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
|
||||
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
|
||||
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
|
||||
// Bad character. return invalid CP.
|
||||
return -2;
|
||||
}
|
||||
|
||||
source.remove_prefix(seqlen);
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
bool valid_utf8(string_view s) {
|
||||
while (!s.empty()) {
|
||||
int32_t codepoint = read_codepoint_utf8(s);
|
||||
if (codepoint < 0) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool valid_number(string_view s, bool plus, bool minus, bool dec, bool exp) {
|
||||
read_number(s, plus, minus, dec, exp);
|
||||
return s.empty();
|
||||
}
|
||||
|
||||
// Instantiate the shared unicode routines (ext/unicode-stuff.hpp) for
// the engine's string types: eng::string, eng::u16string, eng::u32string.
using UC = UnicodeStuff<eng::string, eng::u16string, eng::u32string>;
|
||||
|
||||
// Forwards to UnicodeStuff::read_codepoint_utf8; the return values are
// documented in unicode-stuff.hpp.
int32_t read_codepoint_utf8(string_view &source) { return UC::read_codepoint_utf8(source); }
|
||||
// Forwards to UnicodeStuff::valid_utf8.
bool valid_utf8(string_view s) { return UC::valid_utf8(s); }
|
||||
|
||||
} // namespace sv
|
||||
|
||||
|
||||
@@ -989,6 +933,23 @@ LuaDefine(unittests_util, "", "some unit tests") {
|
||||
LuaAssert(L, read_number_x("-123e+5x", true, true, true, true) == "-123e+5");
|
||||
LuaAssert(L, read_number_x("-123e+x", true, true, true, true) == "");
|
||||
|
||||
// Test read_codepoint_utf8.
|
||||
std::string_view str("𝞮ὥπq");
|
||||
LuaAssert(L, str.size() == 10);
|
||||
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1D7AE); // 4-byte char
|
||||
LuaAssert(L, str.size() == 6);
|
||||
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x1F65); // 3-byte char
|
||||
LuaAssert(L, str.size() == 3);
|
||||
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x3C0); // 2-byte char
|
||||
LuaAssert(L, str.size() == 1);
|
||||
LuaAssert(L, sv::read_codepoint_utf8(str) == 0x71); // 1-byte char
|
||||
LuaAssert(L, str.size() == 0);
|
||||
LuaAssert(L, sv::read_codepoint_utf8(str) == -1); // EOF
|
||||
|
||||
// Test read_codepoint_utf8 on an invalid unicode sequence.
|
||||
std::string_view strbad("\xC0\xC0");
|
||||
LuaAssert(L, sv::read_codepoint_utf8(strbad) == -2);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -196,18 +196,14 @@ int32_t read_ascii_char(string_view &source);
|
||||
|
||||
// Read a UTF8 codepoint from a string_view.
|
||||
//
|
||||
// If the string_view is empty, returns -1 and doesn't update
|
||||
// the string_view.
|
||||
//
|
||||
// If the string_view contains an unfinished but possibly valid
|
||||
// codepoint, returns -1 and doesn't update the string_view.
|
||||
//
|
||||
// If the next thing in the string_view is an invalid codepoint,
|
||||
// returns -2 and doesn't update the string_view.
|
||||
// See documentation in unicode-stuff.hpp
|
||||
//
|
||||
int32_t read_codepoint_utf8(string_view &source);
|
||||
|
||||
// Return true if the string is valid utf-8.
|
||||
//
|
||||
// See documentation in unicode-stuff.hpp
|
||||
//
|
||||
bool valid_utf8(string_view s);
|
||||
|
||||
// Return true if the number conforms to the spec.
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <filesystem>
|
||||
#include "../../ext/unicode-stuff.hpp"
|
||||
|
||||
namespace drvutil {
|
||||
|
||||
@@ -74,189 +75,12 @@ bool is_single_wchar_t(char32_t c) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static int buffer_codepoint_utf8(char32_t scp, char *buffer) {
|
||||
uint32_t cp = (uint32_t)scp;
|
||||
unsigned char *c = (unsigned char *)buffer;
|
||||
if (cp < 0) {
|
||||
return 0;
|
||||
}
|
||||
else if (cp <= 0x7F) {
|
||||
c[0] = cp;
|
||||
return 1;
|
||||
}
|
||||
else if (cp <= 0x7FF) {
|
||||
c[0] = (cp>>6)+192;
|
||||
c[1] = (cp&63)+128;
|
||||
return 2;
|
||||
}
|
||||
else if (cp <= 0xFFFF) {
|
||||
if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
|
||||
return 0;
|
||||
}
|
||||
c[0] = (cp>>12)+224;
|
||||
c[1] = ((cp>>6)&63)+128;
|
||||
c[2] = (cp&63)+128;
|
||||
return 3;
|
||||
}
|
||||
else if (cp <= 0x10FFFF) {
|
||||
c[0] = (cp>>18)+240;
|
||||
c[1] = ((cp>>12)&63)+128;
|
||||
c[2] = ((cp>>6)&63)+128;
|
||||
c[3] = (cp&63)+128;
|
||||
return 4;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
using UC = UnicodeStuff<std::string, std::u16string, std::u32string>;
|
||||
|
||||
static int32_t read_codepoint_utf16(std::u16string_view &source) {
|
||||
if (source.empty()) return -1;
|
||||
|
||||
int32_t word0 = ((const uint16_t *)source.data())[0];
|
||||
source.remove_prefix(1);
|
||||
|
||||
if (word0 < 0xD800) {
|
||||
return word0;
|
||||
} else if (word0 < 0xDC00) {
|
||||
if (source.empty()) {
|
||||
return -2;
|
||||
}
|
||||
int32_t word1 = ((const uint16_t *)source.data())[0];
|
||||
if ((word1 < 0xDC00)||(word1 > 0xDFFF)) {
|
||||
return -2;
|
||||
}
|
||||
int32_t part1 = word0 & 0x3FF;
|
||||
int32_t part2 = word1 & 0x3FF;
|
||||
int32_t result = ((part1 << 10) | part2) + 0x10000;
|
||||
source.remove_prefix(1);
|
||||
return result;
|
||||
} else if (word0 < 0xE000) {
|
||||
return -2;
|
||||
} else {
|
||||
return word0;
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t read_codepoint_utf8(std::string_view &source) {
|
||||
size_t size = source.size();
|
||||
if (size == 0) return -1;
|
||||
|
||||
const unsigned char *bytes = (const unsigned char *)source.data();
|
||||
int codepoint;
|
||||
size_t seqlen;
|
||||
if ((bytes[0] & 0x80) == 0x00) {
|
||||
// U+0000 to U+007F
|
||||
codepoint = (bytes[0] & 0x7F);
|
||||
seqlen = 1;
|
||||
} else if ((bytes[0] & 0xE0) == 0xC0) {
|
||||
// U+0080 to U+07FF
|
||||
codepoint = (bytes[0] & 0x1F);
|
||||
seqlen = 2;
|
||||
} else if ((bytes[0] & 0xF0) == 0xE0) {
|
||||
// U+0800 to U+FFFF
|
||||
codepoint = (bytes[0] & 0x0F);
|
||||
seqlen = 3;
|
||||
} else if ((bytes[0] & 0xF8) == 0xF0) {
|
||||
// U+10000 to U+10FFFF
|
||||
codepoint = (bytes[0] & 0x07);
|
||||
seqlen = 4;
|
||||
} else {
|
||||
// Bad character. return invalid CP.
|
||||
return -2;
|
||||
}
|
||||
|
||||
if (seqlen > size) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < seqlen; ++i) {
|
||||
if ((bytes[i] & 0xC0) != 0x80) {
|
||||
// Bad character. return invalid CP.
|
||||
return -2;
|
||||
}
|
||||
codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
|
||||
}
|
||||
|
||||
if ((codepoint > 0x10FFFF) ||
|
||||
((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) ||
|
||||
((codepoint <= 0x007F) && (seqlen != 1)) ||
|
||||
((codepoint >= 0x0080) && (codepoint <= 0x07FF) && (seqlen != 2)) ||
|
||||
((codepoint >= 0x0800) && (codepoint <= 0xFFFF) && (seqlen != 3)) ||
|
||||
((codepoint >= 0x10000) && (codepoint <= 0x1FFFFF) && (seqlen != 4))) {
|
||||
// Bad character. return invalid CP.
|
||||
return -2;
|
||||
}
|
||||
|
||||
source.remove_prefix(seqlen);
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
std::string utf32_to_utf8(const std::u32string &s) {
|
||||
std::string result(s.size() * 4, 0);
|
||||
char *buffer = &result[0];
|
||||
int len = 0;
|
||||
for (char32_t c : s) {
|
||||
int clen = buffer_codepoint_utf8(c, buffer + len);
|
||||
len += clen;
|
||||
}
|
||||
return result.substr(0, len);
|
||||
}
|
||||
|
||||
std::u32string utf8_to_utf32(std::string_view s, int *consumed) {
|
||||
std::string_view rest = s;
|
||||
std::u32string result(s.size(), 0);
|
||||
int len = 0;
|
||||
while (true) {
|
||||
int32_t c = read_codepoint_utf8(rest);
|
||||
if (c == -1) {
|
||||
break; // EOF reached;
|
||||
} else if (c < 0) {
|
||||
rest.remove_prefix(1);
|
||||
} else {
|
||||
result[len++] = (char32_t)c;
|
||||
}
|
||||
}
|
||||
if (consumed != nullptr) {
|
||||
*consumed = s.size() - rest.size();
|
||||
}
|
||||
return result.substr(0, len);
|
||||
}
|
||||
|
||||
std::u16string utf8_to_ucs2(std::string_view s, int *consumed) {
|
||||
std::string_view rest = s;
|
||||
std::u16string result(s.size(), 0);
|
||||
int len = 0;
|
||||
while (true) {
|
||||
int32_t c = read_codepoint_utf8(rest);
|
||||
if (c == -1) {
|
||||
break; // EOF reached;
|
||||
} else if (c < 0) {
|
||||
rest.remove_prefix(1);
|
||||
} else if ((c >= 0xD800) && (c <= 0xDFFF)) {
|
||||
result[len++] = 0x2610;
|
||||
} else if (c > 0xFFFF) {
|
||||
result[len++] = 0x2610;
|
||||
} else {
|
||||
result[len++] = (char16_t)c;
|
||||
}
|
||||
}
|
||||
if (consumed != nullptr) {
|
||||
*consumed = s.size() - rest.size();
|
||||
}
|
||||
return result.substr(0, len);
|
||||
}
|
||||
|
||||
std::string utf16_to_utf8(std::u16string_view s) {
|
||||
std::string result(s.size() * 4, 0);
|
||||
int len = 0;
|
||||
while (true) {
|
||||
int codepoint = read_codepoint_utf16(s);
|
||||
if (codepoint == -1) break;
|
||||
if (codepoint < 0) continue;
|
||||
len += buffer_codepoint_utf8(codepoint, &result[len]);
|
||||
}
|
||||
return result.substr(0, len);
|
||||
}
|
||||
// The four conversions below forward to the UnicodeStuff routine of the
// same name, through the UC alias declared earlier in this file
// (UnicodeStuff over std::string, std::u16string and std::u32string).

// UTF-32 -> UTF-8.
std::string utf32_to_utf8(const std::u32string &s) { return UC::utf32_to_utf8(s); }
|
||||
// UTF-8 -> UTF-32; `consumed` is passed straight through to UnicodeStuff.
std::u32string utf8_to_utf32(std::string_view s, int *consumed) { return UC::utf8_to_utf32(s, consumed); }
|
||||
// UTF-8 -> UCS-2; `consumed` is passed straight through to UnicodeStuff.
std::u16string utf8_to_ucs2(std::string_view s, int *consumed) { return UC::utf8_to_ucs2(s, consumed); }
|
||||
// UTF-16 -> UTF-8.
std::string utf16_to_utf8(std::u16string_view s) { return UC::utf16_to_utf8(s); }
|
||||
|
||||
static std::vector<std::string> parse_control_lst(std::string_view ctrl) {
|
||||
std::vector<std::string> result;
|
||||
|
||||
@@ -59,6 +59,7 @@ void split_target(std::string_view target, std::string &cert, std::string &host,
|
||||
bool is_single_wchar_t(char32_t c);
|
||||
|
||||
// Convert a codepoint string into a UTF8-string.
|
||||
//
|
||||
// If the codepoint string contains invalid codepoints, they're silently dropped.
|
||||
//
|
||||
std::string utf32_to_utf8(const std::u32string &cps);
|
||||
|
||||
@@ -8,6 +8,8 @@ namespace eng {
|
||||
template<class C, class T=std::char_traits<C>>
|
||||
using basic_string = std::basic_string<C, T, eng::allocator<C>>;
|
||||
using string = basic_string<char>;
|
||||
using u32string = basic_string<char32_t>;
|
||||
using u16string = basic_string<char16_t>;
|
||||
} // namespace eng
|
||||
|
||||
#endif // WRAP_STRING_HPP
|
||||
|
||||
268
luprex/ext/unicode-stuff.hpp
Normal file
268
luprex/ext/unicode-stuff.hpp
Normal file
@@ -0,0 +1,268 @@
|
||||
// This file implements unicode encoding conversions.
|
||||
//
|
||||
// Unicode conversions aren't that complicated. It is possible
|
||||
// to implement them in a few hundred lines of code. Most unicode
|
||||
// libraries are much larger because they also implement many
|
||||
// other pieces of functionality. I don't need anything but
|
||||
// conversions. So I implemented my own tiny library.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
|
||||
|
||||
// Unicode encoding conversions, parameterized on the string types to
// produce. U8STR must be a string of char, U16STR a string of char16_t
// and U32STR a string of char32_t (for example the std:: strings, or
// the same strings with a custom allocator). All members are static;
// the class is never instantiated.
//
template <class U8STR, class U16STR, class U32STR>
class UnicodeStuff
{
public:
    using u8string = U8STR;
    using u16string = U16STR;
    using u32string = U32STR;

    // Convert a single UTF32 codepoint into a UTF8 string.
    //
    // The string is stored in a preallocated buffer, which must have
    // room for 4 bytes. The length of the encoded codepoint is
    // returned. If it returns 0, it means the codepoint is not a valid
    // unicode codepoint (a surrogate, or above U+10FFFF), and nothing
    // was written.
    //
    static int codepoint_to_utf8(char32_t scp, char *buffer) {
        const uint32_t cp = static_cast<uint32_t>(scp);
        unsigned char *c = reinterpret_cast<unsigned char *>(buffer);
        if (cp <= 0x7F) {
            c[0] = static_cast<unsigned char>(cp);
            return 1;
        }
        if (cp <= 0x7FF) {
            c[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
            c[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 2;
        }
        if (cp <= 0xFFFF) {
            // Surrogates are reserved for UTF16 and are not codepoints.
            if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
                return 0;
            }
            c[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
            c[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            c[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 3;
        }
        if (cp <= 0x10FFFF) {
            c[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
            c[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
            c[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            c[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 4;
        }
        return 0;
    }

    // Read a single codepoint from a UTF16 string.
    //
    // Returns -1 if the string is empty. Returns -2 if the string
    // starts with an invalid sequence (a stray low surrogate, or a
    // high surrogate that is not followed by a low surrogate).
    //
    // The string-view is updated to remove the codepoint from the view.
    // On -2, the one offending code unit is removed, so that a caller
    // can simply call again to resynchronize.
    //
    // The return type is signed so that the negative error codes can
    // be tested with (result < 0).
    //
    static int32_t read_codepoint_utf16(std::u16string_view &source) {
        if (source.empty()) return -1;

        const int32_t word0 = static_cast<int32_t>(source.front());
        source.remove_prefix(1);

        // Anything outside the surrogate range is a codepoint by itself.
        if ((word0 < 0xD800) || (word0 > 0xDFFF)) {
            return word0;
        }

        // A low surrogate can't start a sequence.
        if (word0 >= 0xDC00) {
            return -2;
        }

        // word0 is a high surrogate: it must be followed by a low one.
        if (source.empty()) {
            return -2;
        }
        const int32_t word1 = static_cast<int32_t>(source.front());
        if ((word1 < 0xDC00) || (word1 > 0xDFFF)) {
            return -2;
        }
        source.remove_prefix(1);
        return (((word0 & 0x3FF) << 10) | (word1 & 0x3FF)) + 0x10000;
    }

    // Read a single codepoint from a UTF8 string.
    //
    // If the string_view starts with a valid codepoint, the codepoint
    // is removed from the string_view and is returned.
    //
    // If the string_view is empty, returns -1.
    //
    // If the string_view starts with an unfinished but possibly
    // valid codepoint, returns -1.
    //
    // If the string_view starts with an invalid sequence, returns -2.
    // That includes a bad lead byte, a bad continuation byte (even if
    // the sequence is also truncated), an overlong encoding, a
    // surrogate, and anything above U+10FFFF.
    //
    // The string_view is not modified when a negative value is returned.
    //
    static int32_t read_codepoint_utf8(std::string_view &source) {
        const size_t size = source.size();
        if (size == 0) return -1;

        const unsigned char *bytes =
            reinterpret_cast<const unsigned char *>(source.data());
        const unsigned char lead = bytes[0];

        int32_t codepoint;
        int32_t smallest; // smallest codepoint that needs seqlen bytes.
        size_t seqlen;
        if ((lead & 0x80) == 0x00) {
            // U+0000 to U+007F
            codepoint = lead;
            smallest = 0x0000;
            seqlen = 1;
        } else if ((lead & 0xE0) == 0xC0) {
            // U+0080 to U+07FF
            codepoint = (lead & 0x1F);
            smallest = 0x0080;
            seqlen = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // U+0800 to U+FFFF
            codepoint = (lead & 0x0F);
            smallest = 0x0800;
            seqlen = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            // U+10000 to U+10FFFF
            codepoint = (lead & 0x07);
            smallest = 0x10000;
            seqlen = 4;
        } else {
            // Stray continuation byte, or a byte that never occurs in UTF8.
            return -2;
        }

        // Check the continuation bytes that are present, even when the
        // sequence is truncated: a truncated sequence is only "possibly
        // valid" if everything seen so far is well-formed.
        const size_t available = (size < seqlen) ? size : seqlen;
        for (size_t i = 1; i < available; ++i) {
            if ((bytes[i] & 0xC0) != 0x80) {
                return -2;
            }
            codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
        }
        if (size < seqlen) {
            return -1;
        }

        // Reject overlong encodings: every codepoint has exactly one
        // valid encoding, the shortest one.
        if (codepoint < smallest) {
            return -2;
        }
        if (codepoint > 0x10FFFF) {
            return -2;
        }
        if ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) {
            return -2;
        }

        source.remove_prefix(seqlen);
        return codepoint;
    }

    // Convert a codepoint string into a UTF8-string.
    // If the codepoint string contains invalid codepoints, they're silently dropped.
    //
    static u8string utf32_to_utf8(const u32string &s) {
        // Worst case is 4 bytes per codepoint; trimmed afterward.
        u8string result(s.size() * 4, 0);
        size_t len = 0;
        for (char32_t c : s) {
            len += static_cast<size_t>(codepoint_to_utf8(c, &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UTF32 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped.
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence. Returns the Codepoint string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    //
    static u32string utf8_to_utf32(std::string_view s, int *consumed) {
        std::string_view rest = s;
        // At most one codepoint per input byte; trimmed afterward.
        u32string result(s.size(), 0);
        size_t len = 0;
        while (true) {
            const int32_t c = read_codepoint_utf8(rest);
            if (c == -1) {
                break; // End of input, or an unfinished trailing sequence.
            } else if (c < 0) {
                rest.remove_prefix(1); // Skip one bad byte and resynchronize.
            } else {
                result[len++] = static_cast<char32_t>(c);
            }
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UCS-2 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped.
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence. Returns the UCS-2 string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    // Of course, UCS-2 can't represent all of unicode, so this is lossy.
    // Any character that can't be represented is replaced with a box.
    //
    static u16string utf8_to_ucs2(std::string_view s, int *consumed) {
        const char16_t box = 0x2610; // U+2610 BALLOT BOX

        std::string_view rest = s;
        // At most one code unit per input byte; trimmed afterward.
        u16string result(s.size(), 0);
        size_t len = 0;
        while (true) {
            const int32_t c = read_codepoint_utf8(rest);
            if (c == -1) {
                break; // End of input, or an unfinished trailing sequence.
            } else if (c < 0) {
                rest.remove_prefix(1); // Skip one bad byte and resynchronize.
            } else if (c > 0xFFFF) {
                // Outside the BMP. (Surrogates need no special case here:
                // read_codepoint_utf8 never returns one.)
                result[len++] = box;
            } else {
                result[len++] = static_cast<char16_t>(c);
            }
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF16 string to a UTF8 string.
    //
    // This also works for ucs2 strings. If the UTF16 string
    // contains invalid sequences, they're silently dropped.
    //
    static u8string utf16_to_utf8(std::u16string_view s) {
        // One code unit yields at most 3 bytes, and a surrogate pair
        // (2 units) yields 4, so 3 bytes per unit is always enough.
        u8string result(s.size() * 3, 0);
        size_t len = 0;
        while (true) {
            const int32_t codepoint = read_codepoint_utf16(s);
            if (codepoint == -1) break;
            if (codepoint < 0) continue; // Bad unit was already consumed.
            len += static_cast<size_t>(
                codepoint_to_utf8(static_cast<char32_t>(codepoint), &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Check if UTF8 is valid.
    //
    // Returns false if the string contains an invalid sequence or ends
    // in the middle of a sequence. The empty string is valid.
    //
    static bool valid_utf8(std::string_view s) {
        while (!s.empty()) {
            if (read_codepoint_utf8(s) < 0) return false;
        }
        return true;
    }
};
|
||||
Reference in New Issue
Block a user