Files
integration/luprex/ext/unicode-stuff.hpp

268 lines
8.7 KiB
C++

// This file implements unicode encoding conversions.
//
// Unicode conversions aren't that complicated. It is possible
// to implement them in a few hundred lines of code. Most unicode
// libraries are much larger because they also implement many
// other pieces of functionality. I don't need anything but
// conversions. So I implemented my own tiny library.
//
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
// Stateless collection of Unicode encoding conversions between UTF-8,
// UTF-16 / UCS-2 and UTF-32. Every member is static.
//
// The template parameters select the string types used for results:
//   U8STR  - holds UTF-8 bytes; its elements must be `char`, because the
//            encoder writes through a `char *` into the string's storage.
//   U16STR - holds 16-bit code units (UTF-16 / UCS-2).
//   U32STR - holds 32-bit codepoints.
// Each type must be constructible as T(count, 0), indexable with
// operator[], and provide size() and resize(), like std::basic_string.
//
template <class U8STR, class U16STR, class U32STR>
class UnicodeStuff
{
public:
    using u8string = U8STR;
    using u16string = U16STR;
    using u32string = U32STR;

    // Convert a single UTF32 codepoint into a UTF8 string.
    //
    // The bytes are stored in a caller-provided buffer, which must have
    // room for at least 4 bytes. No terminator is written. Returns the
    // number of bytes written (1 to 4). Returns 0, and writes nothing,
    // if the codepoint is not a valid unicode codepoint: that is, if it
    // is a surrogate (U+D800 to U+DFFF) or is greater than U+10FFFF.
    //
    static int codepoint_to_utf8(char32_t scp, char *buffer) {
        const std::uint32_t cp = static_cast<std::uint32_t>(scp);
        unsigned char *out = reinterpret_cast<unsigned char *>(buffer);
        if (cp <= 0x7F) {
            out[0] = static_cast<unsigned char>(cp);
            return 1;
        }
        if (cp <= 0x7FF) {
            out[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
            out[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 2;
        }
        if (cp <= 0xFFFF) {
            // Surrogates are reserved for UTF-16 and are not codepoints.
            if (cp >= 0xD800 && cp <= 0xDFFF) {
                return 0;
            }
            out[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
            out[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            out[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 3;
        }
        if (cp <= 0x10FFFF) {
            out[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
            out[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
            out[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            out[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 4;
        }
        return 0;
    }

    // Read a single UTF32 codepoint from a UTF16 string.
    //
    // Returns (char32_t)-1 if the string is empty. Returns (char32_t)-2
    // if the string starts with an invalid sequence: a low surrogate
    // with no preceding high surrogate, or a high surrogate that is not
    // followed by a low surrogate.
    //
    // The string-view is updated to remove the codepoint from the view.
    // On an invalid sequence, the offending first code unit (and only
    // that unit) is removed, so the caller can simply call again to
    // resynchronize.
    //
    static char32_t read_codepoint_utf16(std::u16string_view &source) {
        if (source.empty()) return kIncomplete;
        const std::uint32_t lead = source[0];
        source.remove_prefix(1);
        if (lead < 0xD800 || lead > 0xDFFF) {
            // Not a surrogate: the code unit is the codepoint.
            return static_cast<char32_t>(lead);
        }
        if (lead >= 0xDC00) {
            // Low surrogate with no high surrogate before it.
            return kInvalid;
        }
        if (source.empty()) {
            return kInvalid;
        }
        const std::uint32_t trail = source[0];
        if (trail < 0xDC00 || trail > 0xDFFF) {
            // Leave the second unit in place; it may start a valid codepoint.
            return kInvalid;
        }
        source.remove_prefix(1);
        const std::uint32_t high_bits = lead & 0x3FF;
        const std::uint32_t low_bits = trail & 0x3FF;
        return static_cast<char32_t>(((high_bits << 10) | low_bits) + 0x10000);
    }

    // Read a single UTF32 codepoint from a UTF8 string.
    //
    // If the string_view starts with a valid codepoint, the codepoint
    // is removed from the string_view and is returned.
    //
    // If the string_view is empty, returns (char32_t)-1.
    //
    // If the string_view starts with an unfinished but possibly
    // valid codepoint, returns (char32_t)-1. "Possibly valid" means the
    // lead byte is a legal lead byte and every continuation byte that
    // is already present is well-formed.
    //
    // If the string_view starts with an invalid sequence, returns
    // (char32_t)-2. Invalid sequences are: a bad lead byte, a bad
    // continuation byte, an overlong encoding, an encoded surrogate,
    // or a value above U+10FFFF.
    //
    // The string_view is left untouched unless a codepoint is returned.
    //
    static char32_t read_codepoint_utf8(std::string_view &source) {
        const std::size_t size = source.size();
        if (size == 0) return kIncomplete;
        const unsigned char *bytes =
            reinterpret_cast<const unsigned char *>(source.data());
        const unsigned char lead = bytes[0];

        std::size_t seqlen;      // total bytes announced by the lead byte
        std::uint32_t codepoint; // payload bits accumulated so far
        std::uint32_t smallest;  // smallest codepoint that needs seqlen bytes
        if ((lead & 0x80) == 0x00) {
            // U+0000 to U+007F
            seqlen = 1;
            codepoint = lead;
            smallest = 0x0;
        } else if ((lead & 0xE0) == 0xC0) {
            // U+0080 to U+07FF
            seqlen = 2;
            codepoint = lead & 0x1F;
            smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            // U+0800 to U+FFFF
            seqlen = 3;
            codepoint = lead & 0x0F;
            smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            // U+10000 to U+10FFFF
            seqlen = 4;
            codepoint = lead & 0x07;
            smallest = 0x10000;
        } else {
            // Stray continuation byte, or a lead byte UTF-8 never uses.
            return kInvalid;
        }

        // Check the continuation bytes that are present BEFORE deciding
        // the sequence is merely unfinished: a truncated sequence whose
        // visible bytes are already malformed can never become valid,
        // and reporting it as unfinished would make callers wait forever.
        const std::size_t available = (size < seqlen) ? size : seqlen;
        for (std::size_t i = 1; i < available; ++i) {
            if ((bytes[i] & 0xC0) != 0x80) return kInvalid;
            codepoint = (codepoint << 6) | (bytes[i] & 0x3Fu);
        }
        if (size < seqlen) return kIncomplete;

        // Overlong encodings (a codepoint stored in more bytes than it
        // needs) are ill-formed and must be rejected, not decoded.
        if (codepoint < smallest) return kInvalid;
        if (codepoint > 0x10FFFF) return kInvalid;
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return kInvalid;

        source.remove_prefix(seqlen);
        return static_cast<char32_t>(codepoint);
    }

    // Convert a UTF32 string into a UTF8-string.
    // If the codepoint string contains invalid codepoints, they're silently dropped.
    //
    static u8string utf32_to_utf8(const u32string &s) {
        // Worst case is 4 bytes per codepoint; trimmed to fit at the end.
        u8string result(s.size() * 4, 0);
        std::size_t len = 0;
        for (const char32_t c : s) {
            len += static_cast<std::size_t>(codepoint_to_utf8(c, &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UTF32 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped
    // (one byte at a time, until decoding resynchronizes).
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence. Returns the Codepoint string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    //
    static u32string utf8_to_utf32(std::string_view s, int *consumed) {
        std::string_view rest = s;
        // One codepoint per input byte is the worst case (pure ASCII).
        u32string result(s.size(), 0);
        std::size_t len = 0;
        for (;;) {
            const char32_t c = read_codepoint_utf8(rest);
            if (c == kIncomplete) {
                break; // End of input, or an unfinished trailing sequence.
            }
            if (c == kInvalid) {
                rest.remove_prefix(1);
                continue;
            }
            result[len++] = c;
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UCS-2 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped.
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence. Returns the UCS-2 string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    // Of course, UCS-2 can't represent all of unicode, so this is lossy.
    // Any character that can't be represented is replaced with a box (U+2610).
    //
    static u16string utf8_to_ucs2(std::string_view s, int *consumed) {
        std::string_view rest = s;
        u16string result(s.size(), 0);
        std::size_t len = 0;
        for (;;) {
            const char32_t c = read_codepoint_utf8(rest);
            if (c == kIncomplete) {
                break; // End of input, or an unfinished trailing sequence.
            }
            if (c == kInvalid) {
                rest.remove_prefix(1);
                continue;
            }
            // The reader never yields surrogates, so the only codepoints
            // that do not fit in one 16-bit unit are those above U+FFFF.
            if (c > 0xFFFF) {
                result[len++] = static_cast<char16_t>(0x2610);
            } else {
                result[len++] = static_cast<char16_t>(c);
            }
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF16 string to a UTF8 string.
    //
    // This also works for ucs2 strings. If the UTF16 string
    // contains invalid sequences, they're silently dropped.
    //
    static u8string utf16_to_utf8(std::u16string_view s) {
        // A lone code unit encodes to at most 3 bytes and a surrogate
        // pair (2 units) to 4 bytes, so 3 bytes per unit always suffices.
        u8string result(s.size() * 3, 0);
        std::size_t len = 0;
        for (;;) {
            const char32_t codepoint = read_codepoint_utf16(s);
            if (codepoint == kIncomplete) break;
            if (codepoint == kInvalid) continue; // Bad unit already consumed.
            len += static_cast<std::size_t>(
                codepoint_to_utf8(codepoint, &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Check if UTF8 is valid.
    //
    // Returns true only if the whole string decodes cleanly. A string
    // that ends in the middle of a sequence is not valid. The empty
    // string is valid.
    //
    static bool valid_utf8(std::string_view s) {
        while (!s.empty()) {
            const char32_t codepoint = read_codepoint_utf8(s);
            if (codepoint == kIncomplete || codepoint == kInvalid) return false;
        }
        return true;
    }

private:
    // Sentinels returned by the read_codepoint_* functions. Both lie far
    // above U+10FFFF, so they can never collide with a decoded codepoint.
    static constexpr char32_t kIncomplete = static_cast<char32_t>(-1);
    static constexpr char32_t kInvalid = static_cast<char32_t>(-2);
};