// This file implements unicode encoding conversions.
//
// Unicode conversions aren't that complicated. It is possible
// to implement them in a few hundred lines of code. Most unicode
// libraries are much larger because they also implement many
// other pieces of functionality. I don't need anything but
// conversions. So I implemented my own tiny library.
//
#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>

// Collection of static UTF-8 / UTF-16 / UTF-32 conversion routines.
//
// The three template parameters select the string types that the
// conversion functions return (and, for utf32_to_utf8, accept).
//
// NOTE(review): the template parameter list was missing from the text this
// was reconstructed from; the names come from the aliases below and the
// order (U8STR, U16STR, U32STR) is assumed -- confirm against callers.
//
template <class U8STR, class U16STR, class U32STR>
class UnicodeStuff {
private:
    // Sentinel values returned by the read_codepoint_* functions.
    // They are -1 and -2 converted to char32_t, so a caller that stores
    // the result in a signed 32-bit integer sees -1 / -2 on the usual
    // two's-complement platforms.
    static constexpr char32_t eof_marker = static_cast<char32_t>(-1);
    static constexpr char32_t invalid_marker = static_cast<char32_t>(-2);

    // Largest valid unicode codepoint.
    static constexpr uint32_t max_codepoint = 0x10FFFF;

    // True if cp lies in the UTF-16 surrogate range, which is never a
    // valid scalar value on its own.
    static constexpr bool is_surrogate(uint32_t cp) {
        return (cp >= 0xD800) && (cp <= 0xDFFF);
    }

public:
    using u8string = U8STR;
    using u16string = U16STR;
    using u32string = U32STR;

    // Convert a single UTF32 codepoint into a UTF8 string.
    //
    // The string is stored in a preallocated buffer, which must have room
    // for at least 4 bytes. No terminator is written. The length of the
    // encoded codepoint (1 to 4) is returned. If it returns 0, it means the
    // codepoint is not a valid unicode codepoint (a surrogate, or a value
    // above U+10FFFF) and nothing was written.
    //
    static int codepoint_to_utf8(char32_t scp, char *buffer) {
        const uint32_t cp = static_cast<uint32_t>(scp);
        unsigned char *out = reinterpret_cast<unsigned char *>(buffer);

        if (cp <= 0x7F) {
            out[0] = static_cast<unsigned char>(cp);
            return 1;
        }
        if (cp <= 0x7FF) {
            out[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
            out[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 2;
        }
        if (cp <= 0xFFFF) {
            if (is_surrogate(cp)) {
                return 0;
            }
            out[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
            out[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            out[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 3;
        }
        if (cp <= max_codepoint) {
            out[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
            out[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
            out[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            out[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 4;
        }
        return 0;
    }

    // Read a single UTF32 codepoint from a UTF16 string.
    //
    // Returns -1 (as char32_t) if the string is empty. Returns -2 (as
    // char32_t) if the string starts with an invalid sequence: a lone low
    // surrogate, or a high surrogate that is not followed by a low one.
    //
    // The string-view is updated to remove the codepoint from the view.
    // On an invalid sequence exactly one code unit is removed, so repeated
    // calls always make progress.
    //
    static char32_t read_codepoint_utf16(std::u16string_view &source) {
        if (source.empty()) return eof_marker;

        const uint32_t word0 = source[0];
        source.remove_prefix(1);

        // Anything outside the surrogate range is a complete codepoint.
        if (!is_surrogate(word0)) return static_cast<char32_t>(word0);

        // A low surrogate cannot start a pair.
        if (word0 >= 0xDC00) return invalid_marker;

        // High surrogate: it must be followed by a low surrogate.
        if (source.empty()) return invalid_marker;
        const uint32_t word1 = source[0];
        if ((word1 < 0xDC00) || (word1 > 0xDFFF)) return invalid_marker;
        source.remove_prefix(1);

        const uint32_t high_bits = word0 & 0x3FF;
        const uint32_t low_bits = word1 & 0x3FF;
        return static_cast<char32_t>(((high_bits << 10) | low_bits) + 0x10000);
    }

    // Read a single UTF32 codepoint from a UTF8 string.
    //
    // If the string_view starts with a valid codepoint, the codepoint
    // is removed from the string_view and is returned.
    //
    // If the string_view is empty, returns -1 (as char32_t).
    //
    // If the string_view starts with an unfinished but possibly
    // valid codepoint, returns -1 (as char32_t).
    //
    // If the string_view starts with a finished but invalid codepoint,
    // or with an unfinished sequence whose available bytes already make
    // it invalid, returns -2 (as char32_t).
    //
    // Overlong encodings, surrogates and values above U+10FFFF are
    // invalid. The string_view is left untouched on -1 and -2.
    //
    static char32_t read_codepoint_utf8(std::string_view &source) {
        const size_t size = source.size();
        if (size == 0) return eof_marker;

        const unsigned char *bytes =
            reinterpret_cast<const unsigned char *>(source.data());
        const unsigned char lead = bytes[0];

        size_t seqlen;       // total bytes in the sequence
        uint32_t codepoint;  // payload bits accumulated so far
        uint32_t smallest;   // smallest codepoint that needs seqlen bytes
        if ((lead & 0x80) == 0x00) {
            // U+0000 to U+007F
            seqlen = 1;
            codepoint = lead;
            smallest = 0x0;
        } else if ((lead & 0xE0) == 0xC0) {
            // U+0080 to U+07FF
            seqlen = 2;
            codepoint = lead & 0x1F;
            smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            // U+0800 to U+FFFF
            seqlen = 3;
            codepoint = lead & 0x0F;
            smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            // U+10000 to U+10FFFF
            seqlen = 4;
            codepoint = lead & 0x07;
            smallest = 0x10000;
        } else {
            // Stray continuation byte or a lead byte that is never legal.
            return invalid_marker;
        }

        // Check the continuation bytes that are actually present before
        // deciding the sequence is merely unfinished: a truncated sequence
        // that already contains a bad byte can never become valid.
        const size_t available = (size < seqlen) ? size : seqlen;
        for (size_t i = 1; i < available; ++i) {
            if ((bytes[i] & 0xC0) != 0x80) return invalid_marker;
            codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
        }
        if (size < seqlen) return eof_marker;

        // Reject overlong forms (a value encoded with more bytes than
        // necessary), out-of-range values and surrogates.
        if (codepoint < smallest) return invalid_marker;
        if (codepoint > max_codepoint) return invalid_marker;
        if (is_surrogate(codepoint)) return invalid_marker;

        source.remove_prefix(seqlen);
        return static_cast<char32_t>(codepoint);
    }

    // Convert a UTF32 string into a UTF8-string.
    // If the codepoint string contains invalid codepoints, they're silently dropped.
    //
    // The bytes are written straight into the result via codepoint_to_utf8,
    // so u8string's element type has to be char.
    //
    static u8string utf32_to_utf8(const u32string &s) {
        // Worst case is 4 bytes per codepoint; trimmed afterwards.
        u8string result(s.size() * 4, 0);
        size_t len = 0;
        for (char32_t c : s) {
            len += static_cast<size_t>(codepoint_to_utf8(c, &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UTF32 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped
    // (one byte at a time, so decoding resynchronizes on the next valid byte).
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence. Returns the Codepoint string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    //
    static u32string utf8_to_utf32(std::string_view s, int *consumed) {
        std::string_view rest = s;
        // Every codepoint takes at least one byte, so s.size() is enough.
        u32string result(s.size(), 0);
        size_t len = 0;
        while (true) {
            const char32_t c = read_codepoint_utf8(rest);
            if (c == eof_marker) {
                break;  // end of input, or an unfinished trailing sequence
            }
            if (c == invalid_marker) {
                rest.remove_prefix(1);
                continue;
            }
            result[len++] = c;
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UCS-2 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped.
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence. Returns the UCS-2 string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    // Of course, UCS-2 can't represent all of unicode, so this is lossy.
    // Any character that can't be represented is replaced with a box (U+2610).
    //
    static u16string utf8_to_ucs2(std::string_view s, int *consumed) {
        std::string_view rest = s;
        u16string result(s.size(), 0);
        size_t len = 0;
        while (true) {
            const char32_t c = read_codepoint_utf8(rest);
            if (c == eof_marker) {
                break;  // end of input, or an unfinished trailing sequence
            }
            if (c == invalid_marker) {
                rest.remove_prefix(1);
                continue;
            }
            // The decoder never yields surrogates, so the only codepoints
            // that don't fit are the ones above the 16-bit range.
            if (c > 0xFFFF) {
                result[len++] = 0x2610;
            } else {
                result[len++] = static_cast<char16_t>(c);
            }
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF16 string to a UTF8 string.
    //
    // This also works for ucs2 strings. If the UTF16 string
    // contains invalid sequences, they're silently dropped.
    //
    // The bytes are written straight into the result via codepoint_to_utf8,
    // so u8string's element type has to be char.
    //
    static u8string utf16_to_utf8(std::u16string_view s) {
        // One code unit yields at most 3 bytes and a surrogate pair yields 4,
        // so 4 bytes per code unit is a safe upper bound.
        u8string result(s.size() * 4, 0);
        size_t len = 0;
        while (true) {
            const char32_t codepoint = read_codepoint_utf16(s);
            if (codepoint == eof_marker) break;
            if (codepoint == invalid_marker) continue;  // already skipped by the reader
            len += static_cast<size_t>(codepoint_to_utf8(codepoint, &result[len]));
        }
        result.resize(len);
        return result;
    }

    // Check if UTF8 is valid.
    //
    // Returns true only if the whole string decodes without error. An
    // unfinished sequence at the end makes the string invalid. The empty
    // string is valid.
    //
    static bool valid_utf8(std::string_view s) {
        while (!s.empty()) {
            const char32_t codepoint = read_codepoint_utf8(s);
            // Both sentinels lie above the largest valid codepoint.
            if (codepoint > max_codepoint) return false;
        }
        return true;
    }
};