// luprex/ext/unicode-stuff.hpp

// This file implements unicode encoding conversions.
//
// Unicode conversions aren't that complicated.  It is possible
// to implement them in a few hundred lines of code.  Most unicode
// libraries are much larger because they also implement many
// other pieces of functionality.  I don't need anything but
// conversions.  So I implemented my own tiny library.
//

#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>

// Stateless collection of Unicode encoding conversions between UTF-8,
// UTF-16 / UCS-2 and UTF-32.  Every member is static; the class is only a
// way to bind the three string types used for returned values:
//
//   U8STR  - holds UTF-8 bytes.  The encoder writes through a 'char *'
//            obtained from '&result[0]', so its elements must be 'char'.
//   U16STR - holds UTF-16 / UCS-2 code units.
//   U32STR - holds UTF-32 codepoints.
//
// Each type must be constructible from (count, fill value) and provide
// operator[], size() and resize(), as std::basic_string does.
//
// The read_codepoint_* functions report failure in-band: they return -1 or
// -2 converted to char32_t (0xFFFFFFFF / 0xFFFFFFFE).  Neither value is a
// valid codepoint, so they can never be confused with decoded data.
//
template <class U8STR, class U16STR, class U32STR>
class UnicodeStuff
{
public:
    using u8string = U8STR;
    using u16string = U16STR;
    using u32string = U32STR;

    // Convert a single UTF32 codepoint into a UTF8 sequence.
    //
    // The bytes are stored in a preallocated buffer, which must have room
    // for up to 4 bytes; no terminator is written.  Returns the number of
    // bytes written (1 to 4).  Returns 0, and writes nothing, if the value
    // is not a valid unicode codepoint: a UTF-16 surrogate (U+D800 to
    // U+DFFF) or anything above U+10FFFF.
    //
    static int codepoint_to_utf8(char32_t scp, char *buffer) {
        const uint32_t cp = static_cast<uint32_t>(scp);
        unsigned char *c = reinterpret_cast<unsigned char *>(buffer);

        if (cp <= 0x7F) {
            c[0] = static_cast<unsigned char>(cp);
            return 1;
        }
        if (cp <= 0x7FF) {
            c[0] = static_cast<unsigned char>(0xC0 | (cp >> 6));
            c[1] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 2;
        }
        if (cp <= 0xFFFF) {
            if ((cp >= 0xD800) && (cp <= 0xDFFF)) {
                return 0;  // Surrogates are not encodable in UTF-8.
            }
            c[0] = static_cast<unsigned char>(0xE0 | (cp >> 12));
            c[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            c[2] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 3;
        }
        if (cp <= 0x10FFFF) {
            c[0] = static_cast<unsigned char>(0xF0 | (cp >> 18));
            c[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F));
            c[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F));
            c[3] = static_cast<unsigned char>(0x80 | (cp & 0x3F));
            return 4;
        }
        return 0;
    }

    // Read a single UTF32 codepoint from a UTF16 string.
    //
    // Returns -1 if the string is empty.  Returns -2 if the string
    // starts with an invalid sequence: a low surrogate with no preceding
    // high surrogate, or a high surrogate that is not followed by a low
    // surrogate (including a high surrogate that is the last unit).
    //
    // The string-view is updated to remove the codepoint from the view.
    // On -2 exactly one code unit (the offending one) is removed, so a
    // caller that simply keeps reading always makes progress.
    //
    static char32_t read_codepoint_utf16(std::u16string_view &source) {
        if (source.empty()) return END_OF_INPUT;

        const uint32_t word0 = source.front();
        source.remove_prefix(1);

        if ((word0 < 0xD800) || (word0 > 0xDFFF)) {
            return word0;  // Ordinary BMP code unit: it is the codepoint.
        }
        if (word0 >= 0xDC00) {
            return INVALID_SEQUENCE;  // Low surrogate without a high one.
        }
        if (source.empty()) {
            return INVALID_SEQUENCE;  // High surrogate at the very end.
        }

        const uint32_t word1 = source.front();
        if ((word1 < 0xDC00) || (word1 > 0xDFFF)) {
            // Not a low surrogate.  Leave it in the view: it may start a
            // perfectly good codepoint of its own.
            return INVALID_SEQUENCE;
        }
        source.remove_prefix(1);
        return (((word0 & 0x3FF) << 10) | (word1 & 0x3FF)) + 0x10000;
    }

    // Read a single UTF32 codepoint from a UTF8 string.
    //
    // If the string_view starts with a valid codepoint, the codepoint
    // is removed from the string_view and is returned.
    //
    // If the string_view is empty, returns -1.
    //
    // If the string_view starts with an unfinished but possibly
    // valid codepoint (every byte present so far is acceptable, but the
    // view ends before the sequence does), returns -1.
    //
    // If the string_view starts with an invalid sequence, returns -2.
    // That covers a stray continuation byte, a lead byte of 0xF8 or above,
    // a wrong continuation byte, an overlong (non-shortest) encoding, an
    // encoded surrogate, and a value above U+10FFFF.
    //
    // On -1 and -2 the string_view is left untouched.
    //
    static char32_t read_codepoint_utf8(std::string_view &source) {
        const size_t size = source.size();
        if (size == 0) return END_OF_INPUT;

        const unsigned char *bytes =
            reinterpret_cast<const unsigned char *>(source.data());
        const unsigned char lead = bytes[0];

        // Classify the lead byte: total sequence length, the payload bits
        // it carries, and the smallest codepoint that genuinely needs a
        // sequence of that length.
        size_t seqlen;
        uint32_t codepoint;
        uint32_t smallest;
        if (lead < 0x80) {
            // U+0000 to U+007F
            seqlen = 1;
            codepoint = lead;
            smallest = 0;
        } else if ((lead & 0xE0) == 0xC0) {
            // U+0080 to U+07FF
            seqlen = 2;
            codepoint = lead & 0x1F;
            smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            // U+0800 to U+FFFF
            seqlen = 3;
            codepoint = lead & 0x0F;
            smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            // U+10000 to U+10FFFF
            seqlen = 4;
            codepoint = lead & 0x07;
            smallest = 0x10000;
        } else {
            return INVALID_SEQUENCE;
        }

        // Check continuation bytes in order, so that a byte which is
        // already wrong is reported as invalid even when the view is
        // shorter than the full sequence.  Only a prefix that could still
        // be completed is reported as unfinished.
        for (size_t i = 1; i < seqlen; ++i) {
            if (i >= size) return END_OF_INPUT;
            if ((bytes[i] & 0xC0) != 0x80) return INVALID_SEQUENCE;
            codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
        }

        // Overlong forms (e.g. C0 80 for U+0000) are ill-formed UTF-8 and
        // must not be accepted as an alternative spelling.
        if (codepoint < smallest) return INVALID_SEQUENCE;
        if (codepoint > 0x10FFFF) return INVALID_SEQUENCE;
        if ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) {
            return INVALID_SEQUENCE;
        }

        source.remove_prefix(seqlen);
        return codepoint;
    }

    // Convert a UTF32 string into a UTF8-string.
    // If the codepoint string contains invalid codepoints, they're silently dropped.
    //
    static u8string utf32_to_utf8(const u32string &s) {
        // Worst case is 4 bytes per codepoint; trimmed to size at the end.
        u8string result(s.size() * 4, 0);
        char *buffer = &result[0];
        size_t len = 0;
        for (char32_t c : s) {
            // Returns 0 for an invalid codepoint, which drops it.
            len += static_cast<size_t>(codepoint_to_utf8(c, buffer + len));
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UTF32 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped.
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence.  Returns the Codepoint string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    //
    static u32string utf8_to_utf32(std::string_view s, int *consumed) {
        std::string_view rest = s;
        // Every codepoint takes at least one byte, so s.size() is enough.
        u32string result(s.size(), 0);
        size_t len = 0;
        while (true) {
            const char32_t c = read_codepoint_utf8(rest);
            if (c == END_OF_INPUT) {
                break;  // Nothing left, or only an unfinished tail.
            }
            if (c == INVALID_SEQUENCE) {
                rest.remove_prefix(1);  // Skip one byte and resynchronize.
                continue;
            }
            result[len++] = c;
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF8 string to a UCS-2 string.
    //
    // If the UTF8 string contains invalid sequences, they're silently dropped.
    // Some of the bytes may not be consumed, if the source ends with an unfinished
    // utf-8 sequence.  Returns the UCS-2 string and the number of bytes consumed.
    // You may pass nullptr for consumed if you don't care how many bytes were
    // consumed.
    // Of course, UCS-2 can't represent all of unicode, so this is lossy.
    // Any character that can't be represented (anything above U+FFFF) is
    // replaced with a box (U+2610).
    //
    static u16string utf8_to_ucs2(std::string_view s, int *consumed) {
        std::string_view rest = s;
        // Every codepoint takes at least one byte, so s.size() is enough.
        u16string result(s.size(), 0);
        size_t len = 0;
        while (true) {
            const char32_t c = read_codepoint_utf8(rest);
            if (c == END_OF_INPUT) {
                break;  // Nothing left, or only an unfinished tail.
            }
            if (c == INVALID_SEQUENCE) {
                rest.remove_prefix(1);  // Skip one byte and resynchronize.
                continue;
            }
            // The decoder never yields surrogates, so being outside the
            // BMP is the only way a codepoint can fail to fit.
            if (c > 0xFFFF) {
                result[len++] = UCS2_BOX;
            } else {
                result[len++] = static_cast<char16_t>(c);
            }
        }
        if (consumed != nullptr) {
            *consumed = static_cast<int>(s.size() - rest.size());
        }
        result.resize(len);
        return result;
    }

    // Convert a UTF16 string to a UTF8 string.
    //
    // This also works for ucs2 strings.  If the UTF16 string
    // contains invalid sequences, they're silently dropped.
    //
    static u8string utf16_to_utf8(std::u16string_view s) {
        // A single unit expands to at most 3 bytes and a surrogate pair
        // (2 units) to 4, so 4 bytes per unit is a safe upper bound.
        u8string result(s.size() * 4, 0);
        char *buffer = &result[0];
        size_t len = 0;
        while (true) {
            const char32_t codepoint = read_codepoint_utf16(s);
            if (codepoint == END_OF_INPUT) break;
            // The reader already removed the bad unit from the view.
            if (codepoint == INVALID_SEQUENCE) continue;
            len += static_cast<size_t>(codepoint_to_utf8(codepoint, buffer + len));
        }
        result.resize(len);
        return result;
    }

    // Check if UTF8 is valid.
    //
    // Returns true only if the whole string decodes cleanly.  A string
    // that ends in the middle of a sequence is not valid.
    //
    static bool valid_utf8(std::string_view s) {
        while (!s.empty()) {
            const char32_t codepoint = read_codepoint_utf8(s);
            if ((codepoint == END_OF_INPUT) || (codepoint == INVALID_SEQUENCE)) {
                return false;
            }
        }
        return true;
    }

private:
    // In-band status values of the read_codepoint_* functions: the
    // documented -1 and -2, expressed in the functions' return type.
    static constexpr char32_t END_OF_INPUT = static_cast<char32_t>(-1);
    static constexpr char32_t INVALID_SEQUENCE = static_cast<char32_t>(-2);

    // U+2610, the box substituted for characters UCS-2 cannot hold.
    static constexpr char16_t UCS2_BOX = 0x2610;
};
More refactors to prepare for doc-search, including moving unicode support into ext. 2026-01-14 12:30:44 -05:00			`// This file implements unicode encoding conversions.`
			`//`
			`// Unicode conversions aren't that complicated. It is possible`
			`// to implement them in a few hundred lines of code. Most unicode`
			`// libraries are much larger because they also implement many`
			`// other pieces of functionality. I don't need anything but`
			`// conversions. So I implemented my own tiny library.`
			`//`

			`#pragma once`

			`#include <string>`
			`#include <string_view>`

			`template <class U8STR, class U16STR, class U32STR>`
			`class UnicodeStuff`
			`{`
			`public:`
			`using u8string = U8STR;`
			`using u16string = U16STR;`
			`using u32string = U32STR;`

			`// Convert a single UTF32 codepoint into a UTF8 string.`
			`//`
			`// The string is stored in a preallocated buffer. The length of the`
			`// codepoint is returned. If it returns 0, it means the codepoint is`
			`// not a valid unicode codepoint.`
			`//`
			`static int codepoint_to_utf8(char32_t scp, char *buffer) {`
			`uint32_t cp = (uint32_t)scp;`
			`unsigned char c = (unsigned char )buffer;`
			`if (cp < 0) {`
			`return 0;`
			`}`
			`else if (cp <= 0x7F) {`
			`c[0] = cp;`
			`return 1;`
			`}`
			`else if (cp <= 0x7FF) {`
			`c[0] = (cp>>6)+192;`
			`c[1] = (cp&63)+128;`
			`return 2;`
			`}`
			`else if (cp <= 0xFFFF) {`
			`if ((cp >= 0xD800) && (cp <= 0xDFFF)) {`
			`return 0;`
			`}`
			`c[0] = (cp>>12)+224;`
			`c[1] = ((cp>>6)&63)+128;`
			`c[2] = (cp&63)+128;`
			`return 3;`
			`}`
			`else if (cp <= 0x10FFFF) {`
			`c[0] = (cp>>18)+240;`
			`c[1] = ((cp>>12)&63)+128;`
			`c[2] = ((cp>>6)&63)+128;`
			`c[3] = (cp&63)+128;`
			`return 4;`
			`} else {`
			`return 0;`
			`}`
			`}`

The docsearch function is now working. 2026-01-14 14:34:54 -05:00			`// Read a single UTF32 codepoint from a UTF16 string.`
More refactors to prepare for doc-search, including moving unicode support into ext. 2026-01-14 12:30:44 -05:00			`//`
			`// Returns -1 if the string is empty. Returns -2 if the string`
			`// starts with an invalid sequence.`
			`//`
			`// The string-view is updated to remove the codepoint from the view.`
			`//`
			`static char32_t read_codepoint_utf16(std::u16string_view &source) {`
			`if (source.empty()) return -1;`

			`int32_t word0 = ((const uint16_t *)source.data())[0];`
			`source.remove_prefix(1);`

			`if (word0 < 0xD800) {`
			`return word0;`
			`} else if (word0 < 0xDC00) {`
			`if (source.empty()) {`
			`return -2;`
			`}`
			`int32_t word1 = ((const uint16_t *)source.data())[0];`
			`if ((word1 < 0xDC00)\|\|(word1 > 0xDFFF)) {`
			`return -2;`
			`}`
			`int32_t part1 = word0 & 0x3FF;`
			`int32_t part2 = word1 & 0x3FF;`
			`int32_t result = ((part1 << 10) \| part2) + 0x10000;`
			`source.remove_prefix(1);`
			`return result;`
			`} else if (word0 < 0xE000) {`
			`return -2;`
			`} else {`
			`return word0;`
			`}`
			`}`

The docsearch function is now working. 2026-01-14 14:34:54 -05:00			`// Read a single UTF32 codepoint from a UTF8 string.`
More refactors to prepare for doc-search, including moving unicode support into ext. 2026-01-14 12:30:44 -05:00			`//`
			`// If the string_view starts with a valid codepoint, the codepoint`
			`// is removed from the string_view and is returned.`
			`//`
			`// If the string_view is empty, returns -1.`
			`//`
			`// If the string_view starts with an unfinished but possibly`
			`// valid codepoint, returns -1.`
			`//`
			`// If the string_view starts with a finish but invalid codepoint,`
			`// returns -2.`
			`//`
The docsearch function is now working. 2026-01-14 14:34:54 -05:00			`static char32_t read_codepoint_utf8(std::string_view &source) {`
More refactors to prepare for doc-search, including moving unicode support into ext. 2026-01-14 12:30:44 -05:00			`size_t size = source.size();`
			`if (size == 0) return -1;`

			`const unsigned char bytes = (const unsigned char )source.data();`

			`int codepoint;`
			`size_t seqlen;`
			`if ((bytes[0] & 0x80) == 0x00) {`
			`// U+0000 to U+007F`
			`codepoint = (bytes[0] & 0x7F);`
			`seqlen = 1;`
			`} else if ((bytes[0] & 0xE0) == 0xC0) {`
			`// U+0080 to U+07FF`
			`if (size < 2) return -1;`
			`if ((bytes[1] & 0xC0) != 0x80) return -2;`
			`codepoint = (bytes[0] & 0x1F);`
			`codepoint = (codepoint << 6) \| (bytes[1] & 0x3F);`
			`seqlen = 2;`
			`} else if ((bytes[0] & 0xF0) == 0xE0) {`
			`// U+0800 to U+FFFF`
			`if (size < 3) return -1;`
			`if ((bytes[1] & 0xC0) != 0x80) return -2;`
			`if ((bytes[2] & 0xC0) != 0x80) return -2;`
			`codepoint = (bytes[0] & 0x0F);`
			`codepoint = (codepoint << 6) \| (bytes[1] & 0x3F);`
			`codepoint = (codepoint << 6) \| (bytes[2] & 0x3F);`
			`seqlen = 3;`
			`} else if ((bytes[0] & 0xF8) == 0xF0) {`
			`// U+10000 to U+10FFFF`
			`if (size < 4) return -1;`
			`if ((bytes[1] & 0xC0) != 0x80) return -2;`
			`if ((bytes[2] & 0xC0) != 0x80) return -2;`
			`if ((bytes[3] & 0xC0) != 0x80) return -2;`
			`codepoint = (bytes[0] & 0x07);`
			`codepoint = (codepoint << 6) \| (bytes[1] & 0x3F);`
			`codepoint = (codepoint << 6) \| (bytes[2] & 0x3F);`
			`codepoint = (codepoint << 6) \| (bytes[3] & 0x3F);`
			`if (codepoint >= 0x110000) return -2;`
			`seqlen = 4;`
			`} else {`
			`return -2;`
			`}`

			`if ((codepoint >= 0xD800) && (codepoint <= 0xDFFF)) {`
			`return -2;`
			`}`

			`source.remove_prefix(seqlen);`
			`return codepoint;`
			`}`

The docsearch function is now working. 2026-01-14 14:34:54 -05:00			`// Convert a UTF32 string into a UTF8-string.`
More refactors to prepare for doc-search, including moving unicode support into ext. 2026-01-14 12:30:44 -05:00			`// If the codepoint string contains invalid codepoints, they're silently dropped.`
			`//`
			`static u8string utf32_to_utf8(const u32string &s) {`
			`u8string result(s.size() * 4, 0);`
			`char *buffer = &result[0];`
			`int len = 0;`
			`for (char32_t c : s) {`
			`int clen = codepoint_to_utf8(c, buffer + len);`
			`len += clen;`
			`}`
			`result.resize(len);`
			`return result;`
			`}`

			`// Convert a UTF8 string to a UTF32 string.`
			`//`
			`// If the UTF8 string contains invalid sequences, they're silently dropped.`
			`// Some of the bytes may not be consumed, if the source ends with an unfinished`
			`// utf-8 sequence. Returns the Codepoint string and the number of bytes consumed.`
			`// You may pass nullptr for consumed if you don't care how many bytes were`
			`// consumed.`
			`//`
			`static u32string utf8_to_utf32(std::string_view s, int *consumed) {`
			`std::string_view rest = s;`
			`u32string result(s.size(), 0);`
			`int len = 0;`
			`while (true) {`
			`int32_t c = read_codepoint_utf8(rest);`
			`if (c == -1) {`
			`break; // EOF reached;`
			`} else if (c < 0) {`
			`rest.remove_prefix(1);`
			`} else {`
			`result[len++] = (char32_t)c;`
			`}`
			`}`
			`if (consumed != nullptr) {`
			`*consumed = s.size() - rest.size();`
			`}`
			`result.resize(len);`
			`return result;`
			`}`

			`// Convert a UTF8 string to a UCS-2 string.`
			`//`
			`// If the UTF8 string contains invalid sequences, they're silently dropped.`
			`// Some of the bytes may not be consumed, if the source ends with an unfinished`
			`// utf-8 sequence. Returns the UCS-2 string and the number of bytes consumed.`
			`// Of course, UCS-2 can't represent all of unicode, so this is lossy.`
			`// Any character that can't be represented is replaced with a box.`
			`//`
			`static u16string utf8_to_ucs2(std::string_view s, int *consumed) {`
			`std::string_view rest = s;`
			`u16string result(s.size(), 0);`
			`int len = 0;`
			`while (true) {`
			`int32_t c = read_codepoint_utf8(rest);`
			`if (c == -1) {`
			`break; // EOF reached;`
			`} else if (c < 0) {`
			`rest.remove_prefix(1);`
			`} else if ((c >= 0xD800) && (c <= 0xDFFF)) {`
			`result[len++] = 0x2610;`
			`} else if (c > 0xFFFF) {`
			`result[len++] = 0x2610;`
			`} else {`
			`result[len++] = (char16_t)c;`
			`}`
			`}`
			`if (consumed != nullptr) {`
			`*consumed = s.size() - rest.size();`
			`}`
			`result.resize(len);`
			`return result;`
			`}`

			`// Convert a UTF16 string to a UTF8 string.`
			`//`
			`// This also works for ucs2 strings. If the UTF16 string`
			`// contains invalid sequences, they're silently dropped.`
			`//`
			`static u8string utf16_to_utf8(std::u16string_view s) {`
			`u8string result(s.size() * 4, 0);`
			`int len = 0;`
			`while (true) {`
			`int codepoint = read_codepoint_utf16(s);`
			`if (codepoint == -1) break;`
			`if (codepoint < 0) continue;`
			`len += codepoint_to_utf8(codepoint, &result[len]);`
			`}`
			`result.resize(len);`
			`return result;`
			`}`

			`// Check if UTF8 is valid.`
			`//`
			`static bool valid_utf8(std::string_view s) {`
			`while (!s.empty()) {`
			`int32_t codepoint = read_codepoint_utf8(s);`
			`if (codepoint < 0) return false;`
			`}`
			`return true;`
			`}`
			`};`