luprex/cpp/core/util.hpp

///////////////////////////////////////////////////////////////////////
//
// NAMESPACE SV
//
//  * Operate on string_view or just characters.
//  * Do not allocate memory.
//  * Do not copy strings.
//
// NAMESPACE UTIL
//
//  * General purpose utility functions.
//  * Sort of a catch-all.
//
///////////////////////////////////////////////////////////////////////

#pragma once

#include "wrap-string.hpp"
#include "wrap-set.hpp"
#include "wrap-map.hpp"
#include "wrap-vector.hpp"
#include "wrap-sstream.hpp"
#include <ostream>
#include <memory>
#include <utility>
#include <algorithm>
#include <string_view>
#include <limits>
#include <iomanip>
// #include <cstdint>
#include <cstdarg>
#include "spookyv2.hpp"


namespace sv {

// Bring std::string_view into this namespace, so that it can be
// written unqualified here and as sv::string_view elsewhere.
using string_view = std::string_view;

// ASCII-only character classification.
//
// These ignore the current locale and know nothing about unicode:
// only the literal character ranges tested below are accepted.
inline bool ascii_isupper(char c) { return ('A' <= c) && (c <= 'Z'); }
inline bool ascii_islower(char c) { return ('a' <= c) && (c <= 'z'); }
inline bool ascii_isdigit(char c) { return ('0' <= c) && (c <= '9'); }
inline bool ascii_isalpha(char c) { return ascii_islower(c) || ascii_isupper(c); }
// The 'u' variants additionally accept the underscore.
inline bool ascii_isualpha(char c) { return (c == '_') || ascii_isalpha(c); }
inline bool ascii_isalnum(char c) { return ascii_isdigit(c) || ascii_isalpha(c); }
inline bool ascii_isualnum(char c) { return (c == '_') || ascii_isalnum(c); }
inline bool ascii_isspace(char c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
        case '\f':
        case '\v':
            return true;
        default:
            return false;
    }
}

// Test for the null string_view, i.e. one whose data pointer is nullptr.
//
// The null string view is an empty string, but an empty
// string is not necessarily the null string view.
//
inline bool isnull(string_view v) {
    const char *base = v.data();
    return base == nullptr;
}

// Return true if the two strings are equal, ignoring case.
//
// NOTE(review): presumably the case folding is ASCII-only, like the
// ascii_* helpers in this namespace — confirm against the implementation.
//
bool case_insensitive_eq(std::string_view s1, std::string_view s2);

// Check if a string can be parsed as a double, as an int64,
// or as a 64-bit hexadecimal number.
bool valid_double(string_view v);
bool valid_int64(string_view v);
bool valid_hex64(string_view v);

// Check if a hostname is a valid DNS (ascii) hostname.
bool valid_hostname(string_view v);

// Convert strings to numbers.  Returns errval on failure.
//
// The integer parser accepts a sequence of digits,
// with or without a + or - sign.   The hex parser
// does not allow a + or - sign.  For both the int64
// and hex64 parser, it is a failure if the number
// does not fit in 64 bits.  The double parser does
// not accept the strings 'nan' or 'inf'.
//
// Default error values:
//
//  * to_double defaults to a quiet NaN.  NaN never compares equal
//    to anything, so test the result with std::isnan, not with ==.
//
//  * to_int64 and to_hex64 default to the largest value of their
//    return type.  Those are also representable numbers, so a caller
//    that must tell a failure apart from a genuine maximum should
//    pass a different errval, or check valid_int64 / valid_hex64.
//
double to_double(string_view v, double errval = std::numeric_limits<double>::quiet_NaN());
int64_t to_int64(string_view v, int64_t errval = std::numeric_limits<int64_t>::max());
uint64_t to_hex64(string_view v, uint64_t errval = std::numeric_limits<uint64_t>::max());

// Trim whitespace from a string_view.
// The result is returned as a new view; the argument is passed by value.
string_view ltrim(string_view v);
string_view rtrim(string_view v);
string_view trim(string_view v);

// Trim specific character (all occurrences) from a string_view.
string_view ltrim(string_view v, char c);
string_view rtrim(string_view v, char c);
string_view trim(string_view v, char c);

// Return true if the string has the specified prefix or suffix.
bool has_prefix(string_view s, string_view prefix);
bool has_suffix(string_view s, string_view suffix);

// Return the length of the common prefix of A and B.
// Note that the length is returned as an int, not a size_t.
int common_prefix_length(string_view a, string_view b);

// Return true if the string is a lua identifier.
bool is_lua_id(string_view s);

// Return true if the string is a valid lua classname.
// NOTE(review): the classname rules are not described in this
// header — see the implementation.
bool is_lua_classname(string_view s);

// Return true if the line of code is a lua comment.
bool is_lua_comment(string_view s);

// Return true if the line is entirely whitespace.
bool is_whitespace(string_view s);

// Get the function name from a lua function prototype.
// Returns empty string if the prototype is malformed or
// is not a lua function prototype at all.
string_view lua_function_proto_name(string_view s);

// Return the first character, but if the view is empty,
// return zero.
//
// The view is only inspected, never modified, so it is taken by
// value.  That lets const views, temporaries and string literals be
// passed as well as the mutable views used by the read_* functions.
// (std::string_view is the same type as sv::string_view.)
inline char zfront(std::string_view s) {
    return s.empty() ? char(0) : s.front();
}

// Read whitespace from a string_view.
//
// NOTE(review): presumably returns the leading run of whitespace and
// advances source past it, like the other read_* functions — confirm.
//
string_view read_space(string_view &source);

// Read from a string_view until separator is reached.
//
// If the separator appears in the source, returns everything
// before the separator, and updates the source to everything
// after the separator.
//
// If the separator doesn't appear in the source, returns
// the entire source, and replaces source with the null string_view.
//
string_view read_to_sep(string_view &source, char sep);

// Read from a string_view until newline is reached.
//
// If there's a line-break in the source (newline or CRLF),
// returns the text before the line-break, and updates the
// source to the text after the line-break.
//
// If there's no line-break in the source, returns the entire source,
// and updates source to the null string_view.
//
string_view read_to_line(string_view &source);

// Read a prefix string from a string_view.
//
// Returns false if the string view doesn't start with
// the specified prefix.
//
// NOTE(review): presumably returns true and advances source past the
// prefix when it does match — confirm against the implementation.
//
bool read_prefix(string_view &source, string_view prefix); 

// Read from a string_view until whitespace is reached.
//
// If there's any whitespace in the source, returns the text
// before the whitespace, and updates the source to the text
// after the whitespace.
//
// If there's no whitespace in the source, returns the entire
// source, and updates the source to the null string_view.
//
string_view read_to_space(string_view &source);

// Read up to nbytes from a string_view.
//
// NOTE(review): "up to" suggests a shorter view is returned when the
// source holds fewer than nbytes; the behavior for a negative nbytes
// is not documented here — confirm against the implementation.
//
string_view read_nbytes(string_view &source, int nbytes);

// Read an identifier from a string_view
//
// If there's no valid identifier, returns empty string.
// Underscores are not allowed in the identifier.
//
string_view read_simple_identifier(string_view &source);

// Read an identifier from a string_view
//
// If there's no valid identifier, returns empty string.
// Lua identifiers are allowed to have underscores.
//
string_view read_lua_identifier(string_view &source);

// Read a number from a string view
//
// This is basically a regex pattern matching routine
// hardwired with the regex for numbers.  You must
// specify which of the following parts of the regex
// are allowed or not:
//
//  * plus sign                      (plus)
//  * minus sign                     (minus)
//  * decimal point                  (dec)
//  * scientific notation exponents  (exp)
//
// Returns the number as a string_view.  There is
// no guarantee that the number is small enough to
// fit into any particular number of bits.  This
// always uses base 10.
//
std::string_view read_number(string_view &source, bool plus, bool minus, bool dec, bool exp);

// Read an ascii character from a string.
//
// Returns -1 if the string is empty.
//
int32_t read_ascii_char(string_view &source);

// Read a UTF8 codepoint from a string_view.
//
// See documentation in unicode-stuff.hpp
//
int32_t read_codepoint_utf8(string_view &source);

// Return true if the string is valid utf-8.
//
// See documentation in unicode-stuff.hpp
//
bool valid_utf8(string_view s);

// Check if a UTF8 string contains a substring.
//
// Eventually, we're going to have a case-insensitive version of this,
// but it's really hard to write!
//
bool contains_substring_utf8(string_view haystack, string_view needle);

// Return true if the number conforms to the spec.
// See read_number for more information: the plus, minus, dec and exp
// flags have the same names here as they do there.
//
bool valid_number(string_view v, bool plus, bool minus, bool dec, bool exp);

} // namespace sv

namespace util {

// Message type codes.  The enumerators are numbered consecutively from zero.
enum MessageType {
    MSG_NULL = 0,
    MSG_DIFF = 1,
    MSG_ACK = 2,
    MSG_INVOKE = 3,
};

// Note: IdVector is weird in that it deliberately uses std::vector
// instead of eng::vector.  This is because we want plane scans
// to not touch the engine heap.
//
using IdVector = std::vector<int64_t>;

// String containers built on the eng:: container types.
using StringVec = eng::vector<eng::string>;
using StringPair = std::pair<eng::string, eng::string>;
using StringSet = eng::set<eng::string>;

// A list of string pairs, and an owning pointer to such a list.
// NOTE(review): what each half of the pair holds is not stated in
// this header — see make_lua_source and its callers.
using LuaSourceVec = eng::vector<StringPair>;
using LuaSourcePtr = std::unique_ptr<LuaSourceVec>;

// A 128-bit hash value, stored as two 64-bit halves.
using HashValue = std::pair<uint64_t, uint64_t>;

// Shared std::string (not eng::string), and a std::vector of them.
using SharedStdString = std::shared_ptr<std::string>;
using SharedStdStringVec = std::vector<SharedStdString>;

// Ascii uppercase and lowercase.
// These take a view and return the converted text as a new string.
eng::string ascii_tolower(std::string_view c);
eng::string ascii_toupper(std::string_view c);

// Output a string to a stream using Lua string escaping and quoting.
void quote_string(const eng::string &str, std::ostream *os);

// base64 encode.
// The encoded text is written to the stream oss.
void base64_encode(std::string_view v, std::ostream *oss);

// base64 decode.
//
// Returns true if the base64 was 'clean' base64, as
// opposed to base64 with extraneous characters.
//
bool base64_decode(std::string_view v, std::ostream *oss);

// ID vector quick create.
// NOTE(review): the -1 defaults look like "not supplied" markers, so
// that zero to four IDs can be passed — confirm against the implementation.
IdVector id_vector_create(int64_t id1=-1, int64_t id2=-1, int64_t id3=-1, int64_t id4=-1);

// Print an ID vector to a stream.
// The second overload accepts a vector of unsigned 64-bit values.
void print_id_vector(const IdVector &idv, std::ostream *os);
void print_id_vector(const std::vector<uint64_t> &idv, std::ostream *os);

// ID vector debug string.
eng::string id_vector_debug_string(const IdVector &idv);

// Unions and sorts two ID vectors.
IdVector sort_union_id_vectors(const IdVector &v1, const IdVector &v2);

// Get a 128-bit hashvalue for a string.
HashValue hash_string(std::string_view str);

// Get a 128-bit hashvalue for a string, with a previous value.
// NOTE(review): presumably prev seeds the hash so that several
// strings can be hashed as a chain — confirm.
HashValue hash_string(HashValue prev, std::string_view str);

// Get a 128-bit hashvalue for an ID vector.
HashValue hash_id_vector(const IdVector &idv);

// Convert a 128-bit hash to a hexadecimal string.
eng::string hash_to_hex(const HashValue &hash);

// Hash four integers together to 64 bits.
// This is a good hash, but not cryptographically good.
uint64_t hash_ints(uint64_t n1, uint64_t n2, uint64_t n3, uint64_t n4);

// Hash a single 64-bit integer.
//
// This is a good hash, but not cryptographically good.
// Published by David Stafford in his article 'Better Bit Mixing'.
//
// The mixer alternates xor-shift steps with multiplications by two
// fixed odd constants, and finishes with one more xor-shift.
inline uint64_t hash_int(uint64_t x) {
    constexpr uint64_t kFirstMultiplier = UINT64_C(0xbf58476d1ce4e5b9);
    constexpr uint64_t kSecondMultiplier = UINT64_C(0x94d049bb133111eb);

    x ^= x >> 30;
    x *= kFirstMultiplier;
    x ^= x >> 27;
    x *= kSecondMultiplier;
    x ^= x >> 31;
    return x;
}

// Convert a 64-bit hash value into a floating point number between 0 and 1.
// NOTE(review): whether the endpoints 0 and 1 can themselves be
// returned is not stated here — confirm.
double hash_to_double(uint64_t hash);

// Split a string into multiple strings
// The separator is a single character.
StringVec split(const eng::string &s, char sep);

// Split a string into multiple strings using \r or \n
StringVec split_lines(const eng::string &s);

// Split a string into multiple lines using |, remove any leading blank line.
StringVec split_docstring(const eng::string &s);

// Join multiple strings into one string
// NOTE(review): presumably sep is inserted between adjacent
// elements only — confirm.
eng::string join(const StringVec &strs, eng::string sep);

// Return N repetitions of string A
eng::string repeat_string(const eng::string &a, int n);

// String to lowercase/uppercase.  Ascii only, no unicode.
// These take an eng::string by value; ascii_tolower / ascii_toupper,
// declared earlier in this namespace, take a std::string_view instead.
eng::string tolower(eng::string input);
eng::string toupper(eng::string input);

// Convert a codepoint number into a utf8 string.
// If the codepoint is invalid, returns empty string.
eng::string get_codepoint_utf8(int32_t cp);

// Write a codepoint in utf8 to a stream.
// If the codepoint is invalid, writes nothing and returns false.
bool write_codepoint_utf8(int32_t cp, std::ostream *out);

// Calculate distance between two points
// As the name says, the result is the squared distance between
// (x1,y1) and (x2,y2): no square root is taken.
double distance_squared(double x1, double y1, double x2, double y2);

// Make a LuaSourceVec with one element, for unit testing.
LuaSourcePtr make_lua_source(const eng::string &code);

// Remove items from a vector that are nullptr.
//
// The surviving items keep their original relative order.  This uses
// the erase-remove idiom rather than std::partition, because partition
// is free to shuffle the items it keeps, and the order it produces
// is not specified.
//
// T must provide begin(), end() and a range erase(); its elements
// must be comparable against nullptr with != and be move-assignable.
template<class T>
void remove_nullptrs(T &vec) {
    auto new_end = std::remove_if(vec.begin(), vec.end(),
                                  [] (const auto &x) { return !(x != nullptr); });
    vec.erase(new_end, vec.end());
}

// Remove items from a vector that are marked for deletion.
//
// An item is removed when its marked_for_deletion() returns true.
// The surviving items keep their original relative order.  This uses
// the erase-remove idiom rather than std::partition, because partition
// is free to shuffle the items it keeps, and the order it produces
// is not specified.
//
// T must provide begin(), end() and a range erase(); its elements
// must be move-assignable.
template<class T>
void remove_marked_items(T &vec) {
    auto new_end = std::remove_if(vec.begin(), vec.end(),
                                  [] (const auto &x) { return x.marked_for_deletion(); });
    vec.erase(new_end, vec.end());
}

// An XYZ coordinate, general purpose.
//
// NUMBER is the scalar type used for each of the three components.
//
template <typename NUMBER>
struct NumXYZ {
    using Number = NUMBER;
    Number x, y, z;

    // Default construction yields (0,0,0).
    NumXYZ() : x(0), y(0), z(0) {}
    NumXYZ(Number ix, Number iy, Number iz) : x(ix), y(iy), z(iz) {}

    // Assign from a coordinate of either precision, converting each
    // component to Number.  Like every conventional assignment operator,
    // these return *this so that assignments can be chained.
    NumXYZ &operator =(const NumXYZ<double> &other) { x = other.x; y = other.y; z = other.z; return *this; }
    NumXYZ &operator =(const NumXYZ<float> &other) { x = other.x; y = other.y; z = other.z; return *this; }

    // Set all three components to the same scalar.
    NumXYZ &operator =(Number n) { x = n; y = n; z = n; return *this; }

    // Exact component-wise comparison.
    bool operator ==(const NumXYZ &o) const { return x==o.x && y == o.y && z==o.z; }
    bool operator !=(const NumXYZ &o) const { return x!=o.x || y != o.y || z!=o.z; }

    // Component-wise difference and sum.
    NumXYZ operator -(const NumXYZ &o) const { return NumXYZ(x-o.x, y-o.y, z-o.z); }
    NumXYZ operator +(const NumXYZ &o) const { return NumXYZ(x+o.x, y+o.y, z+o.z); }

    // Multiply every component by a scalar.
    // NOTE(review): the scale factor is a float even for NumXYZ<double>,
    // so a double factor is rounded to float before the multiplication.
    // Confirm that this is intended.
    NumXYZ operator *(float scale) const { return NumXYZ(x*scale, y*scale, z*scale); }

    // Convert each component to another scalar type.
    template<typename ONUMBER>
    const NumXYZ<ONUMBER> convert() const { NumXYZ<ONUMBER> r; r.x=ONUMBER(x); r.y=ONUMBER(y); r.z=ONUMBER(z); return r; }

    // Render as "(x,y,z)" using the stream's default number formatting.
    eng::string debug_string() const {
        eng::ostringstream oss;
        oss << "(" << x << "," << y << "," << z << ")";
        return oss.str();
    }
};

// Single-precision and double-precision coordinates.
using XYZ=NumXYZ<float>;
using DXYZ=NumXYZ<double>;

// util::ostringstream
//
// This is a variant of ostringstream in which it is possible
// to get the contents without copying.   To get the contents
// without copying, use oss.view().
//
class ostringstream : public eng::ostringstream {
    class rstringbuf : public std::basic_stringbuf<char_type, traits_type, allocator_type> {
    public:
        char *eback() const { return std::streambuf::eback(); }
        char *pptr() const { return std::streambuf::pptr(); }
    };
    rstringbuf rstringbuf_;
public:
    ostringstream() {
        std::basic_ostream<char>::rdbuf(&rstringbuf_);
    }
    char *data() const {
        return rstringbuf_.eback();
    }
    size_t size() const {
        return rstringbuf_.pptr() - rstringbuf_.eback();
    }
    std::string_view view() const {
        return std::string_view(data(), size());
    }
    eng::string str() const {
        return rstringbuf_.str();
    }
};

// send_to_stream: send all arguments to the specified stream.
//
// The arguments are written with operator<<, in order, left to right.
// With no arguments, nothing is written.
inline void send_to_stream(std::ostream &os) { static_cast<void>(os); }
template <typename ARG, typename... REST>
inline void send_to_stream(std::ostream &os, const ARG &arg, const REST & ... rest) {
    os << arg;
    // A comma fold writes the remaining arguments one at a time, in order.
    (static_cast<void>(os << rest), ...);
}

// ss: convert all arguments to a string by sending them to a stringstream.
//
// The arguments are written in order with operator<<, and the
// accumulated text is returned.
template <typename... ARGS>
inline eng::string ss(const ARGS & ... args) {
    eng::ostringstream buffer;
    std::ostream &sink = buffer;
    send_to_stream(sink, args...);
    return buffer.str();
}

// dprintf / dprint
//
// Send a debugging message to somewhere that it can be seen. This routine
// initially just sends output to stderr.  But it can be hooked to send output
// somewhere else, like to a debug output window.
//
// The hook function must be a function that accepts a single line of text.  The
// hook function will always be passed one line, consisting of printable
// characters only.  There will be no control characters.  The newline is
// implied.
//
// The hook receives the line as a pointer plus a size.
// NOTE(review): whether the line is also null-terminated is not
// stated here — confirm before treating it as a C string.
//
void dprintview(std::string_view view);
void dprintf(const char *format, ...);
void hook_dprint(void (*func)(const char *oneline, size_t size));

template <typename... ARGS>
inline void dprint(const ARGS & ... args) {
    util::ostringstream oss;
    send_to_stream(oss, args...);
    dprintview(oss.view());
}

// A better API than std::setfill, std::hex, std::setw, std::setprecision
//
// Usage examples:
//   std::cout << util::hex.width(5).fill('0').val(123)
//   std::cout << util::dec.fill('$').precision(3).val(1.5)
//
// The reason that other API is bad is that it can leave std::cout
// in an unpredictable state.  This API always leaves the stream clean.
//
// A FormattedNumber bundles a value with the formatting to print it
// with.  width(), fill(), precision() and val() each return a modified
// copy, so calls can be chained.
//
template <class VALUE>
class FormattedNumber {
public:
    VALUE value_;     // The value to print.
    bool hex_;        // True for hexadecimal, false for decimal.
    int width_;       // Minimum field width.
    char fill_;       // Padding character used to reach the width.
    int precision_;   // Stream precision applied while printing.

    constexpr FormattedNumber(VALUE v, bool h, int w, char f, int p)
        : value_(v), hex_(h), width_(w), fill_(f), precision_(p) {}

    // Each of these returns a copy with one formatting setting replaced.
    constexpr FormattedNumber width(int w) const { return FormattedNumber(value_, hex_, w, fill_, precision_); }
    constexpr FormattedNumber fill(char f) const { return FormattedNumber(value_, hex_, width_, f, precision_); }
    constexpr FormattedNumber precision(int p) const { return FormattedNumber(value_, hex_, width_, fill_, p); }

    // Attach the value to print, keeping the formatting settings.
    //
    // The result is rebound to the value's own type, so that 64-bit
    // integers and floating point values are carried through intact
    // instead of being converted to VALUE.  The unary plus applies the
    // usual integer promotions: types narrower than int (char, bool,
    // uint8_t, ...) are stored as int, and therefore stream as numbers
    // rather than as characters.
    template <class NVALUE>
    constexpr auto val(NVALUE v) const -> FormattedNumber<decltype(+v)> {
        return FormattedNumber<decltype(+v)>(+v, hex_, width_, fill_, precision_);
    }
};

constexpr auto hex = FormattedNumber<int>(0, true, 0, '0', 6);
constexpr auto hex8 = FormattedNumber<int>(0, true, 2, '0', 6);
constexpr auto hex16 = FormattedNumber<int>(0, true, 4, '0', 6);
constexpr auto hex32 = FormattedNumber<int>(0, true, 8, '0', 6);
constexpr auto hex64 = FormattedNumber<int>(0, true, 16, '0', 6);
constexpr auto dec = FormattedNumber<int>(0, false, 0, ' ', 6);

// Encode a string as a token (fixed-width base38 number).
//
// A token has 12 character positions.  Each character maps to a
// digit 1-37, and 0 means "no character":
//
//     '_' -> 1,   '0'-'9' -> 2-11,   'a'-'z' -> 12-37
//
// Uppercase letters are accepted too, and map to the same digits as
// their lowercase forms.  The result is:
//
//     CH0*38^11 + CH1*38^10 + ... + CH11*38^0
//
// so a string shorter than 12 characters is padded on the right with
// zero digits.  Returns zero if the string is empty, too long, or
// contains invalid characters.
//
static constexpr uint64_t encode_token(std::string_view str) {
    constexpr std::string_view::size_type kPositions = 12;
    constexpr uint64_t kBase = 38;

    if (str.empty() || (str.size() > kPositions)) return 0;

    uint64_t token = 0;
    for (const char ch : str) {
        uint64_t digit = 0;
        if (ch == '_') {
            digit = 1;
        } else if (('0' <= ch) && (ch <= '9')) {
            digit = 2 + uint64_t(ch - '0');
        } else if (('a' <= ch) && (ch <= 'z')) {
            digit = 12 + uint64_t(ch - 'a');
        } else if (('A' <= ch) && (ch <= 'Z')) {
            digit = 12 + uint64_t(ch - 'A');
        } else {
            return 0;
        }
        token = token * kBase + digit;
    }

    // Shift left by one base38 digit for every unused position.
    for (auto used = str.size(); used < kPositions; ++used) {
        token *= kBase;
    }
    return token;
}

// Decode a token (base38 number) back to a string.
//
// NOTE(review): encode_token accepts uppercase letters but maps them
// to the same digits as lowercase, so the decoded string presumably
// comes back in lowercase — confirm against the implementation.
//
eng::string decode_token(uint64_t value);

} // namespace util

// Stream a FormattedNumber.
//
// Applies the number's base, precision, fill and width, writes the
// value, then puts the stream back to decimal, space fill and
// precision 6.  Note that this resets the stream to those fixed
// settings; it does not restore whatever was in effect before the call.
//
// NOTE(review): this operator is in the global namespace, not in
// namespace util, so argument-dependent lookup alone will not find it.
template<class VALUE>
inline std::ostream &operator<<(std::ostream &oss, util::FormattedNumber<VALUE> n) {
    const std::ios_base::fmtflags base = n.hex_ ? std::ios_base::hex : std::ios_base::dec;
    oss.setf(base, std::ios_base::basefield);
    oss.precision(n.precision_);
    oss.fill(n.fill_);
    oss.width(n.width_);
    oss << n.value_;

    oss.setf(std::ios_base::dec, std::ios_base::basefield);
    oss.fill(' ');
    oss.precision(6);
    return oss;
}

// Stream a single-precision coordinate as "x,y,z".  Unlike
// NumXYZ::debug_string, this adds no surrounding parentheses.
inline std::ostream &operator<<(std::ostream &oss, const util::XYZ &xyz) {
    return oss << xyz.x << "," << xyz.y << "," << xyz.z;
}

// Stream a double-precision coordinate as "x,y,z".  Unlike
// NumXYZ::debug_string, this adds no surrounding parentheses.
inline std::ostream &operator<<(std::ostream &oss, const util::DXYZ &xyz) {
    return oss << xyz.x << "," << xyz.y << "," << xyz.z;
}