Files
integration/luprex/cpp/core/util.hpp

558 lines
18 KiB
C++
Raw Normal View History

///////////////////////////////////////////////////////////////////////
//
// NAMESPACE SV
//
// * Operate on string_view or just characters.
// * Do not allocate memory.
// * Do not copy strings.
//
// NAMESPACE UTIL
//
// * General purpose utility functions.
// * Sort of a catch-all.
//
///////////////////////////////////////////////////////////////////////
2026-02-25 01:58:19 -05:00
#pragma once
2020-11-13 15:18:09 -05:00
#include "wrap-string.hpp"
#include "wrap-set.hpp"
#include "wrap-map.hpp"
#include "wrap-vector.hpp"
#include "wrap-sstream.hpp"
#include <ostream>
#include <memory>
#include <utility>
#include <algorithm>
2022-02-24 13:50:43 -05:00
#include <string_view>
#include <limits>
#include <iomanip>
2023-01-23 13:49:17 -05:00
// #include <cstdint>
#include <cstdarg>
2021-08-09 12:54:32 -04:00
#include "spookyv2.hpp"
2020-11-13 15:18:09 -05:00
namespace sv {
// Bring this into our namespace.
using string_view = std::string_view;
// ASCII-only character classification.
//
// These deliberately ignore the current locale and any unicode
// issues: only the characters spelled out below ever match.
//
inline bool ascii_isupper(char c) { return ('A' <= c) && (c <= 'Z'); }
inline bool ascii_islower(char c) { return ('a' <= c) && (c <= 'z'); }
inline bool ascii_isdigit(char c) { return ('0' <= c) && (c <= '9'); }
// Letters of either case.
inline bool ascii_isalpha(char c) { return ascii_islower(c) || ascii_isupper(c); }
// Letters or underscore.
inline bool ascii_isualpha(char c) { return (c == '_') || ascii_isalpha(c); }
// Letters or digits.
inline bool ascii_isalnum(char c) { return ascii_isdigit(c) || ascii_isalpha(c); }
// Letters, digits or underscore.
inline bool ascii_isualnum(char c) { return (c == '_') || ascii_isalnum(c); }
// Space, tab, carriage return, newline, form feed or vertical tab.
inline bool ascii_isspace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\f':
    case '\v':
        return true;
    default:
        return false;
    }
}
// Test whether a string_view is the null string_view, i.e. one
// whose data() pointer is nullptr.
//
// The null string_view is always empty, but the reverse does not
// hold: an empty string_view is not necessarily the null one.
//
inline bool isnull(string_view v) {
    const char *const start = v.data();
    return start == nullptr;
}
2022-05-20 17:12:58 -04:00
// Return true if the two strings are equal, ignoring case.
//
bool case_insensitive_eq(std::string_view s1, std::string_view s2);
// Check if a string can be parsed as a double, an int64, or a hex64.
bool valid_double(string_view v);
2022-04-25 13:43:11 -04:00
bool valid_int64(string_view v);
bool valid_hex64(string_view v);
// Check if a hostname is a valid DNS (ascii) hostname.
bool valid_hostname(string_view v);
// Convert strings to numbers. Returns errval on failure.
//
// The integer parser accepts a sequence of digits,
// with or without a + or - sign. The hex parser
// does not allow a + or - sign. For both the int64
// and hex64 parser, it is a failure if the number
// does not fit in 64 bits. The double parser does
// not accept the strings 'nan' or 'inf'.
//
double to_double(string_view v, double errval = std::numeric_limits<double>::quiet_NaN());
int64_t to_int64(string_view v, int64_t errval = std::numeric_limits<int64_t>::max());
2022-04-25 13:43:11 -04:00
uint64_t to_hex64(string_view v, uint64_t errval = std::numeric_limits<uint64_t>::max());
// Trim whitespace from a string_view.
string_view ltrim(string_view v);
string_view rtrim(string_view v);
string_view trim(string_view v);
// Trim specific character (all occurrences) from a string_view.
string_view ltrim(string_view v, char c);
string_view rtrim(string_view v, char c);
string_view trim(string_view v, char c);
// Return true if the string has the specified prefix or suffix.
bool has_prefix(string_view s, string_view prefix);
bool has_suffix(string_view s, string_view suffix);
// Return the length of the common prefix of A and B.
int common_prefix_length(string_view a, string_view b);
// Return true if the string is a lua identifier.
bool is_lua_id(string_view s);
// Return true if the string is a valid lua classname.
bool is_lua_classname(string_view s);
// Return true if the line of code is a lua comment.
bool is_lua_comment(string_view s);
// Return true if the line is entirely whitespace.
bool is_whitespace(string_view s);
// Get the function name from a lua function prototype.
// Returns empty string if the prototype is malformed or
// is not a lua function prototype at all.
string_view lua_function_proto_name(string_view s);
2022-04-25 13:43:11 -04:00
// Return the first character of the view, or zero (char(0)) if the
// view is empty.
//
// The view is only inspected, never modified, so it is taken by
// value. Existing callers that pass an lvalue are unaffected, and
// temporaries and const views can now be passed as well.
//
inline char zfront(std::string_view s) {
    return s.empty() ? char(0) : s.front();
}
// Read whitespace from a string_view.
//
string_view read_space(string_view &source);
// Read from a string_view until separator is reached.
//
// If the separator appears in the source, returns everything
// before the separator, and updates the source to everything
// after the separator.
//
// If the separator doesn't appear in the source, returns
// the entire source, and replaces source with the null string_view.
//
string_view read_to_sep(string_view &source, char sep);
// Read from a string_view until newline is reached.
//
// If there's a line-break in the source (newline or CRLF),
// returns the text before the line-break, and updates the
// source to the text after the line-break.
//
// If there's no line-break in the source, returns the entire source,
// and updates source to the null string_view.
//
string_view read_to_line(string_view &source);
// Read a prefix string from a string_view.
//
// Returns false if the string view doesn't start with
// the specified prefix.
//
bool read_prefix(string_view &source, string_view prefix);
// Read from a string_view until whitespace is reached.
//
// If there's any whitespace in the source, returns the text
// before the whitespace, and updates the source to the text
// after the whitespace.
//
// If there's no whitespace in the source, returns the entire
// source, and updates the source to the null string_view.
//
string_view read_to_space(string_view &source);
2022-04-25 13:43:11 -04:00
// Read up to nbytes from a string_view.
//
string_view read_nbytes(string_view &source, int nbytes);
// Read an identifier from a string_view
//
// If there's no valid identifier, returns empty string.
// Underscores are not allowed in the identifier.
//
string_view read_simple_identifier(string_view &source);
// Read an identifier from a string_view
//
// If there's no valid identifier, returns empty string.
// Lua identifiers are allowed to have underscores.
//
string_view read_lua_identifier(string_view &source);
// Read a number from a string view
//
// This is basically a regex pattern matching routine
// hardwired with the regex for numbers. You must
// specify which of the following parts of the regex
// are allowed or not:
//
// * plus sign
// * minus sign
// * decimal point
// * scientific notation exponents
//
// Returns the number as a string_view. There is
// no guarantee that the number is small enough to
// fit into any particular number of bits. This
// always uses base 10.
//
std::string_view read_number(string_view &source, bool plus, bool minus, bool dec, bool exp);
// Read an ascii character from a string.
//
// Returns -1 if the string is empty.
//
int32_t read_ascii_char(string_view &source);
// Read a UTF8 codepoint from a string_view.
//
// See documentation in unicode-stuff.hpp
//
int32_t read_codepoint_utf8(string_view &source);
2022-04-25 13:43:11 -04:00
// Return true if the string is valid utf-8.
//
// See documentation in unicode-stuff.hpp
//
2022-04-25 13:43:11 -04:00
bool valid_utf8(string_view s);
2026-01-14 14:34:54 -05:00
// Check if a UTF8 string contains a substring.
//
// Eventually, we're going to have a case-insensitive version of this,
// but it's really hard to write!
//
bool contains_substring_utf8(string_view haystack, string_view needle);
// Return true if the number conforms to the spec.
// See read_number for more information.
//
bool valid_number(string_view v, bool plus, bool minus, bool dec, bool exp);
} // namespace sv
2020-11-13 15:18:09 -05:00
namespace util {
2021-11-09 16:27:39 -05:00
// Message type codes. The numeric values are the same ones the
// compiler would assign implicitly (0, 1, 2, 3); they are simply
// written out here.
enum MessageType {
    MSG_NULL   = 0,
    MSG_DIFF   = 1,
    MSG_ACK    = 2,
    MSG_INVOKE = 3,
};
// Note: IdVector is weird in that it deliberately uses std::vector
// instead of eng::vector. This is because we want plane scans
// to not touch the engine heap.
//
using IdVector = std::vector<int64_t>;
using StringVec = eng::vector<eng::string>;
using StringPair = std::pair<eng::string, eng::string>;
using StringSet = eng::set<eng::string>;
using LuaSourceVec = eng::vector<StringPair>;
using LuaSourcePtr = std::unique_ptr<LuaSourceVec>;
using HashValue = std::pair<uint64_t, uint64_t>;
using SharedStdString = std::shared_ptr<std::string>;
using SharedStdStringVec = std::vector<SharedStdString>;
// Ascii uppercase and lowercase.
eng::string ascii_tolower(std::string_view c);
eng::string ascii_toupper(std::string_view c);
2020-11-13 15:18:09 -05:00
2021-09-08 01:32:08 -04:00
// Output a string to a stream using Lua string escaping and quoting.
void quote_string(const eng::string &str, std::ostream *os);
2021-09-08 01:32:08 -04:00
// base64 encode.
void base64_encode(std::string_view v, std::ostream *oss);
// base64 decode.
//
// Returns true if the base64 was 'clean' base64, as
// opposed to base64 with extraneous characters.
//
bool base64_decode(std::string_view v, std::ostream *oss);
2021-08-13 17:02:35 -04:00
// ID vector quick create.
IdVector id_vector_create(int64_t id1=-1, int64_t id2=-1, int64_t id3=-1, int64_t id4=-1);
// Print an ID vector to a stream.
void print_id_vector(const IdVector &idv, std::ostream *os);
void print_id_vector(const std::vector<uint64_t> &idv, std::ostream *os);
2021-08-03 11:25:12 -04:00
// ID vector debug string.
eng::string id_vector_debug_string(const IdVector &idv);
2021-08-03 11:25:12 -04:00
2021-07-30 13:22:23 -04:00
// Unions and sorts two ID vectors.
IdVector sort_union_id_vectors(const IdVector &v1, const IdVector &v2);
// Get a 128-bit hashvalue for a string.
HashValue hash_string(std::string_view str);
// Get a 128-bit hashvalue for a string, with a previous value.
HashValue hash_string(HashValue prev, std::string_view str);
// Get a 128-bit hashvalue for an ID vector.
2021-08-09 12:54:32 -04:00
HashValue hash_id_vector(const IdVector &idv);
// Convert a 128-bit hash to a hexadecimal string.
eng::string hash_to_hex(const HashValue &hash);
// Hash four integers together to 64 bits.
// This is a good hash, but not cryptographically good.
uint64_t hash_ints(uint64_t n1, uint64_t n2, uint64_t n3, uint64_t n4);
2022-07-11 02:32:12 -04:00
// Hash a single 64-bit integer by mixing its bits.
// This is a good hash, but not cryptographically good.
// The mixing steps were published by David Stafford in his
// article 'Better Bit Mixing'.
inline uint64_t hash_int(uint64_t x) {
    constexpr uint64_t kMul1 = UINT64_C(0xbf58476d1ce4e5b9);
    constexpr uint64_t kMul2 = UINT64_C(0x94d049bb133111eb);
    // Alternate xor-shift and multiply steps; unsigned arithmetic
    // wraps modulo 2^64.
    x ^= x >> 30;
    x *= kMul1;
    x ^= x >> 27;
    x *= kMul2;
    x ^= x >> 31;
    return x;
}
2022-04-06 15:09:28 -04:00
// Convert a 64-bit hash value into a floating point number between 0 and 1.
double hash_to_double(uint64_t hash);
// Split a string into multiple strings
StringVec split(const eng::string &s, char sep);
2021-12-15 23:03:43 -05:00
// Split a string into multiple strings using \r or \n
StringVec split_lines(const eng::string &s);
// Split a string into multiple lines using |, remove any leading blank line.
StringVec split_docstring(const eng::string &s);
2021-12-15 23:03:43 -05:00
2021-11-16 12:20:11 -05:00
// Join multiple strings into one string
eng::string join(const StringVec &strs, eng::string sep);
2021-11-16 12:20:11 -05:00
// Return N repetitions of string A
eng::string repeat_string(const eng::string &a, int n);
2021-07-30 13:22:23 -04:00
// String to lowercase/uppercase. Ascii only, no unicode.
eng::string tolower(eng::string input);
eng::string toupper(eng::string input);
2021-02-07 15:35:31 -05:00
// Convert a codepoint number into a utf8 string.
// If the codepoint is invalid, returns empty string.
eng::string get_codepoint_utf8(int32_t cp);
// Write a codepoint in utf8 to a stream.
// If the codepoint is invalid, writes nothing and returns false.
bool write_codepoint_utf8(int32_t cp, std::ostream *out);
2021-07-30 13:22:23 -04:00
// Calculate the squared distance between two points
double distance_squared(double x1, double y1, double x2, double y2);
// Make a LuaSourceVec with one element, for unit testing.
LuaSourcePtr make_lua_source(const eng::string &code);
// Remove items from a vector that are nullptr.
//
// Uses the erase-remove idiom, so the surviving elements keep their
// original relative order. (std::partition, which was used here
// previously, is allowed to reorder the elements it keeps.)
//
// T must provide begin(), end() and erase(first, last); its elements
// must be comparable to nullptr with operator!=.
//
template<class T>
void remove_nullptrs(T &vec) {
    auto is_null = [] (const auto &item) { return !(item != nullptr); };
    vec.erase(std::remove_if(vec.begin(), vec.end(), is_null), vec.end());
}
// Remove items from a vector that are marked for deletion.
//
// An item is removed when its marked_for_deletion() member returns
// true. Uses the erase-remove idiom, so the surviving elements keep
// their original relative order. (std::partition, which was used here
// previously, is allowed to reorder the elements it keeps.)
//
template<class T>
void remove_marked_items(T &vec) {
    auto is_marked = [] (const auto &item) { return item.marked_for_deletion(); };
    vec.erase(std::remove_if(vec.begin(), vec.end(), is_marked), vec.end());
}
2021-02-02 16:29:07 -05:00
// An XYZ coordinate, general purpose.
//
// NUMBER is the type of each component. Assignment operators return
// a reference to *this (the conventional signature), so assignments
// can be chained.
//
template <typename NUMBER>
struct NumXYZ {
    using Number = NUMBER;
    Number x, y, z;

    // Default: all components zero.
    NumXYZ() : x(0), y(0), z(0) {}
    NumXYZ(Number ix, Number iy, Number iz) : x(ix), y(iy), z(iz) {}

    // Component-wise assignment from a double or float coordinate.
    // The components are converted to Number implicitly.
    NumXYZ &operator =(const NumXYZ<double> &other) { x = other.x; y = other.y; z = other.z; return *this; }
    NumXYZ &operator =(const NumXYZ<float> &other) { x = other.x; y = other.y; z = other.z; return *this; }
    // Set all three components to the same number.
    NumXYZ &operator =(Number n) { x = n; y = n; z = n; return *this; }

    // Exact component-wise comparison.
    bool operator ==(const NumXYZ &o) const { return x==o.x && y == o.y && z==o.z; }
    bool operator !=(const NumXYZ &o) const { return x!=o.x || y != o.y || z!=o.z; }

    // Component-wise difference and sum.
    NumXYZ operator -(const NumXYZ &o) const { return NumXYZ(x-o.x, y-o.y, z-o.z); }
    NumXYZ operator +(const NumXYZ &o) const { return NumXYZ(x+o.x, y+o.y, z+o.z); }

    // Multiply every component by a scalar.
    // NOTE(review): the scale factor is a float even when Number is
    // double, so a double scale is rounded to float at the call site.
    // Left unchanged because widening it could alter numeric results
    // in existing callers -- confirm whether that is intended.
    NumXYZ operator *(float scale) const { return NumXYZ(x*scale, y*scale, z*scale); }

    // Convert to a coordinate with a different component type; each
    // component is converted with ONUMBER(...).
    template<typename ONUMBER>
    NumXYZ<ONUMBER> convert() const {
        return NumXYZ<ONUMBER>(ONUMBER(x), ONUMBER(y), ONUMBER(z));
    }

    // Format as "(x,y,z)".
    eng::string debug_string() const {
        eng::ostringstream oss;
        oss << "(" << x << "," << y << "," << z << ")";
        return oss.str();
    }
};
2021-07-18 17:48:39 -04:00
using XYZ=NumXYZ<float>;
using DXYZ=NumXYZ<double>;
// util::ostringstream
//
// This is a variant of ostringstream in which it is possible
// to get the contents without copying. To get the contents
// without copying, use oss.view().
//
// The stream writes into its own string buffer (rstringbuf_), which
// the constructor installs with rdbuf(). data(), size() and view()
// read that buffer's eback()/pptr() pointers directly.
//
// NOTE(review): view() and data() point into the stream's buffer, so
// they presumably must not be used after further writes to the stream
// or after the stream is destroyed -- confirm against callers.
//
class ostringstream : public eng::ostringstream {
// A string buffer that makes the streambuf accessors eback() and
// pptr() callable from the enclosing class.
class rstringbuf : public std::basic_stringbuf<char_type, traits_type, allocator_type> {
public:
char *eback() const { return std::streambuf::eback(); }
char *pptr() const { return std::streambuf::pptr(); }
};
// The buffer that actually receives everything written to this stream.
rstringbuf rstringbuf_;
public:
// Redirect the stream's output into rstringbuf_.
ostringstream() {
std::basic_ostream<char>::rdbuf(&rstringbuf_);
}
// Pointer to the start of the buffered text (rstringbuf_.eback()).
char *data() const {
return rstringbuf_.eback();
}
// Number of characters between eback() and pptr().
size_t size() const {
return rstringbuf_.pptr() - rstringbuf_.eback();
}
// The buffered text as a string_view over data()/size(); no copy is made.
std::string_view view() const {
return std::string_view(data(), size());
}
// The buffered text as an eng::string, taken from rstringbuf_.str().
eng::string str() const {
return rstringbuf_.str();
}
};
2023-02-27 17:21:00 -05:00
// send_to_stream: send all arguments to the specified stream.
//
// Zero arguments: there is nothing to write.
inline void send_to_stream(std::ostream &) {}
// One or more arguments: each one is written with operator<<,
// left to right.
template <typename ARG, typename... REST>
inline void send_to_stream(std::ostream &os, const ARG &arg, const REST & ... rest) {
    os << arg;
    // Comma fold: one separate `os << item` per remaining argument,
    // evaluated in order. Expands to nothing when REST is empty.
    ((void)(os << rest), ...);
}
// ss: convert all arguments to a string by sending them to a stringstream.
2023-04-07 15:49:50 -04:00
template <typename... ARGS>
inline eng::string ss(const ARGS & ... args) {
eng::ostringstream oss;
send_to_stream(oss, args...);
return oss.str();
}
// dprintf / dprint
2023-02-27 17:21:00 -05:00
//
// Send a debugging message to somewhere that it can be seen. This routine
// initially just sends output to stderr. But it can be hooked to send output
// somewhere else, like to a debug output window.
//
// The hook function must be a function that accepts a single line of text. The
// hook function will always be passed one line, consisting of printable
// characters only. There will be no control characters. The newline is
// implied.
//
void dprintview(std::string_view view);
2023-02-27 17:21:00 -05:00
void dprintf(const char *format, ...);
void hook_dprint(void (*func)(const char *oneline, size_t size));
2023-04-07 15:49:50 -04:00
template <typename... ARGS>
inline void dprint(const ARGS & ... args) {
util::ostringstream oss;
send_to_stream(oss, args...);
dprintview(oss.view());
}
2023-02-27 17:21:00 -05:00
// A better API than std::setfill, std::hex, std::setw, std::setprecision
//
// Usage examples:
//   std::cout << util::hex.width(5).fill('0').val(123)
//   std::cout << util::dec.fill('$').precision(3).val(1.5)
//
// The reason that other API is bad is that it can leave std::cout
// in an unpredictable state. This API always leaves the stream clean.
//
// A FormattedNumber bundles a value with the formatting to print it
// with. Each setter returns a modified copy and leaves the original
// untouched.
//
template <class VALUE>
class FormattedNumber {
public:
    VALUE value_;      // The value to print.
    bool hex_;         // Hexadecimal if true, decimal otherwise.
    int width_;        // Field width.
    char fill_;        // Fill character used for padding.
    int precision_;    // Stream precision.

    constexpr FormattedNumber(VALUE v, bool h, int w, char f, int p)
        : value_(v), hex_(h), width_(w), fill_(f), precision_(p) {}

    // Return a copy with a different width, fill or precision.
    constexpr FormattedNumber width(int w) const { return FormattedNumber(value_, hex_, w, fill_, precision_); }
    constexpr FormattedNumber fill(char f) const { return FormattedNumber(value_, hex_, width_, f, precision_); }
    constexpr FormattedNumber precision(int p) const { return FormattedNumber(value_, hex_, width_, fill_, p); }

    // Return a copy that carries the value v.
    //
    // The result stores v in its own type instead of converting it to
    // VALUE. Otherwise 64-bit integers and floating point numbers would
    // be truncated to the VALUE of the template they were attached to
    // (e.g. int). The unary plus applies the usual integer promotions,
    // so types narrower than int (char, uint8_t, ...) are still stored
    // as int and therefore print as numbers rather than as characters.
    template <class NVALUE>
    constexpr auto val(NVALUE v) const {
        return FormattedNumber<decltype(+v)>(+v, hex_, width_, fill_, precision_);
    }
};
constexpr auto hex = FormattedNumber<int>(0, true, 0, '0', 6);
constexpr auto hex8 = FormattedNumber<int>(0, true, 2, '0', 6);
constexpr auto hex16 = FormattedNumber<int>(0, true, 4, '0', 6);
constexpr auto hex32 = FormattedNumber<int>(0, true, 8, '0', 6);
constexpr auto hex64 = FormattedNumber<int>(0, true, 16, '0', 6);
constexpr auto dec = FormattedNumber<int>(0, false, 0, ' ', 6);
2026-02-24 23:44:10 -05:00
// Encode a string as a token (fixed-width base38 number).
//
// The token always has 12 base38 digits, most significant first:
//   CH0*38^11 + CH1*38^10 + ... + CH11*38^0
// Each character of the string supplies one digit in the range 1-37;
// positions past the end of the string get digit 0 ("no character").
//
// Digit mapping: _ → 1, 0-9 → 2-11, a-z → 12-37.
// Uppercase A-Z are accepted and map to the same digits as a-z.
//
// Returns zero if the string is empty, longer than 12 characters,
// or contains any other character.
//
static constexpr uint64_t encode_token(std::string_view str) {
    constexpr std::size_t kTokenChars = 12;
    constexpr uint64_t kBase = 38;
    if (str.empty() || (str.size() > kTokenChars)) return 0;
    uint64_t token = 0;
    for (std::size_t pos = 0; pos < kTokenChars; ++pos) {
        // Positions beyond the string keep digit 0.
        uint64_t digit = 0;
        if (pos < str.size()) {
            const char ch = str[pos];
            if (ch == '_') {
                digit = 1;
            } else if (('0' <= ch) && (ch <= '9')) {
                digit = 2 + uint64_t(ch - '0');
            } else if (('a' <= ch) && (ch <= 'z')) {
                digit = 12 + uint64_t(ch - 'a');
            } else if (('A' <= ch) && (ch <= 'Z')) {
                digit = 12 + uint64_t(ch - 'A');
            } else {
                return 0;
            }
        }
        token = token * kBase + digit;
    }
    return token;
}
// Decode a token (base38 number) back to a string.
//
eng::string decode_token(uint64_t value);
2020-11-13 15:18:09 -05:00
} // namespace util
// Print a util::FormattedNumber.
//
// Applies the number's base, precision, fill and width to the stream
// and writes the value. Afterwards the stream is put back to decimal,
// ' ' fill and precision 6, so the formatting does not carry over
// into later output.
//
template<class VALUE>
inline std::ostream &operator<<(std::ostream &oss, util::FormattedNumber<VALUE> n) {
    const std::ios_base::fmtflags base = n.hex_ ? std::ios_base::hex : std::ios_base::dec;
    oss.setf(base, std::ios_base::basefield);
    oss.precision(n.precision_);
    oss.fill(n.fill_);
    oss.width(n.width_);
    oss << n.value_;
    // Restore the fixed "clean" state.
    oss.setf(std::ios_base::dec, std::ios_base::basefield);
    oss.fill(' ');
    oss.precision(6);
    return oss;
}
// Print a util::XYZ as "x,y,z" (no parentheses, no spaces).
inline std::ostream &operator<<(std::ostream &oss, const util::XYZ &xyz) {
    oss << xyz.x;
    oss << "," << xyz.y;
    oss << "," << xyz.z;
    return oss;
}
2021-07-21 16:10:29 -04:00
// Print a util::DXYZ as "x,y,z" (no parentheses, no spaces).
inline std::ostream &operator<<(std::ostream &oss, const util::DXYZ &xyz) {
    oss << xyz.x;
    oss << "," << xyz.y;
    oss << "," << xyz.z;
    return oss;
}