Files
integration/luprex/core/cpp/http.cpp
2022-04-25 13:43:11 -04:00

864 lines
25 KiB
C++

//
// Things to worry about:
// Expect: 100-Continue
#include "http.hpp"
#include "wrap-sstream.hpp"
#include "wrap-string.hpp"
#include "util.hpp"
#include "luastack.hpp"
#include <cstdint>
using string_view = std::string_view;
// Returns true when `v` consists of one or more words joined by single
// dashes, where each word is an ASCII letter followed by any number of
// ASCII letters or digits (e.g. "content-type").  A trailing dash or a
// word that does not open with a letter is rejected.
// NOTE(review): relies on sv::zfront() being safe to call on an empty
// view -- confirm against the sv helpers.
bool words_separated_by_dashes(string_view v) {
    for (;;) {
        // Each word has to open with a letter.
        const bool opens_with_letter = sv::ascii_isalpha(sv::zfront(v));
        if (!opens_with_letter) {
            return false;
        }
        v.remove_prefix(1);

        // Swallow the alphanumeric remainder of the word.
        while (sv::ascii_isalnum(sv::zfront(v))) {
            v.remove_prefix(1);
        }

        // Running out of input right after a word is the success case.
        if (v.empty()) {
            return true;
        }

        // Otherwise only a dash may separate this word from the next.
        if (sv::zfront(v) != '-') {
            return false;
        }
        v.remove_prefix(1);
    }
}
// Technically, this is a true, correct URL encode routine.
//
// Percent-encodes `value` for use as a URL parameter key or value:
// ASCII alphanumerics and "-_.~" pass through, a space becomes '+',
// and every other byte becomes %XX with uppercase hex digits.
static eng::string url_encode_param(string_view value) {
    eng::ostringstream result;
    const char *hexdigits = "0123456789ABCDEF";
    for (char c : value) {
        if (sv::ascii_isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
            result << c;
        } else if (c == ' ') {
            result << '+';
        } else {
            // Work on the unsigned byte value: where plain char is signed,
            // bytes >= 0x80 are negative and `c >> 4` would index before
            // the start of the hex table.
            const unsigned char byte = static_cast<unsigned char>(c);
            result << '%' << hexdigits[byte >> 4] << hexdigits[byte & 15];
        }
    }
    return result.str();
}
// This URL encode routine leaves slashes intact.  That's not
// technically correct, but it's really what you want for paths.
//
// ASCII alphanumerics, "-_.~" and '/' pass through, a space becomes
// '+', and every other byte becomes %XX with uppercase hex digits.
// NOTE(review): '+' is only a space in query strings; many servers treat
// a '+' in the path literally.  Kept as-is because url_decode() in this
// file maps '+' back to a space -- confirm whether "%20" is wanted here.
static eng::string url_encode_path(string_view value) {
    eng::ostringstream result;
    const char *hexdigits = "0123456789ABCDEF";
    for (char c : value) {
        if (sv::ascii_isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~' || c == '/') {
            result << c;
        } else if (c == ' ') {
            result << '+';
        } else {
            // Work on the unsigned byte value: where plain char is signed,
            // bytes >= 0x80 are negative and `c >> 4` would index before
            // the start of the hex table.
            const unsigned char byte = static_cast<unsigned char>(c);
            result << '%' << hexdigits[byte >> 4] << hexdigits[byte & 15];
        }
    }
    return result.str();
}
// Reverses URL encoding: '+' becomes a space and "%XY" becomes the byte
// whose value sv::to_hex64() reports for "XY".  If that value does not
// fit in a byte, a '?' is emitted instead.  A '%' that is not followed
// by at least two more characters is copied through unchanged.
static eng::string url_decode(string_view eurl) {
    eng::ostringstream decoded;
    const int length = eurl.size();
    int pos = 0;
    while (pos < length) {
        const char ch = eurl[pos];

        if (ch == '+') {
            decoded << ' ';
            pos += 1;
            continue;
        }

        // An escape needs two characters after the percent sign.
        const bool escape_fits = (pos + 2 < length);
        if ((ch == '%') && escape_fits) {
            const uint64_t byte = sv::to_hex64(eurl.substr(pos + 1, 2));
            if (byte > 255) {
                decoded << '?';
            } else {
                decoded << char(byte);
            }
            pos += 3;
            continue;
        }

        decoded << ch;
        pos += 1;
    }
    return decoded.str();
}
// Writes the encoded path to `sb`, followed by the url parameters as
// "?key=value&key=value..." with every key and value encoded.
static void send_encoded_path(std::string_view path, const UrlParameters &params, StreamBuffer *sb) {
    sb->write_bytes(url_encode_path(path));
    // The first parameter is introduced by '?', all later ones by '&'.
    char separator = '?';
    for (const auto &entry : params) {
        sb->write_char(separator);
        sb->write_bytes(url_encode_param(entry.first));
        sb->write_char('=');
        sb->write_bytes(url_encode_param(entry.second));
        separator = '&';
    }
}
// Writes "host" to `sb`, or "host:port" when `port` is nonzero.
static void send_host_and_port(std::string_view host, int port, StreamBuffer *sb) {
    sb->write_bytes(host);
    if (port == 0) {
        // No port to show.
        return;
    }
    sb->write_char(':');
    sb->ostream() << port;
}
// In a properly-formed url, the hostname and path are url encoded.
// This parser expects an encoded URL.
struct ParsedURL {
public:
bool valid;
eng::string proto;
eng::string host;
int port;
eng::string path;
UrlParameters params;
public:
void clear() {
valid = false;
proto.clear();
host.clear();
port = 0;
path.clear();
params.clear();
}
eng::string str() {
StreamBuffer sb;
sb.write_bytes(proto);
sb.write_bytes("://");
send_host_and_port(host, port, &sb);
send_encoded_path(path, params, &sb);
return eng::string(sb.view());
}
ParsedURL(std::string_view url) {
clear();
proto = util::ascii_tolower(sv::read_to_sep(url, ':'));
if (!sv::has_prefix(url, "//")) { clear(); return; }
url.remove_prefix(2);
if (!words_separated_by_dashes(proto)) { clear(); return; }
// Extract the host and port as a single string.
string_view turl = url;
string_view hostport = sv::read_to_sep(turl, '/');
url.remove_prefix(hostport.size());
// Split the host and port from each other and parse them.
host = util::ascii_tolower(sv::read_to_sep(hostport, ':'));
if (host.empty()) { clear(); return; }
if (!hostport.empty()) {
int64_t iport = sv::to_int64(hostport);
if ((iport < 1) || (iport > 65535)) {
clear(); return;
}
port = iport;
}
// Split off the path.
path = url_decode(sv::read_to_sep(url, '?'));
if (path.empty()) {
path = "/";
}
// Process url parameters.
while (!sv::isnull(url)) {
std::string_view keyval = sv::read_to_sep(url, '&');
if (keyval.empty()) { clear(); return; }
std::string_view key = sv::read_to_sep(keyval, '=');
if (key.empty()) { clear(); return; }
if (sv::isnull(keyval)) { clear(); return; }
eng::string dkey = url_decode(key);
eng::string dval = url_decode(keyval);
params[dkey] = dval;
}
// If we made it here, we have a valid URL
valid = true;
}
};
// A fresh request verifies certificates and has no port chosen yet.
HttpOutRequest::HttpOutRequest() {
    port_ = 0;
    verify_certificate_ = true;
}
// Records an error message.  Only the first error is kept; later calls
// leave it untouched.
void HttpOutRequest::fail(string_view s) {
    if (!error_.empty()) {
        return;
    }
    error_ = s;
}
// Builds the string "cert:host:port" or "nocert:host:port" describing
// where this request goes.  Asserts that check() reports no problem.
eng::string HttpOutRequest::target() const {
    assert(check().empty());
    const char *cert_mode = verify_certificate_ ? "cert" : "nocert";
    eng::ostringstream key;
    key << cert_mode;
    key << ':' << host_;
    key << ':' << port_;
    return key.str();
}
void HttpOutRequest::set_verify_certificate(bool flag) {
verify_certificate_ = flag;
}
// Sets the HTTP method (case-insensitive).  Only GET and HEAD are
// accepted.  Naming a second, different method records an error via
// fail(); repeating the same method is allowed.
void HttpOutRequest::set_method(const eng::string &s) {
    eng::string method = util::ascii_toupper(s);
    if ((method != "GET") && (method != "HEAD")) {
        // The two sentences are separate arguments, so the separating
        // space has to be spelled out.
        fail(util::ss("HTTP method not implemented: ", method, ". ",
                      "Currently, only HEAD and GET are implemented."));
        return;
    }
    if ((!method_.empty()) && (method_ != method)) {
        fail(util::ss("HTTP method specified twice: ", method_, " and ", method));
        return;
    }
    method_ = method;
}
// Sets the host name, lowercased.  The name must be non-empty, may only
// contain ASCII letters, digits, '-' and '.', and may only be set once.
// Violations are recorded via fail().
void HttpOutRequest::set_host(const eng::string &s) {
    const eng::string host = util::ascii_tolower(s);
    if (host.empty()) {
        fail(util::ss("HTTP hostname cannot be empty string."));
        return;
    }
    // This is not quite strict, but it's close.  I believe
    // the DNS lookup will fail for invalid hostnames anyway.
    for (const char ch : host) {
        const bool allowed = sv::ascii_isalnum(ch) || (ch == '-') || (ch == '.');
        if (!allowed) {
            fail(util::ss("HTTP hostnames can only contain letters, digits, and hyphen: ", host));
            return;
        }
    }
    if (!host_.empty()) {
        fail(util::ss("HTTP hostname specified twice: ", host_, " and ", host));
        return;
    }
    host_ = host;
}
// Sets the port.  It must lie in 1..65535 and may only be set once;
// violations are recorded via fail().
void HttpOutRequest::set_port(int port) {
    const bool in_range = (port >= 1) && (port <= 65535);
    if (!in_range) {
        fail(util::ss("HTTP port must be between 1 and 65535: ", port));
        return;
    }
    const bool already_set = (port_ != 0);
    if (already_set) {
        fail(util::ss("HTTP port specified twice: ", port_, " and ", port));
        return;
    }
    port_ = port;
}
// Sets the request path.  It must begin with '/' and may only be set
// once; violations are recorded via fail().
void HttpOutRequest::set_path(string_view path) {
    const bool rooted = sv::has_prefix(path, "/");
    if (!rooted) {
        fail(util::ss("HTTP path must start with slash"));
        return;
    }
    const bool already_set = !path_.empty();
    if (already_set) {
        fail(util::ss("HTTP path specified twice: ", path_, " and ", path));
        return;
    }
    path_ = path;
}
// Adds one url parameter.  A key that is already present, or an empty
// key, is recorded as an error via fail().
void HttpOutRequest::set_param(const eng::string &key, const eng::string &val) {
    const bool duplicate = (params_.find(key) != params_.end());
    if (duplicate) {
        fail(util::ss("HTTP url parameter specified twice: ", key));
        return;
    }
    if (key.empty()) {
        fail(util::ss("HTTP parameter key cannot be empty"));
        return;
    }
    params_[key] = val;
}
// Fills in host, port, path and parameters from a complete URL.  The
// URL must parse successfully and use the "https" protocol; otherwise
// an error is recorded via fail() and nothing else is set.
void HttpOutRequest::set_url(string_view url) {
    ParsedURL pieces(url);
    if (!pieces.valid) {
        fail(util::ss("syntactically invalid URL: ", url));
        return;
    }
    if (pieces.proto != "https") {
        fail(util::ss("unsupported protocol: ", pieces.proto));
        return;
    }
    set_host(pieces.host);
    // A port of zero means the URL did not name one.
    if (pieces.port) {
        set_port(pieces.port);
    }
    set_path(pieces.path);
    for (const auto &entry : pieces.params) {
        set_param(entry.first, entry.second);
    }
}
// Lua-facing variant: accepts only a boolean, otherwise records an error.
void HttpOutRequest::set_verify_certificate(LuaStack &LS, LuaSlot val) {
    if (LS.isboolean(val)) {
        set_verify_certificate(LS.ckboolean(val));
    } else {
        fail(util::ss("HTTP verify_certificate must be a boolean"));
    }
}
// Lua-facing variant: accepts only a string, otherwise records an error.
void HttpOutRequest::set_method(LuaStack &LS, LuaSlot val) {
    if (LS.isstring(val)) {
        set_method(LS.ckstring(val));
    } else {
        fail(util::ss("HTTP method must be a string"));
    }
}
// Lua-facing variant: accepts only a string, otherwise records an error.
void HttpOutRequest::set_host(LuaStack &LS, LuaSlot val) {
    if (LS.isstring(val)) {
        set_host(LS.ckstring(val));
    } else {
        fail(util::ss("HTTP host must be a string"));
    }
}
// Lua-facing variant: accepts only an int, otherwise records an error.
void HttpOutRequest::set_port(LuaStack &LS, LuaSlot val) {
    if (LS.isint(val)) {
        set_port(LS.ckint(val));
    } else {
        fail(util::ss("HTTP port must be an int"));
    }
}
// Lua-facing variant: accepts only a string, otherwise records an error.
void HttpOutRequest::set_path(LuaStack &LS, LuaSlot val) {
    if (LS.isstring(val)) {
        set_path(LS.ckstring(val));
    } else {
        fail(util::ss("HTTP path must be a string"));
    }
}
// Lua-facing variant: both key and value must be strings.  The key is
// checked first, so a bad key is the error reported when both are bad.
void HttpOutRequest::set_param(LuaStack &LS, LuaSlot key, LuaSlot val) {
    if (LS.isstring(key)) {
        if (LS.isstring(val)) {
            set_param(LS.ckstring(key), LS.ckstring(val));
        } else {
            fail(util::ss("HTTP url parameter val must be a string"));
        }
    } else {
        fail(util::ss("HTTP url parameter key must be a string"));
    }
}
// Adds every key/value pair of the Lua table `tab` as a url parameter.
// A non-table argument is recorded as an error via fail().
void HttpOutRequest::set_params(LuaStack &LS0, LuaSlot tab) {
    const bool is_table = LS0.istable(tab);
    if (!is_table) {
        fail(util::ss("HTTP params must be a table"));
        return;
    }
    LuaVar key, val;
    LuaStack LS(LS0.state(), key, val);
    // Start the traversal from a nil key.
    for (LS.set(key, LuaNil); LS.next(tab, key, val); ) {
        set_param(LS, key, val);
    }
}
// Lua-facing variant: accepts only a string, otherwise records an error.
void HttpOutRequest::set_url(LuaStack &LS, LuaSlot val) {
    if (LS.isstring(val)) {
        set_url(LS.ckstring(val));
    } else {
        fail(util::ss("HTTP url must be a string"));
    }
}
// Fills in the fields that have defaults and were left unset:
// port 443 and method GET.
void HttpOutRequest::set_defaults() {
    if (port_ == 0) {
        port_ = 443;
    }
    if (method_.empty()) {
        method_ = "GET";
    }
}
// Configures the request from a Lua table.  Recognized keys are
// "method", "host", "port", "path", "encodedpath", "params", "url" and
// "verifycertificate"; each is forwarded to the matching setter.  Any
// other key, a non-string key, or a non-table argument is recorded as
// an error via fail().
void HttpOutRequest::set_config(LuaStack &LS0, LuaSlot tab) {
    // Refuse to iterate something that is not a table, mirroring the
    // check set_params() performs on its argument.
    if (!LS0.istable(tab)) {
        fail(util::ss("HTTP config must be a table"));
        return;
    }
    LuaVar key, val;
    LuaStack LS(LS0.state(), key, val);
    LS.set(key, LuaNil);
    while (LS.next(tab, key, val)) {
        // Non-string keys leave kstr empty and are reported below.
        eng::string kstr;
        if (LS.isstring(key)) kstr = LS.ckstring(key);
        if (kstr == "method") {
            set_method(LS, val);
        } else if (kstr == "host") {
            set_host(LS, val);
        } else if (kstr == "port") {
            set_port(LS, val);
        } else if (kstr == "path") {
            set_path(LS, val);
        } else if (kstr == "encodedpath") {
            // NOTE(review): handled exactly like "path" (no decoding
            // step) -- confirm that this is intended.
            set_path(LS, val);
        } else if (kstr == "params") {
            set_params(LS, val);
        } else if (kstr == "url") {
            set_url(LS, val);
        } else if (kstr == "verifycertificate") {
            set_verify_certificate(LS, val);
        } else if (kstr == "") {
            fail(util::ss("HTTP config parameter names must be strings."));
        } else {
            fail(util::ss("HTTP unrecognized config parameter: ", kstr));
        }
    }
}
// Returns a description of the first problem with the request, or an
// empty string when it is ready to send.  A recorded error takes
// priority over missing fields.
eng::string HttpOutRequest::check() const {
    if (!error_.empty()) {
        return error_;
    }
    const char *problem = "";
    if (method_.empty()) {
        problem = "HTTP method has not been set";
    } else if (host_.empty()) {
        problem = "HTTP host has not been set";
    } else if (port_ == 0) {
        problem = "HTTP port has not been set";
    } else if (path_.empty()) {
        problem = "HTTP url has not been set";
    }
    return problem;
}
// Serializes the request into `sb`.
//
// With debug_string set, lines end in "\n", the closing blank line is
// left out, and an invalid request writes its error text instead of a
// request.  Otherwise lines end in "\r\n", the blank line is written,
// and an invalid request trips an assert.
void HttpOutRequest::send_internal(StreamBuffer *sb, bool debug_string) const {
    const eng::string problem = check();
    if (!debug_string) {
        // Production mode: an invalid request is a programming error.
        assert(problem.empty());
    } else if (!problem.empty()) {
        // Debug mode: show the error in place of the request.
        sb->write_bytes(problem);
        return;
    }

    // Choose a linebreak.
    const eng::string eol = debug_string ? "\n" : "\r\n";

    // Request line: METHOD SP encoded-path-and-params SP version.
    sb->write_bytes(method_);
    sb->write_char(' ');
    send_encoded_path(path_, params_, sb);
    sb->write_bytes(" HTTP/1.1");
    sb->write_bytes(eol);

    // Host header.
    sb->write_bytes("Host: ");
    send_host_and_port(host_, port_, sb);
    sb->write_bytes(eol);

    // The empty accept-encoding header notifies the
    // server that we don't support gzip, deflate, or
    // other content compression.
    sb->write_bytes("Accept-encoding:");
    sb->write_bytes(eol);

    // Add a user-agent header.  Not sure why.
    sb->write_bytes("User-agent: Mozilla 5.0 (luprex)");
    sb->write_bytes(eol);

    // The blank line ending the header block is only sent for real.
    if (debug_string) {
        return;
    }
    sb->write_bytes(eol);
}
// Renders the request (or its error text) in debug-string form.
eng::string HttpOutRequest::DebugString() {
    StreamBuffer rendered;
    const bool debug_mode = true;
    send_internal(&rendered, debug_mode);
    return eng::string(rendered.view());
}
// A fresh response has no status, no mime type and no known content
// length (-1).
HttpInResponse::HttpInResponse() {
    content_length_ = -1;
    mime_type_ = "";
    response_length_ = 0;
    status_code_ = 0;
}
// Renders the response fields as a multi-line, human-readable dump.
eng::string HttpInResponse::DebugString() const {
    eng::ostringstream dump;
    dump << "HttpInResponse:" << '\n'
         << "  status_code: " << status_code_ << '\n'
         << "  error: " << error_ << '\n'
         << "  content_length: " << content_length_ << '\n'
         << "  transfer_encoding: " << transfer_encoding_ << '\n'
         << "  location: " << location_ << '\n'
         << "  mime_type: " << mime_type_ << '\n'
         << "  charset: " << charset_ << '\n'
         << "  content: " << content_ << '\n'
         << "  response_length: " << response_length_ << '\n';
    return dump.str();
}
// Puts the response into a failed state: stores the status code and
// message and discards mime type, charset and content.
void HttpInResponse::fail(int code, string_view message) {
    content_ = "";
    charset_ = "";
    mime_type_ = "";
    error_ = message;
    status_code_ = code;
}
// Reports that the data ran out mid-response.  On a closed connection
// that is a truncation (status 500); otherwise more data may still
// arrive (status 0).
void HttpInResponse::incomplete(bool closed) {
    if (!closed) {
        fail(0, "response not yet fully received");
        return;
    }
    fail(500, "response truncated");
}
// Remembers the content-encoding header value, lowercased.
void HttpInResponse::parse_content_encoding(string_view value) {
    this->content_encoding_ = util::ascii_tolower(value);
}
// Parses the content-length header value.  A value outside 0..INT_MAX
// fails the response with status 500 and leaves content_length_ as it
// was.
void HttpInResponse::parse_content_length(string_view value) {
    int64_t code = sv::to_int64(value);
    if ((code < 0) || (code > INT_MAX)) {
        fail(500, util::ss("unparseable content-length: ", value));
        // Do not store the rejected value: it could be negative or be
        // truncated by the narrowing assignment below.
        return;
    }
    content_length_ = code;
}
// Parses a content-type header value such as "text/html; charset=utf-8".
// The value is lowercased; the part before the first ';' becomes the
// mime type and a "charset" parameter, if present, becomes the charset.
// An empty mime type fails the response with status 500.
void HttpInResponse::parse_content_type(string_view value) {
    eng::string ctype = util::ascii_tolower(value);
    string_view ctview(ctype);
    mime_type_ = sv::trim(sv::read_to_sep(ctview, ';'));
    if (mime_type_.empty()) {
        fail(500, util::ss("unparseable content-type: ", value));
        return;
    }
    while (true) {
        string_view feature = sv::trim(sv::read_to_sep(ctview, ';'));
        // An empty parameter ends the scan.
        if (feature.empty()) {
            return;
        }
        // After this call `feature` holds whatever follows the '='.
        string_view ftype = sv::trim(sv::read_to_sep(feature, '='));
        if (ftype == "charset") {
            string_view charset = sv::trim(feature);
            // Parameter values may be written as a quoted string
            // (charset="utf-8"); drop the surrounding quotes so that
            // form compares equal to the bare token.
            if ((charset.size() >= 2) && (charset.front() == '"') && (charset.back() == '"')) {
                charset.remove_prefix(1);
                charset.remove_suffix(1);
            }
            charset_ = charset;
        }
    }
}
// Remembers the location header value after url-decoding it.
void HttpInResponse::parse_location(string_view value) {
    this->location_ = url_decode(value);
}
// Remembers the transfer-encoding header value, lowercased.
void HttpInResponse::parse_transfer_encoding(string_view value) {
    this->transfer_encoding_ = util::ascii_tolower(value);
}
// Dispatches one header to its parser.  `header` is compared against
// lowercase names.  A content-range header fails the response; any
// other unlisted header is ignored.
void HttpInResponse::parse_header(string_view header, string_view value) {
    if (header == "content-encoding") {
        parse_content_encoding(value);
        return;
    }
    if (header == "content-length") {
        parse_content_length(value);
        return;
    }
    if (header == "content-type") {
        parse_content_type(value);
        return;
    }
    if (header == "location") {
        parse_location(value);
        return;
    }
    if (header == "transfer-encoding") {
        parse_transfer_encoding(value);
        return;
    }
    if (header == "content-range") {
        fail(500, util::ss("unsupported response header: ", header));
    }
}
// Reads a body that uses no transfer encoding from `view` into
// content_.  With a known content length exactly that many bytes are
// taken; without one, everything is taken once the connection has
// closed.  Returns false after failing the response or marking it
// incomplete.
bool HttpInResponse::parse_content_basic(std::string_view &view, bool closed) {
    const bool length_known = (content_length_ >= 0);

    if (!length_known) {
        // No length given: the body runs until the connection closes.
        if (int64_t(view.size()) > MAX_CONTENT_LENGTH) {
            fail(500, "content too long");
            return false;
        }
        if (!closed) {
            incomplete(closed);
            return false;
        }
        content_ = sv::read_nbytes(view, view.size());
        return true;
    }

    if (content_length_ > MAX_CONTENT_LENGTH) {
        fail(500, "content too long");
        return false;
    }
    if (int(view.size()) < content_length_) {
        // Not all of the announced bytes are here yet.
        incomplete(closed);
        return false;
    }
    content_ = sv::read_nbytes(view, content_length_);
    return true;
}
bool HttpInResponse::parse_content_chunked(std::string_view &view, bool closed) {
int64_t total_size = 0;
std::vector<string_view> chunks;
while (true) {
std::string_view chunk_header = sv::trim(sv::read_to_line(view));
if (sv::isnull(view)) {
incomplete(closed);
return false;
}
int64_t chunk_size = sv::to_hex64(chunk_header, -1);
if (chunk_size < 0) {
fail(500, "unparseable chunk header");
return false;
}
if (chunk_size > MAX_CONTENT_LENGTH) {
fail(500, "content too long");
return false;
}
if (chunk_size == 0) break;
total_size += chunk_size;
if (total_size > MAX_CONTENT_LENGTH) {
fail(500, "content too long");
return false;
}
std::string_view chunk = sv::read_nbytes(view, chunk_size);
if (int64_t(chunk.size()) != chunk_size) {
incomplete(closed);
return false;
}
std::string_view newline = sv::read_to_line(view);
if (!newline.empty()) {
fail(500, "corrupted chunk encoding");
return false;
}
if (sv::isnull(view)) {
incomplete(closed);
return false;
}
chunks.push_back(chunk);
}
content_.resize(total_size);
size_t offset = 0;
for (string_view chunk : chunks) {
content_.replace(offset, chunk.size(), chunk);
offset += chunk.size();
}
return true;
}
void HttpInResponse::parse(const StreamBuffer *sb, bool closed) {
// We're not going to modify the StreamBuffer at all.
// Instead, we work entirely on a view.
string_view view = sb->view();
// Get the status line.
string_view status = sv::trim(sv::read_to_line(view));
if (sv::isnull(view)) {
incomplete(closed);
return;
}
// Parse the status line.
string_view scode = sv::read_to_space(status);
int64_t code = sv::to_int64(scode, 0);
if ((code < 100) || (code > 599)) {
fail(500, util::ss("protocol error: invalid response code: ", scode));
}
status_code_ = code;
// Responses outside the range 200-299 are errors,
// and therefore must store an error message.
if ((code < 200) || (code > 299)) {
error_ = status;
if (error_.empty()) {
fail(code, util::ss("error code ", code));
}
}
// Parse the headers.
while (true) {
string_view header = sv::read_to_line(view);
if (sv::isnull(view)) {
incomplete(closed);
return;
}
if (header.empty()) {
break;
}
eng::string command = util::ascii_tolower(sv::trim(sv::read_to_sep(header, ':')));
if (sv::isnull(header)) {
fail(500, util::ss("protocol error: no colon in header line: ", command));
return;
}
if (!words_separated_by_dashes(command)) {
fail(500, util::ss("protocol error: invalid header: ", command));
return;
}
parse_header(command, sv::trim(header));
}
// Process the content using the transfer encoding.
if (transfer_encoding_ == "") {
if (!parse_content_basic(view, closed)) return;
} else if (transfer_encoding_ == "chunked") {
if (!parse_content_chunked(view, closed)) return;
} else {
fail(500, util::ss("unsupported transfer-encoding: ", transfer_encoding_));
return;
}
// Calculate the response length.
response_length_ = sb->fill() - view.size();
// If it's not a redirect, disallow 'location'.
if ((status_code_ < 300) || (status_code_ > 399)) {
if (!location_.empty()) {
fail(500, util::ss("redirect specified, but result code not 300-399: ", code));
return;
}
}
// If the server didn't specify content-type, make a guess.
if (mime_type_.empty()) {
if (sv::valid_utf8(content_)) {
mime_type_ = "text/plain";
charset_ = "utf-8";
} else {
mime_type_ = "application/octet-stream";
charset_ = "";
}
}
// If it's multipart, reject it.
if (sv::has_prefix(mime_type_, "multipart/")) {
fail(500, "multipart messages not implemented");
return;
}
// If it's text, demand a reasonable charset. Otherwise,
// ignore the charset.
if (sv::has_prefix(mime_type_, "text/")) {
if (charset_.empty()) {
charset_ = "utf-8";
}
if (charset_ != "utf-8") {
fail(500, util::ss("charset not supported: ", charset_));
return;
}
} else {
charset_.clear();
}
// Uncompress the content.
if ((content_encoding_ == "") || (content_encoding_ == "identity")) {
} else {
fail(500, util::ss("content-encoding not supported: ", content_encoding_));
return;
}
// If there's an error code, throw out the content.
if ((status_code_ < 200) || (status_code_ > 299)) {
mime_type_.clear();
charset_.clear();
content_.clear();
}
}
// Creates a new Lua table in `tab` describing this response.
// "responsecode" is always present; "error" and "location" only when
// non-empty; "mimetype" and "content" only when a mime type is known.
void HttpInResponse::store(LuaStack &LS0, LuaSlot tab) {
    LuaStack LS(LS0.state());
    LS.newtable(tab);
    LS.rawset(tab, "responsecode", status_code_);

    const bool has_error = !error_.empty();
    if (has_error) {
        LS.rawset(tab, "error", error_);
    }
    const bool has_location = !location_.empty();
    if (has_location) {
        LS.rawset(tab, "location", location_);
    }
    const bool has_body = !mime_type_.empty();
    if (has_body) {
        LS.rawset(tab, "mimetype", mime_type_);
        LS.rawset(tab, "content", content_);
    }

    // Debugging fields.  Do not use for lua programming.
    LS.rawset(tab, "dbg-content-length", content_length_);
    LS.rawset(tab, "dbg-transfer-encoding", transfer_encoding_);
    LS.rawset(tab, "dbg-charset", charset_);
    LS.rawset(tab, "dbg-response-length", response_length_);
}
// Lua binding: parses the given URL and returns it re-assembled from
// its parsed parts; raises a Lua error when the URL does not parse.
LuaDefine(http_fixurl, "url", "validate URL and repair minor flaws in the URL syntax") {
    LuaArg url;
    LuaRet fixed;
    LuaStack LS(L, url, fixed);
    ParsedURL pieces(LS.ckstring(url));
    if (pieces.valid) {
        LS.set(fixed, pieces.str());
    } else {
        luaL_error(L, "invalid URL, not fixable");
    }
    return LS.result();
}
// Lua binding: builds an HttpOutRequest from the config table, applies
// the defaults, and returns the request rendered as a debug string.  A
// Lua error is raised when the request fails its check.
// NOTE(review): the help text below says `url` may start with '/' and
// must not carry parameters, but HttpOutRequest::set_url() only accepts
// a full https:// URL and does parse "?key=val" parameters; the
// path-only form is the separate "path" key.  Confirm and refresh the
// text.
LuaDefine(http_request, "reqtab",
          "|Given an HTTP request in the form of a table, returns the same "
          "|request as a string, to assist with debugging."
          "|"
          "|The table can contain:"
          "|"
          "| method (ie, GET, HEAD, POST, etc)"
          "| host (ie, 'google.com')"
          "| port (default: 443)"
          "| url (ie, '/index.html')"
          "| params (a table of url parameters)"
          "| verifycertificate (default: true)"
          "|"
          "|The url can start with 'https://', or with '/'. If it starts"
          "|with 'https://', then the URL includes the host and port, which"
          "|then must not be specified separately."
          "|"
          "|Note that plain HTTP is not supported - we only allow HTTPS."
          "|However, you can talk to a server that has a dummy certificate"
          "|by specifying verifycertificate=false."
          "|"
          "|This module will automatically url encode everything for you."
          "|Therefore, you shouldn't url encode anything, otherwise,"
          "|you'll end up double-encoding."
          "|"
          "|You cannot include url parameters as part of the url. If you try,"
          "|then your ?, &, and = characters will get url encoded, which will"
          "|cause them to not function. To use url parameters, you must"
          "|use the separate params table."
          "|") {
    LuaArg tab;
    LuaRet str;
    LuaStack LS(L, tab, str);

    HttpOutRequest req;
    req.set_config(LS, tab);
    req.set_defaults();

    const eng::string problem = req.check();
    if (problem.empty()) {
        LS.set(str, req.DebugString());
        return LS.result();
    }
    luaL_error(L, "%s", problem.c_str());
    return 0;
}
// Lua binding: parses the given text as a complete HTTP response (the
// connection is treated as closed) and returns the result as a table.
LuaDefine(http_response, "text", "") {
    LuaArg text;
    LuaRet tab;
    LuaStack LS(L, text, tab);

    // Load the text into a buffer, since the parser reads from one.
    StreamBuffer buffer;
    buffer.write_bytes(LS.ckstring(text));

    HttpInResponse response;
    const bool connection_closed = true;
    response.parse(&buffer, connection_closed);
    response.store(LS, tab);
    return LS.result();
}