From 4466921c3df85bd46095e224088443154394587c Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Mon, 4 Jul 2022 14:49:42 +0200 Subject: [PATCH] Support for POST, better logging and error handling, sparql-results+json now default, command-line argument --access-token (#657) 1. There is now also proper support for POST queries. The POST queries allow both variants specified in the SPARQL 1.1 standard: with a URL-encoded query string ("query=...") and with content type "application/sparql-query" and just the unencoded SPARQL query in the body of the POST request. 2. Make the logging and handling of errors more consistent. The standard should be that the error message in the log is equivalent to that sent to the client. Use the occasion to improve the server logging in general. For example, the server now shows the method of the query, the requested media type, the time needed for query planning, and the total query processing time. Also, the query planning now comes after determining the media type, as it should. 3. Make sparql-results+json the default media type, as required by the standard. This is possible now because the latest version of the QLever UI explicitly asks for qlever-results+json for every SPARQL query it sends. 4. Add a command-line argument for specifying the access token to be used for restricted API calls like clear-cache-complete. This was simple enough and completes work from an earlier PR. Also support an explicit "ping" command. 
--- e2e/e2e.sh | 4 +- src/ServerMain.cpp | 7 +- src/engine/Operation.cpp | 27 ++- src/engine/QueryExecutionTree.cpp | 16 +- src/engine/QueryExecutionTree.h | 5 +- src/engine/ResultTable.h | 11 +- src/engine/Server.cpp | 373 +++++++++++++++++++++--------- src/engine/Server.h | 19 +- src/util/HttpServer/HttpServer.h | 32 +-- src/util/HttpServer/HttpUtils.h | 34 +-- src/util/HttpServer/UrlParser.cpp | 60 +++-- src/util/HttpServer/UrlParser.h | 35 ++- src/util/Timer.h | 10 +- 13 files changed, 426 insertions(+), 207 deletions(-) diff --git a/e2e/e2e.sh b/e2e/e2e.sh index 3c86c540b7..fc37f09dff 100755 --- a/e2e/e2e.sh +++ b/e2e/e2e.sh @@ -112,8 +112,8 @@ if [ ${REBUILD_THE_INDEX} == "YES" ] || ! [ -f "${INDEX}.vocabulary" ]; then popd fi -# Launch the Server using the freshly baked index. Can't simply use a subshell here because -# then we can't easily get the SERVER_PID out of that subshell +# Launch the Server using the freshly baked index. Can't simply use a subshell +# here because then we can't easily get the SERVER_PID out of that subshell pushd "$BINARY_DIR" echo "Launching server from path $(pwd)" ./ServerMain -i "$INDEX" -p 9099 -m 1 -t &> server_log.txt & diff --git a/src/ServerMain.cpp b/src/ServerMain.cpp index b1816a160d..a717e67c0b 100644 --- a/src/ServerMain.cpp +++ b/src/ServerMain.cpp @@ -39,7 +39,8 @@ int main(int argc, char** argv) { // filled / set depending on the options. 
using ad_utility::NonNegative; - string indexBasename; + std::string indexBasename; + std::string accessToken; bool text = false; int port; NonNegative numSimultaneousQueries = 1; @@ -62,6 +63,8 @@ int main(int argc, char** argv) { "The basename of the index files (required)."); add("port,p", po::value(&port)->required(), "The port on which HTTP requests are served (required)."); + add("access-token,a", po::value(&accessToken)->default_value(""), + "Access token for restricted API calls (default: no access)."); add("num-simultaneous-queries,j", po::value(&numSimultaneousQueries)->default_value(1), "The number of queries that can be processed simultaneously."); @@ -123,7 +126,7 @@ int main(int argc, char** argv) { try { Server server(port, static_cast(numSimultaneousQueries), - memoryMaxSizeGb); + memoryMaxSizeGb, std::move(accessToken)); server.run(indexBasename, text, !noPatterns, !noPatternTrick, !onlyPsoAndPosPermutations); } catch (const std::exception& e) { diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index 84ae835d98..dca61f0e90 100644 --- a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -56,9 +56,8 @@ void Operation::recursivelySetTimeoutTimer( } } -// Get the result for the subtree rooted at this element. -// Use existing results if they are already available, otherwise -// trigger computation. +// Get the result for the subtree rooted at this element. Use existing results +// if they are already available, otherwise trigger computation. 
shared_ptr Operation::getResult(bool isRoot) { ad_utility::Timer timer; timer.start(); @@ -114,30 +113,34 @@ shared_ptr Operation::getResult(bool isRoot) { timer.stop(); createRuntimeInformation(result, timer.msecs()); + auto resultNumRows = result._resultPointer->_resultTable->size(); + auto resultNumCols = result._resultPointer->_resultTable->width(); + LOG(DEBUG) << "Computed result of size " << resultNumRows << " x " + << resultNumCols << std::endl; return result._resultPointer->_resultTable; } catch (const ad_semsearch::AbortException& e) { // A child Operation was aborted, do not print the information again. throw; } catch (const ad_utility::WaitedForResultWhichThenFailedException& e) { + // Here and in the following, show the detailed information (it's the + // runtime info) only in the DEBUG log. Note that the exception will be + // caught by the `processQuery` method, where the error message will be + // printed *and* included in an error response sent to the client. LOG(ERROR) << "Waited for a result from another thread which then failed" << endl; - LOG(ERROR) << asString(); + LOG(DEBUG) << asString(); throw ad_semsearch::AbortException(e); } catch (const std::exception& e) { // We are in the innermost level of the exception, so print - LOG(ERROR) << "Aborted Operation:" << endl; - LOG(ERROR) << asString() << endl; - LOG(ERROR) << e.what() << endl; + LOG(ERROR) << "Aborted Operation" << endl; + LOG(DEBUG) << asString() << endl; // Rethrow as QUERY_ABORTED allowing us to print the Operation // only at innermost failure of a recursive call throw ad_semsearch::AbortException(e); } catch (...) 
{ // We are in the innermost level of the exception, so print - LOG(ERROR) << "Aborted Operation:" << endl; - LOG(ERROR) << asString() << endl; - LOG(ERROR) - << "Unexpected exception that is not a subclass of std::exception" - << endl; + LOG(ERROR) << "Aborted Operation" << endl; + LOG(DEBUG) << asString() << endl; // Rethrow as QUERY_ABORTED allowing us to print the Operation // only at innermost failure of a recursive call throw ad_semsearch::AbortException( diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index 44b127a122..cc083077cd 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -401,7 +401,9 @@ ad_utility::streams::stream_generator QueryExecutionTree::generateResults( // This call triggers the possibly expensive computation of the query result // unless the result is already cached. shared_ptr resultTable = getResult(); - LOG(DEBUG) << "Resolving strings for finished binary result...\n"; + resultTable->logResultSize(); + LOG(DEBUG) << "Converting result IDs to their corresponding strings ..." 
+ << std::endl; auto selectedColumnIndices = selectedVariablesToColumnIndices( selectedVarsOrAsterisk, *resultTable, true); @@ -518,8 +520,10 @@ QueryExecutionTree::generateRdfGraph( // _____________________________________________________________________________ ad_utility::streams::stream_generator QueryExecutionTree::writeRdfGraphTurtle( const ad_utility::sparql_types::Triples& constructTriples, size_t limit, - size_t offset, std::shared_ptr res) const { - auto generator = generateRdfGraph(constructTriples, limit, offset, res); + size_t offset, std::shared_ptr resultTable) const { + resultTable->logResultSize(); + auto generator = + generateRdfGraph(constructTriples, limit, offset, resultTable); for (const auto& triple : generator) { co_yield triple._subject; co_yield ' '; @@ -535,7 +539,7 @@ template ad_utility::streams::stream_generator QueryExecutionTree::writeRdfGraphSeparatedValues( const ad_utility::sparql_types::Triples& constructTriples, size_t limit, - size_t offset, std::shared_ptr res) const { + size_t offset, std::shared_ptr resultTable) const { static_assert(format == ExportSubFormat::BINARY || format == ExportSubFormat::CSV || format == ExportSubFormat::TSV); @@ -543,11 +547,13 @@ QueryExecutionTree::writeRdfGraphSeparatedValues( throw std::runtime_error{ "Binary export is not supported for CONSTRUCT queries"}; } + resultTable->logResultSize(); constexpr auto& escapeFunction = format == ExportSubFormat::TSV ? RdfEscaping::escapeForTsv : RdfEscaping::escapeForCsv; constexpr char sep = format == ExportSubFormat::TSV ? 
'\t' : ','; - auto generator = generateRdfGraph(constructTriples, limit, offset, res); + auto generator = + generateRdfGraph(constructTriples, limit, offset, resultTable); for (auto& triple : generator) { co_yield escapeFunction(std::move(triple._subject)); co_yield sep; diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 722624f229..a86eb06e14 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -23,9 +23,8 @@ using std::shared_ptr; using std::string; -// A query execution tree. -// Processed bottom up, this gives an ordering to the operations -// needed to solve a query. +// A query execution tree. Processed bottom up, which gives an ordering to the +// operations needed to solve a query. class QueryExecutionTree { public: explicit QueryExecutionTree(QueryExecutionContext* const qec); diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h index 6971ddf7c2..0e4c29026e 100644 --- a/src/engine/ResultTable.h +++ b/src/engine/ResultTable.h @@ -74,6 +74,15 @@ class ResultTable { size_t size() const; size_t width() const { return _idTable.cols(); } + // Log to INFO the size of this result. + // + // NOTE: Due to the current sub-optimal design of `Server::processQuery`, we + // need the same message in multiple places and so instead of duplicating the + // message, we should have a method for it. + void logResultSize() const { + LOG(INFO) << "Result has size " << size() << " x " << width() << std::endl; + } + void clear(); string asDebugString() const; @@ -86,4 +95,4 @@ class ResultTable { } private: -}; \ No newline at end of file +}; diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index ae88bcf447..abd773f361 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -36,19 +36,47 @@ void Server::initialize(const string& indexBaseName, bool useText, // Set flag. _initialized = true; - LOG(INFO) << "The server is ready, listening for requests on port " << _port - << " ..." 
<< std::endl; + LOG(INFO) << "Access token for restricted API calls is \"" << accessToken_ + << "\"" << std::endl; + LOG(INFO) << "The server is ready, listening for requests on port " + << std::to_string(_port) << " ..." << std::endl; } // _____________________________________________________________________________ void Server::run(const string& indexBaseName, bool useText, bool usePatterns, bool usePatternTrick, bool loadAllPermutations) { - // First set up the HTTP server, so that it binds to the socket, and - // the "socket already in use" error appears quickly. + using namespace ad_utility::httpUtils; + + // Function that handles a request asynchronously, will be passed as argument + // to `HttpServer` below. auto httpSessionHandler = [this](auto request, auto&& send) -> boost::asio::awaitable { - co_await process(std::move(request), send); + // Version of send with maximally permissive CORS header (that allows the + // client that receives the response to do with it what it wants). + auto sendWithCors = [&send](auto response) -> boost::asio::awaitable { + response.set(http::field::access_control_allow_origin, "*"); + co_return co_await send(std::move(response)); + }; + // Process the request using the `process` method and if it throws an + // exception, log the error message and send a HTTP/1.1 400 Bad Request + // response with that message. Note that the C++ standard forbids co_await + // in the catch block, hence the workaround with the `exceptionErrorMsg`. 
+ std::optional exceptionErrorMsg; + try { + co_await process(std::move(request), sendWithCors); + } catch (const std::exception& e) { + exceptionErrorMsg = e.what(); + } + if (exceptionErrorMsg) { + LOG(ERROR) << exceptionErrorMsg.value() << std::endl; + auto badRequestResponse = createBadRequestResponse( + absl::StrCat(exceptionErrorMsg.value(), "\n"), request); + co_await sendWithCors(std::move(badRequestResponse)); + } }; + + // First set up the HTTP server, so that it binds to the socket, and + // the "socket already in use" error appears quickly. auto httpServer = HttpServer{static_cast(_port), "0.0.0.0", _numThreads, std::move(httpSessionHandler)}; @@ -60,65 +88,134 @@ void Server::run(const string& indexBaseName, bool useText, bool usePatterns, httpServer.run(); } +// _____________________________________________________________________________ +ad_utility::UrlParser::UrlPathAndParameters Server::getUrlPathAndParameters( + const ad_utility::httpUtils::HttpRequest auto& request) { + if (request.method() == http::verb::get) { + // For a GET request, `request.target()` yields the part after the domain, + // which is a concatenation of the path and the query string (the query + // string starting with "?"). + return ad_utility::UrlParser::parseGetRequestTarget(request.target()); + } + if (request.method() == http::verb::post) { + // For a POST request, the content type *must* be either + // "application/x-www-form-urlencoded" or "application/sparql-query". In + // the first case, the body of the POST request contains a URL-encoded + // query (just like in the part of a GET request after the "?"). In the + // second case, the body of the POST request contains *only* the SPARQL + // query, but not URL-encoded, and no other URL parameters. 
See Sections + // 2.1.2 and 2.1.3 of the SPARQL 1.1 standard: + // https://www.w3.org/TR/2013/REC-sparql11-protocol-20130321 + std::string_view contentType = request.base()[http::field::content_type]; + LOG(DEBUG) << "Content-type: \"" << contentType << "\"" << std::endl; + static constexpr std::string_view contentTypeUrlEncoded = + "application/x-www-form-urlencoded"; + static constexpr std::string_view contentTypeSparqlQuery = + "application/sparql-query"; + + // In either of the two cases explained above, we convert the data to a + // format as if it came from a GET request. The second argument to + // `parseGetRequestTarget` says whether the function should apply URL + // decoding. + if (contentType == contentTypeUrlEncoded) { + return ad_utility::UrlParser::parseGetRequestTarget( + absl::StrCat(request.target(), "?", request.body()), true); + } + if (contentType == contentTypeSparqlQuery) { + return ad_utility::UrlParser::parseGetRequestTarget( + absl::StrCat(request.target(), "?query=", request.body()), false); + } + throw std::runtime_error( + absl::StrCat("POST request with content type \"", contentType, + "\" not supported (must be \"", contentTypeUrlEncoded, + "\" or \"", contentTypeSparqlQuery, "\")")); + } + std::ostringstream requestMethodName; + requestMethodName << request.method(); + throw std::runtime_error( + absl::StrCat("Request method \"", requestMethodName.str(), + "\" not supported (has to be GET or POST)")); +}; + // _____________________________________________________________________________ Awaitable Server::process( const ad_utility::httpUtils::HttpRequest auto& request, auto&& send) { using namespace ad_utility::httpUtils; + + // Log some basic information about the request. Start with an empty line so + // that in a low-traffic scenario (or when the query processing is very fast), + // we have one visual block per request in the log. 
+ std::string_view contentType = request.base()[http::field::content_type]; + LOG(INFO) << std::endl; + LOG(INFO) << "Request received via " << request.method() + << (contentType.empty() + ? absl::StrCat(", no content type specified") + : absl::StrCat(", content type \"", contentType, "\"")) + << std::endl; + + // Start timing. ad_utility::Timer requestTimer; requestTimer.start(); - auto filenameAndParams = ad_utility::UrlParser::parseTarget(request.target()); - const auto& params = filenameAndParams._parameters; + // Parse the path and the URL parameters from the given request. Works for GET + // requests as well as the two kinds of POST requests allowed by the SPARQL + // standard, see method `getUrlPathAndParameters`. + const auto urlPathAndParameters = getUrlPathAndParameters(request); + const auto& parameters = urlPathAndParameters._parameters; - // Lambda for sending a response asynchronously. Will be called with `co_await - // sendWithCors(...)`, see below. - auto sendWithCors = [&send](auto response) -> boost::asio::awaitable { - response.set(http::field::access_control_allow_origin, "*"); - co_return co_await send(std::move(response)); - }; - - // Lambda that checks if a URL parameter with the given specification exists - // and we are allowed to perform the associated action. - // - // 1. If value is `std::nullopt`, check if the given key exists, and if yes, - // return the corresponding value, and `std::nullopt` otherwise. - // - // 2. If value is not `std::nullopt`, check if the given key-value pair - // exists, and if yes, return the value (not really needed, but just so that - // the interface is the same as in the first case), and `std::nullopt` - // otherwise. + // Lambda for checking if a URL parameter exists in the request and if we are + // allowed to access it. If yes, return the value, otherwise return + // `std::nullopt`. // - // 3. 
If one of the two above would have returned a value, but the third - // argument is false, print a log message that the access to this URL - // parameter is denied and return `std::nullopt`. - auto checkParam = - [&params](const std::string key, std::optional value, - bool accessAllowed = true) -> std::optional { - // If key not found, always return std::nullopt. - if (!params.contains(key)) { + // If `value` is `std::nullopt`, only check if the key exists. We need this + // because we have parameters like "cmd=stats", where a fixed combination of + // the key and value determines the kind of action, as well as parameters + // like "index-description=...", where the key determines the kind of action. + auto checkParameter = + [&parameters](const std::string key, std::optional value, + bool accessAllowed = true) -> std::optional { + // If the key is not found, always return std::nullopt. + if (!parameters.contains(key)) { return std::nullopt; } // If value is given, but not equal to param value, return std::nullopt. If // no value is given, set it to param value. if (value == std::nullopt) { - value = params.at(key); - } else if (value != params.at(key)) { + value = parameters.at(key); + } else if (value != parameters.at(key)) { return std::nullopt; } - // At this point, we have a value. Either return it or say access denied. + // Now that we have the value, check if there is a problem with the access. + // If yes, we abort the query processing at this point. 
if (accessAllowed == false) { - LOG(INFO) << "Access to \"" << key << "=" << value.value() - << "\" is denied, this URL parameter is ignored" << std::endl; - return std::nullopt; - } else { - return value; + throw std::runtime_error(absl::StrCat("Access to \"", key, "=", + value.value(), "\" denied", + " (requires a valid access token)", + ", processing of request aborted")); } + return value; }; - auto accessToken = checkParam("access-token", std::nullopt); - auto accessTokenOk = accessToken == "1622"; - if (accessToken && !accessTokenOk) { - LOG(INFO) << "Access token \"access-token=" << accessToken.value() << "\"" - << " provided, but not correct" << std::endl; + + // Check the access token. If an access token is provided and the check fails, + // throw an exception and do not process any part of the query (even if the + // processing would have been allowed without access token). + auto accessToken = checkParameter("access-token", std::nullopt); + bool accessTokenOk = false; + if (accessToken) { + auto accessTokenProvidedMsg = absl::StrCat( + "Access token \"access-token=", accessToken.value(), "\" provided"); + auto requestIgnoredMsg = ", request is ignored"; + if (accessToken_.empty()) { + throw std::runtime_error(absl::StrCat( + accessTokenProvidedMsg, + " but server was started without --access-token", requestIgnoredMsg)); + } else if (accessToken != accessToken_) { + throw std::runtime_error(absl::StrCat( + accessTokenProvidedMsg, " but not correct", requestIgnoredMsg)); + } else { + LOG(DEBUG) << accessTokenProvidedMsg << " and correct" << std::endl; + accessTokenOk = true; + } } // Process all URL parameters known to QLever. If there is more than one, @@ -131,29 +228,45 @@ Awaitable Server::process( std::optional> response; // Execute commands (URL parameter with key "cmd"). 
- if (checkParam("cmd", "stats")) { - LOG(INFO) << "Processing command \"stats\"" << std::endl; + auto logCommand = [](std::optional& cmd, std::string actionMsg) { + LOG(INFO) << "Processing command \"" << cmd.value() << "\"" + << ": " << actionMsg << std::endl; + }; + if (auto cmd = checkParameter("cmd", "stats")) { + logCommand(cmd, "get index statistics"); response = createJsonResponse(composeStatsJson(), request); - } else if (checkParam("cmd", "cache-stats")) { - LOG(INFO) << "Processing command \"cache-stats\"" << std::endl; + } else if (auto cmd = checkParameter("cmd", "cache-stats")) { + logCommand(cmd, "get cache statistics"); response = createJsonResponse(composeCacheStatsJson(), request); - } else if (checkParam("cmd", "clear-cache")) { - LOG(INFO) << "Clearing the cache, unpinned elements only" << std::endl; + } else if (auto cmd = checkParameter("cmd", "clear-cache")) { + logCommand(cmd, "clear the cache (unpinned elements only)"); _cache.clearUnpinnedOnly(); response = createJsonResponse(composeCacheStatsJson(), request); - } else if (checkParam("cmd", "clear-cache-complete", accessTokenOk)) { - LOG(INFO) << "Clearing the cache completely, including unpinned elements" - << std::endl; + } else if (auto cmd = + checkParameter("cmd", "clear-cache-complete", accessTokenOk)) { + logCommand(cmd, "clear cache completely (including unpinned elements)"); _cache.clearAll(); response = createJsonResponse(composeCacheStatsJson(), request); - } else if (checkParam("cmd", "get-settings")) { - LOG(INFO) << "Getting server settings" << std::endl; + } else if (auto cmd = checkParameter("cmd", "get-settings")) { + logCommand(cmd, "get server settings"); response = createJsonResponse(RuntimeParameters().toMap(), request); } + // Ping with or without message. 
+ if (urlPathAndParameters._path == "/ping") { + if (auto msg = checkParameter("msg", std::nullopt)) { + LOG(INFO) << "Alive check with message \"" << msg.value() << "\"" + << std::endl; + } else { + LOG(INFO) << "Alive check without message" << std::endl; + } + response = createOkResponse("This QLever server is up and running\n", + request, ad_utility::MediaType::textPlain); + } + // Set description of KB index. if (auto description = - checkParam("index-description", std::nullopt, accessTokenOk)) { + checkParameter("index-description", std::nullopt, accessTokenOk)) { LOG(INFO) << "Setting index description to: \"" << description.value() << "\"" << std::endl; _index.setKbName(description.value()); @@ -162,7 +275,7 @@ Awaitable Server::process( // Set description of text index. if (auto description = - checkParam("text-description", std::nullopt, accessTokenOk)) { + checkParameter("text-description", std::nullopt, accessTokenOk)) { LOG(INFO) << "Setting text description to: \"" << description.value() << "\"" << std::endl; _index.setTextName(description.value()); @@ -171,7 +284,7 @@ Awaitable Server::process( // Set one or several of the runtime parameters. for (auto key : RuntimeParameters().getKeys()) { - if (auto value = checkParam(key, std::nullopt, accessTokenOk)) { + if (auto value = checkParameter(key, std::nullopt, accessTokenOk)) { LOG(INFO) << "Setting runtime parameter \"" << key << "\"" << " to value \"" << value.value() << "\"" << std::endl; RuntimeParameters().set(key, value.value()); @@ -180,26 +293,32 @@ Awaitable Server::process( } // If "query" parameter is given, process query. 
- if (auto query = checkParam("query", std::nullopt)) { - if (!query.value().empty()) { - co_return co_await processQuery(params, requestTimer, std::move(request), - sendWithCors); - } else { - response = createBadRequestResponse( - "Parameter \"query\" must not have an empty value", request); + if (auto query = checkParameter("query", std::nullopt)) { + if (query.value().empty()) { + throw std::runtime_error( + "Parameter \"query\" must not have an empty value"); } + co_return co_await processQuery(parameters, requestTimer, + std::move(request), send); } - // If there was no non-empty "query", but any of the above produced a - // `response`, send that now. + // If there was no "query", but any of the URL parameters processed before + // produced a `response`, send that now. Note that if multiple URL parameters + // were processed, only the `response` from the last one is sent. if (response.has_value()) { - co_return co_await sendWithCors(std::move(response.value())); + co_return co_await send(std::move(response.value())); } - // At this point, none of the URL parameters was recognized by QLever. We - // then consider the URL parameter string as a path name and try to serve the - // corresponding file (though we only allow a very restricted whitelist, see - // below). + // At this point, if there is a "?" in the query string, it means that there + // are URL parameters which QLever does not know or did not process. + if (request.target().find("?") != std::string::npos) { + throw std::runtime_error( + "Request with URL parameters, but none of them could be processed"); + } + + // At this point, we only have a path and no URL parameters. We then interpret + // the request as a request for a file. However, only files from a very + // restricted whitelist (see below) will actually be served. // // NOTE 1: `makeFileServer` returns a function. The first argument is the // document root, the second one is the whitelist. 
@@ -213,6 +332,8 @@ Awaitable Server::process( // is not found, a corresponding response is returned to the requestion // client, but the log says nothing about it. The place to change this would // be in `src/util/HttpServer/HttpUtils.h`. + LOG(INFO) << "Treating request target \"" << request.target() << "\"" + << " as a request for a file with that name" << std::endl; auto serveFileRequest = makeFileServer( ".", ad_utility::HashSet{"index.html", "script.js", "style.css"})( @@ -227,6 +348,7 @@ Awaitable Server::composeResponseQleverJson( auto compute = [&, maxSend] { shared_ptr resultTable = qet.getResult(); requestTimer.stop(); + resultTable->logResultSize(); off_t compResultUsecs = requestTimer.usecs(); size_t resultSize = resultTable->size(); @@ -282,6 +404,7 @@ Awaitable Server::composeResponseSparqlJson( } auto compute = [&, maxSend] { shared_ptr resultTable = qet.getResult(); + resultTable->logResultSize(); requestTimer.stop(); nlohmann::json j; size_t limit = std::min(query._limitOffset._limit, maxSend); @@ -329,8 +452,9 @@ ad_utility::streams::stream_generator Server::composeTurtleResponse( } // _____________________________________________________________________________ -json Server::composeExceptionJson(const string& query, const std::exception& e, - ad_utility::Timer& requestTimer) { +json Server::composeErrorResponseJson(const string& query, + const std::string& errorMsg, + ad_utility::Timer& requestTimer) { requestTimer.stop(); json j; @@ -339,7 +463,7 @@ json Server::composeExceptionJson(const string& query, const std::exception& e, j["resultsize"] = 0; j["time"]["total"] = requestTimer.msecs(); j["time"]["computeResult"] = requestTimer.msecs(); - j["exception"] = e.what(); + j["exception"] = errorMsg; return j; } @@ -393,8 +517,11 @@ boost::asio::awaitable Server::processQuery( co_return co_await send(std::move(response)); }; - std::optional errorResponse; - + // Put the whole query processing in a try-catch block. 
If any exception + // occurs, log the error message and send a JSON response with all the details + // to the client. Note that the C++ standard forbids co_await in the catch + // block, hence the workaround with the optional `exceptionErrorMsg`. + std::optional exceptionErrorMsg; try { ad_utility::SharedConcurrentTimeoutTimer timeoutTimer = std::make_shared( @@ -413,36 +540,26 @@ boost::asio::awaitable Server::processQuery( : MAX_NOF_ROWS_IN_RESULT; const bool pinSubtrees = containsParam("pinsubtrees", "true"); const bool pinResult = containsParam("pinresult", "true"); - LOG(INFO) << "Query" << ((pinSubtrees) ? " (Cache pinned)" : "") - << ((pinResult) ? " (Result pinned)" : "") << ":\n" + LOG(INFO) << "Processing the following SPARQL query:" + << (pinResult ? " [pin result]" : "") + << (pinSubtrees ? " [pin subresults]" : "") << "\n" << query << std::endl; ParsedQuery pq = SparqlParser(query).parse(); pq.expandPrefixes(); - QueryExecutionContext qec(_index, _engine, &_cache, _allocator, - _sortPerformanceEstimator, pinSubtrees, - pinResult); - // start the shared timeout timer here to also include - // the query planning - timeoutTimer->wlock()->start(); - - QueryPlanner qp(&qec); - qp.setEnablePatternTrick(_enablePatternTrick); - QueryExecutionTree qet = qp.createExecutionTree(pq); - qet.isRoot() = true; // allow pinning of the final result - qet.recursivelySetTimeoutTimer(timeoutTimer); - LOG(TRACE) << qet.asString() << std::endl; - + // The following code block determines the media type to be used for the + // result. The media type is either determined by the "Accept:" header of + // the request or by the URL parameter "action=..." (for TSV and CSV export, + // for QLever-historical reasons). using ad_utility::MediaType; - // Determine the result media type. - // TODO qleverJson should not be the default as soon - // as the UI explicitly requests it. - // TODO Add sparqlJson as soon as it is supported. 
+ // The first media type in this list is the default, if no other type is + // specified in the request. It's "application/sparql-results+json", as + // required by the SPARQL standard. const auto supportedMediaTypes = []() { static const std::vector mediaTypes{ - ad_utility::MediaType::qleverJson, ad_utility::MediaType::sparqlJson, + ad_utility::MediaType::qleverJson, ad_utility::MediaType::tsv, ad_utility::MediaType::csv, ad_utility::MediaType::turtle, @@ -484,8 +601,35 @@ boost::asio::awaitable Server::processQuery( supportedMediaTypes()), request)); } - AD_CHECK(mediaType.has_value()); + LOG(INFO) << "Requested media type of result is \"" + << ad_utility::toString(mediaType.value()) << "\"" << std::endl; + + // Do the query planning. This creates a `QueryExecutionTree`, which will + // then be used to process the query. Start the shared `timeoutTimer` here + // to also include the query planning. + // + // NOTE: This should come after determining the media type. Otherwise it + // might happen that the query planner runs for a while (recall that it may + // do index scans) and then we get an error message afterwards that a + // certain media type is not supported. + timeoutTimer->wlock()->start(); + QueryExecutionContext qec(_index, _engine, &_cache, _allocator, + _sortPerformanceEstimator, pinSubtrees, + pinResult); + QueryPlanner qp(&qec); + qp.setEnablePatternTrick(_enablePatternTrick); + QueryExecutionTree qet = qp.createExecutionTree(pq); + qet.isRoot() = true; // allow pinning of the final result + qet.recursivelySetTimeoutTimer(timeoutTimer); + requestTimer.stop(); + LOG(INFO) << "Query planning done in " << requestTimer.msecs() << " ms" + << " (can include index scans)" << std::endl; + requestTimer.cont(); + LOG(TRACE) << qet.asString() << std::endl; + + // This actually processes the query and sends the result in the requested + // format. 
switch (mediaType.value()) { case ad_utility::MediaType::csv: { auto responseGenerator = co_await composeResponseSepValues< @@ -538,19 +682,28 @@ boost::asio::awaitable Server::processQuery( // Print the runtime info. This needs to be done after the query // was computed. - // TODO Also log the processing time and an identifier of the - // query. - LOG(INFO) << "Done processing query" << std::endl; - LOG(DEBUG) << "\nRuntime Info:\n" + // Log that we are done with the query and how long it took. + // + // NOTE: We need to explicitly stop the `requestTimer` here because in the + // sending code above, it is done only in some cases and not in others (in + // particular, not for TSV and CSV because for those, the result does not + // contain timing information). + // + // TODO Also log an identifier of the query. + requestTimer.stop(); + LOG(INFO) << "Done processing query and sending result" + << ", total time was " << requestTimer.msecs() << " ms" + << std::endl; + LOG(DEBUG) << "Runtime Info:\n" << qet.getRootOperation()->getRuntimeInfo().toString() << std::endl; - } catch (const ad_semsearch::Exception& e) { - errorResponse = composeExceptionJson(query, e, requestTimer); } catch (const std::exception& e) { - errorResponse = composeExceptionJson(query, e, requestTimer); + exceptionErrorMsg = e.what(); } - if (errorResponse.has_value()) { - co_return co_await sendJson(errorResponse.value(), - http::status::bad_request); + if (exceptionErrorMsg) { + LOG(ERROR) << exceptionErrorMsg.value() << std::endl; + auto errorResponseJson = composeErrorResponseJson( + query, exceptionErrorMsg.value(), requestTimer); + co_return co_await sendJson(errorResponseJson, http::status::bad_request); } } diff --git a/src/engine/Server.h b/src/engine/Server.h index 4445426eda..61cde9012e 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -30,9 +30,11 @@ using ad_utility::Socket; //! The HTTP Server used. 
class Server { public: - explicit Server(const int port, const int numThreads, size_t maxMemGB) + explicit Server(const int port, const int numThreads, size_t maxMemGB, + std::string accessToken) : _numThreads(numThreads), _port(port), + accessToken_(accessToken), _allocator{ad_utility::makeAllocationMemoryLeftThreadsafeObject( maxMemGB * (1ull << 30u)), [this](size_t numBytesToAllocate) { @@ -90,6 +92,7 @@ class Server { private: const int _numThreads; int _port; + std::string accessToken_; QueryResultCache _cache; ad_utility::AllocatorWithLimit _allocator; SortPerformanceEstimator _sortPerformanceEstimator; @@ -106,14 +109,19 @@ class Server { template using Awaitable = boost::asio::awaitable; + /// Parse the path and URL parameters from the given request. Supports both + /// GET and POST requests according to the SPARQL 1.1 standard. + ad_utility::UrlParser::UrlPathAndParameters getUrlPathAndParameters( + const ad_utility::httpUtils::HttpRequest auto& request); + /// Handle a single HTTP request. Check whether a file request or a query was /// sent, and dispatch to functions handling these cases. This function /// requires the constraints for the `HttpHandler` in `HttpServer.h`. /// \param req The HTTP request. /// \param send The action that sends a http:response. (see the /// `HttpServer.h` for documentation). - Awaitable process(const ad_utility::httpUtils::HttpRequest auto& req, - auto&& send); + Awaitable process( + const ad_utility::httpUtils::HttpRequest auto& request, auto&& send); /// Handle a http request that asks for the processing of a query. /// \param params The key-value-pairs sent in the HTTP GET request.
When this @@ -141,8 +149,9 @@ class Server { Awaitable composeResponseSepValues( const ParsedQuery& query, const QueryExecutionTree& qet) const; - static json composeExceptionJson(const string& query, const std::exception& e, - ad_utility::Timer& requestTimer); + static json composeErrorResponseJson(const string& query, + const std::string& errorMsg, + ad_utility::Timer& requestTimer); static ad_utility::streams::stream_generator composeTurtleResponse( const ParsedQuery& query, const QueryExecutionTree& qet); diff --git a/src/util/HttpServer/HttpServer.h b/src/util/HttpServer/HttpServer.h index 85c6d27c59..4eec41049a 100644 --- a/src/util/HttpServer/HttpServer.h +++ b/src/util/HttpServer/HttpServer.h @@ -21,20 +21,24 @@ using tcp = boost::asio::ip::tcp; // from /* * \brief A Simple HttpServer, based on Boost::Beast. Its can be configured via - * the mandatory HttpHandler parameter. \tparam HttpHandler A callable type that - * takes two parameters, a `http::request<...>` , and a `sendAction` and returns - * an awaitable type. sendAction always is a callable that takes a - * http::message, and returns an awaitable; The behavior is then as - * follows: as soon as the Server receives a http request, co_await - * _httpHandler(move(request), sendAction) is called. (_httpHandler is a member - * of type HttpHandler). The expected behavior of this call is that _httpHandler - * takes the request, computes the corresponding `response`, and calls co_await - * sendAction(response). The `sendAction` is needed because the `response` might - * different types (in beast, a http::message is templated on the body type). - * For this reason, this approach is more flexible, than having _httpHandler - * simply return the response. A very basic HttpHandler, which simply serves - * files from a directory, can be obtained via - * `ad_utility::httpUtils::makeFileServer()`. + * the mandatory HttpHandler parameter. 
+ * + * \tparam HttpHandler A callable type that takes two parameters, a + * `http::request<...>` , and a `sendAction` and returns an awaitable + * type. sendAction always is a callable that takes a http::message, and returns + * an awaitable. + * + * The behavior is then as follows: as soon as the server receives a HTTP + * request, co_await _httpHandler(move(request), sendAction) is called. + * (_httpHandler is a member of type HttpHandler). The expected behavior of this + * call is that _httpHandler takes the request, computes the corresponding + * `response`, and calls co_await sendAction(response). The `sendAction` is + * needed because the `response` can have different types (in beast, a + * http::message is templated on the body type). For this reason, this approach + * is more flexible, than having _httpHandler simply return the response. + * + * A very basic HttpHandler, which simply serves files from a directory, can be + * obtained via `ad_utility::httpUtils::makeFileServer()`. */ template class HttpServer { diff --git a/src/util/HttpServer/HttpUtils.h b/src/util/HttpServer/HttpUtils.h index 584d181f67..7f20f2c36d 100644 --- a/src/util/HttpServer/HttpUtils.h +++ b/src/util/HttpServer/HttpUtils.h @@ -134,11 +134,10 @@ static auto createJsonResponse(const json& j, const auto& request, /// Create a HttpResponse with status 404 Not Found. The string body will be a /// default message including the name of the file that was not found, which can /// be read from the request directly. -static auto createNotFoundResponse(const HttpRequest auto& request) { - return createHttpResponseFromString("Resource \"" + - std::string(request.target()) + - "\" was not found on this server", - http::status::not_found, request); +static auto createNotFoundResponse(const std::string& errorMsg, + const HttpRequest auto& request) { + return createHttpResponseFromString(errorMsg, http::status::not_found, + request); } /// Create a HttpResponse with status 400 Bad Request. 
@@ -206,27 +205,25 @@ boost::asio::awaitable makeFileServerImpl( // Make sure we can handle the method if (request.method() != http::verb::get && request.method() != http::verb::head) { - co_await send(createBadRequestResponse( - "Unknown HTTP-method, only GET and HEAD requests are supported", - request)); - co_return; + throw std::runtime_error( + "When serving files, only GET and HEAD requests are supported"); } // Decode the path and check that it is absolute and contains no "..". auto urlPath = ad_utility::UrlParser::getDecodedPathAndCheck(request.target()); if (!urlPath.has_value()) { - co_await send(createBadRequestResponse( - "Invalid url path \"" + std::string{request.target()} + '"', request)); - co_return; + throw std::runtime_error( + absl::StrCat("Invalid URL path \"", request.target(), "\"")); } // Check if the target is in the whitelist. The `target()` starts with a // slash, entries in the whitelist don't. + auto urlPathWithFirstCharRemoved = urlPath.value().substr(1); if (whitelist.has_value() && - !whitelist.value().contains(urlPath.value().substr(1))) { - co_await send(createNotFoundResponse(request)); - co_return; + !whitelist.value().contains(urlPathWithFirstCharRemoved)) { + throw std::runtime_error(absl::StrCat( + "Resource \"", urlPathWithFirstCharRemoved, "\" not in whitelist")); } // Build the path to the requested file on the file system. @@ -240,12 +237,15 @@ boost::asio::awaitable makeFileServerImpl( // Handle the case where the file doesn't exist. if (errorCode == beast::errc::no_such_file_or_directory) { - co_await send(createNotFoundResponse(request)); - co_return; + std::string errorMsg = + absl::StrCat("Resource \"", request.target(), "\" not found"); + LOG(ERROR) << errorMsg << std::endl; + co_return co_await send(createNotFoundResponse(errorMsg, request)); } // Handle an unknown error. 
if (errorCode) { + LOG(ERROR) << errorCode.message() << std::endl; co_return co_await send( createServerErrorResponse(errorCode.message(), request)); } diff --git a/src/util/HttpServer/UrlParser.cpp b/src/util/HttpServer/UrlParser.cpp index 598b1cbe82..b290046b8c 100644 --- a/src/util/HttpServer/UrlParser.cpp +++ b/src/util/HttpServer/UrlParser.cpp @@ -1,18 +1,23 @@ - - -// Copyright 2021, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2022, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Authors: Johannes Kalmbach +// Hannah Bast #include "UrlParser.h" #include "../Exception.h" + using namespace ad_utility; -using std::string; // _____________________________________________________________________________ -string UrlParser::applyPercentDecoding(std::string_view url) { - string decoded; +std::string UrlParser::applyPercentDecoding(std::string_view url, + bool urlDecode) { + // If not decoding wanted, just convert to `std::string`. + if (urlDecode == false) { + return std::string{url}; + } + // Otherwise resolve all %XX. + std::string decoded; for (size_t i = 0; i < url.size(); ++i) { if (url[i] == '+') { decoded += ' '; @@ -45,27 +50,42 @@ string UrlParser::applyPercentDecoding(std::string_view url) { } // ___________________________________________________________________________ -UrlParser::UrlTarget UrlParser::parseTarget(std::string_view target) { - static constexpr auto npos = std::string_view::npos; - UrlTarget result; +UrlParser::UrlPathAndParameters UrlParser::parseGetRequestTarget( + std::string_view target, bool urlDecode) { + UrlPathAndParameters result; - target = target.substr(0, target.find('#')); + // Remove everything after # (including it). Does nothing if there is no #. + // Don't do this is `urlDecode == false` because in that case, the given + // string contains an unencode SPARQL query, which frequently contains a # as + // a regular character. 
+ if (urlDecode == true) { + target = target.substr(0, target.find('#')); + } + + // Set `_path` and remove it from `target`. If there is no query string (part + // starting with "?"), we are done at this point. size_t index = target.find('?'); - result._target = target.substr(0, index); - if (index == npos) { + result._path = target.substr(0, index); + if (index == std::string::npos) { return result; } target.remove_prefix(index + 1); + + // Parse the query string and store the result in a hash map. Throw an error + // if the same key appears twice in the query string. Note that this excludes + // having two "cmd=..." parameters, although that would be meaningful (though + // not necessary) to support. while (true) { auto next = target.find('&'); - auto paramAndValue = parseSingleKeyValuePair(target.substr(0, next)); + auto paramAndValue = + parseSingleKeyValuePair(target.substr(0, next), urlDecode); auto [iterator, isNewElement] = result._parameters.insert(std::move(paramAndValue)); if (!isNewElement) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Duplicate HTTP parameter: " + iterator->first); } - if (next == npos) { + if (next == std::string::npos) { break; } target.remove_prefix(next + 1); @@ -75,14 +95,14 @@ UrlParser::UrlTarget UrlParser::parseTarget(std::string_view target) { // ____________________________________________________________________________ std::pair UrlParser::parseSingleKeyValuePair( - std::string_view input) { + std::string_view input, bool urlDecode) { size_t posOfEq = input.find('='); if (posOfEq == std::string_view::npos) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Parameter without \"=\" in HTTP Request. 
" + std::string{input}); } - std::string param{applyPercentDecoding(input.substr(0, posOfEq))}; - std::string value{applyPercentDecoding(input.substr(posOfEq + 1))}; + std::string param{applyPercentDecoding(input.substr(0, posOfEq), urlDecode)}; + std::string value{applyPercentDecoding(input.substr(posOfEq + 1), urlDecode)}; return {std::move(param), std::move(value)}; } @@ -90,7 +110,7 @@ std::pair UrlParser::parseSingleKeyValuePair( std::optional UrlParser::getDecodedPathAndCheck( std::string_view target) noexcept { try { - auto filename = parseTarget(target)._target; + auto filename = parseGetRequestTarget(target)._path; AD_CHECK(filename.starts_with('/')); AD_CHECK(filename.find("..") == string::npos); return filename; diff --git a/src/util/HttpServer/UrlParser.h b/src/util/HttpServer/UrlParser.h index a83636d100..e602d50759 100644 --- a/src/util/HttpServer/UrlParser.h +++ b/src/util/HttpServer/UrlParser.h @@ -1,6 +1,7 @@ -// Copyright 2021, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2022, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Authors: Johannes Kalmbach +// Hannah Bast #ifndef QLEVER_URLPARSER_H #define QLEVER_URLPARSER_H @@ -21,17 +22,26 @@ namespace ad_utility { */ class UrlParser { public: - struct UrlTarget { - std::string _target; + /// Representation of the "path" and "query" of a URL. For a GET request, the + /// "path" is the part before the "?" (or everything if there is no "?"), and + /// the "query" is the part after the "?" (empty if there is no "?"). The + /// key-value pairs of the "query" are stored in a hash map. + struct UrlPathAndParameters { + std::string _path; ad_utility::HashMap _parameters; }; - // ___________________________________________________________________________ - static std::string applyPercentDecoding(std::string_view url); + // URL-decode the given (part of a) URL. 
If the second argument is false, do + // nothing except converting the given `std::string_view` to `std::string`. + static std::string applyPercentDecoding(std::string_view url, + bool urlDecode = true); - /// Parse the `target` part of an HTTP GET Request, - /// for example, `/api.html?someKey=some+val%0Fue`. - static UrlTarget parseTarget(std::string_view target); + /// Parse the `target` part of an HTTP GET Request, for example, + /// `/api.html?someKey=some+val%0Fue`. The second argument specifies whether + /// the key-value pairs of the query string should be URL-decoded (default: + /// yes). + static UrlPathAndParameters parseGetRequestTarget(std::string_view target, + bool urlDecode = true); /// From the `target` part of an HTTP GET request, only extract the path, /// with percent decoding applied. E.g. `/target.html?key=value` will become @@ -43,8 +53,11 @@ class UrlParser { std::string_view target) noexcept; private: + // Helper function that parses a single key-value pair from a URL query + // string. The second argument specifies whether the key and value should be + // URL-decoded (default: yes). static std::pair parseSingleKeyValuePair( - std::string_view input); + std::string_view input, bool urlDecode = true); }; } // namespace ad_utility diff --git a/src/util/Timer.h b/src/util/Timer.h index 381be5d6eb..4ed65135e5 100644 --- a/src/util/Timer.h +++ b/src/util/Timer.h @@ -90,11 +90,11 @@ class Timer { inline void setMsecs(off_t msecs) { _usecs = msecs * (off_t)(1000); } inline void setSecs(off_t secs) { _usecs = secs * (off_t)(1000000); } - //! Time at last stop (initially zero) - off_t value() const { return _usecs; } /* in microseconds */ - off_t usecs() const { return _usecs; } /* in microseconds */ - off_t msecs() const { return _usecs / 1000; } /* in milliseconds */ - float secs() const { return _usecs / 1000000.0; } /* in seconds */ + //! Time at last stop (initially zero). 
+ off_t value() const { return _usecs; } + off_t usecs() const { return _usecs; } + off_t msecs() const { return (_usecs + 500) / 1000; } + float secs() const { return _usecs / 1000000.0; } // is the timer currently running bool isRunning() const { return _running; }