diff --git a/e2e/e2e.sh b/e2e/e2e.sh index 3c86c540b7..fc37f09dff 100755 --- a/e2e/e2e.sh +++ b/e2e/e2e.sh @@ -112,8 +112,8 @@ if [ ${REBUILD_THE_INDEX} == "YES" ] || ! [ -f "${INDEX}.vocabulary" ]; then popd fi -# Launch the Server using the freshly baked index. Can't simply use a subshell here because -# then we can't easily get the SERVER_PID out of that subshell +# Launch the Server using the freshly baked index. Can't simply use a subshell +# here because then we can't easily get the SERVER_PID out of that subshell pushd "$BINARY_DIR" echo "Launching server from path $(pwd)" ./ServerMain -i "$INDEX" -p 9099 -m 1 -t &> server_log.txt & diff --git a/src/ServerMain.cpp b/src/ServerMain.cpp index b1816a160d..a717e67c0b 100644 --- a/src/ServerMain.cpp +++ b/src/ServerMain.cpp @@ -39,7 +39,8 @@ int main(int argc, char** argv) { // filled / set depending on the options. using ad_utility::NonNegative; - string indexBasename; + std::string indexBasename; + std::string accessToken; bool text = false; int port; NonNegative numSimultaneousQueries = 1; @@ -62,6 +63,8 @@ int main(int argc, char** argv) { "The basename of the index files (required)."); add("port,p", po::value<int>(&port)->required(), "The port on which HTTP requests are served (required)."); + add("access-token,a", po::value<std::string>(&accessToken)->default_value(""), + "Access token for restricted API calls (default: no access)."); add("num-simultaneous-queries,j", po::value<NonNegative>(&numSimultaneousQueries)->default_value(1), "The number of queries that can be processed simultaneously."); @@ -123,7 +126,7 @@ int main(int argc, char** argv) { try { Server server(port, static_cast<int>(numSimultaneousQueries), - memoryMaxSizeGb); + memoryMaxSizeGb, std::move(accessToken)); server.run(indexBasename, text, !noPatterns, !noPatternTrick, !onlyPsoAndPosPermutations); } catch (const std::exception& e) { diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index 84ae835d98..dca61f0e90 100644 --- 
a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -56,9 +56,8 @@ void Operation::recursivelySetTimeoutTimer( } } -// Get the result for the subtree rooted at this element. -// Use existing results if they are already available, otherwise -// trigger computation. +// Get the result for the subtree rooted at this element. Use existing results +// if they are already available, otherwise trigger computation. shared_ptr<const ResultTable> Operation::getResult(bool isRoot) { ad_utility::Timer timer; timer.start(); @@ -114,30 +113,34 @@ shared_ptr<const ResultTable> Operation::getResult(bool isRoot) { timer.stop(); createRuntimeInformation(result, timer.msecs()); + auto resultNumRows = result._resultPointer->_resultTable->size(); + auto resultNumCols = result._resultPointer->_resultTable->width(); + LOG(DEBUG) << "Computed result of size " << resultNumRows << " x " + << resultNumCols << std::endl; return result._resultPointer->_resultTable; } catch (const ad_semsearch::AbortException& e) { // A child Operation was aborted, do not print the information again. throw; } catch (const ad_utility::WaitedForResultWhichThenFailedException& e) { + // Here and in the following, show the detailed information (it's the + // runtime info) only in the DEBUG log. Note that the exception will be + // caught by the `processQuery` method, where the error message will be + // printed *and* included in an error response sent to the client. 
LOG(ERROR) << "Waited for a result from another thread which then failed" << endl; - LOG(ERROR) << asString(); + LOG(DEBUG) << asString(); throw ad_semsearch::AbortException(e); } catch (const std::exception& e) { // We are in the innermost level of the exception, so print - LOG(ERROR) << "Aborted Operation:" << endl; - LOG(ERROR) << asString() << endl; - LOG(ERROR) << e.what() << endl; + LOG(ERROR) << "Aborted Operation" << endl; + LOG(DEBUG) << asString() << endl; // Rethrow as QUERY_ABORTED allowing us to print the Operation // only at innermost failure of a recursive call throw ad_semsearch::AbortException(e); } catch (...) { // We are in the innermost level of the exception, so print - LOG(ERROR) << "Aborted Operation:" << endl; - LOG(ERROR) << asString() << endl; - LOG(ERROR) - << "Unexpected exception that is not a subclass of std::exception" - << endl; + LOG(ERROR) << "Aborted Operation" << endl; + LOG(DEBUG) << asString() << endl; // Rethrow as QUERY_ABORTED allowing us to print the Operation // only at innermost failure of a recursive call throw ad_semsearch::AbortException( diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index 44b127a122..cc083077cd 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -401,7 +401,9 @@ ad_utility::streams::stream_generator QueryExecutionTree::generateResults( // This call triggers the possibly expensive computation of the query result // unless the result is already cached. shared_ptr<const ResultTable> resultTable = getResult(); - LOG(DEBUG) << "Resolving strings for finished binary result...\n"; + resultTable->logResultSize(); + LOG(DEBUG) << "Converting result IDs to their corresponding strings ..." 
+ << std::endl; auto selectedColumnIndices = selectedVariablesToColumnIndices( selectedVarsOrAsterisk, *resultTable, true); @@ -518,8 +520,10 @@ QueryExecutionTree::generateRdfGraph( // _____________________________________________________________________________ ad_utility::streams::stream_generator QueryExecutionTree::writeRdfGraphTurtle( const ad_utility::sparql_types::Triples& constructTriples, size_t limit, - size_t offset, std::shared_ptr<const ResultTable> res) const { - auto generator = generateRdfGraph(constructTriples, limit, offset, res); + size_t offset, std::shared_ptr<const ResultTable> resultTable) const { + resultTable->logResultSize(); + auto generator = + generateRdfGraph(constructTriples, limit, offset, resultTable); for (const auto& triple : generator) { co_yield triple._subject; co_yield ' '; @@ -535,7 +539,7 @@ template <QueryExecutionTree::ExportSubFormat format> ad_utility::streams::stream_generator QueryExecutionTree::writeRdfGraphSeparatedValues( const ad_utility::sparql_types::Triples& constructTriples, size_t limit, - size_t offset, std::shared_ptr<const ResultTable> res) const { + size_t offset, std::shared_ptr<const ResultTable> resultTable) const { static_assert(format == ExportSubFormat::BINARY || format == ExportSubFormat::CSV || format == ExportSubFormat::TSV); @@ -543,11 +547,13 @@ QueryExecutionTree::writeRdfGraphSeparatedValues( throw std::runtime_error{ "Binary export is not supported for CONSTRUCT queries"}; } + resultTable->logResultSize(); constexpr auto& escapeFunction = format == ExportSubFormat::TSV ? RdfEscaping::escapeForTsv : RdfEscaping::escapeForCsv; constexpr char sep = format == ExportSubFormat::TSV ? 
'\t' : ','; - auto generator = generateRdfGraph(constructTriples, limit, offset, res); + auto generator = + generateRdfGraph(constructTriples, limit, offset, resultTable); for (auto& triple : generator) { co_yield escapeFunction(std::move(triple._subject)); co_yield sep; diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 722624f229..a86eb06e14 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -23,9 +23,8 @@ using std::shared_ptr; using std::string; -// A query execution tree. -// Processed bottom up, this gives an ordering to the operations -// needed to solve a query. +// A query execution tree. Processed bottom up, which gives an ordering to the +// operations needed to solve a query. class QueryExecutionTree { public: explicit QueryExecutionTree(QueryExecutionContext* const qec); diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h index 6971ddf7c2..0e4c29026e 100644 --- a/src/engine/ResultTable.h +++ b/src/engine/ResultTable.h @@ -74,6 +74,15 @@ class ResultTable { size_t size() const; size_t width() const { return _idTable.cols(); } + // Log to INFO the size of this result. + // + // NOTE: Due to the current sub-optimal design of `Server::processQuery`, we + // need the same message in multiple places and so instead of duplicating the + // message, we should have a method for it. + void logResultSize() const { + LOG(INFO) << "Result has size " << size() << " x " << width() << std::endl; + } + void clear(); string asDebugString() const; @@ -86,4 +95,4 @@ class ResultTable { } private: -}; \ No newline at end of file +}; diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index ae88bcf447..abd773f361 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -36,19 +36,47 @@ void Server::initialize(const string& indexBaseName, bool useText, // Set flag. _initialized = true; - LOG(INFO) << "The server is ready, listening for requests on port " << _port - << " ..." 
<< std::endl; + LOG(INFO) << "Access token for restricted API calls is \"" << accessToken_ + << "\"" << std::endl; + LOG(INFO) << "The server is ready, listening for requests on port " + << std::to_string(_port) << " ..." << std::endl; } // _____________________________________________________________________________ void Server::run(const string& indexBaseName, bool useText, bool usePatterns, bool usePatternTrick, bool loadAllPermutations) { - // First set up the HTTP server, so that it binds to the socket, and - // the "socket already in use" error appears quickly. + using namespace ad_utility::httpUtils; + + // Function that handles a request asynchronously, will be passed as argument + // to `HttpServer` below. auto httpSessionHandler = [this](auto request, auto&& send) -> boost::asio::awaitable<void> { - co_await process(std::move(request), send); + // Version of send with maximally permissive CORS header (that allows the + // client that receives the response to do with it what it wants). + auto sendWithCors = [&send](auto response) -> boost::asio::awaitable<void> { + response.set(http::field::access_control_allow_origin, "*"); + co_return co_await send(std::move(response)); + }; + // Process the request using the `process` method and if it throws an + // exception, log the error message and send a HTTP/1.1 400 Bad Request + // response with that message. Note that the C++ standard forbids co_await + // in the catch block, hence the workaround with the `exceptionErrorMsg`. 
+ std::optional<std::string> exceptionErrorMsg; + try { + co_await process(std::move(request), sendWithCors); + } catch (const std::exception& e) { + exceptionErrorMsg = e.what(); + } + if (exceptionErrorMsg) { + LOG(ERROR) << exceptionErrorMsg.value() << std::endl; + auto badRequestResponse = createBadRequestResponse( + absl::StrCat(exceptionErrorMsg.value(), "\n"), request); + co_await sendWithCors(std::move(badRequestResponse)); + } }; + + // First set up the HTTP server, so that it binds to the socket, and + // the "socket already in use" error appears quickly. auto httpServer = HttpServer{static_cast<unsigned short>(_port), "0.0.0.0", _numThreads, std::move(httpSessionHandler)}; @@ -60,65 +88,134 @@ void Server::run(const string& indexBaseName, bool useText, bool usePatterns, httpServer.run(); } +// _____________________________________________________________________________ +ad_utility::UrlParser::UrlPathAndParameters Server::getUrlPathAndParameters( + const ad_utility::httpUtils::HttpRequest auto& request) { + if (request.method() == http::verb::get) { + // For a GET request, `request.target()` yields the part after the domain, + // which is a concatenation of the path and the query string (the query + // string starting with "?"). + return ad_utility::UrlParser::parseGetRequestTarget(request.target()); + } + if (request.method() == http::verb::post) { + // For a POST request, the content type *must* be either + // "application/x-www-form-urlencoded" or "application/sparql-query". In + // the first case, the body of the POST request contains a URL-encoded + // query (just like in the part of a GET request after the "?"). In the + // second case, the body of the POST request contains *only* the SPARQL + // query, but not URL-encoded, and no other URL parameters. 
See Sections + // 2.1.2 and 2.1.3 of the SPARQL 1.1 standard: + // https://www.w3.org/TR/2013/REC-sparql11-protocol-20130321 + std::string_view contentType = request.base()[http::field::content_type]; + LOG(DEBUG) << "Content-type: \"" << contentType << "\"" << std::endl; + static constexpr std::string_view contentTypeUrlEncoded = + "application/x-www-form-urlencoded"; + static constexpr std::string_view contentTypeSparqlQuery = + "application/sparql-query"; + + // In either of the two cases explained above, we convert the data to a + // format as if it came from a GET request. The second argument to + // `parseGetRequestTarget` says whether the function should apply URL + // decoding. + if (contentType == contentTypeUrlEncoded) { + return ad_utility::UrlParser::parseGetRequestTarget( + absl::StrCat(request.target(), "?", request.body()), true); + } + if (contentType == contentTypeSparqlQuery) { + return ad_utility::UrlParser::parseGetRequestTarget( + absl::StrCat(request.target(), "?query=", request.body()), false); + } + throw std::runtime_error( + absl::StrCat("POST request with content type \"", contentType, + "\" not supported (must be \"", contentTypeUrlEncoded, + "\" or \"", contentTypeSparqlQuery, "\")")); + } + std::ostringstream requestMethodName; + requestMethodName << request.method(); + throw std::runtime_error( + absl::StrCat("Request method \"", requestMethodName.str(), + "\" not supported (has to be GET or POST)")); +}; + // _____________________________________________________________________________ Awaitable<void> Server::process( const ad_utility::httpUtils::HttpRequest auto& request, auto&& send) { using namespace ad_utility::httpUtils; + + // Log some basic information about the request. Start with an empty line so + // that in a low-traffic scenario (or when the query processing is very fast), + // we have one visual block per request in the log. 
+ std::string_view contentType = request.base()[http::field::content_type]; + LOG(INFO) << std::endl; + LOG(INFO) << "Request received via " << request.method() + << (contentType.empty() + ? absl::StrCat(", no content type specified") + : absl::StrCat(", content type \"", contentType, "\"")) + << std::endl; + + // Start timing. ad_utility::Timer requestTimer; requestTimer.start(); - auto filenameAndParams = ad_utility::UrlParser::parseTarget(request.target()); - const auto& params = filenameAndParams._parameters; + // Parse the path and the URL parameters from the given request. Works for GET + // requests as well as the two kinds of POST requests allowed by the SPARQL + // standard, see method `getUrlPathAndParameters`. + const auto urlPathAndParameters = getUrlPathAndParameters(request); + const auto& parameters = urlPathAndParameters._parameters; - // Lambda for sending a response asynchronously. Will be called with `co_await - // sendWithCors(...)`, see below. - auto sendWithCors = [&send](auto response) -> boost::asio::awaitable<void> { - response.set(http::field::access_control_allow_origin, "*"); - co_return co_await send(std::move(response)); - }; - - // Lambda that checks if a URL parameter with the given specification exists - // and we are allowed to perform the associated action. - // - // 1. If value is `std::nullopt`, check if the given key exists, and if yes, - // return the corresponding value, and `std::nullopt` otherwise. - // - // 2. If value is not `std::nullopt`, check if the given key-value pair - // exists, and if yes, return the value (not really needed, but just so that - // the interface is the same as in the first case), and `std::nullopt` - // otherwise. + // Lambda for checking if a URL parameter exists in the request and if we are + // allowed to access it. If yes, return the value, otherwise return + // `std::nullopt`. // - // 3. 
If one of the two above would have returned a value, but the third - // argument is false, print a log message that the access to this URL - // parameter is denied and return `std::nullopt`. - auto checkParam = - [&params](const std::string key, std::optional<std::string> value, - bool accessAllowed = true) -> std::optional<std::string> { - // If key not found, always return std::nullopt. - if (!params.contains(key)) { + // If `value` is `std::nullopt`, only check if the key exists. We need this + // because we have parameters like "cmd=stats", where a fixed combination of + // the key and value determines the kind of action, as well as parameters + // like "index-description=...", where the key determines the kind of action. + auto checkParameter = + [&parameters](const std::string key, std::optional<std::string> value, + bool accessAllowed = true) -> std::optional<std::string> { + // If the key is not found, always return std::nullopt. + if (!parameters.contains(key)) { return std::nullopt; } // If value is given, but not equal to param value, return std::nullopt. If // no value is given, set it to param value. if (value == std::nullopt) { - value = params.at(key); - } else if (value != params.at(key)) { + value = parameters.at(key); + } else if (value != parameters.at(key)) { return std::nullopt; } - // At this point, we have a value. Either return it or say access denied. + // Now that we have the value, check if there is a problem with the access. + // If yes, we abort the query processing at this point.
if (accessAllowed == false) { - LOG(INFO) << "Access to \"" << key << "=" << value.value() - << "\" is denied, this URL parameter is ignored" << std::endl; - return std::nullopt; - } else { - return value; + throw std::runtime_error(absl::StrCat("Access to \"", key, "=", + value.value(), "\" denied", + " (requires a valid access token)", + ", processing of request aborted")); } + return value; }; - auto accessToken = checkParam("access-token", std::nullopt); - auto accessTokenOk = accessToken == "1622"; - if (accessToken && !accessTokenOk) { - LOG(INFO) << "Access token \"access-token=" << accessToken.value() << "\"" - << " provided, but not correct" << std::endl; + + // Check the access token. If an access token is provided and the check fails, + // throw an exception and do not process any part of the query (even if the + // processing would have been allowed without access token). + auto accessToken = checkParameter("access-token", std::nullopt); + bool accessTokenOk = false; + if (accessToken) { + auto accessTokenProvidedMsg = absl::StrCat( + "Access token \"access-token=", accessToken.value(), "\" provided"); + auto requestIgnoredMsg = ", request is ignored"; + if (accessToken_.empty()) { + throw std::runtime_error(absl::StrCat( + accessTokenProvidedMsg, + " but server was started without --access-token", requestIgnoredMsg)); + } else if (accessToken != accessToken_) { + throw std::runtime_error(absl::StrCat( + accessTokenProvidedMsg, " but not correct", requestIgnoredMsg)); + } else { + LOG(DEBUG) << accessTokenProvidedMsg << " and correct" << std::endl; + accessTokenOk = true; + } } // Process all URL parameters known to QLever. If there is more than one, @@ -131,29 +228,45 @@ Awaitable<void> Server::process( std::optional<http::response<http::string_body>> response; // Execute commands (URL parameter with key "cmd"). 
- if (checkParam("cmd", "stats")) { - LOG(INFO) << "Processing command \"stats\"" << std::endl; + auto logCommand = [](std::optional<std::string>& cmd, std::string actionMsg) { + LOG(INFO) << "Processing command \"" << cmd.value() << "\"" + << ": " << actionMsg << std::endl; + }; + if (auto cmd = checkParameter("cmd", "stats")) { + logCommand(cmd, "get index statistics"); response = createJsonResponse(composeStatsJson(), request); - } else if (checkParam("cmd", "cache-stats")) { - LOG(INFO) << "Processing command \"cache-stats\"" << std::endl; + } else if (auto cmd = checkParameter("cmd", "cache-stats")) { + logCommand(cmd, "get cache statistics"); response = createJsonResponse(composeCacheStatsJson(), request); - } else if (checkParam("cmd", "clear-cache")) { - LOG(INFO) << "Clearing the cache, unpinned elements only" << std::endl; + } else if (auto cmd = checkParameter("cmd", "clear-cache")) { + logCommand(cmd, "clear the cache (unpinned elements only)"); _cache.clearUnpinnedOnly(); response = createJsonResponse(composeCacheStatsJson(), request); - } else if (checkParam("cmd", "clear-cache-complete", accessTokenOk)) { - LOG(INFO) << "Clearing the cache completely, including unpinned elements" - << std::endl; + } else if (auto cmd = + checkParameter("cmd", "clear-cache-complete", accessTokenOk)) { + logCommand(cmd, "clear cache completely (including unpinned elements)"); _cache.clearAll(); response = createJsonResponse(composeCacheStatsJson(), request); - } else if (checkParam("cmd", "get-settings")) { - LOG(INFO) << "Getting server settings" << std::endl; + } else if (auto cmd = checkParameter("cmd", "get-settings")) { + logCommand(cmd, "get server settings"); response = createJsonResponse(RuntimeParameters().toMap(), request); } + // Ping with or without message.
+ if (urlPathAndParameters._path == "/ping") { + if (auto msg = checkParameter("msg", std::nullopt)) { + LOG(INFO) << "Alive check with message \"" << msg.value() << "\"" + << std::endl; + } else { + LOG(INFO) << "Alive check without message" << std::endl; + } + response = createOkResponse("This QLever server is up and running\n", + request, ad_utility::MediaType::textPlain); + } + // Set description of KB index. if (auto description = - checkParam("index-description", std::nullopt, accessTokenOk)) { + checkParameter("index-description", std::nullopt, accessTokenOk)) { LOG(INFO) << "Setting index description to: \"" << description.value() << "\"" << std::endl; _index.setKbName(description.value()); @@ -162,7 +275,7 @@ Awaitable<void> Server::process( // Set description of text index. if (auto description = - checkParam("text-description", std::nullopt, accessTokenOk)) { + checkParameter("text-description", std::nullopt, accessTokenOk)) { LOG(INFO) << "Setting text description to: \"" << description.value() << "\"" << std::endl; _index.setTextName(description.value()); @@ -171,7 +284,7 @@ Awaitable<void> Server::process( // Set one or several of the runtime parameters. for (auto key : RuntimeParameters().getKeys()) { - if (auto value = checkParam(key, std::nullopt, accessTokenOk)) { + if (auto value = checkParameter(key, std::nullopt, accessTokenOk)) { LOG(INFO) << "Setting runtime parameter \"" << key << "\"" << " to value \"" << value.value() << "\"" << std::endl; RuntimeParameters().set(key, value.value()); @@ -180,26 +293,32 @@ Awaitable<void> Server::process( } // If "query" parameter is given, process query. 
- if (auto query = checkParam("query", std::nullopt)) { - if (!query.value().empty()) { - co_return co_await processQuery(params, requestTimer, std::move(request), - sendWithCors); - } else { - response = createBadRequestResponse( - "Parameter \"query\" must not have an empty value", request); + if (auto query = checkParameter("query", std::nullopt)) { + if (query.value().empty()) { + throw std::runtime_error( + "Parameter \"query\" must not have an empty value"); } + co_return co_await processQuery(parameters, requestTimer, + std::move(request), send); } - // If there was no non-empty "query", but any of the above produced a - // `response`, send that now. + // If there was no "query", but any of the URL parameters processed before + // produced a `response`, send that now. Note that if multiple URL parameters + // were processed, only the `response` from the last one is sent. if (response.has_value()) { - co_return co_await sendWithCors(std::move(response.value())); + co_return co_await send(std::move(response.value())); } - // At this point, none of the URL parameters was recognized by QLever. We - // then consider the URL parameter string as a path name and try to serve the - // corresponding file (though we only allow a very restricted whitelist, see - // below). + // At this point, if there is a "?" in the query string, it means that there + // are URL parameters which QLever does not know or did not process. + if (request.target().find("?") != std::string::npos) { + throw std::runtime_error( + "Request with URL parameters, but none of them could be processed"); + } + + // At this point, we only have a path and no URL parameters. We then interpret + // the request as a request for a file. However, only files from a very + // restricted whitelist (see below) will actually be served. // // NOTE 1: `makeFileServer` returns a function. The first argument is the // document root, the second one is the whitelist.
@@ -213,6 +332,8 @@ Awaitable<void> Server::process( // is not found, a corresponding response is returned to the requestion // client, but the log says nothing about it. The place to change this would // be in `src/util/HttpServer/HttpUtils.h`. + LOG(INFO) << "Treating request target \"" << request.target() << "\"" + << " as a request for a file with that name" << std::endl; auto serveFileRequest = makeFileServer( ".", ad_utility::HashSet<std::string>{"index.html", "script.js", "style.css"})( @@ -227,6 +348,7 @@ Awaitable<json> Server::composeResponseQleverJson( auto compute = [&, maxSend] { shared_ptr<const ResultTable> resultTable = qet.getResult(); requestTimer.stop(); + resultTable->logResultSize(); off_t compResultUsecs = requestTimer.usecs(); size_t resultSize = resultTable->size(); @@ -282,6 +404,7 @@ Awaitable<json> Server::composeResponseSparqlJson( } auto compute = [&, maxSend] { shared_ptr<const ResultTable> resultTable = qet.getResult(); + resultTable->logResultSize(); requestTimer.stop(); nlohmann::json j; size_t limit = std::min(query._limitOffset._limit, maxSend); @@ -329,8 +452,9 @@ ad_utility::streams::stream_generator Server::composeTurtleResponse( } // _____________________________________________________________________________ -json Server::composeExceptionJson(const string& query, const std::exception& e, - ad_utility::Timer& requestTimer) { +json Server::composeErrorResponseJson(const string& query, + const std::string& errorMsg, + ad_utility::Timer& requestTimer) { requestTimer.stop(); json j; @@ -339,7 +463,7 @@ json Server::composeExceptionJson(const string& query, const std::exception& e, j["resultsize"] = 0; j["time"]["total"] = requestTimer.msecs(); j["time"]["computeResult"] = requestTimer.msecs(); - j["exception"] = e.what(); + j["exception"] = errorMsg; return j; } @@ -393,8 +517,11 @@ boost::asio::awaitable<void> Server::processQuery( co_return co_await send(std::move(response)); }; - std::optional<json> errorResponse; - + // Put 
the whole query processing in a try-catch block. If any exception + // occurs, log the error message and send a JSON response with all the details + // to the client. Note that the C++ standard forbids co_await in the catch + // block, hence the workaround with the optional `exceptionErrorMsg`. + std::optional<std::string> exceptionErrorMsg; try { ad_utility::SharedConcurrentTimeoutTimer timeoutTimer = std::make_shared<ad_utility::ConcurrentTimeoutTimer>( @@ -413,36 +540,26 @@ boost::asio::awaitable<void> Server::processQuery( : MAX_NOF_ROWS_IN_RESULT; const bool pinSubtrees = containsParam("pinsubtrees", "true"); const bool pinResult = containsParam("pinresult", "true"); - LOG(INFO) << "Query" << ((pinSubtrees) ? " (Cache pinned)" : "") - << ((pinResult) ? " (Result pinned)" : "") << ":\n" + LOG(INFO) << "Processing the following SPARQL query:" + << (pinResult ? " [pin result]" : "") + << (pinSubtrees ? " [pin subresults]" : "") << "\n" << query << std::endl; ParsedQuery pq = SparqlParser(query).parse(); pq.expandPrefixes(); - QueryExecutionContext qec(_index, _engine, &_cache, _allocator, - _sortPerformanceEstimator, pinSubtrees, - pinResult); - // start the shared timeout timer here to also include - // the query planning - timeoutTimer->wlock()->start(); - - QueryPlanner qp(&qec); - qp.setEnablePatternTrick(_enablePatternTrick); - QueryExecutionTree qet = qp.createExecutionTree(pq); - qet.isRoot() = true; // allow pinning of the final result - qet.recursivelySetTimeoutTimer(timeoutTimer); - LOG(TRACE) << qet.asString() << std::endl; - + // The following code block determines the media type to be used for the + // result. The media type is either determined by the "Accept:" header of + // the request or by the URL parameter "action=..." (for TSV and CSV export, + // for QLever-historical reasons). using ad_utility::MediaType; - // Determine the result media type. 
- // TODO<joka921> qleverJson should not be the default as soon - // as the UI explicitly requests it. - // TODO<joka921> Add sparqlJson as soon as it is supported. + // The first media type in this list is the default, if no other type is + // specified in the request. It's "application/sparql-results+json", as + // required by the SPARQL standard. const auto supportedMediaTypes = []() { static const std::vector<MediaType> mediaTypes{ - ad_utility::MediaType::qleverJson, ad_utility::MediaType::sparqlJson, + ad_utility::MediaType::qleverJson, ad_utility::MediaType::tsv, ad_utility::MediaType::csv, ad_utility::MediaType::turtle, @@ -484,8 +601,35 @@ boost::asio::awaitable<void> Server::processQuery( supportedMediaTypes()), request)); } - AD_CHECK(mediaType.has_value()); + LOG(INFO) << "Requested media type of result is \"" + << ad_utility::toString(mediaType.value()) << "\"" << std::endl; + + // Do the query planning. This creates a `QueryExecutionTree`, which will + // then be used to process the query. Start the shared `timeoutTimer` here + // to also include the query planning. + // + // NOTE: This should come after determining the media type. Otherwise it + // might happen that the query planner runs for a while (recall that it may + // do index scans) and then we get an error message afterwards that a + // certain media type is not supported.
+ timeoutTimer->wlock()->start(); + QueryExecutionContext qec(_index, _engine, &_cache, _allocator, + _sortPerformanceEstimator, pinSubtrees, + pinResult); + QueryPlanner qp(&qec); + qp.setEnablePatternTrick(_enablePatternTrick); + QueryExecutionTree qet = qp.createExecutionTree(pq); + qet.isRoot() = true; // allow pinning of the final result + qet.recursivelySetTimeoutTimer(timeoutTimer); + requestTimer.stop(); + LOG(INFO) << "Query planning done in " << requestTimer.msecs() << " ms" + << " (can include index scans)" << std::endl; + requestTimer.cont(); + LOG(TRACE) << qet.asString() << std::endl; + + // This actually processes the query and sends the result in the requested + // format. switch (mediaType.value()) { case ad_utility::MediaType::csv: { auto responseGenerator = co_await composeResponseSepValues< @@ -538,19 +682,28 @@ boost::asio::awaitable<void> Server::processQuery( // Print the runtime info. This needs to be done after the query // was computed. - // TODO<joka921> Also log the processing time and an identifier of the - // query. - LOG(INFO) << "Done processing query" << std::endl; - LOG(DEBUG) << "\nRuntime Info:\n" + // Log that we are done with the query and how long it took. + // + // NOTE: We need to explicitly stop the `requestTimer` here because in the + // sending code above, it is done only in some cases and not in others (in + // particular, not for TSV and CSV because for those, the result does not + // contain timing information). + // + // TODO<joka921> Also log an identifier of the query. 
+ requestTimer.stop(); + LOG(INFO) << "Done processing query and sending result" + << ", total time was " << requestTimer.msecs() << " ms" + << std::endl; + LOG(DEBUG) << "Runtime Info:\n" << qet.getRootOperation()->getRuntimeInfo().toString() << std::endl; - } catch (const ad_semsearch::Exception& e) { - errorResponse = composeExceptionJson(query, e, requestTimer); } catch (const std::exception& e) { - errorResponse = composeExceptionJson(query, e, requestTimer); + exceptionErrorMsg = e.what(); } - if (errorResponse.has_value()) { - co_return co_await sendJson(errorResponse.value(), - http::status::bad_request); + if (exceptionErrorMsg) { + LOG(ERROR) << exceptionErrorMsg.value() << std::endl; + auto errorResponseJson = composeErrorResponseJson( + query, exceptionErrorMsg.value(), requestTimer); + co_return co_await sendJson(errorResponseJson, http::status::bad_request); } } diff --git a/src/engine/Server.h b/src/engine/Server.h index 4445426eda..61cde9012e 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -30,9 +30,11 @@ using ad_utility::Socket; //! The HTTP Server used. class Server { public: - explicit Server(const int port, const int numThreads, size_t maxMemGB) + explicit Server(const int port, const int numThreads, size_t maxMemGB, + std::string accessToken) : _numThreads(numThreads), _port(port), + accessToken_(accessToken), _allocator{ad_utility::makeAllocationMemoryLeftThreadsafeObject( maxMemGB * (1ull << 30u)), [this](size_t numBytesToAllocate) { @@ -90,6 +92,7 @@ class Server { private: const int _numThreads; int _port; + std::string accessToken_; QueryResultCache _cache; ad_utility::AllocatorWithLimit<Id> _allocator; SortPerformanceEstimator _sortPerformanceEstimator; @@ -106,14 +109,19 @@ class Server { template <typename T> using Awaitable = boost::asio::awaitable<T>; + /// Parse the path and URL parameters from the given request. Supports both + /// GET and POST request according to the SPARQL 1.1 standard. 
+ ad_utility::UrlParser::UrlPathAndParameters getUrlPathAndParameters( + const ad_utility::httpUtils::HttpRequest auto& request); + /// Handle a single HTTP request. Check whether a file request or a query was /// sent, and dispatch to functions handling these cases. This function /// requires the constraints for the `HttpHandler` in `HttpServer.h`. /// \param req The HTTP request. /// \param send The action that sends a http:response. (see the /// `HttpServer.h` for documentation). - Awaitable<void> process(const ad_utility::httpUtils::HttpRequest auto& req, - auto&& send); + Awaitable<void> process( + const ad_utility::httpUtils::HttpRequest auto& request, auto&& send); /// Handle a http request that asks for the processing of a query. /// \param params The key-value-pairs sent in the HTTP GET request. When this @@ -141,8 +149,9 @@ class Server { Awaitable<ad_utility::streams::stream_generator> composeResponseSepValues( const ParsedQuery& query, const QueryExecutionTree& qet) const; - static json composeExceptionJson(const string& query, const std::exception& e, - ad_utility::Timer& requestTimer); + static json composeErrorResponseJson(const string& query, + const std::string& errorMsg, + ad_utility::Timer& requestTimer); static ad_utility::streams::stream_generator composeTurtleResponse( const ParsedQuery& query, const QueryExecutionTree& qet); diff --git a/src/util/HttpServer/HttpServer.h b/src/util/HttpServer/HttpServer.h index 85c6d27c59..4eec41049a 100644 --- a/src/util/HttpServer/HttpServer.h +++ b/src/util/HttpServer/HttpServer.h @@ -21,20 +21,24 @@ using tcp = boost::asio::ip::tcp; // from <boost/asio/ip/tcp.hpp> /* * \brief A Simple HttpServer, based on Boost::Beast. Its can be configured via - * the mandatory HttpHandler parameter. \tparam HttpHandler A callable type that - * takes two parameters, a `http::request<...>` , and a `sendAction` and returns - * an awaitable<void> type. 
sendAction always is a callable that takes a - * http::message, and returns an awaitable<void>; The behavior is then as - * follows: as soon as the Server receives a http request, co_await - * _httpHandler(move(request), sendAction) is called. (_httpHandler is a member - * of type HttpHandler). The expected behavior of this call is that _httpHandler - * takes the request, computes the corresponding `response`, and calls co_await - * sendAction(response). The `sendAction` is needed because the `response` might - * different types (in beast, a http::message is templated on the body type). - * For this reason, this approach is more flexible, than having _httpHandler - * simply return the response. A very basic HttpHandler, which simply serves - * files from a directory, can be obtained via - * `ad_utility::httpUtils::makeFileServer()`. + * the mandatory HttpHandler parameter. + * + * \tparam HttpHandler A callable type that takes two parameters, a + * `http::request<...>` , and a `sendAction` and returns an awaitable<void> + * type. sendAction always is a callable that takes a http::message, and returns + * an awaitable<void>. + * + * The behavior is then as follows: as soon as the server receives a HTTP + * request, co_await _httpHandler(move(request), sendAction) is called. + * (_httpHandler is a member of type HttpHandler). The expected behavior of this + * call is that _httpHandler takes the request, computes the corresponding + * `response`, and calls co_await sendAction(response). The `sendAction` is + * needed because the `response` can have different types (in beast, a + * http::message is templated on the body type). For this reason, this approach + * is more flexible, than having _httpHandler simply return the response. + * + * A very basic HttpHandler, which simply serves files from a directory, can be + * obtained via `ad_utility::httpUtils::makeFileServer()`. 
*/ template <typename HttpHandler> class HttpServer { diff --git a/src/util/HttpServer/HttpUtils.h b/src/util/HttpServer/HttpUtils.h index 584d181f67..7f20f2c36d 100644 --- a/src/util/HttpServer/HttpUtils.h +++ b/src/util/HttpServer/HttpUtils.h @@ -134,11 +134,10 @@ static auto createJsonResponse(const json& j, const auto& request, /// Create a HttpResponse with status 404 Not Found. The string body will be a /// default message including the name of the file that was not found, which can /// be read from the request directly. -static auto createNotFoundResponse(const HttpRequest auto& request) { - return createHttpResponseFromString("Resource \"" + - std::string(request.target()) + - "\" was not found on this server", - http::status::not_found, request); +static auto createNotFoundResponse(const std::string& errorMsg, + const HttpRequest auto& request) { + return createHttpResponseFromString(errorMsg, http::status::not_found, + request); } /// Create a HttpResponse with status 400 Bad Request. @@ -206,27 +205,25 @@ boost::asio::awaitable<void> makeFileServerImpl( // Make sure we can handle the method if (request.method() != http::verb::get && request.method() != http::verb::head) { - co_await send(createBadRequestResponse( - "Unknown HTTP-method, only GET and HEAD requests are supported", - request)); - co_return; + throw std::runtime_error( + "When serving files, only GET and HEAD requests are supported"); } // Decode the path and check that it is absolute and contains no "..". auto urlPath = ad_utility::UrlParser::getDecodedPathAndCheck(request.target()); if (!urlPath.has_value()) { - co_await send(createBadRequestResponse( - "Invalid url path \"" + std::string{request.target()} + '"', request)); - co_return; + throw std::runtime_error( + absl::StrCat("Invalid URL path \"", request.target(), "\"")); } // Check if the target is in the whitelist. The `target()` starts with a // slash, entries in the whitelist don't. 
+ auto urlPathWithFirstCharRemoved = urlPath.value().substr(1); if (whitelist.has_value() && - !whitelist.value().contains(urlPath.value().substr(1))) { - co_await send(createNotFoundResponse(request)); - co_return; + !whitelist.value().contains(urlPathWithFirstCharRemoved)) { + throw std::runtime_error(absl::StrCat( + "Resource \"", urlPathWithFirstCharRemoved, "\" not in whitelist")); } // Build the path to the requested file on the file system. @@ -240,12 +237,15 @@ boost::asio::awaitable<void> makeFileServerImpl( // Handle the case where the file doesn't exist. if (errorCode == beast::errc::no_such_file_or_directory) { - co_await send(createNotFoundResponse(request)); - co_return; + std::string errorMsg = + absl::StrCat("Resource \"", request.target(), "\" not found"); + LOG(ERROR) << errorMsg << std::endl; + co_return co_await send(createNotFoundResponse(errorMsg, request)); } // Handle an unknown error. if (errorCode) { + LOG(ERROR) << errorCode.message() << std::endl; co_return co_await send( createServerErrorResponse(errorCode.message(), request)); } diff --git a/src/util/HttpServer/UrlParser.cpp b/src/util/HttpServer/UrlParser.cpp index 598b1cbe82..b290046b8c 100644 --- a/src/util/HttpServer/UrlParser.cpp +++ b/src/util/HttpServer/UrlParser.cpp @@ -1,18 +1,23 @@ - - -// Copyright 2021, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach <kalmbach@cs.uni-freiburg.de> +// Copyright 2022, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Authors: Johannes Kalmbach <kalmbach@cs.uni-freiburg.de> +// Hannah Bast <bast@cs.uni-freiburg.de> #include "UrlParser.h" #include "../Exception.h" + using namespace ad_utility; -using std::string; // _____________________________________________________________________________ -string UrlParser::applyPercentDecoding(std::string_view url) { - string decoded; +std::string UrlParser::applyPercentDecoding(std::string_view url, + bool urlDecode) { + // If no decoding is wanted, just convert to `std::string`. + if (urlDecode == false) { + return std::string{url}; + } + // Otherwise resolve all %XX. + std::string decoded; for (size_t i = 0; i < url.size(); ++i) { if (url[i] == '+') { decoded += ' '; @@ -45,27 +50,42 @@ string UrlParser::applyPercentDecoding(std::string_view url) { } // ___________________________________________________________________________ -UrlParser::UrlTarget UrlParser::parseTarget(std::string_view target) { - static constexpr auto npos = std::string_view::npos; - UrlTarget result; +UrlParser::UrlPathAndParameters UrlParser::parseGetRequestTarget( + std::string_view target, bool urlDecode) { + UrlPathAndParameters result; - target = target.substr(0, target.find('#')); + // Remove everything after # (including it). Does nothing if there is no #. + // Don't do this if `urlDecode == false` because in that case, the given + // string contains an unencoded SPARQL query, which frequently contains a # as + // a regular character. + if (urlDecode == true) { + target = target.substr(0, target.find('#')); + } + + // Set `_path` and remove it from `target`. If there is no query string (part + // starting with "?"), we are done at this point. size_t index = target.find('?'); - result._target = target.substr(0, index); - if (index == npos) { + result._path = target.substr(0, index); + if (index == std::string::npos) { return result; } target.remove_prefix(index + 1); + + // Parse the query string and store the result in a hash map.
Throw an error + // if the same key appears twice in the query string. Note that this excludes + // having two "cmd=..." parameters, although that would be meaningful (though + // not necessary) to support. while (true) { auto next = target.find('&'); - auto paramAndValue = parseSingleKeyValuePair(target.substr(0, next)); + auto paramAndValue = + parseSingleKeyValuePair(target.substr(0, next), urlDecode); auto [iterator, isNewElement] = result._parameters.insert(std::move(paramAndValue)); if (!isNewElement) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Duplicate HTTP parameter: " + iterator->first); } - if (next == npos) { + if (next == std::string::npos) { break; } target.remove_prefix(next + 1); @@ -75,14 +95,14 @@ UrlParser::UrlTarget UrlParser::parseTarget(std::string_view target) { // ____________________________________________________________________________ std::pair<std::string, std::string> UrlParser::parseSingleKeyValuePair( - std::string_view input) { + std::string_view input, bool urlDecode) { size_t posOfEq = input.find('='); if (posOfEq == std::string_view::npos) { AD_THROW(ad_semsearch::Exception::BAD_REQUEST, "Parameter without \"=\" in HTTP Request. 
" + std::string{input}); } - std::string param{applyPercentDecoding(input.substr(0, posOfEq))}; - std::string value{applyPercentDecoding(input.substr(posOfEq + 1))}; + std::string param{applyPercentDecoding(input.substr(0, posOfEq), urlDecode)}; + std::string value{applyPercentDecoding(input.substr(posOfEq + 1), urlDecode)}; return {std::move(param), std::move(value)}; } @@ -90,7 +110,7 @@ std::pair<std::string, std::string> UrlParser::parseSingleKeyValuePair( std::optional<std::string> UrlParser::getDecodedPathAndCheck( std::string_view target) noexcept { try { - auto filename = parseTarget(target)._target; + auto filename = parseGetRequestTarget(target)._path; AD_CHECK(filename.starts_with('/')); AD_CHECK(filename.find("..") == string::npos); return filename; diff --git a/src/util/HttpServer/UrlParser.h b/src/util/HttpServer/UrlParser.h index a83636d100..e602d50759 100644 --- a/src/util/HttpServer/UrlParser.h +++ b/src/util/HttpServer/UrlParser.h @@ -1,6 +1,7 @@ -// Copyright 2021, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach <kalmbach@cs.uni-freiburg.de> +// Copyright 2022, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Authors: Johannes Kalmbach <kalmbach@cs.uni-freiburg.de> +// Hannah Bast <bast@cs.uni-freiburg.de> #ifndef QLEVER_URLPARSER_H #define QLEVER_URLPARSER_H @@ -21,17 +22,26 @@ namespace ad_utility { */ class UrlParser { public: - struct UrlTarget { - std::string _target; + /// Representation of the "path" and "query" of a URL. For a GET request, the + /// "path" is the part before the "?" (or everything if there is no "?"), and + /// the "query" is the part after the "?" (empty if there is no "?"). The + /// key-value pairs of the "query" are stored in a hash map. 
+ struct UrlPathAndParameters { + std::string _path; ad_utility::HashMap<std::string, std::string> _parameters; }; - // ___________________________________________________________________________ - static std::string applyPercentDecoding(std::string_view url); + // URL-decode the given (part of a) URL. If the second argument is false, do + // nothing except converting the given `std::string_view` to `std::string`. + static std::string applyPercentDecoding(std::string_view url, + bool urlDecode = true); - /// Parse the `target` part of an HTTP GET Request, - /// for example, `/api.html?someKey=some+val%0Fue`. - static UrlTarget parseTarget(std::string_view target); + /// Parse the `target` part of an HTTP GET Request, for example, + /// `/api.html?someKey=some+val%0Fue`. The second argument specifies whether + /// the key-value pairs of the query string should be URL-decoded (default: + /// yes). + static UrlPathAndParameters parseGetRequestTarget(std::string_view target, + bool urlDecode = true); /// From the `target` part of an HTTP GET request, only extract the path, /// with percent decoding applied. E.g. `/target.html?key=value` will become @@ -43,8 +53,11 @@ class UrlParser { std::string_view target) noexcept; private: + // Helper function that parses a single key-value pair from a URL query + // string. The second argument specifies whether the key and value should be + // URL-decoded (default: yes). static std::pair<std::string, std::string> parseSingleKeyValuePair( - std::string_view input); + std::string_view input, bool urlDecode = true); }; } // namespace ad_utility diff --git a/src/util/Timer.h b/src/util/Timer.h index 381be5d6eb..4ed65135e5 100644 --- a/src/util/Timer.h +++ b/src/util/Timer.h @@ -90,11 +90,11 @@ class Timer { inline void setMsecs(off_t msecs) { _usecs = msecs * (off_t)(1000); } inline void setSecs(off_t secs) { _usecs = secs * (off_t)(1000000); } - //! 
Time at last stop (initially zero) - off_t value() const { return _usecs; } /* in microseconds */ - off_t usecs() const { return _usecs; } /* in microseconds */ - off_t msecs() const { return _usecs / 1000; } /* in milliseconds */ - float secs() const { return _usecs / 1000000.0; } /* in seconds */ + //! Time at last stop (initially zero). + off_t value() const { return _usecs; } + off_t usecs() const { return _usecs; } + off_t msecs() const { return (_usecs + 500) / 1000; } + float secs() const { return _usecs / 1000000.0; } // is the timer currently running bool isRunning() const { return _running; }