From 19ea9fc6e3fd4a1bbba89872ec47f7e59dce1866 Mon Sep 17 00:00:00 2001 From: Jens Alfke Date: Tue, 28 Jan 2025 13:29:42 -0800 Subject: [PATCH] Optimize parallel use of JSONSchema Some more optimizations to avoid lock contention (on SharedKeys) when multiple threads validate against the same schema. Also added the travel-sample schema document to the repo. --- Experimental/JSONSchema.cc | 20 ++-- Experimental/JSONSchema.hh | 4 +- Tests/SchemaTests.cc | 53 +++++++--- Tests/travel-schema.json | 207 +++++++++++++++++++++++++++++++++++++ 4 files changed, 261 insertions(+), 23 deletions(-) create mode 100644 Tests/travel-schema.json diff --git a/Experimental/JSONSchema.cc b/Experimental/JSONSchema.cc index bcb075b6..53365a46 100644 --- a/Experimental/JSONSchema.cc +++ b/Experimental/JSONSchema.cc @@ -228,6 +228,7 @@ namespace fleece { AnyOf, Const, Contains, + Else, Enum, ExclusiveMaximum, ExclusiveMinimum, @@ -253,6 +254,7 @@ namespace fleece { PropertyNames, Ref, Required, + Then, Type, UniqueItems, NKeys_ @@ -265,6 +267,7 @@ namespace fleece { "anyOf", "const", "contains", + "else", "enum", "exclusiveMaximum", "exclusiveMinimum", @@ -290,6 +293,7 @@ namespace fleece { "propertyNames", "$ref", "required", + "then", "type", "uniqueItems", }; @@ -663,16 +667,14 @@ namespace fleece { ,_value(value) { _result = check(value, _schema.schema(), _schema.schema().asDict()); - if (ok()) { + if (ok()) _result = {}; // ensure _result.value is nullptr - _value = nullptr; // release my reference - } } using Result = JSONSchema::Validation::Result; - static Result mkResult(JSONSchema::Error error, Value value, Dict schema, slice schemaKey) { + static Result mkResult(JSONSchema::Error error, Value value, Value schema, slice schemaKey) { // cerr << "\tError: " << JSONSchema::errorString(error) << " for " << value.toJSONString() << " failed " << string_view(schemaKey) // << ": " << schema[schemaKey].toJSONString() << endl; return Result{error, value, schema, schemaKey}; @@ -705,7 +707,7 @@ namespace fleece { } } else if (schemaVal.type() == kFLBoolean) [[likely]] { // `true` matches anything, `false` matches nothing: - return mkResult(schemaVal.asBool() ? Error::ok : Error::invalid, value, nullptr, nullslice); + return mkResult(schemaVal.asBool() ? Error::ok : Error::invalid, value, schemaVal, nullslice); } else { fail("invalid value type in schema"); } @@ -778,7 +780,7 @@ namespace fleece { // "if", "then", "else": if (Value ifSchema = schema[SHARED_KEY(If)]) { - Value thenSchema = schema["then"], elseSchema = schema["else"]; + Value thenSchema = schema[SHARED_KEY(Then)], elseSchema = schema[SHARED_KEY(Else)]; if (thenSchema || elseSchema) { bool ifOK = ok(check(value, ifSchema, schemaBase)); if (Value nextSchema = ifOK ? thenSchema : elseSchema) { @@ -957,7 +959,7 @@ namespace fleece { } // "properties": Specific property names with their own sub-schemas - for (Dict::iterator i(properties); i; ++i) { + for (Dict::iterator i(properties, sSchemaSharedKeys); i; ++i) { slice key = i.keyString(); if (Value val = dict[key]) { if (auto err = check(val, i.value(), schemaBase); !ok(err)) [[unlikely]] @@ -969,7 +971,7 @@ namespace fleece { // "patternProperties": Sub-schemas to apply to properties whose names match patterns if (patternProperties) { - for (Dict::iterator i(patternProperties); i; ++i) { + for (Dict::iterator i(patternProperties, sSchemaSharedKeys); i; ++i) { slice pattern = i.keyString(); for (Dict::iterator j(dict); j; ++j) { slice dictKey = j.keyString(); @@ -1047,7 +1049,7 @@ namespace fleece { "outOfRange", "notMultiple", "tooShort", - "tooLong" + "tooLong", "patternMismatch", "missingProperty", "unknownProperty", diff --git a/Experimental/JSONSchema.hh b/Experimental/JSONSchema.hh index 94ba4327..d389b9b6 100644 --- a/Experimental/JSONSchema.hh +++ b/Experimental/JSONSchema.hh @@ -145,7 +145,7 @@ namespace fleece { /// to register it, then call \ref validate again to retry. std::string const& unknownSchemaID() const noexcept {return _unknownSchema;} - struct Result {Error error; Value value; Dict schema; slice schemaKey;}; + struct Result {Error error; Value value; Value schema; slice schemaKey;}; static bool ok(Result const& e) noexcept {return e.error == Error::ok;} private: friend class JSONSchema; @@ -164,7 +164,7 @@ namespace fleece { struct pathItem { slice key; int index = -1; }; JSONSchema const& _schema; // Schema (unused after constructor) - RetainedValue _value; // The root Value being validated + Value _value; // The root Value being validated Result _result {}; // Result of last check std::string _unknownSchema; // Unknown schema ID found during validation }; diff --git a/Tests/SchemaTests.cc b/Tests/SchemaTests.cc index 72ba53ec..73f58b96 100644 --- a/Tests/SchemaTests.cc +++ b/Tests/SchemaTests.cc @@ -5,6 +5,7 @@ #include "JSONSchema.hh" #include "JSON5.hh" #include "FleeceTests.hh" +#include #include #include #include @@ -177,10 +178,11 @@ TEST_CASE_METHOD(SchemaTest,"JSON Schema Test Suite", "[Schema]") { TEST_CASE("JSON Schema benchmark", "[.Perf]") { + static constexpr const char* kDataFile = "/Users/snej/Couchbase/DataSets/travel-sample/travel.json"; vector database; { Benchmark bench; - FILE* in = fopen("/Users/snej/Couchbase/DataSets/travel-sample/travel.json", "r"); + FILE* in = fopen(kDataFile, "r"); REQUIRE(in); char* lineBuf = nullptr; size_t bufSize = 0; @@ -195,6 +197,9 @@ TEST_CASE("JSON Schema benchmark", "[.Perf]") { bench.stop(); REQUIRE(doc); database.push_back(std::move(doc)); + #ifndef NDEBUG + //if (database.size() > 2000) {break;} // speeds up debugging + #endif } free(lineBuf); fclose(in); @@ -203,19 +208,43 @@ TEST_CASE("JSON Schema benchmark", "[.Perf]") { bench.printReport(1.0, "document"); } - JSONSchema schema(readFile("/Users/snej/Couchbase/DataSets/travel-sample/travel-schema.json")); + JSONSchema schema(readFile((string(kTestFilesDir) + "travel-schema.json").c_str())); - Benchmark bench; - for (auto& doc : database) { + SECTION("Single-threaded") { + Benchmark bench; + for (auto& doc : database) { + bench.start(); + auto result = schema.validate(doc.root()); + bench.stop(); + if (!result) { + slice id = doc.asDict()["_id"].asString(); + FAIL("Doc " << id << " failed: " << result.errorString() << " at " << result.errorPath() + << " (" << result.errorValue().toJSONString() << "), schema at " << result.errorSchemaURI()); + } + } + fprintf(stderr, "Checked %zu documents: ", database.size()); + bench.printReport(1.0, "document"); + } + + SECTION("Parallel") { + static const size_t kBatchSize = (database.size() + 15) / 16; + size_t const n = database.size(); + vector> futures; + Benchmark bench; bench.start(); - auto result = schema.validate(doc.root()); - bench.stop(); - if (!result) { - slice id = doc.asDict()["_id"].asString(); - FAIL("Doc " << id << " failed: " << result.errorString() << " at " << result.errorPath() - << ", schema at " << result.errorSchemaURI()); + for (size_t taskFirst = 0; taskFirst < n; taskFirst += kBatchSize) { + futures.emplace_back( async(function([&](size_t first) { + size_t last = std::min(first + kBatchSize, n); + for (size_t i = first; i < last; ++i) { + auto result = schema.validate(database[i].root()); + if (!result) + throw runtime_error("Validation failed!"); + } + }), taskFirst)); } + for (auto& f : futures) f.wait(); + bench.stop(); + fprintf(stderr, "Checked %zu documents: ", database.size()); + bench.printReport(1.0 / n, "document"); } - fprintf(stderr, "Checked %zu documents: ", database.size()); - bench.printReport(1.0, "document"); } diff --git a/Tests/travel-schema.json b/Tests/travel-schema.json new file mode 100644 index 00000000..3ad316c3 --- /dev/null +++ b/Tests/travel-schema.json @@ -0,0 +1,207 @@ +{ + "$comment": "JSON Schema for Couchbase's travel-sample database.", + "$schema": "https://json-schema.org/draft/2020-12/schema", + + "type": "object", + + "if": { + "properties": {"type": {"const": "route"}} + }, + "then": + {"$ref": "#/$defs/route"}, + "else": { + "if": { + "properties": {"type": {"const": "airline"}} + }, + "then": { + "$ref": "#/$defs/airline" + }, + "else": { + "if": { + "properties": {"type": {"const": "airport"}} + }, + "then": { + "$ref": "#/$defs/airport" + }, + "else": { + "if": { + "properties": {"type": {"const": "hotel"}} + }, + "then": { + "$ref": "#/$defs/hotel" + }, + "else": { + "$ref": "#/$defs/landmark" + } + } + } + }, + + + "$defs": { + "airline": { + "properties": { + "type": {"const": "airline"}, + "id": {"type": "number"}, + "_id": {"type": "string", "$comment": "Used in some JSON dumps"}, + "callsign": {"type": ["string", "null"], "minLength": 2, "maxLength": 20}, + "country": {"type": "string"}, + "iata": {"type": ["string", "null"], "minLength": 2, "maxLength": 4}, + "icao": {"type": "string", "minLength": 2, "maxLength": 4}, + "name": {"type": "string"} + }, + "required": ["id", "type", "name", "callsign", "country"], + "additionalProperties": false + }, + + "airport": { + "properties": { + "type": {"const": "airport"}, + "id": {"type": "number"}, + "_id": {"type": "string"}, + "airportname": {"type": "string"}, + "city": {"type": "string"}, + "country": {"type": "string"}, + "faa": {"type": ["string", "null"], "minLength": 3, "maxLength": 3}, + "geo": {"$ref": "#/$defs/geo"}, + "icao": {"type": ["string", "null"], "minLength": 2, "maxLength": 4}, + "tz": {"type": "string"} + }, + "required": ["id", "type", "airportname", "city", "country", "icao", "tz", "geo"], + "additionalProperties": false + }, + + "hotel": { + "properties": { + "type": {"const": "hotel"}, + "id": {"type": "number"}, + "_id": {"type": "string"}, + "address": {"type": ["string", "null"]}, + "alias": {"type": ["string", "null"]}, + "checkin": {"type": ["string", "null"]}, + "checkout": {"type": ["string", "null"]}, + "city": {"type": ["string", "null"]}, + "country": {"type": "string"}, + "description": {"type": "string"}, + "directions": {"type": ["string", "null"]}, + "email": {"$ref": "#/$defs/email"}, + "fax": {"type": ["string", "null"]}, + "free_breakfast": {"type": "boolean"}, + "free_internet": {"type": "boolean"}, + "free_parking": {"type": "boolean"}, + "geo": {"$ref": "#/$defs/geo"}, + "name": {"type": "string"}, + "pets_ok": {"type": "boolean"}, + "phone": {"type": ["string", "null"]}, + "price": {"type": ["string", "null"]}, + "public_likes": { + "type": ["array"], + "items": {"type": "string"} + }, + "reviews": { + "type": "array", + "items": {"$ref": "#/$defs/review"} + }, + "state": {"type": ["string", "null"]}, + "title": {"type": "string"}, + "tollfree": {"type": ["string", "null"]}, + "url": {"$ref": "#/$defs/url"}, + "vacancy": {"type": "boolean"} + }, + "required": ["id", "type", "name", "country", "city", "geo"], + "additionalProperties": false + }, + + "landmark": { + "properties": { + "type": {"const": "landmark"}, + "id": {"type": "number"}, + "_id": {"type": "string"}, + "activity": {"enum": ["buy", "do", "drink", "eat", "listing", "see"]}, + "address": {"type": ["string", "null"]}, + "alt": {"type": ["string", "null"]}, + "city": {"type": ["string", "null"]}, + "content": {"type": "string"}, + "country": {"type": "string"}, + "directions": {"type": ["string", "null"]}, + "email": {"$ref": "#/$defs/email"}, + "geo": {"$ref": "#/$defs/geo"}, + "hours": {"type": ["string", "null"]}, + "image": {"type": ["string", "null"]}, + "image_direct_url": {"$ref": "#/$defs/url"}, + "name": {"type": "string"}, + "phone": {"type": ["string", "null"]}, + "price": {"type": ["string", "null"]}, + "state": {"type": ["string", "null"]}, + "title": {"type": "string"}, + "tollfree": {"type": ["string", "null"]}, + "url": {"$ref": "#/$defs/url"} + }, + "required": ["id", "type", "name", "country", "city", "geo", "content", "activity"], + "additionalProperties": false + }, + + "route": { + "properties": { + "type": {"const": "route"}, + "id": {"type": "number"}, + "_id": {"type": "string"}, + "airline": {"type": "string"}, + "airlineid": {"type": "string"}, + "destinationairport": {"type": "string"}, + "distance": {"type": "number", "exclusiveMinimum": 0.0}, + "equipment": {"type": ["string", "null"]}, + "sourceairport": {"type": "string"}, + "stops": {"type": "integer", "minimum": 0}, + "schedule": { + "type": "array", + "items": { + "type": "object", + "properties": { + "day": {"type": "integer", "minimum": 0, "maximum": 6}, + "utc": {"type": "string", "pattern": "^\\d\\d:\\d\\d:\\d\\d$"}, + "flight": {"type": "string"} + }, + "required": ["day", "utc", "flight"], + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": ["id", "type", "airline", "airlineid", "distance", "equipment", + "sourceairport", "destinationairport", "schedule", "stops"], + "additionalProperties": false + }, + + "geo": { + "type": "object", + "properties": { + "lat": {"type": "number", "minimum": -90.0, "maximum": 90.0}, + "lon": {"type": "number", "minimum": -180.0, "maximum": 180.0}, + "alt": {"type": "number"}, + "accuracy": {"enum": ["ROOFTOP", "RANGE_INTERPOLATED", "APPROXIMATE"]} + }, + "required": ["lat", "lon"], + "additionalProperties": false + }, + + "review": { + "type": "object", + "properties": { + "author": {"type": "string"}, + "content": {"type": "string"}, + "date": {"type": "string"}, + "ratings": { + "type": "object", + "additionalProperties": {"type": "integer", "minimum": -1, "maximum": 5} + } + }, + "required": ["author", "content", "date"], + "additionalProperties": false + }, + + "email": {"type": ["string", "null"], "pattern": "@"}, + + "url": {"type": ["string", "null"], "pattern": "^([hH]ttps?://.*)?$"} + } +}