From a63bfa27030971a10fd2bfe20dc4b2e5224e90d8 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Fri, 31 Jan 2025 19:57:57 +0100 Subject: [PATCH 1/7] chore: Rework Registry Signed-off-by: Dmitry Dygalo --- crates/jsonschema-referencing/Cargo.toml | 1 + .../src/anchors/keys.rs | 11 +- .../jsonschema-referencing/src/anchors/mod.rs | 111 ++++++--- crates/jsonschema-referencing/src/cache.rs | 100 ++++++++ crates/jsonschema-referencing/src/hasher.rs | 88 +++++++ crates/jsonschema-referencing/src/lib.rs | 2 + crates/jsonschema-referencing/src/registry.rs | 222 +++++++++++------- crates/jsonschema-referencing/src/resolver.rs | 27 ++- crates/jsonschema-referencing/src/resource.rs | 181 ++++++++++---- .../src/specification/draft201909.rs | 8 +- .../src/specification/draft4.rs | 4 +- .../src/specification/draft6.rs | 4 +- .../src/specification/draft7.rs | 4 +- .../src/specification/mod.rs | 3 +- .../src/specification/subresources.rs | 12 +- crates/jsonschema/src/compiler.rs | 2 +- 16 files changed, 582 insertions(+), 198 deletions(-) create mode 100644 crates/jsonschema-referencing/src/cache.rs create mode 100644 crates/jsonschema-referencing/src/hasher.rs diff --git a/crates/jsonschema-referencing/Cargo.toml b/crates/jsonschema-referencing/Cargo.toml index cbb8556b..5ad06e31 100644 --- a/crates/jsonschema-referencing/Cargo.toml +++ b/crates/jsonschema-referencing/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true ahash.workspace = true fluent-uri = { version = "0.3.2", features = ["serde"] } once_cell = "1.20.1" +parking_lot = "0.12.3" percent-encoding = "2.3.1" serde_json.workspace = true diff --git a/crates/jsonschema-referencing/src/anchors/keys.rs b/crates/jsonschema-referencing/src/anchors/keys.rs index 882ee9e6..a3d2b1b1 100644 --- a/crates/jsonschema-referencing/src/anchors/keys.rs +++ b/crates/jsonschema-referencing/src/anchors/keys.rs @@ -12,18 +12,21 @@ use std::{ borrow::Borrow, hash::{Hash, Hasher}, + sync::Arc, }; use fluent_uri::Uri; +use super::AnchorName; 
+ #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub(crate) struct AnchorKey { - uri: Uri, - name: String, + uri: Arc>, + name: AnchorName, } impl AnchorKey { - pub(crate) fn new(uri: Uri, name: String) -> Self { + pub(crate) fn new(uri: Arc>, name: AnchorName) -> Self { Self { uri, name } } } @@ -50,7 +53,7 @@ pub(crate) trait BorrowDyn { impl BorrowDyn for AnchorKey { fn borrowed_key(&self) -> AnchorKeyRef { - AnchorKeyRef::new(&self.uri, &self.name) + AnchorKeyRef::new(&self.uri, self.name.as_str()) } } diff --git a/crates/jsonschema-referencing/src/anchors/mod.rs b/crates/jsonschema-referencing/src/anchors/mod.rs index df6eaa25..94b81a24 100644 --- a/crates/jsonschema-referencing/src/anchors/mod.rs +++ b/crates/jsonschema-referencing/src/anchors/mod.rs @@ -1,33 +1,80 @@ -use std::sync::Arc; +use std::{ + hash::Hash, + sync::atomic::{AtomicPtr, Ordering}, +}; use serde_json::Value; mod keys; -use crate::{Draft, Error, Resolved, Resolver, Resource}; +use crate::{resource::InnerResourcePtr, Draft, Error, Resolved, Resolver}; pub(crate) use keys::{AnchorKey, AnchorKeyRef}; +#[derive(Debug)] +pub(crate) struct AnchorName { + ptr: AtomicPtr, + len: usize, +} + +impl AnchorName { + fn new(s: &str) -> Self { + Self { + ptr: AtomicPtr::new(s.as_ptr().cast_mut()), + len: s.len(), + } + } + + fn as_str(&self) -> &str { + unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + self.ptr.load(Ordering::Relaxed), + self.len, + )) + } + } +} + +impl Clone for AnchorName { + fn clone(&self) -> Self { + Self { + ptr: AtomicPtr::new(self.ptr.load(Ordering::Relaxed)), + len: self.len, + } + } +} + +impl Hash for AnchorName { + fn hash(&self, state: &mut H) { + self.as_str().hash(state); + } +} + +impl PartialEq for AnchorName { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +impl Eq for AnchorName {} + /// An anchor within a resource. 
-#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone)] pub(crate) enum Anchor { Default { - draft: Draft, - name: String, - resource: Arc, + name: AnchorName, + resource: InnerResourcePtr, }, - /// Dynamic anchors from Draft 2020-12. Dynamic { - draft: Draft, - name: String, - resource: Arc, + name: AnchorName, + resource: InnerResourcePtr, }, } impl Anchor { /// Anchor's name. - pub(crate) fn name(&self) -> &str { + pub(crate) fn name(&self) -> AnchorName { match self { - Anchor::Default { name, .. } | Anchor::Dynamic { name, .. } => name, + Anchor::Default { name, .. } | Anchor::Dynamic { name, .. } => name.clone(), } } /// Get the resource for this anchor. @@ -38,10 +85,10 @@ impl Anchor { resolver, resource.draft(), )), - Anchor::Dynamic { name, resource, .. } => { + Anchor::Dynamic { name, resource } => { let mut last = resource; for uri in &resolver.dynamic_scope() { - match resolver.registry.anchor(uri, name) { + match resolver.registry.anchor(uri, name.as_str()) { Ok(anchor) => { if let Anchor::Dynamic { resource, .. 
} = anchor { last = resource; @@ -53,7 +100,7 @@ impl Anchor { } Ok(Resolved::new( last.contents(), - resolver.in_subresource((**last).as_ref())?, + resolver.in_subresource_inner(last)?, last.draft(), )) } @@ -68,18 +115,16 @@ pub(crate) fn anchor(draft: Draft, contents: &Value) -> Box Box( Box::new( contents .as_object() - .and_then(|schema| schema.get("id").and_then(Value::as_str)) + .and_then(|schema| schema.get("id")) + .and_then(Value::as_str) .and_then(|id| id.strip_prefix('#')) .map(move |id| Anchor::Default { - draft, - name: id.to_string(), - resource: Arc::new(draft.create_resource(contents.clone())), + name: AnchorName::new(id), + resource: InnerResourcePtr::new(contents, draft), }) .into_iter(), ) diff --git a/crates/jsonschema-referencing/src/cache.rs b/crates/jsonschema-referencing/src/cache.rs new file mode 100644 index 00000000..3bccb5dd --- /dev/null +++ b/crates/jsonschema-referencing/src/cache.rs @@ -0,0 +1,100 @@ +use core::hash::{BuildHasherDefault, Hash, Hasher}; +use std::{ + collections::{hash_map::Entry, HashMap}, + sync::Arc, +}; + +use ahash::AHasher; +use fluent_uri::Uri; +use parking_lot::RwLock; + +use crate::{hasher::BuildNoHashHasher, uri, Error}; + +#[derive(Debug, Clone)] +pub(crate) struct UriCache { + cache: HashMap>, BuildNoHashHasher>, +} + +impl UriCache { + pub(crate) fn new() -> Self { + Self { + cache: HashMap::with_hasher(BuildHasherDefault::default()), + } + } + + pub(crate) fn with_capacity(capacity: usize) -> Self { + Self { + cache: HashMap::with_capacity_and_hasher(capacity, BuildHasherDefault::default()), + } + } + + pub(crate) fn resolve_against( + &mut self, + base: &Uri<&str>, + uri: impl AsRef, + ) -> Result>, Error> { + let mut hasher = AHasher::default(); + (base.as_str(), uri.as_ref()).hash(&mut hasher); + let hash = hasher.finish(); + + Ok(match self.cache.entry(hash) { + Entry::Occupied(entry) => Arc::clone(entry.get()), + Entry::Vacant(entry) => { + let new = Arc::new(uri::resolve_against(base, 
uri.as_ref())?); + Arc::clone(entry.insert(new)) + } + }) + } + + pub(crate) fn into_shared(self) -> SharedUriCache { + SharedUriCache { + cache: RwLock::new(self.cache), + } + } +} + +/// A dedicated type for URI resolution caching. +#[derive(Debug)] +pub(crate) struct SharedUriCache { + cache: RwLock>, BuildNoHashHasher>>, +} + +impl Clone for SharedUriCache { + fn clone(&self) -> Self { + Self { + cache: RwLock::new( + self.cache + .read() + .iter() + .map(|(k, v)| (*k, Arc::clone(v))) + .collect(), + ), + } + } +} + +impl SharedUriCache { + pub(crate) fn resolve_against( + &self, + base: &Uri<&str>, + uri: impl AsRef, + ) -> Result>, Error> { + let mut hasher = AHasher::default(); + (base.as_str(), uri.as_ref()).hash(&mut hasher); + let hash = hasher.finish(); + + if let Some(cached) = self.cache.read().get(&hash).cloned() { + return Ok(cached); + } + + let new = Arc::new(uri::resolve_against(base, uri.as_ref())?); + self.cache.write().insert(hash, Arc::clone(&new)); + Ok(new) + } + + pub(crate) fn into_local(self) -> UriCache { + UriCache { + cache: self.cache.into_inner(), + } + } +} diff --git a/crates/jsonschema-referencing/src/hasher.rs b/crates/jsonschema-referencing/src/hasher.rs new file mode 100644 index 00000000..c9e8d8da --- /dev/null +++ b/crates/jsonschema-referencing/src/hasher.rs @@ -0,0 +1,88 @@ +use core::hash::{BuildHasherDefault, Hasher}; + +pub(crate) type BuildNoHashHasher = BuildHasherDefault; + +#[derive(Default)] +pub(crate) struct NoHashHasher(u64); + +impl Hasher for NoHashHasher { + fn finish(&self) -> u64 { + self.0 + } + fn write(&mut self, _: &[u8]) { + unreachable!("Should not be used") + } + fn write_u8(&mut self, _: u8) { + unreachable!("Should not be used") + } + fn write_u16(&mut self, _: u16) { + unreachable!("Should not be used") + } + fn write_u32(&mut self, _: u32) { + unreachable!("Should not be used") + } + fn write_u64(&mut self, n: u64) { + self.0 = n; + } + fn write_usize(&mut self, _: usize) { + unreachable!("Should 
not be used") + } + fn write_i8(&mut self, _: i8) { + unreachable!("Should not be used") + } + fn write_i16(&mut self, _: i16) { + unreachable!("Should not be used") + } + fn write_i32(&mut self, _: i32) { + unreachable!("Should not be used") + } + fn write_i64(&mut self, _: i64) { + unreachable!("Should not be used") + } + fn write_isize(&mut self, _: isize) { + unreachable!("Should not be used") + } +} + +#[cfg(test)] +mod tests { + use super::NoHashHasher; + use std::hash::Hasher; + + macro_rules! test_panic { + ($($method:ident),+ $(,)?) => { + $( + mod $method { + use super::NoHashHasher; + use std::hash::Hasher; + + #[test] + #[should_panic(expected = "Should not be used")] + fn test_panic() { + let mut hasher = NoHashHasher::default(); + hasher.$method(42); + } + } + )+ + }; + } + + test_panic!( + write_u8, + write_u16, + write_u32, + write_usize, + write_i8, + write_i16, + write_i32, + write_i64, + write_isize + ); + + #[test] + #[should_panic(expected = "Should not be used")] + fn test_panic_write() { + let mut hasher = NoHashHasher::default(); + hasher.write(b"a"); + } +} diff --git a/crates/jsonschema-referencing/src/lib.rs b/crates/jsonschema-referencing/src/lib.rs index 230bef76..c909d95c 100644 --- a/crates/jsonschema-referencing/src/lib.rs +++ b/crates/jsonschema-referencing/src/lib.rs @@ -2,7 +2,9 @@ //! //! An implementation-agnostic JSON reference resolution library for Rust. 
mod anchors; +mod cache; mod error; +mod hasher; mod list; pub mod meta; mod registry; diff --git a/crates/jsonschema-referencing/src/registry.rs b/crates/jsonschema-referencing/src/registry.rs index a8bc6a3b..2c481a19 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -1,8 +1,8 @@ use std::{ - collections::VecDeque, - fmt::Debug, + collections::{hash_map::Entry, HashSet, VecDeque}, hash::{Hash, Hasher}, - sync::{Arc, RwLock}, + pin::Pin, + sync::Arc, }; use ahash::{AHashMap, AHashSet, AHasher}; @@ -12,15 +12,20 @@ use serde_json::Value; use crate::{ anchors::{AnchorKey, AnchorKeyRef}, + cache::{SharedUriCache, UriCache}, + hasher::BuildNoHashHasher, list::List, meta, - resource::unescape_segment, + resource::{unescape_segment, InnerResourcePtr, JsonSchemaResource}, uri, vocabularies::{self, VocabularySet}, Anchor, DefaultRetriever, Draft, Error, Resolver, Resource, Retrieve, }; -type ResourceMap = AHashMap, Arc>; +// SAFETY: `Pin` guarantees stable memory locations for resource pointers, +// while `Arc` enables cheap sharing between multiple registries +type DocumentStore = AHashMap>, Pin>>; +type ResourceMap = AHashMap>, InnerResourcePtr>; pub static SPECIFICATIONS: Lazy = Lazy::new(|| { let pairs = meta::META_SCHEMAS.into_iter().map(|(uri, schema)| { @@ -29,21 +34,27 @@ pub static SPECIFICATIONS: Lazy = Lazy::new(|| { Resource::from_contents(schema.clone()).expect("Invalid resource"), ) }); + // The capacity is known upfront + let mut documents = DocumentStore::with_capacity(18); let mut resources = ResourceMap::with_capacity(18); let mut anchors = AHashMap::with_capacity(8); + let mut resolution_cache = UriCache::with_capacity(35); process_resources( pairs, &DefaultRetriever, + &mut documents, &mut resources, &mut anchors, + &mut resolution_cache, Draft::default(), ) .expect("Failed to process meta schemas"); Registry { + documents, resources, anchors, - resolving_cache: 
RwLock::new(AHashMap::new()), + resolution_cache: resolution_cache.into_shared(), } }); @@ -55,17 +66,20 @@ pub static SPECIFICATIONS: Lazy = Lazy::new(|| { /// discoverable and retrievable via their own IDs. #[derive(Debug)] pub struct Registry { - resources: ResourceMap, + // Pinned storage for primary documents + documents: DocumentStore, + pub(crate) resources: ResourceMap, anchors: AHashMap, - resolving_cache: RwLock>>>, + resolution_cache: SharedUriCache, } impl Clone for Registry { fn clone(&self) -> Self { Self { + documents: self.documents.clone(), resources: self.resources.clone(), anchors: self.anchors.clone(), - resolving_cache: RwLock::new(AHashMap::new()), + resolution_cache: self.resolution_cache.clone(), } } } @@ -102,7 +116,7 @@ impl RegistryOptions { /// # Errors /// /// Returns an error if the URI is invalid or if there's an issue processing the resource. - pub fn try_new(self, uri: impl Into, resource: Resource) -> Result { + pub fn try_new(self, uri: impl AsRef, resource: Resource) -> Result { Registry::try_new_impl(uri, resource, &*self.retriever, self.draft) } /// Create a [`Registry`] from multiple resources using these options. @@ -112,7 +126,7 @@ impl RegistryOptions { /// Returns an error if any URI is invalid or if there's an issue processing the resources. pub fn try_from_resources( self, - pairs: impl Iterator, Resource)>, + pairs: impl Iterator, Resource)>, ) -> Result { Registry::try_from_resources_impl(pairs, &*self.retriever, self.draft) } @@ -140,7 +154,7 @@ impl Registry { /// # Errors /// /// Returns an error if the URI is invalid or if there's an issue processing the resource. - pub fn try_new(uri: impl Into, resource: Resource) -> Result { + pub fn try_new(uri: impl AsRef, resource: Resource) -> Result { Self::try_new_impl(uri, resource, &DefaultRetriever, Draft::default()) } /// Create a new [`Registry`] from an iterator of (URI, Resource) pairs. 
@@ -153,12 +167,12 @@ impl Registry { /// /// Returns an error if any URI is invalid or if there's an issue processing the resources. pub fn try_from_resources( - pairs: impl Iterator, Resource)>, + pairs: impl Iterator, Resource)>, ) -> Result { Self::try_from_resources_impl(pairs, &DefaultRetriever, Draft::default()) } fn try_new_impl( - uri: impl Into, + uri: impl AsRef, resource: Resource, retriever: &dyn Retrieve, draft: Draft, @@ -166,17 +180,28 @@ impl Registry { Self::try_from_resources_impl([(uri, resource)].into_iter(), retriever, draft) } fn try_from_resources_impl( - pairs: impl Iterator, Resource)>, + pairs: impl Iterator, Resource)>, retriever: &dyn Retrieve, draft: Draft, ) -> Result { + let mut documents = AHashMap::new(); let mut resources = ResourceMap::new(); let mut anchors = AHashMap::new(); - process_resources(pairs, retriever, &mut resources, &mut anchors, draft)?; + let mut resolution_cache = UriCache::new(); + process_resources( + pairs, + retriever, + &mut documents, + &mut resources, + &mut anchors, + &mut resolution_cache, + draft, + )?; Ok(Registry { + documents, resources, anchors, - resolving_cache: RwLock::new(AHashMap::new()), + resolution_cache: resolution_cache.into_shared(), }) } /// Create a new registry with a new resource. @@ -186,7 +211,7 @@ impl Registry { /// Returns an error if the URI is invalid or if there's an issue processing the resource. pub fn try_with_resource( self, - uri: impl Into, + uri: impl AsRef, resource: Resource, ) -> Result { let draft = resource.draft(); @@ -199,7 +224,7 @@ impl Registry { /// Returns an error if the URI is invalid or if there's an issue processing the resource. pub fn try_with_resource_and_retriever( self, - uri: impl Into, + uri: impl AsRef, resource: Resource, retriever: &dyn Retrieve, ) -> Result { @@ -213,7 +238,7 @@ impl Registry { /// Returns an error if any URI is invalid or if there's an issue processing the resources. 
pub fn try_with_resources( self, - pairs: impl Iterator, Resource)>, + pairs: impl Iterator, Resource)>, draft: Draft, ) -> Result { self.try_with_resources_and_retriever(pairs, &DefaultRetriever, draft) @@ -225,17 +250,28 @@ impl Registry { /// Returns an error if any URI is invalid or if there's an issue processing the resources. pub fn try_with_resources_and_retriever( self, - pairs: impl Iterator, Resource)>, + pairs: impl Iterator, Resource)>, retriever: &dyn Retrieve, draft: Draft, ) -> Result { + let mut documents = self.documents; let mut resources = self.resources; let mut anchors = self.anchors; - process_resources(pairs, retriever, &mut resources, &mut anchors, draft)?; + let mut resolution_cache = self.resolution_cache.into_local(); + process_resources( + pairs, + retriever, + &mut documents, + &mut resources, + &mut anchors, + &mut resolution_cache, + draft, + )?; Ok(Registry { + documents, resources, anchors, - resolving_cache: RwLock::new(AHashMap::new()), + resolution_cache: resolution_cache.into_shared(), }) } /// Create a new [`Resolver`] for this registry with the given base URI. 
@@ -260,17 +296,6 @@ impl Registry { ) -> Resolver { Resolver::from_parts(self, base_uri, scopes) } - pub(crate) fn get_or_retrieve<'r>(&'r self, uri: &Uri) -> Result<&'r Resource, Error> { - if let Some(resource) = self.resources.get(uri) { - Ok(resource) - } else { - Err(Error::unretrievable( - uri.as_str(), - "Retrieving external resources is not supported once the registry is populated" - .into(), - )) - } - } pub(crate) fn anchor<'a>(&self, uri: &'a Uri, name: &'a str) -> Result<&Anchor, Error> { let key = AnchorKeyRef::new(uri, name); if let Some(value) = self.anchors.get(key.borrow_dyn()) { @@ -291,32 +316,12 @@ impl Registry { } } - pub(crate) fn cached_resolve_against( + pub(crate) fn resolve_against( &self, base: &Uri<&str>, uri: &str, ) -> Result>, Error> { - let mut hasher = AHasher::default(); - (base.as_str(), uri).hash(&mut hasher); - let hash = hasher.finish(); - - let value = self - .resolving_cache - .read() - .expect("Lock is poisoned") - .get(&hash) - .cloned(); - - if let Some(cached) = value { - Ok(cached) - } else { - let new = Arc::new(uri::resolve_against(base, uri)?); - self.resolving_cache - .write() - .expect("Lock is poisoned") - .insert(hash, new.clone()); - Ok(new) - } + self.resolution_cache.resolve_against(base, uri) } #[must_use] pub fn find_vocabularies(&self, draft: Draft, contents: &Value) -> VocabularySet { @@ -339,23 +344,48 @@ impl Registry { } fn process_resources( - pairs: impl Iterator, Resource)>, + pairs: impl Iterator, Resource)>, retriever: &dyn Retrieve, + documents: &mut DocumentStore, resources: &mut ResourceMap, anchors: &mut AHashMap, + resolution_cache: &mut UriCache, default_draft: Draft, ) -> Result<(), Error> { let mut queue = VecDeque::with_capacity(32); - let mut seen = AHashSet::new(); + let mut seen = HashSet::with_hasher(BuildNoHashHasher::default()); let mut external = AHashSet::new(); let mut scratch = String::new(); + // TODO: Implement `Registry::combine` - // Populate the resources & queue from the 
input - for (uri, resource) in pairs { - let uri = uri::from_str(uri.into().trim_end_matches('#'))?; - let resource = Arc::new(resource); - resources.insert(uri.clone(), Arc::clone(&resource)); - queue.push_back((uri, resource)); + // SAFETY: Deduplicate input URIs keeping the last occurrence to prevent creation + // of resources pointing to values that could be dropped by later insertions + let mut input_pairs: Vec<(Uri, Resource)> = pairs + .map(|(uri, resource)| Ok((uri::from_str(uri.as_ref().trim_end_matches('#'))?, resource))) + .collect::, Error>>()? + .into_iter() + .rev() + .collect(); + input_pairs.dedup_by(|(lhs, _), (rhs, _)| lhs == rhs); + + // Store documents and create initial InnerResourcePtrs + for (uri, resource) in input_pairs { + let key = Arc::new(uri); + match documents.entry(Arc::clone(&key)) { + Entry::Occupied(_) => { + // SAFETY: Do not remove any existing documents so that all pointers are valid + // The registry does not allow overriding existing resources right now + } + Entry::Vacant(entry) => { + let (draft, contents) = resource.into_inner(); + let boxed = Arc::pin(contents); + let contents = std::ptr::addr_of!(*boxed); + let resource = InnerResourcePtr::new(contents, draft); + resources.insert(Arc::clone(&key), resource.clone()); + queue.push_back((key, resource)); + entry.insert(boxed); + } + } } loop { @@ -366,15 +396,12 @@ fn process_resources( // Process current queue and collect references to external resources while let Some((mut base, resource)) = queue.pop_front() { if let Some(id) = resource.id() { - base = uri::resolve_against(&base.borrow(), id)?; + base = resolution_cache.resolve_against(&base.borrow(), id)?; } // Look for anchors for anchor in resource.anchors() { - anchors.insert( - AnchorKey::new(base.clone(), anchor.name().to_string()), - anchor, - ); + anchors.insert(AnchorKey::new(base.clone(), anchor.name()), anchor); } // Collect references to external resources in this resource @@ -383,20 +410,21 @@ fn 
process_resources( resource.contents(), &mut external, &mut seen, + resolution_cache, &mut scratch, )?; // Process subresources for subresource in resource.subresources() { - let subresource = Arc::new(subresource?); - // Collect references to external resources at this level + let subresource = subresource?; if let Some(sub_id) = subresource.id() { - let base = uri::resolve_against(&base.borrow(), sub_id)?; + let base = resolution_cache.resolve_against(&base.borrow(), sub_id)?; collect_external_resources( &base, subresource.contents(), &mut external, &mut seen, + resolution_cache, &mut scratch, )?; } else { @@ -405,13 +433,15 @@ fn process_resources( subresource.contents(), &mut external, &mut seen, + resolution_cache, &mut scratch, )?; }; + queue.push_back((base.clone(), subresource)); } if resource.id().is_some() { - resources.insert(base, resource); + resources.insert(base, resource.clone()); } } // Retrieve external resources @@ -422,26 +452,28 @@ fn process_resources( let retrieved = retriever .retrieve(&fragmentless.borrow()) .map_err(|err| Error::unretrievable(fragmentless.as_str(), err))?; - let resource = Arc::new(Resource::from_contents_and_specification( - retrieved, - default_draft, - )?); - resources.insert(fragmentless.clone(), Arc::clone(&resource)); + + let draft = default_draft.detect(&retrieved)?; + let boxed = Arc::pin(retrieved); + let contents = std::ptr::addr_of!(*boxed); + let resource = InnerResourcePtr::new(contents, draft); + let key = Arc::new(fragmentless); + documents.insert(Arc::clone(&key), boxed); + resources.insert(Arc::clone(&key), resource.clone()); + if let Some(fragment) = uri.fragment() { // The original `$ref` could have a fragment that points to a place that won't // be discovered via the regular sub-resources discovery. 
Therefore we need to // explicitly check it if let Some(resolved) = pointer(resource.contents(), fragment.as_str()) { - queue.push_back(( - uri, - Arc::new(Resource::from_contents_and_specification( - resolved.clone(), - default_draft, - )?), - )); + let draft = default_draft.detect(resolved)?; + let contents = std::ptr::addr_of!(*resolved); + let resource = InnerResourcePtr::new(contents, draft); + queue.push_back((Arc::new(uri), resource)); } } - queue.push_back((fragmentless, resource)); + + queue.push_back((key, resource)); } } } @@ -453,7 +485,8 @@ fn collect_external_resources( base: &Uri, contents: &Value, collected: &mut AHashSet>, - seen: &mut AHashSet, + seen: &mut HashSet, + resolution_cache: &mut UriCache, scratch: &mut String, ) -> Result<(), Error> { // URN schemes are not supported for external resolution @@ -487,7 +520,14 @@ fn collect_external_resources( // Handle local references separately as they may have nested references to external resources if reference.starts_with('#') { if let Some(referenced) = pointer(contents, reference.trim_start_matches('#')) { - collect_external_resources(base, referenced, collected, seen, scratch)?; + collect_external_resources( + base, + referenced, + collected, + seen, + resolution_cache, + scratch, + )?; } continue; } @@ -501,7 +541,9 @@ fn collect_external_resources( None => (reference, None), }; - let mut resolved = uri::resolve_against(&base_without_fragment.borrow(), path)?; + let mut resolved = (*resolution_cache + .resolve_against(&base_without_fragment.borrow(), path)?) + .clone(); // Add the fragment back if present if let Some(fragment) = fragment { // It is cheaper to check if it is properly encoded than allocate given that @@ -517,7 +559,7 @@ fn collect_external_resources( } resolved } else { - uri::resolve_against(&base.borrow(), reference)? 
+ (*resolution_cache.resolve_against(&base.borrow(), reference)?).clone() }; collected.insert(resolved); diff --git a/crates/jsonschema-referencing/src/resolver.rs b/crates/jsonschema-referencing/src/resolver.rs index 63abbe60..d6368554 100644 --- a/crates/jsonschema-referencing/src/resolver.rs +++ b/crates/jsonschema-referencing/src/resolver.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use fluent_uri::Uri; use serde_json::Value; -use crate::{list::List, Draft, Error, Registry, ResourceRef}; +use crate::{list::List, resource::JsonSchemaResource, Draft, Error, Registry, ResourceRef}; /// A reference resolver. /// @@ -84,11 +84,17 @@ impl<'r> Resolver<'r> { }; let uri = self .registry - .cached_resolve_against(&self.base_uri.borrow(), uri)?; + .resolve_against(&self.base_uri.borrow(), uri)?; (uri, fragment) }; - let retrieved = self.registry.get_or_retrieve(&uri)?; + let Some(retrieved) = self.registry.resources.get(&*uri) else { + return Err(Error::unretrievable( + uri.as_str(), + "Retrieving external resources is not supported once the registry is populated" + .into(), + )); + }; if fragment.starts_with('/') { let resolver = self.evolve(uri); @@ -156,11 +162,16 @@ impl<'r> Resolver<'r> { /// # Errors /// /// Returns an error if the resource id cannot be resolved against the base URI of this resolver. - pub fn in_subresource(&self, subresource: ResourceRef) -> Result { + pub fn in_subresource(&self, subresource: ResourceRef<'_>) -> Result { + self.in_subresource_inner(&subresource) + } + + pub(crate) fn in_subresource_inner( + &self, + subresource: &impl JsonSchemaResource, + ) -> Result { if let Some(id) = subresource.id() { - let base_uri = self - .registry - .cached_resolve_against(&self.base_uri.borrow(), id)?; + let base_uri = self.registry.resolve_against(&self.base_uri.borrow(), id)?; Ok(Resolver { registry: self.registry, base_uri, @@ -197,7 +208,7 @@ impl<'r> Resolver<'r> { /// /// If the reference is invalid. 
pub fn resolve_against(&self, base: &Uri<&str>, uri: &str) -> Result>, Error> { - self.registry.cached_resolve_against(base, uri) + self.registry.resolve_against(base, uri) } } diff --git a/crates/jsonschema-referencing/src/resource.rs b/crates/jsonschema-referencing/src/resource.rs index 1e4fb442..94828789 100644 --- a/crates/jsonschema-referencing/src/resource.rs +++ b/crates/jsonschema-referencing/src/resource.rs @@ -1,10 +1,20 @@ -use std::borrow::Cow; +use std::{borrow::Cow, sync::atomic::AtomicPtr}; use serde_json::Value; use crate::{Anchor, Draft, Error, Resolved, Resolver, Segments}; -/// A document with a concrete interpretation under a JSON Schema specification. +pub(crate) trait JsonSchemaResource { + fn contents(&self) -> &Value; + fn draft(&self) -> Draft; + fn id(&self) -> Option<&str> { + self.draft() + .id_of(self.contents()) + .map(|id| id.trim_end_matches('#')) + } +} + +/// An owned document with a concrete interpretation under a JSON Schema specification. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Resource { contents: Value, @@ -15,6 +25,9 @@ impl Resource { pub(crate) fn new(contents: Value, draft: Draft) -> Self { Self { contents, draft } } + pub(crate) fn into_inner(self) -> (Draft, Value) { + (self.draft, self.contents) + } /// Resource contents. #[must_use] pub fn contents(&self) -> &Value { @@ -43,25 +56,113 @@ impl Resource { /// Resource identifier. #[must_use] pub fn id(&self) -> Option<&str> { - self.as_ref().id() + self.draft + .id_of(&self.contents) + .map(|id| id.trim_end_matches('#')) + } + #[must_use] + pub fn as_ref(&self) -> ResourceRef<'_> { + ResourceRef { + contents: &self.contents, + draft: self.draft, + } + } +} + +/// A borrowed document with a concrete interpretation under a JSON Schema specification. 
+#[derive(Debug, Clone, Copy)] +pub struct ResourceRef<'a> { + contents: &'a Value, + draft: Draft, +} + +impl<'a> ResourceRef<'a> { + #[must_use] + pub fn new(contents: &'a Value, draft: Draft) -> Self { + Self { contents, draft } + } + #[must_use] + pub fn contents(&self) -> &'a Value { + self.contents + } + #[must_use] + pub fn draft(&self) -> Draft { + self.draft + } +} + +impl JsonSchemaResource for ResourceRef<'_> { + fn contents(&self) -> &Value { + self.contents } - pub(crate) fn subresources(&self) -> Box> + '_> { - Box::new(self.draft.subresources_of(&self.contents).map(|contents| { - Resource::from_contents_and_specification(contents.clone(), self.draft) - })) + fn draft(&self) -> Draft { + self.draft + } +} + +/// A pointer to a pinned resource. +pub(crate) struct InnerResourcePtr { + contents: AtomicPtr, + draft: Draft, +} + +impl Clone for InnerResourcePtr { + fn clone(&self) -> Self { + Self { + contents: AtomicPtr::new(self.contents.load(std::sync::atomic::Ordering::Relaxed)), + draft: self.draft, + } + } +} + +impl std::fmt::Debug for InnerResourcePtr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("InnerResourcePtr") + .field("contents", self.contents()) + .field("draft", &self.draft) + .finish() + } +} + +impl InnerResourcePtr { + pub(crate) fn new(contents: *const Value, draft: Draft) -> Self { + Self { + contents: AtomicPtr::new(contents.cast_mut()), + draft, + } + } + + pub(crate) fn contents(&self) -> &Value { + // SAFETY: The pointer is valid as long as the registry exists + unsafe { &*self.contents.load(std::sync::atomic::Ordering::Relaxed) } + } + + pub(crate) fn draft(&self) -> Draft { + self.draft } pub(crate) fn anchors(&self) -> impl Iterator + '_ { - self.draft.anchors(&self.contents) + self.draft().anchors(self.contents()) + } + + pub(crate) fn subresources( + &self, + ) -> Box> + '_> { + Box::new( + self.draft + .subresources_of(self.contents()) + .map(|contents| 
Ok(InnerResourcePtr::new(contents, self.draft))), + ) } + pub(crate) fn pointer<'r>( &'r self, pointer: &str, mut resolver: Resolver<'r>, ) -> Result, Error> { // INVARIANT: Pointer always starts with `/` - let mut contents = &self.contents; + let mut contents = self.contents(); let mut segments = Segments::new(); let original_pointer = pointer; let pointer = percent_encoding::percent_decode_str(&pointer[1..]) @@ -88,10 +189,10 @@ impl Resource { segments.push(segment); } let last = &resolver; - let new_resolver = self.draft.maybe_in_subresource( + let new_resolver = self.draft().maybe_in_subresource( &segments, &resolver, - self.draft.create_resource_ref(contents), + &InnerResourcePtr::new(contents, self.draft()), )?; if new_resolver != *last { segments = Segments::new(); @@ -100,40 +201,14 @@ impl Resource { } Ok(Resolved::new(contents, resolver, self.draft())) } - /// Give a reference to the underlying contents together with draft. - #[must_use] - pub fn as_ref(&self) -> ResourceRef<'_> { - ResourceRef::new(&self.contents, self.draft) - } } -/// A reference to a document with a concrete interpretation under a JSON Schema specification. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ResourceRef<'a> { - contents: &'a Value, - draft: Draft, -} - -impl<'a> ResourceRef<'a> { - /// Create a new resource reference. - #[must_use] - pub(crate) fn new(contents: &'a Value, draft: Draft) -> Self { - ResourceRef { contents, draft } - } - /// Resource identifier. - #[must_use] - pub fn id(&self) -> Option<&'a str> { - self.draft - .id_of(self.contents) - .map(|id| id.trim_end_matches('#')) - } - /// Resource contents. 
- #[must_use] - pub fn contents(&self) -> &'a Value { - self.contents +impl JsonSchemaResource for InnerResourcePtr { + fn contents(&self) -> &Value { + self.contents() } - #[must_use] - pub fn draft(&self) -> Draft { + + fn draft(&self) -> Draft { self.draft } } @@ -187,9 +262,9 @@ pub(crate) fn unescape_segment(mut segment: &str) -> Cow { #[cfg(test)] mod tests { - use std::error::Error; + use std::{error::Error, sync::Arc}; - use crate::{Draft, Registry}; + use crate::{resource::InnerResourcePtr, Draft, Registry}; use super::unescape_segment; use serde_json::json; @@ -254,7 +329,23 @@ mod tests { .expect("Invalid base URI"); let resolved = resolver.lookup("#").expect("Lookup failed"); - assert_eq!(resolved.contents(), &schema.contents); + assert_eq!(resolved.contents(), schema.contents()); + } + + #[test] + fn test_inner_resource_ptr_debug() { + let value = Arc::pin(json!({ + "foo": "bar", + "number": 42 + })); + + let ptr = InnerResourcePtr::new(std::ptr::addr_of!(*value), Draft::Draft202012); + + let expected = format!( + "InnerResourcePtr {{ contents: {:?}, draft: Draft202012 }}", + *value + ); + assert_eq!(format!("{ptr:?}"), expected); } #[test] diff --git a/crates/jsonschema-referencing/src/specification/draft201909.rs b/crates/jsonschema-referencing/src/specification/draft201909.rs index d26bbfd5..a7136cc6 100644 --- a/crates/jsonschema-referencing/src/specification/draft201909.rs +++ b/crates/jsonschema-referencing/src/specification/draft201909.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use crate::{segments::Segment, Error, Resolver, ResourceRef, Segments}; +use crate::{resource::InnerResourcePtr, segments::Segment, Error, Resolver, Segments}; use super::subresources::SubresourceIterator; @@ -37,7 +37,7 @@ pub(crate) fn subresources_of(contents: &Value) -> SubresourceIterator<'_> { pub(crate) fn maybe_in_subresource<'r>( segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, ) -> Result, Error> { 
const IN_VALUE: &[&str] = &[ "additionalItems", @@ -67,7 +67,7 @@ pub(crate) fn maybe_in_subresource<'r>( while let Some(segment) = iter.next() { if let Segment::Key(key) = segment { if *key == "items" && subresource.contents().is_object() { - return resolver.in_subresource(subresource); + return resolver.in_subresource_inner(subresource); } if !IN_VALUE.contains(&key.as_ref()) && (!IN_CHILD.contains(&key.as_ref()) || iter.next().is_none()) @@ -76,5 +76,5 @@ pub(crate) fn maybe_in_subresource<'r>( } } } - resolver.in_subresource(subresource) + resolver.in_subresource_inner(subresource) } diff --git a/crates/jsonschema-referencing/src/specification/draft4.rs b/crates/jsonschema-referencing/src/specification/draft4.rs index 8d177ec1..081b1d8e 100644 --- a/crates/jsonschema-referencing/src/specification/draft4.rs +++ b/crates/jsonschema-referencing/src/specification/draft4.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use crate::{Error, Resolver, ResourceRef, Segments}; +use crate::{resource::InnerResourcePtr, Error, Resolver, Segments}; use super::subresources::{self, SubresourceIterator}; @@ -37,7 +37,7 @@ pub(crate) fn subresources_of(contents: &Value) -> SubresourceIterator<'_> { pub(crate) fn maybe_in_subresource<'r>( segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, ) -> Result, Error> { const IN_VALUE: &[&str] = &["additionalItems", "additionalProperties", "not"]; const IN_CHILD: &[&str] = &[ diff --git a/crates/jsonschema-referencing/src/specification/draft6.rs b/crates/jsonschema-referencing/src/specification/draft6.rs index 8b9dbc79..0c2ff011 100644 --- a/crates/jsonschema-referencing/src/specification/draft6.rs +++ b/crates/jsonschema-referencing/src/specification/draft6.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use crate::{Error, Resolver, ResourceRef, Segments}; +use crate::{resource::InnerResourcePtr, Error, Resolver, Segments}; use super::subresources::{self, SubresourceIterator}; @@ -38,7 
+38,7 @@ pub(crate) fn subresources_of(contents: &Value) -> SubresourceIterator<'_> { pub(crate) fn maybe_in_subresource<'r>( segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, ) -> Result, Error> { const IN_VALUE: &[&str] = &[ "additionalItems", diff --git a/crates/jsonschema-referencing/src/specification/draft7.rs b/crates/jsonschema-referencing/src/specification/draft7.rs index 0ac2a173..6302f7c5 100644 --- a/crates/jsonschema-referencing/src/specification/draft7.rs +++ b/crates/jsonschema-referencing/src/specification/draft7.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use crate::{Error, Resolver, ResourceRef, Segments}; +use crate::{resource::InnerResourcePtr, Error, Resolver, Segments}; use super::subresources::{self, SubresourceIterator}; @@ -41,7 +41,7 @@ pub(crate) fn subresources_of(contents: &Value) -> SubresourceIterator<'_> { pub(crate) fn maybe_in_subresource<'r>( segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, ) -> Result, Error> { const IN_VALUE: &[&str] = &[ "additionalItems", diff --git a/crates/jsonschema-referencing/src/specification/mod.rs b/crates/jsonschema-referencing/src/specification/mod.rs index 73f4e90b..bfc4c3fe 100644 --- a/crates/jsonschema-referencing/src/specification/mod.rs +++ b/crates/jsonschema-referencing/src/specification/mod.rs @@ -9,6 +9,7 @@ mod subresources; use crate::{ anchors, + resource::InnerResourcePtr, vocabularies::{VocabularySet, DRAFT_2019_09_VOCABULARIES, DRAFT_2020_12_VOCABULARIES}, Anchor, Error, Resolver, Resource, ResourceRef, Segments, }; @@ -94,7 +95,7 @@ impl Draft { self, segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, ) -> Result, Error> { match self { Draft::Draft4 => draft4::maybe_in_subresource(segments, resolver, subresource), diff --git a/crates/jsonschema-referencing/src/specification/subresources.rs 
b/crates/jsonschema-referencing/src/specification/subresources.rs index d18a4336..bc5911ff 100644 --- a/crates/jsonschema-referencing/src/specification/subresources.rs +++ b/crates/jsonschema-referencing/src/specification/subresources.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use crate::{segments::Segment, Error, Resolver, ResourceRef, Segments}; +use crate::{resource::InnerResourcePtr, segments::Segment, Error, Resolver, Segments}; pub(crate) type SubresourceIterator<'a> = Box + 'a>; @@ -35,7 +35,7 @@ pub(crate) fn subresources_of(contents: &Value) -> SubresourceIterator<'_> { pub(crate) fn maybe_in_subresource<'r>( segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, ) -> Result, Error> { const IN_VALUE: &[&str] = &[ "additionalProperties", @@ -72,14 +72,14 @@ pub(crate) fn maybe_in_subresource<'r>( } } } - resolver.in_subresource(subresource) + resolver.in_subresource_inner(subresource) } #[inline] pub(crate) fn maybe_in_subresource_with_items_and_dependencies<'r>( segments: &Segments, resolver: &Resolver<'r>, - subresource: ResourceRef<'r>, + subresource: &InnerResourcePtr, in_value: &[&str], in_child: &[&str], ) -> Result, Error> { @@ -87,7 +87,7 @@ pub(crate) fn maybe_in_subresource_with_items_and_dependencies<'r>( while let Some(segment) = iter.next() { if let Segment::Key(key) = segment { if (*key == "items" || *key == "dependencies") && subresource.contents().is_object() { - return resolver.in_subresource(subresource); + return resolver.in_subresource_inner(subresource); } if !in_value.contains(&key.as_ref()) && (!in_child.contains(&key.as_ref()) || iter.next().is_none()) @@ -96,7 +96,7 @@ pub(crate) fn maybe_in_subresource_with_items_and_dependencies<'r>( } } } - resolver.in_subresource(subresource) + resolver.in_subresource_inner(subresource) } #[cfg(test)] diff --git a/crates/jsonschema/src/compiler.rs b/crates/jsonschema/src/compiler.rs index 7f720242..9e6cd349 100644 --- 
a/crates/jsonschema/src/compiler.rs +++ b/crates/jsonschema/src/compiler.rs @@ -69,7 +69,7 @@ impl<'a> Context<'a> { /// Create a context for this schema. pub(crate) fn in_subresource( &'a self, - resource: ResourceRef, + resource: ResourceRef<'_>, ) -> Result, referencing::Error> { let resolver = self.resolver.in_subresource(resource)?; Ok(Context { From 15e45a07839626de3a4a205538c36cd568ec8045 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Tue, 4 Feb 2025 14:11:43 +0100 Subject: [PATCH 2/7] wip Signed-off-by: Dmitry Dygalo --- crates/jsonschema-referencing/src/meta.rs | 26 +++--- crates/jsonschema-referencing/src/registry.rs | 10 +- crates/jsonschema-referencing/src/resource.rs | 9 +- crates/jsonschema/src/compiler.rs | 15 +-- crates/jsonschema/src/lib.rs | 92 +++++++++++++++++++ 5 files changed, 124 insertions(+), 28 deletions(-) diff --git a/crates/jsonschema-referencing/src/meta.rs b/crates/jsonschema-referencing/src/meta.rs index fd00992e..990b120d 100644 --- a/crates/jsonschema-referencing/src/meta.rs +++ b/crates/jsonschema-referencing/src/meta.rs @@ -18,56 +18,56 @@ schema!(pub DRAFT6, "../metaschemas/draft6.json"); schema!(pub DRAFT7, "../metaschemas/draft7.json"); schema!(pub DRAFT201909, "../metaschemas/draft2019-09/schema.json"); schema!( - DRAFT201909_APPLICATOR, + pub DRAFT201909_APPLICATOR, "../metaschemas/draft2019-09/meta/applicator.json" ); schema!( - DRAFT201909_CONTENT, + pub DRAFT201909_CONTENT, "../metaschemas/draft2019-09/meta/content.json" ); schema!( - DRAFT201909_CORE, + pub DRAFT201909_CORE, "../metaschemas/draft2019-09/meta/core.json" ); schema!( - DRAFT201909_FORMAT, + pub DRAFT201909_FORMAT, "../metaschemas/draft2019-09/meta/format.json" ); schema!( - DRAFT201909_META_DATA, + pub DRAFT201909_META_DATA, "../metaschemas/draft2019-09/meta/meta-data.json" ); schema!( - DRAFT201909_VALIDATION, + pub DRAFT201909_VALIDATION, "../metaschemas/draft2019-09/meta/validation.json" ); schema!(pub DRAFT202012, 
"../metaschemas/draft2020-12/schema.json"); schema!( - DRAFT202012_CORE, + pub DRAFT202012_CORE, "../metaschemas/draft2020-12/meta/core.json" ); schema!( - DRAFT202012_APPLICATOR, + pub DRAFT202012_APPLICATOR, "../metaschemas/draft2020-12/meta/applicator.json" ); schema!( - DRAFT202012_UNEVALUATED, + pub DRAFT202012_UNEVALUATED, "../metaschemas/draft2020-12/meta/unevaluated.json" ); schema!( - DRAFT202012_VALIDATION, + pub DRAFT202012_VALIDATION, "../metaschemas/draft2020-12/meta/validation.json" ); schema!( - DRAFT202012_META_DATA, + pub DRAFT202012_META_DATA, "../metaschemas/draft2020-12/meta/meta-data.json" ); schema!( - DRAFT202012_FORMAT_ANNOTATION, + pub DRAFT202012_FORMAT_ANNOTATION, "../metaschemas/draft2020-12/meta/format-annotation.json" ); schema!( - DRAFT202012_CONTENT, + pub DRAFT202012_CONTENT, "../metaschemas/draft2020-12/meta/content.json" ); pub(crate) static META_SCHEMAS: Lazy<[(&'static str, &Value); 18]> = Lazy::new(|| { diff --git a/crates/jsonschema-referencing/src/registry.rs b/crates/jsonschema-referencing/src/registry.rs index 2c481a19..895cc146 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -86,7 +86,7 @@ impl Clone for Registry { /// Configuration options for creating a [`Registry`]. pub struct RegistryOptions { - retriever: Box, + retriever: Arc, draft: Draft, } @@ -95,13 +95,13 @@ impl RegistryOptions { #[must_use] pub fn new() -> Self { Self { - retriever: Box::new(DefaultRetriever), + retriever: Arc::new(DefaultRetriever), draft: Draft::default(), } } /// Set a custom retriever for the [`Registry`]. 
#[must_use] - pub fn retriever(mut self, retriever: Box) -> Self { + pub fn retriever(mut self, retriever: Arc) -> Self { self.retriever = retriever; self } @@ -595,7 +595,7 @@ fn parse_index(s: &str) -> Option { } #[cfg(test)] mod tests { - use std::error::Error as _; + use std::{error::Error as _, sync::Arc}; use ahash::AHashMap; use fluent_uri::Uri; @@ -867,7 +867,7 @@ mod tests { }); let registry = Registry::options() - .retriever(Box::new(retriever)) + .retriever(Arc::new(retriever)) .try_from_resources(input_pairs) .expect("Invalid resources"); // Verify that all expected URIs are resolved and present in resources diff --git a/crates/jsonschema-referencing/src/resource.rs b/crates/jsonschema-referencing/src/resource.rs index 94828789..8ffde06a 100644 --- a/crates/jsonschema-referencing/src/resource.rs +++ b/crates/jsonschema-referencing/src/resource.rs @@ -1,4 +1,7 @@ -use std::{borrow::Cow, sync::atomic::AtomicPtr}; +use std::{ + borrow::Cow, + sync::atomic::{AtomicPtr, Ordering}, +}; use serde_json::Value; @@ -110,7 +113,7 @@ pub(crate) struct InnerResourcePtr { impl Clone for InnerResourcePtr { fn clone(&self) -> Self { Self { - contents: AtomicPtr::new(self.contents.load(std::sync::atomic::Ordering::Relaxed)), + contents: AtomicPtr::new(self.contents.load(Ordering::Relaxed)), draft: self.draft, } } @@ -135,7 +138,7 @@ impl InnerResourcePtr { pub(crate) fn contents(&self) -> &Value { // SAFETY: The pointer is valid as long as the registry exists - unsafe { &*self.contents.load(std::sync::atomic::Ordering::Relaxed) } + unsafe { &*self.contents.load(Ordering::Relaxed) } } pub(crate) fn draft(&self) -> Draft { diff --git a/crates/jsonschema/src/compiler.rs b/crates/jsonschema/src/compiler.rs index 9e6cd349..20a2ce14 100644 --- a/crates/jsonschema/src/compiler.rs +++ b/crates/jsonschema/src/compiler.rs @@ -16,7 +16,7 @@ use crate::{ use ahash::{AHashMap, AHashSet}; use referencing::{ uri, Draft, List, Registry, Resolved, Resolver, Resource, ResourceRef, Uri, 
Vocabulary, - VocabularySet, SPECIFICATIONS, + VocabularySet, }; use serde_json::Value; use std::{cell::RefCell, rc::Rc, sync::Arc}; @@ -258,15 +258,16 @@ pub(crate) fn build_validator( resources.push((uri, resource)); } - // Get retriever for external resources let retriever = Arc::clone(&config.retriever); + let registry = Arc::new( + Registry::options() + .draft(draft) + .retriever(retriever) + .try_from_resources(resources.into_iter())?, + ); + // Build a registry & resolver needed for validator compilation - let registry = Arc::new(SPECIFICATIONS.clone().try_with_resources_and_retriever( - resources.into_iter(), - &*retriever, - draft, - )?); let vocabularies = registry.find_vocabularies(draft, schema); let resolver = Rc::new(registry.try_resolver(&base_uri)?); diff --git a/crates/jsonschema/src/lib.rs b/crates/jsonschema/src/lib.rs index be2ec0f3..6a2bbf51 100644 --- a/crates/jsonschema/src/lib.rs +++ b/crates/jsonschema/src/lib.rs @@ -651,6 +651,7 @@ pub mod meta { pub(crate) mod validators { use crate::Validator; use once_cell::sync::Lazy; + use referencing::Resource; pub static DRAFT4_META_VALIDATOR: Lazy = Lazy::new(|| { crate::options() @@ -676,6 +677,47 @@ pub mod meta { pub static DRAFT201909_META_VALIDATOR: Lazy = Lazy::new(|| { crate::options() .without_schema_validation() + .with_resources( + [ + ( + "https://json-schema.org/draft/2019-09/meta/applicator", + Resource::from_contents( + referencing::meta::DRAFT201909_APPLICATOR.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2019-09/meta/content", + Resource::from_contents(referencing::meta::DRAFT201909_CONTENT.clone()) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2019-09/meta/core", + Resource::from_contents(referencing::meta::DRAFT201909_CORE.clone()) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2019-09/meta/format", + Resource::from_contents(referencing::meta::DRAFT201909_FORMAT.clone()) + .expect("Invalid 
resource"), + ), + ( + "https://json-schema.org/draft/2019-09/meta/meta-data", + Resource::from_contents( + referencing::meta::DRAFT201909_META_DATA.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2019-09/meta/validation", + Resource::from_contents( + referencing::meta::DRAFT201909_VALIDATION.clone(), + ) + .expect("Invalid resource"), + ), + ] + .into_iter(), + ) .build(&referencing::meta::DRAFT201909) .expect("Draft 2019-09 meta-schema should be valid") }); @@ -683,6 +725,56 @@ pub mod meta { pub static DRAFT202012_META_VALIDATOR: Lazy = Lazy::new(|| { crate::options() .without_schema_validation() + .with_resources( + [ + ( + "https://json-schema.org/draft/2020-12/meta/core", + Resource::from_contents(referencing::meta::DRAFT202012_CORE.clone()) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2020-12/meta/applicator", + Resource::from_contents( + referencing::meta::DRAFT202012_APPLICATOR.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2020-12/meta/unevaluated", + Resource::from_contents( + referencing::meta::DRAFT202012_UNEVALUATED.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2020-12/meta/validation", + Resource::from_contents( + referencing::meta::DRAFT202012_VALIDATION.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2020-12/meta/meta-data", + Resource::from_contents( + referencing::meta::DRAFT202012_META_DATA.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2020-12/meta/format-annotation", + Resource::from_contents( + referencing::meta::DRAFT202012_FORMAT_ANNOTATION.clone(), + ) + .expect("Invalid resource"), + ), + ( + "https://json-schema.org/draft/2020-12/meta/content", + Resource::from_contents(referencing::meta::DRAFT202012_CONTENT.clone()) + .expect("Invalid resource"), + ), + ] + .into_iter(), + ) .build(&referencing::meta::DRAFT202012) 
.expect("Draft 2020-12 meta-schema should be valid") }); From 59ef2c3fea866de296649a1f76e8fc17ca1675f5 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Tue, 4 Feb 2025 17:48:15 +0100 Subject: [PATCH 3/7] wip Signed-off-by: Dmitry Dygalo --- .../draft2020-12/meta/format-annotation.json | 26 +++--- .../draft2020-12/meta/format-assertion.json | 15 +++ crates/jsonschema-referencing/src/meta.rs | 10 +- crates/jsonschema-referencing/src/registry.rs | 76 +++++++++++---- crates/jsonschema/src/compiler.rs | 2 +- crates/jsonschema/src/lib.rs | 92 ------------------- 6 files changed, 99 insertions(+), 122 deletions(-) create mode 100644 crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-assertion.json diff --git a/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-annotation.json b/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-annotation.json index 51ef7ea1..b945f224 100644 --- a/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-annotation.json +++ b/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-annotation.json @@ -1,14 +1,18 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://json-schema.org/draft/2020-12/meta/format-annotation", - "$vocabulary": { - "https://json-schema.org/draft/2020-12/vocab/format-annotation": true - }, - "$dynamicAnchor": "meta", - - "title": "Format vocabulary meta-schema for annotation results", - "type": ["object", "boolean"], - "properties": { - "format": { "type": "string" } + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://json-schema.org/draft/2020-12/meta/format-annotation", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/format-annotation": true + }, + "$dynamicAnchor": "meta", + "title": "Format vocabulary meta-schema for annotation results", + "type": [ + "object", + "boolean" + ], + "properties": { + "format": { + "type": "string" } + } } diff --git 
a/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-assertion.json b/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-assertion.json new file mode 100644 index 00000000..68f93d66 --- /dev/null +++ b/crates/jsonschema-referencing/metaschemas/draft2020-12/meta/format-assertion.json @@ -0,0 +1,15 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://json-schema.org/draft/2020-12/meta/format-assertion", + "$dynamicAnchor": "meta", + "title": "Format vocabulary meta-schema for assertion results", + "type": [ + "object", + "boolean" + ], + "properties": { + "format": { + "type": "string" + } + } +} diff --git a/crates/jsonschema-referencing/src/meta.rs b/crates/jsonschema-referencing/src/meta.rs index 990b120d..c2bb5ff0 100644 --- a/crates/jsonschema-referencing/src/meta.rs +++ b/crates/jsonschema-referencing/src/meta.rs @@ -1,11 +1,13 @@ +use std::sync::Arc; + use once_cell::sync::Lazy; use serde_json::Value; macro_rules! schema { ($vis:vis $name:ident, $path:expr) => { - $vis static $name: once_cell::sync::Lazy = + $vis static $name: once_cell::sync::Lazy> = once_cell::sync::Lazy::new(|| { - serde_json::from_slice(include_bytes!($path)).expect("Invalid schema") + Arc::new(serde_json::from_slice(include_bytes!($path)).expect("Invalid schema")) }); }; ($name:ident, $path:expr) => { @@ -66,6 +68,10 @@ schema!( pub DRAFT202012_FORMAT_ANNOTATION, "../metaschemas/draft2020-12/meta/format-annotation.json" ); +schema!( + pub DRAFT202012_FORMAT_ASSERTION, + "../metaschemas/draft2020-12/meta/format-assertion.json" +); schema!( pub DRAFT202012_CONTENT, "../metaschemas/draft2020-12/meta/content.json" diff --git a/crates/jsonschema-referencing/src/registry.rs b/crates/jsonschema-referencing/src/registry.rs index 895cc146..0c25e8f1 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -449,12 +449,66 @@ fn process_resources( let mut fragmentless = 
uri.clone(); fragmentless.set_fragment(None); if !resources.contains_key(&fragmentless) { - let retrieved = retriever - .retrieve(&fragmentless.borrow()) - .map_err(|err| Error::unretrievable(fragmentless.as_str(), err))?; + let boxed = match fragmentless.as_str() { + "https://json-schema.org/draft/2020-12/schema" => { + Pin::new(Arc::clone(&meta::DRAFT202012)) + } + "https://json-schema.org/draft/2020-12/meta/applicator" => { + Pin::new(Arc::clone(&meta::DRAFT202012_APPLICATOR)) + } + "https://json-schema.org/draft/2020-12/meta/core" => { + Pin::new(Arc::clone(&meta::DRAFT202012_CORE)) + } + "https://json-schema.org/draft/2020-12/meta/validation" => { + Pin::new(Arc::clone(&meta::DRAFT202012_VALIDATION)) + } + "https://json-schema.org/draft/2020-12/meta/unevaluated" => { + Pin::new(Arc::clone(&meta::DRAFT202012_UNEVALUATED)) + } + "https://json-schema.org/draft/2020-12/meta/format-annotation" => { + Pin::new(Arc::clone(&meta::DRAFT202012_FORMAT_ANNOTATION)) + } + "https://json-schema.org/draft/2020-12/meta/format-assertion" => { + Pin::new(Arc::clone(&meta::DRAFT202012_FORMAT_ASSERTION)) + } + "https://json-schema.org/draft/2020-12/meta/content" => { + Pin::new(Arc::clone(&meta::DRAFT202012_CONTENT)) + } + "https://json-schema.org/draft/2020-12/meta/meta-data" => { + Pin::new(Arc::clone(&meta::DRAFT202012_META_DATA)) + } + "https://json-schema.org/draft/2019-09/schema" => { + Pin::new(Arc::clone(&meta::DRAFT201909)) + } + "https://json-schema.org/draft/2019-09/meta/applicator" => { + Pin::new(Arc::clone(&meta::DRAFT201909_APPLICATOR)) + } + "https://json-schema.org/draft/2019-09/meta/core" => { + Pin::new(Arc::clone(&meta::DRAFT201909_CORE)) + } + "https://json-schema.org/draft/2019-09/meta/content" => { + Pin::new(Arc::clone(&meta::DRAFT201909_CONTENT)) + } + "https://json-schema.org/draft/2019-09/meta/validation" => { + Pin::new(Arc::clone(&meta::DRAFT201909_VALIDATION)) + } + "https://json-schema.org/draft/2019-09/meta/format" => { + 
Pin::new(Arc::clone(&meta::DRAFT201909_FORMAT)) + } + "https://json-schema.org/draft/2019-09/meta/meta-data" => { + Pin::new(Arc::clone(&meta::DRAFT201909_META_DATA)) + } + "http://json-schema.org/draft-07/schema" => Pin::new(Arc::clone(&meta::DRAFT7)), + "http://json-schema.org/draft-06/schema" => Pin::new(Arc::clone(&meta::DRAFT6)), + "http://json-schema.org/draft-04/schema" => Pin::new(Arc::clone(&meta::DRAFT4)), + _ => Arc::pin( + retriever + .retrieve(&fragmentless.borrow()) + .map_err(|err| Error::unretrievable(fragmentless.as_str(), err))?, + ), + }; - let draft = default_draft.detect(&retrieved)?; - let boxed = Arc::pin(retrieved); + let draft = default_draft.detect(&boxed)?; let contents = std::ptr::addr_of!(*boxed); let resource = InnerResourcePtr::new(contents, draft); let key = Arc::new(fragmentless); @@ -469,7 +523,7 @@ fn process_resources( let draft = default_draft.detect(resolved)?; let contents = std::ptr::addr_of!(*resolved); let resource = InnerResourcePtr::new(contents, draft); - queue.push_back((Arc::new(uri), resource)); + queue.push_back((Arc::clone(&key), resource)); } } @@ -495,16 +549,6 @@ fn collect_external_resources( } for key in ["$ref", "$schema"] { if let Some(reference) = contents.get(key).and_then(Value::as_str) { - // Skip well-known schema references - if reference.starts_with("https://json-schema.org/draft/2020-12/") - || reference.starts_with("https://json-schema.org/draft/2019-09/") - || reference.starts_with("http://json-schema.org/draft-07/") - || reference.starts_with("http://json-schema.org/draft-06/") - || reference.starts_with("http://json-schema.org/draft-04/") - { - continue; - } - if reference == "#" { continue; } diff --git a/crates/jsonschema/src/compiler.rs b/crates/jsonschema/src/compiler.rs index 20a2ce14..fef47209 100644 --- a/crates/jsonschema/src/compiler.rs +++ b/crates/jsonschema/src/compiler.rs @@ -217,7 +217,7 @@ impl<'a> Context<'a> { return Ok(None); }; let resource = 
self.draft().create_resource(resolved.contents().clone()); - let mut base_uri = resolved.resolver().base_uri().to_owned(); + let mut base_uri = resolved.resolver().base_uri(); let scopes = resolved.resolver().dynamic_scope(); if let Some(id) = resource.id() { base_uri = Arc::new(uri::resolve_against(&base_uri.borrow(), id)?); diff --git a/crates/jsonschema/src/lib.rs b/crates/jsonschema/src/lib.rs index 6a2bbf51..be2ec0f3 100644 --- a/crates/jsonschema/src/lib.rs +++ b/crates/jsonschema/src/lib.rs @@ -651,7 +651,6 @@ pub mod meta { pub(crate) mod validators { use crate::Validator; use once_cell::sync::Lazy; - use referencing::Resource; pub static DRAFT4_META_VALIDATOR: Lazy = Lazy::new(|| { crate::options() @@ -677,47 +676,6 @@ pub mod meta { pub static DRAFT201909_META_VALIDATOR: Lazy = Lazy::new(|| { crate::options() .without_schema_validation() - .with_resources( - [ - ( - "https://json-schema.org/draft/2019-09/meta/applicator", - Resource::from_contents( - referencing::meta::DRAFT201909_APPLICATOR.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2019-09/meta/content", - Resource::from_contents(referencing::meta::DRAFT201909_CONTENT.clone()) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2019-09/meta/core", - Resource::from_contents(referencing::meta::DRAFT201909_CORE.clone()) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2019-09/meta/format", - Resource::from_contents(referencing::meta::DRAFT201909_FORMAT.clone()) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2019-09/meta/meta-data", - Resource::from_contents( - referencing::meta::DRAFT201909_META_DATA.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2019-09/meta/validation", - Resource::from_contents( - referencing::meta::DRAFT201909_VALIDATION.clone(), - ) - .expect("Invalid resource"), - ), - ] - .into_iter(), - ) .build(&referencing::meta::DRAFT201909) 
.expect("Draft 2019-09 meta-schema should be valid") }); @@ -725,56 +683,6 @@ pub mod meta { pub static DRAFT202012_META_VALIDATOR: Lazy = Lazy::new(|| { crate::options() .without_schema_validation() - .with_resources( - [ - ( - "https://json-schema.org/draft/2020-12/meta/core", - Resource::from_contents(referencing::meta::DRAFT202012_CORE.clone()) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2020-12/meta/applicator", - Resource::from_contents( - referencing::meta::DRAFT202012_APPLICATOR.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2020-12/meta/unevaluated", - Resource::from_contents( - referencing::meta::DRAFT202012_UNEVALUATED.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2020-12/meta/validation", - Resource::from_contents( - referencing::meta::DRAFT202012_VALIDATION.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2020-12/meta/meta-data", - Resource::from_contents( - referencing::meta::DRAFT202012_META_DATA.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2020-12/meta/format-annotation", - Resource::from_contents( - referencing::meta::DRAFT202012_FORMAT_ANNOTATION.clone(), - ) - .expect("Invalid resource"), - ), - ( - "https://json-schema.org/draft/2020-12/meta/content", - Resource::from_contents(referencing::meta::DRAFT202012_CONTENT.clone()) - .expect("Invalid resource"), - ), - ] - .into_iter(), - ) .build(&referencing::meta::DRAFT202012) .expect("Draft 2020-12 meta-schema should be valid") }); From 83635cba86bfbe72fc27bc4e27926881684ba0af Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Tue, 4 Feb 2025 19:31:35 +0100 Subject: [PATCH 4/7] wip Signed-off-by: Dmitry Dygalo --- crates/jsonschema-referencing/src/registry.rs | 142 ++++++++++-------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/crates/jsonschema-referencing/src/registry.rs 
b/crates/jsonschema-referencing/src/registry.rs index 0c25e8f1..6d17db9e 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -40,14 +40,13 @@ pub static SPECIFICATIONS: Lazy = Lazy::new(|| { let mut resources = ResourceMap::with_capacity(18); let mut anchors = AHashMap::with_capacity(8); let mut resolution_cache = UriCache::with_capacity(35); - process_resources( + // TODO: Drop resolution cache + process_meta_schemas( pairs, - &DefaultRetriever, &mut documents, &mut resources, &mut anchors, &mut resolution_cache, - Draft::default(), ) .expect("Failed to process meta schemas"); Registry { @@ -343,6 +342,50 @@ impl Registry { } } +fn process_meta_schemas( + pairs: impl Iterator, Resource)>, + documents: &mut DocumentStore, + resources: &mut ResourceMap, + anchors: &mut AHashMap, + resolution_cache: &mut UriCache, +) -> Result<(), Error> { + let mut queue = VecDeque::with_capacity(32); + + for (uri, resource) in pairs { + let uri = uri::from_str(uri.as_ref().trim_end_matches('#'))?; + let key = Arc::new(uri); + let (draft, contents) = resource.into_inner(); + let boxed = Arc::pin(contents); + let contents = std::ptr::addr_of!(*boxed); + let resource = InnerResourcePtr::new(contents, draft); + documents.insert(Arc::clone(&key), boxed); + resources.insert(Arc::clone(&key), resource.clone()); + queue.push_back((key, resource)); + } + + // Process current queue and collect references to external resources + while let Some((mut base, resource)) = queue.pop_front() { + if let Some(id) = resource.id() { + base = resolution_cache.resolve_against(&base.borrow(), id)?; + } + + // Look for anchors + for anchor in resource.anchors() { + anchors.insert(AnchorKey::new(base.clone(), anchor.name()), anchor); + } + + // Process subresources + for subresource in resource.subresources() { + let subresource = subresource?; + queue.push_back((base.clone(), subresource)); + } + if resource.id().is_some() { + 
resources.insert(base, resource.clone()); + } + } + Ok(()) +} + fn process_resources( pairs: impl Iterator, Resource)>, retriever: &dyn Retrieve, @@ -356,6 +399,7 @@ fn process_resources( let mut seen = HashSet::with_hasher(BuildNoHashHasher::default()); let mut external = AHashSet::new(); let mut scratch = String::new(); + let mut refers_metaschemas = false; // TODO: Implement `Registry::combine` // SAFETY: Deduplicate input URIs keeping the last occurrence to prevent creation @@ -412,6 +456,7 @@ fn process_resources( &mut seen, resolution_cache, &mut scratch, + &mut refers_metaschemas, )?; // Process subresources @@ -426,6 +471,7 @@ fn process_resources( &mut seen, resolution_cache, &mut scratch, + &mut refers_metaschemas, )?; } else { collect_external_resources( @@ -435,6 +481,7 @@ fn process_resources( &mut seen, resolution_cache, &mut scratch, + &mut refers_metaschemas, )?; }; @@ -449,66 +496,12 @@ fn process_resources( let mut fragmentless = uri.clone(); fragmentless.set_fragment(None); if !resources.contains_key(&fragmentless) { - let boxed = match fragmentless.as_str() { - "https://json-schema.org/draft/2020-12/schema" => { - Pin::new(Arc::clone(&meta::DRAFT202012)) - } - "https://json-schema.org/draft/2020-12/meta/applicator" => { - Pin::new(Arc::clone(&meta::DRAFT202012_APPLICATOR)) - } - "https://json-schema.org/draft/2020-12/meta/core" => { - Pin::new(Arc::clone(&meta::DRAFT202012_CORE)) - } - "https://json-schema.org/draft/2020-12/meta/validation" => { - Pin::new(Arc::clone(&meta::DRAFT202012_VALIDATION)) - } - "https://json-schema.org/draft/2020-12/meta/unevaluated" => { - Pin::new(Arc::clone(&meta::DRAFT202012_UNEVALUATED)) - } - "https://json-schema.org/draft/2020-12/meta/format-annotation" => { - Pin::new(Arc::clone(&meta::DRAFT202012_FORMAT_ANNOTATION)) - } - "https://json-schema.org/draft/2020-12/meta/format-assertion" => { - Pin::new(Arc::clone(&meta::DRAFT202012_FORMAT_ASSERTION)) - } - "https://json-schema.org/draft/2020-12/meta/content" => { 
- Pin::new(Arc::clone(&meta::DRAFT202012_CONTENT)) - } - "https://json-schema.org/draft/2020-12/meta/meta-data" => { - Pin::new(Arc::clone(&meta::DRAFT202012_META_DATA)) - } - "https://json-schema.org/draft/2019-09/schema" => { - Pin::new(Arc::clone(&meta::DRAFT201909)) - } - "https://json-schema.org/draft/2019-09/meta/applicator" => { - Pin::new(Arc::clone(&meta::DRAFT201909_APPLICATOR)) - } - "https://json-schema.org/draft/2019-09/meta/core" => { - Pin::new(Arc::clone(&meta::DRAFT201909_CORE)) - } - "https://json-schema.org/draft/2019-09/meta/content" => { - Pin::new(Arc::clone(&meta::DRAFT201909_CONTENT)) - } - "https://json-schema.org/draft/2019-09/meta/validation" => { - Pin::new(Arc::clone(&meta::DRAFT201909_VALIDATION)) - } - "https://json-schema.org/draft/2019-09/meta/format" => { - Pin::new(Arc::clone(&meta::DRAFT201909_FORMAT)) - } - "https://json-schema.org/draft/2019-09/meta/meta-data" => { - Pin::new(Arc::clone(&meta::DRAFT201909_META_DATA)) - } - "http://json-schema.org/draft-07/schema" => Pin::new(Arc::clone(&meta::DRAFT7)), - "http://json-schema.org/draft-06/schema" => Pin::new(Arc::clone(&meta::DRAFT6)), - "http://json-schema.org/draft-04/schema" => Pin::new(Arc::clone(&meta::DRAFT4)), - _ => Arc::pin( - retriever - .retrieve(&fragmentless.borrow()) - .map_err(|err| Error::unretrievable(fragmentless.as_str(), err))?, - ), - }; + let retrieved = retriever + .retrieve(&fragmentless.borrow()) + .map_err(|err| Error::unretrievable(fragmentless.as_str(), err))?; - let draft = default_draft.detect(&boxed)?; + let draft = default_draft.detect(&retrieved)?; + let boxed = Arc::pin(retrieved); let contents = std::ptr::addr_of!(*boxed); let resource = InnerResourcePtr::new(contents, draft); let key = Arc::new(fragmentless); @@ -532,6 +525,15 @@ fn process_resources( } } + if refers_metaschemas { + for (key, resource) in &SPECIFICATIONS.resources { + resources.insert(Arc::clone(key), resource.clone()); + } + for (key, anchor) in &SPECIFICATIONS.anchors { + 
anchors.insert(key.clone(), anchor.clone()); + } + } + Ok(()) } @@ -542,6 +544,7 @@ fn collect_external_resources( seen: &mut HashSet, resolution_cache: &mut UriCache, scratch: &mut String, + refers_metaschemas: &mut bool, ) -> Result<(), Error> { // URN schemes are not supported for external resolution if base.scheme().as_str() == "urn" { @@ -549,6 +552,19 @@ fn collect_external_resources( } for key in ["$ref", "$schema"] { if let Some(reference) = contents.get(key).and_then(Value::as_str) { + // Skip well-known schema references + if reference.starts_with("https://json-schema.org/draft/2020-12/") + || reference.starts_with("https://json-schema.org/draft/2019-09/") + || reference.starts_with("http://json-schema.org/draft-07/") + || reference.starts_with("http://json-schema.org/draft-06/") + || reference.starts_with("http://json-schema.org/draft-04/") + || base.as_str() == "https://json-schema.org/draft/2020-12/schema" + || base.as_str() == "https://json-schema.org/draft/2019-09/schema" + { + *refers_metaschemas = true; + continue; + } + if reference == "#" { continue; } @@ -571,6 +587,7 @@ fn collect_external_resources( seen, resolution_cache, scratch, + refers_metaschemas, )?; } continue; @@ -637,6 +654,7 @@ fn parse_index(s: &str) -> Option { } s.parse().ok() } + #[cfg(test)] mod tests { use std::{error::Error as _, sync::Arc}; From 971e053245a72f8891d7fd9396be3693a4c12420 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Tue, 4 Feb 2025 20:13:14 +0100 Subject: [PATCH 5/7] wip Signed-off-by: Dmitry Dygalo --- .github/workflows/ci.yml | 19 +++++++++++ Justfile | 2 ++ crates/jsonschema-referencing/src/registry.rs | 7 ++-- crates/jsonschema-referencing/src/resource.rs | 4 +++ crates/jsonschema/src/compiler.rs | 33 +++++++++---------- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 99020e07..ffd57f85 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -105,6 +105,25 @@ 
jobs: files: lcov.info fail_ci_if_error: true + miri: + name: Test with Miri + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - uses: dtolnay/rust-toolchain@nightly + with: + components: miri + + - uses: Swatinem/rust-cache@v2 + with: + cache-all-crates: "true" + + - name: Run tests with Miri + run: cargo miri test -p referencing + lints-python: name: Python lints runs-on: ubuntu-22.04 diff --git a/Justfile b/Justfile index 91b5a275..637bb741 100644 --- a/Justfile +++ b/Justfile @@ -32,3 +32,5 @@ test-py-no-rebuild *FLAGS: bench-py *FLAGS: uvx --with="crates/jsonschema-py[bench]" --refresh pytest crates/jsonschema-py/benches/bench.py --benchmark-columns=min {{FLAGS}} +miri: + cargo +nightly miri test -p referencing diff --git a/crates/jsonschema-referencing/src/registry.rs b/crates/jsonschema-referencing/src/registry.rs index 6d17db9e..6252f644 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -315,13 +315,10 @@ impl Registry { } } - pub(crate) fn resolve_against( - &self, - base: &Uri<&str>, - uri: &str, - ) -> Result>, Error> { + pub fn resolve_against(&self, base: &Uri<&str>, uri: &str) -> Result>, Error> { self.resolution_cache.resolve_against(base, uri) } + #[must_use] pub fn find_vocabularies(&self, draft: Draft, contents: &Value) -> VocabularySet { match draft.detect(contents) { diff --git a/crates/jsonschema-referencing/src/resource.rs b/crates/jsonschema-referencing/src/resource.rs index 8ffde06a..97b906d5 100644 --- a/crates/jsonschema-referencing/src/resource.rs +++ b/crates/jsonschema-referencing/src/resource.rs @@ -92,6 +92,10 @@ impl<'a> ResourceRef<'a> { pub fn draft(&self) -> Draft { self.draft } + #[must_use] + pub fn id(&self) -> Option<&str> { + JsonSchemaResource::id(self) + } } impl JsonSchemaResource for ResourceRef<'_> { diff --git a/crates/jsonschema/src/compiler.rs b/crates/jsonschema/src/compiler.rs index fef47209..af35e812 100644 
--- a/crates/jsonschema/src/compiler.rs +++ b/crates/jsonschema/src/compiler.rs @@ -15,11 +15,11 @@ use crate::{ }; use ahash::{AHashMap, AHashSet}; use referencing::{ - uri, Draft, List, Registry, Resolved, Resolver, Resource, ResourceRef, Uri, Vocabulary, + Draft, List, Registry, Resolved, Resolver, Resource, ResourceRef, Uri, Vocabulary, VocabularySet, }; use serde_json::Value; -use std::{cell::RefCell, rc::Rc, sync::Arc}; +use std::{borrow::Cow, cell::RefCell, iter::once, rc::Rc, sync::Arc}; const DEFAULT_SCHEME: &str = "json-schema"; pub(crate) const DEFAULT_ROOT_URL: &str = "json-schema:///"; @@ -220,7 +220,7 @@ impl<'a> Context<'a> { let mut base_uri = resolved.resolver().base_uri(); let scopes = resolved.resolver().dynamic_scope(); if let Some(id) = resource.id() { - base_uri = Arc::new(uri::resolve_against(&base_uri.borrow(), id)?); + base_uri = self.registry.resolve_against(&base_uri.borrow(), id)?; }; Ok(Some((base_uri, scopes, resource))) } @@ -249,27 +249,24 @@ pub(crate) fn build_validator( let draft = config.draft_for(schema)?; let resource_ref = draft.create_resource_ref(schema); let resource = draft.create_resource(schema.clone()); - let base_uri = resource.id().unwrap_or(DEFAULT_ROOT_URL).to_string(); - - // Prepare additional resources to use in resolving - let mut resources = Vec::with_capacity(1 + config.resources.len()); - resources.push((base_uri.clone(), resource)); - for (uri, resource) in config.resources.drain() { - resources.push((uri, resource)); - } - - let retriever = Arc::clone(&config.retriever); + let base_uri = resource_ref.id().unwrap_or(DEFAULT_ROOT_URL); + // Build a registry & resolver needed for validator compilation let registry = Arc::new( Registry::options() .draft(draft) - .retriever(retriever) - .try_from_resources(resources.into_iter())?, + .retriever(Arc::clone(&config.retriever)) + .try_from_resources( + once((Cow::Borrowed(base_uri), resource)).chain( + config + .resources + .drain() + .map(|(uri, resource)| 
(Cow::Owned(uri), resource)), + ), + )?, ); - - // Build a registry & resolver needed for validator compilation let vocabularies = registry.find_vocabularies(draft, schema); - let resolver = Rc::new(registry.try_resolver(&base_uri)?); + let resolver = Rc::new(registry.try_resolver(base_uri)?); let config = Arc::new(config); let ctx = Context::new( From 0b79816da7532b61f1ca0e2ce268cd37c9704bc1 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Tue, 4 Feb 2025 20:31:03 +0100 Subject: [PATCH 6/7] wip Signed-off-by: Dmitry Dygalo --- CHANGELOG.md | 4 ++++ crates/jsonschema-py/CHANGELOG.md | 4 ++++ crates/jsonschema-referencing/src/anchors/mod.rs | 2 ++ crates/jsonschema-referencing/src/list.rs | 5 +++++ crates/jsonschema-referencing/src/registry.rs | 9 +++++++-- crates/jsonschema-referencing/src/resource.rs | 1 + crates/jsonschema-referencing/src/retriever.rs | 1 + crates/jsonschema-referencing/src/vocabularies.rs | 3 +++ 8 files changed, 27 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0f254df..04d18f2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Performance + +- Significantly improved validator compilation speed by using pointer-based references to schema fragments instead of cloning them during traversal. + ## [0.28.3] - 2025-01-24 ### Fixed diff --git a/crates/jsonschema-py/CHANGELOG.md b/crates/jsonschema-py/CHANGELOG.md index 9f5a5622..cfd0474d 100644 --- a/crates/jsonschema-py/CHANGELOG.md +++ b/crates/jsonschema-py/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Performance + +- Significantly improved validator compilation speed by using pointer-based references to schema fragments instead of cloning them during traversal. 
+ ## [0.28.3] - 2025-01-24 ### Fixed diff --git a/crates/jsonschema-referencing/src/anchors/mod.rs b/crates/jsonschema-referencing/src/anchors/mod.rs index 94b81a24..6bbc77bb 100644 --- a/crates/jsonschema-referencing/src/anchors/mod.rs +++ b/crates/jsonschema-referencing/src/anchors/mod.rs @@ -24,7 +24,9 @@ impl AnchorName { } } + #[allow(unsafe_code)] fn as_str(&self) -> &str { + // SAFETY: The pointer is valid as long as the registry exists unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts( self.ptr.load(Ordering::Relaxed), diff --git a/crates/jsonschema-referencing/src/list.rs b/crates/jsonschema-referencing/src/list.rs index 279135f4..a5914df4 100644 --- a/crates/jsonschema-referencing/src/list.rs +++ b/crates/jsonschema-referencing/src/list.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +/// An immutable singly-linked list. pub struct List { head: Option>>, } @@ -29,10 +30,12 @@ impl List { pub(crate) fn new() -> Self { Self { head: None } } + /// Returns true if the list contains no elements. #[must_use] pub fn is_empty(&self) -> bool { self.head.is_none() } + /// Creates a new list with the given value at the front, sharing the rest of the nodes. #[must_use] pub fn push_front(&self, value: Arc) -> Self { List { @@ -42,6 +45,7 @@ impl List { })), } } + /// Returns an iterator over references to the list elements. #[must_use] pub fn iter(&self) -> Iter<'_, T> { Iter { @@ -56,6 +60,7 @@ pub(crate) struct Node { next: Option>>, } +/// Iterator over references to elements in a `List`. 
#[derive(Debug)] pub struct Iter<'a, T> { current: Option<&'a Arc>>, diff --git a/crates/jsonschema-referencing/src/registry.rs b/crates/jsonschema-referencing/src/registry.rs index 6252f644..f97f8851 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -27,6 +27,7 @@ use crate::{ type DocumentStore = AHashMap>, Pin>>; type ResourceMap = AHashMap>, InnerResourcePtr>; +/// Pre-loaded registry containing all JSON Schema meta-schemas and their vocabularies pub static SPECIFICATIONS: Lazy = Lazy::new(|| { let pairs = meta::META_SCHEMAS.into_iter().map(|(uri, schema)| { ( @@ -314,11 +315,15 @@ impl Registry { Err(Error::no_such_anchor(name.to_string())) } } - + /// Resolves a reference URI against a base URI using the registry's cache. + /// + /// # Errors + /// + /// Returns an error if the base URI has no scheme or contains a fragment. pub fn resolve_against(&self, base: &Uri<&str>, uri: &str) -> Result>, Error> { self.resolution_cache.resolve_against(base, uri) } - + /// Returns vocabulary set configured for given draft and contents. 
#[must_use] pub fn find_vocabularies(&self, draft: Draft, contents: &Value) -> VocabularySet { match draft.detect(contents) { diff --git a/crates/jsonschema-referencing/src/resource.rs b/crates/jsonschema-referencing/src/resource.rs index 97b906d5..1889ff73 100644 --- a/crates/jsonschema-referencing/src/resource.rs +++ b/crates/jsonschema-referencing/src/resource.rs @@ -140,6 +140,7 @@ impl InnerResourcePtr { } } + #[allow(unsafe_code)] pub(crate) fn contents(&self) -> &Value { // SAFETY: The pointer is valid as long as the registry exists unsafe { &*self.contents.load(Ordering::Relaxed) } diff --git a/crates/jsonschema-referencing/src/retriever.rs b/crates/jsonschema-referencing/src/retriever.rs index 21a8e094..d1015fa6 100644 --- a/crates/jsonschema-referencing/src/retriever.rs +++ b/crates/jsonschema-referencing/src/retriever.rs @@ -20,6 +20,7 @@ pub trait Retrieve: Send + Sync { fn retrieve(&self, uri: &Uri<&str>) -> Result>; } +/// A retriever that always fails, used as a default when external resource fetching is not needed. #[derive(Debug, Clone)] struct DefaultRetrieverError; diff --git a/crates/jsonschema-referencing/src/vocabularies.rs b/crates/jsonschema-referencing/src/vocabularies.rs index 0ac8feaf..3840a39f 100644 --- a/crates/jsonschema-referencing/src/vocabularies.rs +++ b/crates/jsonschema-referencing/src/vocabularies.rs @@ -6,6 +6,8 @@ use ahash::AHashSet; use fluent_uri::Uri; use serde_json::Value; +/// A JSON Schema vocabulary identifier, representing standard vocabularies (Core, Applicator, etc.) +/// or custom ones via URI. #[derive(Debug, PartialEq, Eq, Clone)] pub enum Vocabulary { Core, @@ -51,6 +53,7 @@ impl FromStr for Vocabulary { } } +/// A set of enabled JSON Schema vocabularies. 
#[derive(Clone, Default, PartialEq, Eq)] pub struct VocabularySet { known: u8, From 6d9691714a45ad2398008701aa2cf035c87518d8 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Tue, 4 Feb 2025 21:01:35 +0100 Subject: [PATCH 7/7] wip Signed-off-by: Dmitry Dygalo --- .github/workflows/ci.yml | 2 +- crates/jsonschema-referencing/src/registry.rs | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ffd57f85..b739e758 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,7 +122,7 @@ jobs: cache-all-crates: "true" - name: Run tests with Miri - run: cargo miri test -p referencing + run: cargo miri test -p referencing --lib lints-python: name: Python lints diff --git a/crates/jsonschema-referencing/src/registry.rs b/crates/jsonschema-referencing/src/registry.rs index f97f8851..fbb66d97 100644 --- a/crates/jsonschema-referencing/src/registry.rs +++ b/crates/jsonschema-referencing/src/registry.rs @@ -41,7 +41,6 @@ pub static SPECIFICATIONS: Lazy = Lazy::new(|| { let mut resources = ResourceMap::with_capacity(18); let mut anchors = AHashMap::with_capacity(8); let mut resolution_cache = UriCache::with_capacity(35); - // TODO: Drop resolution cache process_meta_schemas( pairs, &mut documents, @@ -402,7 +401,6 @@ fn process_resources( let mut external = AHashSet::new(); let mut scratch = String::new(); let mut refers_metaschemas = false; - // TODO: Implement `Registry::combine` // SAFETY: Deduplicate input URIs keeping the last occurrence to prevent creation // of resources pointing to values that could be dropped by later insertions @@ -414,7 +412,6 @@ fn process_resources( .collect(); input_pairs.dedup_by(|(lhs, _), (rhs, _)| lhs == rhs); - // Store documents and create initial InnerResourcePtrs for (uri, resource) in input_pairs { let key = Arc::new(uri); match documents.entry(Arc::clone(&key)) {