diff --git a/parser/Cargo.lock b/parser/Cargo.lock
index 79935d93..29f854a7 100644
--- a/parser/Cargo.lock
+++ b/parser/Cargo.lock
@@ -206,6 +206,31 @@ version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
 
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
+
 [[package]]
 name = "derivre"
 version = "0.1.0"
@@ -230,6 +255,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "either"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
+
 [[package]]
 name = "email_address"
 version = "0.2.9"
@@ -523,7 +554,7 @@ dependencies = [
  "num-cmp",
  "once_cell",
  "percent-encoding",
- "referencing",
+ "referencing 0.24.3",
  "regex-syntax",
  "serde",
  "serde_json",
@@ -562,9 +593,12 @@ dependencies = [
  "anyhow",
  "cbindgen",
  "derivre",
+ "indexmap",
  "instant",
  "jsonschema",
  "lazy_static",
+ "rayon",
+ "referencing 0.26.1",
  "regex-syntax",
  "rustc-hash",
  "serde",
@@ -700,6 +734,26 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "ref-cast"
 version = "1.0.23"
@@ -733,6 +787,19 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "referencing"
+version = "0.26.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb853437e467c693ac1dc8c1520105a31b8c2588544ff2f3cfa5a7c706c6c069"
+dependencies = [
+ "ahash",
+ "fluent-uri",
+ "once_cell",
+ "percent-encoding",
+ "serde_json",
+]
+
 [[package]]
 name = "regex-automata"
 version = "0.4.8"
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index e9c581eb..0852a675 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -15,6 +15,8 @@ jsonschema = { version = "0.24.0", default-features = false, optional = true }
 url = "2.5.2"
 lazy_static = { version = "1.5.0", optional = true }
 regex-syntax = "0.8.5"
+indexmap = "2.6.0"
+referencing = "0.26.1"
 rayon = { version = "1.10.0", optional = true }
 
 [features]
diff --git a/parser/build.rs b/parser/build.rs
index 8b804f0b..69e1b6a1 100644
--- a/parser/build.rs
+++ b/parser/build.rs
@@ -25,7 +25,10 @@ fn main() {
         },
         |bindings| {
             bindings.write_to_file("llguidance.h");
-            bindings.write_to_file(format!("{}/../../../llguidance.h", env::var("OUT_DIR").unwrap()));
+            bindings.write_to_file(format!(
+                "{}/../../../llguidance.h",
+                env::var("OUT_DIR").unwrap()
+            ));
         },
     );
 }
diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs
index 68c879cb..a7600a04 100644
--- a/parser/src/earley/from_guidance.rs
+++ b/parser/src/earley/from_guidance.rs
@@ -93,12 +93,12 @@ fn grammar_from_json(
         "cannot have both json_schema/lark_grammar and nodes/rx_nodes"
     );
 
-    let mut new_grm = if let Some(json_schema) = input.json_schema.as_ref() {
+    let mut new_grm = if let Some(json_schema) = input.json_schema.take() {
         ensure!(
             input.lark_grammar.is_none(),
             "cannot have both json_schema and lark_grammar"
         );
-        let opts = JsonCompileOptions { compact: false };
+        let opts: JsonCompileOptions = JsonCompileOptions::default();
         opts.json_to_llg(json_schema)?
     } else {
         lark_to_llguidance(input.lark_grammar.as_ref().unwrap())?
diff --git a/parser/src/ffi.rs b/parser/src/ffi.rs
index 95084bb5..1c752b9e 100644
--- a/parser/src/ffi.rs
+++ b/parser/src/ffi.rs
@@ -356,9 +356,9 @@ fn new_constraint_json(init: &LlgConstraintInit, json_schema: *const c_char) ->
         .map_err(|_| anyhow::anyhow!("Invalid UTF-8 in json_schema"))?;
     let json_schema = serde_json::from_str(json_schema)
         .map_err(|e| anyhow::anyhow!("Invalid JSON in json_schema: {e}"))?;
-    let opts = JsonCompileOptions { compact: false };
+    let opts = JsonCompileOptions::default();
     let grammar = opts
-        .json_to_llg(&json_schema)
+        .json_to_llg(json_schema)
         .map_err(|e| anyhow::anyhow!("Error compiling JSON schema to LLG: {e}"))?;
     init.build_constraint(grammar)
 }
diff --git a/parser/src/grammar_builder.rs b/parser/src/grammar_builder.rs
index 99ad23bf..8dac71f6 100644
--- a/parser/src/grammar_builder.rs
+++ b/parser/src/grammar_builder.rs
@@ -7,7 +7,7 @@ use crate::api::{
     RegexSpec, TopLevelGrammar,
 };
 
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
 pub struct NodeRef {
     idx: usize,
     grammar_id: u32,
@@ -91,6 +91,14 @@ impl RegexBuilder {
         self.add_node(RegexNode::Repeat(node, min, max))
     }
 
+    pub fn not(&mut self, node: RegexId) -> RegexId {
+        self.add_node(RegexNode::Not(node))
+    }
+
+    pub fn and(&mut self, nodes: Vec<RegexId>) -> RegexId {
+        self.add_node(RegexNode::And(nodes))
+    }
+
     fn finalize(&mut self) -> Vec<RegexNode> {
         let r = std::mem::take(&mut self.nodes);
         *self = Self::new();
diff --git a/parser/src/json.rs b/parser/src/json.rs
deleted file mode 100644
index 0fde7f56..00000000
--- a/parser/src/json.rs
+++ /dev/null
@@ -1,625 +0,0 @@
-use anyhow::{anyhow, bail, Result};
-use serde_json::{json, Value};
-use std::{collections::HashMap, vec};
-
-use crate::{
-    api::{GrammarWithLexer, RegexSpec, TopLevelGrammar},
-    GrammarBuilder, NodeRef,
-};
-
-// TODO: grammar size limit
-// TODO: array maxItems etc limits
-// TODO: schemastore/src/schemas/json/BizTalkServerApplicationSchema.json - this breaks 1M fuel on lexer, why?!
-
-#[derive(Debug, Default, Clone)]
-pub struct JsonCompileOptions {
-    pub compact: bool,
-}
-
-fn to_compact_json(target: &serde_json::Value) -> String {
-    serde_json::to_string(target).unwrap()
-}
-
-const KEYWORDS: [&str; 10] = [
-    "anyOf",
-    "oneOf",
-    "allOf",
-    "$ref",
-    "const",
-    "enum",
-    "type",
-    "pattern",
-    "minLength",
-    "maxLength",
-];
-const IGNORED_KEYS: [&str; 19] = [
-    "$schema",
-    "$id",
-    "id",
-    "$comment",
-    "title",
-    "description",
-    "default",
-    "examples",
-    "authors",
-    "deprecationMessage",
-    "enumDescriptions",
-    "example",
-    "postActions",
-    "readOnly",
-    "markdownDescription",
-    "deprecated",
-    "dependencies",
-    "discriminator", // we hope it's part of the grammar anyways
-    "required",      // TODO: implement and remove from ignored list
-];
-// these are also just ignored
-const DEFS_KEYS: [&str; 4] = ["$defs", "definitions", "defs", "refs"];
-
-const ARRAY_KEYS: [&str; 4] = ["items", "prefixItems", "minItems", "maxItems"];
-const OBJECT_KEYS: [&str; 2] = ["properties", "additionalProperties"];
-
-const CHAR_REGEX: &str = r#"(\\([\"\\\/bfnrt]|u[a-fA-F0-9]{4})|[^\"\\\x00-\x1F\x7F])"#;
-
-fn limited_str(node: &Value) -> String {
-    let s = node.to_string();
-    if s.len() > 100 {
-        format!("{}...", &s[..100])
-    } else {
-        s
-    }
-}
-
-fn validate_json_node_keys(node: &Value) -> Result<()> {
-    let node = node
-        .as_object()
-        .ok_or_else(|| anyhow!("Expected object as json schema, got: {}", limited_str(node)))
-        .unwrap();
-
-    let typ = node.get("type").and_then(|v| v.as_str()).unwrap_or("");
-
-    for key in node.keys() {
-        let key = &key.as_str();
-        if KEYWORDS.contains(key) || IGNORED_KEYS.contains(key) || DEFS_KEYS.contains(key) {
-            continue;
-        }
-        if typ == "array" && ARRAY_KEYS.contains(key) {
-            continue;
-        }
-        if typ == "object" && OBJECT_KEYS.contains(key) {
-            continue;
-        }
-        if key.starts_with("x-") || key.starts_with("$xsd-") {
-            continue;
-        }
-        bail!("Unknown key in JSON schema: {:?}", key);
-    }
-
-    Ok(())
-}
-
-struct Compiler {
-    builder: GrammarBuilder,
-    options: JsonCompileOptions,
-    definitions: HashMap<String, NodeRef>,
-    pending_definitions: Vec<(String, NodeRef)>,
-
-    any_cache: Option<NodeRef>,
-    lexeme_cache: HashMap<String, NodeRef>,
-}
-
-macro_rules! cache {
-    ($field:expr, $gen:expr) => {
-        if $field.is_none() {
-            $field = Some($gen);
-        }
-        return $field.unwrap();
-    };
-}
-
-impl JsonCompileOptions {
-    pub fn json_to_llg(&self, schema: &Value) -> Result<TopLevelGrammar> {
-        let mut compiler = Compiler::new(self.clone());
-        #[cfg(feature = "jsonschema_validation")]
-        {
-            use crate::json_validation::validate_schema;
-            validate_schema(schema)?;
-        }
-
-        compiler.execute(schema)?;
-        compiler.builder.finalize()
-    }
-
-    pub fn json_to_llg_no_validate(&self, schema: &Value) -> Result<TopLevelGrammar> {
-        let mut compiler = Compiler::new(self.clone());
-        compiler.execute(schema)?;
-        compiler.builder.finalize()
-    }
-}
-
-fn mk_regex(rx: &str) -> RegexSpec {
-    RegexSpec::Regex(rx.to_string())
-}
-
-trait OptionalField {
-    fn opt_u64(&self, key: &str) -> Result<Option<u64>>;
-    fn opt_str(&self, key: &str) -> Result<Option<&str>>;
-    fn opt_array(&self, key: &str) -> Result<Option<&Vec<Value>>>;
-    #[allow(dead_code)]
-    fn opt_bool(&self, key: &str) -> Result<Option<bool>>;
-    fn opt_object(&self, key: &str) -> Result<Option<&serde_json::Map<String, Value>>>;
-}
-
-fn expected_err(key: &str, val: &Value, expected: &str) -> anyhow::Error {
-    anyhow!(
-        "Expected {} for field {:?}, got: {}",
-        expected,
-        key,
-        limited_str(val)
-    )
-}
-
-impl OptionalField for Value {
-    fn opt_u64(&self, key: &str) -> Result<Option<u64>> {
-        if let Some(val) = self.get(key) {
-            val.as_u64()
-                .ok_or_else(|| expected_err(key, val, "unsigned integer"))
-                .map(Some)
-        } else {
-            Ok(None)
-        }
-    }
-
-    fn opt_str(&self, key: &str) -> Result<Option<&str>> {
-        if let Some(val) = self.get(key) {
-            val.as_str()
-                .ok_or_else(|| expected_err(key, val, "string"))
-                .map(Some)
-        } else {
-            Ok(None)
-        }
-    }
-
-    fn opt_array(&self, key: &str) -> Result<Option<&Vec<Value>>> {
-        if let Some(val) = self.get(key) {
-            val.as_array()
-                .ok_or_else(|| expected_err(key, val, "array"))
-                .map(Some)
-        } else {
-            Ok(None)
-        }
-    }
-
-    fn opt_bool(&self, key: &str) -> Result<Option<bool>> {
-        if let Some(val) = self.get(key) {
-            val.as_bool()
-                .ok_or_else(|| expected_err(key, val, "boolean"))
-                .map(Some)
-        } else {
-            Ok(None)
-        }
-    }
-
-    fn opt_object(&self, key: &str) -> Result<Option<&serde_json::Map<String, Value>>> {
-        if let Some(val) = self.get(key) {
-            val.as_object()
-                .ok_or_else(|| expected_err(key, val, "object"))
-                .map(Some)
-        } else {
-            Ok(None)
-        }
-    }
-}
-
-impl Compiler {
-    pub fn new(options: JsonCompileOptions) -> Self {
-        Self {
-            builder: GrammarBuilder::new(),
-            options,
-            definitions: HashMap::new(),
-            pending_definitions: vec![],
-            lexeme_cache: HashMap::new(),
-            any_cache: None,
-        }
-    }
-
-    pub fn execute(&mut self, schema: &Value) -> Result<()> {
-        self.builder.add_grammar(GrammarWithLexer {
-            greedy_skip_rx: if self.options.compact {
-                None
-            } else {
-                Some(mk_regex(r"[\x20\x0A\x0D\x09]+"))
-            },
-            ..GrammarWithLexer::default()
-        });
-
-        let root = self.gen_json(schema)?;
-        self.builder.set_start_node(root);
-
-        while let Some((path0, pl)) = self.pending_definitions.pop() {
-            // path is #/foo/bar/baz, first split into elements
-            let path = path0.trim_start_matches("#/");
-            let path = path.split('/').collect::<Vec<_>>();
-            let mut node = schema;
-            for elem in path {
-                node = &node[elem];
-            }
-            if node.is_null() {
-                bail!("Definition not found: {}", path0);
-            }
-
-            let compiled = self.gen_json(node)?;
-            self.builder.set_placeholder(pl, compiled);
-        }
-
-        Ok(())
-    }
-
-    fn process_any_of(&mut self, obj: &Value) -> Result<NodeRef> {
-        let arr = obj
-            .as_array()
-            .ok_or_else(|| anyhow!("Expected array in anyOf, got: {}", limited_str(obj)))?
-            .iter()
-            .map(|json_schema| self.gen_json(json_schema))
-            .collect::<Result<Vec<_>>>()?;
-        Ok(self.builder.select(&arr))
-    }
-
-    fn gen_json(&mut self, json_schema: &Value) -> Result<NodeRef> {
-        if json_schema.as_bool() == Some(true) {
-            return Ok(self.gen_json_any());
-        }
-
-        if json_schema.as_bool() == Some(false) {
-            bail!("'false' not supported as schema here");
-        }
-
-        // eprintln!("gen_json: {}", limited_str(json_schema));
-        validate_json_node_keys(json_schema)?;
-
-        // Process anyOf
-        if let Some(any_of) = json_schema.get("anyOf") {
-            return self.process_any_of(any_of);
-        }
-
-        // Process oneOf (same handling as anyOf for now)
-        if let Some(one_of) = json_schema.get("oneOf") {
-            return self.process_any_of(one_of);
-        }
-
-        // Process allOf
-        if let Some(all_of_list) = json_schema.opt_array("allOf")? {
-            if all_of_list.len() != 1 {
-                bail!("Only support allOf with exactly one item");
-            }
-            return self.gen_json(&all_of_list[0]);
-        }
-
-        // Process $ref
-        if let Some(reference) = json_schema.get("$ref") {
-            let ref_str = reference.as_str().ok_or_else(|| {
-                anyhow!("Expected string in $ref, got: {}", limited_str(reference))
-            })?;
-            return self.get_definition(ref_str);
-        }
-
-        // Process const
-        if let Some(const_value) = json_schema.get("const") {
-            let compact_const = to_compact_json(const_value);
-            return Ok(self.builder.string(&compact_const));
-        }
-
-        // Process enum
-        if let Some(enum_array) = json_schema.opt_array("enum")? {
-            let options = enum_array
-                .iter()
-                .map(|opt| self.builder.string(&to_compact_json(opt)))
-                .collect::<Vec<_>>();
-            return Ok(self.builder.select(&options));
-        }
-
-        // Process type-specific keywords
-        if let Some(arr) = json_schema["type"].as_array() {
-            let nodes = arr
-                .iter()
-                .map(|v| {
-                    let tp = v.as_str().ok_or_else(|| {
-                        anyhow!("Expected string in type list, got: {}", limited_str(v))
-                    })?;
-                    self.gen_json_type(tp, json_schema)
-                })
-                .collect::<Result<Vec<_>>>()?;
-            return Ok(self.builder.select(&nodes));
-        }
-
-        if let Some(target_type_str) = json_schema.opt_str("type")? {
-            return self.gen_json_type(target_type_str, json_schema);
-        }
-
-        // Fallback to "any" type
-        Ok(self.gen_json_any())
-    }
-
-    fn gen_json_type(&mut self, target_type_str: &str, json_schema: &Value) -> Result<NodeRef> {
-        match target_type_str {
-            "null" => return Ok(self.builder.string("null")),
-            "boolean" => return Ok(self.lexeme(r"true|false")),
-            "integer" => return Ok(self.json_int()),
-            "number" => return Ok(self.json_number()),
-            "string" => {
-                let min_length = json_schema.opt_u64("minLength")?.unwrap_or(0);
-                let max_length = json_schema.opt_u64("maxLength")?;
-                let pattern = json_schema.opt_str("pattern")?;
-                return self.gen_json_string(min_length, max_length, pattern);
-            }
-            "array" => {
-                let empty = vec![];
-                let prefix_items = json_schema.opt_array("prefixItems")?.unwrap_or(&empty);
-                let item_schema = json_schema.get("items").unwrap_or(&Value::Bool(true));
-                let min_items = json_schema.opt_u64("minItems")?.unwrap_or(0);
-                let max_items = json_schema.opt_u64("maxItems")?;
-                return self.gen_json_array(prefix_items, item_schema, min_items, max_items);
-            }
-            "object" => {
-                let empty = serde_json::Map::default();
-                let properties = json_schema.opt_object("properties")?.unwrap_or(&empty);
-                let additional_properties = json_schema
-                    .get("additionalProperties")
-                    .unwrap_or(&Value::Bool(true));
-                return self.gen_json_object(properties, additional_properties);
-            }
-            _ => bail!("Unsupported type in schema: {}", target_type_str),
-        }
-    }
-
-    fn lexeme(&mut self, rx: &str) -> NodeRef {
-        if self.lexeme_cache.contains_key(rx) {
-            return self.lexeme_cache[rx];
-        }
-        let r = self.builder.lexeme(mk_regex(rx), false);
-        self.lexeme_cache.insert(rx.to_string(), r);
-        r
-    }
-
-    fn json_int(&mut self) -> NodeRef {
-        self.lexeme(r"-?(?:0|[1-9][0-9]*)")
-    }
-
-    fn json_number(&mut self) -> NodeRef {
-        self.lexeme(r"-?(?:0|[1-9][0-9]*)(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")
-    }
-
-    fn json_simple_string(&mut self) -> NodeRef {
-        self.lexeme(&format!("\"{}*\"", CHAR_REGEX))
-    }
-
-    fn get_definition(&mut self, reference: &str) -> Result<NodeRef> {
-        if let Some(definition) = self.definitions.get(reference) {
-            return Ok(*definition);
-        }
-        let r = self.builder.placeholder();
-        self.definitions.insert(reference.to_string(), r);
-        self.pending_definitions.push((reference.to_string(), r));
-        Ok(r)
-    }
-
-    fn gen_json_any(&mut self) -> NodeRef {
-        cache!(self.any_cache, {
-            let json_any = self.builder.placeholder();
-            self.any_cache = Some(json_any); // avoid infinite recursion
-            let all_jsons = json!([
-                {"type": "null"},
-                {"type": "boolean"},
-                {"type": "integer"},
-                {"type": "number"},
-                {"type": "string"},
-                {"type": "array", "items": true},
-                {"type": "object", "additionalProperties": true},
-            ]);
-            let ch = all_jsons
-                .as_array()
-                .unwrap()
-                .iter()
-                .map(|json_schema| self.gen_json(json_schema))
-                .collect::<Result<Vec<_>>>()
-                .unwrap();
-            let inner = self.builder.select(&ch);
-            self.builder.set_placeholder(json_any, inner);
-            json_any
-        });
-    }
-
-    fn gen_json_object(
-        &mut self,
-        properties: &serde_json::Map<String, Value>,
-        additional_properties: &Value,
-    ) -> Result<NodeRef> {
-        let mut grammars: Vec<NodeRef> = vec![self.builder.string("{")];
-
-        if !properties.is_empty() {
-            grammars.extend(self.process_properties(properties)?);
-            if additional_properties != &Value::Bool(false) {
-                grammars.push(self.builder.string(","));
-            }
-        }
-
-        if additional_properties != &Value::Bool(false) {
-            grammars.push(self.process_additional_properties(additional_properties)?);
-        }
-
-        grammars.push(self.builder.string("}"));
-        Ok(self.builder.join(&grammars))
-    }
-
-    fn process_properties(
-        &mut self,
-        properties: &serde_json::Map<String, Value>,
-    ) -> Result<Vec<NodeRef>> {
-        let mut result = vec![];
-        let mut properties_added = 0;
-
-        for (name, property_schema) in properties {
-            result.push(self.builder.string(&format!("\"{}\"", name)));
-            result.push(self.builder.string(":"));
-            result.push(self.gen_json(property_schema)?);
-            properties_added += 1;
-            if properties_added < properties.len() {
-                result.push(self.builder.string(","));
-            }
-        }
-
-        Ok(result)
-    }
-
-    fn process_additional_properties(&mut self, additional_properties: &Value) -> Result<NodeRef> {
-        let str = self.json_simple_string();
-        let colon = self.builder.string(":");
-        let the_rest = self.gen_json(additional_properties)?;
-        let item = self.builder.join(&[str, colon, the_rest]);
-        let inner = self.sequence(item);
-        Ok(self.builder.optional(inner))
-    }
-
-    fn sequence(&mut self, item: NodeRef) -> NodeRef {
-        let comma = self.builder.string(",");
-        let item_comma = self.builder.join(&[item, comma]);
-        let item_comma_star = self.builder.zero_or_more(item_comma);
-        self.builder.join(&[item_comma_star, item])
-    }
-
-    fn gen_json_string(
-        &mut self,
-        min_length: u64,
-        max_length: Option<u64>,
-        regex: Option<&str>,
-    ) -> Result<NodeRef> {
-        if min_length == 0 && max_length.is_none() && regex.is_none() {
-            return Ok(self.json_simple_string());
-        }
-
-        if let Some(regex) = regex {
-            if min_length > 0 || max_length.is_some() {
-                bail!("If a pattern is specified, minLength and maxLength must be unspecified.");
-            }
-            // the regex has implicit ^...$ anyways
-            let regex = regex.trim_start_matches('^').trim_end_matches('$');
-            let node = self.builder.lexeme(mk_regex(regex), true);
-            Ok(node)
-        } else {
-            Ok(self.lexeme(&format!(
-                "\"{}{{{},{}}}\"",
-                CHAR_REGEX,
-                min_length,
-                max_length.map_or("".to_string(), |v| v.to_string())
-            )))
-        }
-    }
-
-    fn gen_json_array(
-        &mut self,
-        prefix_items: &[Value],
-        item_schema: &Value,
-        min_items: u64,
-        max_items: Option<u64>,
-    ) -> Result<NodeRef> {
-        let anything_goes = json!({});
-        let item_schema = if item_schema.as_bool() == Some(true) {
-            &anything_goes
-        } else {
-            item_schema
-        };
-        let item_schema_is_false = item_schema.as_bool() == Some(false);
-
-        if item_schema_is_false && prefix_items.len() < min_items as usize {
-            bail!(
-                "PrefixItems has too few elements ({}) to satisfy minItems ({}) but no extra items were allowed",
-                prefix_items.len(),
-                min_items
-            );
-        }
-
-        if let Some(max_items_value) = max_items {
-            if max_items_value < min_items {
-                bail!(
-                    "maxItems ({}) can't be less than minItems ({})",
-                    max_items_value,
-                    min_items
-                );
-            }
-        }
-
-        let mut required_items = vec![];
-        let mut optional_items = vec![];
-
-        // If max_items is None, we can add an infinite tail of items later
-        let n_to_add = max_items.map_or(prefix_items.len().max(min_items as usize), |max| {
-            max as usize
-        });
-
-        if let Some(item_arr) = item_schema.as_array() {
-            for item in item_arr {
-                required_items.push(self.gen_json(item)?);
-            }
-        } else {
-            let item_schema_compiled = if item_schema_is_false {
-                None
-            } else {
-                Some(self.gen_json(item_schema)?)
-            };
-
-            for i in 0..n_to_add {
-                let item = if i < prefix_items.len() {
-                    self.gen_json(&prefix_items[i])?
-                } else if let Some(compiled) = &item_schema_compiled {
-                    compiled.clone()
-                } else {
-                    break;
-                };
-
-                if i < min_items as usize {
-                    required_items.push(item);
-                } else {
-                    optional_items.push(item);
-                }
-            }
-
-            if max_items.is_none() && !item_schema_is_false {
-                // Add an infinite tail of items
-                optional_items.push(self.sequence(item_schema_compiled.unwrap()));
-            }
-        }
-
-        let mut grammars: Vec<NodeRef> = vec![self.builder.string("[")];
-        let comma = self.builder.string(",");
-
-        if !required_items.is_empty() {
-            grammars.push(required_items[0]);
-            for item in &required_items[1..] {
-                grammars.push(comma);
-                grammars.push(*item);
-            }
-        }
-
-        if !optional_items.is_empty() {
-            let first = optional_items[0];
-            let tail = optional_items
-                .into_iter()
-                .skip(1)
-                .rev()
-                .fold(first, |acc, item| {
-                    let j = self.builder.join(&[comma, item, acc]);
-                    self.builder.optional(j)
-                });
-
-            if !required_items.is_empty() {
-                let j = self.builder.join(&[comma, tail]);
-                grammars.push(self.builder.optional(j));
-            } else {
-                grammars.push(self.builder.optional(tail));
-            }
-        }
-
-        grammars.push(self.builder.string("]"));
-        Ok(self.builder.join(&grammars))
-    }
-}
diff --git a/parser/src/json/compiler.rs b/parser/src/json/compiler.rs
new file mode 100644
index 00000000..12d41c9f
--- /dev/null
+++ b/parser/src/json/compiler.rs
@@ -0,0 +1,713 @@
+use anyhow::{anyhow, bail, Context, Result};
+use indexmap::IndexMap;
+use serde_json::{json, Value};
+use std::{collections::HashMap, vec};
+
+use super::formats::lookup_format;
+use super::numeric::{rx_float_range, rx_int_range};
+use super::schema::{build_schema, Schema};
+use crate::{
+    api::{GrammarWithLexer, RegexSpec, TopLevelGrammar},
+    GrammarBuilder, NodeRef,
+};
+
+// TODO: grammar size limit
+// TODO: array maxItems etc limits
+// TODO: schemastore/src/schemas/json/BizTalkServerApplicationSchema.json - this breaks 1M fuel on lexer, why?!
+
+#[derive(Debug, Clone)]
+pub struct JsonCompileOptions {
+    pub item_separator: String,
+    pub key_separator: String,
+    pub whitespace_flexible: bool,
+}
+
+fn json_dumps(target: &serde_json::Value) -> String {
+    serde_json::to_string(target).unwrap()
+}
+
+#[derive(Debug)]
+struct UnsatisfiableSchemaError {
+    message: String,
+}
+
+impl std::fmt::Display for UnsatisfiableSchemaError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Unsatisfiable schema: {}", self.message)
+    }
+}
+
+const CHAR_REGEX: &str = r#"(\\([\"\\\/bfnrt]|u[a-fA-F0-9]{4})|[^\"\\\x00-\x1F\x7F])"#;
+
+fn check_number_bounds(
+    minimum: Option<f64>,
+    maximum: Option<f64>,
+    exclusive_minimum: bool,
+    exclusive_maximum: bool,
+) -> Result<()> {
+    if let (Some(min), Some(max)) = (minimum, maximum) {
+        if min > max {
+            return Err(anyhow!(UnsatisfiableSchemaError {
+                message: format!("minimum ({}) is greater than maximum ({})", min, max),
+            }));
+        }
+        if min == max && (exclusive_minimum || exclusive_maximum) {
+            let minimum_repr = if exclusive_minimum {
+                "exclusiveMinimum"
+            } else {
+                "minimum"
+            };
+            let maximum_repr = if exclusive_maximum {
+                "exclusiveMaximum"
+            } else {
+                "maximum"
+            };
+            return Err(anyhow!(UnsatisfiableSchemaError {
+                message: format!(
+                    "{} ({}) is equal to {} ({})",
+                    minimum_repr, min, maximum_repr, max
+                ),
+            }));
+        }
+    }
+    Ok(())
+}
+
+struct Compiler {
+    builder: GrammarBuilder,
+    options: JsonCompileOptions,
+    definitions: HashMap<String, NodeRef>,
+    pending_definitions: Vec<(String, NodeRef)>,
+
+    any_cache: Option<NodeRef>,
+    lexeme_cache: HashMap<String, NodeRef>,
+}
+
+macro_rules! cache {
+    ($field:expr, $gen:expr) => {
+        if $field.is_none() {
+            $field = Some($gen);
+        }
+        return Ok($field.unwrap());
+    };
+}
+
+impl Default for JsonCompileOptions {
+    fn default() -> Self {
+        Self {
+            item_separator: ",".to_string(),
+            key_separator: ":".to_string(),
+            whitespace_flexible: true,
+        }
+    }
+}
+
+impl JsonCompileOptions {
+    pub fn json_to_llg(&self, schema: Value) -> Result<TopLevelGrammar> {
+        let mut compiler = Compiler::new(self.clone());
+        #[cfg(feature = "jsonschema_validation")]
+        {
+            use crate::json_validation::validate_schema;
+            validate_schema(&schema)?;
+        }
+
+        compiler.execute(schema)?;
+        compiler.builder.finalize()
+    }
+
+    pub fn json_to_llg_no_validate(&self, schema: Value) -> Result<TopLevelGrammar> {
+        let mut compiler = Compiler::new(self.clone());
+        compiler.execute(schema)?;
+        compiler.builder.finalize()
+    }
+}
+
+fn mk_regex(rx: &str) -> RegexSpec {
+    RegexSpec::Regex(rx.to_string())
+}
+
+impl Compiler {
+    pub fn new(options: JsonCompileOptions) -> Self {
+        Self {
+            builder: GrammarBuilder::new(),
+            options,
+            definitions: HashMap::new(),
+            pending_definitions: vec![],
+            lexeme_cache: HashMap::new(),
+            any_cache: None,
+        }
+    }
+
+    pub fn execute(&mut self, schema: Value) -> Result<()> {
+        self.builder.add_grammar(GrammarWithLexer {
+            greedy_skip_rx: if self.options.whitespace_flexible {
+                Some(mk_regex(r"[\x20\x0A\x0D\x09]+"))
+            } else {
+                None
+            },
+            ..GrammarWithLexer::default()
+        });
+
+        let (compiled_schema, definitions) = build_schema(schema)?;
+
+        let root = self.gen_json(&compiled_schema)?;
+        self.builder.set_start_node(root);
+
+        while let Some((path, pl)) = self.pending_definitions.pop() {
+            let schema = definitions
+                .get(&path)
+                .ok_or_else(|| anyhow!("Definition not found: {}", path))?;
+            let compiled = self.gen_json(schema)?;
+            self.builder.set_placeholder(pl, compiled);
+        }
+
+        Ok(())
+    }
+
+    fn gen_json(&mut self, json_schema: &Schema) -> Result<NodeRef> {
+        match json_schema {
+            Schema::Any => self.gen_json_any(),
+            Schema::Unsatisfiable { reason } => Err(anyhow!(UnsatisfiableSchemaError {
+                message: reason.to_string(),
+            })),
+            Schema::Null => Ok(self.builder.string("null")),
+            Schema::Boolean => Ok(self.lexeme(r"true|false")),
+            Schema::Number {
+                minimum,
+                maximum,
+                exclusive_minimum,
+                exclusive_maximum,
+                integer,
+            } => {
+                let (minimum, exclusive_minimum) = match (minimum, exclusive_minimum) {
+                    (Some(min), Some(xmin)) => {
+                        if xmin >= min {
+                            (Some(*xmin), true)
+                        } else {
+                            (Some(*min), false)
+                        }
+                    }
+                    (Some(min), None) => (Some(*min), false),
+                    (None, Some(xmin)) => (Some(*xmin), true),
+                    (None, None) => (None, false),
+                };
+                let (maximum, exclusive_maximum) = match (maximum, exclusive_maximum) {
+                    (Some(max), Some(xmax)) => {
+                        if xmax <= max {
+                            (Some(*xmax), true)
+                        } else {
+                            (Some(*max), false)
+                        }
+                    }
+                    (Some(max), None) => (Some(*max), false),
+                    (None, Some(xmax)) => (Some(*xmax), true),
+                    (None, None) => (None, false),
+                };
+                if *integer {
+                    self.json_int(minimum, maximum, exclusive_minimum, exclusive_maximum)
+                } else {
+                    self.json_number(minimum, maximum, exclusive_minimum, exclusive_maximum)
+                }
+            }
+            Schema::String {
+                min_length,
+                max_length,
+                pattern,
+                format,
+            } => self.gen_json_string(
+                *min_length,
+                *max_length,
+                pattern.as_deref(),
+                format.as_deref(),
+            ),
+            Schema::Array {
+                min_items,
+                max_items,
+                prefix_items,
+                items,
+            } => self.gen_json_array(
+                prefix_items,
+                items.as_deref().unwrap_or(&Schema::Any),
+                *min_items,
+                *max_items,
+            ),
+            Schema::Object {
+                properties,
+                additional_properties,
+                required,
+            } => self.gen_json_object(
+                properties,
+                additional_properties.as_deref().unwrap_or(&Schema::Any),
+                required.iter().cloned().collect(),
+            ),
+            Schema::Const { value } => self.gen_json_const(value.clone()),
+            Schema::Enum { options } => self.gen_json_enum(options.clone()),
+            Schema::AnyOf { options } => self.process_any_of(options.clone()),
+            Schema::OneOf { options } => self.process_any_of(options.clone()),
+            Schema::Ref { uri, .. } => self.get_definition(uri),
+        }
+    }
+
+    fn process_any_of(&mut self, options: Vec<Schema>) -> Result<NodeRef> {
+        let options = options
+            .iter()
+            .map(|v| self.gen_json(v))
+            .collect::<Result<Vec<_>>>()?;
+        Ok(self.builder.select(&options))
+    }
+
+    fn gen_json_enum(&mut self, options: Vec<Value>) -> Result<NodeRef> {
+        let options = options
+            .into_iter()
+            .map(|v| self.gen_json_const(v))
+            .collect::<Result<Vec<_>>>()?;
+        Ok(self.builder.select(&options))
+    }
+
+    fn gen_json_const(&mut self, const_value: Value) -> Result<NodeRef> {
+        // Recursively build a grammar for a constant value (just to play nicely with separators and whitespace flexibility)
+        match const_value {
+            Value::Object(values) => {
+                let properties = IndexMap::from_iter(
+                    values
+                        .into_iter()
+                        .map(|(k, v)| (k, Schema::Const { value: v })),
+                );
+                let required = properties.keys().cloned().collect();
+                self.gen_json_object(&properties, &Schema::false_schema(), required)
+            }
+            Value::Array(values) => {
+                let n_items = values.len() as u64;
+                let prefix_items = values
+                    .into_iter()
+                    .map(|v| Schema::Const { value: v })
+                    .collect::<Vec<Schema>>();
+                self.gen_json_array(
+                    &prefix_items,
+                    &Schema::false_schema(),
+                    n_items,
+                    Some(n_items),
+                )
+            }
+            _ => {
+                // let serde_json dump simple values
+                let const_str = json_dumps(&const_value);
+                Ok(self.builder.string(&const_str))
+            }
+        }
+    }
+
+    fn lexeme(&mut self, rx: &str) -> NodeRef {
+        if self.lexeme_cache.contains_key(rx) {
+            return self.lexeme_cache[rx];
+        }
+        let r = self.builder.lexeme(mk_regex(rx), false);
+        self.lexeme_cache.insert(rx.to_string(), r);
+        r
+    }
+
+    fn json_int(
+        &mut self,
+        minimum: Option<f64>,
+        maximum: Option<f64>,
+        exclusive_minimum: bool,
+        exclusive_maximum: bool,
+    ) -> Result<NodeRef> {
+        check_number_bounds(minimum, maximum, exclusive_minimum, exclusive_maximum)?;
+        let minimum = match (minimum, exclusive_minimum) {
+            (Some(min_val), true) => {
+                if min_val.fract() != 0.0 {
+                    Some(min_val.ceil())
+                } else {
+                    Some(min_val + 1.0)
+                }
+            }
+            (Some(min_val), false) => Some(min_val.ceil()),
+            _ => None,
+        }
+        .map(|val| val as i64);
+        let maximum = match (maximum, exclusive_maximum) {
+            (Some(max_val), true) => {
+                if max_val.fract() != 0.0 {
+                    Some(max_val.floor())
+                } else {
+                    Some(max_val - 1.0)
+                }
+            }
+            (Some(max_val), false) => Some(max_val.floor()),
+            _ => None,
+        }
+        .map(|val| val as i64);
+        // TODO: handle errors in rx_int_range; currently it just panics
+        let rx = rx_int_range(minimum, maximum).with_context(|| {
+            format!(
+                "Failed to generate regex for integer range: min={:?}, max={:?}",
+                minimum, maximum
+            )
+        })?;
+        Ok(self.lexeme(&rx))
+    }
+
+    fn json_number(
+        &mut self,
+        minimum: Option<f64>,
+        maximum: Option<f64>,
+        exclusive_minimum: bool,
+        exclusive_maximum: bool,
+    ) -> Result<NodeRef> {
+        check_number_bounds(minimum, maximum, exclusive_minimum, exclusive_maximum)?;
+        // TODO: handle errors in rx_float_range; currently it just panics
+        let rx = rx_float_range(minimum, maximum, !exclusive_minimum, !exclusive_maximum)
+            .with_context(|| {
+                format!(
+                    "Failed to generate regex for float range: min={:?}, max={:?}",
+                    minimum, maximum
+                )
+            })?;
+        Ok(self.lexeme(&rx))
+    }
+
+    fn json_simple_string(&mut self) -> NodeRef {
+        self.lexeme(&format!("\"{}*\"", CHAR_REGEX))
+    }
+
+    fn get_definition(&mut self, reference: &str) -> Result<NodeRef> {
+        if let Some(definition) = self.definitions.get(reference) {
+            return Ok(*definition);
+        }
+        let r = self.builder.placeholder();
+        self.definitions.insert(reference.to_string(), r);
+        self.pending_definitions.push((reference.to_string(), r));
+        Ok(r)
+    }
+
+    fn gen_json_any(&mut self) -> Result<NodeRef> {
+        cache!(self.any_cache, {
+            let json_any = self.builder.placeholder();
+            self.any_cache = Some(json_any); // avoid infinite recursion
+            let options = vec![
+                self.builder.string("null"),
+                self.builder.lexeme(mk_regex(r"true|false"), false),
+                self.json_number(None, None, false, false)?,
+                self.json_simple_string(),
+                self.gen_json_array(&[], &Schema::Any, 0, None)?,
+                self.gen_json_object(&IndexMap::new(), &Schema::Any, vec![])?,
+            ];
+            let inner = self.builder.select(&options);
+            self.builder.set_placeholder(json_any, inner);
+            json_any
+        });
+    }
+
+    fn gen_json_object(
+        &mut self,
+        properties: &IndexMap<String, Schema>,
+        additional_properties: &Schema,
+        required: Vec<String>,
+    ) -> Result<NodeRef> {
+        let mut taken_names: Vec<String> = vec![];
+        let mut items: Vec<(NodeRef, bool)> = vec![];
+        for name in properties.keys().chain(
+            required
+                .iter()
+                .filter(|n| !properties.contains_key(n.as_str())),
+        ) {
+            let property_schema = properties.get(name).unwrap_or(additional_properties);
+            let is_required = required.contains(name);
+            // Quote (and escape) the name
+            let quoted_name = json_dumps(&json!(name));
+            let property = match self.gen_json(property_schema) {
+                Ok(node) => node,
+                Err(e) => match e.downcast_ref::<UnsatisfiableSchemaError>() {
+                    // If it's not an UnsatisfiableSchemaError, just propagate it normally
+                    None => return Err(e),
+                    // Property is optional; don't raise UnsatisfiableSchemaError but mark name as taken
+                    Some(_) if !is_required => {
+                        taken_names.push(quoted_name);
+                        continue;
+                    }
+                    // Property is required; add context and propagate UnsatisfiableSchemaError
+                    Some(_) => {
+                        return Err(e.context(UnsatisfiableSchemaError {
+                            message: format!("required property '{}' is unsatisfiable", name),
+                        }));
+                    }
+                },
+            };
+            let name = self.builder.string(&quoted_name);
+            taken_names.push(quoted_name);
+            let colon = self.builder.string(&self.options.key_separator);
+            let item = self.builder.join(&[name, colon, property]);
+            items.push((item, is_required));
+        }
+
+        match self.gen_json(additional_properties) {
+            Err(e) => {
+                if e.downcast_ref::<UnsatisfiableSchemaError>().is_none() {
+                    // Propagate errors that aren't UnsatisfiableSchemaError
+                    return Err(e);
+                }
+                // Ignore UnsatisfiableSchemaError for additionalProperties
+            }
+            Ok(property) => {
+                let name = if taken_names.is_empty() {
+                    self.json_simple_string()
+                } else {
+                    let taken_name_ids = taken_names
+                        .iter()
+                        .map(|n| self.builder.regex.literal(n.to_string()))
+                        .collect::<Vec<_>>();
+                    let taken = self.builder.regex.select(taken_name_ids);
+                    let not_taken = self.builder.regex.not(taken);
+                    let valid = self
+                        .builder
+                        .regex
+                        .regex(r#""([^"\\]|\\["\\/bfnrt]|\\u[0-9a-fA-F]{4})*""#.to_string());
+                    let valid_and_not_taken = self.builder.regex.and(vec![valid, not_taken]);
+                    let rx = RegexSpec::RegexId(valid_and_not_taken);
+                    self.builder.lexeme(rx, false)
+                };
+                let colon = self.builder.string(&self.options.key_separator);
+                let item = self.builder.join(&[name, colon, property]);
+                let seq = self.sequence(item);
+                items.push((seq, false));
+            }
+        }
+        let opener = self.builder.string("{");
+        let inner = self.ordered_sequence(&items, false, &mut HashMap::new());
+        let closer = self.builder.string("}");
self.builder.string("}"); + Ok(self.builder.join(&[opener, inner, closer])) + } + + fn ordered_sequence<'a>( + &mut self, + items: &'a [(NodeRef, bool)], + prefixed: bool, + cache: &mut HashMap<(&'a [(NodeRef, bool)], bool), NodeRef>, + ) -> NodeRef { + // Cache to reduce number of nodes from O(n^2) to O(n) + if let Some(node) = cache.get(&(items, prefixed)) { + return node.clone(); + } + if items.is_empty() { + return self.builder.string(""); + } + let comma = self.builder.string(&self.options.item_separator); + let (item, required) = items[0]; + let rest = &items[1..]; + + let node = match (prefixed, required) { + (true, true) => { + // If we know we have preceeding elements, we can safely just add a (',' + e) + let rest_seq = self.ordered_sequence(rest, true, cache); + self.builder.join(&[comma, item, rest_seq]) + } + (true, false) => { + // If we know we have preceeding elements, we can safely just add an optional(',' + e) + // TODO optimization: if the rest is all optional, we can nest the rest in the optional + let comma_item = self.builder.join(&[comma, item]); + let optional_comma_item = self.builder.optional(comma_item); + let rest_seq = self.ordered_sequence(rest, true, cache); + self.builder.join(&[optional_comma_item, rest_seq]) + } + (false, true) => { + // No preceding elements, so we just add the element (no comma) + let rest_seq = self.ordered_sequence(rest, true, cache); + self.builder.join(&[item, rest_seq]) + } + (false, false) => { + // No preceding elements, but our element is optional. If we add the element, the remaining + // will be prefixed, else they are not. + // TODO: same nested optimization as above + let prefixed_rest = self.ordered_sequence(rest, true, cache); + let unprefixed_rest = self.ordered_sequence(rest, false, cache); + let opts = [self.builder.join(&[item, prefixed_rest]), unprefixed_rest]; + self.builder.select(&opts) + } + }; + cache.insert((items, prefixed), node.clone()); + node + } + + fn sequence(&mut self, item: NodeRef) -> NodeRef { + let comma = self.builder.string(&self.options.item_separator); + let item_comma = self.builder.join(&[item, comma]); + let item_comma_star = self.builder.zero_or_more(item_comma); + self.builder.join(&[item_comma_star, item]) + } + + fn gen_json_string( + &mut self, + min_length: u64, + max_length: Option, + regex: Option<&str>, + format: Option<&str>, + ) -> Result { + if let Some(max_length) = max_length { + if min_length > max_length { + return Err(anyhow!(UnsatisfiableSchemaError { + message: format!( + "minLength ({}) is greater than maxLength ({})", + min_length, max_length + ), + })); + } + } + + let mut regex = regex; + + if let Some(format) = format { + if regex.is_some() { + bail!("Cannot specify both a regex and a format for a JSON string"); + } + if let Some(r) = lookup_format(format) { + regex = Some(r); + } else { + bail!("Unknown format: {}", format) + }; + } + + if min_length == 0 && max_length.is_none() && regex.is_none() { + return Ok(self.json_simple_string()); + } + + if let Some(regex) = regex { + if min_length > 0 || max_length.is_some() { + bail!("If a pattern is specified, minLength and maxLength must be unspecified."); + } + // the regex has implicit ^...$ anyways + let regex = regex.trim_start_matches('^').trim_end_matches('$'); + let node = self.builder.lexeme(mk_regex(regex), true); + Ok(node) + } else { + Ok(self.lexeme(&format!( + "\"{}{{{},{}}}\"", + CHAR_REGEX, + min_length, + max_length.map_or("".to_string(), |v| v.to_string()) + ))) + } + } + + fn gen_json_array( + &mut self, + 
+        prefix_items: &[Schema],
+        item_schema: &Schema,
+        min_items: u64,
+        max_items: Option<u64>,
+    ) -> Result<NodeRef> {
+        let mut max_items = max_items;
+
+        if let Some(max_items) = max_items {
+            if min_items > max_items {
+                return Err(anyhow!(UnsatisfiableSchemaError {
+                    message: format!(
+                        "minItems ({}) is greater than maxItems ({})",
+                        min_items, max_items
+                    ),
+                }));
+            }
+        }
+
+        let additional_item_grm = match self.gen_json(item_schema) {
+            Ok(node) => Some(node),
+            Err(e) => match e.downcast_ref::<UnsatisfiableSchemaError>() {
+                // If it's not an UnsatisfiableSchemaError, just propagate it normally
+                None => return Err(e),
+                // Item is optional; don't raise UnsatisfiableSchemaError
+                Some(_) if prefix_items.len() >= min_items as usize => None,
+                // Item is required; add context and propagate UnsatisfiableSchemaError
+                Some(_) => {
+                    return Err(e.context(UnsatisfiableSchemaError {
+                        message: format!("required item is unsatisfiable"),
+                    }));
+                }
+            },
+        };
+
+        let mut required_items = vec![];
+        let mut optional_items = vec![];
+
+        // If max_items is None, we can add an infinite tail of items later
+        let n_to_add = max_items.map_or(prefix_items.len().max(min_items as usize), |max| {
+            max as usize
+        });
+
+        for i in 0..n_to_add {
+            let item = if i < prefix_items.len() {
+                match self.gen_json(&prefix_items[i]) {
+                    Ok(node) => node,
+                    Err(e) => match e.downcast_ref::<UnsatisfiableSchemaError>() {
+                        // If it's not an UnsatisfiableSchemaError, just propagate it normally
+                        None => return Err(e),
+                        // Item is optional; don't raise UnsatisfiableSchemaError.
+                        // Set max_items to the current index, as we can't satisfy any more items.
+                        Some(_) if i >= min_items as usize => {
+                            max_items = Some(i as u64);
+                            break;
+                        }
+                        // Item is required; add context and propagate UnsatisfiableSchemaError
+                        Some(_) => {
+                            return Err(e.context(UnsatisfiableSchemaError {
+                                message: format!(
+                                    "prefixItems[{}] is unsatisfiable but minItems is {}",
+                                    i, min_items
+                                ),
+                            }));
+                        }
+                    },
+                }
+            } else if let Some(compiled) = &additional_item_grm {
+                compiled.clone()
+            } else {
+                break;
+            };
+
+            if i < min_items as usize {
+                required_items.push(item);
+            } else {
+                optional_items.push(item);
+            }
+        }
+
+        if max_items.is_none() && !additional_item_grm.is_none() {
+            // Add an infinite tail of items
+            optional_items.push(self.sequence(additional_item_grm.unwrap()));
+        }
+
+        let mut grammars: Vec<NodeRef> = vec![self.builder.string("[")];
+        let comma = self.builder.string(&self.options.item_separator);
+
+        if !required_items.is_empty() {
+            grammars.push(required_items[0]);
+            for item in &required_items[1..] {
+                grammars.push(comma);
+                grammars.push(*item);
+            }
+        }
+
+        if !optional_items.is_empty() {
+            let first = optional_items[0];
+            let tail =
+                optional_items
+                    .into_iter()
+                    .skip(1)
+                    .rev()
+                    .fold(self.builder.empty(), |acc, item| {
+                        let j = self.builder.join(&[comma, item, acc]);
+                        self.builder.optional(j)
+                    });
+            let tail = self.builder.join(&[first, tail]);
+
+            if !required_items.is_empty() {
+                let j = self.builder.join(&[comma, tail]);
+                grammars.push(self.builder.optional(j));
+            } else {
+                grammars.push(self.builder.optional(tail));
+            }
+        }
+
+        grammars.push(self.builder.string("]"));
+        Ok(self.builder.join(&grammars))
+    }
+}
diff --git a/parser/src/json/formats.rs b/parser/src/json/formats.rs
new file mode 100644
index 00000000..3ec381c3
--- /dev/null
+++ b/parser/src/json/formats.rs
@@ -0,0 +1,50 @@
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+    static ref FORMAT_PATTERNS: HashMap<&'static str, &'static str> = {
+        HashMap::from([
+            (
+                "date-time",
+                r"(?P<date>[0-9]{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12][0-9]|3[01]))[tT](?P
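
Usage note: the old `JsonCompileOptions { compact: false }` construction is replaced by a `Default` impl, and `json_to_llg` now takes the schema by value (see the `from_guidance.rs` and `ffi.rs` hunks above). Below is a minimal caller sketch, assuming the crate root still re-exports `JsonCompileOptions` and `api::TopLevelGrammar` under the `llguidance_parser` name; those paths are not verified against the rest of the tree.

    use llguidance_parser::{api::TopLevelGrammar, JsonCompileOptions}; // paths assumed
    use serde_json::json;

    fn compile_example() -> anyhow::Result<TopLevelGrammar> {
        // Default reproduces the old non-compact behavior:
        // item_separator: ",", key_separator: ":", whitespace_flexible: true
        let opts = JsonCompileOptions::default();

        // The old `compact: true` mode corresponds to whitespace_flexible: false
        // (execute() then emits no greedy_skip_rx).
        let _compact = JsonCompileOptions {
            item_separator: ",".to_string(),
            key_separator: ":".to_string(),
            whitespace_flexible: false,
        };

        let schema = json!({
            "type": "object",
            "properties": { "id": { "type": "integer", "minimum": 0 } },
            "required": ["id"]
        });
        opts.json_to_llg(schema) // schema is consumed (passed by value) now
    }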