diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index 23f5b7b6..5e0e7cbb 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
 
 [dependencies]
 toktrie = { git = "https://github.com/microsoft/toktrie", rev = "6934722328ee1d3d679f95fcd5c669d47cee08f2" }
-derivre = { git = "https://github.com/microsoft/derivre", rev = "fb0ba7b6307782e0d43a0ca598b237836cb6d304" }
+derivre = { git = "https://github.com/microsoft/derivre", rev = "ad363698cc95d7e63c5116aa114596f18dc79385" }
 serde = { version = "1.0.192", features = ["derive"] }
 serde_json = "1.0.108"
 anyhow = "1.0.75"
diff --git a/parser/src/api.rs b/parser/src/api.rs
index 5c59f023..a0e4ef19 100644
--- a/parser/src/api.rs
+++ b/parser/src/api.rs
@@ -86,6 +86,23 @@ pub enum Node {
         /// Override sampling temperature.
         temperature: Option<f32>,
 
+        /// When set, the lexeme will be quoted as a JSON string.
+        /// For example, /[a-z"]+/ will be quoted as /([a-z]|\\")+/
+        json_string: Option<bool>,
+
+        /// It lists the allowed escape sequences, typically one of:
+        /// "nrbtf\\\"u" - to allow all JSON escapes, including \u00XX for control characters
+        ///     this is the default
+        /// "nrbtf\\\"" - to disallow \u00XX control characters
+        /// "nrt\\\"" - to also disallow unusual escapes (\f and \b)
+        /// "" - to disallow all escapes
+        /// Note that \uXXXX for non-control characters (code points above U+001F) are never allowed,
+        /// as they never have to be quoted in JSON.
+        json_allowed_escapes: Option<String>,
+
+        /// When set and json_string is also set, "..." will not be added around the regular expression.
+        json_raw: Option<bool>,
+
         #[serde(flatten)]
         props: NodeProps,
     },
@@ -115,6 +132,12 @@ pub enum Node {
     },
 }
 
+pub enum JsonQuoteOptions {
+    /// Do not allow \uXXXX in strings. 
Will allow \n, \t, \" etc
+    NoUnicodeEscapes,
+    WithUnicodeEscapes,
+}
+
 /// Optional fields allowed on any Node
 #[derive(Serialize, Deserialize, Default, Clone)]
 pub struct NodeProps {
diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs
index 0f83c94e..a578e059 100644
--- a/parser/src/earley/from_guidance.rs
+++ b/parser/src/earley/from_guidance.rs
@@ -6,8 +6,8 @@ use crate::api::{
     GrammarWithLexer, Node, RegexId, RegexNode, RegexSpec, TopLevelGrammar, DEFAULT_CONTEXTUAL,
 };
 use crate::Logger;
-use anyhow::{bail, Result};
-use derivre::{ExprRef, RegexAst, RegexBuilder};
+use anyhow::{bail, ensure, Result};
+use derivre::{ExprRef, JsonQuoteOptions, RegexAst, RegexBuilder};
 
 fn resolve_rx(rx_refs: &[ExprRef], node: &RegexSpec) -> Result<RegexAst> {
     match node {
@@ -166,12 +166,32 @@ fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> {
                 rx,
                 contextual,
                 temperature,
+                json_allowed_escapes,
+                json_raw,
+                json_string,
                 ..
             } => {
+                let json_options = if json_string.unwrap_or(false) {
+                    Some(JsonQuoteOptions {
+                        allowed_escapes: json_allowed_escapes
+                            .as_ref()
+                            .map_or("nrbtf\\\"u", |e| e.as_str())
+                            .to_string(),
+                        raw_mode: json_raw.unwrap_or(false),
+                    })
+                } else {
+                    ensure!(
+                        json_allowed_escapes.is_none(),
+                        "json_allowed_escapes is only valid for json_string"
+                    );
+                    ensure!(json_raw.is_none(), "json_raw is only valid for json_string");
+                    None
+                };
                 let idx = lexer_spec.add_greedy_lexeme(
                     format!("lex_{}", grm.sym_name(lhs)),
                     resolve_rx(&rx_nodes, rx)?,
                     contextual.unwrap_or(input.contextual.unwrap_or(DEFAULT_CONTEXTUAL)),
+                    json_options,
                 )?;
                 if let Some(t) = temperature {
                     let symprops = grm.sym_props_mut(lhs);
diff --git a/parser/src/earley/lexerspec.rs b/parser/src/earley/lexerspec.rs
index e9dd3537..0fc492ad 100644
--- a/parser/src/earley/lexerspec.rs
+++ b/parser/src/earley/lexerspec.rs
@@ -1,5 +1,5 @@
 use anyhow::Result;
-use derivre::{ExprRef, RegexAst, RegexBuilder};
+use derivre::{ExprRef, JsonQuoteOptions, 
RegexAst, RegexBuilder};
 use std::{fmt::Debug, hash::Hash};
 use toktrie::{bytes::limit_str, SimpleVob};
 
@@ -22,6 +22,7 @@ pub struct LexemeSpec {
     ends_at_eos: bool,
     lazy: bool,
     contextual: bool,
+    json_options: Option<JsonQuoteOptions>,
 }
 
 #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
@@ -129,6 +130,11 @@ impl LexerSpec {
 
     fn add_lexeme_spec(&mut self, mut spec: LexemeSpec) -> Result<LexemeIdx> {
         let compiled = self.regex_builder.mk(&spec.rx)?;
+        let compiled = if let Some(ref opts) = spec.json_options {
+            self.regex_builder.json_quote(compiled, opts)?
+        } else {
+            compiled
+        };
         if let Some(idx) = self
             .lexemes
             .iter()
@@ -152,6 +158,7 @@ impl LexerSpec {
             lazy: false,
             contextual: false,
             ends_at_eos: false,
+            json_options: None,
         }
     }
 
@@ -195,11 +202,13 @@ impl LexerSpec {
         name: String,
         rx: RegexAst,
         contextual: bool,
+        json_options: Option<JsonQuoteOptions>,
     ) -> Result<LexemeIdx> {
         self.add_lexeme_spec(LexemeSpec {
             name,
             rx,
             contextual,
+            json_options,
             ..self.empty_spec()
         })
     }