diff --git a/Cargo.lock b/Cargo.lock index b8fe1ebaf8019..d8493cb7f2b59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -774,7 +774,7 @@ dependencies = [ "tracing-subscriber", "unified-diff", "walkdir", - "windows", + "windows 0.52.0", ] [[package]] @@ -875,8 +875,10 @@ dependencies = [ [[package]] name = "crossbeam-utils" version = "0.8.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +dependencies = [ + "loom", + "rand", +] [[package]] name = "crypto-common" @@ -1533,6 +1535,19 @@ dependencies = [ "windows-bindgen", ] +[[package]] +name = "generator" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e" +dependencies = [ + "cc", + "libc", + "log", + "rustversion", + "windows 0.48.0", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2304,6 +2319,19 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "loom" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e045d70ddfbc984eacfa964ded019534e8f6cbf36f6410aee0ed5cefa5a9175" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "lzma-sys" version = "0.1.20" @@ -2461,13 +2489,12 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", "compiler_builtins", "rustc-std-workspace-alloc", "rustc-std-workspace-core", + "simd-adler32", ] [[package]] @@ -3084,6 +3111,15 @@ version = "0.4.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e9e1dcb320d6839f6edb64f7a4a59d39b30480d4d1765b56873f7c858538a5fe" +[[package]] +name = "quickcheck" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" +dependencies = [ + "rand", +] + [[package]] name = "quine-mc_cluskey" version = "0.2.4" @@ -3211,11 +3247,12 @@ dependencies = [ [[package]] name = "regex" version = "1.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", + "lazy_static", "memchr", + "quickcheck", + "rand", "regex-syntax 0.7.5", ] @@ -3713,7 +3750,7 @@ dependencies = [ "thorin-dwp", "tracing", "wasm-encoder", - "windows", + "windows 0.52.0", ] [[package]] @@ -3770,7 +3807,7 @@ dependencies = [ "tempfile", "thin-vec", "tracing", - "windows", + "windows 0.52.0", ] [[package]] @@ -3832,7 +3869,7 @@ dependencies = [ "shlex", "time", "tracing", - "windows", + "windows 0.52.0", ] [[package]] @@ -3884,7 +3921,7 @@ dependencies = [ "termize", "tracing", "unicode-width", - "windows", + "windows 0.52.0", ] [[package]] @@ -4582,7 +4619,7 @@ dependencies = [ "smallvec", "termize", "tracing", - "windows", + "windows 0.52.0", ] [[package]] @@ -5100,6 +5137,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "siphasher" version = "0.3.11" @@ -5365,7 +5408,7 @@ dependencies = [ "libc", "ntapi", "once_cell", - "windows", + "windows 0.52.0", ] [[package]] @@ -6307,6 +6350,15 @@ version = "0.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index e12c968e205b7..33a43c7c19f0a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,9 @@ [workspace] resolver = "1" members = [ + "miniz_oxide-0.7.2", + "regex-1.8.4", + "crossbeam-utils-0.8.19", "compiler/rustc", "library/std", "library/sysroot", @@ -112,5 +115,9 @@ rustc-std-workspace-core = { path = 'library/rustc-std-workspace-core' } rustc-std-workspace-alloc = { path = 'library/rustc-std-workspace-alloc' } rustc-std-workspace-std = { path = 'library/rustc-std-workspace-std' } +miniz_oxide = { path = 'miniz_oxide-0.7.2' } +regex = { path = 'regex-1.8.4' } +crossbeam-utils = { path = 'crossbeam-utils-0.8.19' } + [patch."https://github.com/rust-lang/rust-clippy"] clippy_lints = { path = "src/tools/clippy/clippy_lints" } diff --git a/compiler/rustc_codegen_ssa/src/back/metadata.rs b/compiler/rustc_codegen_ssa/src/back/metadata.rs index ab1bc0b6cd2eb..18b1370b00552 100644 --- a/compiler/rustc_codegen_ssa/src/back/metadata.rs +++ b/compiler/rustc_codegen_ssa/src/back/metadata.rs @@ -174,7 +174,7 @@ pub(super) fn get_metadata_xcoff<'a>(path: &Path, data: &'a [u8]) -> Result<&'a return Ok(&info_data[offset..(offset + len)]); } else { return Err(format!("Unable to find symbol {AIX_METADATA_SYMBOL_NAME}")); - }; + } } pub(crate) fn create_object_file(sess: &Session) -> Option> { diff --git a/compiler/rustc_const_eval/src/const_eval/eval_queries.rs b/compiler/rustc_const_eval/src/const_eval/eval_queries.rs index 5a1c7cc4209ad..8f06a889de73d 100644 --- a/compiler/rustc_const_eval/src/const_eval/eval_queries.rs 
+++ b/compiler/rustc_const_eval/src/const_eval/eval_queries.rs @@ -257,9 +257,7 @@ pub fn eval_to_const_value_raw_provider<'tcx>( // Catch such calls and evaluate them instead of trying to load a constant's MIR. if let ty::InstanceDef::Intrinsic(def_id) = key.value.instance.def { let ty = key.value.instance.ty(tcx, key.param_env); - let ty::FnDef(_, args) = ty.kind() else { - bug!("intrinsic with type {:?}", ty); - }; + let ty::FnDef(_, args) = ty.kind() else { bug!("intrinsic with type {:?}", ty) }; return eval_nullary_intrinsic(tcx, key.param_env, def_id, args).map_err(|error| { let span = tcx.def_span(def_id); diff --git a/compiler/rustc_const_eval/src/interpret/validity.rs b/compiler/rustc_const_eval/src/interpret/validity.rs index d18600ce7d755..34c3e916a1330 100644 --- a/compiler/rustc_const_eval/src/interpret/validity.rs +++ b/compiler/rustc_const_eval/src/interpret/validity.rs @@ -476,7 +476,7 @@ impl<'rt, 'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> ValidityVisitor<'rt, 'mir, ' Some(CtfeValidationMode::Const { .. }) => { // We can't recursively validate `extern static`, so we better reject them. if self.ecx.tcx.is_foreign_item(did) { - throw_validation_failure!(self.path, ConstRefToExtern); + throw_validation_failure!(self.path, ConstRefToExtern) } } None => {} @@ -518,14 +518,14 @@ impl<'rt, 'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> ValidityVisitor<'rt, 'mir, ' if ptr_expected_mutbl == Mutability::Mut && alloc_actual_mutbl == Mutability::Not { - throw_validation_failure!(self.path, MutableRefToImmutable); + throw_validation_failure!(self.path, MutableRefToImmutable) } // In a const, everything must be completely immutable. if matches!(self.ctfe_mode, Some(CtfeValidationMode::Const { .. 
})) { if ptr_expected_mutbl == Mutability::Mut || alloc_actual_mutbl == Mutability::Mut { - throw_validation_failure!(self.path, ConstRefToMutable); + throw_validation_failure!(self.path, ConstRefToMutable) } } } @@ -621,7 +621,7 @@ impl<'rt, 'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> ValidityVisitor<'rt, 'mir, ' } else { // Otherwise (for standalone Miri), we have to still check it to be non-null. if self.ecx.scalar_may_be_null(value)? { - throw_validation_failure!(self.path, NullFnPtr); + throw_validation_failure!(self.path, NullFnPtr) } } Ok(true) @@ -786,7 +786,7 @@ impl<'rt, 'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> ValueVisitor<'mir, 'tcx, M> if self.ctfe_mode.is_some_and(|c| !c.allow_immutable_unsafe_cell()) { if !op.layout.is_zst() && !op.layout.ty.is_freeze(*self.ecx.tcx, self.ecx.param_env) { if !self.in_mutable_memory(op) { - throw_validation_failure!(self.path, UnsafeCellInImmutable); + throw_validation_failure!(self.path, UnsafeCellInImmutable) } } } @@ -819,7 +819,7 @@ impl<'rt, 'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> ValueVisitor<'mir, 'tcx, M> && def.is_unsafe_cell() { if !self.in_mutable_memory(op) { - throw_validation_failure!(self.path, UnsafeCellInImmutable); + throw_validation_failure!(self.path, UnsafeCellInImmutable) } } } @@ -934,7 +934,7 @@ impl<'rt, 'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> ValueVisitor<'mir, 'tcx, M> match op.layout.abi { Abi::Uninhabited => { let ty = op.layout.ty; - throw_validation_failure!(self.path, UninhabitedVal { ty }); + throw_validation_failure!(self.path, UninhabitedVal { ty }) } Abi::Scalar(scalar_layout) => { if !scalar_layout.is_uninit_valid() { diff --git a/compiler/rustc_driver_impl/src/lib.rs b/compiler/rustc_driver_impl/src/lib.rs index 716e31080dd30..cc01c4867bb0f 100644 --- a/compiler/rustc_driver_impl/src/lib.rs +++ b/compiler/rustc_driver_impl/src/lib.rs @@ -642,21 +642,19 @@ fn process_rlink(sess: &Session, compiler: &interface::Compiler) { }); let (codegen_results, outputs) = match 
CodegenResults::deserialize_rlink(sess, rlink_data) { Ok((codegen, outputs)) => (codegen, outputs), - Err(err) => { - match err { - CodegenErrors::WrongFileType => dcx.emit_fatal(RLinkWrongFileType), - CodegenErrors::EmptyVersionNumber => dcx.emit_fatal(RLinkEmptyVersionNumber), - CodegenErrors::EncodingVersionMismatch { version_array, rlink_version } => sess - .dcx() - .emit_fatal(RLinkEncodingVersionMismatch { version_array, rlink_version }), - CodegenErrors::RustcVersionMismatch { rustc_version } => { - dcx.emit_fatal(RLinkRustcVersionMismatch { - rustc_version, - current_version: sess.cfg_version, - }) - } - }; - } + Err(err) => match err { + CodegenErrors::WrongFileType => dcx.emit_fatal(RLinkWrongFileType), + CodegenErrors::EmptyVersionNumber => dcx.emit_fatal(RLinkEmptyVersionNumber), + CodegenErrors::EncodingVersionMismatch { version_array, rlink_version } => sess + .dcx() + .emit_fatal(RLinkEncodingVersionMismatch { version_array, rlink_version }), + CodegenErrors::RustcVersionMismatch { rustc_version } => { + dcx.emit_fatal(RLinkRustcVersionMismatch { + rustc_version, + current_version: sess.cfg_version, + }) + } + }, }; if compiler.codegen_backend.link(sess, codegen_results, &outputs).is_err() { FatalError.raise(); diff --git a/compiler/rustc_hir_analysis/src/collect/type_of.rs b/compiler/rustc_hir_analysis/src/collect/type_of.rs index 722def2563cbd..0b77d83ce5aa1 100644 --- a/compiler/rustc_hir_analysis/src/collect/type_of.rs +++ b/compiler/rustc_hir_analysis/src/collect/type_of.rs @@ -125,9 +125,7 @@ fn anon_const_type_of<'tcx>(tcx: TyCtxt<'tcx>, def_id: LocalDefId) -> Ty<'tcx> { .filter(|arg| arg.is_ty_or_const()) .position(|arg| arg.hir_id() == hir_id) }) - .unwrap_or_else(|| { - bug!("no arg matching AnonConst in segment"); - }); + .unwrap_or_else(|| bug!("no arg matching AnonConst in segment")); (generics, arg_index) } else { @@ -164,9 +162,7 @@ fn anon_const_type_of<'tcx>(tcx: TyCtxt<'tcx>, def_id: LocalDefId) -> Ty<'tcx> { .filter(|arg| 
arg.is_ty_or_const()) .position(|arg| arg.hir_id() == hir_id) }) - .unwrap_or_else(|| { - bug!("no arg matching AnonConst in segment"); - }); + .unwrap_or_else(|| bug!("no arg matching AnonConst in segment")); (tcx.generics_of(type_dependent_def), idx) } diff --git a/compiler/rustc_hir_typeck/src/fn_ctxt/mod.rs b/compiler/rustc_hir_typeck/src/fn_ctxt/mod.rs index efa2862177e16..99061ff7af1f8 100644 --- a/compiler/rustc_hir_typeck/src/fn_ctxt/mod.rs +++ b/compiler/rustc_hir_typeck/src/fn_ctxt/mod.rs @@ -438,6 +438,11 @@ fn parse_never_type_options_attr( } let fallback = fallback.unwrap_or_else(|| { + if true { + // For a crater experiment, turn off all fallback + return NoFallback; + } + if tcx.features().never_type_fallback { FallbackToNiko } else { FallbackToUnit } }); diff --git a/compiler/rustc_hir_typeck/src/lib.rs b/compiler/rustc_hir_typeck/src/lib.rs index 0b67b37df29d8..b5441bd802e6d 100644 --- a/compiler/rustc_hir_typeck/src/lib.rs +++ b/compiler/rustc_hir_typeck/src/lib.rs @@ -344,9 +344,8 @@ pub struct EnclosingBreakables<'tcx> { impl<'tcx> EnclosingBreakables<'tcx> { fn find_breakable(&mut self, target_id: hir::HirId) -> &mut BreakableCtxt<'tcx> { - self.opt_find_breakable(target_id).unwrap_or_else(|| { - bug!("could not find enclosing breakable with id {}", target_id); - }) + self.opt_find_breakable(target_id) + .unwrap_or_else(|| bug!("could not find enclosing breakable with id {}", target_id)) } fn opt_find_breakable(&mut self, target_id: hir::HirId) -> Option<&mut BreakableCtxt<'tcx>> { diff --git a/compiler/rustc_lint/src/types.rs b/compiler/rustc_lint/src/types.rs index 5331d2fb752de..667e442b0c892 100644 --- a/compiler/rustc_lint/src/types.rs +++ b/compiler/rustc_lint/src/types.rs @@ -1118,7 +1118,7 @@ pub(crate) fn repr_nullable_ptr<'tcx>( WrappingRange { start, end } => { unreachable!("Unhandled start and end range: ({}, {})", start, end) } - }; + } } } None diff --git a/compiler/rustc_macros/src/diagnostics/diagnostic_builder.rs 
b/compiler/rustc_macros/src/diagnostics/diagnostic_builder.rs index ae481efb263df..30c0e79652663 100644 --- a/compiler/rustc_macros/src/diagnostics/diagnostic_builder.rs +++ b/compiler/rustc_macros/src/diagnostics/diagnostic_builder.rs @@ -152,7 +152,7 @@ impl DiagnosticDeriveVariantBuilder { if let SubdiagnosticKind::MultipartSuggestion { .. } = subdiag.kind { throw_invalid_attr!(attr, |diag| diag - .help("consider creating a `Subdiagnostic` instead")); + .help("consider creating a `Subdiagnostic` instead")) } let slug = subdiag.slug.unwrap_or_else(|| match subdiag.kind { @@ -238,7 +238,7 @@ impl DiagnosticDeriveVariantBuilder { } SubdiagnosticKind::Label | SubdiagnosticKind::Suggestion { .. } => { throw_invalid_attr!(attr, |diag| diag - .help("`#[label]` and `#[suggestion]` can only be applied to fields")); + .help("`#[label]` and `#[suggestion]` can only be applied to fields")) } SubdiagnosticKind::MultipartSuggestion { .. } => unreachable!(), } @@ -373,7 +373,7 @@ impl DiagnosticDeriveVariantBuilder { .note("`#[suggestion(...)]` applied to `Vec` field is ambiguous") .help("to show a suggestion consisting of multiple parts, use a `Subdiagnostic` annotated with `#[multipart_suggestion(...)]`") .help("to show a variable set of suggestions, use a `Vec` of `Subdiagnostic`s annotated with `#[suggestion(...)]`") - }); + }) } let (span_field, mut applicability) = self.span_and_applicability_of_ty(info)?; @@ -458,15 +458,13 @@ impl DiagnosticDeriveVariantBuilder { } else if type_matches_path(elem, &["rustc_errors", "Applicability"]) { applicability_idx.set_once(syn::Index::from(idx), elem.span().unwrap()); } else { - type_err(&elem.span())?; + type_err(&elem.span())? } } - let Some((span_idx, _)) = span_idx else { - type_err(&tup.span())?; - }; + let Some((span_idx, _)) = span_idx else { type_err(&tup.span())? }; let Some((applicability_idx, applicability_span)) = applicability_idx else { - type_err(&tup.span())?; + type_err(&tup.span())? 
}; let binding = &info.binding.binding; let span = quote!(#binding.#span_idx); diff --git a/compiler/rustc_macros/src/diagnostics/subdiagnostic.rs b/compiler/rustc_macros/src/diagnostics/subdiagnostic.rs index ced782cdbc016..6ce13cfcc1fb6 100644 --- a/compiler/rustc_macros/src/diagnostics/subdiagnostic.rs +++ b/compiler/rustc_macros/src/diagnostics/subdiagnostic.rs @@ -205,7 +205,7 @@ impl<'parent, 'a> SubdiagnosticDeriveVariantBuilder<'parent, 'a> { format!( "diagnostic slug must be first argument of a `#[{name}(...)]` attribute" ) - ); + ) }; kind_slugs.push((kind, slug, no_span)); @@ -327,7 +327,7 @@ impl<'parent, 'a> SubdiagnosticDeriveVariantBuilder<'parent, 'a> { } else { diag } - }); + }) } self.span_field.set_once(binding, span); @@ -489,7 +489,7 @@ impl<'parent, 'a> SubdiagnosticDeriveVariantBuilder<'parent, 'a> { throw_span_err!( self.variant.ast().ident.span().unwrap(), "subdiagnostic kind not specified" - ); + ) } }; diff --git a/compiler/rustc_macros/src/diagnostics/utils.rs b/compiler/rustc_macros/src/diagnostics/utils.rs index 4684306e23592..9783a846b7228 100644 --- a/compiler/rustc_macros/src/diagnostics/utils.rs +++ b/compiler/rustc_macros/src/diagnostics/utils.rs @@ -77,7 +77,7 @@ pub(crate) fn report_type_error( }, ty_name ) - ); + ) } /// Reports an error if the field's type does not match `path`. @@ -88,7 +88,7 @@ fn report_error_if_not_applied_to_ty( ty_name: &str, ) -> Result<(), DiagnosticDeriveError> { if !type_matches_path(info.ty.inner_type(), path) { - report_type_error(attr, ty_name)?; + report_type_error(attr, ty_name)? } Ok(()) @@ -115,7 +115,7 @@ pub(crate) fn report_error_if_not_applied_to_span( if !type_matches_path(info.ty.inner_type(), &["rustc_span", "Span"]) && !type_matches_path(info.ty.inner_type(), &["rustc_errors", "MultiSpan"]) { - report_type_error(attr, "`Span` or `MultiSpan`")?; + report_type_error(attr, "`Span` or `MultiSpan`")? 
} Ok(()) @@ -662,7 +662,7 @@ impl SubdiagnosticVariant { applicability: None, } } else { - throw_invalid_attr!(attr); + throw_invalid_attr!(attr) } } }; diff --git a/compiler/rustc_metadata/src/dependency_format.rs b/compiler/rustc_metadata/src/dependency_format.rs index 4d1bd45541231..a1b8bf0dedb0e 100644 --- a/compiler/rustc_metadata/src/dependency_format.rs +++ b/compiler/rustc_metadata/src/dependency_format.rs @@ -374,9 +374,8 @@ fn verify_ok(tcx: TyCtxt<'_>, list: &[Linkage]) { } panic_runtime = Some(( cnum, - tcx.required_panic_strategy(cnum).unwrap_or_else(|| { - bug!("cannot determine panic strategy of a panic runtime"); - }), + tcx.required_panic_strategy(cnum) + .unwrap_or_else(|| bug!("cannot determine panic strategy of a panic runtime")), )); } } diff --git a/compiler/rustc_middle/src/ty/mod.rs b/compiler/rustc_middle/src/ty/mod.rs index 6ce53ccc8cd7a..66650e84b863f 100644 --- a/compiler/rustc_middle/src/ty/mod.rs +++ b/compiler/rustc_middle/src/ty/mod.rs @@ -1598,9 +1598,8 @@ impl<'tcx> TyCtxt<'tcx> { /// /// [`opt_item_name`]: Self::opt_item_name pub fn item_name(self, id: DefId) -> Symbol { - self.opt_item_name(id).unwrap_or_else(|| { - bug!("item_name: no name for {:?}", self.def_path(id)); - }) + self.opt_item_name(id) + .unwrap_or_else(|| bug!("item_name: no name for {:?}", self.def_path(id))) } /// Look up the name and span of a definition. 
diff --git a/compiler/rustc_middle/src/ty/typeck_results.rs b/compiler/rustc_middle/src/ty/typeck_results.rs index d8541f4b25a53..09ee0dd4e2772 100644 --- a/compiler/rustc_middle/src/ty/typeck_results.rs +++ b/compiler/rustc_middle/src/ty/typeck_results.rs @@ -518,7 +518,7 @@ fn invalid_hir_id_for_typeck_results(hir_owner: OwnerId, hir_id: hir::HirId) { tcx.hir().node_to_string(hir_id), hir_owner ) - }); + }) } pub struct LocalTableInContext<'a, V> { diff --git a/compiler/rustc_mir_transform/src/dataflow_const_prop.rs b/compiler/rustc_mir_transform/src/dataflow_const_prop.rs index 3e9c1459f1cbe..d2eab21ef0ceb 100644 --- a/compiler/rustc_mir_transform/src/dataflow_const_prop.rs +++ b/compiler/rustc_mir_transform/src/dataflow_const_prop.rs @@ -857,4 +857,4 @@ impl<'tcx> Visitor<'tcx> for OperandCollector<'tcx, '_, '_, '_> { } } } -} +} \ No newline at end of file diff --git a/compiler/rustc_mir_transform/src/jump_threading.rs b/compiler/rustc_mir_transform/src/jump_threading.rs index a458297210db8..8e1758a9686cd 100644 --- a/compiler/rustc_mir_transform/src/jump_threading.rs +++ b/compiler/rustc_mir_transform/src/jump_threading.rs @@ -546,7 +546,7 @@ impl<'tcx, 'a> TOFinder<'tcx, 'a> { }; if writes_discriminant { let discr = self.ecx.discriminant_for_variant(enum_ty, *variant_index).ok()?; - self.process_immediate(bb, discr_target, discr, state)?; + self.process_immediate(bb, discr_target, discr, state)? } } // If we expect `lhs ?= true`, we have an opportunity if we assume `lhs == true`. @@ -557,7 +557,7 @@ impl<'tcx, 'a> TOFinder<'tcx, 'a> { conditions.iter_matches(ScalarInt::TRUE).for_each(register_opportunity); } StatementKind::Assign(box (lhs_place, rhs)) => { - self.process_assign(bb, lhs_place, rhs, state)?; + self.process_assign(bb, lhs_place, rhs, state)? 
} _ => {} } diff --git a/crossbeam-utils-0.8.19/.cargo-ok b/crossbeam-utils-0.8.19/.cargo-ok new file mode 100644 index 0000000000000..5f8b795830acb --- /dev/null +++ b/crossbeam-utils-0.8.19/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/crossbeam-utils-0.8.19/.cargo_vcs_info.json b/crossbeam-utils-0.8.19/.cargo_vcs_info.json new file mode 100644 index 0000000000000..52ab2c10365d8 --- /dev/null +++ b/crossbeam-utils-0.8.19/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "9c3182abebb36bdc9446d75d4644190fef70fa01" + }, + "path_in_vcs": "crossbeam-utils" +} \ No newline at end of file diff --git a/crossbeam-utils-0.8.19/CHANGELOG.md b/crossbeam-utils-0.8.19/CHANGELOG.md new file mode 100644 index 0000000000000..f6f346381ba3a --- /dev/null +++ b/crossbeam-utils-0.8.19/CHANGELOG.md @@ -0,0 +1,235 @@ +# Version 0.8.19 + +- Remove dependency on `cfg-if`. (#1072) + +# Version 0.8.18 + +- Relax the minimum supported Rust version to 1.60. (#1056) +- Improve scalability of `AtomicCell` fallback. (#1055) + +# Version 0.8.17 + +- Bump the minimum supported Rust version to 1.61. (#1037) +- Improve support for targets without atomic CAS or 64-bit atomic. (#1037) +- Always implement `UnwindSafe` and `RefUnwindSafe` for `AtomicCell`. (#1045) +- Improve compatibility with Miri, TSan, and loom. (#995, #1003) +- Improve compatibility with unstable `oom=panic`. (#1045) +- Improve implementation of `CachePadded`. (#1014, #1025) +- Update `loom` dependency to 0.7. + +# Version 0.8.16 + +- Improve implementation of `CachePadded`. (#967) + +# Version 0.8.15 + +- Add `#[clippy::has_significant_drop]` to `ShardedLock{Read,Write}Guard`. (#958) +- Improve handling of very large timeout. (#953) +- Soft-deprecate `thread::scope()` in favor of the more efficient `std::thread::scope` that stabilized on Rust 1.63. (#954) + +# Version 0.8.14 + +- Fix build script bug introduced in 0.8.13. 
(#932) + +# Version 0.8.13 + +**Note:** This release has been yanked due to regression fixed in 0.8.14. + +- Improve support for custom targets. (#922) + +# Version 0.8.12 + +- Removes the dependency on the `once_cell` crate to restore the MSRV. (#913) +- Work around [rust-lang#98302](https://github.com/rust-lang/rust/issues/98302), which causes compile error on windows-gnu when LTO is enabled. (#913) + +# Version 0.8.11 + +- Bump the minimum supported Rust version to 1.38. (#877) + +# Version 0.8.10 + +- Fix unsoundness of `AtomicCell` on types containing niches. (#834) + This fix contains breaking changes, but they are allowed because this is a soundness bug fix. See #834 for more. + +# Version 0.8.9 + +- Replace lazy_static with once_cell. (#817) + +# Version 0.8.8 + +- Fix a bug when unstable `loom` support is enabled. (#787) + +# Version 0.8.7 + +- Add `AtomicCell<{i*,u*}>::{fetch_max,fetch_min}`. (#785) +- Add `AtomicCell<{i*,u*,bool}>::fetch_nand`. (#785) +- Fix unsoundness of `AtomicCell<{i,u}64>` arithmetics on 32-bit targets that support `Atomic{I,U}64` (#781) + +# Version 0.8.6 + +**Note:** This release has been yanked. See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Re-add `AtomicCell<{i,u}64>::{fetch_add,fetch_sub,fetch_and,fetch_or,fetch_xor}` that were accidentally removed in 0.8.0 on targets that do not support `Atomic{I,U}64`. (#767) +- Re-add `AtomicCell<{i,u}128>::{fetch_add,fetch_sub,fetch_and,fetch_or,fetch_xor}` that were accidentally removed in 0.8.0. (#767) + +# Version 0.8.5 + +**Note:** This release has been yanked. See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Add `AtomicCell::fetch_update`. (#704) +- Support targets that do not have atomic CAS on stable Rust. (#698) + +# Version 0.8.4 + +**Note:** This release has been yanked. 
See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Bump `loom` dependency to version 0.5. (#686) + +# Version 0.8.3 + +**Note:** This release has been yanked. See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Make `loom` dependency optional. (#666) + +# Version 0.8.2 + +**Note:** This release has been yanked. See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Deprecate `AtomicCell::compare_and_swap`. Use `AtomicCell::compare_exchange` instead. (#619) +- Add `Parker::park_deadline`. (#563) +- Improve implementation of `CachePadded`. (#636) +- Add unstable support for `loom`. (#487) + +# Version 0.8.1 + +**Note:** This release has been yanked. See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Make `AtomicCell::is_lock_free` always const fn. (#600) +- Fix a bug in `seq_lock_wide`. (#596) +- Remove `const_fn` dependency. (#600) +- `crossbeam-utils` no longer fails to compile if unable to determine rustc version. Instead, it now displays a warning. (#604) + +# Version 0.8.0 + +**Note:** This release has been yanked. See [GHSA-qc84-gqf4-9926](https://github.com/crossbeam-rs/crossbeam/security/advisories/GHSA-qc84-gqf4-9926) for details. + +- Bump the minimum supported Rust version to 1.36. +- Remove deprecated `AtomicCell::get_mut()` and `Backoff::is_complete()` methods. +- Remove `alloc` feature. +- Make `CachePadded::new()` const function. +- Make `AtomicCell::is_lock_free()` const function at 1.46+. +- Implement `From` for `AtomicCell`. + +# Version 0.7.2 + +- Fix bug in release (yanking 0.7.1) + +# Version 0.7.1 + +- Bump `autocfg` dependency to version 1.0. (#460) +- Make `AtomicCell` lockfree for u8, u16, u32, u64 sized values at 1.34+. 
(#454) +# Version 0.7.0 + +- Bump the minimum required version to 1.28. +- Fix breakage with nightly feature due to rust-lang/rust#65214. +- Apply `#[repr(transparent)]` to `AtomicCell`. +- Make `AtomicCell::new()` const function at 1.31+. + +# Version 0.6.6 + +- Add `UnwindSafe` and `RefUnwindSafe` impls for `AtomicCell`. +- Add `AtomicCell::as_ptr()`. +- Add `AtomicCell::take()`. +- Fix a bug in `AtomicCell::compare_exchange()` and `AtomicCell::compare_and_swap()`. +- Various documentation improvements. + +# Version 0.6.5 + +- Rename `Backoff::is_complete()` to `Backoff::is_completed()`. + +# Version 0.6.4 + +- Add `WaitGroup`, `ShardedLock`, and `Backoff`. +- Add `fetch_*` methods for `AtomicCell` and `AtomicCell`. +- Expand documentation. + +# Version 0.6.3 + +- Add `AtomicCell`. +- Improve documentation. + +# Version 0.6.2 + +- Add `Parker`. +- Improve documentation. + +# Version 0.6.1 + +- Fix a soundness bug in `Scope::spawn()`. +- Remove the `T: 'scope` bound on `ScopedJoinHandle`. + +# Version 0.6.0 + +- Move `AtomicConsume` to `atomic` module. +- `scope()` returns a `Result` of thread joins. +- Remove `spawn_unchecked`. +- Fix a soundness bug due to incorrect lifetimes. +- Improve documentation. +- Support nested scoped spawns. +- Implement `Copy`, `Hash`, `PartialEq`, and `Eq` for `CachePadded`. +- Add `CachePadded::into_inner()`. + +# Version 0.5.0 + +- Reorganize sub-modules and rename functions. + +# Version 0.4.1 + +- Fix a documentation link. + +# Version 0.4.0 + +- `CachePadded` supports types bigger than 64 bytes. +- Fix a bug in scoped threads where uninitialized memory was being dropped. +- Minimum required Rust version is now 1.25. + +# Version 0.3.2 + +- Mark `load_consume` with `#[inline]`. + +# Version 0.3.1 + +- `load_consume` on ARM and AArch64. + +# Version 0.3.0 + +- Add `join` for scoped thread API. +- Add `load_consume` for atomic load-consume memory ordering. +- Remove `AtomicOption`. + +# Version 0.2.2 + +- Support Rust 1.12.1.
+- Call `T::clone` when cloning a `CachePadded`. + +# Version 0.2.1 + +- Add `use_std` feature. + +# Version 0.2.0 + +- Add `nightly` feature. +- Use `repr(align(64))` on `CachePadded` with the `nightly` feature. +- Implement `Drop` for `CachePadded`. +- Implement `Clone` for `CachePadded`. +- Implement `From` for `CachePadded`. +- Implement better `Debug` for `CachePadded`. +- Write more tests. +- Add this changelog. +- Change cache line length to 64 bytes. +- Remove `ZerosValid`. + +# Version 0.1.0 + +- Old implementation of `CachePadded` from `crossbeam` version 0.3.0 diff --git a/crossbeam-utils-0.8.19/Cargo.toml b/crossbeam-utils-0.8.19/Cargo.toml new file mode 100644 index 0000000000000..180f72130c873 --- /dev/null +++ b/crossbeam-utils-0.8.19/Cargo.toml @@ -0,0 +1,47 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2021" +rust-version = "1.60" +name = "crossbeam-utils" +version = "0.8.19" +description = "Utilities for concurrent programming" +homepage = "https://github.com/crossbeam-rs/crossbeam/tree/master/crossbeam-utils" +readme = "README.md" +keywords = [ + "scoped", + "thread", + "atomic", + "cache", +] +categories = [ + "algorithms", + "concurrency", + "data-structures", + "no-std", +] +license = "MIT OR Apache-2.0" +repository = "https://github.com/crossbeam-rs/crossbeam" + +[dependencies] + +[dev-dependencies.rand] +version = "0.8" + +[features] +default = ["std"] +nightly = [] +std = [] + +[target."cfg(crossbeam_loom)".dependencies.loom] +version = "0.7.1" +optional = true diff --git a/crossbeam-utils-0.8.19/Cargo.toml.orig b/crossbeam-utils-0.8.19/Cargo.toml.orig new file mode 100644 index 0000000000000..b9a2756d28cc1 --- /dev/null +++ b/crossbeam-utils-0.8.19/Cargo.toml.orig @@ -0,0 +1,43 @@ +[package] +name = "crossbeam-utils" +# When publishing a new version: +# - Update CHANGELOG.md +# - Update README.md +# - Create "crossbeam-utils-X.Y.Z" git tag +version = "0.8.19" +edition = "2021" +rust-version = "1.60" +license = "MIT OR Apache-2.0" +repository = "https://github.com/crossbeam-rs/crossbeam" +homepage = "https://github.com/crossbeam-rs/crossbeam/tree/master/crossbeam-utils" +description = "Utilities for concurrent programming" +keywords = ["scoped", "thread", "atomic", "cache"] +categories = ["algorithms", "concurrency", "data-structures", "no-std"] + +[features] +default = ["std"] + +# Enable to use APIs that require `std`. +# This is enabled by default. +std = [] + +# These features are no longer used. +# TODO: remove in the next major version. +# Enable to use of unstable functionality. +# This is disabled by default and requires recent nightly compiler. +# +# NOTE: This feature is outside of the normal semver guarantees and minor or +# patch versions of crossbeam may make breaking changes to them at any time. 
+nightly = [] + +[dependencies] + +# Enable the use of loom for concurrency testing. +# +# NOTE: This feature is outside of the normal semver guarantees and minor or +# patch versions of crossbeam may make breaking changes to them at any time. +[target.'cfg(crossbeam_loom)'.dependencies] +loom = { version = "0.7.1", optional = true } + +[dev-dependencies] +rand = "0.8" diff --git a/crossbeam-utils-0.8.19/LICENSE-APACHE b/crossbeam-utils-0.8.19/LICENSE-APACHE new file mode 100644 index 0000000000000..16fe87b06e802 --- /dev/null +++ b/crossbeam-utils-0.8.19/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/crossbeam-utils-0.8.19/LICENSE-MIT b/crossbeam-utils-0.8.19/LICENSE-MIT new file mode 100644 index 0000000000000..068d491fd551a --- /dev/null +++ b/crossbeam-utils-0.8.19/LICENSE-MIT @@ -0,0 +1,27 @@ +The MIT License (MIT) + +Copyright (c) 2019 The Crossbeam Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/crossbeam-utils-0.8.19/README.md b/crossbeam-utils-0.8.19/README.md new file mode 100644 index 0000000000000..7d6a679487929 --- /dev/null +++ b/crossbeam-utils-0.8.19/README.md @@ -0,0 +1,73 @@ +# Crossbeam Utils + +[![Build Status](https://github.com/crossbeam-rs/crossbeam/workflows/CI/badge.svg)]( +https://github.com/crossbeam-rs/crossbeam/actions) +[![License](https://img.shields.io/badge/license-MIT_OR_Apache--2.0-blue.svg)]( +https://github.com/crossbeam-rs/crossbeam/tree/master/crossbeam-utils#license) +[![Cargo](https://img.shields.io/crates/v/crossbeam-utils.svg)]( +https://crates.io/crates/crossbeam-utils) +[![Documentation](https://docs.rs/crossbeam-utils/badge.svg)]( +https://docs.rs/crossbeam-utils) +[![Rust 1.60+](https://img.shields.io/badge/rust-1.60+-lightgray.svg)]( +https://www.rust-lang.org) +[![chat](https://img.shields.io/discord/569610676205781012.svg?logo=discord)](https://discord.com/invite/JXYwgWZ) + +This crate provides miscellaneous tools for concurrent programming: + +#### Atomics + +* [`AtomicCell`], a thread-safe mutable memory location. (no_std) +* [`AtomicConsume`], for reading from primitive atomic types with "consume" ordering. (no_std) + +#### Thread synchronization + +* [`Parker`], a thread parking primitive. +* [`ShardedLock`], a sharded reader-writer lock with fast concurrent reads. +* [`WaitGroup`], for synchronizing the beginning or end of some computation. + +#### Utilities + +* [`Backoff`], for exponential backoff in spin loops. (no_std) +* [`CachePadded`], for padding and aligning a value to the length of a cache line. (no_std) +* [`scope`], for spawning threads that borrow local variables from the stack. + +*Features marked with (no_std) can be used in `no_std` environments.*
+ +[`AtomicCell`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/atomic/struct.AtomicCell.html +[`AtomicConsume`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/atomic/trait.AtomicConsume.html +[`Parker`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/sync/struct.Parker.html +[`ShardedLock`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/sync/struct.ShardedLock.html +[`WaitGroup`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/sync/struct.WaitGroup.html +[`Backoff`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/struct.Backoff.html +[`CachePadded`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/struct.CachePadded.html +[`scope`]: https://docs.rs/crossbeam-utils/*/crossbeam_utils/thread/fn.scope.html + +## Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +crossbeam-utils = "0.8" +``` + +## Compatibility + +Crossbeam Utils supports stable Rust releases going back at least six months, +and every time the minimum supported Rust version is increased, a new minor +version is released. Currently, the minimum supported Rust version is 1.60. + +## License + +Licensed under either of + + * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. + +#### Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted +for inclusion in the work by you, as defined in the Apache-2.0 license, shall be +dual licensed as above, without any additional terms or conditions. 
diff --git a/crossbeam-utils-0.8.19/benches/atomic_cell.rs b/crossbeam-utils-0.8.19/benches/atomic_cell.rs new file mode 100644 index 0000000000000..844f7c02b63e1 --- /dev/null +++ b/crossbeam-utils-0.8.19/benches/atomic_cell.rs @@ -0,0 +1,156 @@ +#![feature(test)] + +extern crate test; + +use std::sync::Barrier; + +use crossbeam_utils::atomic::AtomicCell; +use crossbeam_utils::thread; + +#[bench] +fn load_u8(b: &mut test::Bencher) { + let a = AtomicCell::new(0u8); + let mut sum = 0; + b.iter(|| sum += a.load()); + test::black_box(sum); +} + +#[bench] +fn store_u8(b: &mut test::Bencher) { + let a = AtomicCell::new(0u8); + b.iter(|| a.store(1)); +} + +#[bench] +fn fetch_add_u8(b: &mut test::Bencher) { + let a = AtomicCell::new(0u8); + b.iter(|| a.fetch_add(1)); +} + +#[bench] +fn compare_exchange_u8(b: &mut test::Bencher) { + let a = AtomicCell::new(0u8); + let mut i = 0; + b.iter(|| { + let _ = a.compare_exchange(i, i.wrapping_add(1)); + i = i.wrapping_add(1); + }); +} + +#[bench] +fn concurrent_load_u8(b: &mut test::Bencher) { + const THREADS: usize = 2; + const STEPS: usize = 1_000_000; + + let start = Barrier::new(THREADS + 1); + let end = Barrier::new(THREADS + 1); + let exit = AtomicCell::new(false); + + let a = AtomicCell::new(0u8); + + thread::scope(|scope| { + for _ in 0..THREADS { + scope.spawn(|_| loop { + start.wait(); + + let mut sum = 0; + for _ in 0..STEPS { + sum += a.load(); + } + test::black_box(sum); + + end.wait(); + if exit.load() { + break; + } + }); + } + + start.wait(); + end.wait(); + + b.iter(|| { + start.wait(); + end.wait(); + }); + + start.wait(); + exit.store(true); + end.wait(); + }) + .unwrap(); +} + +#[bench] +fn load_usize(b: &mut test::Bencher) { + let a = AtomicCell::new(0usize); + let mut sum = 0; + b.iter(|| sum += a.load()); + test::black_box(sum); +} + +#[bench] +fn store_usize(b: &mut test::Bencher) { + let a = AtomicCell::new(0usize); + b.iter(|| a.store(1)); +} + +#[bench] +fn fetch_add_usize(b: &mut test::Bencher) { + let 
a = AtomicCell::new(0usize); + b.iter(|| a.fetch_add(1)); +} + +#[bench] +fn compare_exchange_usize(b: &mut test::Bencher) { + let a = AtomicCell::new(0usize); + let mut i = 0; + b.iter(|| { + let _ = a.compare_exchange(i, i.wrapping_add(1)); + i = i.wrapping_add(1); + }); +} + +#[bench] +fn concurrent_load_usize(b: &mut test::Bencher) { + const THREADS: usize = 2; + const STEPS: usize = 1_000_000; + + let start = Barrier::new(THREADS + 1); + let end = Barrier::new(THREADS + 1); + let exit = AtomicCell::new(false); + + let a = AtomicCell::new(0usize); + + thread::scope(|scope| { + for _ in 0..THREADS { + scope.spawn(|_| loop { + start.wait(); + + let mut sum = 0; + for _ in 0..STEPS { + sum += a.load(); + } + test::black_box(sum); + + end.wait(); + if exit.load() { + break; + } + }); + } + + start.wait(); + end.wait(); + + b.iter(|| { + start.wait(); + end.wait(); + }); + + start.wait(); + exit.store(true); + end.wait(); + }) + .unwrap(); +} diff --git a/crossbeam-utils-0.8.19/build-common.rs b/crossbeam-utils-0.8.19/build-common.rs new file mode 100644 index 0000000000000..e91bb4d4714a7 --- /dev/null +++ b/crossbeam-utils-0.8.19/build-common.rs @@ -0,0 +1,13 @@ +// The target triplets have the form of 'arch-vendor-system'. +// +// When building for Linux (e.g. the 'system' part is +// 'linux-something'), replace the vendor with 'unknown' +// so that mapping to rust standard targets happens correctly. 
+fn convert_custom_linux_target(target: String) -> String { + let mut parts: Vec<&str> = target.split('-').collect(); + let system = parts.get(2); + if system == Some(&"linux") { + parts[1] = "unknown"; + }; + parts.join("-") +} diff --git a/crossbeam-utils-0.8.19/build.rs b/crossbeam-utils-0.8.19/build.rs new file mode 100644 index 0000000000000..c71c231369f71 --- /dev/null +++ b/crossbeam-utils-0.8.19/build.rs @@ -0,0 +1,47 @@ +// The rustc-cfg listed below are considered public API, but it is *unstable* +// and outside of the normal semver guarantees: +// +// - `crossbeam_no_atomic` +// Assume the target does *not* support any atomic operations. +// This is usually detected automatically by the build script, but you may +// need to enable it manually when building for custom targets or using +// non-cargo build systems that don't run the build script. +// +// With the exceptions mentioned above, the rustc-cfg emitted by the build +// script are *not* public API. + +#![warn(rust_2018_idioms)] + +use std::env; + +include!("no_atomic.rs"); +include!("build-common.rs"); + +fn main() { + println!("cargo:rerun-if-changed=no_atomic.rs"); + + let target = match env::var("TARGET") { + Ok(target) => convert_custom_linux_target(target), + Err(e) => { + println!( + "cargo:warning={}: unable to get TARGET environment variable: {}", + env!("CARGO_PKG_NAME"), + e + ); + return; + } + }; + + // Note that this is `no_`*, not `has_*`. This allows treating as the latest + // stable rustc is used when the build script doesn't run. This is useful + // for non-cargo build systems that don't run the build script. + if NO_ATOMIC.contains(&&*target) { + println!("cargo:rustc-cfg=crossbeam_no_atomic"); + } + + // `cfg(sanitize = "..")` is not stabilized. 
+ let sanitize = env::var("CARGO_CFG_SANITIZE").unwrap_or_default(); + if sanitize.contains("thread") { + println!("cargo:rustc-cfg=crossbeam_sanitize_thread"); + } +} diff --git a/crossbeam-utils-0.8.19/no_atomic.rs b/crossbeam-utils-0.8.19/no_atomic.rs new file mode 100644 index 0000000000000..b97f39706e8b5 --- /dev/null +++ b/crossbeam-utils-0.8.19/no_atomic.rs @@ -0,0 +1,12 @@ +// This file is @generated by no_atomic.sh. +// It is not intended for manual editing. + +const NO_ATOMIC: &[&str] = &[ + "bpfeb-unknown-none", + "bpfel-unknown-none", + "mipsel-sony-psx", + "msp430-none-elf", + "riscv32i-unknown-none-elf", + "riscv32im-unknown-none-elf", + "riscv32imc-unknown-none-elf", +]; diff --git a/crossbeam-utils-0.8.19/src/atomic/atomic_cell.rs b/crossbeam-utils-0.8.19/src/atomic/atomic_cell.rs new file mode 100644 index 0000000000000..06ccf2eb506e1 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/atomic/atomic_cell.rs @@ -0,0 +1,1182 @@ +// Necessary for implementing atomic methods for `AtomicUnit` +#![allow(clippy::unit_arg)] + +use crate::primitive::sync::atomic::{self, Ordering}; +use crate::CachePadded; +use core::cell::UnsafeCell; +use core::cmp; +use core::fmt; +use core::mem::{self, ManuallyDrop, MaybeUninit}; +use core::panic::{RefUnwindSafe, UnwindSafe}; +use core::ptr; + +use super::seq_lock::SeqLock; + +/// A thread-safe mutable memory location. +/// +/// This type is equivalent to [`Cell`], except it can also be shared among multiple threads. +/// +/// Operations on `AtomicCell`s use atomic instructions whenever possible, and synchronize using +/// global locks otherwise. You can call [`AtomicCell::::is_lock_free()`] to check whether +/// atomic instructions or locks will be used. +/// +/// Atomic loads use the [`Acquire`] ordering and atomic stores use the [`Release`] ordering. 
+/// +/// [`Cell`]: std::cell::Cell +/// [`AtomicCell::::is_lock_free()`]: AtomicCell::is_lock_free +/// [`Acquire`]: std::sync::atomic::Ordering::Acquire +/// [`Release`]: std::sync::atomic::Ordering::Release +#[repr(transparent)] +pub struct AtomicCell { + /// The inner value. + /// + /// If this value can be transmuted into a primitive atomic type, it will be treated as such. + /// Otherwise, all potentially concurrent operations on this data will be protected by a global + /// lock. + /// + /// Using MaybeUninit to prevent code outside the cell from observing partially initialized state: + /// + /// + /// Note: + /// - we'll never store uninitialized `T` due to our API only using initialized `T`. + /// - this `MaybeUninit` does *not* fix . + value: UnsafeCell>, +} + +unsafe impl Send for AtomicCell {} +unsafe impl Sync for AtomicCell {} + +impl UnwindSafe for AtomicCell {} +impl RefUnwindSafe for AtomicCell {} + +impl AtomicCell { + /// Creates a new atomic cell initialized with `val`. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(7); + /// ``` + pub const fn new(val: T) -> AtomicCell { + AtomicCell { + value: UnsafeCell::new(MaybeUninit::new(val)), + } + } + + /// Consumes the atomic and returns the contained value. + /// + /// This is safe because passing `self` by value guarantees that no other threads are + /// concurrently accessing the atomic data. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(7); + /// let v = a.into_inner(); + /// + /// assert_eq!(v, 7); + /// ``` + pub fn into_inner(self) -> T { + let this = ManuallyDrop::new(self); + // SAFETY: + // - passing `self` by value guarantees that no other threads are concurrently + // accessing the atomic data + // - the raw pointer passed in is valid because we got it from an owned value. 
+ // - `ManuallyDrop` prevents double dropping `T` + unsafe { this.as_ptr().read() } + } + + /// Returns `true` if operations on values of this type are lock-free. + /// + /// If the compiler or the platform doesn't support the necessary atomic instructions, + /// `AtomicCell` will use global locks for every potentially concurrent atomic operation. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// // This type is internally represented as `AtomicUsize` so we can just use atomic + /// // operations provided by it. + /// assert_eq!(AtomicCell::::is_lock_free(), true); + /// + /// // A wrapper struct around `isize`. + /// struct Foo { + /// bar: isize, + /// } + /// // `AtomicCell` will be internally represented as `AtomicIsize`. + /// assert_eq!(AtomicCell::::is_lock_free(), true); + /// + /// // Operations on zero-sized types are always lock-free. + /// assert_eq!(AtomicCell::<()>::is_lock_free(), true); + /// + /// // Very large types cannot be represented as any of the standard atomic types, so atomic + /// // operations on them will have to use global locks for synchronization. + /// assert_eq!(AtomicCell::<[u8; 1000]>::is_lock_free(), false); + /// ``` + pub const fn is_lock_free() -> bool { + atomic_is_lock_free::() + } + + /// Stores `val` into the atomic cell. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(7); + /// + /// assert_eq!(a.load(), 7); + /// a.store(8); + /// assert_eq!(a.load(), 8); + /// ``` + pub fn store(&self, val: T) { + if mem::needs_drop::() { + drop(self.swap(val)); + } else { + unsafe { + atomic_store(self.as_ptr(), val); + } + } + } + + /// Stores `val` into the atomic cell and returns the previous value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(7); + /// + /// assert_eq!(a.load(), 7); + /// assert_eq!(a.swap(8), 7); + /// assert_eq!(a.load(), 8); + /// ``` + pub fn swap(&self, val: T) -> T { + unsafe { atomic_swap(self.as_ptr(), val) } + } + + /// Returns a raw pointer to the underlying data in this atomic cell. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(5); + /// + /// let ptr = a.as_ptr(); + /// ``` + #[inline] + pub fn as_ptr(&self) -> *mut T { + self.value.get().cast::() + } +} + +impl AtomicCell { + /// Takes the value of the atomic cell, leaving `Default::default()` in its place. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(5); + /// let five = a.take(); + /// + /// assert_eq!(five, 5); + /// assert_eq!(a.into_inner(), 0); + /// ``` + pub fn take(&self) -> T { + self.swap(Default::default()) + } +} + +impl AtomicCell { + /// Loads a value from the atomic cell. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(7); + /// + /// assert_eq!(a.load(), 7); + /// ``` + pub fn load(&self) -> T { + unsafe { atomic_load(self.as_ptr()) } + } +} + +impl AtomicCell { + /// If the current value equals `current`, stores `new` into the atomic cell. + /// + /// The return value is always the previous value. If it is equal to `current`, then the value + /// was updated. + /// + /// # Examples + /// + /// ``` + /// # #![allow(deprecated)] + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(1); + /// + /// assert_eq!(a.compare_and_swap(2, 3), 1); + /// assert_eq!(a.load(), 1); + /// + /// assert_eq!(a.compare_and_swap(1, 2), 1); + /// assert_eq!(a.load(), 2); + /// ``` + // TODO: remove in the next major version. 
+ #[deprecated(note = "Use `compare_exchange` instead")] + pub fn compare_and_swap(&self, current: T, new: T) -> T { + match self.compare_exchange(current, new) { + Ok(v) => v, + Err(v) => v, + } + } + + /// If the current value equals `current`, stores `new` into the atomic cell. + /// + /// The return value is a result indicating whether the new value was written and containing + /// the previous value. On success this value is guaranteed to be equal to `current`. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(1); + /// + /// assert_eq!(a.compare_exchange(2, 3), Err(1)); + /// assert_eq!(a.load(), 1); + /// + /// assert_eq!(a.compare_exchange(1, 2), Ok(1)); + /// assert_eq!(a.load(), 2); + /// ``` + pub fn compare_exchange(&self, current: T, new: T) -> Result { + unsafe { atomic_compare_exchange_weak(self.as_ptr(), current, new) } + } + + /// Fetches the value, and applies a function to it that returns an optional + /// new value. Returns a `Result` of `Ok(previous_value)` if the function returned `Some(_)`, else + /// `Err(previous_value)`. + /// + /// Note: This may call the function multiple times if the value has been changed from other threads in + /// the meantime, as long as the function returns `Some(_)`, but the function will have been applied + /// only once to the stored value. 
+ /// + /// # Examples + /// + /// ```rust + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(7); + /// assert_eq!(a.fetch_update(|_| None), Err(7)); + /// assert_eq!(a.fetch_update(|a| Some(a + 1)), Ok(7)); + /// assert_eq!(a.fetch_update(|a| Some(a + 1)), Ok(8)); + /// assert_eq!(a.load(), 9); + /// ``` + #[inline] + pub fn fetch_update(&self, mut f: F) -> Result + where + F: FnMut(T) -> Option, + { + let mut prev = self.load(); + while let Some(next) = f(prev) { + match self.compare_exchange(prev, next) { + x @ Ok(_) => return x, + Err(next_prev) => prev = next_prev, + } + } + Err(prev) + } +} + +// `MaybeUninit` prevents `T` from being dropped, so we need to implement `Drop` +// for `AtomicCell` to avoid leaks of non-`Copy` types. +impl Drop for AtomicCell { + fn drop(&mut self) { + if mem::needs_drop::() { + // SAFETY: + // - the mutable reference guarantees that no other threads are concurrently accessing the atomic data + // - the raw pointer passed in is valid because we got it from a reference + // - `MaybeUninit` prevents double dropping `T` + unsafe { + self.as_ptr().drop_in_place(); + } + } + } +} + +macro_rules! atomic { + // If values of type `$t` can be transmuted into values of the primitive atomic type `$atomic`, + // declares variable `$a` of type `$atomic` and executes `$atomic_op`, breaking out of the loop. + (@check, $t:ty, $atomic:ty, $a:ident, $atomic_op:expr) => { + if can_transmute::<$t, $atomic>() { + let $a: &$atomic; + break $atomic_op; + } + }; + + // If values of type `$t` can be transmuted into values of a primitive atomic type, declares + // variable `$a` of that type and executes `$atomic_op`. Otherwise, just executes + // `$fallback_op`. 
+ ($t:ty, $a:ident, $atomic_op:expr, $fallback_op:expr) => { + loop { + atomic!(@check, $t, AtomicUnit, $a, $atomic_op); + + atomic!(@check, $t, atomic::AtomicU8, $a, $atomic_op); + atomic!(@check, $t, atomic::AtomicU16, $a, $atomic_op); + atomic!(@check, $t, atomic::AtomicU32, $a, $atomic_op); + #[cfg(target_has_atomic = "64")] + atomic!(@check, $t, atomic::AtomicU64, $a, $atomic_op); + // TODO: AtomicU128 is unstable + // atomic!(@check, $t, atomic::AtomicU128, $a, $atomic_op); + + break $fallback_op; + } + }; +} + +macro_rules! impl_arithmetic { + ($t:ty, fallback, $example:tt) => { + impl AtomicCell<$t> { + /// Increments the current value by `val` and returns the previous value. + /// + /// The addition wraps on overflow. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_add(3), 7); + /// assert_eq!(a.load(), 10); + /// ``` + #[inline] + pub fn fetch_add(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = value.wrapping_add(val); + old + } + + /// Decrements the current value by `val` and returns the previous value. + /// + /// The subtraction wraps on overflow. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_sub(3), 7); + /// assert_eq!(a.load(), 4); + /// ``` + #[inline] + pub fn fetch_sub(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = value.wrapping_sub(val); + old + } + + /// Applies bitwise "and" to the current value and returns the previous value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_and(3), 7); + /// assert_eq!(a.load(), 3); + /// ``` + #[inline] + pub fn fetch_and(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value &= val; + old + } + + /// Applies bitwise "nand" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_nand(3), 7); + /// assert_eq!(a.load(), !(7 & 3)); + /// ``` + #[inline] + pub fn fetch_nand(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = !(old & val); + old + } + + /// Applies bitwise "or" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_or(16), 7); + /// assert_eq!(a.load(), 23); + /// ``` + #[inline] + pub fn fetch_or(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value |= val; + old + } + + /// Applies bitwise "xor" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_xor(2), 7); + /// assert_eq!(a.load(), 5); + /// ``` + #[inline] + pub fn fetch_xor(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value ^= val; + old + } + + /// Compares and sets the maximum of the current value and `val`, + /// and returns the previous value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_max(2), 7); + /// assert_eq!(a.load(), 7); + /// ``` + #[inline] + pub fn fetch_max(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = cmp::max(old, val); + old + } + + /// Compares and sets the minimum of the current value and `val`, + /// and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_min(2), 7); + /// assert_eq!(a.load(), 2); + /// ``` + #[inline] + pub fn fetch_min(&self, val: $t) -> $t { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = cmp::min(old, val); + old + } + } + }; + ($t:ty, $atomic:ident, $example:tt) => { + impl AtomicCell<$t> { + /// Increments the current value by `val` and returns the previous value. + /// + /// The addition wraps on overflow. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_add(3), 7); + /// assert_eq!(a.load(), 10); + /// ``` + #[inline] + pub fn fetch_add(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_add(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = value.wrapping_add(val); + old + } + } + } + + /// Decrements the current value by `val` and returns the previous value. + /// + /// The subtraction wraps on overflow. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_sub(3), 7); + /// assert_eq!(a.load(), 4); + /// ``` + #[inline] + pub fn fetch_sub(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_sub(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = value.wrapping_sub(val); + old + } + } + } + + /// Applies bitwise "and" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_and(3), 7); + /// assert_eq!(a.load(), 3); + /// ``` + #[inline] + pub fn fetch_and(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_and(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value &= val; + old + } + } + } + + /// Applies bitwise "nand" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_nand(3), 7); + /// assert_eq!(a.load(), !(7 & 3)); + /// ``` + #[inline] + pub fn fetch_nand(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_nand(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = !(old & val); + old + } + } + } + + /// Applies bitwise "or" to the current value and returns the previous value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_or(16), 7); + /// assert_eq!(a.load(), 23); + /// ``` + #[inline] + pub fn fetch_or(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_or(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value |= val; + old + } + } + } + + /// Applies bitwise "xor" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_xor(2), 7); + /// assert_eq!(a.load(), 5); + /// ``` + #[inline] + pub fn fetch_xor(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_xor(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value ^= val; + old + } + } + } + + /// Compares and sets the maximum of the current value and `val`, + /// and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_max(9), 7); + /// assert_eq!(a.load(), 9); + /// ``` + #[inline] + pub fn fetch_max(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_max(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = cmp::max(old, val); + old + } + } + } + + /// Compares and sets the minimum of the current value and `val`, + /// and returns the previous value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + #[doc = $example] + /// + /// assert_eq!(a.fetch_min(2), 7); + /// assert_eq!(a.load(), 2); + /// ``` + #[inline] + pub fn fetch_min(&self, val: $t) -> $t { + atomic! { + $t, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::$atomic) }; + a.fetch_min(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = cmp::min(old, val); + old + } + } + } + } + }; +} + +impl_arithmetic!(u8, AtomicU8, "let a = AtomicCell::new(7u8);"); +impl_arithmetic!(i8, AtomicI8, "let a = AtomicCell::new(7i8);"); +impl_arithmetic!(u16, AtomicU16, "let a = AtomicCell::new(7u16);"); +impl_arithmetic!(i16, AtomicI16, "let a = AtomicCell::new(7i16);"); + +impl_arithmetic!(u32, AtomicU32, "let a = AtomicCell::new(7u32);"); +impl_arithmetic!(i32, AtomicI32, "let a = AtomicCell::new(7i32);"); + +#[cfg(target_has_atomic = "64")] +impl_arithmetic!(u64, AtomicU64, "let a = AtomicCell::new(7u64);"); +#[cfg(target_has_atomic = "64")] +impl_arithmetic!(i64, AtomicI64, "let a = AtomicCell::new(7i64);"); +#[cfg(not(target_has_atomic = "64"))] +impl_arithmetic!(u64, fallback, "let a = AtomicCell::new(7u64);"); +#[cfg(not(target_has_atomic = "64"))] +impl_arithmetic!(i64, fallback, "let a = AtomicCell::new(7i64);"); + +// TODO: AtomicU128 is unstable +// impl_arithmetic!(u128, AtomicU128, "let a = AtomicCell::new(7u128);"); +// impl_arithmetic!(i128, AtomicI128, "let a = AtomicCell::new(7i128);"); +impl_arithmetic!(u128, fallback, "let a = AtomicCell::new(7u128);"); +impl_arithmetic!(i128, fallback, "let a = AtomicCell::new(7i128);"); + +impl_arithmetic!(usize, AtomicUsize, "let a = AtomicCell::new(7usize);"); +impl_arithmetic!(isize, AtomicIsize, "let a = AtomicCell::new(7isize);"); + +impl AtomicCell { + /// Applies logical "and" to the current value and returns the previous value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(true); + /// + /// assert_eq!(a.fetch_and(true), true); + /// assert_eq!(a.load(), true); + /// + /// assert_eq!(a.fetch_and(false), true); + /// assert_eq!(a.load(), false); + /// ``` + #[inline] + pub fn fetch_and(&self, val: bool) -> bool { + atomic! { + bool, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::AtomicBool) }; + a.fetch_and(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value &= val; + old + } + } + } + + /// Applies logical "nand" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(true); + /// + /// assert_eq!(a.fetch_nand(false), true); + /// assert_eq!(a.load(), true); + /// + /// assert_eq!(a.fetch_nand(true), true); + /// assert_eq!(a.load(), false); + /// + /// assert_eq!(a.fetch_nand(false), false); + /// assert_eq!(a.load(), true); + /// ``` + #[inline] + pub fn fetch_nand(&self, val: bool) -> bool { + atomic! { + bool, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::AtomicBool) }; + a.fetch_nand(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value = !(old & val); + old + } + } + } + + /// Applies logical "or" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(false); + /// + /// assert_eq!(a.fetch_or(false), false); + /// assert_eq!(a.load(), false); + /// + /// assert_eq!(a.fetch_or(true), false); + /// assert_eq!(a.load(), true); + /// ``` + #[inline] + pub fn fetch_or(&self, val: bool) -> bool { + atomic! 
{ + bool, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::AtomicBool) }; + a.fetch_or(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value |= val; + old + } + } + } + + /// Applies logical "xor" to the current value and returns the previous value. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::atomic::AtomicCell; + /// + /// let a = AtomicCell::new(true); + /// + /// assert_eq!(a.fetch_xor(false), true); + /// assert_eq!(a.load(), true); + /// + /// assert_eq!(a.fetch_xor(true), true); + /// assert_eq!(a.load(), false); + /// ``` + #[inline] + pub fn fetch_xor(&self, val: bool) -> bool { + atomic! { + bool, _a, + { + let a = unsafe { &*(self.as_ptr() as *const atomic::AtomicBool) }; + a.fetch_xor(val, Ordering::AcqRel) + }, + { + let _guard = lock(self.as_ptr() as usize).write(); + let value = unsafe { &mut *(self.as_ptr()) }; + let old = *value; + *value ^= val; + old + } + } + } +} + +impl Default for AtomicCell { + fn default() -> AtomicCell { + AtomicCell::new(T::default()) + } +} + +impl From for AtomicCell { + #[inline] + fn from(val: T) -> AtomicCell { + AtomicCell::new(val) + } +} + +impl fmt::Debug for AtomicCell { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AtomicCell") + .field("value", &self.load()) + .finish() + } +} + +/// Returns `true` if values of type `A` can be transmuted into values of type `B`. +const fn can_transmute() -> bool { + // Sizes must be equal, but alignment of `A` must be greater or equal than that of `B`. + (mem::size_of::() == mem::size_of::()) & (mem::align_of::() >= mem::align_of::()) +} + +/// Returns a reference to the global lock associated with the `AtomicCell` at address `addr`. +/// +/// This function is used to protect atomic data which doesn't fit into any of the primitive atomic +/// types in `std::sync::atomic`. 
Operations on such atomics must therefore use a global lock. +/// +/// However, there is not only one global lock but an array of many locks, and one of them is +/// picked based on the given address. Having many locks reduces contention and improves +/// scalability. +#[inline] +#[must_use] +fn lock(addr: usize) -> &'static SeqLock { + // The number of locks is a prime number because we want to make sure `addr % LEN` gets + // dispersed across all locks. + // + // Note that addresses are always aligned to some power of 2, depending on type `T` in + // `AtomicCell`. If `LEN` was an even number, then `addr % LEN` would be an even number, + // too, which means only half of the locks would get utilized! + // + // It is also possible for addresses to accidentally get aligned to a number that is not a + // power of 2. Consider this example: + // + // ``` + // #[repr(C)] + // struct Foo { + // a: AtomicCell, + // b: u8, + // c: u8, + // } + // ``` + // + // Now, if we have a slice of type `&[Foo]`, it is possible that field `a` in all items gets + // stored at addresses that are multiples of 3. It'd be too bad if `LEN` was divisible by 3. + // In order to protect from such cases, we simply choose a large prime number for `LEN`. + const LEN: usize = 67; + #[allow(clippy::declare_interior_mutable_const)] + const L: CachePadded = CachePadded::new(SeqLock::new()); + static LOCKS: [CachePadded; LEN] = [L; LEN]; + + // If the modulus is a constant number, the compiler will use crazy math to transform this into + // a sequence of cheap arithmetic operations rather than using the slow modulo instruction. + &LOCKS[addr % LEN] +} + +/// An atomic `()`. +/// +/// All operations are noops. 
+struct AtomicUnit; + +impl AtomicUnit { + #[inline] + fn load(&self, _order: Ordering) {} + + #[inline] + fn store(&self, _val: (), _order: Ordering) {} + + #[inline] + fn swap(&self, _val: (), _order: Ordering) {} + + #[inline] + fn compare_exchange_weak( + &self, + _current: (), + _new: (), + _success: Ordering, + _failure: Ordering, + ) -> Result<(), ()> { + Ok(()) + } +} + +/// Returns `true` if operations on `AtomicCell` are lock-free. +const fn atomic_is_lock_free<T>() -> bool { + atomic! { T, _a, true, false } +} + +/// Atomically reads data from `src`. +/// +/// This operation uses the `Acquire` ordering. If possible, an atomic instruction is used, and a +/// global lock otherwise. +unsafe fn atomic_load<T>(src: *mut T) -> T +where + T: Copy, +{ + atomic! { + T, a, + { + a = &*(src as *const _ as *const _); + mem::transmute_copy(&a.load(Ordering::Acquire)) + }, + { + let lock = lock(src as usize); + + // Try doing an optimistic read first. + if let Some(stamp) = lock.optimistic_read() { + // We need a volatile read here because other threads might concurrently modify the + // value. In theory, data races are *always* UB, even if we use volatile reads and + // discard the data when a data race is detected. The proper solution would be to + // do atomic reads and atomic writes, but we can't atomically read and write all + // kinds of data since `AtomicU8` is not available on stable Rust yet. + // Load as `MaybeUninit` because we may load a value that is not valid as `T`. + let val = ptr::read_volatile(src.cast::<MaybeUninit<T>>()); + + if lock.validate_read(stamp) { + return val.assume_init(); + } + } + + // Grab a regular write lock so that writers don't starve this load. + let guard = lock.write(); + let val = ptr::read(src); + // The value hasn't been changed. Drop the guard without incrementing the stamp. + guard.abort(); + val + } + } +} + +/// Atomically writes `val` to `dst`. +/// +/// This operation uses the `Release` ordering.
If possible, an atomic instruction is used, and a +/// global lock otherwise. +unsafe fn atomic_store<T>(dst: *mut T, val: T) { + atomic! { + T, a, + { + a = &*(dst as *const _ as *const _); + a.store(mem::transmute_copy(&val), Ordering::Release); + mem::forget(val); + }, + { + let _guard = lock(dst as usize).write(); + ptr::write(dst, val); + } + } +} + +/// Atomically swaps data at `dst` with `val`. +/// +/// This operation uses the `AcqRel` ordering. If possible, an atomic instruction is used, and a +/// global lock otherwise. +unsafe fn atomic_swap<T>(dst: *mut T, val: T) -> T { + atomic! { + T, a, + { + a = &*(dst as *const _ as *const _); + let res = mem::transmute_copy(&a.swap(mem::transmute_copy(&val), Ordering::AcqRel)); + mem::forget(val); + res + }, + { + let _guard = lock(dst as usize).write(); + ptr::replace(dst, val) + } + } +} + +/// Atomically compares data at `dst` to `current` and, if equal byte-for-byte, exchanges data at +/// `dst` with `new`. +/// +/// Returns the old value on success, or the current value at `dst` on failure. +/// +/// This operation uses the `AcqRel` ordering. If possible, an atomic instruction is used, and a +/// global lock otherwise. +#[allow(clippy::let_unit_value)] +unsafe fn atomic_compare_exchange_weak<T>(dst: *mut T, mut current: T, new: T) -> Result<T, T> +where + T: Copy + Eq, +{ + atomic! { + T, a, + { + a = &*(dst as *const _ as *const _); + let mut current_raw = mem::transmute_copy(&current); + let new_raw = mem::transmute_copy(&new); + + loop { + match a.compare_exchange_weak( + current_raw, + new_raw, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => break Ok(current), + Err(previous_raw) => { + let previous = mem::transmute_copy(&previous_raw); + + if !T::eq(&previous, &current) { + break Err(previous); + } + + // The compare-exchange operation has failed and didn't store `new`. The + // failure is either spurious, or `previous` was semantically equal to + // `current` but not byte-equal.
Let's retry with `previous` as the new + // `current`. + current = previous; + current_raw = previous_raw; + } + } + } + }, + { + let guard = lock(dst as usize).write(); + + if T::eq(&*dst, &current) { + Ok(ptr::replace(dst, new)) + } else { + let val = ptr::read(dst); + // The value hasn't been changed. Drop the guard without incrementing the stamp. + guard.abort(); + Err(val) + } + } + } +} diff --git a/crossbeam-utils-0.8.19/src/atomic/consume.rs b/crossbeam-utils-0.8.19/src/atomic/consume.rs new file mode 100644 index 0000000000000..ff8e316b2c385 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/atomic/consume.rs @@ -0,0 +1,111 @@ +#[cfg(not(crossbeam_no_atomic))] +use core::sync::atomic::Ordering; + +/// Trait which allows reading from primitive atomic types with "consume" ordering. +pub trait AtomicConsume { + /// Type returned by `load_consume`. + type Val; + + /// Loads a value from the atomic using a "consume" memory ordering. + /// + /// This is similar to the "acquire" ordering, except that an ordering is + /// only guaranteed with operations that "depend on" the result of the load. + /// However consume loads are usually much faster than acquire loads on + /// architectures with a weak memory model since they don't require memory + /// fence instructions. + /// + /// The exact definition of "depend on" is a bit vague, but it works as you + /// would expect in practice since a lot of software, especially the Linux + /// kernel, rely on this behavior. + /// + /// This is currently only implemented on ARM and AArch64, where a fence + /// can be avoided. On other architectures this will fall back to a simple + /// `load(Ordering::Acquire)`. + fn load_consume(&self) -> Self::Val; +} + +#[cfg(not(crossbeam_no_atomic))] +// Miri and Loom don't support "consume" ordering and ThreadSanitizer doesn't treat +// load(Relaxed) + compiler_fence(Acquire) as "consume" load.
+// LLVM generates machine code equivalent to fence(Acquire) in compiler_fence(Acquire) +// on PowerPC, MIPS, etc. (https://godbolt.org/z/hffvjvW7h), so for now the fence +// can be actually avoided here only on ARM and AArch64. See also +// https://github.com/rust-lang/rust/issues/62256. +#[cfg(all( + any(target_arch = "arm", target_arch = "aarch64"), + not(any(miri, crossbeam_loom, crossbeam_sanitize_thread)), +))] +macro_rules! impl_consume { + () => { + #[inline] + fn load_consume(&self) -> Self::Val { + use crate::primitive::sync::atomic::compiler_fence; + let result = self.load(Ordering::Relaxed); + compiler_fence(Ordering::Acquire); + result + } + }; +} + +#[cfg(not(crossbeam_no_atomic))] +#[cfg(not(all( + any(target_arch = "arm", target_arch = "aarch64"), + not(any(miri, crossbeam_loom, crossbeam_sanitize_thread)), +)))] +macro_rules! impl_consume { + () => { + #[inline] + fn load_consume(&self) -> Self::Val { + self.load(Ordering::Acquire) + } + }; +} + +macro_rules! impl_atomic { + ($atomic:ident, $val:ty) => { + #[cfg(not(crossbeam_no_atomic))] + impl AtomicConsume for core::sync::atomic::$atomic { + type Val = $val; + impl_consume!(); + } + #[cfg(crossbeam_loom)] + impl AtomicConsume for loom::sync::atomic::$atomic { + type Val = $val; + impl_consume!(); + } + }; +} + +impl_atomic!(AtomicBool, bool); +impl_atomic!(AtomicUsize, usize); +impl_atomic!(AtomicIsize, isize); +impl_atomic!(AtomicU8, u8); +impl_atomic!(AtomicI8, i8); +impl_atomic!(AtomicU16, u16); +impl_atomic!(AtomicI16, i16); +#[cfg(any(target_has_atomic = "32", not(target_pointer_width = "16")))] +impl_atomic!(AtomicU32, u32); +#[cfg(any(target_has_atomic = "32", not(target_pointer_width = "16")))] +impl_atomic!(AtomicI32, i32); +#[cfg(any( + target_has_atomic = "64", + not(any(target_pointer_width = "16", target_pointer_width = "32")), +))] +impl_atomic!(AtomicU64, u64); +#[cfg(any( + target_has_atomic = "64", + not(any(target_pointer_width = "16", target_pointer_width = "32")), +))] 
+impl_atomic!(AtomicI64, i64); + +#[cfg(not(crossbeam_no_atomic))] +impl AtomicConsume for core::sync::atomic::AtomicPtr { + type Val = *mut T; + impl_consume!(); +} + +#[cfg(crossbeam_loom)] +impl AtomicConsume for loom::sync::atomic::AtomicPtr { + type Val = *mut T; + impl_consume!(); +} diff --git a/crossbeam-utils-0.8.19/src/atomic/mod.rs b/crossbeam-utils-0.8.19/src/atomic/mod.rs new file mode 100644 index 0000000000000..7b39fe4747dac --- /dev/null +++ b/crossbeam-utils-0.8.19/src/atomic/mod.rs @@ -0,0 +1,32 @@ +//! Atomic types. +//! +//! * [`AtomicCell`], a thread-safe mutable memory location. +//! * [`AtomicConsume`], for reading from primitive atomic types with "consume" ordering. + +#[cfg(target_has_atomic = "ptr")] +#[cfg(not(crossbeam_loom))] +// Use "wide" sequence lock if the pointer width <= 32 for preventing its counter against wrap +// around. +// +// In narrow architectures (pointer width <= 16), the counter is still <= 32-bit and may be +// vulnerable to wrap around. But it's mostly okay, since in such a primitive hardware, the +// counter will not be increased that fast. +// Note that Rust (and C99) pointers must be at least 16-bits: https://github.com/rust-lang/rust/pull/49305 +#[cfg_attr( + any(target_pointer_width = "16", target_pointer_width = "32"), + path = "seq_lock_wide.rs" +)] +mod seq_lock; + +#[cfg(target_has_atomic = "ptr")] +// We cannot provide AtomicCell under cfg(crossbeam_loom) because loom's atomic +// types have a different in-memory representation than the underlying type. +// TODO: The latest loom supports fences, so fallback using seqlock may be available. 
+#[cfg(not(crossbeam_loom))] +mod atomic_cell; +#[cfg(target_has_atomic = "ptr")] +#[cfg(not(crossbeam_loom))] +pub use atomic_cell::AtomicCell; + +mod consume; +pub use consume::AtomicConsume; diff --git a/crossbeam-utils-0.8.19/src/atomic/seq_lock.rs b/crossbeam-utils-0.8.19/src/atomic/seq_lock.rs new file mode 100644 index 0000000000000..ff8defd26dab8 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/atomic/seq_lock.rs @@ -0,0 +1,112 @@ +use core::mem; +use core::sync::atomic::{self, AtomicUsize, Ordering}; + +use crate::Backoff; + +/// A simple stamped lock. +pub(crate) struct SeqLock { + /// The current state of the lock. + /// + /// All bits except the least significant one hold the current stamp. When locked, the state + /// equals 1 and doesn't contain a valid stamp. + state: AtomicUsize, +} + +impl SeqLock { + pub(crate) const fn new() -> Self { + Self { + state: AtomicUsize::new(0), + } + } + + /// If not locked, returns the current stamp. + /// + /// This method should be called before optimistic reads. + #[inline] + pub(crate) fn optimistic_read(&self) -> Option { + let state = self.state.load(Ordering::Acquire); + if state == 1 { + None + } else { + Some(state) + } + } + + /// Returns `true` if the current stamp is equal to `stamp`. + /// + /// This method should be called after optimistic reads to check whether they are valid. The + /// argument `stamp` should correspond to the one returned by method `optimistic_read`. + #[inline] + pub(crate) fn validate_read(&self, stamp: usize) -> bool { + atomic::fence(Ordering::Acquire); + self.state.load(Ordering::Relaxed) == stamp + } + + /// Grabs the lock for writing. 
+ #[inline] + pub(crate) fn write(&'static self) -> SeqLockWriteGuard { + let backoff = Backoff::new(); + loop { + let previous = self.state.swap(1, Ordering::Acquire); + + if previous != 1 { + atomic::fence(Ordering::Release); + + return SeqLockWriteGuard { + lock: self, + state: previous, + }; + } + + backoff.snooze(); + } + } +} + +/// An RAII guard that releases the lock and increments the stamp when dropped. +pub(crate) struct SeqLockWriteGuard { + /// The parent lock. + lock: &'static SeqLock, + + /// The stamp before locking. + state: usize, +} + +impl SeqLockWriteGuard { + /// Releases the lock without incrementing the stamp. + #[inline] + pub(crate) fn abort(self) { + self.lock.state.store(self.state, Ordering::Release); + + // We specifically don't want to call drop(), since that's + // what increments the stamp. + mem::forget(self); + } +} + +impl Drop for SeqLockWriteGuard { + #[inline] + fn drop(&mut self) { + // Release the lock and increment the stamp. + self.lock + .state + .store(self.state.wrapping_add(2), Ordering::Release); + } +} + +#[cfg(test)] +mod tests { + use super::SeqLock; + + #[test] + fn test_abort() { + static LK: SeqLock = SeqLock::new(); + let before = LK.optimistic_read().unwrap(); + { + let guard = LK.write(); + guard.abort(); + } + let after = LK.optimistic_read().unwrap(); + assert_eq!(before, after, "aborted write does not update the stamp"); + } +} diff --git a/crossbeam-utils-0.8.19/src/atomic/seq_lock_wide.rs b/crossbeam-utils-0.8.19/src/atomic/seq_lock_wide.rs new file mode 100644 index 0000000000000..ef5d94a45413d --- /dev/null +++ b/crossbeam-utils-0.8.19/src/atomic/seq_lock_wide.rs @@ -0,0 +1,155 @@ +use core::mem; +use core::sync::atomic::{self, AtomicUsize, Ordering}; + +use crate::Backoff; + +/// A simple stamped lock. +/// +/// The state is represented as two `AtomicUsize`: `state_hi` for high bits and `state_lo` for low +/// bits. +pub(crate) struct SeqLock { + /// The high bits of the current state of the lock. 
+ state_hi: AtomicUsize, + + /// The low bits of the current state of the lock. + /// + /// All bits except the least significant one hold the current stamp. When locked, the state_lo + /// equals 1 and doesn't contain a valid stamp. + state_lo: AtomicUsize, +} + +impl SeqLock { + pub(crate) const fn new() -> Self { + Self { + state_hi: AtomicUsize::new(0), + state_lo: AtomicUsize::new(0), + } + } + + /// If not locked, returns the current stamp. + /// + /// This method should be called before optimistic reads. + #[inline] + pub(crate) fn optimistic_read(&self) -> Option<(usize, usize)> { + // The acquire loads from `state_hi` and `state_lo` synchronize with the release stores in + // `SeqLockWriteGuard::drop`. + // + // As a consequence, we can make sure that (1) all writes within the era of `state_hi - 1` + // happens before now; and therefore, (2) if `state_lo` is even, all writes within the + // critical section of (`state_hi`, `state_lo`) happens before now. + let state_hi = self.state_hi.load(Ordering::Acquire); + let state_lo = self.state_lo.load(Ordering::Acquire); + if state_lo == 1 { + None + } else { + Some((state_hi, state_lo)) + } + } + + /// Returns `true` if the current stamp is equal to `stamp`. + /// + /// This method should be called after optimistic reads to check whether they are valid. The + /// argument `stamp` should correspond to the one returned by method `optimistic_read`. + #[inline] + pub(crate) fn validate_read(&self, stamp: (usize, usize)) -> bool { + // Thanks to the fence, if we're noticing any modification to the data at the critical + // section of `(a, b)`, then the critical section's write of 1 to state_lo should be + // visible. + atomic::fence(Ordering::Acquire); + + // So if `state_lo` coincides with `stamp.1`, then either (1) we're noticing no modification + // to the data after the critical section of `(stamp.0, stamp.1)`, or (2) `state_lo` wrapped + // around. 
+ // + // If (2) is the case, the acquire ordering ensures we see the new value of `state_hi`. + let state_lo = self.state_lo.load(Ordering::Acquire); + + // If (2) is the case and `state_hi` coincides with `stamp.0`, then `state_hi` also wrapped + // around, which we give up to correctly validate the read. + let state_hi = self.state_hi.load(Ordering::Relaxed); + + // Except for the case that both `state_hi` and `state_lo` wrapped around, the following + // condition implies that we're noticing no modification to the data after the critical + // section of `(stamp.0, stamp.1)`. + (state_hi, state_lo) == stamp + } + + /// Grabs the lock for writing. + #[inline] + pub(crate) fn write(&'static self) -> SeqLockWriteGuard { + let backoff = Backoff::new(); + loop { + let previous = self.state_lo.swap(1, Ordering::Acquire); + + if previous != 1 { + // To synchronize with the acquire fence in `validate_read` via any modification to + // the data at the critical section of `(state_hi, previous)`. + atomic::fence(Ordering::Release); + + return SeqLockWriteGuard { + lock: self, + state_lo: previous, + }; + } + + backoff.snooze(); + } + } +} + +/// An RAII guard that releases the lock and increments the stamp when dropped. +pub(crate) struct SeqLockWriteGuard { + /// The parent lock. + lock: &'static SeqLock, + + /// The stamp before locking. + state_lo: usize, +} + +impl SeqLockWriteGuard { + /// Releases the lock without incrementing the stamp. + #[inline] + pub(crate) fn abort(self) { + self.lock.state_lo.store(self.state_lo, Ordering::Release); + mem::forget(self); + } +} + +impl Drop for SeqLockWriteGuard { + #[inline] + fn drop(&mut self) { + let state_lo = self.state_lo.wrapping_add(2); + + // Increase the high bits if the low bits wrap around. + // + // Release ordering for synchronizing with `optimistic_read`. 
+ if state_lo == 0 { + let state_hi = self.lock.state_hi.load(Ordering::Relaxed); + self.lock + .state_hi + .store(state_hi.wrapping_add(1), Ordering::Release); + } + + // Release the lock and increment the stamp. + // + // Release ordering for synchronizing with `optimistic_read`. + self.lock.state_lo.store(state_lo, Ordering::Release); + } +} + +#[cfg(test)] +mod tests { + use super::SeqLock; + + #[test] + fn test_abort() { + static LK: SeqLock = SeqLock::new(); + let before = LK.optimistic_read().unwrap(); + { + let guard = LK.write(); + guard.abort(); + } + let after = LK.optimistic_read().unwrap(); + assert_eq!(before, after, "aborted write does not update the stamp"); + } +} diff --git a/crossbeam-utils-0.8.19/src/backoff.rs b/crossbeam-utils-0.8.19/src/backoff.rs new file mode 100644 index 0000000000000..7a505ed614e43 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/backoff.rs @@ -0,0 +1,287 @@ +use crate::primitive::hint; +use core::cell::Cell; +use core::fmt; + +const SPIN_LIMIT: u32 = 6; +const YIELD_LIMIT: u32 = 10; + +/// Performs exponential backoff in spin loops. +/// +/// Backing off in spin loops reduces contention and improves overall performance. +/// +/// This primitive can execute *YIELD* and *PAUSE* instructions, yield the current thread to the OS +/// scheduler, and tell when is a good time to block the thread using a different synchronization +/// mechanism. Each step of the back off procedure takes roughly twice as long as the previous +/// step. 
+/// +/// # Examples +/// +/// Backing off in a lock-free loop: +/// +/// ``` +/// use crossbeam_utils::Backoff; +/// use std::sync::atomic::AtomicUsize; +/// use std::sync::atomic::Ordering::SeqCst; +/// +/// fn fetch_mul(a: &AtomicUsize, b: usize) -> usize { +/// let backoff = Backoff::new(); +/// loop { +/// let val = a.load(SeqCst); +/// if a.compare_exchange(val, val.wrapping_mul(b), SeqCst, SeqCst).is_ok() { +/// return val; +/// } +/// backoff.spin(); +/// } +/// } +/// ``` +/// +/// Waiting for an [`AtomicBool`] to become `true`: +/// +/// ``` +/// use crossbeam_utils::Backoff; +/// use std::sync::atomic::AtomicBool; +/// use std::sync::atomic::Ordering::SeqCst; +/// +/// fn spin_wait(ready: &AtomicBool) { +/// let backoff = Backoff::new(); +/// while !ready.load(SeqCst) { +/// backoff.snooze(); +/// } +/// } +/// ``` +/// +/// Waiting for an [`AtomicBool`] to become `true` and parking the thread after a long wait. +/// Note that whoever sets the atomic variable to `true` must notify the parked thread by calling +/// [`unpark()`]: +/// +/// ``` +/// use crossbeam_utils::Backoff; +/// use std::sync::atomic::AtomicBool; +/// use std::sync::atomic::Ordering::SeqCst; +/// use std::thread; +/// +/// fn blocking_wait(ready: &AtomicBool) { +/// let backoff = Backoff::new(); +/// while !ready.load(SeqCst) { +/// if backoff.is_completed() { +/// thread::park(); +/// } else { +/// backoff.snooze(); +/// } +/// } +/// } +/// ``` +/// +/// [`is_completed`]: Backoff::is_completed +/// [`std::thread::park()`]: std::thread::park +/// [`Condvar`]: std::sync::Condvar +/// [`AtomicBool`]: std::sync::atomic::AtomicBool +/// [`unpark()`]: std::thread::Thread::unpark +pub struct Backoff { + step: Cell, +} + +impl Backoff { + /// Creates a new `Backoff`. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::Backoff; + /// + /// let backoff = Backoff::new(); + /// ``` + #[inline] + pub fn new() -> Self { + Backoff { step: Cell::new(0) } + } + + /// Resets the `Backoff`. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::Backoff; + /// + /// let backoff = Backoff::new(); + /// backoff.reset(); + /// ``` + #[inline] + pub fn reset(&self) { + self.step.set(0); + } + + /// Backs off in a lock-free loop. + /// + /// This method should be used when we need to retry an operation because another thread made + /// progress. + /// + /// The processor may yield using the *YIELD* or *PAUSE* instruction. + /// + /// # Examples + /// + /// Backing off in a lock-free loop: + /// + /// ``` + /// use crossbeam_utils::Backoff; + /// use std::sync::atomic::AtomicUsize; + /// use std::sync::atomic::Ordering::SeqCst; + /// + /// fn fetch_mul(a: &AtomicUsize, b: usize) -> usize { + /// let backoff = Backoff::new(); + /// loop { + /// let val = a.load(SeqCst); + /// if a.compare_exchange(val, val.wrapping_mul(b), SeqCst, SeqCst).is_ok() { + /// return val; + /// } + /// backoff.spin(); + /// } + /// } + /// + /// let a = AtomicUsize::new(7); + /// assert_eq!(fetch_mul(&a, 8), 7); + /// assert_eq!(a.load(SeqCst), 56); + /// ``` + #[inline] + pub fn spin(&self) { + for _ in 0..1 << self.step.get().min(SPIN_LIMIT) { + hint::spin_loop(); + } + + if self.step.get() <= SPIN_LIMIT { + self.step.set(self.step.get() + 1); + } + } + + /// Backs off in a blocking loop. + /// + /// This method should be used when we need to wait for another thread to make progress. + /// + /// The processor may yield using the *YIELD* or *PAUSE* instruction and the current thread + /// may yield by giving up a timeslice to the OS scheduler. + /// + /// In `#[no_std]` environments, this method is equivalent to [`spin`]. 
+ /// + /// If possible, use [`is_completed`] to check when it is advised to stop using backoff and + /// block the current thread using a different synchronization mechanism instead. + /// + /// [`spin`]: Backoff::spin + /// [`is_completed`]: Backoff::is_completed + /// + /// # Examples + /// + /// Waiting for an [`AtomicBool`] to become `true`: + /// + /// ``` + /// use crossbeam_utils::Backoff; + /// use std::sync::Arc; + /// use std::sync::atomic::AtomicBool; + /// use std::sync::atomic::Ordering::SeqCst; + /// use std::thread; + /// use std::time::Duration; + /// + /// fn spin_wait(ready: &AtomicBool) { + /// let backoff = Backoff::new(); + /// while !ready.load(SeqCst) { + /// backoff.snooze(); + /// } + /// } + /// + /// let ready = Arc::new(AtomicBool::new(false)); + /// let ready2 = ready.clone(); + /// + /// thread::spawn(move || { + /// thread::sleep(Duration::from_millis(100)); + /// ready2.store(true, SeqCst); + /// }); + /// + /// assert_eq!(ready.load(SeqCst), false); + /// spin_wait(&ready); + /// assert_eq!(ready.load(SeqCst), true); + /// # std::thread::sleep(std::time::Duration::from_millis(500)); // wait for background threads closed: https://github.com/rust-lang/miri/issues/1371 + /// ``` + /// + /// [`AtomicBool`]: std::sync::atomic::AtomicBool + #[inline] + pub fn snooze(&self) { + if self.step.get() <= SPIN_LIMIT { + for _ in 0..1 << self.step.get() { + hint::spin_loop(); + } + } else { + #[cfg(not(feature = "std"))] + for _ in 0..1 << self.step.get() { + hint::spin_loop(); + } + + #[cfg(feature = "std")] + ::std::thread::yield_now(); + } + + if self.step.get() <= YIELD_LIMIT { + self.step.set(self.step.get() + 1); + } + } + + /// Returns `true` if exponential backoff has completed and blocking the thread is advised. 
+ /// + /// # Examples + /// + /// Waiting for an [`AtomicBool`] to become `true` and parking the thread after a long wait: + /// + /// ``` + /// use crossbeam_utils::Backoff; + /// use std::sync::Arc; + /// use std::sync::atomic::AtomicBool; + /// use std::sync::atomic::Ordering::SeqCst; + /// use std::thread; + /// use std::time::Duration; + /// + /// fn blocking_wait(ready: &AtomicBool) { + /// let backoff = Backoff::new(); + /// while !ready.load(SeqCst) { + /// if backoff.is_completed() { + /// thread::park(); + /// } else { + /// backoff.snooze(); + /// } + /// } + /// } + /// + /// let ready = Arc::new(AtomicBool::new(false)); + /// let ready2 = ready.clone(); + /// let waiter = thread::current(); + /// + /// thread::spawn(move || { + /// thread::sleep(Duration::from_millis(100)); + /// ready2.store(true, SeqCst); + /// waiter.unpark(); + /// }); + /// + /// assert_eq!(ready.load(SeqCst), false); + /// blocking_wait(&ready); + /// assert_eq!(ready.load(SeqCst), true); + /// # std::thread::sleep(std::time::Duration::from_millis(500)); // wait for background threads closed: https://github.com/rust-lang/miri/issues/1371 + /// ``` + /// + /// [`AtomicBool`]: std::sync::atomic::AtomicBool + #[inline] + pub fn is_completed(&self) -> bool { + self.step.get() > YIELD_LIMIT + } +} + +impl fmt::Debug for Backoff { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Backoff") + .field("step", &self.step) + .field("is_completed", &self.is_completed()) + .finish() + } +} + +impl Default for Backoff { + fn default() -> Backoff { + Backoff::new() + } +} diff --git a/crossbeam-utils-0.8.19/src/cache_padded.rs b/crossbeam-utils-0.8.19/src/cache_padded.rs new file mode 100644 index 0000000000000..f44f2d7b47ee8 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/cache_padded.rs @@ -0,0 +1,209 @@ +use core::fmt; +use core::ops::{Deref, DerefMut}; + +/// Pads and aligns a value to the length of a cache line. 
+/// +/// In concurrent programming, sometimes it is desirable to make sure commonly accessed pieces of +/// data are not placed into the same cache line. Updating an atomic value invalidates the whole +/// cache line it belongs to, which makes the next access to the same cache line slower for other +/// CPU cores. Use `CachePadded` to ensure updating one piece of data doesn't invalidate other +/// cached data. +/// +/// # Size and alignment +/// +/// Cache lines are assumed to be N bytes long, depending on the architecture: +/// +/// * On x86-64, aarch64, and powerpc64, N = 128. +/// * On arm, mips, mips64, sparc, and hexagon, N = 32. +/// * On m68k, N = 16. +/// * On s390x, N = 256. +/// * On all others, N = 64. +/// +/// Note that N is just a reasonable guess and is not guaranteed to match the actual cache line +/// length of the machine the program is running on. On modern Intel architectures, spatial +/// prefetcher is pulling pairs of 64-byte cache lines at a time, so we pessimistically assume that +/// cache lines are 128 bytes long. +/// +/// The size of `CachePadded` is the smallest multiple of N bytes large enough to accommodate +/// a value of type `T`. +/// +/// The alignment of `CachePadded` is the maximum of N bytes and the alignment of `T`. 
+/// +/// # Examples +/// +/// Alignment and padding: +/// +/// ``` +/// use crossbeam_utils::CachePadded; +/// +/// let array = [CachePadded::new(1i8), CachePadded::new(2i8)]; +/// let addr1 = &*array[0] as *const i8 as usize; +/// let addr2 = &*array[1] as *const i8 as usize; +/// +/// assert!(addr2 - addr1 >= 32); +/// assert_eq!(addr1 % 32, 0); +/// assert_eq!(addr2 % 32, 0); +/// ``` +/// +/// When building a concurrent queue with a head and a tail index, it is wise to place them in +/// different cache lines so that concurrent threads pushing and popping elements don't invalidate +/// each other's cache lines: +/// +/// ``` +/// use crossbeam_utils::CachePadded; +/// use std::sync::atomic::AtomicUsize; +/// +/// struct Queue { +/// head: CachePadded, +/// tail: CachePadded, +/// buffer: *mut T, +/// } +/// ``` +#[derive(Clone, Copy, Default, Hash, PartialEq, Eq)] +// Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache +// lines at a time, so we have to align to 128 bytes rather than 64. +// +// Sources: +// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf +// - https://github.com/facebook/folly/blob/1b5288e6eea6df074758f877c849b6e73bbb9fbb/folly/lang/Align.h#L107 +// +// ARM's big.LITTLE architecture has asymmetric cores and "big" cores have 128-byte cache line size. +// +// Sources: +// - https://www.mono-project.com/news/2016/09/12/arm64-icache/ +// +// powerpc64 has 128-byte cache line size. +// +// Sources: +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_ppc64x.go#L9 +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/powerpc/include/asm/cache.h#L26 +#[cfg_attr( + any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "powerpc64", + ), + repr(align(128)) +)] +// arm, mips, mips64, sparc, and hexagon have 32-byte cache line size. 
+// +// Sources: +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7 +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7 +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7 +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9 +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17 +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12 +#[cfg_attr( + any( + target_arch = "arm", + target_arch = "mips", + target_arch = "mips32r6", + target_arch = "mips64", + target_arch = "mips64r6", + target_arch = "sparc", + target_arch = "hexagon", + ), + repr(align(32)) +)] +// m68k has 16-byte cache line size. +// +// Sources: +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9 +#[cfg_attr(target_arch = "m68k", repr(align(16)))] +// s390x has 256-byte cache line size. +// +// Sources: +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7 +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13 +#[cfg_attr(target_arch = "s390x", repr(align(256)))] +// x86, wasm, riscv, and sparc64 have 64-byte cache line size. 
+// +// Sources: +// - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9 +// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7 +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10 +// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19 +// +// All others are assumed to have 64-byte cache line size. +#[cfg_attr( + not(any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "powerpc64", + target_arch = "arm", + target_arch = "mips", + target_arch = "mips32r6", + target_arch = "mips64", + target_arch = "mips64r6", + target_arch = "sparc", + target_arch = "hexagon", + target_arch = "m68k", + target_arch = "s390x", + )), + repr(align(64)) +)] +pub struct CachePadded { + value: T, +} + +unsafe impl Send for CachePadded {} +unsafe impl Sync for CachePadded {} + +impl CachePadded { + /// Pads and aligns a value to the length of a cache line. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::CachePadded; + /// + /// let padded_value = CachePadded::new(1); + /// ``` + pub const fn new(t: T) -> CachePadded { + CachePadded:: { value: t } + } + + /// Returns the inner value. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::CachePadded; + /// + /// let padded_value = CachePadded::new(7); + /// let value = padded_value.into_inner(); + /// assert_eq!(value, 7); + /// ``` + pub fn into_inner(self) -> T { + self.value + } +} + +impl Deref for CachePadded { + type Target = T; + + fn deref(&self) -> &T { + &self.value + } +} + +impl DerefMut for CachePadded { + fn deref_mut(&mut self) -> &mut T { + &mut self.value + } +} + +impl fmt::Debug for CachePadded { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("CachePadded") + .field("value", &self.value) + .finish() + } +} + +impl From for CachePadded { + fn from(t: T) -> Self { + CachePadded::new(t) + } +} diff --git a/crossbeam-utils-0.8.19/src/lib.rs b/crossbeam-utils-0.8.19/src/lib.rs new file mode 100644 index 0000000000000..106095061b056 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/lib.rs @@ -0,0 +1,103 @@ +//! Miscellaneous tools for concurrent programming. +//! +//! ## Atomics +//! +//! * [`AtomicCell`], a thread-safe mutable memory location. +//! * [`AtomicConsume`], for reading from primitive atomic types with "consume" ordering. +//! +//! ## Thread synchronization +//! +//! * [`Parker`], a thread parking primitive. +//! * [`ShardedLock`], a sharded reader-writer lock with fast concurrent reads. +//! * [`WaitGroup`], for synchronizing the beginning or end of some computation. +//! +//! ## Utilities +//! +//! * [`Backoff`], for exponential backoff in spin loops. +//! * [`CachePadded`], for padding and aligning a value to the length of a cache line. +//! * [`scope`], for spawning threads that borrow local variables from the stack. +//! +//! [`AtomicCell`]: atomic::AtomicCell +//! [`AtomicConsume`]: atomic::AtomicConsume +//! [`Parker`]: sync::Parker +//! [`ShardedLock`]: sync::ShardedLock +//! [`WaitGroup`]: sync::WaitGroup +//! 
[`scope`]: thread::scope + +#![doc(test( + no_crate_inject, + attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_assignments, unused_variables)) +))] +#![warn(missing_docs, missing_debug_implementations, rust_2018_idioms, unreachable_pub)] +#![cfg_attr(not(feature = "std"), no_std)] +#![allow(unexpected_cfgs)] +#![allow(rustc::default_hash_types)] +#![allow(unused_imports)] +#![allow(elided_lifetimes_in_paths)] + +#[cfg(crossbeam_loom)] +#[allow(unused_imports)] +mod primitive { + pub(crate) mod hint { + pub(crate) use loom::hint::spin_loop; + } + pub(crate) mod sync { + pub(crate) mod atomic { + pub(crate) use loom::sync::atomic::{ + AtomicBool, AtomicI16, AtomicI32, AtomicI64, AtomicI8, AtomicIsize, AtomicU16, + AtomicU32, AtomicU64, AtomicU8, AtomicUsize, Ordering, + }; + + // FIXME: loom does not support compiler_fence at the moment. + // https://github.com/tokio-rs/loom/issues/117 + // we use fence as a stand-in for compiler_fence for the time being. + // this may miss some races since fence is stronger than compiler_fence, + // but it's the best we can do for the time being. 
+ pub(crate) use loom::sync::atomic::fence as compiler_fence; + } + pub(crate) use loom::sync::{Arc, Condvar, Mutex}; + } +} +#[cfg(not(crossbeam_loom))] +#[allow(unused_imports)] +mod primitive { + pub(crate) mod hint { + pub(crate) use core::hint::spin_loop; + } + pub(crate) mod sync { + pub(crate) mod atomic { + pub(crate) use core::sync::atomic::{compiler_fence, Ordering}; + #[cfg(not(crossbeam_no_atomic))] + pub(crate) use core::sync::atomic::{ + AtomicBool, AtomicI16, AtomicI8, AtomicIsize, AtomicU16, AtomicU8, AtomicUsize, + }; + #[cfg(not(crossbeam_no_atomic))] + #[cfg(any(target_has_atomic = "32", not(target_pointer_width = "16")))] + pub(crate) use core::sync::atomic::{AtomicI32, AtomicU32}; + #[cfg(not(crossbeam_no_atomic))] + #[cfg(any( + target_has_atomic = "64", + not(any(target_pointer_width = "16", target_pointer_width = "32")), + ))] + pub(crate) use core::sync::atomic::{AtomicI64, AtomicU64}; + } + + #[cfg(feature = "std")] + pub(crate) use std::sync::{Arc, Condvar, Mutex}; + } +} + +pub mod atomic; + +mod cache_padded; +pub use crate::cache_padded::CachePadded; + +mod backoff; +pub use crate::backoff::Backoff; + +#[cfg(feature = "std")] +pub mod sync; + +#[cfg(feature = "std")] +#[cfg(not(crossbeam_loom))] +pub mod thread; diff --git a/crossbeam-utils-0.8.19/src/sync/mod.rs b/crossbeam-utils-0.8.19/src/sync/mod.rs new file mode 100644 index 0000000000000..f9eec71fb3f0c --- /dev/null +++ b/crossbeam-utils-0.8.19/src/sync/mod.rs @@ -0,0 +1,17 @@ +//! Thread synchronization primitives. +//! +//! * [`Parker`], a thread parking primitive. +//! * [`ShardedLock`], a sharded reader-writer lock with fast concurrent reads. +//! * [`WaitGroup`], for synchronizing the beginning or end of some computation. 
+ +#[cfg(not(crossbeam_loom))] +mod once_lock; +mod parker; +#[cfg(not(crossbeam_loom))] +mod sharded_lock; +mod wait_group; + +pub use self::parker::{Parker, Unparker}; +#[cfg(not(crossbeam_loom))] +pub use self::sharded_lock::{ShardedLock, ShardedLockReadGuard, ShardedLockWriteGuard}; +pub use self::wait_group::WaitGroup; diff --git a/crossbeam-utils-0.8.19/src/sync/once_lock.rs b/crossbeam-utils-0.8.19/src/sync/once_lock.rs new file mode 100644 index 0000000000000..e057aca7d5d26 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/sync/once_lock.rs @@ -0,0 +1,88 @@ +// Based on unstable std::sync::OnceLock. +// +// Source: https://github.com/rust-lang/rust/blob/8e9c93df464b7ada3fc7a1c8ccddd9dcb24ee0a0/library/std/src/sync/once_lock.rs + +use core::cell::UnsafeCell; +use core::mem::MaybeUninit; +use std::sync::Once; + +pub(crate) struct OnceLock { + once: Once, + value: UnsafeCell>, + // Unlike std::sync::OnceLock, we don't need PhantomData here because + // we don't use #[may_dangle]. +} + +unsafe impl Sync for OnceLock {} +unsafe impl Send for OnceLock {} + +impl OnceLock { + /// Creates a new empty cell. + #[must_use] + pub(crate) const fn new() -> Self { + Self { + once: Once::new(), + value: UnsafeCell::new(MaybeUninit::uninit()), + } + } + + /// Gets the contents of the cell, initializing it with `f` if the cell + /// was empty. + /// + /// Many threads may call `get_or_init` concurrently with different + /// initializing functions, but it is guaranteed that only one function + /// will be executed. + /// + /// # Panics + /// + /// If `f` panics, the panic is propagated to the caller, and the cell + /// remains uninitialized. + /// + /// It is an error to reentrantly initialize the cell from `f`. The + /// exact outcome is unspecified. Current implementation deadlocks, but + /// this may be changed to a panic in the future. 
+ pub(crate) fn get_or_init(&self, f: F) -> &T + where + F: FnOnce() -> T, + { + // Fast path check + if self.once.is_completed() { + // SAFETY: The inner value has been initialized + return unsafe { self.get_unchecked() }; + } + self.initialize(f); + + // SAFETY: The inner value has been initialized + unsafe { self.get_unchecked() } + } + + #[cold] + fn initialize(&self, f: F) + where + F: FnOnce() -> T, + { + let slot = self.value.get(); + + self.once.call_once(|| { + let value = f(); + unsafe { slot.write(MaybeUninit::new(value)) } + }); + } + + /// # Safety + /// + /// The value must be initialized + unsafe fn get_unchecked(&self) -> &T { + debug_assert!(self.once.is_completed()); + &*self.value.get().cast::() + } +} + +impl Drop for OnceLock { + fn drop(&mut self) { + if self.once.is_completed() { + // SAFETY: The inner value has been initialized + unsafe { (*self.value.get()).assume_init_drop() }; + } + } +} diff --git a/crossbeam-utils-0.8.19/src/sync/parker.rs b/crossbeam-utils-0.8.19/src/sync/parker.rs new file mode 100644 index 0000000000000..263ee15184a44 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/sync/parker.rs @@ -0,0 +1,415 @@ +use crate::primitive::sync::atomic::{AtomicUsize, Ordering::SeqCst}; +use crate::primitive::sync::{Arc, Condvar, Mutex}; +use std::fmt; +use std::marker::PhantomData; +use std::time::{Duration, Instant}; + +/// A thread parking primitive. +/// +/// Conceptually, each `Parker` has an associated token which is initially not present: +/// +/// * The [`park`] method blocks the current thread unless or until the token is available, at +/// which point it automatically consumes the token. +/// +/// * The [`park_timeout`] and [`park_deadline`] methods work the same as [`park`], but block for +/// a specified maximum time. +/// +/// * The [`unpark`] method atomically makes the token available if it wasn't already. 
Because the +/// token is initially absent, [`unpark`] followed by [`park`] will result in the second call +/// returning immediately. +/// +/// In other words, each `Parker` acts a bit like a spinlock that can be locked and unlocked using +/// [`park`] and [`unpark`]. +/// +/// # Examples +/// +/// ``` +/// use std::thread; +/// use std::time::Duration; +/// use crossbeam_utils::sync::Parker; +/// +/// let p = Parker::new(); +/// let u = p.unparker().clone(); +/// +/// // Make the token available. +/// u.unpark(); +/// // Wakes up immediately and consumes the token. +/// p.park(); +/// +/// thread::spawn(move || { +/// thread::sleep(Duration::from_millis(500)); +/// u.unpark(); +/// }); +/// +/// // Wakes up when `u.unpark()` provides the token. +/// p.park(); +/// # std::thread::sleep(std::time::Duration::from_millis(500)); // wait for background threads closed: https://github.com/rust-lang/miri/issues/1371 +/// ``` +/// +/// [`park`]: Parker::park +/// [`park_timeout`]: Parker::park_timeout +/// [`park_deadline`]: Parker::park_deadline +/// [`unpark`]: Unparker::unpark +pub struct Parker { + unparker: Unparker, + _marker: PhantomData<*const ()>, +} + +unsafe impl Send for Parker {} + +impl Default for Parker { + fn default() -> Self { + Self { + unparker: Unparker { + inner: Arc::new(Inner { + state: AtomicUsize::new(EMPTY), + lock: Mutex::new(()), + cvar: Condvar::new(), + }), + }, + _marker: PhantomData, + } + } +} + +impl Parker { + /// Creates a new `Parker`. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// ``` + /// + pub fn new() -> Parker { + Self::default() + } + + /// Blocks the current thread until the token is made available. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// let u = p.unparker().clone(); + /// + /// // Make the token available. 
+ /// u.unpark(); + /// + /// // Wakes up immediately and consumes the token. + /// p.park(); + /// ``` + pub fn park(&self) { + self.unparker.inner.park(None); + } + + /// Blocks the current thread until the token is made available, but only for a limited time. + /// + /// # Examples + /// + /// ``` + /// use std::time::Duration; + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// + /// // Waits for the token to become available, but will not wait longer than 500 ms. + /// p.park_timeout(Duration::from_millis(500)); + /// ``` + pub fn park_timeout(&self, timeout: Duration) { + match Instant::now().checked_add(timeout) { + Some(deadline) => self.park_deadline(deadline), + None => self.park(), + } + } + + /// Blocks the current thread until the token is made available, or until a certain deadline. + /// + /// # Examples + /// + /// ``` + /// use std::time::{Duration, Instant}; + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// let deadline = Instant::now() + Duration::from_millis(500); + /// + /// // Waits for the token to become available, but will not wait longer than 500 ms. + /// p.park_deadline(deadline); + /// ``` + pub fn park_deadline(&self, deadline: Instant) { + self.unparker.inner.park(Some(deadline)) + } + + /// Returns a reference to an associated [`Unparker`]. + /// + /// The returned [`Unparker`] doesn't have to be used by reference - it can also be cloned. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// let u = p.unparker().clone(); + /// + /// // Make the token available. + /// u.unpark(); + /// // Wakes up immediately and consumes the token. + /// p.park(); + /// ``` + /// + /// [`park`]: Parker::park + /// [`park_timeout`]: Parker::park_timeout + pub fn unparker(&self) -> &Unparker { + &self.unparker + } + + /// Converts a `Parker` into a raw pointer. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// let raw = Parker::into_raw(p); + /// # let _ = unsafe { Parker::from_raw(raw) }; + /// ``` + pub fn into_raw(this: Parker) -> *const () { + Unparker::into_raw(this.unparker) + } + + /// Converts a raw pointer into a `Parker`. + /// + /// # Safety + /// + /// This method is safe to use only with pointers returned by [`Parker::into_raw`]. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// let raw = Parker::into_raw(p); + /// let p = unsafe { Parker::from_raw(raw) }; + /// ``` + pub unsafe fn from_raw(ptr: *const ()) -> Parker { + Parker { + unparker: Unparker::from_raw(ptr), + _marker: PhantomData, + } + } +} + +impl fmt::Debug for Parker { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("Parker { .. }") + } +} + +/// Unparks a thread parked by the associated [`Parker`]. +pub struct Unparker { + inner: Arc, +} + +unsafe impl Send for Unparker {} +unsafe impl Sync for Unparker {} + +impl Unparker { + /// Atomically makes the token available if it is not already. + /// + /// This method will wake up the thread blocked on [`park`] or [`park_timeout`], if there is + /// any. + /// + /// # Examples + /// + /// ``` + /// use std::thread; + /// use std::time::Duration; + /// use crossbeam_utils::sync::Parker; + /// + /// let p = Parker::new(); + /// let u = p.unparker().clone(); + /// + /// thread::spawn(move || { + /// thread::sleep(Duration::from_millis(500)); + /// u.unpark(); + /// }); + /// + /// // Wakes up when `u.unpark()` provides the token. 
+ /// p.park(); + /// # std::thread::sleep(std::time::Duration::from_millis(500)); // wait for background threads closed: https://github.com/rust-lang/miri/issues/1371 + /// ``` + /// + /// [`park`]: Parker::park + /// [`park_timeout`]: Parker::park_timeout + pub fn unpark(&self) { + self.inner.unpark() + } + + /// Converts an `Unparker` into a raw pointer. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::{Parker, Unparker}; + /// + /// let p = Parker::new(); + /// let u = p.unparker().clone(); + /// let raw = Unparker::into_raw(u); + /// # let _ = unsafe { Unparker::from_raw(raw) }; + /// ``` + pub fn into_raw(this: Unparker) -> *const () { + Arc::into_raw(this.inner).cast::<()>() + } + + /// Converts a raw pointer into an `Unparker`. + /// + /// # Safety + /// + /// This method is safe to use only with pointers returned by [`Unparker::into_raw`]. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::{Parker, Unparker}; + /// + /// let p = Parker::new(); + /// let u = p.unparker().clone(); + /// + /// let raw = Unparker::into_raw(u); + /// let u = unsafe { Unparker::from_raw(raw) }; + /// ``` + pub unsafe fn from_raw(ptr: *const ()) -> Unparker { + Unparker { + inner: Arc::from_raw(ptr.cast::()), + } + } +} + +impl fmt::Debug for Unparker { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("Unparker { .. }") + } +} + +impl Clone for Unparker { + fn clone(&self) -> Unparker { + Unparker { + inner: self.inner.clone(), + } + } +} + +const EMPTY: usize = 0; +const PARKED: usize = 1; +const NOTIFIED: usize = 2; + +struct Inner { + state: AtomicUsize, + lock: Mutex<()>, + cvar: Condvar, +} + +impl Inner { + fn park(&self, deadline: Option) { + // If we were previously notified then we consume this notification and return quickly. + if self + .state + .compare_exchange(NOTIFIED, EMPTY, SeqCst, SeqCst) + .is_ok() + { + return; + } + + // If the timeout is zero, then there is no need to actually block. 
+ if let Some(deadline) = deadline { + if deadline <= Instant::now() { + return; + } + } + + // Otherwise we need to coordinate going to sleep. + let mut m = self.lock.lock().unwrap(); + + match self.state.compare_exchange(EMPTY, PARKED, SeqCst, SeqCst) { + Ok(_) => {} + // Consume this notification to avoid spurious wakeups in the next park. + Err(NOTIFIED) => { + // We must read `state` here, even though we know it will be `NOTIFIED`. This is + // because `unpark` may have been called again since we read `NOTIFIED` in the + // `compare_exchange` above. We must perform an acquire operation that synchronizes + // with that `unpark` to observe any writes it made before the call to `unpark`. To + // do that we must read from the write it made to `state`. + let old = self.state.swap(EMPTY, SeqCst); + assert_eq!(old, NOTIFIED, "park state changed unexpectedly"); + return; + } + Err(n) => panic!("inconsistent park_timeout state: {}", n), + } + + loop { + // Block the current thread on the conditional variable. + m = match deadline { + None => self.cvar.wait(m).unwrap(), + Some(deadline) => { + let now = Instant::now(); + if now < deadline { + // We could check for a timeout here, in the return value of wait_timeout, + // but in the case that a timeout and an unpark arrive simultaneously, we + // prefer to report the former. + self.cvar.wait_timeout(m, deadline - now).unwrap().0 + } else { + // We've timed out; swap out the state back to empty on our way out + match self.state.swap(EMPTY, SeqCst) { + NOTIFIED | PARKED => return, + n => panic!("inconsistent park_timeout state: {}", n), + } + } + } + }; + + if self + .state + .compare_exchange(NOTIFIED, EMPTY, SeqCst, SeqCst) + .is_ok() + { + // got a notification + return; + } + + // Spurious wakeup, go back to sleep. 
Alternatively, if we timed out, it will be caught + // in the branch above, when we discover the deadline is in the past + } + } + + pub(crate) fn unpark(&self) { + // To ensure the unparked thread will observe any writes we made before this call, we must + // perform a release operation that `park` can synchronize with. To do that we must write + // `NOTIFIED` even if `state` is already `NOTIFIED`. That is why this must be a swap rather + // than a compare-and-swap that returns if it reads `NOTIFIED` on failure. + match self.state.swap(NOTIFIED, SeqCst) { + EMPTY => return, // no one was waiting + NOTIFIED => return, // already unparked + PARKED => {} // gotta go wake someone up + _ => panic!("inconsistent state in unpark"), + } + + // There is a period between when the parked thread sets `state` to `PARKED` (or last + // checked `state` in the case of a spurious wakeup) and when it actually waits on `cvar`. + // If we were to notify during this period it would be ignored and then when the parked + // thread went to sleep it would never wake up. Fortunately, it has `lock` locked at this + // stage so we can acquire `lock` to wait until it is ready to receive the notification. + // + // Releasing `lock` before the call to `notify_one` means that when the parked thread wakes + // it doesn't get woken only to have to wait for us to release `lock`. 
+ drop(self.lock.lock().unwrap()); + self.cvar.notify_one(); + } +} diff --git a/crossbeam-utils-0.8.19/src/sync/sharded_lock.rs b/crossbeam-utils-0.8.19/src/sync/sharded_lock.rs new file mode 100644 index 0000000000000..5aee56f88c005 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/sync/sharded_lock.rs @@ -0,0 +1,636 @@ +use std::cell::UnsafeCell; +use std::collections::HashMap; +use std::fmt; +use std::marker::PhantomData; +use std::mem; +use std::ops::{Deref, DerefMut}; +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::sync::{LockResult, PoisonError, TryLockError, TryLockResult}; +use std::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use std::thread::{self, ThreadId}; + +use crate::sync::once_lock::OnceLock; +use crate::CachePadded; + +/// The number of shards per sharded lock. Must be a power of two. +const NUM_SHARDS: usize = 8; + +/// A shard containing a single reader-writer lock. +struct Shard { + /// The inner reader-writer lock. + lock: RwLock<()>, + + /// The write-guard keeping this shard locked. + /// + /// Write operations will lock each shard and store the guard here. These guards get dropped at + /// the same time the big guard is dropped. + write_guard: UnsafeCell>>, +} + +/// A sharded reader-writer lock. +/// +/// This lock is equivalent to [`RwLock`], except read operations are faster and write operations +/// are slower. +/// +/// A `ShardedLock` is internally made of a list of *shards*, each being a [`RwLock`] occupying a +/// single cache line. Read operations will pick one of the shards depending on the current thread +/// and lock it. Write operations need to lock all shards in succession. +/// +/// By splitting the lock into shards, concurrent read operations will in most cases choose +/// different shards and thus update different cache lines, which is good for scalability. However, +/// write operations need to do more work and are therefore slower than usual. 
+/// +/// The priority policy of the lock is dependent on the underlying operating system's +/// implementation, and this type does not guarantee that any particular policy will be used. +/// +/// # Poisoning +/// +/// A `ShardedLock`, like [`RwLock`], will become poisoned on a panic. Note that it may only be +/// poisoned if a panic occurs while a write operation is in progress. If a panic occurs in any +/// read operation, the lock will not be poisoned. +/// +/// # Examples +/// +/// ``` +/// use crossbeam_utils::sync::ShardedLock; +/// +/// let lock = ShardedLock::new(5); +/// +/// // Any number of read locks can be held at once. +/// { +/// let r1 = lock.read().unwrap(); +/// let r2 = lock.read().unwrap(); +/// assert_eq!(*r1, 5); +/// assert_eq!(*r2, 5); +/// } // Read locks are dropped at this point. +/// +/// // However, only one write lock may be held. +/// { +/// let mut w = lock.write().unwrap(); +/// *w += 1; +/// assert_eq!(*w, 6); +/// } // Write lock is dropped here. +/// ``` +/// +/// [`RwLock`]: std::sync::RwLock +pub struct ShardedLock { + /// A list of locks protecting the internal data. + shards: Box<[CachePadded]>, + + /// The internal data. + value: UnsafeCell, +} + +unsafe impl Send for ShardedLock {} +unsafe impl Sync for ShardedLock {} + +impl UnwindSafe for ShardedLock {} +impl RefUnwindSafe for ShardedLock {} + +impl ShardedLock { + /// Creates a new sharded reader-writer lock. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// + /// let lock = ShardedLock::new(5); + /// ``` + pub fn new(value: T) -> ShardedLock { + ShardedLock { + shards: (0..NUM_SHARDS) + .map(|_| { + CachePadded::new(Shard { + lock: RwLock::new(()), + write_guard: UnsafeCell::new(None), + }) + }) + .collect::>(), + value: UnsafeCell::new(value), + } + } + + /// Consumes this lock, returning the underlying data. + /// + /// # Errors + /// + /// This method will return an error if the lock is poisoned. 
A lock gets poisoned when a write + /// operation panics. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// + /// let lock = ShardedLock::new(String::new()); + /// { + /// let mut s = lock.write().unwrap(); + /// *s = "modified".to_owned(); + /// } + /// assert_eq!(lock.into_inner().unwrap(), "modified"); + /// ``` + pub fn into_inner(self) -> LockResult { + let is_poisoned = self.is_poisoned(); + let inner = self.value.into_inner(); + + if is_poisoned { + Err(PoisonError::new(inner)) + } else { + Ok(inner) + } + } +} + +impl ShardedLock { + /// Returns `true` if the lock is poisoned. + /// + /// If another thread can still access the lock, it may become poisoned at any time. A `false` + /// result should not be trusted without additional synchronization. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// use std::sync::Arc; + /// use std::thread; + /// + /// let lock = Arc::new(ShardedLock::new(0)); + /// let c_lock = lock.clone(); + /// + /// let _ = thread::spawn(move || { + /// let _lock = c_lock.write().unwrap(); + /// panic!(); // the lock gets poisoned + /// }).join(); + /// assert_eq!(lock.is_poisoned(), true); + /// ``` + pub fn is_poisoned(&self) -> bool { + self.shards[0].lock.is_poisoned() + } + + /// Returns a mutable reference to the underlying data. + /// + /// Since this call borrows the lock mutably, no actual locking needs to take place. + /// + /// # Errors + /// + /// This method will return an error if the lock is poisoned. A lock gets poisoned when a write + /// operation panics. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// + /// let mut lock = ShardedLock::new(0); + /// *lock.get_mut().unwrap() = 10; + /// assert_eq!(*lock.read().unwrap(), 10); + /// ``` + pub fn get_mut(&mut self) -> LockResult<&mut T> { + let is_poisoned = self.is_poisoned(); + let inner = unsafe { &mut *self.value.get() }; + + if is_poisoned { + Err(PoisonError::new(inner)) + } else { + Ok(inner) + } + } + + /// Attempts to acquire this lock with shared read access. + /// + /// If the access could not be granted at this time, an error is returned. Otherwise, a guard + /// is returned which will release the shared access when it is dropped. This method does not + /// provide any guarantees with respect to the ordering of whether contentious readers or + /// writers will acquire the lock first. + /// + /// # Errors + /// + /// This method will return an error if the lock is poisoned. A lock gets poisoned when a write + /// operation panics. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// + /// let lock = ShardedLock::new(1); + /// + /// match lock.try_read() { + /// Ok(n) => assert_eq!(*n, 1), + /// Err(_) => unreachable!(), + /// }; + /// ``` + pub fn try_read(&self) -> TryLockResult> { + // Take the current thread index and map it to a shard index. Thread indices will tend to + // distribute shards among threads equally, thus reducing contention due to read-locking. 
+ let current_index = current_index().unwrap_or(0); + let shard_index = current_index & (self.shards.len() - 1); + + match self.shards[shard_index].lock.try_read() { + Ok(guard) => Ok(ShardedLockReadGuard { + lock: self, + _guard: guard, + _marker: PhantomData, + }), + Err(TryLockError::Poisoned(err)) => { + let guard = ShardedLockReadGuard { + lock: self, + _guard: err.into_inner(), + _marker: PhantomData, + }; + Err(TryLockError::Poisoned(PoisonError::new(guard))) + } + Err(TryLockError::WouldBlock) => Err(TryLockError::WouldBlock), + } + } + + /// Locks with shared read access, blocking the current thread until it can be acquired. + /// + /// The calling thread will be blocked until there are no more writers which hold the lock. + /// There may be other readers currently inside the lock when this method returns. This method + /// does not provide any guarantees with respect to the ordering of whether contentious readers + /// or writers will acquire the lock first. + /// + /// Returns a guard which will release the shared access when dropped. + /// + /// # Errors + /// + /// This method will return an error if the lock is poisoned. A lock gets poisoned when a write + /// operation panics. + /// + /// # Panics + /// + /// This method might panic when called if the lock is already held by the current thread. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// use std::sync::Arc; + /// use std::thread; + /// + /// let lock = Arc::new(ShardedLock::new(1)); + /// let c_lock = lock.clone(); + /// + /// let n = lock.read().unwrap(); + /// assert_eq!(*n, 1); + /// + /// thread::spawn(move || { + /// let r = c_lock.read(); + /// assert!(r.is_ok()); + /// }).join().unwrap(); + /// ``` + pub fn read(&self) -> LockResult> { + // Take the current thread index and map it to a shard index. Thread indices will tend to + // distribute shards among threads equally, thus reducing contention due to read-locking. 
+ let current_index = current_index().unwrap_or(0); + let shard_index = current_index & (self.shards.len() - 1); + + match self.shards[shard_index].lock.read() { + Ok(guard) => Ok(ShardedLockReadGuard { + lock: self, + _guard: guard, + _marker: PhantomData, + }), + Err(err) => Err(PoisonError::new(ShardedLockReadGuard { + lock: self, + _guard: err.into_inner(), + _marker: PhantomData, + })), + } + } + + /// Attempts to acquire this lock with exclusive write access. + /// + /// If the access could not be granted at this time, an error is returned. Otherwise, a guard + /// is returned which will release the exclusive access when it is dropped. This method does + /// not provide any guarantees with respect to the ordering of whether contentious readers or + /// writers will acquire the lock first. + /// + /// # Errors + /// + /// This method will return an error if the lock is poisoned. A lock gets poisoned when a write + /// operation panics. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// + /// let lock = ShardedLock::new(1); + /// + /// let n = lock.read().unwrap(); + /// assert_eq!(*n, 1); + /// + /// assert!(lock.try_write().is_err()); + /// ``` + pub fn try_write(&self) -> TryLockResult> { + let mut poisoned = false; + let mut blocked = None; + + // Write-lock each shard in succession. + for (i, shard) in self.shards.iter().enumerate() { + let guard = match shard.lock.try_write() { + Ok(guard) => guard, + Err(TryLockError::Poisoned(err)) => { + poisoned = true; + err.into_inner() + } + Err(TryLockError::WouldBlock) => { + blocked = Some(i); + break; + } + }; + + // Store the guard into the shard. + unsafe { + let guard: RwLockWriteGuard<'static, ()> = mem::transmute(guard); + let dest: *mut _ = shard.write_guard.get(); + *dest = Some(guard); + } + } + + if let Some(i) = blocked { + // Unlock the shards in reverse order of locking. 
+ for shard in self.shards[0..i].iter().rev() { + unsafe { + let dest: *mut _ = shard.write_guard.get(); + let guard = (*dest).take(); + drop(guard); + } + } + Err(TryLockError::WouldBlock) + } else if poisoned { + let guard = ShardedLockWriteGuard { + lock: self, + _marker: PhantomData, + }; + Err(TryLockError::Poisoned(PoisonError::new(guard))) + } else { + Ok(ShardedLockWriteGuard { + lock: self, + _marker: PhantomData, + }) + } + } + + /// Locks with exclusive write access, blocking the current thread until it can be acquired. + /// + /// The calling thread will be blocked until there are no more writers which hold the lock. + /// There may be other readers currently inside the lock when this method returns. This method + /// does not provide any guarantees with respect to the ordering of whether contentious readers + /// or writers will acquire the lock first. + /// + /// Returns a guard which will release the exclusive access when dropped. + /// + /// # Errors + /// + /// This method will return an error if the lock is poisoned. A lock gets poisoned when a write + /// operation panics. + /// + /// # Panics + /// + /// This method might panic when called if the lock is already held by the current thread. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::ShardedLock; + /// + /// let lock = ShardedLock::new(1); + /// + /// let mut n = lock.write().unwrap(); + /// *n = 2; + /// + /// assert!(lock.try_read().is_err()); + /// ``` + pub fn write(&self) -> LockResult> { + let mut poisoned = false; + + // Write-lock each shard in succession. + for shard in self.shards.iter() { + let guard = match shard.lock.write() { + Ok(guard) => guard, + Err(err) => { + poisoned = true; + err.into_inner() + } + }; + + // Store the guard into the shard. 
+ unsafe { + let guard: RwLockWriteGuard<'_, ()> = guard; + let guard: RwLockWriteGuard<'static, ()> = mem::transmute(guard); + let dest: *mut _ = shard.write_guard.get(); + *dest = Some(guard); + } + } + + if poisoned { + Err(PoisonError::new(ShardedLockWriteGuard { + lock: self, + _marker: PhantomData, + })) + } else { + Ok(ShardedLockWriteGuard { + lock: self, + _marker: PhantomData, + }) + } + } +} + +impl fmt::Debug for ShardedLock { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.try_read() { + Ok(guard) => f + .debug_struct("ShardedLock") + .field("data", &&*guard) + .finish(), + Err(TryLockError::Poisoned(err)) => f + .debug_struct("ShardedLock") + .field("data", &&**err.get_ref()) + .finish(), + Err(TryLockError::WouldBlock) => { + struct LockedPlaceholder; + impl fmt::Debug for LockedPlaceholder { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("") + } + } + f.debug_struct("ShardedLock") + .field("data", &LockedPlaceholder) + .finish() + } + } + } +} + +impl Default for ShardedLock { + fn default() -> ShardedLock { + ShardedLock::new(Default::default()) + } +} + +impl From for ShardedLock { + fn from(t: T) -> Self { + ShardedLock::new(t) + } +} + +/// A guard used to release the shared read access of a [`ShardedLock`] when dropped. 
+#[clippy::has_significant_drop] +pub struct ShardedLockReadGuard<'a, T: ?Sized> { + lock: &'a ShardedLock, + _guard: RwLockReadGuard<'a, ()>, + _marker: PhantomData>, +} + +unsafe impl Sync for ShardedLockReadGuard<'_, T> {} + +impl Deref for ShardedLockReadGuard<'_, T> { + type Target = T; + + fn deref(&self) -> &T { + unsafe { &*self.lock.value.get() } + } +} + +impl fmt::Debug for ShardedLockReadGuard<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ShardedLockReadGuard") + .field("lock", &self.lock) + .finish() + } +} + +impl fmt::Display for ShardedLockReadGuard<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + (**self).fmt(f) + } +} + +/// A guard used to release the exclusive write access of a [`ShardedLock`] when dropped. +#[clippy::has_significant_drop] +pub struct ShardedLockWriteGuard<'a, T: ?Sized> { + lock: &'a ShardedLock, + _marker: PhantomData>, +} + +unsafe impl Sync for ShardedLockWriteGuard<'_, T> {} + +impl Drop for ShardedLockWriteGuard<'_, T> { + fn drop(&mut self) { + // Unlock the shards in reverse order of locking. + for shard in self.lock.shards.iter().rev() { + unsafe { + let dest: *mut _ = shard.write_guard.get(); + let guard = (*dest).take(); + drop(guard); + } + } + } +} + +impl fmt::Debug for ShardedLockWriteGuard<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ShardedLockWriteGuard") + .field("lock", &self.lock) + .finish() + } +} + +impl fmt::Display for ShardedLockWriteGuard<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + (**self).fmt(f) + } +} + +impl Deref for ShardedLockWriteGuard<'_, T> { + type Target = T; + + fn deref(&self) -> &T { + unsafe { &*self.lock.value.get() } + } +} + +impl DerefMut for ShardedLockWriteGuard<'_, T> { + fn deref_mut(&mut self) -> &mut T { + unsafe { &mut *self.lock.value.get() } + } +} + +/// Returns a `usize` that identifies the current thread. 
+/// +/// Each thread is associated with an 'index'. While there are no particular guarantees, indices +/// usually tend to be consecutive numbers between 0 and the number of running threads. +/// +/// Since this function accesses TLS, `None` might be returned if the current thread's TLS is +/// tearing down. +#[inline] +fn current_index() -> Option { + REGISTRATION.try_with(|reg| reg.index).ok() +} + +/// The global registry keeping track of registered threads and indices. +struct ThreadIndices { + /// Mapping from `ThreadId` to thread index. + mapping: HashMap, + + /// A list of free indices. + free_list: Vec, + + /// The next index to allocate if the free list is empty. + next_index: usize, +} + +fn thread_indices() -> &'static Mutex { + static THREAD_INDICES: OnceLock> = OnceLock::new(); + fn init() -> Mutex { + Mutex::new(ThreadIndices { + mapping: HashMap::new(), + free_list: Vec::new(), + next_index: 0, + }) + } + THREAD_INDICES.get_or_init(init) +} + +/// A registration of a thread with an index. +/// +/// When dropped, unregisters the thread and frees the reserved index. +struct Registration { + index: usize, + thread_id: ThreadId, +} + +impl Drop for Registration { + fn drop(&mut self) { + let mut indices = thread_indices().lock().unwrap(); + indices.mapping.remove(&self.thread_id); + indices.free_list.push(self.index); + } +} + +thread_local! 
{ + static REGISTRATION: Registration = { + let thread_id = thread::current().id(); + let mut indices = thread_indices().lock().unwrap(); + + let index = match indices.free_list.pop() { + Some(i) => i, + None => { + let i = indices.next_index; + indices.next_index += 1; + i + } + }; + indices.mapping.insert(thread_id, index); + + Registration { + index, + thread_id, + } + }; +} diff --git a/crossbeam-utils-0.8.19/src/sync/wait_group.rs b/crossbeam-utils-0.8.19/src/sync/wait_group.rs new file mode 100644 index 0000000000000..19d6074157078 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/sync/wait_group.rs @@ -0,0 +1,145 @@ +use crate::primitive::sync::{Arc, Condvar, Mutex}; +use std::fmt; + +/// Enables threads to synchronize the beginning or end of some computation. +/// +/// # Wait groups vs barriers +/// +/// `WaitGroup` is very similar to [`Barrier`], but there are a few differences: +/// +/// * [`Barrier`] needs to know the number of threads at construction, while `WaitGroup` is cloned to +/// register more threads. +/// +/// * A [`Barrier`] can be reused even after all threads have synchronized, while a `WaitGroup` +/// synchronizes threads only once. +/// +/// * All threads wait for others to reach the [`Barrier`]. With `WaitGroup`, each thread can choose +/// to either wait for other threads or to continue without blocking. +/// +/// # Examples +/// +/// ``` +/// use crossbeam_utils::sync::WaitGroup; +/// use std::thread; +/// +/// // Create a new wait group. +/// let wg = WaitGroup::new(); +/// +/// for _ in 0..4 { +/// // Create another reference to the wait group. +/// let wg = wg.clone(); +/// +/// thread::spawn(move || { +/// // Do some work. +/// +/// // Drop the reference to the wait group. +/// drop(wg); +/// }); +/// } +/// +/// // Block until all threads have finished their work. 
+/// wg.wait(); +/// # std::thread::sleep(std::time::Duration::from_millis(500)); // wait for background threads closed: https://github.com/rust-lang/miri/issues/1371 +/// ``` +/// +/// [`Barrier`]: std::sync::Barrier +pub struct WaitGroup { + inner: Arc, +} + +/// Inner state of a `WaitGroup`. +struct Inner { + cvar: Condvar, + count: Mutex, +} + +impl Default for WaitGroup { + fn default() -> Self { + Self { + inner: Arc::new(Inner { + cvar: Condvar::new(), + count: Mutex::new(1), + }), + } + } +} + +impl WaitGroup { + /// Creates a new wait group and returns the single reference to it. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::WaitGroup; + /// + /// let wg = WaitGroup::new(); + /// ``` + pub fn new() -> Self { + Self::default() + } + + /// Drops this reference and waits until all other references are dropped. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::sync::WaitGroup; + /// use std::thread; + /// + /// let wg = WaitGroup::new(); + /// + /// thread::spawn({ + /// let wg = wg.clone(); + /// move || { + /// // Block until both threads have reached `wait()`. + /// wg.wait(); + /// } + /// }); + /// + /// // Block until both threads have reached `wait()`. 
+ /// wg.wait(); + /// # std::thread::sleep(std::time::Duration::from_millis(500)); // wait for background threads closed: https://github.com/rust-lang/miri/issues/1371 + /// ``` + pub fn wait(self) { + if *self.inner.count.lock().unwrap() == 1 { + return; + } + + let inner = self.inner.clone(); + drop(self); + + let mut count = inner.count.lock().unwrap(); + while *count > 0 { + count = inner.cvar.wait(count).unwrap(); + } + } +} + +impl Drop for WaitGroup { + fn drop(&mut self) { + let mut count = self.inner.count.lock().unwrap(); + *count -= 1; + + if *count == 0 { + self.inner.cvar.notify_all(); + } + } +} + +impl Clone for WaitGroup { + fn clone(&self) -> WaitGroup { + let mut count = self.inner.count.lock().unwrap(); + *count += 1; + + WaitGroup { + inner: self.inner.clone(), + } + } +} + +impl fmt::Debug for WaitGroup { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let count: &usize = &*self.inner.count.lock().unwrap(); + f.debug_struct("WaitGroup").field("count", count).finish() + } +} diff --git a/crossbeam-utils-0.8.19/src/thread.rs b/crossbeam-utils-0.8.19/src/thread.rs new file mode 100644 index 0000000000000..b2e063ae60d18 --- /dev/null +++ b/crossbeam-utils-0.8.19/src/thread.rs @@ -0,0 +1,608 @@ +//! Threads that can borrow variables from the stack. +//! +//! Create a scope when spawned threads need to access variables on the stack: +//! +//! ``` +//! use crossbeam_utils::thread; +//! +//! let people = vec![ +//! "Alice".to_string(), +//! "Bob".to_string(), +//! "Carol".to_string(), +//! ]; +//! +//! thread::scope(|s| { +//! for person in &people { +//! s.spawn(move |_| { +//! println!("Hello, {}!", person); +//! }); +//! } +//! }).unwrap(); +//! ``` +//! +//! # Why scoped threads? +//! +//! Suppose we wanted to re-write the previous example using plain threads: +//! +//! ```compile_fail,E0597 +//! use std::thread; +//! +//! let people = vec![ +//! "Alice".to_string(), +//! "Bob".to_string(), +//! "Carol".to_string(), +//! ]; +//! 
+//! let mut threads = Vec::new(); +//! +//! for person in &people { +//! threads.push(thread::spawn(move || { +//! println!("Hello, {}!", person); +//! })); +//! } +//! +//! for thread in threads { +//! thread.join().unwrap(); +//! } +//! ``` +//! +//! This doesn't work because the borrow checker complains about `people` not living long enough: +//! +//! ```text +//! error[E0597]: `people` does not live long enough +//! --> src/main.rs:12:20 +//! | +//! 12 | for person in &people { +//! | ^^^^^^ borrowed value does not live long enough +//! ... +//! 21 | } +//! | - borrowed value only lives until here +//! | +//! = note: borrowed value must be valid for the static lifetime... +//! ``` +//! +//! The problem here is that spawned threads are not allowed to borrow variables on stack because +//! the compiler cannot prove they will be joined before `people` is destroyed. +//! +//! Scoped threads are a mechanism to guarantee to the compiler that spawned threads will be joined +//! before the scope ends. +//! +//! # How scoped threads work +//! +//! If a variable is borrowed by a thread, the thread must complete before the variable is +//! destroyed. Threads spawned using [`std::thread::spawn`] can only borrow variables with the +//! `'static` lifetime because the borrow checker cannot be sure when the thread will complete. +//! +//! A scope creates a clear boundary between variables outside the scope and threads inside the +//! scope. Whenever a scope spawns a thread, it promises to join the thread before the scope ends. +//! This way we guarantee to the borrow checker that scoped threads only live within the scope and +//! can safely access variables outside it. +//! +//! # Nesting scoped threads +//! +//! Sometimes scoped threads need to spawn more threads within the same scope. This is a little +//! tricky because argument `s` lives *inside* the invocation of `thread::scope()` and as such +//! cannot be borrowed by scoped threads: +//! +//! ```compile_fail,E0521 +//! 
use crossbeam_utils::thread; +//! +//! thread::scope(|s| { +//! s.spawn(|_| { +//! // Not going to compile because we're trying to borrow `s`, +//! // which lives *inside* the scope! :( +//! s.spawn(|_| println!("nested thread")); +//! }); +//! }); +//! ``` +//! +//! Fortunately, there is a solution. Every scoped thread is passed a reference to its scope as an +//! argument, which can be used for spawning nested threads: +//! +//! ``` +//! use crossbeam_utils::thread; +//! +//! thread::scope(|s| { +//! // Note the `|s|` here. +//! s.spawn(|s| { +//! // Yay, this works because we're using a fresh argument `s`! :) +//! s.spawn(|_| println!("nested thread")); +//! }); +//! }).unwrap(); +//! ``` + +use std::fmt; +use std::io; +use std::marker::PhantomData; +use std::mem; +use std::panic; +use std::sync::{Arc, Mutex}; +use std::thread; + +use crate::sync::WaitGroup; + +type SharedVec = Arc>>; +type SharedOption = Arc>>; + +/// Creates a new scope for spawning threads. +/// +/// All child threads that haven't been manually joined will be automatically joined just before +/// this function invocation ends. If all joined threads have successfully completed, `Ok` is +/// returned with the return value of `f`. If any of the joined threads has panicked, an `Err` is +/// returned containing errors from panicked threads. Note that if panics are implemented by +/// aborting the process, no error is returned; see the notes of [std::panic::catch_unwind]. +/// +/// **Note:** Since Rust 1.63, this function is soft-deprecated in favor of the more efficient [`std::thread::scope`]. 
+/// +/// # Examples +/// +/// ``` +/// use crossbeam_utils::thread; +/// +/// let var = vec![1, 2, 3]; +/// +/// thread::scope(|s| { +/// s.spawn(|_| { +/// println!("A child thread borrowing `var`: {:?}", var); +/// }); +/// }).unwrap(); +/// ``` +pub fn scope<'env, F, R>(f: F) -> thread::Result +where + F: FnOnce(&Scope<'env>) -> R, +{ + struct AbortOnPanic; + impl Drop for AbortOnPanic { + fn drop(&mut self) { + if thread::panicking() { + std::process::abort(); + } + } + } + + let wg = WaitGroup::new(); + let scope = Scope::<'env> { + handles: SharedVec::default(), + wait_group: wg.clone(), + _marker: PhantomData, + }; + + // Execute the scoped function, but catch any panics. + let result = panic::catch_unwind(panic::AssertUnwindSafe(|| f(&scope))); + + // If an unwinding panic occurs before all threads are joined + // promote it to an aborting panic to prevent any threads from escaping the scope. + let guard = AbortOnPanic; + + // Wait until all nested scopes are dropped. + drop(scope.wait_group); + wg.wait(); + + // Join all remaining spawned threads. + let panics: Vec<_> = scope + .handles + .lock() + .unwrap() + // Filter handles that haven't been joined, join them, and collect errors. + .drain(..) + .filter_map(|handle| handle.lock().unwrap().take()) + .filter_map(|handle| handle.join().err()) + .collect(); + + mem::forget(guard); + + // If `f` has panicked, resume unwinding. + // If any of the child threads have panicked, return the panic errors. + // Otherwise, everything is OK and return the result of `f`. + match result { + Err(err) => panic::resume_unwind(err), + Ok(res) => { + if panics.is_empty() { + Ok(res) + } else { + Err(Box::new(panics)) + } + } + } +} + +/// A scope for spawning threads. +pub struct Scope<'env> { + /// The list of the thread join handles. + handles: SharedVec>>, + + /// Used to wait until all subscopes are dropped. + wait_group: WaitGroup, + + /// Borrows data with invariant lifetime `'env`.
+ _marker: PhantomData<&'env mut &'env ()>, +} + +unsafe impl Sync for Scope<'_> {} + +impl<'env> Scope<'env> { + /// Spawns a scoped thread. + /// + /// This method is similar to the [`spawn`] function in Rust's standard library. The difference + /// is that this thread is scoped, meaning it's guaranteed to terminate before the scope exits, + /// allowing it to reference variables outside the scope. + /// + /// The scoped thread is passed a reference to this scope as an argument, which can be used for + /// spawning nested threads. + /// + /// The returned [handle](ScopedJoinHandle) can be used to manually + /// [join](ScopedJoinHandle::join) the thread before the scope exits. + /// + /// This will create a thread using default parameters of [`ScopedThreadBuilder`], if you want to specify the + /// stack size or the name of the thread, use this API instead. + /// + /// [`spawn`]: std::thread::spawn + /// + /// # Panics + /// + /// Panics if the OS fails to create a thread; use [`ScopedThreadBuilder::spawn`] + /// to recover from such errors. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// + /// thread::scope(|s| { + /// let handle = s.spawn(|_| { + /// println!("A child thread is running"); + /// 42 + /// }); + /// + /// // Join the thread and retrieve its result. + /// let res = handle.join().unwrap(); + /// assert_eq!(res, 42); + /// }).unwrap(); + /// ``` + pub fn spawn<'scope, F, T>(&'scope self, f: F) -> ScopedJoinHandle<'scope, T> + where + F: FnOnce(&Scope<'env>) -> T, + F: Send + 'env, + T: Send + 'env, + { + self.builder() + .spawn(f) + .expect("failed to spawn scoped thread") + } + + /// Creates a builder that can configure a thread before spawning. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// + /// thread::scope(|s| { + /// s.builder() + /// .spawn(|_| println!("A child thread is running")) + /// .unwrap(); + /// }).unwrap(); + /// ``` + pub fn builder<'scope>(&'scope self) -> ScopedThreadBuilder<'scope, 'env> { + ScopedThreadBuilder { + scope: self, + builder: thread::Builder::new(), + } + } +} + +impl fmt::Debug for Scope<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("Scope { .. }") + } +} + +/// Configures the properties of a new thread. +/// +/// The two configurable properties are: +/// +/// - [`name`]: Specifies an [associated name for the thread][naming-threads]. +/// - [`stack_size`]: Specifies the [desired stack size for the thread][stack-size]. +/// +/// The [`spawn`] method will take ownership of the builder and return an [`io::Result`] of the +/// thread handle with the given configuration. +/// +/// The [`Scope::spawn`] method uses a builder with default configuration and unwraps its return +/// value. You may want to use this builder when you want to recover from a failure to launch a +/// thread. +/// +/// # Examples +/// +/// ``` +/// use crossbeam_utils::thread; +/// +/// thread::scope(|s| { +/// s.builder() +/// .spawn(|_| println!("Running a child thread")) +/// .unwrap(); +/// }).unwrap(); +/// ``` +/// +/// [`name`]: ScopedThreadBuilder::name +/// [`stack_size`]: ScopedThreadBuilder::stack_size +/// [`spawn`]: ScopedThreadBuilder::spawn +/// [`io::Result`]: std::io::Result +/// [naming-threads]: std::thread#naming-threads +/// [stack-size]: std::thread#stack-size +#[derive(Debug)] +pub struct ScopedThreadBuilder<'scope, 'env> { + scope: &'scope Scope<'env>, + builder: thread::Builder, +} + +impl<'scope, 'env> ScopedThreadBuilder<'scope, 'env> { + /// Sets the name for the new thread. + /// + /// The name must not contain null bytes (`\0`). + /// + /// For more information about named threads, see [here][naming-threads]. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// use std::thread::current; + /// + /// thread::scope(|s| { + /// s.builder() + /// .name("my thread".to_string()) + /// .spawn(|_| assert_eq!(current().name(), Some("my thread"))) + /// .unwrap(); + /// }).unwrap(); + /// ``` + /// + /// [naming-threads]: std::thread#naming-threads + pub fn name(mut self, name: String) -> ScopedThreadBuilder<'scope, 'env> { + self.builder = self.builder.name(name); + self + } + + /// Sets the size of the stack for the new thread. + /// + /// The stack size is measured in bytes. + /// + /// For more information about the stack size for threads, see [here][stack-size]. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// + /// thread::scope(|s| { + /// s.builder() + /// .stack_size(32 * 1024) + /// .spawn(|_| println!("Running a child thread")) + /// .unwrap(); + /// }).unwrap(); + /// ``` + /// + /// [stack-size]: std::thread#stack-size + pub fn stack_size(mut self, size: usize) -> ScopedThreadBuilder<'scope, 'env> { + self.builder = self.builder.stack_size(size); + self + } + + /// Spawns a scoped thread with this configuration. + /// + /// The scoped thread is passed a reference to this scope as an argument, which can be used for + /// spawning nested threads. + /// + /// The returned handle can be used to manually join the thread before the scope exits. + /// + /// # Errors + /// + /// Unlike the [`Scope::spawn`] method, this method yields an + /// [`io::Result`] to capture any failure to create the thread at + /// the OS level. + /// + /// [`io::Result`]: std::io::Result + /// + /// # Panics + /// + /// Panics if a thread name was set and it contained null bytes. 
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// + /// thread::scope(|s| { + /// let handle = s.builder() + /// .spawn(|_| { + /// println!("A child thread is running"); + /// 42 + /// }) + /// .unwrap(); + /// + /// // Join the thread and retrieve its result. + /// let res = handle.join().unwrap(); + /// assert_eq!(res, 42); + /// }).unwrap(); + /// ``` + pub fn spawn(self, f: F) -> io::Result> + where + F: FnOnce(&Scope<'env>) -> T, + F: Send + 'env, + T: Send + 'env, + { + // The result of `f` will be stored here. + let result = SharedOption::default(); + + // Spawn the thread and grab its join handle and thread handle. + let (handle, thread) = { + let result = Arc::clone(&result); + + // A clone of the scope that will be moved into the new thread. + let scope = Scope::<'env> { + handles: Arc::clone(&self.scope.handles), + wait_group: self.scope.wait_group.clone(), + _marker: PhantomData, + }; + + // Spawn the thread. + let handle = { + let closure = move || { + // Make sure the scope is inside the closure with the proper `'env` lifetime. + let scope: Scope<'env> = scope; + + // Run the closure. + let res = f(&scope); + + // Store the result if the closure didn't panic. + *result.lock().unwrap() = Some(res); + }; + + // Allocate `closure` on the heap and erase the `'env` bound. + let closure: Box = Box::new(closure); + let closure: Box = + unsafe { mem::transmute(closure) }; + + // Finally, spawn the closure. + self.builder.spawn(closure)? + }; + + let thread = handle.thread().clone(); + let handle = Arc::new(Mutex::new(Some(handle))); + (handle, thread) + }; + + // Add the handle to the shared list of join handles. + self.scope.handles.lock().unwrap().push(Arc::clone(&handle)); + + Ok(ScopedJoinHandle { + handle, + result, + thread, + _marker: PhantomData, + }) + } +} + +unsafe impl Send for ScopedJoinHandle<'_, T> {} +unsafe impl Sync for ScopedJoinHandle<'_, T> {} + +/// A handle that can be used to join its scoped thread. 
+/// +/// This struct is created by the [`Scope::spawn`] method and the +/// [`ScopedThreadBuilder::spawn`] method. +pub struct ScopedJoinHandle<'scope, T> { + /// A join handle to the spawned thread. + handle: SharedOption>, + + /// Holds the result of the inner closure. + result: SharedOption, + + /// A handle to the spawned thread. + thread: thread::Thread, + + /// Borrows the parent scope with lifetime `'scope`. + _marker: PhantomData<&'scope ()>, +} + +impl ScopedJoinHandle<'_, T> { + /// Waits for the thread to finish and returns its result. + /// + /// If the child thread panics, an error is returned. Note that if panics are implemented by + /// aborting the process, no error is returned; see the notes of [std::panic::catch_unwind]. + /// + /// # Panics + /// + /// This function may panic on some platforms if a thread attempts to join itself or otherwise + /// may create a deadlock with joining threads. + /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// + /// thread::scope(|s| { + /// let handle1 = s.spawn(|_| println!("I'm a happy thread :)")); + /// let handle2 = s.spawn(|_| panic!("I'm a sad thread :(")); + /// + /// // Join the first thread and verify that it succeeded. + /// let res = handle1.join(); + /// assert!(res.is_ok()); + /// + /// // Join the second thread and verify that it panicked. + /// let res = handle2.join(); + /// assert!(res.is_err()); + /// }).unwrap(); + /// ``` + pub fn join(self) -> thread::Result { + // Take out the handle. The handle will surely be available because the root scope waits + // for nested scopes before joining remaining threads. + let handle = self.handle.lock().unwrap().take().unwrap(); + + // Join the thread and then take the result out of its inner closure. + handle + .join() + .map(|()| self.result.lock().unwrap().take().unwrap()) + } + + /// Returns a handle to the underlying thread.
+ /// + /// # Examples + /// + /// ``` + /// use crossbeam_utils::thread; + /// + /// thread::scope(|s| { + /// let handle = s.spawn(|_| println!("A child thread is running")); + /// println!("The child thread ID: {:?}", handle.thread().id()); + /// }).unwrap(); + /// ``` + pub fn thread(&self) -> &thread::Thread { + &self.thread + } +} + +/// Unix-specific extensions. +#[cfg(unix)] +mod unix { + use super::ScopedJoinHandle; + use std::os::unix::thread::{JoinHandleExt, RawPthread}; + + impl JoinHandleExt for ScopedJoinHandle<'_, T> { + fn as_pthread_t(&self) -> RawPthread { + // Borrow the handle. The handle will surely be available because the root scope waits + // for nested scopes before joining remaining threads. + let handle = self.handle.lock().unwrap(); + handle.as_ref().unwrap().as_pthread_t() + } + fn into_pthread_t(self) -> RawPthread { + self.as_pthread_t() + } + } +} +/// Windows-specific extensions. +#[cfg(windows)] +mod windows { + use super::ScopedJoinHandle; + use std::os::windows::io::{AsRawHandle, IntoRawHandle, RawHandle}; + + impl AsRawHandle for ScopedJoinHandle<'_, T> { + fn as_raw_handle(&self) -> RawHandle { + // Borrow the handle. The handle will surely be available because the root scope waits + // for nested scopes before joining remaining threads. + let handle = self.handle.lock().unwrap(); + handle.as_ref().unwrap().as_raw_handle() + } + } + + impl IntoRawHandle for ScopedJoinHandle<'_, T> { + fn into_raw_handle(self) -> RawHandle { + self.as_raw_handle() + } + } +} + +impl fmt::Debug for ScopedJoinHandle<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("ScopedJoinHandle { .. 
}") + } +} diff --git a/crossbeam-utils-0.8.19/tests/atomic_cell.rs b/crossbeam-utils-0.8.19/tests/atomic_cell.rs new file mode 100644 index 0000000000000..9fe69328dfad7 --- /dev/null +++ b/crossbeam-utils-0.8.19/tests/atomic_cell.rs @@ -0,0 +1,374 @@ +use std::mem; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering::SeqCst; + +use crossbeam_utils::atomic::AtomicCell; + +#[test] +fn is_lock_free() { + struct UsizeWrap(#[allow(dead_code)] usize); + struct U8Wrap(#[allow(dead_code)] bool); + struct I16Wrap(#[allow(dead_code)] i16); + #[repr(align(8))] + struct U64Align8(#[allow(dead_code)] u64); + + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + + assert!(AtomicCell::<()>::is_lock_free()); + + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + + assert!(AtomicCell::::is_lock_free()); + assert!(AtomicCell::::is_lock_free()); + + // Sizes of both types must be equal, and the alignment of `u64` must be greater or equal than + // that of `AtomicU64`. In i686-unknown-linux-gnu, the alignment of `u64` is `4` and alignment + // of `AtomicU64` is `8`, so `AtomicCell` is not lock-free. 
+ assert_eq!( + AtomicCell::::is_lock_free(), + cfg!(target_has_atomic = "64") && std::mem::align_of::() == 8 + ); + assert_eq!(mem::size_of::(), 8); + assert_eq!(mem::align_of::(), 8); + assert_eq!( + AtomicCell::::is_lock_free(), + cfg!(target_has_atomic = "64") + ); + + // AtomicU128 is unstable + assert!(!AtomicCell::::is_lock_free()); +} + +#[test] +fn const_is_lock_free() { + const _U: bool = AtomicCell::::is_lock_free(); + const _I: bool = AtomicCell::::is_lock_free(); +} + +#[test] +fn drops_unit() { + static CNT: AtomicUsize = AtomicUsize::new(0); + CNT.store(0, SeqCst); + + #[derive(Debug, PartialEq, Eq)] + struct Foo(); + + impl Foo { + fn new() -> Foo { + CNT.fetch_add(1, SeqCst); + Foo() + } + } + + impl Drop for Foo { + fn drop(&mut self) { + CNT.fetch_sub(1, SeqCst); + } + } + + impl Default for Foo { + fn default() -> Foo { + Foo::new() + } + } + + let a = AtomicCell::new(Foo::new()); + + assert_eq!(a.swap(Foo::new()), Foo::new()); + assert_eq!(CNT.load(SeqCst), 1); + + a.store(Foo::new()); + assert_eq!(CNT.load(SeqCst), 1); + + assert_eq!(a.swap(Foo::default()), Foo::new()); + assert_eq!(CNT.load(SeqCst), 1); + + drop(a); + assert_eq!(CNT.load(SeqCst), 0); +} + +#[test] +fn drops_u8() { + static CNT: AtomicUsize = AtomicUsize::new(0); + CNT.store(0, SeqCst); + + #[derive(Debug, PartialEq, Eq)] + struct Foo(u8); + + impl Foo { + fn new(val: u8) -> Foo { + CNT.fetch_add(1, SeqCst); + Foo(val) + } + } + + impl Drop for Foo { + fn drop(&mut self) { + CNT.fetch_sub(1, SeqCst); + } + } + + impl Default for Foo { + fn default() -> Foo { + Foo::new(0) + } + } + + let a = AtomicCell::new(Foo::new(5)); + + assert_eq!(a.swap(Foo::new(6)), Foo::new(5)); + assert_eq!(a.swap(Foo::new(1)), Foo::new(6)); + assert_eq!(CNT.load(SeqCst), 1); + + a.store(Foo::new(2)); + assert_eq!(CNT.load(SeqCst), 1); + + assert_eq!(a.swap(Foo::default()), Foo::new(2)); + assert_eq!(CNT.load(SeqCst), 1); + + assert_eq!(a.swap(Foo::default()), Foo::new(0)); + 
assert_eq!(CNT.load(SeqCst), 1); + + drop(a); + assert_eq!(CNT.load(SeqCst), 0); +} + +#[test] +fn drops_usize() { + static CNT: AtomicUsize = AtomicUsize::new(0); + CNT.store(0, SeqCst); + + #[derive(Debug, PartialEq, Eq)] + struct Foo(usize); + + impl Foo { + fn new(val: usize) -> Foo { + CNT.fetch_add(1, SeqCst); + Foo(val) + } + } + + impl Drop for Foo { + fn drop(&mut self) { + CNT.fetch_sub(1, SeqCst); + } + } + + impl Default for Foo { + fn default() -> Foo { + Foo::new(0) + } + } + + let a = AtomicCell::new(Foo::new(5)); + + assert_eq!(a.swap(Foo::new(6)), Foo::new(5)); + assert_eq!(a.swap(Foo::new(1)), Foo::new(6)); + assert_eq!(CNT.load(SeqCst), 1); + + a.store(Foo::new(2)); + assert_eq!(CNT.load(SeqCst), 1); + + assert_eq!(a.swap(Foo::default()), Foo::new(2)); + assert_eq!(CNT.load(SeqCst), 1); + + assert_eq!(a.swap(Foo::default()), Foo::new(0)); + assert_eq!(CNT.load(SeqCst), 1); + + drop(a); + assert_eq!(CNT.load(SeqCst), 0); +} + +#[test] +fn modular_u8() { + #[derive(Clone, Copy, Eq, Debug, Default)] + struct Foo(u8); + + impl PartialEq for Foo { + fn eq(&self, other: &Foo) -> bool { + self.0 % 5 == other.0 % 5 + } + } + + let a = AtomicCell::new(Foo(1)); + + assert_eq!(a.load(), Foo(1)); + assert_eq!(a.swap(Foo(2)), Foo(11)); + assert_eq!(a.load(), Foo(52)); + + a.store(Foo(0)); + assert_eq!(a.compare_exchange(Foo(0), Foo(5)), Ok(Foo(100))); + assert_eq!(a.load().0, 5); + assert_eq!(a.compare_exchange(Foo(10), Foo(15)), Ok(Foo(100))); + assert_eq!(a.load().0, 15); +} + +#[test] +fn modular_usize() { + #[derive(Clone, Copy, Eq, Debug, Default)] + struct Foo(usize); + + impl PartialEq for Foo { + fn eq(&self, other: &Foo) -> bool { + self.0 % 5 == other.0 % 5 + } + } + + let a = AtomicCell::new(Foo(1)); + + assert_eq!(a.load(), Foo(1)); + assert_eq!(a.swap(Foo(2)), Foo(11)); + assert_eq!(a.load(), Foo(52)); + + a.store(Foo(0)); + assert_eq!(a.compare_exchange(Foo(0), Foo(5)), Ok(Foo(100))); + assert_eq!(a.load().0, 5); + 
assert_eq!(a.compare_exchange(Foo(10), Foo(15)), Ok(Foo(100))); + assert_eq!(a.load().0, 15); +} + +#[test] +fn garbage_padding() { + #[derive(Copy, Clone, Eq, PartialEq)] + struct Object { + a: i64, + b: i32, + } + + let cell = AtomicCell::new(Object { a: 0, b: 0 }); + let _garbage = [0xfe, 0xfe, 0xfe, 0xfe, 0xfe]; // Needed + let next = Object { a: 0, b: 0 }; + + let prev = cell.load(); + assert!(cell.compare_exchange(prev, next).is_ok()); + println!(); +} + +#[test] +fn const_atomic_cell_new() { + static CELL: AtomicCell = AtomicCell::new(0); + + CELL.store(1); + assert_eq!(CELL.load(), 1); +} + +// https://github.com/crossbeam-rs/crossbeam/pull/767 +macro_rules! test_arithmetic { + ($test_name:ident, $ty:ident) => { + #[test] + fn $test_name() { + let a: AtomicCell<$ty> = AtomicCell::new(7); + + assert_eq!(a.fetch_add(3), 7); + assert_eq!(a.load(), 10); + + assert_eq!(a.fetch_sub(3), 10); + assert_eq!(a.load(), 7); + + assert_eq!(a.fetch_and(3), 7); + assert_eq!(a.load(), 3); + + assert_eq!(a.fetch_or(16), 3); + assert_eq!(a.load(), 19); + + assert_eq!(a.fetch_xor(2), 19); + assert_eq!(a.load(), 17); + + assert_eq!(a.fetch_max(18), 17); + assert_eq!(a.load(), 18); + + assert_eq!(a.fetch_min(17), 18); + assert_eq!(a.load(), 17); + + assert_eq!(a.fetch_nand(7), 17); + assert_eq!(a.load(), !(17 & 7)); + } + }; +} +test_arithmetic!(arithmetic_u8, u8); +test_arithmetic!(arithmetic_i8, i8); +test_arithmetic!(arithmetic_u16, u16); +test_arithmetic!(arithmetic_i16, i16); +test_arithmetic!(arithmetic_u32, u32); +test_arithmetic!(arithmetic_i32, i32); +test_arithmetic!(arithmetic_u64, u64); +test_arithmetic!(arithmetic_i64, i64); +test_arithmetic!(arithmetic_u128, u128); +test_arithmetic!(arithmetic_i128, i128); + +// https://github.com/crossbeam-rs/crossbeam/issues/748 +#[cfg_attr(miri, ignore)] // TODO +#[test] +fn issue_748() { + #[allow(dead_code)] + #[repr(align(8))] + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + enum Test { + Field(u32), + FieldLess, + } + + 
assert_eq!(mem::size_of::(), 8); + assert_eq!( + AtomicCell::::is_lock_free(), + cfg!(target_has_atomic = "64") + ); + let x = AtomicCell::new(Test::FieldLess); + assert_eq!(x.load(), Test::FieldLess); +} + +// https://github.com/crossbeam-rs/crossbeam/issues/833 +#[test] +fn issue_833() { + use std::num::NonZeroU128; + use std::sync::atomic::{AtomicBool, Ordering}; + use std::thread; + + #[cfg(miri)] + const N: usize = 10_000; + #[cfg(not(miri))] + const N: usize = 1_000_000; + + #[allow(dead_code)] + enum Enum { + NeverConstructed, + Cell(AtomicCell), + } + + static STATIC: Enum = Enum::Cell(AtomicCell::new(match NonZeroU128::new(1) { + Some(nonzero) => nonzero, + None => unreachable!(), + })); + static FINISHED: AtomicBool = AtomicBool::new(false); + + let handle = thread::spawn(|| { + let cell = match &STATIC { + Enum::NeverConstructed => unreachable!(), + Enum::Cell(cell) => cell, + }; + let x = NonZeroU128::new(0xFFFF_FFFF_FFFF_FFFF_0000_0000_0000_0000).unwrap(); + let y = NonZeroU128::new(0x0000_0000_0000_0000_FFFF_FFFF_FFFF_FFFF).unwrap(); + while !FINISHED.load(Ordering::Relaxed) { + cell.store(x); + cell.store(y); + } + }); + + for _ in 0..N { + if let Enum::NeverConstructed = STATIC { + unreachable!(":("); + } + } + + FINISHED.store(true, Ordering::Relaxed); + handle.join().unwrap(); +} diff --git a/crossbeam-utils-0.8.19/tests/cache_padded.rs b/crossbeam-utils-0.8.19/tests/cache_padded.rs new file mode 100644 index 0000000000000..86e9a7709c671 --- /dev/null +++ b/crossbeam-utils-0.8.19/tests/cache_padded.rs @@ -0,0 +1,113 @@ +use std::cell::Cell; +use std::mem; + +use crossbeam_utils::CachePadded; + +#[test] +fn default() { + let x: CachePadded = Default::default(); + assert_eq!(*x, 0); +} + +#[test] +fn store_u64() { + let x: CachePadded = CachePadded::new(17); + assert_eq!(*x, 17); +} + +#[test] +fn store_pair() { + let x: CachePadded<(u64, u64)> = CachePadded::new((17, 37)); + assert_eq!(x.0, 17); + assert_eq!(x.1, 37); +} + +#[test] +fn distance() { 
+ let arr = [CachePadded::new(17u8), CachePadded::new(37u8)]; + let a = &*arr[0] as *const u8; + let b = &*arr[1] as *const u8; + let align = mem::align_of::>(); + assert!(align >= 32); + assert_eq!(unsafe { a.add(align) }, b); +} + +#[test] +fn different_sizes() { + CachePadded::new(17u8); + CachePadded::new(17u16); + CachePadded::new(17u32); + CachePadded::new([17u64; 0]); + CachePadded::new([17u64; 1]); + CachePadded::new([17u64; 2]); + CachePadded::new([17u64; 3]); + CachePadded::new([17u64; 4]); + CachePadded::new([17u64; 5]); + CachePadded::new([17u64; 6]); + CachePadded::new([17u64; 7]); + CachePadded::new([17u64; 8]); +} + +#[test] +fn large() { + let a = [17u64; 9]; + let b = CachePadded::new(a); + assert!(mem::size_of_val(&a) <= mem::size_of_val(&b)); +} + +#[test] +fn debug() { + assert_eq!( + format!("{:?}", CachePadded::new(17u64)), + "CachePadded { value: 17 }" + ); +} + +#[test] +fn drops() { + let count = Cell::new(0); + + struct Foo<'a>(&'a Cell); + + impl<'a> Drop for Foo<'a> { + fn drop(&mut self) { + self.0.set(self.0.get() + 1); + } + } + + let a = CachePadded::new(Foo(&count)); + let b = CachePadded::new(Foo(&count)); + + assert_eq!(count.get(), 0); + drop(a); + assert_eq!(count.get(), 1); + drop(b); + assert_eq!(count.get(), 2); +} + +#[allow(clippy::clone_on_copy)] // This is intentional. 
+#[test] +fn clone() { + let a = CachePadded::new(17); + let b = a.clone(); + assert_eq!(*a, *b); +} + +#[test] +fn runs_custom_clone() { + let count = Cell::new(0); + + struct Foo<'a>(&'a Cell); + + impl<'a> Clone for Foo<'a> { + fn clone(&self) -> Foo<'a> { + self.0.set(self.0.get() + 1); + Foo::<'a>(self.0) + } + } + + let a = CachePadded::new(Foo(&count)); + let _ = a.clone(); + + assert_eq!(count.get(), 1); +} diff --git a/crossbeam-utils-0.8.19/tests/parker.rs b/crossbeam-utils-0.8.19/tests/parker.rs new file mode 100644 index 0000000000000..2bf9c37d491e6 --- /dev/null +++ b/crossbeam-utils-0.8.19/tests/parker.rs @@ -0,0 +1,41 @@ +use std::thread::sleep; +use std::time::Duration; +use std::u32; + +use crossbeam_utils::sync::Parker; +use crossbeam_utils::thread; + +#[test] +fn park_timeout_unpark_before() { + let p = Parker::new(); + for _ in 0..10 { + p.unparker().unpark(); + p.park_timeout(Duration::from_millis(u32::MAX as u64)); + } +} + +#[test] +fn park_timeout_unpark_not_called() { + let p = Parker::new(); + for _ in 0..10 { + p.park_timeout(Duration::from_millis(10)) + } +} + +#[test] +fn park_timeout_unpark_called_other_thread() { + for _ in 0..10 { + let p = Parker::new(); + let u = p.unparker().clone(); + + thread::scope(|scope| { + scope.spawn(move |_| { + sleep(Duration::from_millis(50)); + u.unpark(); + }); + + p.park_timeout(Duration::from_millis(u32::MAX as u64)) + }) + .unwrap(); + } +} diff --git a/crossbeam-utils-0.8.19/tests/sharded_lock.rs b/crossbeam-utils-0.8.19/tests/sharded_lock.rs new file mode 100644 index 0000000000000..002f7f5e19341 --- /dev/null +++ b/crossbeam-utils-0.8.19/tests/sharded_lock.rs @@ -0,0 +1,252 @@ +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::mpsc::channel; +use std::sync::{Arc, TryLockError}; +use std::thread; + +use crossbeam_utils::sync::ShardedLock; +use rand::Rng; + +#[derive(Eq, PartialEq, Debug)] +struct NonCopy(i32); + +#[test] +fn smoke() { + let l = ShardedLock::new(()); + 
drop(l.read().unwrap()); + drop(l.write().unwrap()); + drop((l.read().unwrap(), l.read().unwrap())); + drop(l.write().unwrap()); +} + +#[test] +fn frob() { + const N: u32 = 10; + #[cfg(miri)] + const M: usize = 50; + #[cfg(not(miri))] + const M: usize = 1000; + + let r = Arc::new(ShardedLock::new(())); + + let (tx, rx) = channel::<()>(); + for _ in 0..N { + let tx = tx.clone(); + let r = r.clone(); + thread::spawn(move || { + let mut rng = rand::thread_rng(); + for _ in 0..M { + if rng.gen_bool(1.0 / (N as f64)) { + drop(r.write().unwrap()); + } else { + drop(r.read().unwrap()); + } + } + drop(tx); + }); + } + drop(tx); + let _ = rx.recv(); +} + +#[test] +fn arc_poison_wr() { + let arc = Arc::new(ShardedLock::new(1)); + let arc2 = arc.clone(); + let _: Result<(), _> = thread::spawn(move || { + let _lock = arc2.write().unwrap(); + panic!(); + }) + .join(); + assert!(arc.read().is_err()); +} + +#[test] +fn arc_poison_ww() { + let arc = Arc::new(ShardedLock::new(1)); + assert!(!arc.is_poisoned()); + let arc2 = arc.clone(); + let _: Result<(), _> = thread::spawn(move || { + let _lock = arc2.write().unwrap(); + panic!(); + }) + .join(); + assert!(arc.write().is_err()); + assert!(arc.is_poisoned()); +} + +#[test] +fn arc_no_poison_rr() { + let arc = Arc::new(ShardedLock::new(1)); + let arc2 = arc.clone(); + let _: Result<(), _> = thread::spawn(move || { + let _lock = arc2.read().unwrap(); + panic!(); + }) + .join(); + let lock = arc.read().unwrap(); + assert_eq!(*lock, 1); +} +#[test] +fn arc_no_poison_sl() { + let arc = Arc::new(ShardedLock::new(1)); + let arc2 = arc.clone(); + let _: Result<(), _> = thread::spawn(move || { + let _lock = arc2.read().unwrap(); + panic!() + }) + .join(); + let lock = arc.write().unwrap(); + assert_eq!(*lock, 1); +} + +#[test] +fn arc() { + let arc = Arc::new(ShardedLock::new(0)); + let arc2 = arc.clone(); + let (tx, rx) = channel(); + + thread::spawn(move || { + let mut lock = arc2.write().unwrap(); + for _ in 0..10 { + let tmp = *lock; + 
*lock = -1; + thread::yield_now(); + *lock = tmp + 1; + } + tx.send(()).unwrap(); + }); + + // Readers try to catch the writer in the act + let mut children = Vec::new(); + for _ in 0..5 { + let arc3 = arc.clone(); + children.push(thread::spawn(move || { + let lock = arc3.read().unwrap(); + assert!(*lock >= 0); + })); + } + + // Wait for children to pass their asserts + for r in children { + assert!(r.join().is_ok()); + } + + // Wait for writer to finish + rx.recv().unwrap(); + let lock = arc.read().unwrap(); + assert_eq!(*lock, 10); +} + +#[test] +fn arc_access_in_unwind() { + let arc = Arc::new(ShardedLock::new(1)); + let arc2 = arc.clone(); + let _ = thread::spawn(move || { + struct Unwinder { + i: Arc>, + } + impl Drop for Unwinder { + fn drop(&mut self) { + let mut lock = self.i.write().unwrap(); + *lock += 1; + } + } + let _u = Unwinder { i: arc2 }; + panic!(); + }) + .join(); + let lock = arc.read().unwrap(); + assert_eq!(*lock, 2); +} + +#[test] +fn unsized_type() { + let sl: &ShardedLock<[i32]> = &ShardedLock::new([1, 2, 3]); + { + let b = &mut *sl.write().unwrap(); + b[0] = 4; + b[2] = 5; + } + let comp: &[i32] = &[4, 2, 5]; + assert_eq!(&*sl.read().unwrap(), comp); +} + +#[test] +fn try_write() { + let lock = ShardedLock::new(0isize); + let read_guard = lock.read().unwrap(); + + let write_result = lock.try_write(); + match write_result { + Err(TryLockError::WouldBlock) => (), + Ok(_) => panic!("try_write should not succeed while read_guard is in scope"), + Err(_) => panic!("unexpected error"), + } + + drop(read_guard); +} + +#[test] +fn test_into_inner() { + let m = ShardedLock::new(NonCopy(10)); + assert_eq!(m.into_inner().unwrap(), NonCopy(10)); +} + +#[test] +fn test_into_inner_drop() { + struct Foo(Arc); + impl Drop for Foo { + fn drop(&mut self) { + self.0.fetch_add(1, Ordering::SeqCst); + } + } + let num_drops = Arc::new(AtomicUsize::new(0)); + let m = ShardedLock::new(Foo(num_drops.clone())); + assert_eq!(num_drops.load(Ordering::SeqCst), 0); + { 
+ let _inner = m.into_inner().unwrap(); + assert_eq!(num_drops.load(Ordering::SeqCst), 0); + } + assert_eq!(num_drops.load(Ordering::SeqCst), 1); +} + +#[test] +fn test_into_inner_poison() { + let m = Arc::new(ShardedLock::new(NonCopy(10))); + let m2 = m.clone(); + let _ = thread::spawn(move || { + let _lock = m2.write().unwrap(); + panic!("test panic in inner thread to poison ShardedLock"); + }) + .join(); + + assert!(m.is_poisoned()); + match Arc::try_unwrap(m).unwrap().into_inner() { + Err(e) => assert_eq!(e.into_inner(), NonCopy(10)), + Ok(x) => panic!("into_inner of poisoned ShardedLock is Ok: {:?}", x), + } +} + +#[test] +fn test_get_mut() { + let mut m = ShardedLock::new(NonCopy(10)); + *m.get_mut().unwrap() = NonCopy(20); + assert_eq!(m.into_inner().unwrap(), NonCopy(20)); +} + +#[test] +fn test_get_mut_poison() { + let m = Arc::new(ShardedLock::new(NonCopy(10))); + let m2 = m.clone(); + let _ = thread::spawn(move || { + let _lock = m2.write().unwrap(); + panic!("test panic in inner thread to poison ShardedLock"); + }) + .join(); + + assert!(m.is_poisoned()); + match Arc::try_unwrap(m).unwrap().get_mut() { + Err(e) => assert_eq!(*e.into_inner(), NonCopy(10)), + Ok(x) => panic!("get_mut of poisoned ShardedLock is Ok: {:?}", x), + } +} diff --git a/crossbeam-utils-0.8.19/tests/thread.rs b/crossbeam-utils-0.8.19/tests/thread.rs new file mode 100644 index 0000000000000..0dfad90bd6d93 --- /dev/null +++ b/crossbeam-utils-0.8.19/tests/thread.rs @@ -0,0 +1,215 @@ +use std::any::Any; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::thread::sleep; +use std::time::Duration; + +use crossbeam_utils::thread; + +const THREADS: usize = 10; +const SMALL_STACK_SIZE: usize = 20; + +#[test] +fn join() { + let counter = AtomicUsize::new(0); + thread::scope(|scope| { + let handle = scope.spawn(|_| { + counter.store(1, Ordering::Relaxed); + }); + assert!(handle.join().is_ok()); + + let panic_handle = scope.spawn(|_| { + panic!("\"My honey is running out!\", said 
Pooh."); + }); + assert!(panic_handle.join().is_err()); + }) + .unwrap(); + + // There should be sufficient synchronization. + assert_eq!(1, counter.load(Ordering::Relaxed)); +} + +#[test] +fn counter() { + let counter = AtomicUsize::new(0); + thread::scope(|scope| { + for _ in 0..THREADS { + scope.spawn(|_| { + counter.fetch_add(1, Ordering::Relaxed); + }); + } + }) + .unwrap(); + + assert_eq!(THREADS, counter.load(Ordering::Relaxed)); +} + +#[test] +fn counter_builder() { + let counter = AtomicUsize::new(0); + thread::scope(|scope| { + for i in 0..THREADS { + scope + .builder() + .name(format!("child-{}", i)) + .stack_size(SMALL_STACK_SIZE) + .spawn(|_| { + counter.fetch_add(1, Ordering::Relaxed); + }) + .unwrap(); + } + }) + .unwrap(); + + assert_eq!(THREADS, counter.load(Ordering::Relaxed)); +} + +#[test] +fn counter_panic() { + let counter = AtomicUsize::new(0); + let result = thread::scope(|scope| { + scope.spawn(|_| { + panic!("\"My honey is running out!\", said Pooh."); + }); + sleep(Duration::from_millis(100)); + + for _ in 0..THREADS { + scope.spawn(|_| { + counter.fetch_add(1, Ordering::Relaxed); + }); + } + }); + + assert_eq!(THREADS, counter.load(Ordering::Relaxed)); + assert!(result.is_err()); +} + +#[test] +fn panic_twice() { + let result = thread::scope(|scope| { + scope.spawn(|_| { + sleep(Duration::from_millis(500)); + panic!("thread #1"); + }); + scope.spawn(|_| { + panic!("thread #2"); + }); + }); + + let err = result.unwrap_err(); + let vec = err + .downcast_ref::>>() + .unwrap(); + assert_eq!(2, vec.len()); + + let first = vec[0].downcast_ref::<&str>().unwrap(); + let second = vec[1].downcast_ref::<&str>().unwrap(); + assert_eq!("thread #1", *first); + assert_eq!("thread #2", *second) +} + +#[test] +fn panic_many() { + let result = thread::scope(|scope| { + scope.spawn(|_| panic!("deliberate panic #1")); + scope.spawn(|_| panic!("deliberate panic #2")); + scope.spawn(|_| panic!("deliberate panic #3")); + }); + + let err = result.unwrap_err(); 
+ let vec = err + .downcast_ref::>>() + .unwrap(); + assert_eq!(3, vec.len()); + + for panic in vec.iter() { + let panic = panic.downcast_ref::<&str>().unwrap(); + assert!( + *panic == "deliberate panic #1" + || *panic == "deliberate panic #2" + || *panic == "deliberate panic #3" + ); + } +} + +#[test] +fn nesting() { + let var = "foo".to_string(); + + struct Wrapper<'a> { + var: &'a String, + } + + impl<'a> Wrapper<'a> { + fn recurse(&'a self, scope: &thread::Scope<'a>, depth: usize) { + assert_eq!(self.var, "foo"); + + if depth > 0 { + scope.spawn(move |scope| { + self.recurse(scope, depth - 1); + }); + } + } + } + + let wrapper = Wrapper { var: &var }; + + thread::scope(|scope| { + scope.spawn(|scope| { + scope.spawn(|scope| { + wrapper.recurse(scope, 5); + }); + }); + }) + .unwrap(); +} + +#[test] +fn join_nested() { + thread::scope(|scope| { + scope.spawn(|scope| { + let handle = scope.spawn(|_| 7); + + sleep(Duration::from_millis(200)); + handle.join().unwrap(); + }); + + sleep(Duration::from_millis(100)); + }) + .unwrap(); +} + +#[test] +fn scope_returns_ok() { + let result = thread::scope(|scope| scope.spawn(|_| 1234).join().unwrap()).unwrap(); + assert_eq!(result, 1234); +} + +#[cfg(unix)] +#[test] +fn as_pthread_t() { + use std::os::unix::thread::JoinHandleExt; + thread::scope(|scope| { + let handle = scope.spawn(|_scope| { + sleep(Duration::from_millis(100)); + 42 + }); + let _pthread_t = handle.as_pthread_t(); + handle.join().unwrap(); + }) + .unwrap(); +} + +#[cfg(windows)] +#[test] +fn as_raw_handle() { + use std::os::windows::io::AsRawHandle; + thread::scope(|scope| { + let handle = scope.spawn(|_scope| { + sleep(Duration::from_millis(100)); + 42 + }); + let _raw_handle = handle.as_raw_handle(); + handle.join().unwrap(); + }) + .unwrap(); +} diff --git a/crossbeam-utils-0.8.19/tests/wait_group.rs b/crossbeam-utils-0.8.19/tests/wait_group.rs new file mode 100644 index 0000000000000..5b549b849cdae --- /dev/null +++ 
b/crossbeam-utils-0.8.19/tests/wait_group.rs @@ -0,0 +1,67 @@ +use std::sync::mpsc; +use std::thread; +use std::time::Duration; + +use crossbeam_utils::sync::WaitGroup; + +const THREADS: usize = 10; + +#[test] +fn wait() { + let wg = WaitGroup::new(); + let (tx, rx) = mpsc::channel(); + + for _ in 0..THREADS { + let wg = wg.clone(); + let tx = tx.clone(); + + thread::spawn(move || { + wg.wait(); + tx.send(()).unwrap(); + }); + } + + thread::sleep(Duration::from_millis(100)); + + // At this point, all spawned threads should be blocked, so we shouldn't get anything from the + // channel. + assert!(rx.try_recv().is_err()); + + wg.wait(); + + // Now, the wait group is cleared and we should receive messages. + for _ in 0..THREADS { + rx.recv().unwrap(); + } +} + +#[test] +fn wait_and_drop() { + let wg = WaitGroup::new(); + let wg2 = WaitGroup::new(); + let (tx, rx) = mpsc::channel(); + + for _ in 0..THREADS { + let wg = wg.clone(); + let wg2 = wg2.clone(); + let tx = tx.clone(); + + thread::spawn(move || { + wg2.wait(); + tx.send(()).unwrap(); + drop(wg); + }); + } + + // At this point, no thread has gotten past `wg2.wait()`, so we shouldn't get anything from the + // channel. + assert!(rx.try_recv().is_err()); + drop(wg2); + + wg.wait(); + + // Now, the wait group is cleared and we should receive messages. 
+ for _ in 0..THREADS { + rx.try_recv().unwrap(); + } +} diff --git a/library/core/src/panic.rs b/library/core/src/panic.rs index b520efe93f90d..b64b860527d0d 100644 --- a/library/core/src/panic.rs +++ b/library/core/src/panic.rs @@ -17,7 +17,7 @@ pub use self::unwind_safe::{AssertUnwindSafe, RefUnwindSafe, UnwindSafe}; #[doc(hidden)] #[unstable(feature = "edition_panic", issue = "none", reason = "use panic!() instead")] -#[allow_internal_unstable(panic_internals, const_format_args)] +#[allow_internal_unstable(panic_internals, const_format_args, never_type, stmt_expr_attributes)] #[rustc_diagnostic_item = "core_panic_2015_macro"] #[rustc_macro_transparency = "semitransparent"] pub macro panic_2015 { @@ -28,23 +28,26 @@ pub macro panic_2015 { $crate::panicking::panic($msg) ), // Use `panic_str` instead of `panic_display::<&str>` for non_fmt_panic lint. - ($msg:expr $(,)?) => ({ - $crate::panicking::panic_str($msg); - }), + ($msg:expr $(,)?) => (#[allow(unreachable_code)] $crate::convert::identity::({ + let _never = $crate::panicking::panic_str($msg); + _never + })), // Special-case the single-argument case for const_panic. - ("{}", $arg:expr $(,)?) => ({ - $crate::panicking::panic_display(&$arg); - }), - ($fmt:expr, $($arg:tt)+) => ({ + ("{}", $arg:expr $(,)?) => (#[allow(unreachable_code)] $crate::convert::identity::({ + let _never = $crate::panicking::panic_display(&$arg); + _never + })), + ($fmt:expr, $($arg:tt)+) => (#[allow(unreachable_code)] $crate::convert::identity::({ // Semicolon to prevent temporaries inside the formatting machinery from // being considered alive in the caller after the panic_fmt call. 
- $crate::panicking::panic_fmt($crate::const_format_args!($fmt, $($arg)+)); - }), + let _never = $crate::panicking::panic_fmt($crate::const_format_args!($fmt, $($arg)+)); + _never + })), } #[doc(hidden)] #[unstable(feature = "edition_panic", issue = "none", reason = "use panic!() instead")] -#[allow_internal_unstable(panic_internals, const_format_args)] +#[allow_internal_unstable(panic_internals, const_format_args, never_type, stmt_expr_attributes)] #[rustc_diagnostic_item = "core_panic_2021_macro"] #[rustc_macro_transparency = "semitransparent"] #[cfg(feature = "panic_immediate_abort")] @@ -53,14 +56,16 @@ pub macro panic_2021 { $crate::panicking::panic("explicit panic") ), // Special-case the single-argument case for const_panic. - ("{}", $arg:expr $(,)?) => ({ - $crate::panicking::panic_display(&$arg); - }), - ($($t:tt)+) => ({ + ("{}", $arg:expr $(,)?) => (#[allow(unreachable_code)] $crate::convert::identity::({ + let _never = $crate::panicking::panic_display(&$arg); + _never + })), + ($($t:tt)+) => (#[allow(unreachable_code)] $crate::convert::identity::({ // Semicolon to prevent temporaries inside the formatting machinery from // being considered alive in the caller after the panic_fmt call. - $crate::panicking::panic_fmt($crate::const_format_args!($($t)+)); - }), + let _never = $crate::panicking::panic_fmt($crate::const_format_args!($($t)+)); + _never + })), } #[doc(hidden)] @@ -71,13 +76,15 @@ pub macro panic_2021 { const_dispatch, const_eval_select, const_format_args, - rustc_attrs + rustc_attrs, + never_type, + stmt_expr_attributes )] #[rustc_diagnostic_item = "core_panic_2021_macro"] #[rustc_macro_transparency = "semitransparent"] #[cfg(not(feature = "panic_immediate_abort"))] pub macro panic_2021 { - () => ({ + () => (#[allow(unreachable_code)] $crate::convert::identity::({ // Create a function so that the argument for `track_caller` // can be moved inside if possible. #[cold] @@ -86,10 +93,11 @@ pub macro panic_2021 { const fn panic_cold_explicit() -> ! 
{ $crate::panicking::panic_explicit() } - panic_cold_explicit(); - }), + let _never = panic_cold_explicit(); + _never + })), // Special-case the single-argument case for const_panic. - ("{}", $arg:expr $(,)?) => ({ + ("{}", $arg:expr $(,)?) => (#[allow(unreachable_code)] $crate::convert::identity::({ #[cold] #[track_caller] #[inline(never)] @@ -98,18 +106,20 @@ pub macro panic_2021 { const fn panic_cold_display(arg: &T) -> ! { $crate::panicking::panic_display(arg) } - panic_cold_display(&$arg); - }), - ($($t:tt)+) => ({ + let _never = panic_cold_display(&$arg); + _never + })), + ($($t:tt)+) => (#[allow(unreachable_code)] $crate::convert::identity::({ // Semicolon to prevent temporaries inside the formatting machinery from // being considered alive in the caller after the panic_fmt call. - $crate::panicking::panic_fmt($crate::const_format_args!($($t)+)); - }), + let _never = $crate::panicking::panic_fmt($crate::const_format_args!($($t)+)); + _never + })), } #[doc(hidden)] #[unstable(feature = "edition_panic", issue = "none", reason = "use unreachable!() instead")] -#[allow_internal_unstable(panic_internals)] +#[allow_internal_unstable(panic_internals, never_type, stmt_expr_attributes)] #[rustc_diagnostic_item = "unreachable_2015_macro"] #[rustc_macro_transparency = "semitransparent"] pub macro unreachable_2015 { @@ -118,9 +128,10 @@ pub macro unreachable_2015 { ), // Use of `unreachable_display` for non_fmt_panic lint. // NOTE: the message ("internal error ...") is embedded directly in unreachable_display - ($msg:expr $(,)?) => ({ - $crate::panicking::unreachable_display(&$msg); - }), + ($msg:expr $(,)?) 
=> (#[allow(unreachable_code)] $crate::convert::identity::({ + let _never = $crate::panicking::unreachable_display(&$msg); + _never + })), ($fmt:expr, $($arg:tt)*) => ( $crate::panic!($crate::concat!("internal error: entered unreachable code: ", $fmt), $($arg)*) ), diff --git a/library/core/src/panicking.rs b/library/core/src/panicking.rs index 9e8dac888166c..d529a4c7e999a 100644 --- a/library/core/src/panicking.rs +++ b/library/core/src/panicking.rs @@ -264,7 +264,7 @@ pub const fn const_panic_fmt(fmt: fmt::Arguments<'_>) -> ! { // SAFETY: This is only evaluated at compile time, which reliably // handles this UB (in case this branch turns out to be reachable // somehow). - unsafe { crate::hint::unreachable_unchecked() }; + unsafe { crate::hint::unreachable_unchecked() } } } diff --git a/miniz_oxide-0.7.2/.cargo-ok b/miniz_oxide-0.7.2/.cargo-ok new file mode 100644 index 0000000000000..5f8b795830acb --- /dev/null +++ b/miniz_oxide-0.7.2/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/miniz_oxide-0.7.2/.cargo_vcs_info.json b/miniz_oxide-0.7.2/.cargo_vcs_info.json new file mode 100644 index 0000000000000..c37835266545a --- /dev/null +++ b/miniz_oxide-0.7.2/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "79307d7bd6e4aa3e86b30fd225a00ecb412188a4" + }, + "path_in_vcs": "miniz_oxide" +} \ No newline at end of file diff --git a/miniz_oxide-0.7.2/Cargo.toml b/miniz_oxide-0.7.2/Cargo.toml new file mode 100644 index 0000000000000..8318bbd056be2 --- /dev/null +++ b/miniz_oxide-0.7.2/Cargo.toml @@ -0,0 +1,74 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). 
+# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "miniz_oxide" +version = "0.7.2" +authors = [ + "Frommi ", + "oyvindln ", +] +exclude = [ + "benches/*", + "tests/*", +] +description = "DEFLATE compression and decompression library rewritten in Rust based on miniz" +homepage = "https://github.com/Frommi/miniz_oxide/tree/master/miniz_oxide" +documentation = "https://docs.rs/miniz_oxide" +readme = "Readme.md" +keywords = [ + "zlib", + "miniz", + "deflate", + "encoding", +] +categories = ["compression"] +license = "MIT OR Zlib OR Apache-2.0" +repository = "https://github.com/Frommi/miniz_oxide/tree/master/miniz_oxide" + +[lib] +name = "miniz_oxide" + +[dependencies.adler] +version = "1.0" +default-features = false + +[dependencies.alloc] +version = "1.0.0" +optional = true +package = "rustc-std-workspace-alloc" + +[dependencies.compiler_builtins] +version = "0.1.2" +optional = true + +[dependencies.core] +version = "1.0.0" +optional = true +package = "rustc-std-workspace-core" + +[dependencies.simd-adler32] +version = "0.3" +optional = true +default-features = false + +[features] +default = ["with-alloc"] +rustc-dep-of-std = [ + "core", + "alloc", + "compiler_builtins", + "adler/rustc-dep-of-std", +] +simd = ["simd-adler32"] +std = [] +with-alloc = [] diff --git a/miniz_oxide-0.7.2/Cargo.toml.orig b/miniz_oxide-0.7.2/Cargo.toml.orig new file mode 100644 index 0000000000000..f104ef227aee1 --- /dev/null +++ b/miniz_oxide-0.7.2/Cargo.toml.orig @@ -0,0 +1,38 @@ +[package] +name = "miniz_oxide" +authors = ["Frommi ", "oyvindln "] +version = "0.7.2" +license = "MIT OR Zlib OR Apache-2.0" +readme = "Readme.md" +keywords = ["zlib", "miniz", "deflate", "encoding"] +categories = ["compression"] +repository = "https://github.com/Frommi/miniz_oxide/tree/master/miniz_oxide" +homepage = "https://github.com/Frommi/miniz_oxide/tree/master/miniz_oxide" +documentation = "https://docs.rs/miniz_oxide" +description = "DEFLATE compression and 
decompression library rewritten in Rust based on miniz" +edition = "2018" +exclude = ["benches/*", "tests/*"] + +[lib] +name = "miniz_oxide" + +[dependencies] +adler = { version = "1.0", default-features = false } +simd-adler32 = { version = "0.3", default-features = false, optional = true } + +# Internal feature, only used when building as part of libstd, not part of the +# stable interface of this crate. +core = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-core' } +alloc = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-alloc' } +compiler_builtins = { version = '0.1.2', optional = true } + +[features] +default = ["with-alloc"] +with-alloc = [] +std = [] + +# Internal feature, only used when building as part of libstd, not part of the +# stable interface of this crate. +rustc-dep-of-std = ['core', 'alloc', 'compiler_builtins', 'adler/rustc-dep-of-std'] + +simd = ['simd-adler32'] diff --git a/miniz_oxide-0.7.2/LICENSE b/miniz_oxide-0.7.2/LICENSE new file mode 100644 index 0000000000000..64c53792ced0f --- /dev/null +++ b/miniz_oxide-0.7.2/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Frommi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/miniz_oxide-0.7.2/LICENSE-APACHE.md b/miniz_oxide-0.7.2/LICENSE-APACHE.md new file mode 100644 index 0000000000000..f433b1a53f5b8 --- /dev/null +++ b/miniz_oxide-0.7.2/LICENSE-APACHE.md @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/miniz_oxide-0.7.2/LICENSE-MIT.md b/miniz_oxide-0.7.2/LICENSE-MIT.md new file mode 100644 index 0000000000000..64c53792ced0f --- /dev/null +++ b/miniz_oxide-0.7.2/LICENSE-MIT.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Frommi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/miniz_oxide-0.7.2/LICENSE-ZLIB.md b/miniz_oxide-0.7.2/LICENSE-ZLIB.md new file mode 100644 index 0000000000000..7f513d1acbab9 --- /dev/null +++ b/miniz_oxide-0.7.2/LICENSE-ZLIB.md @@ -0,0 +1,11 @@ +Copyright (c) 2020 Frommi + +This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: + +1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. + +3. This notice may not be removed or altered from any source distribution. diff --git a/miniz_oxide-0.7.2/Readme.md b/miniz_oxide-0.7.2/Readme.md new file mode 100644 index 0000000000000..6c177b0e1a9c4 --- /dev/null +++ b/miniz_oxide-0.7.2/Readme.md @@ -0,0 +1,44 @@ +# miniz_oxide + +A fully safe, pure rust replacement for the [miniz](https://github.com/richgel999/miniz) DEFLATE/zlib encoder/decoder. +The main intention of this crate is to be used as a back-end for the [flate2](https://github.com/alexcrichton/flate2-rs), but it can also be used on its own. Using flate2 with the ```rust_backend``` feature provides an easy to use streaming API for miniz_oxide. + +The library is fully [no_std](https://docs.rust-embedded.org/book/intro/no-std.html). By default, the `with-alloc` feature is enabled, which requires the use of the `alloc` and `collection` crates as it allocates memory. + +The `std` feature additionally turns on things only available if `no_std` is not used. Currently this only means implementing [Error](https://doc.rust-lang.org/stable/std/error/trait.Error.html) for the `DecompressError` error struct returned by the simple decompression functions if enabled together with `with-alloc`. + +Using the library with `default-features = false` removes the dependency on `alloc` +and `collection` crates, making it suitable for systems without an allocator. 
+Running without allocation reduces crate functionality: + +- The `deflate` module is removed completely +- Some `inflate` functions which return a `Vec` are removed + +miniz_oxide 0.5.x and 0.6.x require at least Rust 1.40.0; 0.3.x requires at least Rust 1.36.0. + +miniz_oxide features no use of unsafe code. + +miniz_oxide can optionally be made to use a simd-accelerated version of adler32 via the [simd-adler32](https://crates.io/crates/simd-adler32) crate by enabling the 'simd' feature. This is not enabled by default because, due to its use of simd intrinsics, simd-adler32 has to use unsafe. The default setup uses the [adler](https://crates.io/crates/adler) crate which features no unsafe code. + +## Usage +Simple compression/decompression: +```rust + +use miniz_oxide::deflate::compress_to_vec; +use miniz_oxide::inflate::decompress_to_vec_with_limit; + +fn roundtrip(data: &[u8]) { + // Compress the input + let compressed = compress_to_vec(data, 6); + // Decompress the compressed input and limit max output size to avoid going out of memory on large/malformed input. + let decompressed = decompress_to_vec_with_limit(compressed.as_slice(), 60000).expect("Failed to decompress!"); + // Check roundtrip succeeded + assert_eq!(data, decompressed); +} + +fn main() { + roundtrip("Hello, world!".as_bytes()); +} + +``` +These simple functions will do everything in one go and are thus not recommended for use cases outside of prototyping/testing as real world data can have any size and thus result in very large memory allocations for the output Vector. Consider using miniz_oxide via [flate2](https://github.com/alexcrichton/flate2-rs) which makes it easy to do streaming (de)compression or the low-level streaming functions instead. diff --git a/miniz_oxide-0.7.2/src/deflate/buffer.rs b/miniz_oxide-0.7.2/src/deflate/buffer.rs new file mode 100644 index 0000000000000..f246c07dfbeff --- /dev/null +++ b/miniz_oxide-0.7.2/src/deflate/buffer.rs @@ -0,0 +1,58 @@ +//!
Buffer wrappers implementing default so we can allocate the buffers with `Box::default()` +//! to avoid stack copies. Box::new() doesn't at the moment, and using a vec means we would lose +//! static length info. + +use crate::deflate::core::{LZ_DICT_SIZE, MAX_MATCH_LEN}; + +/// Size of the buffer of lz77 encoded data. +pub const LZ_CODE_BUF_SIZE: usize = 64 * 1024; +/// Size of the output buffer. +pub const OUT_BUF_SIZE: usize = (LZ_CODE_BUF_SIZE * 13) / 10; +pub const LZ_DICT_FULL_SIZE: usize = LZ_DICT_SIZE + MAX_MATCH_LEN - 1 + 1; + +/// Size of hash values in the hash chains. +pub const LZ_HASH_BITS: i32 = 15; +/// How many bits to shift when updating the current hash value. +pub const LZ_HASH_SHIFT: i32 = (LZ_HASH_BITS + 2) / 3; +/// Size of the chained hash tables. +pub const LZ_HASH_SIZE: usize = 1 << LZ_HASH_BITS; + +#[inline] +pub fn update_hash(current_hash: u16, byte: u8) -> u16 { + ((current_hash << LZ_HASH_SHIFT) ^ u16::from(byte)) & (LZ_HASH_SIZE as u16 - 1) +} + +pub struct HashBuffers { + pub dict: [u8; LZ_DICT_FULL_SIZE], + pub next: [u16; LZ_DICT_SIZE], + pub hash: [u16; LZ_DICT_SIZE], +} + +impl HashBuffers { + #[inline] + pub fn reset(&mut self) { + *self = HashBuffers::default(); + } +} + +impl Default for HashBuffers { + fn default() -> HashBuffers { + HashBuffers { + dict: [0; LZ_DICT_FULL_SIZE], + next: [0; LZ_DICT_SIZE], + hash: [0; LZ_DICT_SIZE], + } + } +} + +pub struct LocalBuf { + pub b: [u8; OUT_BUF_SIZE], +} + +impl Default for LocalBuf { + fn default() -> LocalBuf { + LocalBuf { + b: [0; OUT_BUF_SIZE], + } + } +} diff --git a/miniz_oxide-0.7.2/src/deflate/core.rs b/miniz_oxide-0.7.2/src/deflate/core.rs new file mode 100644 index 0000000000000..b0a532dcedb13 --- /dev/null +++ b/miniz_oxide-0.7.2/src/deflate/core.rs @@ -0,0 +1,2456 @@ +//! Streaming compression functionality. 
+ +use alloc::boxed::Box; +use core::convert::TryInto; +use core::{cmp, mem}; + +use super::super::*; +use super::deflate_flags::*; +use super::CompressionLevel; +use crate::deflate::buffer::{ + update_hash, HashBuffers, LocalBuf, LZ_CODE_BUF_SIZE, LZ_DICT_FULL_SIZE, LZ_HASH_BITS, + LZ_HASH_SHIFT, LZ_HASH_SIZE, OUT_BUF_SIZE, +}; +use crate::shared::{update_adler32, HUFFMAN_LENGTH_ORDER, MZ_ADLER32_INIT}; +use crate::DataFormat; + +// Currently not bubbled up outside this module, so can fill in with more +// context eventually if needed. +type Result = core::result::Result; +struct Error {} + +const MAX_PROBES_MASK: i32 = 0xFFF; + +const MAX_SUPPORTED_HUFF_CODESIZE: usize = 32; + +/// Length code for length values. +#[rustfmt::skip] +const LEN_SYM: [u16; 256] = [ + 257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, 266, 267, 267, 268, 268, + 269, 269, 269, 269, 270, 270, 270, 270, 271, 271, 271, 271, 272, 272, 272, 272, + 273, 273, 273, 273, 273, 273, 273, 273, 274, 274, 274, 274, 274, 274, 274, 274, + 275, 275, 275, 275, 275, 275, 275, 275, 276, 276, 276, 276, 276, 276, 276, 276, + 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, + 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, + 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, + 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 
284, + 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 285 +]; + +/// Number of extra bits for length values. +#[rustfmt::skip] +const LEN_EXTRA: [u8; 256] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0 +]; + +/// Distance codes for distances smaller than 512. 
+#[rustfmt::skip] +const SMALL_DIST_SYM: [u8; 512] = [ + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 
17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17 +]; + +/// Number of extra bits for distances smaller than 512. +#[rustfmt::skip] +const SMALL_DIST_EXTRA: [u8; 512] = [ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +]; + +/// Base values to calculate distances above 512. 
+#[rustfmt::skip] +const LARGE_DIST_SYM: [u8; 128] = [ + 0, 0, 18, 19, 20, 20, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29 +]; + +/// Number of extra bits distances above 512. +#[rustfmt::skip] +const LARGE_DIST_EXTRA: [u8; 128] = [ + 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 +]; + +#[rustfmt::skip] +const BITMASKS: [u32; 17] = [ + 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, + 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF +]; + +/// The maximum number of checks for matches in the hash table the compressor will make for each +/// compression level. +const NUM_PROBES: [u32; 11] = [0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500]; + +#[derive(Copy, Clone)] +struct SymFreq { + key: u16, + sym_index: u16, +} + +pub mod deflate_flags { + /// Whether to use a zlib wrapper. + pub const TDEFL_WRITE_ZLIB_HEADER: u32 = 0x0000_1000; + /// Should we compute the adler32 checksum. 
+ pub const TDEFL_COMPUTE_ADLER32: u32 = 0x0000_2000; + /// Should we use greedy parsing (as opposed to lazy parsing where look ahead one or more + /// bytes to check for better matches.) + pub const TDEFL_GREEDY_PARSING_FLAG: u32 = 0x0000_4000; + /// Used in miniz to skip zero-initializing hash and dict. We don't do this here, so + /// this flag is ignored. + pub const TDEFL_NONDETERMINISTIC_PARSING_FLAG: u32 = 0x0000_8000; + /// Only look for matches with a distance of 0. + pub const TDEFL_RLE_MATCHES: u32 = 0x0001_0000; + /// Only use matches that are at least 6 bytes long. + pub const TDEFL_FILTER_MATCHES: u32 = 0x0002_0000; + /// Force the compressor to only output static blocks. (Blocks using the default huffman codes + /// specified in the deflate specification.) + pub const TDEFL_FORCE_ALL_STATIC_BLOCKS: u32 = 0x0004_0000; + /// Force the compressor to only output raw/uncompressed blocks. + pub const TDEFL_FORCE_ALL_RAW_BLOCKS: u32 = 0x0008_0000; +} + +/// Strategy setting for compression. +/// +/// The non-default settings offer some special-case compression variants. +#[repr(i32)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum CompressionStrategy { + /// Don't use any of the special strategies. + Default = 0, + /// Only use matches that are at least 5 bytes long. + Filtered = 1, + /// Don't look for matches, only huffman encode the literals. + HuffmanOnly = 2, + /// Only look for matches with a distance of 1, i.e do run-length encoding only. + RLE = 3, + /// Only use static/fixed blocks. (Blocks using the default huffman codes + /// specified in the deflate specification.) + Fixed = 4, +} + +/// A list of deflate flush types. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum TDEFLFlush { + /// Normal operation. + /// + /// Compress as much as there is space for, and then return waiting for more input. + None = 0, + + /// Try to flush all the current data and output an empty raw block. 
+ Sync = 2, + + /// Same as [`Sync`][Self::Sync], but reset the dictionary so that the following data does not + /// depend on previous data. + Full = 3, + + /// Try to flush everything and end the deflate stream. + /// + /// On success this will yield a [`TDEFLStatus::Done`] return status. + Finish = 4, +} + +impl From for TDEFLFlush { + fn from(flush: MZFlush) -> Self { + match flush { + MZFlush::None => TDEFLFlush::None, + MZFlush::Sync => TDEFLFlush::Sync, + MZFlush::Full => TDEFLFlush::Full, + MZFlush::Finish => TDEFLFlush::Finish, + _ => TDEFLFlush::None, // TODO: ??? What to do ??? + } + } +} + +impl TDEFLFlush { + pub fn new(flush: i32) -> Result { + match flush { + 0 => Ok(TDEFLFlush::None), + 2 => Ok(TDEFLFlush::Sync), + 3 => Ok(TDEFLFlush::Full), + 4 => Ok(TDEFLFlush::Finish), + _ => Err(MZError::Param), + } + } +} + +/// Return status of compression. +#[repr(i32)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum TDEFLStatus { + /// Usage error. + /// + /// This indicates that either the [`CompressorOxide`] experienced a previous error, or the + /// stream has already been [`TDEFLFlush::Finish`]'d. + BadParam = -2, + + /// Error putting data into output buffer. + /// + /// This usually indicates a too-small buffer. + PutBufFailed = -1, + + /// Compression succeeded normally. + Okay = 0, + + /// Compression succeeded and the deflate stream was ended. + /// + /// This is the result of calling compression with [`TDEFLFlush::Finish`]. + Done = 1, +} + +const MAX_HUFF_SYMBOLS: usize = 288; +/// Size of hash chain for fast compression mode. +const LEVEL1_HASH_SIZE_MASK: u32 = 4095; +/// The number of huffman tables used by the compressor. +/// Literal/length, Distances and Length of the huffman codes for the other two tables. +const MAX_HUFF_TABLES: usize = 3; +/// Literal/length codes +const MAX_HUFF_SYMBOLS_0: usize = 288; +/// Distance codes. +const MAX_HUFF_SYMBOLS_1: usize = 32; +/// Huffman length values. 
+const MAX_HUFF_SYMBOLS_2: usize = 19; +/// Size of the chained hash table. +pub(crate) const LZ_DICT_SIZE: usize = 32_768; +/// Mask used when stepping through the hash chains. +const LZ_DICT_SIZE_MASK: usize = (LZ_DICT_SIZE as u32 - 1) as usize; +/// The minimum length of a match. +const MIN_MATCH_LEN: u8 = 3; +/// The maximum length of a match. +pub(crate) const MAX_MATCH_LEN: usize = 258; + +const DEFAULT_FLAGS: u32 = NUM_PROBES[4] | TDEFL_WRITE_ZLIB_HEADER; + +mod zlib { + const DEFAULT_CM: u8 = 8; + const DEFAULT_CINFO: u8 = 7 << 4; + const _DEFAULT_FDICT: u8 = 0; + const DEFAULT_CMF: u8 = DEFAULT_CM | DEFAULT_CINFO; + /// The 16-bit value consisting of CMF and FLG must be divisible by this to be valid. + const FCHECK_DIVISOR: u8 = 31; + + /// Generate FCHECK from CMF and FLG (without FCKECH )so that they are correct according to the + /// specification, i.e (CMF*256 + FCHK) % 31 = 0. + /// Returns flg with the FCHKECK bits added (any existing FCHECK bits are ignored). + fn add_fcheck(cmf: u8, flg: u8) -> u8 { + let rem = ((usize::from(cmf) * 256) + usize::from(flg)) % usize::from(FCHECK_DIVISOR); + + // Clear existing FCHECK if any + let flg = flg & 0b11100000; + + // Casting is safe as rem can't overflow since it is a value mod 31 + // We can simply add the value to flg as (31 - rem) will never be above 2^5 + flg + (FCHECK_DIVISOR - rem as u8) + } + + fn zlib_level_from_flags(flags: u32) -> u8 { + use super::NUM_PROBES; + + let num_probes = flags & (super::MAX_PROBES_MASK as u32); + if flags & super::TDEFL_GREEDY_PARSING_FLAG != 0 { + if num_probes <= 1 { + 0 + } else { + 1 + } + } else if num_probes >= NUM_PROBES[9] { + 3 + } else { + 2 + } + } + + /// Get the zlib header for the level using the default window size and no + /// dictionary. + fn header_from_level(level: u8) -> [u8; 2] { + let cmf = DEFAULT_CMF; + [cmf, add_fcheck(cmf, level << 6)] + } + + /// Create a zlib header from the given compression flags. + /// Only level is considered. 
+ pub fn header_from_flags(flags: u32) -> [u8; 2] { + let level = zlib_level_from_flags(flags); + header_from_level(level) + } + + #[cfg(test)] + mod test { + #[test] + fn zlib() { + use super::super::*; + use super::*; + + let test_level = |level, expected| { + let flags = create_comp_flags_from_zip_params( + level, + MZ_DEFAULT_WINDOW_BITS, + CompressionStrategy::Default as i32, + ); + assert_eq!(zlib_level_from_flags(flags), expected); + }; + + assert_eq!(zlib_level_from_flags(DEFAULT_FLAGS), 2); + test_level(0, 0); + test_level(1, 0); + test_level(2, 1); + test_level(3, 1); + for i in 4..=8 { + test_level(i, 2) + } + test_level(9, 3); + test_level(10, 3); + } + + #[test] + fn test_header() { + let header = super::header_from_level(3); + assert_eq!( + ((usize::from(header[0]) * 256) + usize::from(header[1])) % 31, + 0 + ); + } + } +} + +fn memset(slice: &mut [T], val: T) { + for x in slice { + *x = val + } +} + +#[cfg(test)] +#[inline] +fn write_u16_le(val: u16, slice: &mut [u8], pos: usize) { + slice[pos] = val as u8; + slice[pos + 1] = (val >> 8) as u8; +} + +// Read the two bytes starting at pos and interpret them as an u16. +#[inline] +const fn read_u16_le(slice: &[u8], pos: usize) -> u16 { + // The compiler is smart enough to optimize this into an unaligned load. + slice[pos] as u16 | ((slice[pos + 1] as u16) << 8) +} + +/// Main compression struct. +pub struct CompressorOxide { + lz: LZOxide, + params: ParamsOxide, + /// Put HuffmanOxide on the heap with default trick to avoid + /// excessive stack copies. + huff: Box, + dict: DictOxide, +} + +impl CompressorOxide { + /// Create a new `CompressorOxide` with the given flags. + /// + /// # Notes + /// This function may be changed to take different parameters in the future. + pub fn new(flags: u32) -> Self { + CompressorOxide { + lz: LZOxide::new(), + params: ParamsOxide::new(flags), + huff: Box::default(), + dict: DictOxide::new(flags), + } + } + + /// Get the adler32 checksum of the currently encoded data. 
+ pub const fn adler32(&self) -> u32 { + self.params.adler32 + } + + /// Get the return status of the previous [`compress`](fn.compress.html) + /// call with this compressor. + pub const fn prev_return_status(&self) -> TDEFLStatus { + self.params.prev_return_status + } + + /// Get the raw compressor flags. + /// + /// # Notes + /// This function may be deprecated or changed in the future to use more rust-style flags. + pub const fn flags(&self) -> i32 { + self.params.flags as i32 + } + + /// Returns whether the compressor is wrapping the data in a zlib format or not. + pub fn data_format(&self) -> DataFormat { + if (self.params.flags & TDEFL_WRITE_ZLIB_HEADER) != 0 { + DataFormat::Zlib + } else { + DataFormat::Raw + } + } + + /// Reset the state of the compressor, keeping the same parameters. + /// + /// This avoids re-allocating data. + pub fn reset(&mut self) { + // LZ buf and huffman has no settings or dynamic memory + // that needs to be saved, so we simply replace them. + self.lz = LZOxide::new(); + self.params.reset(); + *self.huff = HuffmanOxide::default(); + self.dict.reset(); + } + + /// Set the compression level of the compressor. + /// + /// Using this to change level after compression has started is supported. + /// # Notes + /// The compression strategy will be reset to the default one when this is called. + pub fn set_compression_level(&mut self, level: CompressionLevel) { + let format = self.data_format(); + self.set_format_and_level(format, level as u8); + } + + /// Set the compression level of the compressor using an integer value. + /// + /// Using this to change level after compression has started is supported. + /// # Notes + /// The compression strategy will be reset to the default one when this is called. + pub fn set_compression_level_raw(&mut self, level: u8) { + let format = self.data_format(); + self.set_format_and_level(format, level); + } + + /// Update the compression settings of the compressor. 
+ /// + /// Changing the `DataFormat` after compression has started will result in + /// a corrupted stream. + /// + /// # Notes + /// This function mainly intended for setting the initial settings after e.g creating with + /// `default` or after calling `CompressorOxide::reset()`, and behaviour may be changed + /// to disallow calling it after starting compression in the future. + pub fn set_format_and_level(&mut self, data_format: DataFormat, level: u8) { + let flags = create_comp_flags_from_zip_params( + level.into(), + data_format.to_window_bits(), + CompressionStrategy::Default as i32, + ); + self.params.update_flags(flags); + self.dict.update_flags(flags); + } +} + +impl Default for CompressorOxide { + /// Initialize the compressor with a level of 4, zlib wrapper and + /// the default strategy. + fn default() -> Self { + CompressorOxide { + lz: LZOxide::new(), + params: ParamsOxide::new(DEFAULT_FLAGS), + huff: Box::default(), + dict: DictOxide::new(DEFAULT_FLAGS), + } + } +} + +/// Callback function and user used in `compress_to_output`. +pub struct CallbackFunc<'a> { + pub put_buf_func: &'a mut dyn FnMut(&[u8]) -> bool, +} + +impl<'a> CallbackFunc<'a> { + fn flush_output( + &mut self, + saved_output: SavedOutputBufferOxide, + params: &mut ParamsOxide, + ) -> i32 { + // TODO: As this could be unsafe since + // we can't verify the function pointer + // this whole function should maybe be unsafe as well. 
+ let call_success = (self.put_buf_func)(¶ms.local_buf.b[0..saved_output.pos]); + + if !call_success { + params.prev_return_status = TDEFLStatus::PutBufFailed; + return params.prev_return_status as i32; + } + + params.flush_remaining as i32 + } +} + +struct CallbackBuf<'a> { + pub out_buf: &'a mut [u8], +} + +impl<'a> CallbackBuf<'a> { + fn flush_output( + &mut self, + saved_output: SavedOutputBufferOxide, + params: &mut ParamsOxide, + ) -> i32 { + if saved_output.local { + let n = cmp::min(saved_output.pos, self.out_buf.len() - params.out_buf_ofs); + (self.out_buf[params.out_buf_ofs..params.out_buf_ofs + n]) + .copy_from_slice(¶ms.local_buf.b[..n]); + + params.out_buf_ofs += n; + if saved_output.pos != n { + params.flush_ofs = n as u32; + params.flush_remaining = (saved_output.pos - n) as u32; + } + } else { + params.out_buf_ofs += saved_output.pos; + } + + params.flush_remaining as i32 + } +} + +enum CallbackOut<'a> { + Func(CallbackFunc<'a>), + Buf(CallbackBuf<'a>), +} + +impl<'a> CallbackOut<'a> { + fn new_output_buffer<'b>( + &'b mut self, + local_buf: &'b mut [u8], + out_buf_ofs: usize, + ) -> OutputBufferOxide<'b> { + let is_local; + let buf_len = OUT_BUF_SIZE - 16; + let chosen_buffer = match *self { + CallbackOut::Buf(ref mut cb) if cb.out_buf.len() - out_buf_ofs >= OUT_BUF_SIZE => { + is_local = false; + &mut cb.out_buf[out_buf_ofs..out_buf_ofs + buf_len] + } + _ => { + is_local = true; + &mut local_buf[..buf_len] + } + }; + + OutputBufferOxide { + inner: chosen_buffer, + inner_pos: 0, + local: is_local, + bit_buffer: 0, + bits_in: 0, + } + } +} + +struct CallbackOxide<'a> { + in_buf: Option<&'a [u8]>, + in_buf_size: Option<&'a mut usize>, + out_buf_size: Option<&'a mut usize>, + out: CallbackOut<'a>, +} + +impl<'a> CallbackOxide<'a> { + fn new_callback_buf(in_buf: &'a [u8], out_buf: &'a mut [u8]) -> Self { + CallbackOxide { + in_buf: Some(in_buf), + in_buf_size: None, + out_buf_size: None, + out: CallbackOut::Buf(CallbackBuf { out_buf }), + } + } + + fn 
new_callback_func(in_buf: &'a [u8], callback_func: CallbackFunc<'a>) -> Self { + CallbackOxide { + in_buf: Some(in_buf), + in_buf_size: None, + out_buf_size: None, + out: CallbackOut::Func(callback_func), + } + } + + fn update_size(&mut self, in_size: Option, out_size: Option) { + if let (Some(in_size), Some(size)) = (in_size, self.in_buf_size.as_mut()) { + **size = in_size; + } + + if let (Some(out_size), Some(size)) = (out_size, self.out_buf_size.as_mut()) { + **size = out_size + } + } + + fn flush_output( + &mut self, + saved_output: SavedOutputBufferOxide, + params: &mut ParamsOxide, + ) -> i32 { + if saved_output.pos == 0 { + return params.flush_remaining as i32; + } + + self.update_size(Some(params.src_pos), None); + match self.out { + CallbackOut::Func(ref mut cf) => cf.flush_output(saved_output, params), + CallbackOut::Buf(ref mut cb) => cb.flush_output(saved_output, params), + } + } +} + +struct OutputBufferOxide<'a> { + pub inner: &'a mut [u8], + pub inner_pos: usize, + pub local: bool, + + pub bit_buffer: u32, + pub bits_in: u32, +} + +impl<'a> OutputBufferOxide<'a> { + fn put_bits(&mut self, bits: u32, len: u32) { + assert!(bits <= ((1u32 << len) - 1u32)); + self.bit_buffer |= bits << self.bits_in; + self.bits_in += len; + while self.bits_in >= 8 { + self.inner[self.inner_pos] = self.bit_buffer as u8; + self.inner_pos += 1; + self.bit_buffer >>= 8; + self.bits_in -= 8; + } + } + + const fn save(&self) -> SavedOutputBufferOxide { + SavedOutputBufferOxide { + pos: self.inner_pos, + bit_buffer: self.bit_buffer, + bits_in: self.bits_in, + local: self.local, + } + } + + fn load(&mut self, saved: SavedOutputBufferOxide) { + self.inner_pos = saved.pos; + self.bit_buffer = saved.bit_buffer; + self.bits_in = saved.bits_in; + self.local = saved.local; + } + + fn pad_to_bytes(&mut self) { + if self.bits_in != 0 { + let len = 8 - self.bits_in; + self.put_bits(0, len); + } + } +} + +struct SavedOutputBufferOxide { + pub pos: usize, + pub bit_buffer: u32, + pub 
bits_in: u32, + pub local: bool, +} + +struct BitBuffer { + pub bit_buffer: u64, + pub bits_in: u32, +} + +impl BitBuffer { + fn put_fast(&mut self, bits: u64, len: u32) { + self.bit_buffer |= bits << self.bits_in; + self.bits_in += len; + } + + fn flush(&mut self, output: &mut OutputBufferOxide) -> Result<()> { + let pos = output.inner_pos; + { + // isolation to please borrow checker + let inner = &mut output.inner[pos..pos + 8]; + let bytes = u64::to_le_bytes(self.bit_buffer); + inner.copy_from_slice(&bytes); + } + match output.inner_pos.checked_add((self.bits_in >> 3) as usize) { + Some(n) if n <= output.inner.len() => output.inner_pos = n, + _ => return Err(Error {}), + } + self.bit_buffer >>= self.bits_in & !7; + self.bits_in &= 7; + Ok(()) + } +} + +/// A struct containing data about huffman codes and symbol frequencies. +/// +/// NOTE: Only the literal/lengths have enough symbols to actually use +/// the full array. It's unclear why it's defined like this in miniz, +/// it could be for cache/alignment reasons. +struct HuffmanOxide { + /// Number of occurrences of each symbol. + pub count: [[u16; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], + /// The bits of the huffman code assigned to the symbol + pub codes: [[u16; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], + /// The length of the huffman code assigned to the symbol. + pub code_sizes: [[u8; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], +} + +/// Tables used for literal/lengths in `HuffmanOxide`. +const LITLEN_TABLE: usize = 0; +/// Tables for distances. +const DIST_TABLE: usize = 1; +/// Tables for the run-length encoded huffman lengths for literals/lengths/distances. +const HUFF_CODES_TABLE: usize = 2; + +/// Status of RLE encoding of huffman code lengths. 
+struct Rle { + pub z_count: u32, + pub repeat_count: u32, + pub prev_code_size: u8, +} + +impl Rle { + fn prev_code_size( + &mut self, + packed_code_sizes: &mut [u8], + packed_pos: &mut usize, + h: &mut HuffmanOxide, + ) -> Result<()> { + let mut write = |buf| write(buf, packed_code_sizes, packed_pos); + let counts = &mut h.count[HUFF_CODES_TABLE]; + if self.repeat_count != 0 { + if self.repeat_count < 3 { + counts[self.prev_code_size as usize] = + counts[self.prev_code_size as usize].wrapping_add(self.repeat_count as u16); + let code = self.prev_code_size; + write(&[code, code, code][..self.repeat_count as usize])?; + } else { + counts[16] = counts[16].wrapping_add(1); + write(&[16, (self.repeat_count - 3) as u8][..])?; + } + self.repeat_count = 0; + } + + Ok(()) + } + + fn zero_code_size( + &mut self, + packed_code_sizes: &mut [u8], + packed_pos: &mut usize, + h: &mut HuffmanOxide, + ) -> Result<()> { + let mut write = |buf| write(buf, packed_code_sizes, packed_pos); + let counts = &mut h.count[HUFF_CODES_TABLE]; + if self.z_count != 0 { + if self.z_count < 3 { + counts[0] = counts[0].wrapping_add(self.z_count as u16); + write(&[0, 0, 0][..self.z_count as usize])?; + } else if self.z_count <= 10 { + counts[17] = counts[17].wrapping_add(1); + write(&[17, (self.z_count - 3) as u8][..])?; + } else { + counts[18] = counts[18].wrapping_add(1); + write(&[18, (self.z_count - 11) as u8][..])?; + } + self.z_count = 0; + } + + Ok(()) + } +} + +fn write(src: &[u8], dst: &mut [u8], dst_pos: &mut usize) -> Result<()> { + match dst.get_mut(*dst_pos..*dst_pos + src.len()) { + Some(s) => s.copy_from_slice(src), + None => return Err(Error {}), + } + *dst_pos += src.len(); + Ok(()) +} + +impl Default for HuffmanOxide { + fn default() -> Self { + HuffmanOxide { + count: [[0; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], + codes: [[0; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], + code_sizes: [[0; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], + } + } +} + +impl HuffmanOxide { + fn radix_sort_symbols<'a>( + 
symbols0: &'a mut [SymFreq], + symbols1: &'a mut [SymFreq], + ) -> &'a mut [SymFreq] { + let mut hist = [[0; 256]; 2]; + + for freq in symbols0.iter() { + hist[0][(freq.key & 0xFF) as usize] += 1; + hist[1][((freq.key >> 8) & 0xFF) as usize] += 1; + } + + let mut n_passes = 2; + if symbols0.len() == hist[1][0] { + n_passes -= 1; + } + + let mut current_symbols = symbols0; + let mut new_symbols = symbols1; + + for (pass, hist_item) in hist.iter().enumerate().take(n_passes) { + let mut offsets = [0; 256]; + let mut offset = 0; + for i in 0..256 { + offsets[i] = offset; + offset += hist_item[i]; + } + + for sym in current_symbols.iter() { + let j = ((sym.key >> (pass * 8)) & 0xFF) as usize; + new_symbols[offsets[j]] = *sym; + offsets[j] += 1; + } + + mem::swap(&mut current_symbols, &mut new_symbols); + } + + current_symbols + } + + fn calculate_minimum_redundancy(symbols: &mut [SymFreq]) { + match symbols.len() { + 0 => (), + 1 => symbols[0].key = 1, + n => { + symbols[0].key += symbols[1].key; + let mut root = 0; + let mut leaf = 2; + for next in 1..n - 1 { + if (leaf >= n) || (symbols[root].key < symbols[leaf].key) { + symbols[next].key = symbols[root].key; + symbols[root].key = next as u16; + root += 1; + } else { + symbols[next].key = symbols[leaf].key; + leaf += 1; + } + + if (leaf >= n) || (root < next && symbols[root].key < symbols[leaf].key) { + symbols[next].key = symbols[next].key.wrapping_add(symbols[root].key); + symbols[root].key = next as u16; + root += 1; + } else { + symbols[next].key = symbols[next].key.wrapping_add(symbols[leaf].key); + leaf += 1; + } + } + + symbols[n - 2].key = 0; + for next in (0..n - 2).rev() { + symbols[next].key = symbols[symbols[next].key as usize].key + 1; + } + + let mut avbl = 1; + let mut used = 0; + let mut dpth = 0; + let mut root = (n - 2) as i32; + let mut next = (n - 1) as i32; + while avbl > 0 { + while (root >= 0) && (symbols[root as usize].key == dpth) { + used += 1; + root -= 1; + } + while avbl > used { + 
symbols[next as usize].key = dpth; + next -= 1; + avbl -= 1; + } + avbl = 2 * used; + dpth += 1; + used = 0; + } + } + } + } + + fn enforce_max_code_size(num_codes: &mut [i32], code_list_len: usize, max_code_size: usize) { + if code_list_len <= 1 { + return; + } + + num_codes[max_code_size] += num_codes[max_code_size + 1..].iter().sum::(); + let total = num_codes[1..=max_code_size] + .iter() + .rev() + .enumerate() + .fold(0u32, |total, (i, &x)| total + ((x as u32) << i)); + + for _ in (1 << max_code_size)..total { + num_codes[max_code_size] -= 1; + for i in (1..max_code_size).rev() { + if num_codes[i] != 0 { + num_codes[i] -= 1; + num_codes[i + 1] += 2; + break; + } + } + } + } + + fn optimize_table( + &mut self, + table_num: usize, + table_len: usize, + code_size_limit: usize, + static_table: bool, + ) { + let mut num_codes = [0i32; MAX_SUPPORTED_HUFF_CODESIZE + 1]; + let mut next_code = [0u32; MAX_SUPPORTED_HUFF_CODESIZE + 1]; + + if static_table { + for &code_size in &self.code_sizes[table_num][..table_len] { + num_codes[code_size as usize] += 1; + } + } else { + let mut symbols0 = [SymFreq { + key: 0, + sym_index: 0, + }; MAX_HUFF_SYMBOLS]; + let mut symbols1 = [SymFreq { + key: 0, + sym_index: 0, + }; MAX_HUFF_SYMBOLS]; + + let mut num_used_symbols = 0; + for i in 0..table_len { + if self.count[table_num][i] != 0 { + symbols0[num_used_symbols] = SymFreq { + key: self.count[table_num][i], + sym_index: i as u16, + }; + num_used_symbols += 1; + } + } + + let symbols = Self::radix_sort_symbols( + &mut symbols0[..num_used_symbols], + &mut symbols1[..num_used_symbols], + ); + Self::calculate_minimum_redundancy(symbols); + + for symbol in symbols.iter() { + num_codes[symbol.key as usize] += 1; + } + + Self::enforce_max_code_size(&mut num_codes, num_used_symbols, code_size_limit); + + memset(&mut self.code_sizes[table_num][..], 0); + memset(&mut self.codes[table_num][..], 0); + + let mut last = num_used_symbols; + for (i, &num_item) in num_codes + .iter() + 
.enumerate() + .take(code_size_limit + 1) + .skip(1) + { + let first = last - num_item as usize; + for symbol in &symbols[first..last] { + self.code_sizes[table_num][symbol.sym_index as usize] = i as u8; + } + last = first; + } + } + + let mut j = 0; + next_code[1] = 0; + for i in 2..=code_size_limit { + j = (j + num_codes[i - 1]) << 1; + next_code[i] = j as u32; + } + + for (&code_size, huff_code) in self.code_sizes[table_num] + .iter() + .take(table_len) + .zip(self.codes[table_num].iter_mut().take(table_len)) + { + if code_size == 0 { + continue; + } + + let mut code = next_code[code_size as usize]; + next_code[code_size as usize] += 1; + + let mut rev_code = 0; + for _ in 0..code_size { + rev_code = (rev_code << 1) | (code & 1); + code >>= 1; + } + *huff_code = rev_code as u16; + } + } + + fn start_static_block(&mut self, output: &mut OutputBufferOxide) { + memset(&mut self.code_sizes[LITLEN_TABLE][0..144], 8); + memset(&mut self.code_sizes[LITLEN_TABLE][144..256], 9); + memset(&mut self.code_sizes[LITLEN_TABLE][256..280], 7); + memset(&mut self.code_sizes[LITLEN_TABLE][280..288], 8); + + memset(&mut self.code_sizes[DIST_TABLE][..32], 5); + + self.optimize_table(LITLEN_TABLE, 288, 15, true); + self.optimize_table(DIST_TABLE, 32, 15, true); + + output.put_bits(0b01, 2) + } + + fn start_dynamic_block(&mut self, output: &mut OutputBufferOxide) -> Result<()> { + // There will always be one, and only one end of block code. 
+ self.count[0][256] = 1; + + self.optimize_table(0, MAX_HUFF_SYMBOLS_0, 15, false); + self.optimize_table(1, MAX_HUFF_SYMBOLS_1, 15, false); + + let num_lit_codes = 286 + - &self.code_sizes[0][257..286] + .iter() + .rev() + .take_while(|&x| *x == 0) + .count(); + + let num_dist_codes = 30 + - &self.code_sizes[1][1..30] + .iter() + .rev() + .take_while(|&x| *x == 0) + .count(); + + let mut code_sizes_to_pack = [0u8; MAX_HUFF_SYMBOLS_0 + MAX_HUFF_SYMBOLS_1]; + let mut packed_code_sizes = [0u8; MAX_HUFF_SYMBOLS_0 + MAX_HUFF_SYMBOLS_1]; + + let total_code_sizes_to_pack = num_lit_codes + num_dist_codes; + + code_sizes_to_pack[..num_lit_codes].copy_from_slice(&self.code_sizes[0][..num_lit_codes]); + + code_sizes_to_pack[num_lit_codes..total_code_sizes_to_pack] + .copy_from_slice(&self.code_sizes[1][..num_dist_codes]); + + let mut rle = Rle { + z_count: 0, + repeat_count: 0, + prev_code_size: 0xFF, + }; + + memset(&mut self.count[HUFF_CODES_TABLE][..MAX_HUFF_SYMBOLS_2], 0); + + let mut packed_pos = 0; + for &code_size in &code_sizes_to_pack[..total_code_sizes_to_pack] { + if code_size == 0 { + rle.prev_code_size(&mut packed_code_sizes, &mut packed_pos, self)?; + rle.z_count += 1; + if rle.z_count == 138 { + rle.zero_code_size(&mut packed_code_sizes, &mut packed_pos, self)?; + } + } else { + rle.zero_code_size(&mut packed_code_sizes, &mut packed_pos, self)?; + if code_size != rle.prev_code_size { + rle.prev_code_size(&mut packed_code_sizes, &mut packed_pos, self)?; + self.count[HUFF_CODES_TABLE][code_size as usize] = + self.count[HUFF_CODES_TABLE][code_size as usize].wrapping_add(1); + write(&[code_size], &mut packed_code_sizes, &mut packed_pos)?; + } else { + rle.repeat_count += 1; + if rle.repeat_count == 6 { + rle.prev_code_size(&mut packed_code_sizes, &mut packed_pos, self)?; + } + } + } + rle.prev_code_size = code_size; + } + + if rle.repeat_count != 0 { + rle.prev_code_size(&mut packed_code_sizes, &mut packed_pos, self)?; + } else { + rle.zero_code_size(&mut 
packed_code_sizes, &mut packed_pos, self)?; + } + + self.optimize_table(2, MAX_HUFF_SYMBOLS_2, 7, false); + + output.put_bits(2, 2); + + output.put_bits((num_lit_codes - 257) as u32, 5); + output.put_bits((num_dist_codes - 1) as u32, 5); + + let mut num_bit_lengths = 18 + - HUFFMAN_LENGTH_ORDER + .iter() + .rev() + .take_while(|&swizzle| self.code_sizes[HUFF_CODES_TABLE][*swizzle as usize] == 0) + .count(); + + num_bit_lengths = cmp::max(4, num_bit_lengths + 1); + output.put_bits(num_bit_lengths as u32 - 4, 4); + for &swizzle in &HUFFMAN_LENGTH_ORDER[..num_bit_lengths] { + output.put_bits( + u32::from(self.code_sizes[HUFF_CODES_TABLE][swizzle as usize]), + 3, + ); + } + + let mut packed_code_size_index = 0; + while packed_code_size_index < packed_pos { + let code = packed_code_sizes[packed_code_size_index] as usize; + packed_code_size_index += 1; + assert!(code < MAX_HUFF_SYMBOLS_2); + output.put_bits( + u32::from(self.codes[HUFF_CODES_TABLE][code]), + u32::from(self.code_sizes[HUFF_CODES_TABLE][code]), + ); + if code >= 16 { + output.put_bits( + u32::from(packed_code_sizes[packed_code_size_index]), + [2, 3, 7][code - 16], + ); + packed_code_size_index += 1; + } + } + + Ok(()) + } +} + +struct DictOxide { + /// The maximum number of checks in the hash chain, for the initial, + /// and the lazy match respectively. + pub max_probes: [u32; 2], + /// Buffer of input data. + /// Padded with 1 byte to simplify matching code in `compress_fast`. 
+ pub b: Box, + + pub code_buf_dict_pos: usize, + pub lookahead_size: usize, + pub lookahead_pos: usize, + pub size: usize, +} + +const fn probes_from_flags(flags: u32) -> [u32; 2] { + [ + 1 + ((flags & 0xFFF) + 2) / 3, + 1 + (((flags & 0xFFF) >> 2) + 2) / 3, + ] +} + +impl DictOxide { + fn new(flags: u32) -> Self { + DictOxide { + max_probes: probes_from_flags(flags), + b: Box::default(), + code_buf_dict_pos: 0, + lookahead_size: 0, + lookahead_pos: 0, + size: 0, + } + } + + fn update_flags(&mut self, flags: u32) { + self.max_probes = probes_from_flags(flags); + } + + fn reset(&mut self) { + self.b.reset(); + self.code_buf_dict_pos = 0; + self.lookahead_size = 0; + self.lookahead_pos = 0; + self.size = 0; + } + + /// Do an unaligned read of the data at `pos` in the dictionary and treat it as if it was of + /// type T. + #[inline] + fn read_unaligned_u32(&self, pos: usize) -> u32 { + // Masking the value here helps avoid bounds checks. + let pos = pos & LZ_DICT_SIZE_MASK; + let end = pos + 4; + // Somehow this assertion makes things faster. + assert!(end < LZ_DICT_FULL_SIZE); + + let bytes: [u8; 4] = self.b.dict[pos..end].try_into().unwrap(); + u32::from_le_bytes(bytes) + } + + /// Do an unaligned read of the data at `pos` in the dictionary and treat it as if it was of + /// type T. + #[inline] + fn read_unaligned_u64(&self, pos: usize) -> u64 { + let bytes: [u8; 8] = self.b.dict[pos..pos + 8].try_into().unwrap(); + u64::from_le_bytes(bytes) + } + + /// Do an unaligned read of the data at `pos` in the dictionary and treat it as if it was of + /// type T. + #[inline] + fn read_as_u16(&self, pos: usize) -> u16 { + read_u16_le(&self.b.dict[..], pos) + } + + /// Try to find a match for the data at lookahead_pos in the dictionary that is + /// longer than `match_len`. + /// Returns a tuple containing (match_distance, match_length). Will be equal to the input + /// values if no better matches were found. 
/// Walk the hash chain for the data at `lookahead_pos` and look for an earlier position in
/// the dictionary that matches it for longer than the match passed in.
///
/// # Parameters
/// `lookahead_pos`: Unmasked position of the data to match; it is masked with
/// `LZ_DICT_SIZE_MASK` before being used as a dictionary index.
/// `max_dist`: Largest distance a candidate may have; the chain walk stops at the first
/// candidate that is further away.
/// `max_match_len`: Upper bound for the returned length (additionally clamped to
/// `MAX_MATCH_LEN`).
/// `match_dist`, `match_len`: The best match known so far. They are returned as-is
/// (with `match_len` raised to at least 1) when nothing longer is found.
///
/// # Returns
/// A `(distance, length)` tuple describing the best match found.
fn find_match(
    &self,
    lookahead_pos: usize,
    max_dist: usize,
    max_match_len: u32,
    mut match_dist: u32,
    mut match_len: u32,
) -> (u32, u32) {
    // Clamp the match len and max_match_len to be valid. (It should be when this is called, but
    // do it for now just in case for safety reasons.)
    // This should normally end up as at worst conditional moves,
    // so it shouldn't slow us down much.
    // TODO: Statically verify these so we don't need to do this.
    let max_match_len = cmp::min(MAX_MATCH_LEN as u32, max_match_len);
    match_len = cmp::max(match_len, 1);

    let pos = lookahead_pos & LZ_DICT_SIZE_MASK;
    let mut probe_pos = pos;
    // Number of probes into the hash chains.
    // A separate probe budget is selected once the incoming match is already 32 bytes or longer.
    let mut num_probes_left = self.max_probes[(match_len >= 32) as usize];

    // If we already have a match of the full length don't bother searching for another one.
    if max_match_len <= match_len {
        return (match_dist, match_len);
    }

    // Read the last byte of the current match, and the next one, used to compare matches.
    // A candidate can only be longer than the current match if these two bytes agree.
    let mut c01: u16 = self.read_as_u16(pos + match_len as usize - 1);
    // Read the first two bytes at the current position.
    let s01: u16 = self.read_as_u16(pos);

    'outer: loop {
        let mut dist;
        'found: loop {
            num_probes_left -= 1;
            if num_probes_left == 0 {
                // We have done as many probes in the hash chain as the current compression
                // settings allow, so return the best match we found, if any.
                return (match_dist, match_len);
            }

            // Follow up to three chain links per probe before re-checking the probe budget.
            for _ in 0..3 {
                let next_probe_pos = self.b.next[probe_pos] as usize;

                dist = (lookahead_pos - next_probe_pos) & 0xFFFF;
                if next_probe_pos == 0 || dist > max_dist {
                    // We reached the end of the hash chain, or the next value is further away
                    // than the maximum allowed distance, so return the best match we found, if
                    // any.
                    return (match_dist, match_len);
                }

                // Mask the position value to get the position in the hash chain of the next
                // position to match against.
                probe_pos = next_probe_pos & LZ_DICT_SIZE_MASK;

                if self.read_as_u16(probe_pos + match_len as usize - 1) == c01 {
                    break 'found;
                }
            }
        }

        if dist == 0 {
            // We've looked through the whole match range, so return the best match we
            // found.
            return (match_dist, match_len);
        }

        // Check if the two first bytes match.
        if self.read_as_u16(probe_pos) != s01 {
            continue;
        }

        let mut p = pos + 2;
        let mut q = probe_pos + 2;
        // The first two bytes matched, so check the full length of the match.
        for _ in 0..32 {
            let p_data: u64 = self.read_unaligned_u64(p);
            let q_data: u64 = self.read_unaligned_u64(q);
            // Compare 8 bytes at a time by using unaligned loads of 64-bit integers.
            let xor_data = p_data ^ q_data;
            if xor_data == 0 {
                p += 8;
                q += 8;
            } else {
                // If not all of the last 8 bytes matched, check how many of them did.
                // NOTE(review): counting whole matching bytes via trailing zero bits assumes
                // `read_unaligned_u64` yields the bytes in little-endian order - confirm.
                let trailing = xor_data.trailing_zeros();

                let probe_len = p - pos + (trailing as usize >> 3);
                if probe_len > match_len as usize {
                    match_dist = dist as u32;
                    match_len = cmp::min(max_match_len, probe_len as u32);
                    if match_len == max_match_len {
                        // We found a match that had the maximum allowed length,
                        // so there is no point searching further.
                        return (match_dist, match_len);
                    }
                    // We found a better match, so save the last two bytes for further match
                    // comparisons.
                    c01 = self.read_as_u16(pos + match_len as usize - 1)
                }
                continue 'outer;
            }
        }

        // All 32 eight-byte comparisons were equal, i.e. 2 + 256 bytes matched, so this
        // candidate is reported with the largest length we are allowed to return.
        return (dist as u32, cmp::min(max_match_len, MAX_MATCH_LEN as u32));
    }
}
+ // (Could maybe use usize, but it's not possible to exceed a block size of ) + pub total_bytes: u32, + pub num_flags_left: u32, +} + +impl LZOxide { + const fn new() -> Self { + LZOxide { + codes: [0; LZ_CODE_BUF_SIZE], + code_position: 1, + flag_position: 0, + total_bytes: 0, + num_flags_left: 8, + } + } + + fn write_code(&mut self, val: u8) { + self.codes[self.code_position] = val; + self.code_position += 1; + } + + fn init_flag(&mut self) { + if self.num_flags_left == 8 { + *self.get_flag() = 0; + self.code_position -= 1; + } else { + *self.get_flag() >>= self.num_flags_left; + } + } + + fn get_flag(&mut self) -> &mut u8 { + &mut self.codes[self.flag_position] + } + + fn plant_flag(&mut self) { + self.flag_position = self.code_position; + self.code_position += 1; + } + + fn consume_flag(&mut self) { + self.num_flags_left -= 1; + if self.num_flags_left == 0 { + self.num_flags_left = 8; + self.plant_flag(); + } + } +} + +fn compress_lz_codes( + huff: &HuffmanOxide, + output: &mut OutputBufferOxide, + lz_code_buf: &[u8], +) -> Result { + let mut flags = 1; + let mut bb = BitBuffer { + bit_buffer: u64::from(output.bit_buffer), + bits_in: output.bits_in, + }; + + let mut i: usize = 0; + while i < lz_code_buf.len() { + if flags == 1 { + flags = u32::from(lz_code_buf[i]) | 0x100; + i += 1; + } + + // The lz code was a length code + if flags & 1 == 1 { + flags >>= 1; + + let sym; + let num_extra_bits; + + let match_len = lz_code_buf[i] as usize; + + let match_dist = read_u16_le(lz_code_buf, i + 1); + + i += 3; + + debug_assert!(huff.code_sizes[0][LEN_SYM[match_len] as usize] != 0); + bb.put_fast( + u64::from(huff.codes[0][LEN_SYM[match_len] as usize]), + u32::from(huff.code_sizes[0][LEN_SYM[match_len] as usize]), + ); + bb.put_fast( + match_len as u64 & u64::from(BITMASKS[LEN_EXTRA[match_len] as usize]), + u32::from(LEN_EXTRA[match_len]), + ); + + if match_dist < 512 { + sym = SMALL_DIST_SYM[match_dist as usize] as usize; + num_extra_bits = SMALL_DIST_EXTRA[match_dist 
as usize] as usize; + } else { + sym = LARGE_DIST_SYM[(match_dist >> 8) as usize] as usize; + num_extra_bits = LARGE_DIST_EXTRA[(match_dist >> 8) as usize] as usize; + } + + debug_assert!(huff.code_sizes[1][sym] != 0); + bb.put_fast( + u64::from(huff.codes[1][sym]), + u32::from(huff.code_sizes[1][sym]), + ); + bb.put_fast( + u64::from(match_dist) & u64::from(BITMASKS[num_extra_bits]), + num_extra_bits as u32, + ); + } else { + // The lz code was a literal + for _ in 0..3 { + flags >>= 1; + let lit = lz_code_buf[i]; + i += 1; + + debug_assert!(huff.code_sizes[0][lit as usize] != 0); + bb.put_fast( + u64::from(huff.codes[0][lit as usize]), + u32::from(huff.code_sizes[0][lit as usize]), + ); + + if flags & 1 == 1 || i >= lz_code_buf.len() { + break; + } + } + } + + bb.flush(output)?; + } + + output.bits_in = 0; + output.bit_buffer = 0; + while bb.bits_in != 0 { + let n = cmp::min(bb.bits_in, 16); + output.put_bits(bb.bit_buffer as u32 & BITMASKS[n as usize], n); + bb.bit_buffer >>= n; + bb.bits_in -= n; + } + + // Output the end of block symbol. 
+ output.put_bits( + u32::from(huff.codes[0][256]), + u32::from(huff.code_sizes[0][256]), + ); + + Ok(true) +} + +fn compress_block( + huff: &mut HuffmanOxide, + output: &mut OutputBufferOxide, + lz: &LZOxide, + static_block: bool, +) -> Result { + if static_block { + huff.start_static_block(output); + } else { + huff.start_dynamic_block(output)?; + } + + compress_lz_codes(huff, output, &lz.codes[..lz.code_position]) +} + +fn flush_block( + d: &mut CompressorOxide, + callback: &mut CallbackOxide, + flush: TDEFLFlush, +) -> Result { + let mut saved_buffer; + { + let mut output = callback + .out + .new_output_buffer(&mut d.params.local_buf.b, d.params.out_buf_ofs); + output.bit_buffer = d.params.saved_bit_buffer; + output.bits_in = d.params.saved_bits_in; + + let use_raw_block = (d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0) + && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos) <= d.dict.size; + + assert!(d.params.flush_remaining == 0); + d.params.flush_ofs = 0; + d.params.flush_remaining = 0; + + d.lz.init_flag(); + + // If we are at the start of the stream, write the zlib header if requested. + if d.params.flags & TDEFL_WRITE_ZLIB_HEADER != 0 && d.params.block_index == 0 { + let header = zlib::header_from_flags(d.params.flags); + output.put_bits(header[0].into(), 8); + output.put_bits(header[1].into(), 8); + } + + // Output the block header. + output.put_bits((flush == TDEFLFlush::Finish) as u32, 1); + + saved_buffer = output.save(); + + let comp_success = if !use_raw_block { + let use_static = + (d.params.flags & TDEFL_FORCE_ALL_STATIC_BLOCKS != 0) || (d.lz.total_bytes < 48); + compress_block(&mut d.huff, &mut output, &d.lz, use_static)? + } else { + false + }; + + // If we failed to compress anything and the output would take up more space than the output + // data, output a stored block instead, which has at most 5 bytes of overhead. + // We only use some simple heuristics for now. 
+ // A stored block will have an overhead of at least 4 bytes containing the block length + // but usually more due to the length parameters having to start at a byte boundary and thus + // requiring up to 5 bytes of padding. + // As a static block will have an overhead of at most 1 bit per byte + // (as literals are either 8 or 9 bytes), a raw block will + // never take up less space if the number of input bytes are less than 32. + let expanded = (d.lz.total_bytes > 32) + && (output.inner_pos - saved_buffer.pos + 1 >= (d.lz.total_bytes as usize)) + && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos <= d.dict.size); + + if use_raw_block || expanded { + output.load(saved_buffer); + + // Block header. + output.put_bits(0, 2); + + // Block length has to start on a byte boundary, s opad. + output.pad_to_bytes(); + + // Block length and ones complement of block length. + output.put_bits(d.lz.total_bytes & 0xFFFF, 16); + output.put_bits(!d.lz.total_bytes & 0xFFFF, 16); + + // Write the actual bytes. + for i in 0..d.lz.total_bytes { + let pos = (d.dict.code_buf_dict_pos + i as usize) & LZ_DICT_SIZE_MASK; + output.put_bits(u32::from(d.dict.b.dict[pos]), 8); + } + } else if !comp_success { + output.load(saved_buffer); + compress_block(&mut d.huff, &mut output, &d.lz, true)?; + } + + if flush != TDEFLFlush::None { + if flush == TDEFLFlush::Finish { + output.pad_to_bytes(); + if d.params.flags & TDEFL_WRITE_ZLIB_HEADER != 0 { + let mut adler = d.params.adler32; + for _ in 0..4 { + output.put_bits((adler >> 24) & 0xFF, 8); + adler <<= 8; + } + } + } else { + // Sync or Full flush. + // Output an empty raw block. 
+ output.put_bits(0, 3); + output.pad_to_bytes(); + output.put_bits(0, 16); + output.put_bits(0xFFFF, 16); + } + } + + memset(&mut d.huff.count[0][..MAX_HUFF_SYMBOLS_0], 0); + memset(&mut d.huff.count[1][..MAX_HUFF_SYMBOLS_1], 0); + + d.lz.code_position = 1; + d.lz.flag_position = 0; + d.lz.num_flags_left = 8; + d.dict.code_buf_dict_pos += d.lz.total_bytes as usize; + d.lz.total_bytes = 0; + d.params.block_index += 1; + + saved_buffer = output.save(); + + d.params.saved_bit_buffer = saved_buffer.bit_buffer; + d.params.saved_bits_in = saved_buffer.bits_in; + } + + Ok(callback.flush_output(saved_buffer, &mut d.params)) +} + +fn record_literal(h: &mut HuffmanOxide, lz: &mut LZOxide, lit: u8) { + lz.total_bytes += 1; + lz.write_code(lit); + + *lz.get_flag() >>= 1; + lz.consume_flag(); + + h.count[0][lit as usize] += 1; +} + +fn record_match(h: &mut HuffmanOxide, lz: &mut LZOxide, mut match_len: u32, mut match_dist: u32) { + assert!(match_len >= MIN_MATCH_LEN.into()); + assert!(match_dist >= 1); + assert!(match_dist as usize <= LZ_DICT_SIZE); + + lz.total_bytes += match_len; + match_dist -= 1; + match_len -= u32::from(MIN_MATCH_LEN); + lz.write_code(match_len as u8); + lz.write_code(match_dist as u8); + lz.write_code((match_dist >> 8) as u8); + + *lz.get_flag() >>= 1; + *lz.get_flag() |= 0x80; + lz.consume_flag(); + + let symbol = if match_dist < 512 { + SMALL_DIST_SYM[match_dist as usize] + } else { + LARGE_DIST_SYM[((match_dist >> 8) & 127) as usize] + } as usize; + h.count[1][symbol] += 1; + h.count[0][LEN_SYM[match_len as usize] as usize] += 1; +} + +fn compress_normal(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool { + let mut src_pos = d.params.src_pos; + let in_buf = match callback.in_buf { + None => return true, + Some(in_buf) => in_buf, + }; + + let mut lookahead_size = d.dict.lookahead_size; + let mut lookahead_pos = d.dict.lookahead_pos; + let mut saved_lit = d.params.saved_lit; + let mut saved_match_dist = d.params.saved_match_dist; + let mut 
saved_match_len = d.params.saved_match_len; + + while src_pos < in_buf.len() || (d.params.flush != TDEFLFlush::None && lookahead_size != 0) { + let src_buf_left = in_buf.len() - src_pos; + let num_bytes_to_process = cmp::min(src_buf_left, MAX_MATCH_LEN - lookahead_size); + + if lookahead_size + d.dict.size >= usize::from(MIN_MATCH_LEN) - 1 + && num_bytes_to_process > 0 + { + let dictb = &mut d.dict.b; + + let mut dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + let mut ins_pos = lookahead_pos + lookahead_size - 2; + // Start the hash value from the first two bytes + let mut hash = update_hash( + u16::from(dictb.dict[ins_pos & LZ_DICT_SIZE_MASK]), + dictb.dict[(ins_pos + 1) & LZ_DICT_SIZE_MASK], + ); + + lookahead_size += num_bytes_to_process; + + for &c in &in_buf[src_pos..src_pos + num_bytes_to_process] { + // Add byte to input buffer. + dictb.dict[dst_pos] = c; + if dst_pos < MAX_MATCH_LEN - 1 { + dictb.dict[LZ_DICT_SIZE + dst_pos] = c; + } + + // Generate hash from the current byte, + hash = update_hash(hash, c); + dictb.next[ins_pos & LZ_DICT_SIZE_MASK] = dictb.hash[hash as usize]; + // and insert it into the hash chain. 
+ dictb.hash[hash as usize] = ins_pos as u16; + dst_pos = (dst_pos + 1) & LZ_DICT_SIZE_MASK; + ins_pos += 1; + } + src_pos += num_bytes_to_process; + } else { + let dictb = &mut d.dict.b; + for &c in &in_buf[src_pos..src_pos + num_bytes_to_process] { + let dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + dictb.dict[dst_pos] = c; + if dst_pos < MAX_MATCH_LEN - 1 { + dictb.dict[LZ_DICT_SIZE + dst_pos] = c; + } + + lookahead_size += 1; + if lookahead_size + d.dict.size >= MIN_MATCH_LEN.into() { + let ins_pos = lookahead_pos + lookahead_size - 3; + let hash = ((u32::from(dictb.dict[ins_pos & LZ_DICT_SIZE_MASK]) + << (LZ_HASH_SHIFT * 2)) + ^ ((u32::from(dictb.dict[(ins_pos + 1) & LZ_DICT_SIZE_MASK]) + << LZ_HASH_SHIFT) + ^ u32::from(c))) + & (LZ_HASH_SIZE as u32 - 1); + + dictb.next[ins_pos & LZ_DICT_SIZE_MASK] = dictb.hash[hash as usize]; + dictb.hash[hash as usize] = ins_pos as u16; + } + } + + src_pos += num_bytes_to_process; + } + + d.dict.size = cmp::min(LZ_DICT_SIZE - lookahead_size, d.dict.size); + if d.params.flush == TDEFLFlush::None && lookahead_size < MAX_MATCH_LEN { + break; + } + + let mut len_to_move = 1; + let mut cur_match_dist = 0; + let mut cur_match_len = if saved_match_len != 0 { + saved_match_len + } else { + u32::from(MIN_MATCH_LEN) - 1 + }; + let cur_pos = lookahead_pos & LZ_DICT_SIZE_MASK; + if d.params.flags & (TDEFL_RLE_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS) != 0 { + // If TDEFL_RLE_MATCHES is set, we only look for repeating sequences of the current byte. + if d.dict.size != 0 && d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS == 0 { + let c = d.dict.b.dict[(cur_pos.wrapping_sub(1)) & LZ_DICT_SIZE_MASK]; + cur_match_len = d.dict.b.dict[cur_pos..(cur_pos + lookahead_size)] + .iter() + .take_while(|&x| *x == c) + .count() as u32; + if cur_match_len < MIN_MATCH_LEN.into() { + cur_match_len = 0 + } else { + cur_match_dist = 1 + } + } + } else { + // Try to find a match for the bytes at the current position. 
+ let dist_len = d.dict.find_match( + lookahead_pos, + d.dict.size, + lookahead_size as u32, + cur_match_dist, + cur_match_len, + ); + cur_match_dist = dist_len.0; + cur_match_len = dist_len.1; + } + + let far_and_small = cur_match_len == MIN_MATCH_LEN.into() && cur_match_dist >= 8 * 1024; + let filter_small = d.params.flags & TDEFL_FILTER_MATCHES != 0 && cur_match_len <= 5; + if far_and_small || filter_small || cur_pos == cur_match_dist as usize { + cur_match_dist = 0; + cur_match_len = 0; + } + + if saved_match_len != 0 { + if cur_match_len > saved_match_len { + record_literal(&mut d.huff, &mut d.lz, saved_lit); + if cur_match_len >= 128 { + record_match(&mut d.huff, &mut d.lz, cur_match_len, cur_match_dist); + saved_match_len = 0; + len_to_move = cur_match_len as usize; + } else { + saved_lit = d.dict.b.dict[cur_pos]; + saved_match_dist = cur_match_dist; + saved_match_len = cur_match_len; + } + } else { + record_match(&mut d.huff, &mut d.lz, saved_match_len, saved_match_dist); + len_to_move = (saved_match_len - 1) as usize; + saved_match_len = 0; + } + } else if cur_match_dist == 0 { + record_literal( + &mut d.huff, + &mut d.lz, + d.dict.b.dict[cmp::min(cur_pos, d.dict.b.dict.len() - 1)], + ); + } else if d.params.greedy_parsing + || (d.params.flags & TDEFL_RLE_MATCHES != 0) + || cur_match_len >= 128 + { + // If we are using lazy matching, check for matches at the next byte if the current + // match was shorter than 128 bytes. 
/// Number of bytes `compress_fast` tries to buffer in the dictionary lookahead.
///
/// While no flush is requested, `compress_fast` keeps reading input until the lookahead
/// holds this many bytes before it starts looking for matches.
const COMP_FAST_LOOKAHEAD_SIZE: usize = 4096;
(d.params.flush != TDEFLFlush::None && lookahead_size > 0) { + let mut dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + let mut num_bytes_to_process = cmp::min( + in_buf.len() - src_pos, + COMP_FAST_LOOKAHEAD_SIZE - lookahead_size, + ); + lookahead_size += num_bytes_to_process; + + while num_bytes_to_process != 0 { + let n = cmp::min(LZ_DICT_SIZE - dst_pos, num_bytes_to_process); + d.dict.b.dict[dst_pos..dst_pos + n].copy_from_slice(&in_buf[src_pos..src_pos + n]); + + if dst_pos < MAX_MATCH_LEN - 1 { + let m = cmp::min(n, MAX_MATCH_LEN - 1 - dst_pos); + d.dict.b.dict[dst_pos + LZ_DICT_SIZE..dst_pos + LZ_DICT_SIZE + m] + .copy_from_slice(&in_buf[src_pos..src_pos + m]); + } + + src_pos += n; + dst_pos = (dst_pos + n) & LZ_DICT_SIZE_MASK; + num_bytes_to_process -= n; + } + + d.dict.size = cmp::min(LZ_DICT_SIZE - lookahead_size, d.dict.size); + if d.params.flush == TDEFLFlush::None && lookahead_size < COMP_FAST_LOOKAHEAD_SIZE { + break; + } + + while lookahead_size >= 4 { + let mut cur_match_len = 1; + + let first_trigram = d.dict.read_unaligned_u32(cur_pos) & 0xFF_FFFF; + + let hash = (first_trigram ^ (first_trigram >> (24 - (LZ_HASH_BITS - 8)))) + & LEVEL1_HASH_SIZE_MASK; + + let mut probe_pos = usize::from(d.dict.b.hash[hash as usize]); + d.dict.b.hash[hash as usize] = lookahead_pos as u16; + + let mut cur_match_dist = (lookahead_pos - probe_pos) as u16; + if cur_match_dist as usize <= d.dict.size { + probe_pos &= LZ_DICT_SIZE_MASK; + + let trigram = d.dict.read_unaligned_u32(probe_pos) & 0xFF_FFFF; + + if first_trigram == trigram { + // Trigram was tested, so we can start with "+ 3" displacement. 
+ let mut p = cur_pos + 3; + let mut q = probe_pos + 3; + cur_match_len = (|| { + for _ in 0..32 { + let p_data: u64 = d.dict.read_unaligned_u64(p); + let q_data: u64 = d.dict.read_unaligned_u64(q); + let xor_data = p_data ^ q_data; + if xor_data == 0 { + p += 8; + q += 8; + } else { + let trailing = xor_data.trailing_zeros(); + return p as u32 - cur_pos as u32 + (trailing >> 3); + } + } + + if cur_match_dist == 0 { + 0 + } else { + MAX_MATCH_LEN as u32 + } + })(); + + if cur_match_len < MIN_MATCH_LEN.into() + || (cur_match_len == MIN_MATCH_LEN.into() && cur_match_dist >= 8 * 1024) + { + let lit = first_trigram as u8; + cur_match_len = 1; + d.lz.write_code(lit); + *d.lz.get_flag() >>= 1; + d.huff.count[0][lit as usize] += 1; + } else { + // Limit the match to the length of the lookahead so we don't create a match + // that ends after the end of the input data. + cur_match_len = cmp::min(cur_match_len, lookahead_size as u32); + debug_assert!(cur_match_len >= MIN_MATCH_LEN.into()); + debug_assert!(cur_match_dist >= 1); + debug_assert!(cur_match_dist as usize <= LZ_DICT_SIZE); + cur_match_dist -= 1; + + d.lz.write_code((cur_match_len - u32::from(MIN_MATCH_LEN)) as u8); + d.lz.write_code(cur_match_dist as u8); + d.lz.write_code((cur_match_dist >> 8) as u8); + + *d.lz.get_flag() >>= 1; + *d.lz.get_flag() |= 0x80; + if cur_match_dist < 512 { + d.huff.count[1][SMALL_DIST_SYM[cur_match_dist as usize] as usize] += 1; + } else { + d.huff.count[1] + [LARGE_DIST_SYM[(cur_match_dist >> 8) as usize] as usize] += 1; + } + + d.huff.count[0][LEN_SYM[(cur_match_len - u32::from(MIN_MATCH_LEN)) as usize] + as usize] += 1; + } + } else { + d.lz.write_code(first_trigram as u8); + *d.lz.get_flag() >>= 1; + d.huff.count[0][first_trigram as u8 as usize] += 1; + } + + d.lz.consume_flag(); + d.lz.total_bytes += cur_match_len; + lookahead_pos += cur_match_len as usize; + d.dict.size = cmp::min(d.dict.size + cur_match_len as usize, LZ_DICT_SIZE); + cur_pos = (cur_pos + cur_match_len as usize) 
& LZ_DICT_SIZE_MASK; + lookahead_size -= cur_match_len as usize; + + if d.lz.code_position > LZ_CODE_BUF_SIZE - 8 { + // These values are used in flush_block, so we need to write them back here. + d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + + let n = match flush_block(d, callback, TDEFLFlush::None) { + Err(_) => { + d.params.src_pos = src_pos; + d.params.prev_return_status = TDEFLStatus::PutBufFailed; + return false; + } + Ok(status) => status, + }; + if n != 0 { + d.params.src_pos = src_pos; + return n > 0; + } + debug_assert!(d.lz.code_position < LZ_CODE_BUF_SIZE - 2); + + lookahead_size = d.dict.lookahead_size; + lookahead_pos = d.dict.lookahead_pos; + } + } + } + + while lookahead_size != 0 { + let lit = d.dict.b.dict[cur_pos]; + d.lz.total_bytes += 1; + d.lz.write_code(lit); + *d.lz.get_flag() >>= 1; + d.lz.consume_flag(); + + d.huff.count[0][lit as usize] += 1; + lookahead_pos += 1; + d.dict.size = cmp::min(d.dict.size + 1, LZ_DICT_SIZE); + cur_pos = (cur_pos + 1) & LZ_DICT_SIZE_MASK; + lookahead_size -= 1; + + if d.lz.code_position > LZ_CODE_BUF_SIZE - 8 { + // These values are used in flush_block, so we need to write them back here. 
+ d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + + let n = match flush_block(d, callback, TDEFLFlush::None) { + Err(_) => { + d.params.prev_return_status = TDEFLStatus::PutBufFailed; + d.params.src_pos = src_pos; + return false; + } + Ok(status) => status, + }; + if n != 0 { + d.params.src_pos = src_pos; + return n > 0; + } + + lookahead_size = d.dict.lookahead_size; + lookahead_pos = d.dict.lookahead_pos; + } + } + } + + d.params.src_pos = src_pos; + d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + true +} + +fn flush_output_buffer(c: &mut CallbackOxide, p: &mut ParamsOxide) -> (TDEFLStatus, usize, usize) { + let mut res = (TDEFLStatus::Okay, p.src_pos, 0); + if let CallbackOut::Buf(ref mut cb) = c.out { + let n = cmp::min(cb.out_buf.len() - p.out_buf_ofs, p.flush_remaining as usize); + if n != 0 { + cb.out_buf[p.out_buf_ofs..p.out_buf_ofs + n] + .copy_from_slice(&p.local_buf.b[p.flush_ofs as usize..p.flush_ofs as usize + n]); + } + p.flush_ofs += n as u32; + p.flush_remaining -= n as u32; + p.out_buf_ofs += n; + res.2 = p.out_buf_ofs; + } + + if p.finished && p.flush_remaining == 0 { + res.0 = TDEFLStatus::Done + } + res +} + +/// Main compression function. Tries to compress as much as possible from `in_buf` and +/// puts compressed output into `out_buf`. +/// +/// The value of `flush` determines if the compressor should attempt to flush all output +/// and alternatively try to finish the stream. +/// +/// Use [`TDEFLFlush::Finish`] on the final call to signal that the stream is finishing. +/// +/// Note that this function does not keep track of whether a flush marker has been output, so +/// if called using [`TDEFLFlush::Sync`], the caller needs to ensure there is enough space in the +/// output buffer if they want to avoid repeated flush markers. +/// See #105 for details. 
+/// +/// # Returns +/// Returns a tuple containing the current status of the compressor, the current position +/// in the input buffer and the current position in the output buffer. +pub fn compress( + d: &mut CompressorOxide, + in_buf: &[u8], + out_buf: &mut [u8], + flush: TDEFLFlush, +) -> (TDEFLStatus, usize, usize) { + compress_inner( + d, + &mut CallbackOxide::new_callback_buf(in_buf, out_buf), + flush, + ) +} + +/// Main compression function. Callbacks output. +/// +/// # Returns +/// Returns a tuple containing the current status of the compressor, the current position +/// in the input buffer. +/// +/// The caller is responsible for ensuring the `CallbackFunc` struct will not cause undefined +/// behaviour. +pub fn compress_to_output( + d: &mut CompressorOxide, + in_buf: &[u8], + flush: TDEFLFlush, + mut callback_func: impl FnMut(&[u8]) -> bool, +) -> (TDEFLStatus, usize) { + let res = compress_inner( + d, + &mut CallbackOxide::new_callback_func( + in_buf, + CallbackFunc { + put_buf_func: &mut callback_func, + }, + ), + flush, + ); + + (res.0, res.1) +} + +fn compress_inner( + d: &mut CompressorOxide, + callback: &mut CallbackOxide, + flush: TDEFLFlush, +) -> (TDEFLStatus, usize, usize) { + d.params.out_buf_ofs = 0; + d.params.src_pos = 0; + + let prev_ok = d.params.prev_return_status == TDEFLStatus::Okay; + let flush_finish_once = d.params.flush != TDEFLFlush::Finish || flush == TDEFLFlush::Finish; + + d.params.flush = flush; + if !prev_ok || !flush_finish_once { + d.params.prev_return_status = TDEFLStatus::BadParam; + return (d.params.prev_return_status, 0, 0); + } + + if d.params.flush_remaining != 0 || d.params.finished { + let res = flush_output_buffer(callback, &mut d.params); + d.params.prev_return_status = res.0; + return res; + } + + let one_probe = d.params.flags & MAX_PROBES_MASK as u32 == 1; + let greedy = d.params.flags & TDEFL_GREEDY_PARSING_FLAG != 0; + let filter_or_rle_or_raw = d.params.flags + & (TDEFL_FILTER_MATCHES | 
TDEFL_FORCE_ALL_RAW_BLOCKS | TDEFL_RLE_MATCHES) + != 0; + + let compress_success = if one_probe && greedy && !filter_or_rle_or_raw { + compress_fast(d, callback) + } else { + compress_normal(d, callback) + }; + + if !compress_success { + return ( + d.params.prev_return_status, + d.params.src_pos, + d.params.out_buf_ofs, + ); + } + + if let Some(in_buf) = callback.in_buf { + if d.params.flags & (TDEFL_WRITE_ZLIB_HEADER | TDEFL_COMPUTE_ADLER32) != 0 { + d.params.adler32 = update_adler32(d.params.adler32, &in_buf[..d.params.src_pos]); + } + } + + let flush_none = d.params.flush == TDEFLFlush::None; + let in_left = callback.in_buf.map_or(0, |buf| buf.len()) - d.params.src_pos; + let remaining = in_left != 0 || d.params.flush_remaining != 0; + if !flush_none && d.dict.lookahead_size == 0 && !remaining { + let flush = d.params.flush; + match flush_block(d, callback, flush) { + Err(_) => { + d.params.prev_return_status = TDEFLStatus::PutBufFailed; + return ( + d.params.prev_return_status, + d.params.src_pos, + d.params.out_buf_ofs, + ); + } + Ok(x) if x < 0 => { + return ( + d.params.prev_return_status, + d.params.src_pos, + d.params.out_buf_ofs, + ) + } + _ => { + d.params.finished = d.params.flush == TDEFLFlush::Finish; + if d.params.flush == TDEFLFlush::Full { + memset(&mut d.dict.b.hash[..], 0); + memset(&mut d.dict.b.next[..], 0); + d.dict.size = 0; + } + } + } + } + + let res = flush_output_buffer(callback, &mut d.params); + d.params.prev_return_status = res.0; + + res +} + +/// Create a set of compression flags using parameters used by zlib and other compressors. +/// Mainly intended for use with transition from c libraries as it deals with raw integers. +/// +/// # Parameters +/// `level` determines compression level. Clamped to maximum of 10. Negative values result in +/// `CompressionLevel::DefaultLevel`. +/// `window_bits`: Above 0, wraps the stream in a zlib wrapper, 0 or negative for a raw deflate +/// stream. 
+/// `strategy`: Sets the strategy if this conforms to any of the values in `CompressionStrategy`. +/// +/// # Notes +/// This function may be removed or moved to the `miniz_oxide_c_api` in the future. +pub fn create_comp_flags_from_zip_params(level: i32, window_bits: i32, strategy: i32) -> u32 { + let num_probes = (if level >= 0 { + cmp::min(10, level) + } else { + CompressionLevel::DefaultLevel as i32 + }) as usize; + let greedy = if level <= 3 { + TDEFL_GREEDY_PARSING_FLAG + } else { + 0 + }; + let mut comp_flags = NUM_PROBES[num_probes] | greedy; + + if window_bits > 0 { + comp_flags |= TDEFL_WRITE_ZLIB_HEADER; + } + + if level == 0 { + comp_flags |= TDEFL_FORCE_ALL_RAW_BLOCKS; + } else if strategy == CompressionStrategy::Filtered as i32 { + comp_flags |= TDEFL_FILTER_MATCHES; + } else if strategy == CompressionStrategy::HuffmanOnly as i32 { + comp_flags &= !MAX_PROBES_MASK as u32; + } else if strategy == CompressionStrategy::Fixed as i32 { + comp_flags |= TDEFL_FORCE_ALL_STATIC_BLOCKS; + } else if strategy == CompressionStrategy::RLE as i32 { + comp_flags |= TDEFL_RLE_MATCHES; + } + + comp_flags +} + +#[cfg(test)] +mod test { + use super::{ + compress_to_output, create_comp_flags_from_zip_params, read_u16_le, write_u16_le, + CompressionStrategy, CompressorOxide, TDEFLFlush, TDEFLStatus, DEFAULT_FLAGS, + MZ_DEFAULT_WINDOW_BITS, + }; + use crate::inflate::decompress_to_vec; + use alloc::vec; + + #[test] + fn u16_to_slice() { + let mut slice = [0, 0]; + write_u16_le(2000, &mut slice, 0); + assert_eq!(slice, [208, 7]); + } + + #[test] + fn u16_from_slice() { + let mut slice = [208, 7]; + assert_eq!(read_u16_le(&mut slice, 0), 2000); + } + + #[test] + fn compress_output() { + assert_eq!( + DEFAULT_FLAGS, + create_comp_flags_from_zip_params( + 4, + MZ_DEFAULT_WINDOW_BITS, + CompressionStrategy::Default as i32 + ) + ); + + let slice = [ + 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 6, 1, 2, 3, 1, 2, 3, 2, 3, 1, 2, 3, + ]; + let mut encoded = vec![]; + let flags = 
/// How much processing the compressor should do to compress the data.
/// `NoCompression` and `BestSpeed` have special meanings, the other levels determine the number
/// of checks for matches in the hash chains and whether to use lazy or greedy parsing.
///
/// The discriminants are the raw integer levels (`#[repr(i32)]`).
#[repr(i32)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum CompressionLevel {
    /// Don't do any compression, only output uncompressed blocks.
    NoCompression = 0,
    /// Fast compression. Uses a special compression routine that is optimized for speed.
    BestSpeed = 1,
    /// Slow/high compression. Do a lot of checks to try to find good matches.
    BestCompression = 9,
    /// Even more checks, can be very slow.
    UberCompression = 10,
    /// Default compromise between speed and compression.
    DefaultLevel = 6,
    /// Use the default compression level.
    DefaultCompression = -1,
}
PutBufFuncPtr, + put_buf_user: *mut c_void, + flags: c_int, +) -> bool*/ + +// Missing safe Rust analogue +/* +fn tdefl_compress_mem_to_mem( + out_buf: *mut c_void, + out_buf_len: usize, + src_buf: *const c_void, + src_buf_len: usize, + flags: c_int, +) -> usize*/ + +/// Compress the input data to a vector, using the specified compression level (0-10). +pub fn compress_to_vec(input: &[u8], level: u8) -> Vec { + compress_to_vec_inner(input, level, 0, 0) +} + +/// Compress the input data to a vector, using the specified compression level (0-10), and with a +/// zlib wrapper. +pub fn compress_to_vec_zlib(input: &[u8], level: u8) -> Vec { + compress_to_vec_inner(input, level, 1, 0) +} + +/// Simple function to compress data to a vec. +fn compress_to_vec_inner(mut input: &[u8], level: u8, window_bits: i32, strategy: i32) -> Vec { + // The comp flags function sets the zlib flag if the window_bits parameter is > 0. + let flags = create_comp_flags_from_zip_params(level.into(), window_bits, strategy); + let mut compressor = CompressorOxide::new(flags); + let mut output = vec![0; ::core::cmp::max(input.len() / 2, 2)]; + + let mut out_pos = 0; + loop { + let (status, bytes_in, bytes_out) = compress( + &mut compressor, + input, + &mut output[out_pos..], + TDEFLFlush::Finish, + ); + out_pos += bytes_out; + + match status { + TDEFLStatus::Done => { + output.truncate(out_pos); + break; + } + TDEFLStatus::Okay if bytes_in <= input.len() => { + input = &input[bytes_in..]; + + // We need more space, so resize the vector. + if output.len().saturating_sub(out_pos) < 30 { + output.resize(output.len() * 2, 0) + } + } + // Not supposed to happen unless there is a bug. + _ => panic!("Bug! Unexpectedly failed to compress!"), + } + } + + output +} + +#[cfg(test)] +mod test { + use super::{compress_to_vec, compress_to_vec_inner, CompressionStrategy}; + use crate::inflate::decompress_to_vec; + use alloc::vec; + + /// Test deflate example. 
+ /// + /// Check if the encoder produces the same code as the example given by Mark Adler here: + /// https://stackoverflow.com/questions/17398931/deflate-encoding-with-static-huffman-codes/17415203 + #[test] + fn compress_small() { + let test_data = b"Deflate late"; + let check = [ + 0x73, 0x49, 0x4d, 0xcb, 0x49, 0x2c, 0x49, 0x55, 0x00, 0x11, 0x00, + ]; + + let res = compress_to_vec(test_data, 1); + assert_eq!(&check[..], res.as_slice()); + + let res = compress_to_vec(test_data, 9); + assert_eq!(&check[..], res.as_slice()); + } + + #[test] + fn compress_huff_only() { + let test_data = b"Deflate late"; + + let res = compress_to_vec_inner(test_data, 1, 0, CompressionStrategy::HuffmanOnly as i32); + let d = decompress_to_vec(res.as_slice()).expect("Failed to decompress!"); + assert_eq!(test_data, d.as_slice()); + } + + /// Test that a raw block compresses fine. + #[test] + fn compress_raw() { + let text = b"Hello, zlib!"; + let encoded = { + let len = text.len(); + let notlen = !len; + let mut encoded = vec![ + 1, + len as u8, + (len >> 8) as u8, + notlen as u8, + (notlen >> 8) as u8, + ]; + encoded.extend_from_slice(&text[..]); + encoded + }; + + let res = compress_to_vec(text, 0); + assert_eq!(encoded, res.as_slice()); + } + + #[test] + fn short() { + let test_data = [10, 10, 10, 10, 10, 55]; + let c = compress_to_vec(&test_data, 9); + + let d = decompress_to_vec(c.as_slice()).expect("Failed to decompress!"); + assert_eq!(&test_data, d.as_slice()); + // Check that a static block is used here, rather than a raw block + // , so the data is actually compressed. + // (The optimal compressed length would be 5, but neither miniz nor zlib manages that either + // as neither checks matches against the byte at index 0.) 
+ assert!(c.len() <= 6); + } +} diff --git a/miniz_oxide-0.7.2/src/deflate/stream.rs b/miniz_oxide-0.7.2/src/deflate/stream.rs new file mode 100644 index 0000000000000..39aa82d924fed --- /dev/null +++ b/miniz_oxide-0.7.2/src/deflate/stream.rs @@ -0,0 +1,121 @@ +//! Extra streaming compression functionality. +//! +//! As of now this is mainly intended for use to build a higher-level wrapper. +//! +//! There is no DeflateState as the needed state is contained in the compressor struct itself. + +use crate::deflate::core::{compress, CompressorOxide, TDEFLFlush, TDEFLStatus}; +use crate::{MZError, MZFlush, MZStatus, StreamResult}; + +/// Try to compress from input to output with the given [`CompressorOxide`]. +/// +/// # Errors +/// +/// Returns [`MZError::Buf`] if the `output` slice is empty or no progress was made due +/// to lack of expected input data, or if called without [`MZFlush::Finish`] after the compression +/// was already finished. +/// +/// Returns [`MZError::Param`] if the compressor parameters are set wrong. +/// +/// Returns [`MZError::Stream`] when the lower-level compressor returns a +/// [`TDEFLStatus::PutBufFailed`]; may not actually be possible.
+pub fn deflate( + compressor: &mut CompressorOxide, + input: &[u8], + output: &mut [u8], + flush: MZFlush, +) -> StreamResult { + if output.is_empty() { + return StreamResult::error(MZError::Buf); + } + + if compressor.prev_return_status() == TDEFLStatus::Done { + return if flush == MZFlush::Finish { + StreamResult { + bytes_written: 0, + bytes_consumed: 0, + status: Ok(MZStatus::StreamEnd), + } + } else { + StreamResult::error(MZError::Buf) + }; + } + + let mut bytes_written = 0; + let mut bytes_consumed = 0; + + let mut next_in = input; + let mut next_out = output; + + let status = loop { + let in_bytes; + let out_bytes; + let defl_status = { + let res = compress(compressor, next_in, next_out, TDEFLFlush::from(flush)); + in_bytes = res.1; + out_bytes = res.2; + res.0 + }; + + next_in = &next_in[in_bytes..]; + next_out = &mut next_out[out_bytes..]; + bytes_consumed += in_bytes; + bytes_written += out_bytes; + + // Check if we are done, or compression failed. + match defl_status { + TDEFLStatus::BadParam => break Err(MZError::Param), + // Don't think this can happen as we're not using a custom callback. + TDEFLStatus::PutBufFailed => break Err(MZError::Stream), + TDEFLStatus::Done => break Ok(MZStatus::StreamEnd), + _ => (), + }; + + // All the output space was used, so wait for more. + if next_out.is_empty() { + break Ok(MZStatus::Ok); + } + + if next_in.is_empty() && (flush != MZFlush::Finish) { + let total_changed = bytes_written > 0 || bytes_consumed > 0; + + break if (flush != MZFlush::None) || total_changed { + // We wrote or consumed something, and/or did a flush (sync/partial etc.). + Ok(MZStatus::Ok) + } else { + // No more input data, not flushing, and nothing was consumed or written, + // so couldn't make any progress. 
+ Err(MZError::Buf) + }; + } + }; + StreamResult { + bytes_consumed, + bytes_written, + status, + } +} + +#[cfg(test)] +mod test { + use super::deflate; + use crate::deflate::CompressorOxide; + use crate::inflate::decompress_to_vec_zlib; + use crate::{MZFlush, MZStatus}; + use alloc::boxed::Box; + use alloc::vec; + + #[test] + fn test_state() { + let data = b"Hello zlib!"; + let mut compressed = vec![0; 50]; + let mut compressor = Box::::default(); + let res = deflate(&mut compressor, data, &mut compressed, MZFlush::Finish); + let status = res.status.expect("Failed to compress!"); + let decomp = + decompress_to_vec_zlib(&compressed).expect("Failed to decompress compressed data"); + assert_eq!(status, MZStatus::StreamEnd); + assert_eq!(decomp[..], data[..]); + assert_eq!(res.bytes_consumed, data.len()); + } +} diff --git a/miniz_oxide-0.7.2/src/inflate/core.rs b/miniz_oxide-0.7.2/src/inflate/core.rs new file mode 100644 index 0000000000000..75453f6c30e8f --- /dev/null +++ b/miniz_oxide-0.7.2/src/inflate/core.rs @@ -0,0 +1,2038 @@ +//! Streaming decompression functionality. + +use super::*; +use crate::shared::{update_adler32, HUFFMAN_LENGTH_ORDER}; +use ::core::cell::Cell; + +use ::core::convert::TryInto; +use ::core::{cmp, slice}; + +use self::output_buffer::OutputBuffer; + +pub const TINFL_LZ_DICT_SIZE: usize = 32_768; + +/// A struct containing huffman code lengths and the huffman code tree used by the decompressor. +struct HuffmanTable { + /// Length of the code at each index. + pub code_size: [u8; MAX_HUFF_SYMBOLS_0], + /// Fast lookup table for shorter huffman codes. + /// + /// See `HuffmanTable::fast_lookup`. + pub look_up: [i16; FAST_LOOKUP_SIZE as usize], + /// Full huffman tree. + /// + /// Positive values are edge nodes/symbols, negative values are + /// parent nodes/references to other nodes. 
+ pub tree: [i16; MAX_HUFF_TREE_SIZE], +} + +impl HuffmanTable { + const fn new() -> HuffmanTable { + HuffmanTable { + code_size: [0; MAX_HUFF_SYMBOLS_0], + look_up: [0; FAST_LOOKUP_SIZE as usize], + tree: [0; MAX_HUFF_TREE_SIZE], + } + } + + /// Look for a symbol in the fast lookup table. + /// The symbol is stored in the lower 9 bits, the length in the next 6. + /// If the returned value is negative, the code wasn't found in the + /// fast lookup table and the full tree has to be traversed to find the code. + #[inline] + fn fast_lookup(&self, bit_buf: BitBuffer) -> i16 { + self.look_up[(bit_buf & BitBuffer::from(FAST_LOOKUP_SIZE - 1)) as usize] + } + + /// Get the symbol and the code length from the huffman tree. + #[inline] + fn tree_lookup(&self, fast_symbol: i32, bit_buf: BitBuffer, mut code_len: u32) -> (i32, u32) { + let mut symbol = fast_symbol; + // We step through the tree until we encounter a positive value, which indicates a + // symbol. + loop { + // symbol here indicates the position of the left (0) node, if the next bit is 1 + // we add 1 to the lookup position to get the right node. + let tree_index = (!symbol + ((bit_buf >> code_len) & 1) as i32) as usize; + debug_assert!(tree_index < self.tree.len()); + if tree_index >= self.tree.len() { + break; + } + symbol = i32::from(self.tree[tree_index]); + code_len += 1; + if symbol >= 0 { + break; + } + } + (symbol, code_len) + } + + #[inline] + /// Look up a symbol and code length from the bits in the provided bit buffer. + /// + /// Returns Some(symbol, length) on success, + /// None if the length is 0. + /// + /// It's possible we could avoid checking for 0 if we can guarantee a sane table. + /// TODO: Check if a smaller type for code_len helps performance. + fn lookup(&self, bit_buf: BitBuffer) -> Option<(i32, u32)> { + let symbol = self.fast_lookup(bit_buf).into(); + if symbol >= 0 { + if (symbol >> 9) as u32 != 0 { + Some((symbol, (symbol >> 9) as u32)) + } else { + // Zero-length code. 
+ None + } + } else { + // We didn't get a symbol from the fast lookup table, so check the tree instead. + Some(self.tree_lookup(symbol, bit_buf, FAST_LOOKUP_BITS.into())) + } + } +} + +/// The number of huffman tables used. +const MAX_HUFF_TABLES: usize = 3; +/// The length of the first (literal/length) huffman table. +const MAX_HUFF_SYMBOLS_0: usize = 288; +/// The length of the second (distance) huffman table. +const MAX_HUFF_SYMBOLS_1: usize = 32; +/// The length of the last (huffman code length) huffman table. +const _MAX_HUFF_SYMBOLS_2: usize = 19; +/// The maximum length of a code that can be looked up in the fast lookup table. +const FAST_LOOKUP_BITS: u8 = 10; +/// The size of the fast lookup table. +const FAST_LOOKUP_SIZE: u32 = 1 << FAST_LOOKUP_BITS; +const MAX_HUFF_TREE_SIZE: usize = MAX_HUFF_SYMBOLS_0 * 2; +const LITLEN_TABLE: usize = 0; +const DIST_TABLE: usize = 1; +const HUFFLEN_TABLE: usize = 2; + +/// Flags to [`decompress()`] to control how inflation works. +/// +/// These define bits for a bitmask argument. +pub mod inflate_flags { + /// Should we try to parse a zlib header? + /// + /// If unset, the function will expect an RFC1951 deflate stream. If set, it will expect a + /// RFC1950 zlib wrapper around the deflate stream. + pub const TINFL_FLAG_PARSE_ZLIB_HEADER: u32 = 1; + + /// There will be more input that hasn't been given to the decompressor yet. + /// + /// This is useful when you want to decompress what you have so far, + /// even if you know there is probably more input that hasn't gotten here yet (_e.g._, over a + /// network connection). When [`decompress()`][super::decompress] reaches the end of the input + /// without finding the end of the compressed stream, it will return + /// [`TINFLStatus::NeedsMoreInput`][super::TINFLStatus::NeedsMoreInput] if this is set, + /// indicating that you should get more data before calling again. 
If not set, it will return + /// [`TINFLStatus::FailedCannotMakeProgress`][super::TINFLStatus::FailedCannotMakeProgress] + /// suggesting the stream is corrupt, since you claimed it was all there. + pub const TINFL_FLAG_HAS_MORE_INPUT: u32 = 2; + + /// The output buffer should not wrap around. + pub const TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF: u32 = 4; + + /// Calculate the adler32 checksum of the output data even if we're not inflating a zlib stream. + /// + /// If [`TINFL_FLAG_IGNORE_ADLER32`] is specified, it will override this. + /// + /// NOTE: Enabling/disabling this between calls to decompress will result in an incorrect + /// checksum. + pub const TINFL_FLAG_COMPUTE_ADLER32: u32 = 8; + + /// Ignore adler32 checksum even if we are inflating a zlib stream. + /// + /// Overrides [`TINFL_FLAG_COMPUTE_ADLER32`] if both are enabled. + /// + /// NOTE: This flag does not exist in miniz as it does not support this and is a + /// custom addition for miniz_oxide. + /// + /// NOTE: Should not be changed from enabled to disabled after decompression has started, + /// this will result in checksum failure (outside the unlikely event where the checksum happens + /// to match anyway). + pub const TINFL_FLAG_IGNORE_ADLER32: u32 = 64; +} + +use self::inflate_flags::*; + +const MIN_TABLE_SIZES: [u16; 3] = [257, 1, 4]; + +#[cfg(target_pointer_width = "64")] +type BitBuffer = u64; + +#[cfg(not(target_pointer_width = "64"))] +type BitBuffer = u32; + +/// Main decompression struct. +/// +pub struct DecompressorOxide { + /// Current state of the decompressor. + state: core::State, + /// Number of bits in the bit buffer. + num_bits: u32, + /// Zlib CMF + z_header0: u32, + /// Zlib FLG + z_header1: u32, + /// Adler32 checksum from the zlib header. + z_adler32: u32, + /// 1 if the current block is the last block, 0 otherwise. + finish: u32, + /// The type of the current block. + block_type: u32, + /// 1 if the adler32 value should be checked. 
+ check_adler32: u32, + /// Last match distance. + dist: u32, + /// Variable used for match length, symbols, and a number of other things. + counter: u32, + /// Number of extra bits for the last length or distance code. + num_extra: u32, + /// Number of entries in each huffman table. + table_sizes: [u32; MAX_HUFF_TABLES], + /// Buffer of input data. + bit_buf: BitBuffer, + /// Huffman tables. + tables: [HuffmanTable; MAX_HUFF_TABLES], + /// Raw block header. + raw_header: [u8; 4], + /// Huffman length codes. + len_codes: [u8; MAX_HUFF_SYMBOLS_0 + MAX_HUFF_SYMBOLS_1 + 137], +} + +impl DecompressorOxide { + /// Create a new tinfl_decompressor with all fields set to 0. + pub fn new() -> DecompressorOxide { + DecompressorOxide::default() + } + + /// Set the current state to `Start`. + #[inline] + pub fn init(&mut self) { + // The rest of the data is reset or overwritten when used. + self.state = core::State::Start; + } + + /// Returns the adler32 checksum of the currently decompressed data. + /// Note: Will return Some(1) if decompressing zlib but ignoring adler32. + #[inline] + pub fn adler32(&self) -> Option { + if self.state != State::Start && !self.state.is_failure() && self.z_header0 != 0 { + Some(self.check_adler32) + } else { + None + } + } + + /// Returns the adler32 that was read from the zlib header if it exists. + #[inline] + pub fn adler32_header(&self) -> Option { + if self.state != State::Start && self.state != State::BadZlibHeader && self.z_header0 != 0 { + Some(self.z_adler32) + } else { + None + } + } +} + +impl Default for DecompressorOxide { + /// Create a new tinfl_decompressor with all fields set to 0. 
+ #[inline(always)] + fn default() -> Self { + DecompressorOxide { + state: core::State::Start, + num_bits: 0, + z_header0: 0, + z_header1: 0, + z_adler32: 0, + finish: 0, + block_type: 0, + check_adler32: 0, + dist: 0, + counter: 0, + num_extra: 0, + table_sizes: [0; MAX_HUFF_TABLES], + bit_buf: 0, + // TODO:(oyvindln) Check that copies here are optimized out in release mode. + tables: [ + HuffmanTable::new(), + HuffmanTable::new(), + HuffmanTable::new(), + ], + raw_header: [0; 4], + len_codes: [0; MAX_HUFF_SYMBOLS_0 + MAX_HUFF_SYMBOLS_1 + 137], + } + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[non_exhaustive] +enum State { + Start = 0, + ReadZlibCmf, + ReadZlibFlg, + ReadBlockHeader, + BlockTypeNoCompression, + RawHeader, + RawMemcpy1, + RawMemcpy2, + ReadTableSizes, + ReadHufflenTableCodeSize, + ReadLitlenDistTablesCodeSize, + ReadExtraBitsCodeSize, + DecodeLitlen, + WriteSymbol, + ReadExtraBitsLitlen, + DecodeDistance, + ReadExtraBitsDistance, + RawReadFirstByte, + RawStoreFirstByte, + WriteLenBytesToEnd, + BlockDone, + HuffDecodeOuterLoop1, + HuffDecodeOuterLoop2, + ReadAdler32, + + DoneForever, + + // Failure states. + BlockTypeUnexpected, + BadCodeSizeSum, + BadDistOrLiteralTableLength, + BadTotalSymbols, + BadZlibHeader, + DistanceOutOfBounds, + BadRawLength, + BadCodeSizeDistPrevLookup, + InvalidLitlen, + InvalidDist, + InvalidCodeLen, +} + +impl State { + fn is_failure(self) -> bool { + match self { + BlockTypeUnexpected => true, + BadCodeSizeSum => true, + BadDistOrLiteralTableLength => true, + BadTotalSymbols => true, + BadZlibHeader => true, + DistanceOutOfBounds => true, + BadRawLength => true, + BadCodeSizeDistPrevLookup => true, + InvalidLitlen => true, + InvalidDist => true, + _ => false, + } + } + + #[inline] + fn begin(&mut self, new_state: State) { + *self = new_state; + } +} + +use self::State::*; + +// Not sure why miniz uses 32-bit values for these, maybe alignment/cache again? 
+// # Optimization +// We add an extra value at the end and make the tables 32 elements long +// so we can use a mask to avoid bounds checks. +// The invalid values are set to something high enough to avoid underflowing +// the match length. +/// Base length for each length code. +/// +/// The base is used together with the value of the extra bits to decode the actual +/// length/distance values in a match. +#[rustfmt::skip] +const LENGTH_BASE: [u16; 32] = [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 512, 512, 512 +]; + +/// Number of extra bits for each length code. +#[rustfmt::skip] +const LENGTH_EXTRA: [u8; 32] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0, 0 +]; + +/// Base distance for each distance code. +#[rustfmt::skip] +const DIST_BASE: [u16; 32] = [ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + 2049, 3073, 4097, 6145, 8193, 12_289, 16_385, 24_577, 32_768, 32_768 +]; + +/// Number of extra bits for each distance code. +#[rustfmt::skip] +const DIST_EXTRA: [u8; 32] = [ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 13, 13 +]; + +/// The mask used when indexing the base/extra arrays. +const BASE_EXTRA_MASK: usize = 32 - 1; + +/// Sets the value of all the elements of the slice to `val`. +#[inline] +fn memset(slice: &mut [T], val: T) { + for x in slice { + *x = val + } +} + +/// Read an le u16 value from the slice iterator. +/// +/// # Panics +/// Panics if there are fewer than two bytes left. +#[inline] +fn read_u16_le(iter: &mut slice::Iter) -> u16 { + let ret = { + let two_bytes = iter.as_ref()[..2].try_into().unwrap(); + u16::from_le_bytes(two_bytes) + }; + iter.nth(1); + ret +} + +/// Read an le u32 value from the slice iterator. +/// +/// # Panics +/// Panics if there are fewer than four bytes left.
+#[inline(always)] +#[cfg(target_pointer_width = "64")] +fn read_u32_le(iter: &mut slice::Iter) -> u32 { + let ret = { + let four_bytes: [u8; 4] = iter.as_ref()[..4].try_into().unwrap(); + u32::from_le_bytes(four_bytes) + }; + iter.nth(3); + ret +} + +/// Ensure that there is data in the bit buffer. +/// +/// On 64-bit platform, we use a 64-bit value so this will +/// result in there being at least 32 bits in the bit buffer. +/// This function assumes that there is at least 4 bytes left in the input buffer. +#[inline(always)] +#[cfg(target_pointer_width = "64")] +fn fill_bit_buffer(l: &mut LocalVars, in_iter: &mut slice::Iter) { + // Read four bytes into the buffer at once. + if l.num_bits < 30 { + l.bit_buf |= BitBuffer::from(read_u32_le(in_iter)) << l.num_bits; + l.num_bits += 32; + } +} + +/// Same as previous, but for non-64-bit platforms. +/// Ensures at least 16 bits are present, requires at least 2 bytes in the in buffer. +#[inline(always)] +#[cfg(not(target_pointer_width = "64"))] +fn fill_bit_buffer(l: &mut LocalVars, in_iter: &mut slice::Iter) { + // If the buffer is 32-bit wide, read 2 bytes instead. + if l.num_bits < 15 { + l.bit_buf |= BitBuffer::from(read_u16_le(in_iter)) << l.num_bits; + l.num_bits += 16; + } +} + +/// Check that the zlib header is correct and that there is enough space in the buffer +/// for the window size specified in the header. +/// +/// See https://tools.ietf.org/html/rfc1950 +#[inline] +fn validate_zlib_header(cmf: u32, flg: u32, flags: u32, mask: usize) -> Action { + let mut failed = + // cmf + flg should be divisible by 31. + (((cmf * 256) + flg) % 31 != 0) || + // If this flag is set, a dictionary was used for this zlib compressed data. + // This is currently not supported by miniz or miniz-oxide + ((flg & 0b0010_0000) != 0) || + // Compression method. Only 8(DEFLATE) is defined by the standard. 
+ ((cmf & 15) != 8); + + let window_size = 1 << ((cmf >> 4) + 8); + if (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF) == 0 { + // Bail if the buffer is wrapping and the window size is larger than the buffer. + failed |= (mask + 1) < window_size; + } + + // Zlib doesn't allow window sizes above 32 * 1024. + failed |= window_size > 32_768; + + if failed { + Action::Jump(BadZlibHeader) + } else { + Action::Jump(ReadBlockHeader) + } +} + +enum Action { + None, + Jump(State), + End(TINFLStatus), +} + +/// Try to decode the next huffman code, and puts it in the counter field of the decompressor +/// if successful. +/// +/// # Returns +/// The specified action returned from `f` on success, +/// `Action::End` if there are not enough data left to decode a symbol. +fn decode_huffman_code( + r: &mut DecompressorOxide, + l: &mut LocalVars, + table: usize, + flags: u32, + in_iter: &mut slice::Iter, + f: F, +) -> Action +where + F: FnOnce(&mut DecompressorOxide, &mut LocalVars, i32) -> Action, +{ + // As the huffman codes can be up to 15 bits long we need at least 15 bits + // ready in the bit buffer to start decoding the next huffman code. + if l.num_bits < 15 { + // First, make sure there is enough data in the bit buffer to decode a huffman code. + if in_iter.len() < 2 { + // If there is less than 2 bytes left in the input buffer, we try to look up + // the huffman code with what's available, and return if that doesn't succeed. + // Original explanation in miniz: + // /* TINFL_HUFF_BITBUF_FILL() is only used rarely, when the number of bytes + // * remaining in the input buffer falls below 2. */ + // /* It reads just enough bytes from the input stream that are needed to decode + // * the next Huffman code (and absolutely no more). It works by trying to fully + // * decode a */ + // /* Huffman code by using whatever bits are currently present in the bit buffer. 
+ // * If this fails, it reads another byte, and tries again until it succeeds or + // * until the */ + // /* bit buffer contains >=15 bits (deflate's max. Huffman code size). */ + loop { + let mut temp = i32::from(r.tables[table].fast_lookup(l.bit_buf)); + + if temp >= 0 { + let code_len = (temp >> 9) as u32; + if (code_len != 0) && (l.num_bits >= code_len) { + break; + } + } else if l.num_bits > FAST_LOOKUP_BITS.into() { + let mut code_len = u32::from(FAST_LOOKUP_BITS); + loop { + temp = i32::from( + r.tables[table].tree + [(!temp + ((l.bit_buf >> code_len) & 1) as i32) as usize], + ); + code_len += 1; + if temp >= 0 || l.num_bits < code_len + 1 { + break; + } + } + if temp >= 0 { + break; + } + } + + // TODO: miniz jumps straight to here after getting here again after failing to read + // a byte. + // Doing that lets miniz avoid re-doing the lookup that was done in the + // previous call. + let mut byte = 0; + if let a @ Action::End(_) = read_byte(in_iter, flags, |b| { + byte = b; + Action::None + }) { + return a; + }; + + // Do this outside closure for now to avoid borrowing r. + l.bit_buf |= BitBuffer::from(byte) << l.num_bits; + l.num_bits += 8; + + if l.num_bits >= 15 { + break; + } + } + } else { + // There is enough data in the input buffer, so read the next two bytes + // and add them to the bit buffer. + // Unwrapping here is fine since we just checked that there are at least two + // bytes left. + l.bit_buf |= BitBuffer::from(read_u16_le(in_iter)) << l.num_bits; + l.num_bits += 16; + } + } + + // We now have at least 15 bits in the input buffer. + let mut symbol = i32::from(r.tables[table].fast_lookup(l.bit_buf)); + let code_len; + // If the symbol was found in the fast lookup table. + if symbol >= 0 { + // Get the length value from the top bits. + // As we shift down the sign bit, converting to an unsigned value + // shouldn't overflow. + code_len = (symbol >> 9) as u32; + // Mask out the length value.
+ symbol &= 511; + } else { + let res = r.tables[table].tree_lookup(symbol, l.bit_buf, u32::from(FAST_LOOKUP_BITS)); + symbol = res.0; + code_len = res.1; + }; + + if code_len == 0 { + return Action::Jump(InvalidCodeLen); + } + + l.bit_buf >>= code_len; + l.num_bits -= code_len; + f(r, l, symbol) +} + +/// Try to read one byte from `in_iter` and call `f` with the read byte as an argument, +/// returning the result. +/// If reading fails, `Action::End` is returned. +#[inline] +fn read_byte(in_iter: &mut slice::Iter, flags: u32, f: F) -> Action +where + F: FnOnce(u8) -> Action, +{ + match in_iter.next() { + None => end_of_input(flags), + Some(&byte) => f(byte), + } +} + +// TODO: `l: &mut LocalVars` may be slow similar to decompress_fast (even with inline(always)) +/// Try to read `amount` number of bits from `in_iter` and call the function `f` with the bits as an +/// argument after reading, returning the result of that function, or `Action::End` if there are +/// not enough bytes left. +#[inline] +#[allow(clippy::while_immutable_condition)] +fn read_bits( + l: &mut LocalVars, + amount: u32, + in_iter: &mut slice::Iter, + flags: u32, + f: F, +) -> Action +where + F: FnOnce(&mut LocalVars, BitBuffer) -> Action, +{ + // Clippy gives a false positive warning here due to the closure. + // Read enough bytes from the input iterator to cover the number of bits we want. + while l.num_bits < amount { + match read_byte(in_iter, flags, |byte| { + l.bit_buf |= BitBuffer::from(byte) << l.num_bits; + l.num_bits += 8; + Action::None + }) { + Action::None => (), + // If there are not enough bytes in the input iterator, return and signal that we need + // more.
+ action => return action, + } + } + + let bits = l.bit_buf & ((1 << amount) - 1); + l.bit_buf >>= amount; + l.num_bits -= amount; + f(l, bits) +} + +#[inline] +fn pad_to_bytes(l: &mut LocalVars, in_iter: &mut slice::Iter, flags: u32, f: F) -> Action +where + F: FnOnce(&mut LocalVars) -> Action, +{ + let num_bits = l.num_bits & 7; + read_bits(l, num_bits, in_iter, flags, |l, _| f(l)) +} + +#[inline] +fn end_of_input(flags: u32) -> Action { + Action::End(if flags & TINFL_FLAG_HAS_MORE_INPUT != 0 { + TINFLStatus::NeedsMoreInput + } else { + TINFLStatus::FailedCannotMakeProgress + }) +} + +#[inline] +fn undo_bytes(l: &mut LocalVars, max: u32) -> u32 { + let res = cmp::min(l.num_bits >> 3, max); + l.num_bits -= res << 3; + res +} + +fn start_static_table(r: &mut DecompressorOxide) { + r.table_sizes[LITLEN_TABLE] = 288; + r.table_sizes[DIST_TABLE] = 32; + memset(&mut r.tables[LITLEN_TABLE].code_size[0..144], 8); + memset(&mut r.tables[LITLEN_TABLE].code_size[144..256], 9); + memset(&mut r.tables[LITLEN_TABLE].code_size[256..280], 7); + memset(&mut r.tables[LITLEN_TABLE].code_size[280..288], 8); + memset(&mut r.tables[DIST_TABLE].code_size[0..32], 5); +} + +static REVERSED_BITS_LOOKUP: [u32; 1024] = { + let mut table = [0; 1024]; + + let mut i = 0; + while i < 1024 { + table[i] = (i as u32).reverse_bits(); + i += 1; + } + + table +}; + +fn init_tree(r: &mut DecompressorOxide, l: &mut LocalVars) -> Option { + loop { + let bt = r.block_type as usize; + if bt >= r.tables.len() { + return None; + } + let table = &mut r.tables[bt]; + let table_size = r.table_sizes[bt] as usize; + if table_size > table.code_size.len() { + return None; + } + let mut total_symbols = [0u32; 16]; + let mut next_code = [0u32; 17]; + memset(&mut table.look_up[..], 0); + memset(&mut table.tree[..], 0); + + for &code_size in &table.code_size[..table_size] { + let cs = code_size as usize; + if cs >= total_symbols.len() { + return None; + } + total_symbols[cs] += 1; + } + + let mut used_symbols = 0; + 
let mut total = 0; + for (ts, next) in total_symbols + .iter() + .copied() + .zip(next_code.iter_mut().skip(1)) + .skip(1) + { + used_symbols += ts; + total += ts; + total <<= 1; + *next = total; + } + + if total != 65_536 && used_symbols > 1 { + return Some(Action::Jump(BadTotalSymbols)); + } + + let mut tree_next = -1; + for symbol_index in 0..table_size { + let mut rev_code = 0; + let code_size = table.code_size[symbol_index]; + if code_size == 0 || usize::from(code_size) >= next_code.len() { + continue; + } + + let mut cur_code = next_code[code_size as usize]; + next_code[code_size as usize] += 1; + + let n = cur_code & (u32::MAX >> (32 - code_size)); + + let mut rev_code = if n < 1024 { + REVERSED_BITS_LOOKUP[n as usize] >> (32 - code_size) + } else { + for _ in 0..code_size { + rev_code = (rev_code << 1) | (cur_code & 1); + cur_code >>= 1; + } + rev_code + }; + + if code_size <= FAST_LOOKUP_BITS { + let k = (i16::from(code_size) << 9) | symbol_index as i16; + while rev_code < FAST_LOOKUP_SIZE { + table.look_up[rev_code as usize] = k; + rev_code += 1 << code_size; + } + continue; + } + + let mut tree_cur = table.look_up[(rev_code & (FAST_LOOKUP_SIZE - 1)) as usize]; + if tree_cur == 0 { + table.look_up[(rev_code & (FAST_LOOKUP_SIZE - 1)) as usize] = tree_next; + tree_cur = tree_next; + tree_next -= 2; + } + + rev_code >>= FAST_LOOKUP_BITS - 1; + for _ in FAST_LOOKUP_BITS + 1..code_size { + rev_code >>= 1; + tree_cur -= (rev_code & 1) as i16; + let tree_index = (-tree_cur - 1) as usize; + if tree_index >= table.tree.len() { + return None; + } + if table.tree[tree_index] == 0 { + table.tree[tree_index] = tree_next; + tree_cur = tree_next; + tree_next -= 2; + } else { + tree_cur = table.tree[tree_index]; + } + } + + rev_code >>= 1; + tree_cur -= (rev_code & 1) as i16; + let tree_index = (-tree_cur - 1) as usize; + if tree_index >= table.tree.len() { + return None; + } + table.tree[tree_index] = symbol_index as i16; + } + + if r.block_type == 2 { + l.counter = 0; 
+ return Some(Action::Jump(ReadLitlenDistTablesCodeSize)); + } + + if r.block_type == 0 { + break; + } + r.block_type -= 1; + } + + l.counter = 0; + Some(Action::Jump(DecodeLitlen)) +} + +// A helper macro for generating the state machine. +// +// As Rust doesn't have fallthrough on matches, we have to return to the match statement +// and jump for each state change. (Which would ideally be optimized away, but often isn't.) +macro_rules! generate_state { + ($state: ident, $state_machine: tt, $f: expr) => { + loop { + match $f { + Action::None => continue, + Action::Jump(new_state) => { + $state = new_state; + continue $state_machine; + }, + Action::End(result) => break $state_machine result, + } + } + }; +} + +#[derive(Copy, Clone)] +struct LocalVars { + pub bit_buf: BitBuffer, + pub num_bits: u32, + pub dist: u32, + pub counter: u32, + pub num_extra: u32, +} + +#[inline] +fn transfer( + out_slice: &mut [u8], + mut source_pos: usize, + mut out_pos: usize, + match_len: usize, + out_buf_size_mask: usize, +) { + // special case that comes up surprisingly often. 
in the case that `source_pos` + // is 1 less than `out_pos`, we can say that the entire range will be the same + // value and optimize this to be a simple `memset` + let source_diff = if source_pos > out_pos { + source_pos - out_pos + } else { + out_pos - source_pos + }; + if out_buf_size_mask == usize::MAX && source_diff == 1 && out_pos > source_pos { + let init = out_slice[out_pos - 1]; + let end = (match_len >> 2) * 4 + out_pos; + + out_slice[out_pos..end].fill(init); + out_pos = end; + source_pos = end - 1; + // if the difference between `source_pos` and `out_pos` is greater than 3, we + // can do slightly better than the naive case by copying everything at once + } else if out_buf_size_mask == usize::MAX && source_diff >= 4 && out_pos > source_pos { + for _ in 0..match_len >> 2 { + out_slice.copy_within(source_pos..=source_pos + 3, out_pos); + source_pos += 4; + out_pos += 4; + } + } else { + for _ in 0..match_len >> 2 { + out_slice[out_pos] = out_slice[source_pos & out_buf_size_mask]; + out_slice[out_pos + 1] = out_slice[(source_pos + 1) & out_buf_size_mask]; + out_slice[out_pos + 2] = out_slice[(source_pos + 2) & out_buf_size_mask]; + out_slice[out_pos + 3] = out_slice[(source_pos + 3) & out_buf_size_mask]; + source_pos += 4; + out_pos += 4; + } + } + + match match_len & 3 { + 0 => (), + 1 => out_slice[out_pos] = out_slice[source_pos & out_buf_size_mask], + 2 => { + out_slice[out_pos] = out_slice[source_pos & out_buf_size_mask]; + out_slice[out_pos + 1] = out_slice[(source_pos + 1) & out_buf_size_mask]; + } + 3 => { + out_slice[out_pos] = out_slice[source_pos & out_buf_size_mask]; + out_slice[out_pos + 1] = out_slice[(source_pos + 1) & out_buf_size_mask]; + out_slice[out_pos + 2] = out_slice[(source_pos + 2) & out_buf_size_mask]; + } + _ => unreachable!(), + } +} + +/// Presumes that there is at least match_len bytes in output left. 
+#[inline] +fn apply_match( + out_slice: &mut [u8], + out_pos: usize, + dist: usize, + match_len: usize, + out_buf_size_mask: usize, +) { + debug_assert!(out_pos.checked_add(match_len).unwrap() <= out_slice.len()); + + let source_pos = out_pos.wrapping_sub(dist) & out_buf_size_mask; + + if match_len == 3 { + let out_slice = Cell::from_mut(out_slice).as_slice_of_cells(); + if let Some(dst) = out_slice.get(out_pos..out_pos + 3) { + // Moving bounds checks before any memory mutation allows the optimizer + // to combine them together. + let src = out_slice + .get(source_pos) + .zip(out_slice.get((source_pos + 1) & out_buf_size_mask)) + .zip(out_slice.get((source_pos + 2) & out_buf_size_mask)); + if let Some(((a, b), c)) = src { + // For correctness, the memory reads and writes have to be interleaved. + // Cells make it possible for read and write references to overlap. + dst[0].set(a.get()); + dst[1].set(b.get()); + dst[2].set(c.get()); + } + } + return; + } + + if cfg!(not(any(target_arch = "x86", target_arch = "x86_64"))) { + // We are not on x86 so copy manually. + transfer(out_slice, source_pos, out_pos, match_len, out_buf_size_mask); + return; + } + + if source_pos >= out_pos && (source_pos - out_pos) < match_len { + transfer(out_slice, source_pos, out_pos, match_len, out_buf_size_mask); + } else if match_len <= dist && source_pos + match_len < out_slice.len() { + // Destination and source segments do not intersect and source does not wrap.
+ if source_pos < out_pos { + let (from_slice, to_slice) = out_slice.split_at_mut(out_pos); + to_slice[..match_len].copy_from_slice(&from_slice[source_pos..source_pos + match_len]); + } else { + let (to_slice, from_slice) = out_slice.split_at_mut(source_pos); + to_slice[out_pos..out_pos + match_len].copy_from_slice(&from_slice[..match_len]); + } + } else { + transfer(out_slice, source_pos, out_pos, match_len, out_buf_size_mask); + } +} + +/// Fast inner decompression loop which is run while there is at least +/// 259 bytes left in the output buffer, and at least 14 bytes left in the input buffer +/// (The maximum one match would need + 1). +/// +/// This was inspired by a similar optimization in zlib, which uses this info to do +/// faster unchecked copies of multiple bytes at a time. +/// Currently we don't do this here, but this function does avoid having to jump through the +/// big match loop on each state change(as rust does not have fallthrough or gotos at the moment), +/// and already improves decompression speed a fair bit. +fn decompress_fast( + r: &mut DecompressorOxide, + in_iter: &mut slice::Iter, + out_buf: &mut OutputBuffer, + flags: u32, + local_vars: &mut LocalVars, + out_buf_size_mask: usize, +) -> (TINFLStatus, State) { + // Make a local copy of the most used variables, to avoid having to update and read from values + // in a random memory location and to encourage more register use. + let mut l = *local_vars; + let mut state; + + let status: TINFLStatus = 'o: loop { + state = State::DecodeLitlen; + loop { + // This function assumes that there is at least 259 bytes left in the output buffer, + // and that there is at least 14 bytes left in the input buffer. 14 input bytes: + // 15 (prev lit) + 15 (length) + 5 (length extra) + 15 (dist) + // + 29 + 32 (left in bit buf, including last 13 dist extra) = 111 bits < 14 bytes + // We need the one extra byte as we may write one length and one full match + // before checking again.
+ if out_buf.bytes_left() < 259 || in_iter.len() < 14 { + state = State::DecodeLitlen; + break 'o TINFLStatus::Done; + } + + fill_bit_buffer(&mut l, in_iter); + + if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) { + l.counter = symbol as u32; + l.bit_buf >>= code_len; + l.num_bits -= code_len; + + if (l.counter & 256) != 0 { + // The symbol is not a literal. + break; + } else { + // If we have a 32-bit buffer we need to read another two bytes now + // to have enough bits to keep going. + if cfg!(not(target_pointer_width = "64")) { + fill_bit_buffer(&mut l, in_iter); + } + + if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) { + l.bit_buf >>= code_len; + l.num_bits -= code_len; + // The previous symbol was a literal, so write it directly and check + // the next one. + out_buf.write_byte(l.counter as u8); + if (symbol & 256) != 0 { + l.counter = symbol as u32; + // The symbol is a length value. + break; + } else { + // The symbol is a literal, so write it directly and continue. + out_buf.write_byte(symbol as u8); + } + } else { + state.begin(InvalidCodeLen); + break 'o TINFLStatus::Failed; + } + } + } else { + state.begin(InvalidCodeLen); + break 'o TINFLStatus::Failed; + } + } + + // Mask the top bits since they may contain length info. + l.counter &= 511; + if l.counter == 256 { + // We hit the end of block symbol. + state.begin(BlockDone); + break 'o TINFLStatus::Done; + } else if l.counter > 285 { + // Invalid code. + // We already verified earlier that the code is > 256. + state.begin(InvalidLitlen); + break 'o TINFLStatus::Failed; + } else { + // The symbol was a length code. + // # Optimization + // Mask the value to avoid bounds checks + // We could use get_unchecked later if can statically verify that + // this will never go out of bounds.
+ l.num_extra = u32::from(LENGTH_EXTRA[(l.counter - 257) as usize & BASE_EXTRA_MASK]); + l.counter = u32::from(LENGTH_BASE[(l.counter - 257) as usize & BASE_EXTRA_MASK]); + // Length and distance codes have a number of extra bits depending on + // the base, which together with the base gives us the exact value. + + fill_bit_buffer(&mut l, in_iter); + if l.num_extra != 0 { + let extra_bits = l.bit_buf & ((1 << l.num_extra) - 1); + l.bit_buf >>= l.num_extra; + l.num_bits -= l.num_extra; + l.counter += extra_bits as u32; + } + + // We found a length code, so a distance code should follow. + + if cfg!(not(target_pointer_width = "64")) { + fill_bit_buffer(&mut l, in_iter); + } + + if let Some((mut symbol, code_len)) = r.tables[DIST_TABLE].lookup(l.bit_buf) { + symbol &= 511; + l.bit_buf >>= code_len; + l.num_bits -= code_len; + if symbol > 29 { + state.begin(InvalidDist); + break 'o TINFLStatus::Failed; + } + + l.num_extra = u32::from(DIST_EXTRA[symbol as usize]); + l.dist = u32::from(DIST_BASE[symbol as usize]); + } else { + state.begin(InvalidCodeLen); + break 'o TINFLStatus::Failed; + } + + if l.num_extra != 0 { + fill_bit_buffer(&mut l, in_iter); + let extra_bits = l.bit_buf & ((1 << l.num_extra) - 1); + l.bit_buf >>= l.num_extra; + l.num_bits -= l.num_extra; + l.dist += extra_bits as u32; + } + + let position = out_buf.position(); + if l.dist as usize > out_buf.position() + && (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0) + { + // We encountered a distance that refers to a position before + // the start of the decoded data, so we can't continue. + state.begin(DistanceOutOfBounds); + break TINFLStatus::Failed; + } + + apply_match( + out_buf.get_mut(), + position, + l.dist as usize, + l.counter as usize, + out_buf_size_mask, + ); + + out_buf.set_position(position + l.counter as usize); + } + }; + + *local_vars = l; + (status, state) +} + +/// Main decompression function.
Keeps decompressing data from `in_buf` until the `in_buf` is +/// empty, `out` is full, the end of the deflate stream is hit, or there is an error in the +/// deflate stream. +/// +/// # Arguments +/// +/// `r` is a [`DecompressorOxide`] struct with the state of this stream. +/// +/// `in_buf` is a reference to the compressed data that is to be decompressed. The decompressor will +/// start at the first byte of this buffer. +/// +/// `out` is a reference to the buffer that will store the decompressed data, and that +/// stores previously decompressed data if any. +/// +/// * The offset given by `out_pos` indicates where in the output buffer slice writing should start. +/// * If [`TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF`] is not set, the output buffer is used in a +/// wrapping manner, and its size is required to be a power of 2. +/// * The decompression function normally needs access to 32KiB of the previously decompressed data +/// (or to the beginning of the decompressed data if less than 32KiB has been decompressed.) +/// - If this data is not available, decompression may fail. +/// - Some deflate compressors allow specifying a window size which limits match distances to +/// less than this, or alternatively an RLE mode where matches will only refer to the previous byte +/// and thus allows a smaller output buffer. The window size can be specified in the zlib +/// header structure, however, the header data should not be relied on to be correct. +/// +/// `flags` indicates settings and status to the decompression function. +/// * The [`TINFL_FLAG_HAS_MORE_INPUT`] has to be specified if more compressed data is to be provided +/// in a subsequent call to this function. +/// * See the [`inflate_flags`] module for details on other flags. +/// +/// # Returns +/// +/// Returns a tuple containing the status of the decompressor, the number of input bytes read, and the +/// number of bytes output to `out`. +/// +/// This function shouldn't panic barring any bugs.
+pub fn decompress( + r: &mut DecompressorOxide, + in_buf: &[u8], + out: &mut [u8], + out_pos: usize, + flags: u32, +) -> (TINFLStatus, usize, usize) { + let out_buf_size_mask = if flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0 { + usize::max_value() + } else { + // In the case of zero len, any attempt to write would produce HasMoreOutput, + // so to gracefully process the case of there really being no output, + // set the mask to all zeros. + out.len().saturating_sub(1) + }; + + // Ensure the output buffer's size is a power of 2, unless the output buffer + // is large enough to hold the entire output file (in which case it doesn't + // matter). + // Also make sure that the output buffer position is not past the end of the output buffer. + if (out_buf_size_mask.wrapping_add(1) & out_buf_size_mask) != 0 || out_pos > out.len() { + return (TINFLStatus::BadParam, 0, 0); + } + + let mut in_iter = in_buf.iter(); + + let mut state = r.state; + + let mut out_buf = OutputBuffer::from_slice_and_pos(out, out_pos); + + // Make a local copy of the important variables here so we can work with them on the stack.
+ let mut l = LocalVars { + bit_buf: r.bit_buf, + num_bits: r.num_bits, + dist: r.dist, + counter: r.counter, + num_extra: r.num_extra, + }; + + let mut status = 'state_machine: loop { + match state { + Start => generate_state!(state, 'state_machine, { + l.bit_buf = 0; + l.num_bits = 0; + l.dist = 0; + l.counter = 0; + l.num_extra = 0; + r.z_header0 = 0; + r.z_header1 = 0; + r.z_adler32 = 1; + r.check_adler32 = 1; + if flags & TINFL_FLAG_PARSE_ZLIB_HEADER != 0 { + Action::Jump(State::ReadZlibCmf) + } else { + Action::Jump(State::ReadBlockHeader) + } + }), + + ReadZlibCmf => generate_state!(state, 'state_machine, { + read_byte(&mut in_iter, flags, |cmf| { + r.z_header0 = u32::from(cmf); + Action::Jump(State::ReadZlibFlg) + }) + }), + + ReadZlibFlg => generate_state!(state, 'state_machine, { + read_byte(&mut in_iter, flags, |flg| { + r.z_header1 = u32::from(flg); + validate_zlib_header(r.z_header0, r.z_header1, flags, out_buf_size_mask) + }) + }), + + // Read the block header and jump to the relevant section depending on the block type. + ReadBlockHeader => generate_state!(state, 'state_machine, { + read_bits(&mut l, 3, &mut in_iter, flags, |l, bits| { + r.finish = (bits & 1) as u32; + r.block_type = (bits >> 1) as u32 & 3; + match r.block_type { + 0 => Action::Jump(BlockTypeNoCompression), + 1 => { + start_static_table(r); + init_tree(r, l).unwrap_or(Action::End(TINFLStatus::Failed)) + }, + 2 => { + l.counter = 0; + Action::Jump(ReadTableSizes) + }, + 3 => Action::Jump(BlockTypeUnexpected), + _ => unreachable!() + } + }) + }), + + // Raw/Stored/uncompressed block. + BlockTypeNoCompression => generate_state!(state, 'state_machine, { + pad_to_bytes(&mut l, &mut in_iter, flags, |l| { + l.counter = 0; + Action::Jump(RawHeader) + }) + }), + + // Check that the raw block header is correct. + RawHeader => generate_state!(state, 'state_machine, { + if l.counter < 4 { + // Read block length and block length check.
+ if l.num_bits != 0 { + read_bits(&mut l, 8, &mut in_iter, flags, |l, bits| { + r.raw_header[l.counter as usize] = bits as u8; + l.counter += 1; + Action::None + }) + } else { + read_byte(&mut in_iter, flags, |byte| { + r.raw_header[l.counter as usize] = byte; + l.counter += 1; + Action::None + }) + } + } else { + // Check if the length value of a raw block is correct. + // The 2 first (2-byte) words in a raw header are the length and the + // ones complement of the length. + let length = u16::from(r.raw_header[0]) | (u16::from(r.raw_header[1]) << 8); + let check = u16::from(r.raw_header[2]) | (u16::from(r.raw_header[3]) << 8); + let valid = length == !check; + l.counter = length.into(); + + if !valid { + Action::Jump(BadRawLength) + } else if l.counter == 0 { + // Empty raw block. Sometimes used for synchronization. + Action::Jump(BlockDone) + } else if l.num_bits != 0 { + // There is some data in the bit buffer, so we need to write that first. + Action::Jump(RawReadFirstByte) + } else { + // The bit buffer is empty, so memcpy the rest of the uncompressed data from + // the block. + Action::Jump(RawMemcpy1) + } + } + }), + + // Read the byte from the bit buffer. + RawReadFirstByte => generate_state!(state, 'state_machine, { + read_bits(&mut l, 8, &mut in_iter, flags, |l, bits| { + l.dist = bits as u32; + Action::Jump(RawStoreFirstByte) + }) + }), + + // Write the byte we just read to the output buffer. + RawStoreFirstByte => generate_state!(state, 'state_machine, { + if out_buf.bytes_left() == 0 { + Action::End(TINFLStatus::HasMoreOutput) + } else { + out_buf.write_byte(l.dist as u8); + l.counter -= 1; + if l.counter == 0 || l.num_bits == 0 { + Action::Jump(RawMemcpy1) + } else { + // There is still some data left in the bit buffer that needs to be output. + // TODO: Changed this to jump to `RawReadFirstByte` rather than + // `RawStoreFirstByte` as that seemed to be the correct path, but this + // needs testing.
+ Action::Jump(RawReadFirstByte) + } + } + }), + + RawMemcpy1 => generate_state!(state, 'state_machine, { + if l.counter == 0 { + Action::Jump(BlockDone) + } else if out_buf.bytes_left() == 0 { + Action::End(TINFLStatus::HasMoreOutput) + } else { + Action::Jump(RawMemcpy2) + } + }), + + RawMemcpy2 => generate_state!(state, 'state_machine, { + if in_iter.len() > 0 { + // Copy as many raw bytes as possible from the input to the output using memcpy. + // Raw block lengths are limited to 64 * 1024, so casting through usize and u32 + // is not an issue. + let space_left = out_buf.bytes_left(); + let bytes_to_copy = cmp::min(cmp::min( + space_left, + in_iter.len()), + l.counter as usize + ); + + out_buf.write_slice(&in_iter.as_slice()[..bytes_to_copy]); + + in_iter.nth(bytes_to_copy - 1); + l.counter -= bytes_to_copy as u32; + Action::Jump(RawMemcpy1) + } else { + end_of_input(flags) + } + }), + + // Read how many huffman codes/symbols are used for each table. + ReadTableSizes => generate_state!(state, 'state_machine, { + if l.counter < 3 { + let num_bits = [5, 5, 4][l.counter as usize]; + read_bits(&mut l, num_bits, &mut in_iter, flags, |l, bits| { + r.table_sizes[l.counter as usize] = + bits as u32 + u32::from(MIN_TABLE_SIZES[l.counter as usize]); + l.counter += 1; + Action::None + }) + } else { + memset(&mut r.tables[HUFFLEN_TABLE].code_size[..], 0); + l.counter = 0; + // Check that the litlen and distance are within spec. + // litlen table should be <=286 acc to the RFC and + // additionally zlib rejects dist table sizes larger than 30. + // NOTE: these are the final sizes after adding back the predefined minimums, not + // the raw values in the data. + // See miniz_oxide issue #130 and https://github.com/madler/zlib/issues/82.
+ if r.table_sizes[LITLEN_TABLE] <= 286 && r.table_sizes[DIST_TABLE] <= 30 { + Action::Jump(ReadHufflenTableCodeSize) + } + else { + Action::Jump(BadDistOrLiteralTableLength) + } + } + }), + + // Read the 3-bit lengths of the huffman codes describing the huffman code lengths used + // to decode the lengths of the main tables. + ReadHufflenTableCodeSize => generate_state!(state, 'state_machine, { + if l.counter < r.table_sizes[HUFFLEN_TABLE] { + read_bits(&mut l, 3, &mut in_iter, flags, |l, bits| { + // These lengths are not stored in a normal ascending order, but rather one + // specified by the deflate specification intended to put the most used + // values at the front as trailing zero lengths do not have to be stored. + r.tables[HUFFLEN_TABLE] + .code_size[HUFFMAN_LENGTH_ORDER[l.counter as usize] as usize] = + bits as u8; + l.counter += 1; + Action::None + }) + } else { + r.table_sizes[HUFFLEN_TABLE] = 19; + init_tree(r, &mut l).unwrap_or(Action::End(TINFLStatus::Failed)) + } + }), + + ReadLitlenDistTablesCodeSize => generate_state!(state, 'state_machine, { + if l.counter < r.table_sizes[LITLEN_TABLE] + r.table_sizes[DIST_TABLE] { + decode_huffman_code( + r, &mut l, HUFFLEN_TABLE, + flags, &mut in_iter, |r, l, symbol| { + l.dist = symbol as u32; + if l.dist < 16 { + r.len_codes[l.counter as usize] = l.dist as u8; + l.counter += 1; + Action::None + } else if l.dist == 16 && l.counter == 0 { + Action::Jump(BadCodeSizeDistPrevLookup) + } else { + l.num_extra = [2, 3, 7][l.dist as usize - 16]; + Action::Jump(ReadExtraBitsCodeSize) + } + } + ) + } else if l.counter != r.table_sizes[LITLEN_TABLE] + r.table_sizes[DIST_TABLE] { + Action::Jump(BadCodeSizeSum) + } else { + r.tables[LITLEN_TABLE].code_size[..r.table_sizes[LITLEN_TABLE] as usize] + .copy_from_slice(&r.len_codes[..r.table_sizes[LITLEN_TABLE] as usize]); + + let dist_table_start = r.table_sizes[LITLEN_TABLE] as usize; + let dist_table_end = (r.table_sizes[LITLEN_TABLE] + + r.table_sizes[DIST_TABLE]) as usize;
+ r.tables[DIST_TABLE].code_size[..r.table_sizes[DIST_TABLE] as usize] + .copy_from_slice(&r.len_codes[dist_table_start..dist_table_end]); + + r.block_type -= 1; + init_tree(r, &mut l).unwrap_or(Action::End(TINFLStatus::Failed)) + } + }), + + ReadExtraBitsCodeSize => generate_state!(state, 'state_machine, { + let num_extra = l.num_extra; + read_bits(&mut l, num_extra, &mut in_iter, flags, |l, mut extra_bits| { + // Mask to avoid a bounds check. + extra_bits += [3, 3, 11][(l.dist as usize - 16) & 3]; + let val = if l.dist == 16 { + r.len_codes[l.counter as usize - 1] + } else { + 0 + }; + + memset( + &mut r.len_codes[ + l.counter as usize..l.counter as usize + extra_bits as usize + ], + val, + ); + l.counter += extra_bits as u32; + Action::Jump(ReadLitlenDistTablesCodeSize) + }) + }), + + DecodeLitlen => generate_state!(state, 'state_machine, { + if in_iter.len() < 4 || out_buf.bytes_left() < 2 { + // See if we can decode a literal with the data we have left. + // Jumps to next state (WriteSymbol) if successful. + decode_huffman_code( + r, + &mut l, + LITLEN_TABLE, + flags, + &mut in_iter, + |_r, l, symbol| { + l.counter = symbol as u32; + Action::Jump(WriteSymbol) + }, + ) + } else if + // If there is enough space, use the fast inner decompression + // function. + out_buf.bytes_left() >= 259 && + in_iter.len() >= 14 + { + let (status, new_state) = decompress_fast( + r, + &mut in_iter, + &mut out_buf, + flags, + &mut l, + out_buf_size_mask, + ); + + state = new_state; + if status == TINFLStatus::Done { + Action::Jump(new_state) + } else { + Action::End(status) + } + } else { + fill_bit_buffer(&mut l, &mut in_iter); + + if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) { + + l.counter = symbol as u32; + l.bit_buf >>= code_len; + l.num_bits -= code_len; + + if (l.counter & 256) != 0 { + // The symbol is not a literal.
+ Action::Jump(HuffDecodeOuterLoop1) + } else { + // If we have a 32-bit buffer we need to read another two bytes now + // to have enough bits to keep going. + if cfg!(not(target_pointer_width = "64")) { + fill_bit_buffer(&mut l, &mut in_iter); + } + + if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) { + + l.bit_buf >>= code_len; + l.num_bits -= code_len; + // The previous symbol was a literal, so write it directly and check + // the next one. + out_buf.write_byte(l.counter as u8); + if (symbol & 256) != 0 { + l.counter = symbol as u32; + // The symbol is a length value. + Action::Jump(HuffDecodeOuterLoop1) + } else { + // The symbol is a literal, so write it directly and continue. + out_buf.write_byte(symbol as u8); + Action::None + } + } else { + Action::Jump(InvalidCodeLen) + } + } + } else { + Action::Jump(InvalidCodeLen) + } + } + }), + + WriteSymbol => generate_state!(state, 'state_machine, { + if l.counter >= 256 { + Action::Jump(HuffDecodeOuterLoop1) + } else if out_buf.bytes_left() > 0 { + out_buf.write_byte(l.counter as u8); + Action::Jump(DecodeLitlen) + } else { + Action::End(TINFLStatus::HasMoreOutput) + } + }), + + HuffDecodeOuterLoop1 => generate_state!(state, 'state_machine, { + // Mask the top bits since they may contain length info. + l.counter &= 511; + + if l.counter + == 256 { + // We hit the end of block symbol. + Action::Jump(BlockDone) + } else if l.counter > 285 { + // Invalid code. + // We already verified earlier that the code is > 256. + Action::Jump(InvalidLitlen) + } else { + // # Optimization + // Mask the value to avoid bounds checks + // We could use get_unchecked later if can statically verify that + // this will never go out of bounds.
+ l.num_extra = + u32::from(LENGTH_EXTRA[(l.counter - 257) as usize & BASE_EXTRA_MASK]); + l.counter = u32::from(LENGTH_BASE[(l.counter - 257) as usize & BASE_EXTRA_MASK]); + // Length and distance codes have a number of extra bits depending on + // the base, which together with the base gives us the exact value. + if l.num_extra != 0 { + Action::Jump(ReadExtraBitsLitlen) + } else { + Action::Jump(DecodeDistance) + } + } + }), + + ReadExtraBitsLitlen => generate_state!(state, 'state_machine, { + let num_extra = l.num_extra; + read_bits(&mut l, num_extra, &mut in_iter, flags, |l, extra_bits| { + l.counter += extra_bits as u32; + Action::Jump(DecodeDistance) + }) + }), + + DecodeDistance => generate_state!(state, 'state_machine, { + // Try to read a huffman code from the input buffer and look up what + // distance code the decoded symbol refers to. + decode_huffman_code(r, &mut l, DIST_TABLE, flags, &mut in_iter, |_r, l, symbol| { + if symbol > 29 { + // Invalid distance code. + return Action::Jump(InvalidDist) + } + // # Optimization + // Mask the value to avoid bounds checks + // We could use get_unchecked later if can statically verify that + // this will never go out of bounds.
+ l.num_extra = u32::from(DIST_EXTRA[symbol as usize & BASE_EXTRA_MASK]); + l.dist = u32::from(DIST_BASE[symbol as usize & BASE_EXTRA_MASK]); + if l.num_extra != 0 { + // This distance code has extra bits, so read them next. + Action::Jump(ReadExtraBitsDistance) + } else { + Action::Jump(HuffDecodeOuterLoop2) + } + }) + }), + + ReadExtraBitsDistance => generate_state!(state, 'state_machine, { + let num_extra = l.num_extra; + read_bits(&mut l, num_extra, &mut in_iter, flags, |l, extra_bits| { + l.dist += extra_bits as u32; + Action::Jump(HuffDecodeOuterLoop2) + }) + }), + + HuffDecodeOuterLoop2 => generate_state!(state, 'state_machine, { + if l.dist as usize > out_buf.position() && + (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0) + { + // We encountered a distance that refers to a position before + // the start of the decoded data, so we can't continue. + Action::Jump(DistanceOutOfBounds) + } else { + let out_pos = out_buf.position(); + let source_pos = out_buf.position() + .wrapping_sub(l.dist as usize) & out_buf_size_mask; + + let out_len = out_buf.get_ref().len(); + let match_end_pos = out_buf.position() + l.counter as usize; + + if match_end_pos > out_len || + // miniz doesn't do this check here. Not sure how it makes sure + // that this case doesn't happen. + (source_pos >= out_pos && (source_pos - out_pos) < l.counter as usize) + { + // Not enough space for all of the data in the output buffer, + // so copy what we have space for.
+ if l.counter == 0 { + Action::Jump(DecodeLitlen) + } else { + Action::Jump(WriteLenBytesToEnd) + } + } else { + apply_match( + out_buf.get_mut(), + out_pos, + l.dist as usize, + l.counter as usize, + out_buf_size_mask + ); + out_buf.set_position(out_pos + l.counter as usize); + Action::Jump(DecodeLitlen) + } + } + }), + + WriteLenBytesToEnd => generate_state!(state, 'state_machine, { + if out_buf.bytes_left() > 0 { + let out_pos = out_buf.position(); + let source_pos = out_buf.position() + .wrapping_sub(l.dist as usize) & out_buf_size_mask; + + + let len = cmp::min(out_buf.bytes_left(), l.counter as usize); + + transfer(out_buf.get_mut(), source_pos, out_pos, len, out_buf_size_mask); + + out_buf.set_position(out_pos + len); + l.counter -= len as u32; + if l.counter == 0 { + Action::Jump(DecodeLitlen) + } else { + Action::None + } + } else { + Action::End(TINFLStatus::HasMoreOutput) + } + }), + + BlockDone => generate_state!(state, 'state_machine, { + // End once we've read the last block.
+ if r.finish != 0 { + pad_to_bytes(&mut l, &mut in_iter, flags, |_| Action::None); + + let in_consumed = in_buf.len() - in_iter.len(); + let undo = undo_bytes(&mut l, in_consumed as u32) as usize; + in_iter = in_buf[in_consumed - undo..].iter(); + + l.bit_buf &= ((1 as BitBuffer) << l.num_bits) - 1; + debug_assert_eq!(l.num_bits, 0); + + if flags & TINFL_FLAG_PARSE_ZLIB_HEADER != 0 { + l.counter = 0; + Action::Jump(ReadAdler32) + } else { + Action::Jump(DoneForever) + } + } else { + Action::Jump(ReadBlockHeader) + } + }), + + ReadAdler32 => generate_state!(state, 'state_machine, { + if l.counter < 4 { + if l.num_bits != 0 { + read_bits(&mut l, 8, &mut in_iter, flags, |l, bits| { + r.z_adler32 <<= 8; + r.z_adler32 |= bits as u32; + l.counter += 1; + Action::None + }) + } else { + read_byte(&mut in_iter, flags, |byte| { + r.z_adler32 <<= 8; + r.z_adler32 |= u32::from(byte); + l.counter += 1; + Action::None + }) + } + } else { + Action::Jump(DoneForever) + } + }), + + // We are done. + DoneForever => break TINFLStatus::Done, + + // Anything else indicates failure. + // BadZlibHeader | BadRawLength | BadDistOrLiteralTableLength | BlockTypeUnexpected | + // DistanceOutOfBounds | + // BadTotalSymbols | BadCodeSizeDistPrevLookup | BadCodeSizeSum | InvalidLitlen | + // InvalidDist | InvalidCodeLen + _ => break TINFLStatus::Failed, + } + }; + + let in_undo = if status != TINFLStatus::NeedsMoreInput + && status != TINFLStatus::FailedCannotMakeProgress + { + undo_bytes(&mut l, (in_buf.len() - in_iter.len()) as u32) as usize + } else { + 0 + }; + + // Make sure HasMoreOutput overrides NeedsMoreInput if the output buffer is full. + // (Unless the missing input is the adler32 value in which case we don't need to write anything.) + // TODO: May want to see if we can do this in a better way.
+ if status == TINFLStatus::NeedsMoreInput + && out_buf.bytes_left() == 0 + && state != State::ReadAdler32 + { + status = TINFLStatus::HasMoreOutput + } + + r.state = state; + r.bit_buf = l.bit_buf; + r.num_bits = l.num_bits; + r.dist = l.dist; + r.counter = l.counter; + r.num_extra = l.num_extra; + + r.bit_buf &= ((1 as BitBuffer) << r.num_bits) - 1; + + // Update the adler32 checksum with the decompressed bytes if this is a zlib stream or the + // checksum was explicitly requested. + let need_adler = if (flags & TINFL_FLAG_IGNORE_ADLER32) == 0 { + flags & (TINFL_FLAG_PARSE_ZLIB_HEADER | TINFL_FLAG_COMPUTE_ADLER32) != 0 + } else { + // If TINFL_FLAG_IGNORE_ADLER32 is enabled, ignore the checksum. + false + }; + if need_adler && status as i32 >= 0 { + let out_buf_pos = out_buf.position(); + r.check_adler32 = update_adler32(r.check_adler32, &out_buf.get_ref()[out_pos..out_buf_pos]); + + // disabled so that random input from fuzzer would not be rejected early, + // before it has a chance to reach interesting parts of code + if !cfg!(fuzzing) { + // Once we are done, check if the checksum matches the one read from the end of the zlib stream. + if status == TINFLStatus::Done + && flags & TINFL_FLAG_PARSE_ZLIB_HEADER != 0 + && r.check_adler32 != r.z_adler32 + { + status = TINFLStatus::Adler32Mismatch; + } + } + } + + ( + status, + in_buf.len() - in_iter.len() - in_undo, + out_buf.position() - out_pos, + ) +} + +#[cfg(test)] +mod test { + use super::*; + + //TODO: Fix these.
+ + fn tinfl_decompress_oxide<'i>( + r: &mut DecompressorOxide, + input_buffer: &'i [u8], + output_buffer: &mut [u8], + flags: u32, + ) -> (TINFLStatus, &'i [u8], usize) { + let (status, in_pos, out_pos) = decompress(r, input_buffer, output_buffer, 0, flags); + (status, &input_buffer[in_pos..], out_pos) + } + + #[test] + fn decompress_zlib() { + let encoded = [ + 120, 156, 243, 72, 205, 201, 201, 215, 81, 168, 202, 201, 76, 82, 4, 0, 27, 101, 4, 19, + ]; + let flags = TINFL_FLAG_COMPUTE_ADLER32 | TINFL_FLAG_PARSE_ZLIB_HEADER; + + let mut b = DecompressorOxide::new(); + const LEN: usize = 32; + let mut b_buf = [0; LEN]; + + // This should fail with the out buffer being too small. + let b_status = tinfl_decompress_oxide(&mut b, &encoded[..], &mut b_buf, flags); + + assert_eq!(b_status.0, TINFLStatus::Failed); + + let flags = flags | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + + b = DecompressorOxide::new(); + + // With TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF set this should no longer fail.
+ let b_status = tinfl_decompress_oxide(&mut b, &encoded[..], &mut b_buf, flags); + + assert_eq!(b_buf[..b_status.2], b"Hello, zlib!"[..]); + assert_eq!(b_status.0, TINFLStatus::Done); + } + + #[cfg(feature = "with-alloc")] + #[test] + fn raw_block() { + const LEN: usize = 64; + + let text = b"Hello, zlib!"; + let encoded = { + let len = text.len(); + let notlen = !len; + let mut encoded = vec![ + 1, + len as u8, + (len >> 8) as u8, + notlen as u8, + (notlen >> 8) as u8, + ]; + encoded.extend_from_slice(&text[..]); + encoded + }; + + //let flags = TINFL_FLAG_COMPUTE_ADLER32 | TINFL_FLAG_PARSE_ZLIB_HEADER | + let flags = TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + + let mut b = DecompressorOxide::new(); + + let mut b_buf = [0; LEN]; + + let b_status = tinfl_decompress_oxide(&mut b, &encoded[..], &mut b_buf, flags); + assert_eq!(b_buf[..b_status.2], text[..]); + assert_eq!(b_status.0, TINFLStatus::Done); + } + + fn masked_lookup(table: &HuffmanTable, bit_buf: BitBuffer) -> (i32, u32) { + let ret = table.lookup(bit_buf).unwrap(); + (ret.0 & 511, ret.1) + } + + #[test] + fn fixed_table_lookup() { + let mut d = DecompressorOxide::new(); + d.block_type = 1; + start_static_table(&mut d); + let mut l = LocalVars { + bit_buf: d.bit_buf, + num_bits: d.num_bits, + dist: d.dist, + counter: d.counter, + num_extra: d.num_extra, + }; + init_tree(&mut d, &mut l).unwrap(); + let llt = &d.tables[LITLEN_TABLE]; + let dt = &d.tables[DIST_TABLE]; + assert_eq!(masked_lookup(llt, 0b00001100), (0, 8)); + assert_eq!(masked_lookup(llt, 0b00011110), (72, 8)); + assert_eq!(masked_lookup(llt, 0b01011110), (74, 8)); + assert_eq!(masked_lookup(llt, 0b11111101), (143, 8)); + assert_eq!(masked_lookup(llt, 0b000010011), (144, 9)); + assert_eq!(masked_lookup(llt, 0b111111111), (255, 9)); + assert_eq!(masked_lookup(llt, 0b00000000), (256, 7)); + assert_eq!(masked_lookup(llt, 0b1110100), (279, 7)); + assert_eq!(masked_lookup(llt, 0b00000011), (280, 8)); + assert_eq!(masked_lookup(llt, 0b11100011), 
(287, 8)); + + assert_eq!(masked_lookup(dt, 0), (0, 5)); + assert_eq!(masked_lookup(dt, 20), (5, 5)); + } + + // Only run this test with alloc enabled as it uses a larger buffer. + #[cfg(feature = "with-alloc")] + fn check_result(input: &[u8], expected_status: TINFLStatus, expected_state: State, zlib: bool) { + let mut r = DecompressorOxide::default(); + let mut output_buf = vec![0; 1024 * 32]; + let flags = if zlib { + inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER + } else { + 0 + } | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF + | TINFL_FLAG_HAS_MORE_INPUT; + let (d_status, _in_bytes, _out_bytes) = + decompress(&mut r, input, &mut output_buf, 0, flags); + assert_eq!(expected_status, d_status); + assert_eq!(expected_state, r.state); + } + + #[cfg(feature = "with-alloc")] + #[test] + fn bogus_input() { + use self::check_result as cr; + const F: TINFLStatus = TINFLStatus::Failed; + const OK: TINFLStatus = TINFLStatus::Done; + // Bad CM. + cr(&[0x77, 0x85], F, State::BadZlibHeader, true); + // Bad window size (but check is correct). + cr(&[0x88, 0x98], F, State::BadZlibHeader, true); + // Bad check bits. + cr(&[0x78, 0x98], F, State::BadZlibHeader, true); + + // Too many code lengths. (From inflate library issues) + cr( + b"M\xff\xffM*\xad\xad\xad\xad\xad\xad\xad\xcd\xcd\xcdM", + F, + State::BadDistOrLiteralTableLength, + false, + ); + + // Bad CLEN (also from inflate library issues) + cr( + b"\xdd\xff\xff*M\x94ffffffffff", + F, + State::BadDistOrLiteralTableLength, + false, + ); + + // Port of inflate coverage tests from zlib-ng + // https://github.com/Dead2/zlib-ng/blob/develop/test/infcover.c + let c = |a, b, c| cr(a, b, c, false); + + // Invalid uncompressed/raw block length. + c(&[0, 0, 0, 0, 0], F, State::BadRawLength); + // Ok empty uncompressed block. + c(&[3, 0], OK, State::DoneForever); + // Invalid block type. + c(&[6], F, State::BlockTypeUnexpected); + // Ok uncompressed block. 
+ c(&[1, 1, 0, 0xfe, 0xff, 0], OK, State::DoneForever); + // Too many litlens, we handle this later than zlib, so this test won't + // give the same result. + // c(&[0xfc, 0, 0], F, State::BadTotalSymbols); + // Invalid set of code lengths - TODO Check if this is the correct error for this. + c(&[4, 0, 0xfe, 0xff], F, State::BadTotalSymbols); + // Invalid repeat in list of code lengths. + // (Try to repeat a non-existent code.) + c(&[4, 0, 0x24, 0x49, 0], F, State::BadCodeSizeDistPrevLookup); + // Missing end of block code (should we have a separate error for this?) - fails on further input + // c(&[4, 0, 0x24, 0xe9, 0xff, 0x6d], F, State::BadTotalSymbols); + // Invalid set of literals/lengths + c( + &[ + 4, 0x80, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0x71, 0xff, 0xff, 0x93, 0x11, 0, + ], + F, + State::BadTotalSymbols, + ); + // Invalid set of distances _ needsmoreinput + // c(&[4, 0x80, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24, 0x0f, 0xb4, 0xff, 0xff, 0xc3, 0x84], F, State::BadTotalSymbols); + // Invalid distance code + c(&[2, 0x7e, 0xff, 0xff], F, State::InvalidDist); + + // Distance refers to position before the start + c( + &[0x0c, 0xc0, 0x81, 0, 0, 0, 0, 0, 0x90, 0xff, 0x6b, 0x4, 0], + F, + State::DistanceOutOfBounds, + ); + + // Trailer + // Bad gzip trailer checksum GZip header not handled by miniz_oxide + //cr(&[0x1f, 0x8b, 0x08 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0x03, 0, 0, 0, 0, 0x01], F, State::BadCRC, false) + // Bad gzip trailer length + //cr(&[0x1f, 0x8b, 0x08 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0x03, 0, 0, 0, 0, 0, 0, 0, 0, 0x01], F, State::BadCRC, false) + } + + #[test] + fn empty_output_buffer_non_wrapping() { + let encoded = [ + 120, 156, 243, 72, 205, 201, 201, 215, 81, 168, 202, 201, 76, 82, 4, 0, 27, 101, 4, 19, + ]; + let flags = TINFL_FLAG_COMPUTE_ADLER32 + | TINFL_FLAG_PARSE_ZLIB_HEADER + | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + let mut r = DecompressorOxide::new(); + let mut output_buf: [u8; 0] = []; + // Check that we handle an empty buffer properly and not panicking. 
+ // https://github.com/Frommi/miniz_oxide/issues/23 + let res = decompress(&mut r, &encoded, &mut output_buf, 0, flags); + assert_eq!(res, (TINFLStatus::HasMoreOutput, 4, 0)); + } + + #[test] + fn empty_output_buffer_wrapping() { + let encoded = [ + 0x73, 0x49, 0x4d, 0xcb, 0x49, 0x2c, 0x49, 0x55, 0x00, 0x11, 0x00, + ]; + let flags = TINFL_FLAG_COMPUTE_ADLER32; + let mut r = DecompressorOxide::new(); + let mut output_buf: [u8; 0] = []; + // Check that we handle an empty buffer properly and not panicking. + // https://github.com/Frommi/miniz_oxide/issues/23 + let res = decompress(&mut r, &encoded, &mut output_buf, 0, flags); + assert_eq!(res, (TINFLStatus::HasMoreOutput, 2, 0)); + } +} diff --git a/miniz_oxide-0.7.2/src/inflate/mod.rs b/miniz_oxide-0.7.2/src/inflate/mod.rs new file mode 100644 index 0000000000000..3f787e726f103 --- /dev/null +++ b/miniz_oxide-0.7.2/src/inflate/mod.rs @@ -0,0 +1,343 @@ +//! This module contains functionality for decompression. + +#[cfg(feature = "with-alloc")] +use crate::alloc::{boxed::Box, vec, vec::Vec}; +use ::core::usize; +#[cfg(all(feature = "std", feature = "with-alloc"))] +use std::error::Error; + +pub mod core; +mod output_buffer; +pub mod stream; +use self::core::*; + +const TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS: i32 = -4; +const TINFL_STATUS_BAD_PARAM: i32 = -3; +const TINFL_STATUS_ADLER32_MISMATCH: i32 = -2; +const TINFL_STATUS_FAILED: i32 = -1; +const TINFL_STATUS_DONE: i32 = 0; +const TINFL_STATUS_NEEDS_MORE_INPUT: i32 = 1; +const TINFL_STATUS_HAS_MORE_OUTPUT: i32 = 2; + +/// Return status codes. +#[repr(i8)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum TINFLStatus { + /// More input data was expected, but the caller indicated that there was no more data, so the + /// input stream is likely truncated. + /// + /// This can't happen if you have provided the + /// [`TINFL_FLAG_HAS_MORE_INPUT`][core::inflate_flags::TINFL_FLAG_HAS_MORE_INPUT] flag to the + /// decompression. 
By setting that flag, you indicate more input exists but is not provided, + /// and so reaching the end of the input data without finding the end of the compressed stream + /// would instead return a [`NeedsMoreInput`][Self::NeedsMoreInput] status. + FailedCannotMakeProgress = TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS as i8, + + /// The output buffer is an invalid size; consider the `flags` parameter. + BadParam = TINFL_STATUS_BAD_PARAM as i8, + + /// The decompression went fine, but the adler32 checksum did not match the one + /// provided in the header. + Adler32Mismatch = TINFL_STATUS_ADLER32_MISMATCH as i8, + + /// Failed to decompress due to invalid data. + Failed = TINFL_STATUS_FAILED as i8, + + /// Finished decompression without issues. + /// + /// This indicates the end of the compressed stream has been reached. + Done = TINFL_STATUS_DONE as i8, + + /// The decompressor needs more input data to continue decompressing. + /// + /// This occurs when there's no more consumable input, but the end of the stream hasn't been + /// reached, and you have supplied the + /// [`TINFL_FLAG_HAS_MORE_INPUT`][core::inflate_flags::TINFL_FLAG_HAS_MORE_INPUT] flag to the + /// decompressor. Had you not supplied that flag (which would mean you were asserting that you + /// believed all the data was available) you would have gotten a + /// [`FailedCannotMakeProgress`][Self::FailedCannotMakeProgress] instead. + NeedsMoreInput = TINFL_STATUS_NEEDS_MORE_INPUT as i8, + + /// There is still pending data that didn't fit in the output buffer.
+ HasMoreOutput = TINFL_STATUS_HAS_MORE_OUTPUT as i8, +} + +impl TINFLStatus { + pub fn from_i32(value: i32) -> Option { + use self::TINFLStatus::*; + match value { + TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS => Some(FailedCannotMakeProgress), + TINFL_STATUS_BAD_PARAM => Some(BadParam), + TINFL_STATUS_ADLER32_MISMATCH => Some(Adler32Mismatch), + TINFL_STATUS_FAILED => Some(Failed), + TINFL_STATUS_DONE => Some(Done), + TINFL_STATUS_NEEDS_MORE_INPUT => Some(NeedsMoreInput), + TINFL_STATUS_HAS_MORE_OUTPUT => Some(HasMoreOutput), + _ => None, + } + } +} + +/// Struct returned when the `decompress_to_vec` functions fail. +#[cfg(feature = "with-alloc")] +#[derive(Debug)] +pub struct DecompressError { + /// Decompressor status on failure. See [TINFLStatus] for details. + pub status: TINFLStatus, + /// The currently decompressed data if any. + pub output: Vec, +} + +#[cfg(feature = "with-alloc")] +impl alloc::fmt::Display for DecompressError { + #[cold] + fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { + f.write_str(match self.status { + TINFLStatus::FailedCannotMakeProgress => "Truncated input stream", + TINFLStatus::BadParam => "Invalid output buffer size", + TINFLStatus::Adler32Mismatch => "Adler32 checksum mismatch", + TINFLStatus::Failed => "Invalid input data", + TINFLStatus::Done => "", // Unreachable + TINFLStatus::NeedsMoreInput => "Truncated input stream", + TINFLStatus::HasMoreOutput => "Output size exceeded the specified limit", + }) + } +} + +/// Implement Error trait only if std feature is requested as it requires std. +#[cfg(all(feature = "std", feature = "with-alloc"))] +impl Error for DecompressError {} + +#[cfg(feature = "with-alloc")] +fn decompress_error(status: TINFLStatus, output: Vec) -> Result, DecompressError> { + Err(DecompressError { status, output }) +} + +/// Decompress the deflate-encoded data in `input` to a vector.
+/// +/// NOTE: This function will not bound the output, so if the output is large enough it can result in an out of memory error. +/// It is therefore suggested to not use this for anything other than test programs, use the functions with a specified limit, or +/// ideally streaming decompression via the [flate2](https://github.com/alexcrichton/flate2-rs) library instead. +/// +/// Returns a [`Result`] containing the [`Vec`] of decompressed data on success, and a [struct][DecompressError] containing the status and so far decompressed data if any on failure. +#[inline] +#[cfg(feature = "with-alloc")] +pub fn decompress_to_vec(input: &[u8]) -> Result, DecompressError> { + decompress_to_vec_inner(input, 0, usize::max_value()) +} + +/// Decompress the deflate-encoded data (with a zlib wrapper) in `input` to a vector. +/// +/// NOTE: This function will not bound the output, so if the output is large enough it can result in an out of memory error. +/// It is therefore suggested to not use this for anything other than test programs, use the functions with a specified limit, or +/// ideally streaming decompression via the [flate2](https://github.com/alexcrichton/flate2-rs) library instead. +/// +/// Returns a [`Result`] containing the [`Vec`] of decompressed data on success, and a [struct][DecompressError] containing the status and so far decompressed data if any on failure. +#[inline] +#[cfg(feature = "with-alloc")] +pub fn decompress_to_vec_zlib(input: &[u8]) -> Result, DecompressError> { + decompress_to_vec_inner( + input, + inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER, + usize::max_value(), + ) +} + +/// Decompress the deflate-encoded data in `input` to a vector. +/// +/// The vector is grown to at most `max_size` bytes; if the data does not fit in that size, +/// the error [struct][DecompressError] will contain the status [`TINFLStatus::HasMoreOutput`] and the data that was decompressed on failure. 
+/// +/// As this function tries to decompress everything in one go, it's not ideal for general use outside of tests or where the output size is expected to be small. +/// It is suggested to use streaming decompression via the [flate2](https://github.com/alexcrichton/flate2-rs) library instead. +/// +/// Returns a [`Result`] containing the [`Vec`] of decompressed data on success, and a [struct][DecompressError] on failure. +#[inline] +#[cfg(feature = "with-alloc")] +pub fn decompress_to_vec_with_limit( + input: &[u8], + max_size: usize, +) -> Result, DecompressError> { + decompress_to_vec_inner(input, 0, max_size) +} + +/// Decompress the deflate-encoded data (with a zlib wrapper) in `input` to a vector. +/// The vector is grown to at most `max_size` bytes; if the data does not fit in that size, +/// the error [struct][DecompressError] will contain the status [`TINFLStatus::HasMoreOutput`] and the data that was decompressed on failure. +/// +/// As this function tries to decompress everything in one go, it's not ideal for general use outside of tests or where the output size is expected to be small. +/// It is suggested to use streaming decompression via the [flate2](https://github.com/alexcrichton/flate2-rs) library instead. +/// +/// Returns a [`Result`] containing the [`Vec`] of decompressed data on success, and a [struct][DecompressError] on failure. +#[inline] +#[cfg(feature = "with-alloc")] +pub fn decompress_to_vec_zlib_with_limit( + input: &[u8], + max_size: usize, +) -> Result, DecompressError> { + decompress_to_vec_inner(input, inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER, max_size) +} + +/// Backend of various to-[`Vec`] decompressions. +/// +/// Returns [`Vec`] of decompressed data on success and the [error struct][DecompressError] with details on failure. 
+#[cfg(feature = "with-alloc")] +fn decompress_to_vec_inner( + mut input: &[u8], + flags: u32, + max_output_size: usize, +) -> Result, DecompressError> { + let flags = flags | inflate_flags::TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + let mut ret: Vec = vec![0; input.len().saturating_mul(2).min(max_output_size)]; + + let mut decomp = Box::::default(); + + let mut out_pos = 0; + loop { + // Wrap the whole output slice so we know we have enough of the + // decompressed data for matches. + let (status, in_consumed, out_consumed) = + decompress(&mut decomp, input, &mut ret, out_pos, flags); + out_pos += out_consumed; + + match status { + TINFLStatus::Done => { + ret.truncate(out_pos); + return Ok(ret); + } + + TINFLStatus::HasMoreOutput => { + // in_consumed is not expected to be out of bounds, + // but the check eliminates a panicking code path + if in_consumed > input.len() { + return decompress_error(TINFLStatus::HasMoreOutput, ret); + } + input = &input[in_consumed..]; + + // if the buffer has already reached the size limit, return an error + if ret.len() >= max_output_size { + return decompress_error(TINFLStatus::HasMoreOutput, ret); + } + // calculate the new length, capped at `max_output_size` + let new_len = ret.len().saturating_mul(2).min(max_output_size); + ret.resize(new_len, 0); + } + + _ => return decompress_error(status, ret), + } + } +} + +/// Decompress one or more source slices from an iterator into the output slice. +/// +/// * On success, returns the number of bytes that were written. +/// * On failure, returns the failure status code. +/// +/// This will fail if the output buffer is not large enough, but in that case +/// the output buffer will still contain the partial decompression. +/// +/// * `out` the output buffer. +/// * `it` the iterator of input slices. +/// * `zlib_header` if the first slice out of the iterator is expected to have a +/// Zlib header. Otherwise the slices are assumed to be the deflate data only. 
+/// * `ignore_adler32` if `true`, the adler32 checksum is neither calculated nor verified. +pub fn decompress_slice_iter_to_slice<'out, 'inp>( + out: &'out mut [u8], + it: impl Iterator, + zlib_header: bool, + ignore_adler32: bool, +) -> Result { + use self::core::inflate_flags::*; + + let mut it = it.peekable(); + let r = &mut DecompressorOxide::new(); + let mut out_pos = 0; + while let Some(in_buf) = it.next() { + let has_more = it.peek().is_some(); + let flags = { + let mut f = TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + if zlib_header { + f |= TINFL_FLAG_PARSE_ZLIB_HEADER; + } + if ignore_adler32 { + f |= TINFL_FLAG_IGNORE_ADLER32; + } + if has_more { + f |= TINFL_FLAG_HAS_MORE_INPUT; + } + f + }; + let (status, _input_read, bytes_written) = decompress(r, in_buf, out, out_pos, flags); + out_pos += bytes_written; + match status { + TINFLStatus::NeedsMoreInput => continue, + TINFLStatus::Done => return Ok(out_pos), + e => return Err(e), + } + } + // If we ran out of source slices without getting a `Done` from the + // decompression we can call it a failure. + Err(TINFLStatus::FailedCannotMakeProgress) +} + +#[cfg(all(test, feature = "with-alloc"))] +mod test { + use super::{ + decompress_slice_iter_to_slice, decompress_to_vec_zlib, decompress_to_vec_zlib_with_limit, + DecompressError, TINFLStatus, + }; + const ENCODED: [u8; 20] = [ + 120, 156, 243, 72, 205, 201, 201, 215, 81, 168, 202, 201, 76, 82, 4, 0, 27, 101, 4, 19, + ]; + + #[test] + fn decompress_vec() { + let res = decompress_to_vec_zlib(&ENCODED[..]).unwrap(); + assert_eq!(res.as_slice(), &b"Hello, zlib!"[..]); + } + + #[test] + fn decompress_vec_with_high_limit() { + let res = decompress_to_vec_zlib_with_limit(&ENCODED[..], 100_000).unwrap(); + assert_eq!(res.as_slice(), &b"Hello, zlib!"[..]); + } + + #[test] + fn fail_to_decompress_with_limit() { + let res = decompress_to_vec_zlib_with_limit(&ENCODED[..], 8); + match res { + Err(DecompressError { + status: TINFLStatus::HasMoreOutput, + ..
+ }) => (), // expected result + _ => panic!("Decompression output size limit was not enforced"), + } + } + + #[test] + fn test_decompress_slice_iter_to_slice() { + // one slice + let mut out = [0_u8; 12_usize]; + let r = + decompress_slice_iter_to_slice(&mut out, Some(&ENCODED[..]).into_iter(), true, false); + assert_eq!(r, Ok(12)); + assert_eq!(&out[..12], &b"Hello, zlib!"[..]); + + // some chunks at a time + for chunk_size in 1..13 { + // Note: because of https://github.com/Frommi/miniz_oxide/issues/110 our + // out buffer needs to have +1 byte available when the chunk size cuts + // the adler32 data off from the last actual data. + let mut out = [0_u8; 12_usize + 1]; + let r = + decompress_slice_iter_to_slice(&mut out, ENCODED.chunks(chunk_size), true, false); + assert_eq!(r, Ok(12)); + assert_eq!(&out[..12], &b"Hello, zlib!"[..]); + } + + // output buffer too small + let mut out = [0_u8; 3_usize]; + let r = decompress_slice_iter_to_slice(&mut out, ENCODED.chunks(7), true, false); + assert!(r.is_err()); + } +} diff --git a/miniz_oxide-0.7.2/src/inflate/output_buffer.rs b/miniz_oxide-0.7.2/src/inflate/output_buffer.rs new file mode 100644 index 0000000000000..5218a807d3f78 --- /dev/null +++ b/miniz_oxide-0.7.2/src/inflate/output_buffer.rs @@ -0,0 +1,60 @@ +/// A wrapper for the output slice used when decompressing. +/// +/// Using this rather than `Cursor` lets us implement the writing methods directly on +/// the buffer and lets us use a usize rather than u64 for the position which helps with +/// performance on 32-bit systems. 
+pub struct OutputBuffer<'a> { + slice: &'a mut [u8], + position: usize, +} + +impl<'a> OutputBuffer<'a> { + #[inline] + pub fn from_slice_and_pos(slice: &'a mut [u8], position: usize) -> OutputBuffer<'a> { + OutputBuffer { slice, position } + } + + #[inline] + pub const fn position(&self) -> usize { + self.position + } + + #[inline] + pub fn set_position(&mut self, position: usize) { + self.position = position; + } + + /// Write a byte to the current position and increment + /// + /// Assumes that there is space. + #[inline] + pub fn write_byte(&mut self, byte: u8) { + self.slice[self.position] = byte; + self.position += 1; + } + + /// Write a slice to the current position and increment + /// + /// Assumes that there is space. + #[inline] + pub fn write_slice(&mut self, data: &[u8]) { + let len = data.len(); + self.slice[self.position..self.position + len].copy_from_slice(data); + self.position += data.len(); + } + + #[inline] + pub const fn bytes_left(&self) -> usize { + self.slice.len() - self.position + } + + #[inline] + pub const fn get_ref(&self) -> &[u8] { + self.slice + } + + #[inline] + pub fn get_mut(&mut self) -> &mut [u8] { + self.slice + } +} diff --git a/miniz_oxide-0.7.2/src/inflate/stream.rs b/miniz_oxide-0.7.2/src/inflate/stream.rs new file mode 100644 index 0000000000000..5463ab0fdfd29 --- /dev/null +++ b/miniz_oxide-0.7.2/src/inflate/stream.rs @@ -0,0 +1,423 @@ +//! Extra streaming decompression functionality. +//! +//! As of now this is mainly intended for use to build a higher-level wrapper. 
+#[cfg(feature = "with-alloc")] +use crate::alloc::boxed::Box; +use core::{cmp, mem}; + +use crate::inflate::core::{decompress, inflate_flags, DecompressorOxide, TINFL_LZ_DICT_SIZE}; +use crate::inflate::TINFLStatus; +use crate::{DataFormat, MZError, MZFlush, MZResult, MZStatus, StreamResult}; + +/// Tag that determines reset policy of [InflateState](struct.InflateState.html) +pub trait ResetPolicy { + /// Performs reset + fn reset(&self, state: &mut InflateState); +} + +/// Resets state, without performing expensive ops (e.g. zeroing buffer) +/// +/// Note that not zeroing buffer can lead to security issues when dealing with untrusted input. +pub struct MinReset; + +impl ResetPolicy for MinReset { + fn reset(&self, state: &mut InflateState) { + state.decompressor().init(); + state.dict_ofs = 0; + state.dict_avail = 0; + state.first_call = true; + state.has_flushed = false; + state.last_status = TINFLStatus::NeedsMoreInput; + } +} + +/// Resets state and zeroes memory, continuing to use the same data format. +pub struct ZeroReset; + +impl ResetPolicy for ZeroReset { + #[inline] + fn reset(&self, state: &mut InflateState) { + MinReset.reset(state); + state.dict = [0; TINFL_LZ_DICT_SIZE]; + } +} + +/// Full reset of the state, including zeroing memory. +/// +/// Requires a new data format to be provided. +pub struct FullReset(pub DataFormat); + +impl ResetPolicy for FullReset { + #[inline] + fn reset(&self, state: &mut InflateState) { + ZeroReset.reset(state); + state.data_format = self.0; + } +} + +/// A struct that combines a decompressor with extra data for streaming decompression. +/// +pub struct InflateState { + /// Inner decompressor struct + decomp: DecompressorOxide, + + /// Buffer of decompressed bytes for matches. + /// TODO: Could probably do this a bit cleaner with some + /// Cursor-like class. + /// We may also look into whether we need to keep a buffer here, or just one in the + /// decompressor struct.
+ dict: [u8; TINFL_LZ_DICT_SIZE], + /// Where in the buffer are we currently at? + dict_ofs: usize, + /// How many bytes of data waiting to be flushed are currently in the buffer? + dict_avail: usize, + + first_call: bool, + has_flushed: bool, + + /// Whether the input data is wrapped in a zlib header and checksum. + /// TODO: This should be stored in the decompressor. + data_format: DataFormat, + last_status: TINFLStatus, +} + +impl Default for InflateState { + fn default() -> Self { + InflateState { + decomp: DecompressorOxide::default(), + dict: [0; TINFL_LZ_DICT_SIZE], + dict_ofs: 0, + dict_avail: 0, + first_call: true, + has_flushed: false, + data_format: DataFormat::Raw, + last_status: TINFLStatus::NeedsMoreInput, + } + } +} +impl InflateState { + /// Create a new state. + /// + /// Note that this struct is quite large due to internal buffers, and as such storing it on + /// the stack is not recommended. + /// + /// # Parameters + /// `data_format`: Determines whether the compressed data is assumed to be wrapped with zlib + /// metadata. + pub fn new(data_format: DataFormat) -> InflateState { + InflateState { + data_format, + ..Default::default() + } + } + + /// Create a new state on the heap. + /// + /// # Parameters + /// `data_format`: Determines whether the compressed data is assumed to be wrapped with zlib + /// metadata. + #[cfg(feature = "with-alloc")] + pub fn new_boxed(data_format: DataFormat) -> Box { + let mut b: Box = Box::default(); + b.data_format = data_format; + b + } + + /// Access the inner decompressor. + pub fn decompressor(&mut self) -> &mut DecompressorOxide { + &mut self.decomp + } + + /// Return the status of the last call to `inflate` with this `InflateState`. + pub const fn last_status(&self) -> TINFLStatus { + self.last_status + } + + /// Create a new state using miniz/zlib style window bits parameter. + /// + /// The decompressor does not support different window sizes.
As such, + /// any positive (>0) value will set the zlib header flag, while a negative one + /// will not. + #[cfg(feature = "with-alloc")] + pub fn new_boxed_with_window_bits(window_bits: i32) -> Box { + let mut b: Box = Box::default(); + b.data_format = DataFormat::from_window_bits(window_bits); + b + } + + #[inline] + /// Reset the decompressor without re-allocating memory, using the given + /// data format. + pub fn reset(&mut self, data_format: DataFormat) { + self.reset_as(FullReset(data_format)); + } + + #[inline] + /// Resets the state according to the specified policy. + pub fn reset_as(&mut self, policy: T) { + policy.reset(self) + } +} + +/// Try to decompress from `input` to `output` with the given [`InflateState`] +/// +/// # `flush` +/// +/// Generally, the various [`MZFlush`] flags have meaning only on the compression side. They can be +/// supplied here, but the only one that has any semantic meaning is [`MZFlush::Finish`], which is a +/// signal that the stream is expected to finish, and failing to do so is an error. It isn't +/// necessary to specify it when the stream ends; you'll still get returned a +/// [`MZStatus::StreamEnd`] anyway. Other values either have no effect or cause errors. It's +/// likely that you'll almost always just want to use [`MZFlush::None`]. +/// +/// # Errors +/// +/// Returns [`MZError::Buf`] if the `output` slice is empty or no progress was made due +/// to lack of expected input data, or if called with [`MZFlush::Finish`] and input wasn't all +/// consumed. +/// +/// Returns [`MZError::Data`] if this or a previous call failed with an error return from +/// [`TINFLStatus`]; probably indicates corrupted data. +/// +/// Returns [`MZError::Stream`] when called with [`MZFlush::Full`] (meaningless on +/// decompression), or when called without [`MZFlush::Finish`] after an earlier call with +/// [`MZFlush::Finish`] has been made.
+pub fn inflate( + state: &mut InflateState, + input: &[u8], + output: &mut [u8], + flush: MZFlush, +) -> StreamResult { + let mut bytes_consumed = 0; + let mut bytes_written = 0; + let mut next_in = input; + let mut next_out = output; + + if flush == MZFlush::Full { + return StreamResult::error(MZError::Stream); + } + + let mut decomp_flags = if state.data_format == DataFormat::Zlib { + inflate_flags::TINFL_FLAG_COMPUTE_ADLER32 + } else { + inflate_flags::TINFL_FLAG_IGNORE_ADLER32 + }; + + if (state.data_format == DataFormat::Zlib) + | (state.data_format == DataFormat::ZLibIgnoreChecksum) + { + decomp_flags |= inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER; + } + + let first_call = state.first_call; + state.first_call = false; + if state.last_status == TINFLStatus::FailedCannotMakeProgress { + return StreamResult::error(MZError::Buf); + } + if (state.last_status as i32) < 0 { + return StreamResult::error(MZError::Data); + } + + if state.has_flushed && (flush != MZFlush::Finish) { + return StreamResult::error(MZError::Stream); + } + state.has_flushed |= flush == MZFlush::Finish; + + if (flush == MZFlush::Finish) && first_call { + decomp_flags |= inflate_flags::TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; + + let status = decompress(&mut state.decomp, next_in, next_out, 0, decomp_flags); + let in_bytes = status.1; + let out_bytes = status.2; + let status = status.0; + + state.last_status = status; + + bytes_consumed += in_bytes; + bytes_written += out_bytes; + + let ret_status = { + if status == TINFLStatus::FailedCannotMakeProgress { + Err(MZError::Buf) + } else if (status as i32) < 0 { + Err(MZError::Data) + } else if status != TINFLStatus::Done { + state.last_status = TINFLStatus::Failed; + Err(MZError::Buf) + } else { + Ok(MZStatus::StreamEnd) + } + }; + return StreamResult { + bytes_consumed, + bytes_written, + status: ret_status, + }; + } + + if flush != MZFlush::Finish { + decomp_flags |= inflate_flags::TINFL_FLAG_HAS_MORE_INPUT; + } + + if state.dict_avail != 0 { 
+ bytes_written += push_dict_out(state, &mut next_out); + return StreamResult { + bytes_consumed, + bytes_written, + status: Ok( + if (state.last_status == TINFLStatus::Done) && (state.dict_avail == 0) { + MZStatus::StreamEnd + } else { + MZStatus::Ok + }, + ), + }; + } + + let status = inflate_loop( + state, + &mut next_in, + &mut next_out, + &mut bytes_consumed, + &mut bytes_written, + decomp_flags, + flush, + ); + StreamResult { + bytes_consumed, + bytes_written, + status, + } +} + +fn inflate_loop( + state: &mut InflateState, + next_in: &mut &[u8], + next_out: &mut &mut [u8], + total_in: &mut usize, + total_out: &mut usize, + decomp_flags: u32, + flush: MZFlush, +) -> MZResult { + let orig_in_len = next_in.len(); + loop { + let status = decompress( + &mut state.decomp, + next_in, + &mut state.dict, + state.dict_ofs, + decomp_flags, + ); + + let in_bytes = status.1; + let out_bytes = status.2; + let status = status.0; + + state.last_status = status; + + *next_in = &next_in[in_bytes..]; + *total_in += in_bytes; + + state.dict_avail = out_bytes; + *total_out += push_dict_out(state, next_out); + + // The stream was corrupted, and decompression failed. + if (status as i32) < 0 { + return Err(MZError::Data); + } + + // The decompressor has flushed all its data and is waiting for more input, but + // there was no more input provided. + if (status == TINFLStatus::NeedsMoreInput) && orig_in_len == 0 { + return Err(MZError::Buf); + } + + if flush == MZFlush::Finish { + if status == TINFLStatus::Done { + // There is not enough space in the output buffer to flush the remaining + // decompressed data in the internal buffer. + return if state.dict_avail != 0 { + Err(MZError::Buf) + } else { + Ok(MZStatus::StreamEnd) + }; + // No more space in the output buffer, but we're not done. + } else if next_out.is_empty() { + return Err(MZError::Buf); + } + } else { + // We're not expected to finish, so it's fine if we can't flush everything yet.
+ let empty_buf = next_in.is_empty() || next_out.is_empty(); + if (status == TINFLStatus::Done) || empty_buf || (state.dict_avail != 0) { + return if (status == TINFLStatus::Done) && (state.dict_avail == 0) { + // No more data left, we're done. + Ok(MZStatus::StreamEnd) + } else { + // Ok for now, still waiting for more input data or output space. + Ok(MZStatus::Ok) + }; + } + } + } +} + +fn push_dict_out(state: &mut InflateState, next_out: &mut &mut [u8]) -> usize { + let n = cmp::min(state.dict_avail, next_out.len()); + (next_out[..n]).copy_from_slice(&state.dict[state.dict_ofs..state.dict_ofs + n]); + *next_out = &mut mem::take(next_out)[n..]; + state.dict_avail -= n; + state.dict_ofs = (state.dict_ofs + (n)) & (TINFL_LZ_DICT_SIZE - 1); + n +} + +#[cfg(all(test, feature = "with-alloc"))] +mod test { + use super::{inflate, InflateState}; + use crate::{DataFormat, MZFlush, MZStatus}; + use alloc::vec; + + #[test] + fn test_state() { + let encoded = [ + 120u8, 156, 243, 72, 205, 201, 201, 215, 81, 168, 202, 201, 76, 82, 4, 0, 27, 101, 4, + 19, + ]; + let mut out = vec![0; 50]; + let mut state = InflateState::new_boxed(DataFormat::Zlib); + let res = inflate(&mut state, &encoded, &mut out, MZFlush::Finish); + let status = res.status.expect("Failed to decompress!"); + assert_eq!(status, MZStatus::StreamEnd); + assert_eq!(out[..res.bytes_written as usize], b"Hello, zlib!"[..]); + assert_eq!(res.bytes_consumed, encoded.len()); + + state.reset_as(super::ZeroReset); + out.iter_mut().map(|x| *x = 0).count(); + let res = inflate(&mut state, &encoded, &mut out, MZFlush::Finish); + let status = res.status.expect("Failed to decompress!"); + assert_eq!(status, MZStatus::StreamEnd); + assert_eq!(out[..res.bytes_written as usize], b"Hello, zlib!"[..]); + assert_eq!(res.bytes_consumed, encoded.len()); + + state.reset_as(super::MinReset); + out.iter_mut().map(|x| *x = 0).count(); + let res = inflate(&mut state, &encoded, &mut out, MZFlush::Finish); + let status = 
res.status.expect("Failed to decompress!"); + assert_eq!(status, MZStatus::StreamEnd); + assert_eq!(out[..res.bytes_written as usize], b"Hello, zlib!"[..]); + assert_eq!(res.bytes_consumed, encoded.len()); + assert_eq!(state.decompressor().adler32(), Some(459605011)); + + // Test state when not computing adler. + state = InflateState::new_boxed(DataFormat::ZLibIgnoreChecksum); + out.iter_mut().map(|x| *x = 0).count(); + let res = inflate(&mut state, &encoded, &mut out, MZFlush::Finish); + let status = res.status.expect("Failed to decompress!"); + assert_eq!(status, MZStatus::StreamEnd); + assert_eq!(out[..res.bytes_written as usize], b"Hello, zlib!"[..]); + assert_eq!(res.bytes_consumed, encoded.len()); + // Not computed, so should be Some(1) + assert_eq!(state.decompressor().adler32(), Some(1)); + // Should still have the checksum read from the header file. + assert_eq!(state.decompressor().adler32_header(), Some(459605011)) + } +} diff --git a/miniz_oxide-0.7.2/src/lib.rs b/miniz_oxide-0.7.2/src/lib.rs new file mode 100644 index 0000000000000..2842345ad5e06 --- /dev/null +++ b/miniz_oxide-0.7.2/src/lib.rs @@ -0,0 +1,206 @@ +//! A pure rust replacement for the [miniz](https://github.com/richgel999/miniz) +//! DEFLATE/zlib encoder/decoder. +//! Used a rust back-end for the +//! [flate2](https://github.com/alexcrichton/flate2-rs) crate. +//! 
+#![cfg_attr(
+    feature = "with-alloc",
+    doc = r##"
+# Usage
+## Simple compression/decompression:
+``` rust
+
+use miniz_oxide::inflate::decompress_to_vec;
+use miniz_oxide::deflate::compress_to_vec;
+
+fn roundtrip(data: &[u8]) {
+    let compressed = compress_to_vec(data, 6);
+    let decompressed = decompress_to_vec(compressed.as_slice()).expect("Failed to decompress!");
+#   let _ = decompressed;
+}
+
+# roundtrip(b"Test_data test data lalalal blabla");
+"##
+)]
+#![allow(elided_lifetimes_in_paths)]
+#![allow(unexpected_cfgs)]
+#![allow(unused_imports)]
+#![forbid(unsafe_code)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+#[cfg(feature = "with-alloc")]
+extern crate alloc;
+
+#[cfg(feature = "with-alloc")]
+pub mod deflate;
+pub mod inflate;
+mod shared;
+
+pub use crate::shared::update_adler32 as mz_adler32_oxide;
+pub use crate::shared::{MZ_ADLER32_INIT, MZ_DEFAULT_WINDOW_BITS};
+
+/// A list of flush types.
+///
+/// See <http://www.bolet.org/~pornin/deflate-flush.html> for more in-depth info.
+#[repr(i32)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum MZFlush {
+    /// Don't force any flushing.
+    /// Used when more input data is expected.
+    None = 0,
+    /// Zlib partial flush.
+    /// Currently treated as [`Sync`][MZFlush::Sync].
+    Partial = 1,
+    /// Finish compressing the currently buffered data, and output an empty raw block.
+    /// Has no use in decompression.
+    Sync = 2,
+    /// Same as [`Sync`][MZFlush::Sync], but resets the compression dictionary so that further
+    /// compressed data does not depend on data compressed before the flush.
+    ///
+    /// Has no use in decompression, and is an error to supply in that case.
+    Full = 3,
+    /// Attempt to flush the remaining data and end the stream.
+    Finish = 4,
+    /// Not implemented.
+    Block = 5,
+}
+
+impl MZFlush {
+    /// Create an MZFlush value from an integer value.
+    ///
+    /// Both `1` and `2` produce [`MZFlush::Sync`]; [`MZFlush::Partial`] is never returned.
+    ///
+    /// Returns `MZError::Param` on invalid values. This includes `5`, the discriminant of
+    /// [`MZFlush::Block`].
+    pub fn new(flush: i32) -> Result<Self, MZError> {
+        match flush {
+            0 => Ok(MZFlush::None),
+            1 | 2 => Ok(MZFlush::Sync),
+            3 => Ok(MZFlush::Full),
+            4 => Ok(MZFlush::Finish),
+            _ => Err(MZError::Param),
+        }
+    }
+}
+
+/// A list of miniz successful status codes.
+///
+/// These are emitted as the [`Ok`] side of a [`MZResult`] in the [`StreamResult`] returned from
+/// [`deflate::stream::deflate()`] or [`inflate::stream::inflate()`].
+#[repr(i32)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum MZStatus {
+    /// Operation succeeded.
+    ///
+    /// Some data was decompressed or compressed; see the byte counters in the [`StreamResult`] for
+    /// details.
+    Ok = 0,
+
+    /// Operation succeeded and end of deflate stream was found.
+    ///
+    /// X-ref [`TINFLStatus::Done`][inflate::TINFLStatus::Done] or
+    /// [`TDEFLStatus::Done`][deflate::core::TDEFLStatus::Done] for `inflate` or `deflate`
+    /// respectively.
+    StreamEnd = 1,
+
+    /// Unused
+    NeedDict = 2,
+}
+
+/// A list of miniz failed status codes.
+///
+/// These are emitted as the [`Err`] side of a [`MZResult`] in the [`StreamResult`] returned from
+/// [`deflate::stream::deflate()`] or [`inflate::stream::inflate()`].
+#[repr(i32)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum MZError {
+    /// Unused
+    ErrNo = -1,
+
+    /// General stream error.
+    ///
+    /// See [`inflate::stream::inflate()`] docs for details of how it can occur there.
+    ///
+    /// See [`deflate::stream::deflate()`] docs for how it can in principle occur there, though it's
+    /// believed impossible in practice.
+    Stream = -2,
+
+    /// Error in inflation; see [`inflate::stream::inflate()`] for details.
+    ///
+    /// Not returned from [`deflate::stream::deflate()`].
+    Data = -3,
+
+    /// Unused
+    Mem = -4,
+
+    /// Buffer-related error.
+    ///
+    /// See the docs of [`deflate::stream::deflate()`] or [`inflate::stream::inflate()`] for details
+    /// of when it would trigger in the one you're using.
+    Buf = -5,
+
+    /// Unused
+    Version = -6,
+
+    /// Bad parameters.
+    ///
+    /// This can be returned from [`deflate::stream::deflate()`] in the case of bad parameters. See
+    /// [`TDEFLStatus::BadParam`][deflate::core::TDEFLStatus::BadParam].
+    Param = -10_000,
+}
+
+/// How compressed data is wrapped.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+#[non_exhaustive]
+pub enum DataFormat {
+    /// Wrapped using the [zlib](http://www.zlib.org/rfc-zlib.html) format.
+    Zlib,
+    /// Zlib wrapped but ignore and don't compute the adler32 checksum.
+    /// Currently only used for inflate, behaves the same as Zlib for compression.
+    ZLibIgnoreChecksum,
+    /// Raw DEFLATE.
+    Raw,
+}
+
+impl DataFormat {
+    /// Choose a format from a window-bits value: positive values select
+    /// [`DataFormat::Zlib`], everything else selects [`DataFormat::Raw`].
+    pub fn from_window_bits(window_bits: i32) -> DataFormat {
+        if window_bits > 0 { DataFormat::Zlib } else { DataFormat::Raw }
+    }
+
+    /// Convert the format to a window-bits value: `MZ_DEFAULT_WINDOW_BITS` for both
+    /// zlib-wrapped formats and its negation for [`DataFormat::Raw`].
+    pub fn to_window_bits(self) -> i32 {
+        match self {
+            DataFormat::Zlib | DataFormat::ZLibIgnoreChecksum => shared::MZ_DEFAULT_WINDOW_BITS,
+            DataFormat::Raw => -shared::MZ_DEFAULT_WINDOW_BITS,
+        }
+    }
+}
+
+/// `Result` alias for all miniz status codes both successful and failed.
+pub type MZResult = Result<MZStatus, MZError>;
+
+/// A structure containing the result of a call to the inflate or deflate streaming functions.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct StreamResult {
+    /// The number of bytes consumed from the input slice.
+    pub bytes_consumed: usize,
+    /// The number of bytes written to the output slice.
+    pub bytes_written: usize,
+    /// The return status of the call.
+    pub status: MZResult,
+}
+
+impl StreamResult {
+    /// Create a result that carries `error` and reports zero bytes consumed and written.
+    #[inline]
+    pub const fn error(error: MZError) -> StreamResult {
+        StreamResult { bytes_consumed: 0, bytes_written: 0, status: Err(error) }
+    }
+}
+
+/// Keeps only the status of the call; the byte counters are dropped.
+impl core::convert::From<StreamResult> for MZResult {
+    fn from(res: StreamResult) -> Self {
+        res.status
+    }
+}
+
+/// Keeps only the status of the call; the byte counters are dropped.
+impl core::convert::From<&StreamResult> for MZResult {
+    fn from(res: &StreamResult) -> Self {
+        res.status
+    }
+}
diff --git a/miniz_oxide-0.7.2/src/shared.rs b/miniz_oxide-0.7.2/src/shared.rs
new file mode 100644
index 0000000000000..8b81fb112b8c0
--- /dev/null
+++ b/miniz_oxide-0.7.2/src/shared.rs
@@ -0,0 +1,25 @@
+#[doc(hidden)]
+pub const MZ_ADLER32_INIT: u32 = 1;
+
+#[doc(hidden)]
+pub const MZ_DEFAULT_WINDOW_BITS: i32 = 15;
+
+pub const HUFFMAN_LENGTH_ORDER: [u8; 19] = [
+    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
+];
+
+#[doc(hidden)]
+#[cfg(not(feature = "simd"))]
+pub fn update_adler32(adler: u32, data: &[u8]) -> u32 {
+    let mut hash = adler::Adler32::from_checksum(adler);
+    hash.write_slice(data);
+    hash.checksum()
+}
+
+#[doc(hidden)]
+#[cfg(feature = "simd")]
+pub fn update_adler32(adler: u32, data: &[u8]) -> u32 {
+    let mut hash = simd_adler32::Adler32::from_checksum(adler);
+    hash.write(data);
+    hash.finish()
+}
diff --git a/regex-1.8.4/.cargo-ok b/regex-1.8.4/.cargo-ok
new file mode 100644
index 0000000000000..5f8b795830acb
--- /dev/null
+++ b/regex-1.8.4/.cargo-ok
@@ -0,0 +1 @@
+{"v":1}
\ No newline at end of file
diff --git a/regex-1.8.4/.cargo_vcs_info.json b/regex-1.8.4/.cargo_vcs_info.json
new file mode 100644
index 0000000000000..529a406af0981
--- /dev/null
+++ b/regex-1.8.4/.cargo_vcs_info.json
@@ -0,0 +1,6 @@
+{
+  "git": {
+    "sha1": "5a34a39b72d85730065d3ffe4ce3715f2731e49a"
+  },
+  "path_in_vcs": ""
+}
\ No newline at end of file
diff --git a/regex-1.8.4/.gitignore b/regex-1.8.4/.gitignore
new file mode 100644
index 0000000000000..8f7a426bd623d
--- /dev/null
+++ b/regex-1.8.4/.gitignore
@@ -0,0 +1,8 @@
+target
+Cargo.lock +bench-log +.*.swp +wiki +tags +examples/debug.rs +tmp/ diff --git a/regex-1.8.4/CHANGELOG.md b/regex-1.8.4/CHANGELOG.md new file mode 100644 index 0000000000000..e019afb2f5825 --- /dev/null +++ b/regex-1.8.4/CHANGELOG.md @@ -0,0 +1,1305 @@ +1.8.4 (2023-06-05) +================== +This is a patch release that fixes a bug where `(?-u:\B)` was allowed in +Unicode regexes, despite the fact that the current matching engines can report +match offsets between the code units of a single UTF-8 encoded codepoint. That +in turn means that match offsets that split a codepoint could be reported, +which in turn results in panicking when one uses them to slice a `&str`. + +This bug occurred in the transition to `regex 1.8` because the underlying +syntactical error that prevented this regex from compiling was intentionally +removed. That's because `(?-u:\B)` will be permitted in Unicode regexes in +`regex 1.9`, but the matching engines will guarantee to never report match +offsets that split a codepoint. When the underlying syntactical error was +removed, no code was added to ensure that `(?-u:\B)` didn't compile in the +`regex 1.8` transition release. This release, `regex 1.8.4`, adds that code +such that `Regex::new(r"(?-u:\B)")` returns to the `regex <1.8` behavior of +not compiling. (A `bytes::Regex` can still of course compile it.) + +Bug fixes: + +* [BUG #1006](https://github.com/rust-lang/regex/issues/1006): +Fix a bug where `(?-u:\B)` was allowed in Unicode regexes, and in turn could +lead to match offsets that split a codepoint in `&str`. + + +1.8.3 (2023-05-25) +================== +This is a patch release that fixes a bug where the regex would report a +match at every position even when it shouldn't. This could occur in a very +small subset of regexes, usually an alternation of simple literals that +have particular properties. (See the issue linked below for a more precise +description.) 
+ +Bug fixes: + +* [BUG #999](https://github.com/rust-lang/regex/issues/999): +Fix a bug where a match at every position is erroneously reported. + + +1.8.2 (2023-05-22) +================== +This is a patch release that fixes a bug where regex compilation could panic +in debug mode for regexes with large counted repetitions. For example, +`a{2147483516}{2147483416}{5}` resulted in an integer overflow that wrapped +in release mode but panicked in debug mode. Despite the unintended wrapping +arithmetic in release mode, it didn't cause any other logical bugs since the +errant code was for new analysis that wasn't used yet. + +Bug fixes: + +* [BUG #995](https://github.com/rust-lang/regex/issues/995): +Fix a bug where regex compilation with large counted repetitions could panic. + + +1.8.1 (2023-04-21) +================== +This is a patch release that fixes a bug where a regex match could be reported +where none was found. Specifically, the bug occurs when a pattern contains some +literal prefixes that could be extracted _and_ an optional word boundary in the +prefix. + +Bug fixes: + +* [BUG #981](https://github.com/rust-lang/regex/issues/981): +Fix a bug where a word boundary could interact with prefix literal +optimizations and lead to a false positive match. + + +1.8.0 (2023-04-20) +================== +This is a sizeable release that will be soon followed by another sizeable +release. Both of them combined will close over 40 existing issues and PRs. + +This first release, despite its size, essentially represents preparatory work +for the second release, which will be even bigger. Namely, this release: + +* Increases the MSRV to Rust 1.60.0, which was released about 1 year ago. +* Upgrades its dependency on `aho-corasick` to the recently released 1.0 +version. +* Upgrades its dependency on `regex-syntax` to the simultaneously released +`0.7` version.
The changes to `regex-syntax` principally revolve around a +rewrite of its literal extraction code and a number of simplifications and +optimizations to its high-level intermediate representation (HIR). + +The second release, which will follow ~shortly after the release above, will +contain a soup-to-nuts rewrite of every regex engine. This will be done by +bringing [`regex-automata`](https://github.com/BurntSushi/regex-automata) into +this repository, and then changing the `regex` crate to be nothing but an API +shim layer on top of `regex-automata`'s API. + +These tandem releases are the culmination of about 3 +years of on-and-off work that [began in earnest in March +2020](https://github.com/rust-lang/regex/issues/656). + +Because of the scale of changes involved in these releases, I would love to +hear about your experience. Especially if you notice undocumented changes in +behavior or performance changes (positive *or* negative). + +Most changes in the first release are listed below. For more details, please +see the commit log, which reflects a linear and decently documented history +of all changes. + +New features: + +* [FEATURE #501](https://github.com/rust-lang/regex/issues/501): +Permit many more characters to be escaped, even if they have no significance. +More specifically, any ASCII character except for `[0-9A-Za-z<>]` can now be +escaped. Also, a new routine, `is_escapeable_character`, has been added to +`regex-syntax` to query whether a character is escapeable or not. +* [FEATURE #547](https://github.com/rust-lang/regex/issues/547): +Add `Regex::captures_at`. This fills a hole in the API, but doesn't otherwise +introduce any new expressive power. +* [FEATURE #595](https://github.com/rust-lang/regex/issues/595): +Capture group names are now Unicode-aware. They can now begin with either a `_` +or any "alphabetic" codepoint.
After the first codepoint, subsequent codepoints +can be any sequence of alpha-numeric codepoints, along with `_`, `.`, `[` and +`]`. Note that replacement syntax has not changed. +* [FEATURE #810](https://github.com/rust-lang/regex/issues/810): +Add `Match::is_empty` and `Match::len` APIs. +* [FEATURE #905](https://github.com/rust-lang/regex/issues/905): +Add an `impl Default for RegexSet`, with the default being the empty set. +* [FEATURE #908](https://github.com/rust-lang/regex/issues/908): +A new method, `Regex::static_captures_len`, has been added which returns the +number of capture groups in the pattern if and only if every possible match +always contains the same number of matching groups. +* [FEATURE #955](https://github.com/rust-lang/regex/issues/955): +Named captures can now be written as `(?<name>re)` in addition to +`(?P<name>re)`. +* FEATURE: `regex-syntax` now supports empty character classes. +* FEATURE: `regex-syntax` now has an optional `std` feature. (This will come +to `regex` in the second release.) +* FEATURE: The `Hir` type in `regex-syntax` has had a number of simplifications +made to it. +* FEATURE: `regex-syntax` has support for a new `R` flag for enabling CRLF +mode. This will be supported in `regex` proper in the second release. +* FEATURE: `regex-syntax` now has proper support for "regex that never +matches" via `Hir::fail()`. +* FEATURE: The `hir::literal` module of `regex-syntax` has been completely +re-worked. It now has more documentation, examples and advice. +* FEATURE: The `allow_invalid_utf8` option in `regex-syntax` has been renamed +to `utf8`, and the meaning of the boolean has been flipped. + +Performance improvements: + +* PERF: The upgrade to `aho-corasick 1.0` may improve performance in some +cases. It's difficult to characterize exactly which patterns this might impact, +but if there are a small number of longish (>= 4 bytes) prefix literals, then +it might be faster than before.
+ +Bug fixes: + +* [BUG #514](https://github.com/rust-lang/regex/issues/514): +Improve `Debug` impl for `Match` so that it doesn't show the entire haystack. +* BUGS [#516](https://github.com/rust-lang/regex/issues/516), +[#731](https://github.com/rust-lang/regex/issues/731): +Fix a number of issues with printing `Hir` values as regex patterns. +* [BUG #610](https://github.com/rust-lang/regex/issues/610): +Add explicit example of `foo|bar` in the regex syntax docs. +* [BUG #625](https://github.com/rust-lang/regex/issues/625): +Clarify that `SetMatches::len` does not (regrettably) refer to the number of +matches in the set. +* [BUG #660](https://github.com/rust-lang/regex/issues/660): +Clarify "verbose mode" in regex syntax documentation. +* BUG [#738](https://github.com/rust-lang/regex/issues/738), +[#950](https://github.com/rust-lang/regex/issues/950): +Fix `CaptureLocations::get` so that it never panics. +* [BUG #747](https://github.com/rust-lang/regex/issues/747): +Clarify documentation for `Regex::shortest_match`. +* [BUG #835](https://github.com/rust-lang/regex/issues/835): +Fix `\p{Sc}` so that it is equivalent to `\p{Currency_Symbol}`. +* [BUG #846](https://github.com/rust-lang/regex/issues/846): +Add more clarifying documentation to the `CompiledTooBig` error variant. +* [BUG #854](https://github.com/rust-lang/regex/issues/854): +Clarify that `regex::Regex` searches as if the haystack is a sequence of +Unicode scalar values. +* [BUG #884](https://github.com/rust-lang/regex/issues/884): +Replace `__Nonexhaustive` variants with `#[non_exhaustive]` attribute. +* [BUG #893](https://github.com/rust-lang/regex/pull/893): +Optimize case folding since it can get quite slow in some pathological cases. +* [BUG #895](https://github.com/rust-lang/regex/issues/895): +Reject `(?-u:\W)` in `regex::Regex` APIs. +* [BUG #942](https://github.com/rust-lang/regex/issues/942): +Add a missing `void` keyword to indicate "no parameters" in C API.
+* [BUG #965](https://github.com/rust-lang/regex/issues/965): +Fix `\p{Lc}` so that it is equivalent to `\p{Cased_Letter}`. +* [BUG #975](https://github.com/rust-lang/regex/issues/975): +Clarify documentation for `\pX` syntax. + + +1.7.3 (2023-03-24) +================== +This is a small release that fixes a bug in `Regex::shortest_match_at` that +could cause it to panic, even when the offset given is valid. + +Bug fixes: + +* [BUG #969](https://github.com/rust-lang/regex/issues/969): + Fix a bug in how the reverse DFA was called for `Regex::shortest_match_at`. + + +1.7.2 (2023-03-21) +================== +This is a small release that fixes a failing test on FreeBSD. + +Bug fixes: + +* [BUG #967](https://github.com/rust-lang/regex/issues/967): + Fix "no stack overflow" test which can fail due to the small stack size. + + +1.7.1 (2023-01-09) +================== +This release was done principally to try and fix the doc.rs rendering for the +regex crate. + +Performance improvements: + +* [PERF #930](https://github.com/rust-lang/regex/pull/930): + Optimize `replacen`. This also applies to `replace`, but not `replace_all`. + +Bug fixes: + +* [BUG #945](https://github.com/rust-lang/regex/issues/945): + Maybe fix rustdoc rendering by just bumping a new release? + + +1.7.0 (2022-11-05) +================== +This release principally includes an upgrade to Unicode 15. + +New features: + +* [FEATURE #832](https://github.com/rust-lang/regex/issues/916): + Upgrade to Unicode 15. + + +1.6.0 (2022-07-05) +================== +This release principally includes an upgrade to Unicode 14. + +New features: + +* [FEATURE #832](https://github.com/rust-lang/regex/pull/832): + Clarify that `Captures::len` includes all groups, not just matching groups. +* [FEATURE #857](https://github.com/rust-lang/regex/pull/857): + Add an `ExactSizeIterator` impl for `SubCaptureMatches`. +* [FEATURE #861](https://github.com/rust-lang/regex/pull/861): + Improve `RegexSet` documentation examples. 
+* [FEATURE #877](https://github.com/rust-lang/regex/issues/877): + Upgrade to Unicode 14. + +Bug fixes: + +* [BUG #792](https://github.com/rust-lang/regex/issues/792): + Fix error message rendering bug. + + +1.5.6 (2022-05-20) +================== +This release includes a few bug fixes, including a bug that produced incorrect +matches when a non-greedy `?` operator was used. + +* [BUG #680](https://github.com/rust-lang/regex/issues/680): + Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. +* [BUG #859](https://github.com/rust-lang/regex/issues/859): + Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. +* [BUG #862](https://github.com/rust-lang/regex/issues/862): + Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'. + + +1.5.5 (2022-03-08) +================== +This release fixes a security bug in the regex compiler. This bug permits a +vector for a denial-of-service attack in cases where the regex being compiled +is untrusted. There are no known problems where the regex is itself trusted, +including in cases of untrusted haystacks. + +* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8): + Fixes a bug in the regex compiler where empty sub-expressions subverted the + existing mitigations in place to enforce a size limit on compiled regexes. + The Rust Security Response WG published an advisory about this: + https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw + + +1.5.4 (2021-05-06) +================== +This release fixes another compilation failure when building regex. This time, +the fix is for when the `pattern` feature is enabled, which only works on +nightly Rust. CI has been updated to test this case. + +* [BUG #772](https://github.com/rust-lang/regex/pull/772): + Fix build when `pattern` feature is enabled.
+ + +1.5.3 (2021-05-01) +================== +This release fixes a bug when building regex with only the `unicode-perl` +feature. It turns out that while CI was building this configuration, it wasn't +actually failing the overall build on a failed compilation. + +* [BUG #769](https://github.com/rust-lang/regex/issues/769): + Fix build in `regex-syntax` when only the `unicode-perl` feature is enabled. + + +1.5.2 (2021-05-01) +================== +This release fixes a performance bug when Unicode word boundaries are used. +Namely, for certain regexes on certain inputs, it's possible for the lazy DFA +to stop searching (causing a fallback to a slower engine) when it doesn't +actually need to. + +[PR #768](https://github.com/rust-lang/regex/pull/768) fixes the bug, which was +originally reported in +[ripgrep#1860](https://github.com/BurntSushi/ripgrep/issues/1860). + + +1.5.1 (2021-04-30) +================== +This is a patch release that fixes a compilation error when the `perf-literal` +feature is not enabled. + + +1.5.0 (2021-04-30) +================== +This release primarily updates to Rust 2018 (finally) and bumps the MSRV to +Rust 1.41 (from Rust 1.28). Rust 1.41 was chosen because it's still reasonably +old, and is what's in Debian stable at the time of writing. + +This release also drops this crate's own bespoke substring search algorithms +in favor of a new +[`memmem` implementation provided by the `memchr` crate](https://docs.rs/memchr/2.4.0/memchr/memmem/index.html). +This will change the performance profile of some regexes, sometimes getting a +little worse, and hopefully more frequently, getting a lot better. Please +report any serious performance regressions if you find them. + + +1.4.6 (2021-04-22) +================== +This is a small patch release that fixes the compiler's size check on how much +heap memory a regex uses. Previously, the compiler did not account for the +heap usage of Unicode character classes. Now it does.
It's possible that this +may make some regexes fail to compile that previously did compile. If that +happens, please file an issue. + +* [BUG OSS-fuzz#33579](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579): + Some regexes can use more heap memory than one would expect. + + +1.4.5 (2021-03-14) +================== +This is a small patch release that fixes a regression in the size of a `Regex` +in the 1.4.4 release. Prior to 1.4.4, a `Regex` was 552 bytes. In the 1.4.4 +release, it was 856 bytes due to internal changes. In this release, a `Regex` +is now 16 bytes. In general, the size of a `Regex` was never something that was +on my radar, but this increased size in the 1.4.4 release seems to have crossed +a threshold and resulted in stack overflows in some programs. + +* [BUG #750](https://github.com/rust-lang/regex/pull/750): + Fixes stack overflows seemingly caused by a large `Regex` size by decreasing + its size. + + +1.4.4 (2021-03-11) +================== +This is a small patch release that contains some bug fixes. Notably, it also +drops the `thread_local` (and `lazy_static`, via transitivity) dependencies. + +Bug fixes: + +* [BUG #362](https://github.com/rust-lang/regex/pull/362): + Memory leaks caused by an internal caching strategy should now be fixed. +* [BUG #576](https://github.com/rust-lang/regex/pull/576): + All regex types now implement `UnwindSafe` and `RefUnwindSafe`. +* [BUG #728](https://github.com/rust-lang/regex/pull/749): + Add missing `Replacer` impls for `Vec`, `String`, `Cow`, etc. + + +1.4.3 (2021-01-08) +================== +This is a small patch release that adds some missing standard trait +implementations for some types in the public API. + +Bug fixes: + +* [BUG #734](https://github.com/rust-lang/regex/pull/734): + Add `FusedIterator` and `ExactSizeIterator` impls to iterator types. +* [BUG #735](https://github.com/rust-lang/regex/pull/735): + Add missing `Debug` impls to public API types. 
+ + +1.4.2 (2020-11-01) +================== +This is a small bug fix release that bans `\P{any}`. We previously banned empty +classes like `[^\w\W]`, but missed the `\P{any}` case. In the future, we hope +to permit empty classes. + +* [BUG #722](https://github.com/rust-lang/regex/issues/722): + Ban `\P{any}` to avoid a panic in the regex compiler. Found by OSS-Fuzz. + + +1.4.1 (2020-10-13) +================== +This is a small bug fix release that makes `\p{cf}` work. Previously, it would +report "property not found" even though `cf` is a valid abbreviation for the +`Format` general category. + +* [BUG #719](https://github.com/rust-lang/regex/issues/719): + Fixes bug that prevented `\p{cf}` from working. + + +1.4.0 (2020-10-11) +================== +This release has a few minor documentation fixes as well as some very minor +API additions. The MSRV remains at Rust 1.28 for now, but this is intended to +increase to at least Rust 1.41.1 soon. + +This release also adds support for OSS-Fuzz. Kudos to +[@DavidKorczynski](https://github.com/DavidKorczynski) +for doing the heavy lifting for that! + +New features: + +* [FEATURE #649](https://github.com/rust-lang/regex/issues/649): + Support `[`, `]` and `.` in capture group names. +* [FEATURE #687](https://github.com/rust-lang/regex/issues/687): + Add `is_empty` predicate to `RegexSet`. +* [FEATURE #689](https://github.com/rust-lang/regex/issues/689): + Implement `Clone` for `SubCaptureMatches`. +* [FEATURE #715](https://github.com/rust-lang/regex/issues/715): + Add `empty` constructor to `RegexSet` for convenience. + +Bug fixes: + +* [BUG #694](https://github.com/rust-lang/regex/issues/694): + Fix doc example for `Replacer::replace_append`. +* [BUG #698](https://github.com/rust-lang/regex/issues/698): + Clarify docs for `s` flag when using a `bytes::Regex`. +* [BUG #711](https://github.com/rust-lang/regex/issues/711): + Clarify `is_match` docs to indicate that it can match anywhere in string.
+ + +1.3.9 (2020-05-28) +================== +This release fixes a MSRV (Minimum Supported Rust Version) regression in the +1.3.8 release. Namely, while 1.3.8 compiles on Rust 1.28, it actually does not +compile on other Rust versions, such as Rust 1.39. + +Bug fixes: + +* [BUG #685](https://github.com/rust-lang/regex/issues/685): + Remove use of `doc_comment` crate, which cannot be used before Rust 1.43. + + +1.3.8 (2020-05-28) +================== +This release contains a couple of important bug fixes driven +by better support for empty-subexpressions in regexes. For +example, regexes like `b|` are now allowed. Major thanks to +[@sliquister](https://github.com/sliquister) for implementing support for this +in [#677](https://github.com/rust-lang/regex/pull/677). + +Bug fixes: + +* [BUG #523](https://github.com/rust-lang/regex/pull/523): + Add note to documentation that spaces can be escaped in `x` mode. +* [BUG #524](https://github.com/rust-lang/regex/issues/524): + Add support for empty sub-expressions, including empty alternations. +* [BUG #659](https://github.com/rust-lang/regex/issues/659): + Fix match bug caused by an empty sub-expression miscompilation. + + +1.3.7 (2020-04-17) +================== +This release contains a small bug fix that fixes how `regex` forwards crate +features to `regex-syntax`. In particular, this will reduce recompilations in +some cases. + +Bug fixes: + +* [BUG #665](https://github.com/rust-lang/regex/pull/665): + Fix feature forwarding to `regex-syntax`. + + +1.3.6 (2020-03-24) +================== +This release contains a sizable (~30%) performance improvement when compiling +some kinds of large regular expressions. + +Performance improvements: + +* [PERF #657](https://github.com/rust-lang/regex/pull/657): + Improve performance of compiling large regular expressions. + + +1.3.5 (2020-03-12) +================== +This release updates this crate to Unicode 13.
+ +New features: + +* [FEATURE #653](https://github.com/rust-lang/regex/pull/653): + Update `regex-syntax` to Unicode 13. + + +1.3.4 (2020-01-30) +================== +This is a small bug fix release that fixes a bug related to the scoping of +flags in a regex. Namely, before this fix, a regex like `((?i)a)b)` would +match `aB` despite the fact that `b` should not be matched case insensitively. + +Bug fixes: + +* [BUG #640](https://github.com/rust-lang/regex/issues/640): + Fix bug related to the scoping of flags in a regex. + + +1.3.3 (2020-01-09) +================== +This is a small maintenance release that upgrades the dependency on +`thread_local` from `0.3` to `1.0`. The minimum supported Rust version remains +at Rust 1.28. + + +1.3.2 (2020-01-09) +================== +This is a small maintenance release with some house cleaning and bug fixes. + +New features: + +* [FEATURE #631](https://github.com/rust-lang/regex/issues/631): + Add a `Match::range` method and a `From<Match> for Range` impl. + +Bug fixes: + +* [BUG #521](https://github.com/rust-lang/regex/issues/521): + Corrects `/-/.splitn("a", 2)` to return `["a"]` instead of `["a", ""]`. +* [BUG #594](https://github.com/rust-lang/regex/pull/594): + Improve error reporting when writing `\p\`. +* [BUG #627](https://github.com/rust-lang/regex/issues/627): + Corrects `/-/.split("a-")` to return `["a", ""]` instead of `["a"]`. +* [BUG #633](https://github.com/rust-lang/regex/pull/633): + Squash deprecation warnings for the `std::error::Error::description` method. + + +1.3.1 (2019-09-04) +================== +This is a maintenance release with no changes in order to try to work-around +a [docs.rs/Cargo issue](https://github.com/rust-lang/docs.rs/issues/400). + + +1.3.0 (2019-09-03) +================== +This release adds a plethora of new crate features that permit users of regex +to shrink its size considerably, in exchange for giving up either functionality +(such as Unicode support) or runtime performance.
When all such features are +disabled, the dependency tree for `regex` shrinks to exactly 1 crate +(`regex-syntax`). More information about the new crate features can be +[found in the docs](https://docs.rs/regex/*/#crate-features). + +Note that while this is a new minor version release, the minimum supported +Rust version for this crate remains at `1.28.0`. + +New features: + +* [FEATURE #474](https://github.com/rust-lang/regex/issues/474): + The `use_std` feature has been deprecated in favor of the `std` feature. + The `use_std` feature will be removed in regex 2. Until then, `use_std` will + remain as an alias for the `std` feature. +* [FEATURE #583](https://github.com/rust-lang/regex/issues/583): + Add a substantial number of crate features shrinking `regex`. + + +1.2.1 (2019-08-03) +================== +This release does a bit of house cleaning. Namely: + +* This repository is now using rustfmt. +* License headers have been removed from all files, in following suit with the + Rust project. +* Teddy has been removed from the `regex` crate, and is now part of the + `aho-corasick` crate. + [See `aho-corasick`'s new `packed` sub-module for details](https://docs.rs/aho-corasick/0.7.6/aho_corasick/packed/index.html). +* The `utf8-ranges` crate has been deprecated, with its functionality moving + into the + [`utf8` sub-module of `regex-syntax`](https://docs.rs/regex-syntax/0.6.11/regex_syntax/utf8/index.html). +* The `ucd-util` dependency has been dropped, in favor of implementing what + little we need inside of `regex-syntax` itself. + +In general, this is part of an ongoing (long term) effort to make optimizations +in the regex engine easier to reason about. The current code is too convoluted +and thus it is very easy to introduce new bugs. This simplification effort is +the primary motivation behind re-working the `aho-corasick` crate to not only +bundle algorithms like Teddy, but to also provide regex-like match semantics +automatically. 
+ +Moving forward, the plan is to join up with the `bstr` and `regex-automata` +crates, with the former providing more sophisticated substring search +algorithms (thereby deleting existing code in `regex`) and the latter providing +ahead-of-time compiled DFAs for cases where they are inexpensive to compute. + + +1.2.0 (2019-07-20) +================== +This release updates regex's minimum supported Rust version to 1.28, which was +released almost 1 year ago. This release also updates regex's Unicode data +tables to 12.1.0. + + +1.1.9 (2019-07-06) +================== +This release contains a bug fix that caused regex's tests to fail, due to a +dependency on an unreleased behavior in regex-syntax. + +* [BUG #593](https://github.com/rust-lang/regex/issues/593): + Move an integration-style test on error messages into regex-syntax. + + +1.1.8 (2019-07-04) +================== +This release contains a few small internal refactorings. One of which fixes +an instance of undefined behavior in a part of the SIMD code. + +Bug fixes: + +* [BUG #545](https://github.com/rust-lang/regex/issues/545): + Improves error messages when a repetition operator is used without a number. +* [BUG #588](https://github.com/rust-lang/regex/issues/588): + Removes use of a repr(Rust) union used for type punning in the Teddy matcher. +* [BUG #591](https://github.com/rust-lang/regex/issues/591): + Update docs for running benchmarks and improve failure modes. + + +1.1.7 (2019-06-09) +================== +This release fixes up a few warnings as a result of recent deprecations. + + +1.1.6 (2019-04-16) +================== +This release fixes a regression introduced by a bug fix (for +[BUG #557](https://github.com/rust-lang/regex/issues/557)) which could cause +the regex engine to enter an infinite loop. This bug was originally +[reported against ripgrep](https://github.com/BurntSushi/ripgrep/issues/1247).
+ + +1.1.5 (2019-04-01) +================== +This release fixes a bug in regex's dependency specification where it requires +a newer version of regex-syntax, but this wasn't communicated correctly in the +Cargo.toml. This would have been caught by a minimal version check, but this +check was disabled because the `rand` crate itself advertises incorrect +dependency specifications. + +Bug fixes: + +* [BUG #570](https://github.com/rust-lang/regex/pull/570): + Fix regex-syntax minimal version. + + +1.1.4 (2019-03-31) +================== +This release fixes a backwards compatibility regression where Regex was no +longer UnwindSafe. This was caused by the upgrade to aho-corasick 0.7, whose +AhoCorasick type was itself not UnwindSafe. This has been fixed in aho-corasick +0.7.4, which we now require. + +Bug fixes: + +* [BUG #568](https://github.com/rust-lang/regex/pull/568): + Fix an API regression where Regex was no longer UnwindSafe. + + +1.1.3 (2019-03-30) +================== +This release fixes a few bugs and adds a performance improvement when a regex +is a simple alternation of literals. + +Performance improvements: + +* [OPT #566](https://github.com/rust-lang/regex/pull/566): + Upgrades `aho-corasick` to 0.7 and uses it for `foo|bar|...|quux` regexes. + +Bug fixes: + +* [BUG #527](https://github.com/rust-lang/regex/issues/527): + Fix a bug where the parser would panic on patterns like `((?x))`. +* [BUG #555](https://github.com/rust-lang/regex/issues/555): + Fix a bug where the parser would panic on patterns like `(?m){1,1}`. +* [BUG #557](https://github.com/rust-lang/regex/issues/557): + Fix a bug where captures could lead to an incorrect match. + + +1.1.2 (2019-02-27) +================== +This release fixes a bug found in the fix introduced in 1.1.1. + +Bug fixes: + +* [BUG edf45e6f](https://github.com/rust-lang/regex/commit/edf45e6f): + Fix bug introduced in reverse suffix literal matcher in the 1.1.1 release.
+ + +1.1.1 (2019-02-27) +================== +This is a small release with one fix for a bug caused by literal optimizations. + +Bug fixes: + +* [BUG 661bf53d](https://github.com/rust-lang/regex/commit/661bf53d): + Fixes a bug in the reverse suffix literal optimization. This was originally + reported + [against ripgrep](https://github.com/BurntSushi/ripgrep/issues/1203). + + +1.1.0 (2018-11-30) +================== +This is a small release with a couple small enhancements. This release also +increases the minimal supported Rust version (MSRV) to 1.24.1 (from 1.20.0). In +accordance with this crate's MSRV policy, this release bumps the minor version +number. + +Performance improvements: + +* [OPT #511](https://github.com/rust-lang/regex/pull/511), + [OPT #540](https://github.com/rust-lang/regex/pull/540): + Improve lazy DFA construction for large regex sets. + +New features: + +* [FEATURE #538](https://github.com/rust-lang/regex/pull/538): + Add Emoji and "break" Unicode properties. See [UNICODE.md](UNICODE.md). + +Bug fixes: + +* [BUG #530](https://github.com/rust-lang/regex/pull/530): + Add Unicode license (for data tables). +* Various typo/doc fixups. + + +1.0.6 (2018-11-06) +================== +This is a small release. + +Performance improvements: + +* [OPT #513](https://github.com/rust-lang/regex/pull/513): + Improve performance of compiling large Unicode classes by 8-10%. + +Bug fixes: + +* [BUG #533](https://github.com/rust-lang/regex/issues/533): + Fix definition of `[[:blank:]]` class that regressed in `regex-syntax 0.5`. + + +1.0.5 (2018-09-06) +================== +This is a small release with an API enhancement. + +New features: + +* [FEATURE #509](https://github.com/rust-lang/regex/pull/509): + Generalize impls of the `Replacer` trait. + + +1.0.4 (2018-08-25) +================== +This is a small release that bumps the quickcheck dependency. + + +1.0.3 (2018-08-24) +================== +This is a small bug fix release. 
+ +Bug fixes: + +* [BUG #504](https://github.com/rust-lang/regex/pull/504): + Fix for Cargo's "minimal version" support. +* [BUG 1e39165f](https://github.com/rust-lang/regex/commit/1e39165f): + Fix doc examples for byte regexes. + + +1.0.2 (2018-07-18) +================== +This release exposes some new lower level APIs on `Regex` that permit +amortizing allocation and controlling the location at which a search is +performed in a more granular way. Most users of the regex crate will not +need or want to use these APIs. + +New features: + +* [FEATURE #493](https://github.com/rust-lang/regex/pull/493): + Add a few lower level APIs for amortizing allocation and more fine grained + searching. + +Bug fixes: + +* [BUG 3981d2ad](https://github.com/rust-lang/regex/commit/3981d2ad): + Correct outdated documentation on `RegexBuilder::dot_matches_new_line`. +* [BUG 7ebe4ae0](https://github.com/rust-lang/regex/commit/7ebe4ae0): + Correct outdated documentation on `Parser::allow_invalid_utf8` in the + `regex-syntax` crate. +* [BUG 24c7770b](https://github.com/rust-lang/regex/commit/24c7770b): + Fix a bug in the HIR printer where it wouldn't correctly escape meta + characters in character classes. + + +1.0.1 (2018-06-19) +================== +This release upgrades regex's Unicode tables to Unicode 11, and enables SIMD +optimizations automatically on Rust stable (1.27 or newer). + +New features: + +* [FEATURE #486](https://github.com/rust-lang/regex/pull/486): + Implement `size_hint` on `RegexSet` match iterators. +* [FEATURE #488](https://github.com/rust-lang/regex/pull/488): + Update Unicode tables for Unicode 11. +* [FEATURE #490](https://github.com/rust-lang/regex/pull/490): + SIMD optimizations are now enabled automatically in Rust stable, for versions + 1.27 and up. No compilation flags or features need to be set. CPU support + SIMD is detected automatically at runtime. 
+ +Bug fixes: + +* [BUG #482](https://github.com/rust-lang/regex/pull/482): + Present a better compilation error when the `use_std` feature isn't used. + + +1.0.0 (2018-05-01) +================== +This release marks the 1.0 release of regex. + +While this release includes some breaking changes, most users of older versions +of the regex library should be able to migrate to 1.0 by simply bumping the +version number. The important changes are as follows: + +* We adopt Rust 1.20 as the new minimum supported version of Rust for regex. + We also tentatively adopt a policy that permits bumping the minimum supported + version of Rust in minor version releases of regex, but no patch releases. + That is, with respect to semver, we do not strictly consider bumping the + minimum version of Rust to be a breaking change, but adopt a conservative + stance as a compromise. +* Octal syntax in regular expressions has been disabled by default. This + permits better error messages that inform users that backreferences aren't + available. Octal syntax can be re-enabled via the corresponding option on + `RegexBuilder`. +* `(?-u:\B)` is no longer allowed in Unicode regexes since it can match at + invalid UTF-8 code unit boundaries. `(?-u:\b)` is still allowed in Unicode + regexes. +* The `From<regex_syntax::Error>` impl has been removed. This formally removes + the public dependency on `regex-syntax`. +* A new feature, `use_std`, has been added and enabled by default. Disabling + the feature will result in a compilation error. In the future, this may + permit us to support `no_std` environments (w/ `alloc`) in a backwards + compatible way. + +For more information and discussion, please see +[1.0 release tracking issue](https://github.com/rust-lang/regex/issues/457). + + +0.2.11 (2018-05-01) +=================== +This release primarily contains bug fixes. Some of them resolve bugs where +the parser could panic.
+ +New features: + +* [FEATURE #459](https://github.com/rust-lang/regex/pull/459): + Include C++'s standard regex library and Boost's regex library in the + benchmark harness. We now include D/libphobos, C++/std, C++/boost, Oniguruma, + PCRE1, PCRE2, RE2 and Tcl in the harness. + +Bug fixes: + +* [BUG #445](https://github.com/rust-lang/regex/issues/445): + Clarify order of indices returned by RegexSet match iterator. +* [BUG #461](https://github.com/rust-lang/regex/issues/461): + Improve error messages for invalid regexes like `[\d-a]`. +* [BUG #464](https://github.com/rust-lang/regex/issues/464): + Fix a bug in the error message pretty printer that could cause a panic when + a regex contained a literal `\n` character. +* [BUG #465](https://github.com/rust-lang/regex/issues/465): + Fix a panic in the parser that was caused by applying a repetition operator + to `(?flags)`. +* [BUG #466](https://github.com/rust-lang/regex/issues/466): + Fix a bug where `\pC` was not recognized as an alias for `\p{Other}`. +* [BUG #470](https://github.com/rust-lang/regex/pull/470): + Fix a bug where literal searches did more work than necessary for anchored + regexes. + + +0.2.10 (2018-03-16) +=================== +This release primarily updates the regex crate to changes made in `std::arch` +on nightly Rust. + +New features: + +* [FEATURE #458](https://github.com/rust-lang/regex/pull/458): + The `Hir` type in `regex-syntax` now has a printer. + + +0.2.9 (2018-03-12) +================== +This release introduces a new nightly only feature, `unstable`, which enables +SIMD optimizations for certain types of regexes. No additional compile time +options are necessary, and the regex crate will automatically choose the +best CPU features at run time. As a result, the `simd` (nightly only) crate +dependency has been dropped. 
+ +New features: + +* [FEATURE #456](https://github.com/rust-lang/regex/pull/456): + The regex crate now includes AVX2 optimizations in addition to the extant + SSSE3 optimization. + +Bug fixes: + +* [BUG #455](https://github.com/rust-lang/regex/pull/455): + Fix a bug where `(?x)[ / - ]` failed to parse. + + +0.2.8 (2018-03-12) +================== +Bug fixes: + +* [BUG #454](https://github.com/rust-lang/regex/pull/454): + Fix a bug in the nest limit checker being too aggressive. + + +0.2.7 (2018-03-07) +================== +This release includes a ground-up rewrite of the regex-syntax crate, which has +been in development for over a year. + +New features: + +* Error messages for invalid regexes have been greatly improved. You get these + automatically; you don't need to do anything. In addition to better + formatting, error messages will now explicitly call out the use of look + around. When regex 1.0 is released, this will happen for backreferences as + well. +* Full support for intersection, difference and symmetric difference of + character classes. These can be used via the `&&`, `--` and `~~` binary + operators within classes. +* A Unicode Level 1 conformant implementation of `\p{..}` character classes. + Things like `\p{scx:Hira}`, `\p{age:3.2}` or `\p{Changes_When_Casefolded}` + now work. All property name and value aliases are supported, and properties + are selected via loose matching. e.g., `\p{Greek}` is the same as + `\p{G r E e K}`. +* A new `UNICODE.md` document has been added to this repository that + exhaustively documents support for UTS#18. +* Empty sub-expressions are now permitted in most places. That is, `()+` is + now a valid regex. +* Almost everything in regex-syntax now uses constant stack space, even when + performing analysis that requires structural induction. This reduces the risk + of a user provided regular expression causing a stack overflow.
+* [FEATURE #174](https://github.com/rust-lang/regex/issues/174): + The `Ast` type in `regex-syntax` now contains span information. +* [FEATURE #424](https://github.com/rust-lang/regex/issues/424): + Support `\u`, `\u{...}`, `\U` and `\U{...}` syntax for specifying code points + in a regular expression. +* [FEATURE #449](https://github.com/rust-lang/regex/pull/449): + Add a `Replace::by_ref` adapter for use of a replacer without consuming it. + +Bug fixes: + +* [BUG #446](https://github.com/rust-lang/regex/issues/446): + We re-enable the Boyer-Moore literal matcher. + + +0.2.6 (2018-02-08) +================== +Bug fixes: + +* [BUG #446](https://github.com/rust-lang/regex/issues/446): + Fixes a bug in the new Boyer-Moore searcher that results in a match failure. + We fix this bug by temporarily disabling Boyer-Moore. + + +0.2.5 (2017-12-30) +================== +Bug fixes: + +* [BUG #437](https://github.com/rust-lang/regex/issues/437): + Fixes a bug in the new Boyer-Moore searcher that results in a panic. + + +0.2.4 (2017-12-30) +================== +New features: + +* [FEATURE #348](https://github.com/rust-lang/regex/pull/348): + Improve performance for capture searches on anchored regex. + (Contributed by @ethanpailes. Nice work!) +* [FEATURE #419](https://github.com/rust-lang/regex/pull/419): + Expand literal searching to include Tuned Boyer-Moore in some cases. + (Contributed by @ethanpailes. Nice work!) + +Bug fixes: + +* [BUG](https://github.com/rust-lang/regex/pull/436): + The regex compiler plugin has been removed. +* [BUG](https://github.com/rust-lang/regex/pull/436): + `simd` has been bumped to `0.2.1`, which fixes a Rust nightly build error. +* [BUG](https://github.com/rust-lang/regex/pull/436): + Bring the benchmark harness up to date. + + +0.2.3 (2017-11-30) +================== +New features: + +* [FEATURE #374](https://github.com/rust-lang/regex/pull/374): + Add `impl From for &str`. 
+* [FEATURE #380](https://github.com/rust-lang/regex/pull/380): + Derive `Clone` and `PartialEq` on `Error`. +* [FEATURE #400](https://github.com/rust-lang/regex/pull/400): + Update to Unicode 10. + +Bug fixes: + +* [BUG #375](https://github.com/rust-lang/regex/issues/375): + Fix a bug that prevented the bounded backtracker from terminating. +* [BUG #393](https://github.com/rust-lang/regex/issues/393), + [BUG #394](https://github.com/rust-lang/regex/issues/394): + Fix bug with `replace` methods for empty matches. + + +0.2.2 (2017-05-21) +================== +New features: + +* [FEATURE #341](https://github.com/rust-lang/regex/issues/341): + Support nested character classes and intersection operation. + For example, `[\p{Greek}&&\pL]` matches greek letters and + `[[0-9]&&[^4]]` matches every decimal digit except `4`. + (Much thanks to @robinst, who contributed this awesome feature.) + +Bug fixes: + +* [BUG #321](https://github.com/rust-lang/regex/issues/321): + Fix bug in literal extraction and UTF-8 decoding. +* [BUG #326](https://github.com/rust-lang/regex/issues/326): + Add documentation tip about the `(?x)` flag. +* [BUG #333](https://github.com/rust-lang/regex/issues/333): + Show additional replacement example using curly braces. +* [BUG #334](https://github.com/rust-lang/regex/issues/334): + Fix bug when resolving captures after a match. +* [BUG #338](https://github.com/rust-lang/regex/issues/338): + Add example that uses `Captures::get` to API documentation. +* [BUG #353](https://github.com/rust-lang/regex/issues/353): + Fix RegexSet bug that caused match failure in some cases. +* [BUG #354](https://github.com/rust-lang/regex/pull/354): + Fix panic in parser when `(?x)` is used. +* [BUG #358](https://github.com/rust-lang/regex/issues/358): + Fix literal optimization bug with RegexSet. +* [BUG #359](https://github.com/rust-lang/regex/issues/359): + Fix example code in README. 
+* [BUG #365](https://github.com/rust-lang/regex/pull/365): + Fix bug in `rure_captures_len` in the C binding. +* [BUG #367](https://github.com/rust-lang/regex/issues/367): + Fix byte class bug that caused a panic. + + +0.2.1 +===== +One major bug with `replace_all` has been fixed along with a couple of other +touchups. + +* [BUG #312](https://github.com/rust-lang/regex/issues/312): + Fix documentation for `NoExpand` to reference correct lifetime parameter. +* [BUG #314](https://github.com/rust-lang/regex/issues/314): + Fix a bug with `replace_all` when replacing a match with the empty string. +* [BUG #316](https://github.com/rust-lang/regex/issues/316): + Note a missing breaking change from the `0.2.0` CHANGELOG entry. + (`RegexBuilder::compile` was renamed to `RegexBuilder::build`.) +* [BUG #324](https://github.com/rust-lang/regex/issues/324): + Compiling `regex` should only require one version of `memchr` crate. + + +0.2.0 +===== +This is a new major release of the regex crate, and is an implementation of the +[regex 1.0 RFC](https://github.com/rust-lang/rfcs/blob/master/text/1620-regex-1.0.md). +We are releasing a `0.2` first, and if there are no major problems, we will +release a `1.0` shortly. For `0.2`, the minimum *supported* Rust version is +1.12. + +There are a number of **breaking changes** in `0.2`. They are split into two +types. The first type correspond to breaking changes in regular expression +syntax. The second type correspond to breaking changes in the API. + +Breaking changes for regex syntax: + +* POSIX character classes now require double bracketing. Previously, the regex + `[:upper:]` would parse as the `upper` POSIX character class. Now it parses + as the character class containing the characters `:upper:`. The fix to this + change is to use `[[:upper:]]` instead. Note that variants like + `[[:upper:][:blank:]]` continue to work. +* The character `[` must always be escaped inside a character class. 
+* The characters `&`, `-` and `~` must be escaped if any one of them are + repeated consecutively. For example, `[&]`, `[\&]`, `[\&\&]`, `[&-&]` are all + equivalent while `[&&]` is illegal. (The motivation for this and the prior + change is to provide a backwards compatible path for adding character class + set notation.) +* A `bytes::Regex` now has Unicode mode enabled by default (like the main + `Regex` type). This means regexes compiled with `bytes::Regex::new` that + don't have the Unicode flag set should add `(?-u)` to recover the original + behavior. + +Breaking changes for the regex API: + +* `find` and `find_iter` now **return `Match` values instead of + `(usize, usize)`.** `Match` values have `start` and `end` methods, which + return the match offsets. `Match` values also have an `as_str` method, + which returns the text of the match itself. +* The `Captures` type now only provides a single iterator over all capturing + matches, which should replace uses of `iter` and `iter_pos`. Uses of + `iter_named` should use the `capture_names` method on `Regex`. +* The `at` method on the `Captures` type has been renamed to `get`, and it + now returns a `Match`. Similarly, the `name` method on `Captures` now returns + a `Match`. +* The `replace` methods now return `Cow` values. The `Cow::Borrowed` variant + is returned when no replacements are made. +* The `Replacer` trait has been completely overhauled. This should only + impact clients that implement this trait explicitly. Standard uses of + the `replace` methods should continue to work unchanged. If you implement + the `Replacer` trait, please consult the new documentation. +* The `quote` free function has been renamed to `escape`. +* The `Regex::with_size_limit` method has been removed. It is replaced by + `RegexBuilder::size_limit`. +* The `RegexBuilder` type has switched from owned `self` method receivers to + `&mut self` method receivers. 
Most uses will continue to work unchanged, but + some code may require naming an intermediate variable to hold the builder. +* The `compile` method on `RegexBuilder` has been renamed to `build`. +* The free `is_match` function has been removed. It is replaced by compiling + a `Regex` and calling its `is_match` method. +* The `PartialEq` and `Eq` impls on `Regex` have been dropped. If you relied + on these impls, the fix is to define a wrapper type around `Regex`, impl + `Deref` on it and provide the necessary impls. +* The `is_empty` method on `Captures` has been removed. This always returns + `false`, so its use is superfluous. +* The `Syntax` variant of the `Error` type now contains a string instead of + a `regex_syntax::Error`. If you were examining syntax errors more closely, + you'll need to explicitly use the `regex_syntax` crate to re-parse the regex. +* The `InvalidSet` variant of the `Error` type has been removed since it is + no longer used. +* Most of the iterator types have been renamed to match conventions. If you + were using these iterator types explicitly, please consult the documentation + for its new name. For example, `RegexSplits` has been renamed to `Split`. + +A number of bugs have been fixed: + +* [BUG #151](https://github.com/rust-lang/regex/issues/151): + The `Replacer` trait has been changed to permit the caller to control + allocation. +* [BUG #165](https://github.com/rust-lang/regex/issues/165): + Remove the free `is_match` function. +* [BUG #166](https://github.com/rust-lang/regex/issues/166): + Expose more knobs (available in `0.1`) and remove `with_size_limit`. +* [BUG #168](https://github.com/rust-lang/regex/issues/168): + Iterators produced by `Captures` now have the correct lifetime parameters. +* [BUG #175](https://github.com/rust-lang/regex/issues/175): + Fix a corner case in the parsing of POSIX character classes. +* [BUG #178](https://github.com/rust-lang/regex/issues/178): + Drop the `PartialEq` and `Eq` impls on `Regex`. 
+* [BUG #179](https://github.com/rust-lang/regex/issues/179): + Remove `is_empty` from `Captures` since it always returns false. +* [BUG #276](https://github.com/rust-lang/regex/issues/276): + Position of named capture can now be retrieved from a `Captures`. +* [BUG #296](https://github.com/rust-lang/regex/issues/296): + Remove winapi/kernel32-sys dependency on UNIX. +* [BUG #307](https://github.com/rust-lang/regex/issues/307): + Fix error on emscripten. + + +0.1.80 +====== +* [PR #292](https://github.com/rust-lang/regex/pull/292): + Fixes bug #291, which was introduced by PR #290. + +0.1.79 +====== +* Require regex-syntax 0.3.8. + +0.1.78 +====== +* [PR #290](https://github.com/rust-lang/regex/pull/290): + Fixes bug #289, which caused some regexes with a certain combination + of literals to match incorrectly. + +0.1.77 +====== +* [PR #281](https://github.com/rust-lang/regex/pull/281): + Fixes bug #280 by disabling all literal optimizations when a pattern + is partially anchored. + +0.1.76 +====== +* Tweak criteria for using the Teddy literal matcher. + +0.1.75 +====== +* [PR #275](https://github.com/rust-lang/regex/pull/275): + Improves match verification performance in the Teddy SIMD searcher. +* [PR #278](https://github.com/rust-lang/regex/pull/278): + Replaces slow substring loop in the Teddy SIMD searcher with Aho-Corasick. +* Implemented DoubleEndedIterator on regex set match iterators. + +0.1.74 +====== +* Release regex-syntax 0.3.5 with a minor bug fix. +* Fix bug #272. +* Fix bug #277. +* [PR #270](https://github.com/rust-lang/regex/pull/270): + Fixes bugs #264, #268 and an unreported bug where the DFA cache size could be + drastically under estimated in some cases (leading to high unexpected memory + usage). + +0.1.73 +====== +* Release `regex-syntax 0.3.4`. +* Bump `regex-syntax` dependency version for `regex` to `0.3.4`. + +0.1.72 +====== +* [PR #262](https://github.com/rust-lang/regex/pull/262): + Fixes a number of small bugs caught by fuzz testing (AFL).
+ +0.1.71 +====== +* [PR #236](https://github.com/rust-lang/regex/pull/236): + Fix a bug in how suffix literals were extracted, which could lead + to invalid match behavior in some cases. + +0.1.70 +====== +* [PR #231](https://github.com/rust-lang/regex/pull/231): + Add SIMD accelerated multiple pattern search. +* [PR #228](https://github.com/rust-lang/regex/pull/228): + Reintroduce the reverse suffix literal optimization. +* [PR #226](https://github.com/rust-lang/regex/pull/226): + Implements NFA state compression in the lazy DFA. +* [PR #223](https://github.com/rust-lang/regex/pull/223): + A fully anchored RegexSet can now short-circuit. + +0.1.69 +====== +* [PR #216](https://github.com/rust-lang/regex/pull/216): + Tweak the threshold for running backtracking. +* [PR #217](https://github.com/rust-lang/regex/pull/217): + Add upper limit (from the DFA) to capture search (for the NFA). +* [PR #218](https://github.com/rust-lang/regex/pull/218): + Add rure, a C API. + +0.1.68 +====== +* [PR #210](https://github.com/rust-lang/regex/pull/210): + Fixed a performance bug in `bytes::Regex::replace` where `extend` was used + instead of `extend_from_slice`. +* [PR #211](https://github.com/rust-lang/regex/pull/211): + Fixed a bug in the handling of word boundaries in the DFA. +* [PR #213](https://github.com/rust-lang/regex/pull/213): + Added RE2 and Tcl to the benchmark harness. Also added a CLI utility for + running regexes using any of the following regex engines: PCRE1, PCRE2, + Oniguruma, RE2, Tcl and of course Rust's own regexes. + +0.1.67 +====== +* [PR #201](https://github.com/rust-lang/regex/pull/201): + Fix undefined behavior in the `regex!` compiler plugin macro. +* [PR #205](https://github.com/rust-lang/regex/pull/205): + More improvements to DFA performance. Competitive with RE2. See PR for + benchmarks. +* [PR #209](https://github.com/rust-lang/regex/pull/209): + Release 0.1.66 was semver incompatible since it required a newer version + of Rust than previous releases.
This PR fixes that. (And `0.1.66` was + yanked.) + +0.1.66 +====== +* Speculative support for Unicode word boundaries was added to the DFA. This + should remove the last common case that disqualified use of the DFA. +* An optimization that scanned for suffix literals and then matched the regular + expression in reverse was removed because it had worst case quadratic time + complexity. It was replaced with a more limited optimization where, given any + regex of the form `re$`, it will be matched in reverse from the end of the + haystack. +* [PR #202](https://github.com/rust-lang/regex/pull/202): + The inner loop of the DFA was heavily optimized to improve cache locality + and reduce the overall number of instructions run on each iteration. This + represents the first use of `unsafe` in `regex` (to elide bounds checks). +* [PR #200](https://github.com/rust-lang/regex/pull/200): + Use of the `mempool` crate (which used thread local storage) was replaced + with a faster version of a similar API in @Amanieu's `thread_local` crate. + It should reduce contention when using a regex from multiple threads + simultaneously. +* PCRE2 JIT benchmarks were added. A benchmark comparison can be found + [here](https://gist.github.com/anonymous/14683c01993e91689f7206a18675901b). + (Includes a comparison with PCRE1's JIT and Oniguruma.) +* A bug where word boundaries weren't being matched correctly in the DFA was + fixed. This only affected use of `bytes::Regex`. +* [#160](https://github.com/rust-lang/regex/issues/160): + `Captures` now has a `Debug` impl. 
diff --git a/regex-1.8.4/Cargo.toml b/regex-1.8.4/Cargo.toml new file mode 100644 index 0000000000000..b4371c4b945de --- /dev/null +++ b/regex-1.8.4/Cargo.toml @@ -0,0 +1,150 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +rust-version = "1.60.0" +name = "regex" +version = "1.8.4" +authors = ["The Rust Project Developers"] +exclude = [ + "/scripts/*", + "/.github/*", +] +autotests = false +description = """ +An implementation of regular expressions for Rust. This implementation uses +finite automata and guarantees linear time matching on all inputs. 
+""" +homepage = "https://github.com/rust-lang/regex" +documentation = "https://docs.rs/regex" +readme = "README.md" +categories = ["text-processing"] +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/regex" + +[profile.bench] +debug = 2 + +[profile.release] +debug = 2 + +[profile.test] +debug = 2 + +[lib] +doctest = false +bench = false + +[[test]] +name = "default" +path = "tests/test_default.rs" + +[[test]] +name = "default-bytes" +path = "tests/test_default_bytes.rs" + +[[test]] +name = "nfa" +path = "tests/test_nfa.rs" + +[[test]] +name = "nfa-utf8bytes" +path = "tests/test_nfa_utf8bytes.rs" + +[[test]] +name = "nfa-bytes" +path = "tests/test_nfa_bytes.rs" + +[[test]] +name = "backtrack" +path = "tests/test_backtrack.rs" + +[[test]] +name = "backtrack-utf8bytes" +path = "tests/test_backtrack_utf8bytes.rs" + +[[test]] +name = "backtrack-bytes" +path = "tests/test_backtrack_bytes.rs" + +[[test]] +name = "crates-regex" +path = "tests/test_crates_regex.rs" + +[dependencies.aho-corasick] +version = "1.0.0" +optional = true + +[dependencies.memchr] +version = "2.5.0" +optional = true + +[dependencies.regex-syntax] +version = "0.7.2" +default-features = false + +[dev-dependencies.lazy_static] +version = "1" + +[dev-dependencies.quickcheck] +version = "1.0.3" +default-features = false + +[dev-dependencies.rand] +version = "0.8.3" +features = [ + "getrandom", + "small_rng", +] +default-features = false + +[features] +default = [ + "std", + "perf", + "unicode", + "regex-syntax/default", +] +pattern = [] +perf = [ + "perf-cache", + "perf-dfa", + "perf-inline", + "perf-literal", +] +perf-cache = [] +perf-dfa = [] +perf-inline = [] +perf-literal = [ + "aho-corasick", + "memchr", +] +std = [] +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", + "regex-syntax/unicode", +] +unicode-age = ["regex-syntax/unicode-age"] +unicode-bool = 
["regex-syntax/unicode-bool"] +unicode-case = ["regex-syntax/unicode-case"] +unicode-gencat = ["regex-syntax/unicode-gencat"] +unicode-perl = ["regex-syntax/unicode-perl"] +unicode-script = ["regex-syntax/unicode-script"] +unicode-segment = ["regex-syntax/unicode-segment"] +unstable = ["pattern"] +use_std = ["std"] diff --git a/regex-1.8.4/Cargo.toml.orig b/regex-1.8.4/Cargo.toml.orig new file mode 100644 index 0000000000000..f8a76ab756ebe --- /dev/null +++ b/regex-1.8.4/Cargo.toml.orig @@ -0,0 +1,195 @@ +[package] +name = "regex" +version = "1.8.4" #:version +authors = ["The Rust Project Developers"] +license = "MIT OR Apache-2.0" +readme = "README.md" +repository = "https://github.com/rust-lang/regex" +documentation = "https://docs.rs/regex" +homepage = "https://github.com/rust-lang/regex" +description = """ +An implementation of regular expressions for Rust. This implementation uses +finite automata and guarantees linear time matching on all inputs. +""" +categories = ["text-processing"] +autotests = false +exclude = ["/scripts/*", "/.github/*"] +edition = "2021" +rust-version = "1.60.0" + +[workspace] +members = [ + "bench", "regex-capi", "regex-syntax", +] + +[lib] +# There are no benchmarks in the library code itself +bench = false +# Doc tests fail when some features aren't present. The easiest way to work +# around this is to disable automatic doc testing, but explicitly test them +# with `cargo test --doc`. +doctest = false + +# Features are documented in the "Crate features" section of the crate docs: +# https://docs.rs/regex/*/#crate-features +[features] +default = ["std", "perf", "unicode", "regex-syntax/default"] + +# ECOSYSTEM FEATURES + +# The 'std' feature permits the regex crate to use the standard library. This +# is intended to support future use cases where the regex crate may be able +# to compile without std, and instead just rely on 'core' and 'alloc' (for +# example). 
Currently, this isn't supported, and removing the 'std' feature +# will prevent regex from compiling. +std = [] +# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until +# then, it is an alias for the 'std' feature. +use_std = ["std"] + + +# PERFORMANCE FEATURES + +# Enables all performance features. +perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"] +# Enables fast caching. (If disabled, caching is still used, but is slower.) +# Currently, this feature has no effect. It used to remove the thread_local +# dependency and use a slower internal cache, but now the default cache has +# been improved and thread_local is no longer a dependency at all. +perf-cache = [] +# Enables use of a lazy DFA when possible. +perf-dfa = [] +# Enables aggressive use of inlining. +perf-inline = [] +# Enables literal optimizations. +perf-literal = ["aho-corasick", "memchr"] + + +# UNICODE DATA FEATURES + +# Enables all Unicode features. This expands if new Unicode features are added. +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", + "regex-syntax/unicode", +] +# Enables use of the `Age` property, e.g., `\p{Age:3.0}`. +unicode-age = ["regex-syntax/unicode-age"] +# Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`. +unicode-bool = ["regex-syntax/unicode-bool"] +# Enables Unicode-aware case insensitive matching, e.g., `(?i)β`. +unicode-case = ["regex-syntax/unicode-case"] +# Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`. +unicode-gencat = ["regex-syntax/unicode-gencat"] +# Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`. +unicode-perl = ["regex-syntax/unicode-perl"] +# Enables Unicode scripts and script extensions, e.g., `\p{Greek}`. +unicode-script = ["regex-syntax/unicode-script"] +# Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`. 
+unicode-segment = ["regex-syntax/unicode-segment"] + + +# UNSTABLE FEATURES (requires Rust nightly) + +# A blanket feature that governs whether unstable features are enabled or not. +# Unstable features are disabled by default, and typically rely on unstable +# features in rustc itself. +unstable = ["pattern"] + +# Enable to use the unstable pattern traits defined in std. This is enabled +# by default if the unstable feature is enabled. +pattern = [] + +# For very fast prefix literal matching. +[dependencies.aho-corasick] +version = "1.0.0" +optional = true + +# For skipping along search text quickly when a leading byte is known. +[dependencies.memchr] +version = "2.5.0" +optional = true + +# For parsing regular expressions. +[dependencies.regex-syntax] +path = "regex-syntax" +version = "0.7.2" +default-features = false + +[dev-dependencies] +# For examples. +lazy_static = "1" +# For property based tests. +quickcheck = { version = "1.0.3", default-features = false } +# For generating random test data. +rand = { version = "0.8.3", default-features = false, features = ["getrandom", "small_rng"] } +# To check README's example +# TODO: Re-enable this once the MSRV is 1.43 or greater. +# See: https://github.com/rust-lang/regex/issues/684 +# See: https://github.com/rust-lang/regex/issues/685 +# doc-comment = "0.3" + +# Run the test suite on the default behavior of Regex::new. +# This includes a mish mash of NFAs and DFAs, which are chosen automatically +# based on the regex. We test both of the NFA implementations by forcing their +# usage with the test definitions below. (We can't test the DFA implementations +# in the same way since they can't be used for every regex tested.) +[[test]] +path = "tests/test_default.rs" +name = "default" + +# The same as the default tests, but run on bytes::Regex. +[[test]] +path = "tests/test_default_bytes.rs" +name = "default-bytes" + +# Run the test suite on the NFA algorithm over Unicode codepoints. 
+[[test]] +path = "tests/test_nfa.rs" +name = "nfa" + +# Run the test suite on the NFA algorithm over bytes that match UTF-8 only. +[[test]] +path = "tests/test_nfa_utf8bytes.rs" +name = "nfa-utf8bytes" + +# Run the test suite on the NFA algorithm over arbitrary bytes. +[[test]] +path = "tests/test_nfa_bytes.rs" +name = "nfa-bytes" + +# Run the test suite on the backtracking engine over Unicode codepoints. +[[test]] +path = "tests/test_backtrack.rs" +name = "backtrack" + +# Run the test suite on the backtracking engine over bytes that match UTF-8 +# only. +[[test]] +path = "tests/test_backtrack_utf8bytes.rs" +name = "backtrack-utf8bytes" + +# Run the test suite on the backtracking engine over arbitrary bytes. +[[test]] +path = "tests/test_backtrack_bytes.rs" +name = "backtrack-bytes" + +# Run all backends against each regex found on crates.io and make sure +# that they all do the same thing. +[[test]] +path = "tests/test_crates_regex.rs" +name = "crates-regex" + +[profile.release] +debug = true + +[profile.bench] +debug = true + +[profile.test] +debug = true diff --git a/regex-1.8.4/HACKING.md b/regex-1.8.4/HACKING.md new file mode 100644 index 0000000000000..34af5b517cd96 --- /dev/null +++ b/regex-1.8.4/HACKING.md @@ -0,0 +1,341 @@ +Your friendly guide to hacking and navigating the regex library. + +This guide assumes familiarity with Rust and Cargo, and at least a perusal of +the user facing documentation for this crate. + +If you're looking for background on the implementation in this library, then +you can do no better than Russ Cox's article series on implementing regular +expressions using finite automata: https://swtch.com/~rsc/regexp/ + + +## Architecture overview + +As you probably already know, this library executes regular expressions using +finite automata. In particular, a design goal is to make searching linear +with respect to both the regular expression and the text being searched. 
+Meeting that design goal on its own is not so hard and can be done with an +implementation of the Pike VM (similar to Thompson's construction, but supports +capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html +--- This library contains such an implementation in src/pikevm.rs. + +Making it fast is harder. One of the key problems with the Pike VM is that it +can be in more than one state at any point in time, and must shuffle capture +positions between them. The Pike VM also spends a lot of time following the +same epsilon transitions over and over again. We can employ one trick to +speed up the Pike VM: extract one or more literal prefixes from the regular +expression and execute specialized code to quickly find matches of those +prefixes in the search text. The Pike VM can then be avoided for most of the +search, and instead only executed when a prefix is found. The code to find +prefixes is in the regex-syntax crate (in this repository). The code to search +for literals is in src/literals.rs. When more than one literal prefix is found, +we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one +literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and +Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this +library also uses elementary frequency analysis to choose the right byte to run +`memchr` with. + +Of course, detecting prefix literals can only take us so far. Not all regular +expressions have literal prefixes. To remedy this, we try another approach +to executing the Pike VM: backtracking, whose implementation can be found in +src/backtrack.rs. One reason why backtracking can be faster is that it avoids +excessive shuffling of capture groups. Of course, backtracking is susceptible +to exponential runtimes, so we keep track of every state we've visited to make +sure we never visit it again.
This guarantees linear time execution, but we +pay for it with the memory required to track visited states. Because of the +memory requirement, we only use this engine on small search strings *and* small +regular expressions. + +Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. +It is distinct from the Pike VM in that the DFA is explicitly represented in +memory and is only ever in one state at a time. It is said to be "lazy" because +the DFA is computed as text is searched, where each byte in the search text +results in at most one new DFA state. It is made fast by caching states. DFAs +are susceptible to exponential state blow up (where the worst case is computing +a new state for every input byte, regardless of what's in the state cache). To +avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache +is full, it is wiped and state computation starts over again. If the cache is +wiped too frequently, then the DFA gives up and searching falls back to one of +the aforementioned algorithms. + +All of the above matching engines expose precisely the same matching semantics. +This is indeed tested. (See the section below about testing.) + +The following sub-sections describe the rest of the library and how each of the +matching engines are actually used. + +### Parsing + +Regular expressions are parsed using the regex-syntax crate, which is +maintained in this repository. The regex-syntax crate defines an abstract +syntax and provides very detailed error messages when a parse error is +encountered. Parsing is done in a separate crate so that others may benefit +from its existence, and because it is relatively divorced from the rest of the +regex library. + +The regex-syntax crate also provides sophisticated support for extracting +prefix and suffix literals from regular expressions. + +### Compilation + +The compiler is in src/compile.rs. 
The input to the compiler is some abstract +syntax for a regular expression and the output is a sequence of opcodes that +matching engines use to execute a search. (One can think of matching engines as +mini virtual machines.) The sequence of opcodes is a particular encoding of a +non-deterministic finite automaton. In particular, the opcodes explicitly rely +on epsilon transitions. + +Consider a simple regular expression like `a|b`. Its compiled form looks like +this: + + 000 Save(0) + 001 Split(2, 3) + 002 'a' (goto: 4) + 003 'b' + 004 Save(1) + 005 Match + +The first column is the instruction pointer and the second column is the +instruction. Save instructions indicate that the current position in the input +should be stored in a captured location. Split instructions represent a binary +branch in the program (i.e., epsilon transitions). The instructions `'a'` and +`'b'` indicate that the literal bytes `'a'` or `'b'` should match. + +In older versions of this library, the compilation looked like this: + + 000 Save(0) + 001 Split(2, 3) + 002 'a' + 003 Jump(5) + 004 'b' + 005 Save(1) + 006 Match + +In particular, empty instructions that merely served to move execution from one +point in the program to another were removed. Instead, every instruction has a +`goto` pointer embedded into it. This resulted in a small performance boost for +the Pike VM, because it was one fewer epsilon transition that it had to follow. + +There exist more instructions and they are defined and documented in +src/prog.rs. + +Compilation has several knobs and a few unfortunately complicated invariants. +Namely, the output of compilation can be one of two types of programs: a +program that executes on Unicode scalar values or a program that executes +on raw bytes. In the former case, the matching engine is responsible for +performing UTF-8 decoding and executing instructions using Unicode codepoints. 
+In the latter case, the program handles UTF-8 decoding implicitly, so that the +matching engine can execute on raw bytes. All matching engines can execute +either Unicode or byte based programs except for the lazy DFA, which requires +byte based programs. In general, both representations were kept because (1) the +lazy DFA requires byte based programs so that states can be encoded in a memory +efficient manner and (2) the Pike VM benefits greatly from inlining Unicode +character classes into fewer instructions as it results in fewer epsilon +transitions. + +N.B. UTF-8 decoding is built into the compiled program by making use of the +utf8-ranges crate. The compiler in this library factors out common suffixes to +reduce the size of huge character classes (e.g., `\pL`). + +A regrettable consequence of this split in instruction sets is we generally +need to compile two programs; one for NFA execution and one for the lazy DFA. + +In fact, it is worse than that: the lazy DFA is not capable of finding the +starting location of a match in a single scan, and must instead execute a +backwards search after finding the end location. To execute a backwards search, +we must have compiled the regular expression *in reverse*. + +This means that every compilation of a regular expression generally results in +three distinct programs. It would be possible to lazily compile the Unicode +program, since it is never needed if (1) the regular expression uses no word +boundary assertions and (2) the caller never asks for sub-capture locations. + +### Execution + +At the time of writing, there are four matching engines in this library: + +1. The Pike VM (supports captures). +2. Bounded backtracking (supports captures). +3. Literal substring or multi-substring search. +4. Lazy DFA (no support for Unicode word boundary assertions). + +Only the first two matching engines are capable of executing every regular +expression program. 
They also happen to be the slowest, which means we need +some logic that (1) knows various facts about the regular expression and (2) +knows what the caller wants. Using this information, we can determine which +engine (or engines) to use. + +The logic for choosing which engine to execute is in src/exec.rs and is +documented on the Exec type. Exec values contain regular expression Programs +(defined in src/prog.rs), which contain all the necessary tidbits for actually +executing a regular expression on search text. + +For the most part, the execution logic is straight-forward and follows the +limitations of each engine described above pretty faithfully. The hairiest +part of src/exec.rs by far is the execution of the lazy DFA, since it requires +a forwards and backwards search, and then falls back to either the Pike VM or +backtracking if the caller requested capture locations. + +The Exec type also contains mutable scratch space for each type of matching +engine. This scratch space is used during search (for example, for the lazy +DFA, it contains compiled states that are reused on subsequent searches). + +### Programs + +A regular expression program is essentially a sequence of opcodes produced by +the compiler plus various facts about the regular expression (such as whether +it is anchored, its capture names, etc.). + +### The regex! macro + +The `regex!` macro no longer exists. It was developed in a bygone era as a +compiler plugin during the infancy of the regex crate. Back then, the only +matching engine in the crate was the Pike VM. The `regex!` macro was, itself, +also a Pike VM. The only advantages it offered over the dynamic Pike VM that +was built at runtime were the following: + + 1. Syntax checking was done at compile time. Your Rust program wouldn't + compile if your regex didn't compile. + 2. Reduction of overhead that was proportional to the size of the regex.
+ For the most part, this overhead consisted of heap allocation, which + was nearly eliminated in the compiler plugin. + +The main takeaway here is that the compiler plugin was a marginally faster +version of a slow regex engine. As the regex crate evolved, it grew other regex +engines (DFA, bounded backtracker) and sophisticated literal optimizations. +The regex macro didn't keep pace, and it therefore became (dramatically) slower +than the dynamic engines. The only reason left to use it was for the compile +time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint +tool) has a lint that checks your regular expression validity, which mostly +replaces that use case. + +Additionally, the regex compiler plugin stopped receiving maintenance. Nobody +complained. At that point, it seemed prudent to just remove it. + +Will a compiler plugin be brought back? The future is murky, but there is +definitely an opportunity there to build something that is faster than the +dynamic engines in some cases. But it will be challenging! As of now, there +are no plans to work on this. + + +## Testing + +A key aspect of any mature regex library is its test suite. A subset of the +tests in this library come from Glenn Fowler's AT&T test suite (its online +presence seems gone at the time of writing). The source of the test suite is +located in src/testdata. The scripts/regex-match-tests.py takes the test suite +in src/testdata and generates tests/matches.rs. + +There are also many other manually crafted tests and regression tests in +tests/tests.rs. Some of these tests were taken from RE2. + +The biggest source of complexity in the tests is related to answering this +question: how can we reuse the tests to check all of our matching engines? One +approach would have been to encode every test into some kind of format (like +the AT&T test suite) and code generate tests for each matching engine. 
The +approach we use in this library is to create a Cargo.toml entry point for each +matching engine we want to test. The entry points are: + +* `tests/test_default.rs` - tests `Regex::new` +* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` +* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA + algorithm on every regex. +* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA + algorithm on every regex and use *arbitrary* byte based programs. +* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA + algorithm on every regex and use *UTF-8* byte based programs. +* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use + backtracking on every regex. +* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use + backtracking on every regex and use *arbitrary* byte based programs. +* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use + backtracking on every regex and use *UTF-8* byte based programs. +* `tests/test_crates_regex.rs` - tests to make sure that all of the + backends behave in the same way against a number of quickcheck + generated random inputs. These tests need to be enabled through + the `RUST_REGEX_RANDOM_TEST` environment variable (see + below). + +The lazy DFA and pure literal engines are absent from this list because +they cannot be used on every regular expression. Instead, we rely on +`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. + +Since the tests are repeated several times, and because `cargo test` runs all +entry points, it can take a while to compile everything. To reduce compile +times slightly, try using `cargo test --test default`, which will only use the +`tests/test_default.rs` entry point. + +The random testing takes quite a while, so it is not enabled by default. +In order to run the random testing you can set the +`RUST_REGEX_RANDOM_TEST` environment variable to anything before +invoking `cargo test`. 
Note that this variable is inspected at compile +time, so if the tests don't seem to be running, you may need to run +`cargo clean`. + +## Benchmarking + +The benchmarking in this crate is made up of many micro-benchmarks. Currently, +there are two primary sets of benchmarks: the benchmarks that were adopted +at this library's inception (in `bench/src/misc.rs`) and a newer set of +benchmarks meant to test various optimizations. Specifically, the latter set +contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter +set are all executed on the same lengthy input whereas the former benchmarks +are executed on strings of varying length. + +There is also a smattering of benchmarks for parsing and compilation. + +Benchmarks are in a separate crate so that its dependencies can be managed +separately from the main regex crate. + +Benchmarking follows a similarly wonky setup as tests. There are multiple entry +points: + +* `bench_rust.rs` - benchmarks `Regex::new` +* `bench_rust_bytes.rs` - benchmarks `bytes::Regex::new` +* `bench_pcre.rs` - benchmarks PCRE +* `bench_onig.rs` - benchmarks Oniguruma + +The PCRE and Oniguruma benchmarks exist as a comparison point to a mature +regular expression library. In general, this regex library compares favorably +(there are even a few benchmarks that PCRE simply runs too slowly on or +outright can't execute at all). I would love to add other regular expression +library benchmarks (especially RE2). + +If you're hacking on one of the matching engines and just want to see +benchmarks, then all you need to run is: + + $ (cd bench && ./run rust) + +If you want to compare your results with older benchmarks, then try: + + $ (cd bench && ./run rust | tee old) + $ ...
make it faster + $ (cd bench && ./run rust | tee new) + $ cargo benchcmp old new --improvements + +The `cargo-benchcmp` utility is available here: +https://github.com/BurntSushi/cargo-benchcmp + +The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See +`./bench/bench --help`. + +## Dev Docs + +When digging your teeth into the codebase for the first time, the +crate documentation can be a great resource. By default `rustdoc` +will strip out all documentation of private crate members in an +effort to help consumers of the crate focus on the *interface* +without having to concern themselves with the *implementation*. +Normally this is a great thing, but if you want to start hacking +on regex internals it is not what you want. Many of the private members +of this crate are well documented with rustdoc style comments, and +it would be a shame to miss out on the opportunity that presents. +You can generate the private docs with: + +``` +$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments +``` + +Then just point your browser at `target/doc/regex/index.html`. + +See https://github.com/rust-lang/rust/issues/15347 for more info +about generating developer docs for internal use. diff --git a/regex-1.8.4/LICENSE-APACHE b/regex-1.8.4/LICENSE-APACHE new file mode 100644 index 0000000000000..16fe87b06e802 --- /dev/null +++ b/regex-1.8.4/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/regex-1.8.4/LICENSE-MIT b/regex-1.8.4/LICENSE-MIT new file mode 100644 index 0000000000000..39d4bdb5acd31 --- /dev/null +++ b/regex-1.8.4/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/regex-1.8.4/PERFORMANCE.md b/regex-1.8.4/PERFORMANCE.md new file mode 100644 index 0000000000000..8cd0d9c7194b5 --- /dev/null +++ b/regex-1.8.4/PERFORMANCE.md @@ -0,0 +1,277 @@ +Your friendly guide to understanding the performance characteristics of this +crate. + +This guide assumes some familiarity with the public API of this crate, which +can be found here: https://docs.rs/regex + +## Theory vs. Practice + +One of the design goals of this crate is to provide worst case linear time +behavior with respect to the text searched using finite state automata. This +means that, *in theory*, the performance of this crate is much better than most +regex implementations, which typically use backtracking which has worst case +exponential time. + +For example, try opening a Python interpreter and typing this: + + >>> import re + >>> re.search('(a*)*c', 'a' * 30).span() + +I'll wait. + +At some point, you'll figure out that it won't terminate any time soon. ^C it. + +The promise of this crate is that *this pathological behavior can't happen*. + +With that said, just because we have protected ourselves against worst case +exponential behavior doesn't mean we are immune from large constant factors +or places where the current regex engine isn't quite optimal. This guide will +detail those cases and provide guidance on how to avoid them, among other +bits of general advice. 
+ +## Thou Shalt Not Compile Regular Expressions In A Loop + +**Advice**: Use `lazy_static` to amortize the cost of `Regex` compilation. + +Don't do it unless you really don't mind paying for it. Compiling a regular +expression in this crate is quite expensive. It is conceivable that it may get +faster some day, but I wouldn't hold out hope for, say, an order of magnitude +improvement. In particular, compilation can take any where from a few dozen +microseconds to a few dozen milliseconds. Yes, milliseconds. Unicode character +classes, in particular, have the largest impact on compilation performance. At +the time of writing, for example, `\pL{100}` takes around 44ms to compile. This +is because `\pL` corresponds to every letter in Unicode and compilation must +turn it into a proper automaton that decodes a subset of UTF-8 which +corresponds to those letters. Compilation also spends some cycles shrinking the +size of the automaton. + +This means that in order to realize efficient regex matching, one must +*amortize the cost of compilation*. Trivially, if a call to `is_match` is +inside a loop, then make sure your call to `Regex::new` is *outside* that loop. + +In many programming languages, regular expressions can be conveniently defined +and compiled in a global scope, and code can reach out and use them as if +they were global static variables. In Rust, there is really no concept of +life-before-main, and therefore, one cannot utter this: + + static MY_REGEX: Regex = Regex::new("...").unwrap(); + +Unfortunately, this would seem to imply that one must pass `Regex` objects +around to everywhere they are used, which can be especially painful depending +on how your program is structured. Thankfully, the +[`lazy_static`](https://crates.io/crates/lazy_static) +crate provides an answer that works well: + + use lazy_static::lazy_static; + use regex::Regex; + + fn some_helper_function(text: &str) -> bool { + lazy_static! 
{ + static ref MY_REGEX: Regex = Regex::new("...").unwrap(); + } + MY_REGEX.is_match(text) + } + +In other words, the `lazy_static!` macro enables us to define a `Regex` *as if* +it were a global static value. What is actually happening under the covers is +that the code inside the macro (i.e., `Regex::new(...)`) is run on *first use* +of `MY_REGEX` via a `Deref` impl. The implementation is admittedly magical, but +it's self contained and everything works exactly as you expect. In particular, +`MY_REGEX` can be used from multiple threads without wrapping it in an `Arc` or +a `Mutex`. On that note... + +## Using a regex from multiple threads + +**Advice**: The performance impact from using a `Regex` from multiple threads +is likely negligible. If necessary, clone the `Regex` so that each thread gets +its own copy. Cloning a regex does not incur any additional memory overhead +than what would be used by using a `Regex` from multiple threads +simultaneously. *Its only cost is ergonomics.* + +It is supported and encouraged to define your regexes using `lazy_static!` as +if they were global static values, and then use them to search text from +multiple threads simultaneously. + +One might imagine that this is possible because a `Regex` represents a +*compiled* program, so that any allocation or mutation is already done, and is +therefore read-only. Unfortunately, this is not true. Each type of search +strategy in this crate requires some kind of mutable scratch space to use +*during search*. For example, when executing a DFA, its states are computed +lazily and reused on subsequent searches. Those states go into that mutable +scratch space. + +The mutable scratch space is an implementation detail, and in general, its +mutation should not be observable from users of this crate. Therefore, it uses +interior mutability. This implies that `Regex` can either only be used from one +thread, or it must do some sort of synchronization. 
Either choice is +reasonable, but this crate chooses the latter, in particular because it is +ergonomic and makes use with `lazy_static!` straight forward. + +Synchronization implies *some* amount of overhead. When a `Regex` is used from +a single thread, this overhead is negligible. When a `Regex` is used from +multiple threads simultaneously, it is possible for the overhead of +synchronization from contention to impact performance. The specific cases where +contention may happen is if you are calling any of these methods repeatedly +from multiple threads simultaneously: + +* shortest_match +* is_match +* find +* captures + +In particular, every invocation of one of these methods must synchronize with +other threads to retrieve its mutable scratch space before searching can start. +If, however, you are using one of these methods: + +* find_iter +* captures_iter + +Then you may not suffer from contention since the cost of synchronization is +amortized on *construction of the iterator*. That is, the mutable scratch space +is obtained when the iterator is created and retained throughout its lifetime. + +## Only ask for what you need + +**Advice**: Prefer in this order: `is_match`, `find`, `captures`. + +There are three primary search methods on a `Regex`: + +* is_match +* find +* captures + +In general, these are ordered from fastest to slowest. + +`is_match` is fastest because it doesn't actually need to find the start or the +end of the leftmost-first match. It can quit immediately after it knows there +is a match. For example, given the regex `a+` and the haystack, `aaaaa`, the +search will quit after examining the first byte. + +In contrast, `find` must return both the start and end location of the +leftmost-first match. It can use the DFA matcher for this, but must run it +forwards once to find the end of the match *and then run it backwards* to find +the start of the match. 
The two scans and the cost of finding the real end of +the leftmost-first match make this more expensive than `is_match`. + +`captures` is the most expensive of them all because it must do what `find` +does, and then run either the bounded backtracker or the Pike VM to fill in the +capture group locations. Both of these are simulations of an NFA, which must +spend a lot of time shuffling states around. The DFA limits the performance hit +somewhat by restricting the amount of text that must be searched via an NFA +simulation. + +One other method not mentioned is `shortest_match`. This method has precisely +the same performance characteristics as `is_match`, except it will return the +end location of when it discovered a match. For example, given the regex `a+` +and the haystack `aaaaa`, `shortest_match` may return `1` as opposed to `5`, +the latter of which being the correct end location of the leftmost-first match. + +## Literals in your regex may make it faster + +**Advice**: Literals can reduce the work that the regex engine needs to do. Use +them if you can, especially as prefixes. + +In particular, if your regex starts with a prefix literal, the prefix is +quickly searched before entering the (much slower) regex engine. For example, +given the regex `foo\w+`, the literal `foo` will be searched for using +Boyer-Moore. If there's no match, then no regex engine is ever used. Only when +there's a match is the regex engine invoked at the location of the match, which +effectively permits the regex engine to skip large portions of a haystack. +If a regex is comprised entirely of literals (possibly more than one), then +it's possible that the regex engine can be avoided entirely even when there's a +match. + +When one literal is found, Boyer-Moore is used. When multiple literals are +found, then an optimized version of Aho-Corasick is used. + +This optimization is in particular extended quite a bit in this crate. 
Here are +a few examples of regexes that get literal prefixes detected: + +* `(foo|bar)` detects `foo` and `bar` +* `(a|b)c` detects `ac` and `bc` +* `[ab]foo[yz]` detects `afooy`, `afooz`, `bfooy` and `bfooz` +* `a?b` detects `a` and `b` +* `a*b` detects `a` and `b` +* `(ab){3,6}` detects `ababab` + +Literals in anchored regexes can also be used for detecting non-matches very +quickly. For example, `^foo\w+` and `\w+foo$` may be able to detect a non-match +just by examining the first (or last) three bytes of the haystack. + +## Unicode word boundaries may prevent the DFA from being used + +**Advice**: In most cases, `\b` should work well. If not, use `(?-u:\b)` +instead of `\b` if you care about consistent performance more than correctness. + +It's a sad state of the current implementation. At the moment, the DFA will try +to interpret Unicode word boundaries as if they were ASCII word boundaries. +If the DFA comes across any non-ASCII byte, it will quit and fall back to an +alternative matching engine that can handle Unicode word boundaries correctly. +The alternate matching engine is generally quite a bit slower (perhaps by an +order of magnitude). If necessary, this can be ameliorated in two ways. + +The first way is to add some number of literal prefixes to your regular +expression. Even though the DFA may not be used, specialized routines will +still kick in to find prefix literals quickly, which limits how much work the +NFA simulation will need to do. + +The second way is to give up on Unicode and use an ASCII word boundary instead. +One can use an ASCII word boundary by disabling Unicode support. That is, +instead of using `\b`, use `(?-u:\b)`. Namely, given the regex `\b.+\b`, it +can be transformed into a regex that uses the DFA with `(?-u:\b).+(?-u:\b)`. It +is important to limit the scope of disabling the `u` flag, since it might lead +to a syntax error if the regex could match arbitrary bytes. 
For example, if one +wrote `(?-u)\b.+\b`, then a syntax error would be returned because `.` matches +any *byte* when the Unicode flag is disabled. + +The second way isn't appreciably different than just using a Unicode word +boundary in the first place, since the DFA will speculatively interpret it as +an ASCII word boundary anyway. The key difference is that if an ASCII word +boundary is used explicitly, then the DFA won't quit in the presence of +non-ASCII UTF-8 bytes. This results in giving up correctness in exchange for +more consistent performance. + +N.B. When using `bytes::Regex`, Unicode support is disabled by default, so one +can simply write `\b` to get an ASCII word boundary. + +## Excessive counting can lead to exponential state blow up in the DFA + +**Advice**: Don't write regexes that cause DFA state blow up if you care about +match performance. + +Wait, didn't I say that this crate guards against exponential worst cases? +Well, it turns out that the process of converting an NFA to a DFA can lead to +an exponential blow up in the number of states. This crate specifically guards +against exponential blow up by doing two things: + +1. The DFA is computed lazily. That is, a state in the DFA only exists in + memory if it is visited. In particular, the lazy DFA guarantees that *at + most* one state is created for every byte of input. This, on its own, + guarantees linear time complexity. +2. Of course, creating a new state for *every* byte of input means that search + will go incredibly slow because of very large constant factors. On top of + that, creating a state for every byte in a large haystack could result in + exorbitant memory usage. To ameliorate this, the DFA bounds the number of + states it can store. Once it reaches its limit, it flushes its cache. This + prevents reuse of states that it already computed. If the cache is flushed + too frequently, then the DFA will give up and execution will fall back to + one of the NFA simulations. 
+ +In effect, this crate will detect exponential state blow up and fall back to +a search routine with fixed memory requirements. This does, however, mean that +searching will be much slower than one might expect. Regexes that rely on +counting in particular are strong aggravators of this behavior. For example, +matching `[01]*1[01]{20}$` against a random sequence of `0`s and `1`s. + +In the future, it may be possible to increase the bound that the DFA uses, +which would allow the caller to choose how much memory they're willing to +spend. + +## Resist the temptation to "optimize" regexes + +**Advice**: This ain't a backtracking engine. + +An entire book was written on how to optimize Perl-style regular expressions. +Most of those techniques are not applicable for this library. For example, +there is no problem with using non-greedy matching or having lots of +alternations in your regex. diff --git a/regex-1.8.4/README.md b/regex-1.8.4/README.md new file mode 100644 index 0000000000000..020b3539569e2 --- /dev/null +++ b/regex-1.8.4/README.md @@ -0,0 +1,246 @@ +regex +===== +A Rust library for parsing, compiling, and executing regular expressions. Its +syntax is similar to Perl-style regular expressions, but lacks a few features +like look around and backreferences. In exchange, all searches execute in +linear time with respect to the size of the regular expression and search text. +Much of the syntax and implementation is inspired +by [RE2](https://github.com/google/re2). + +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) +[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) +[![Rust](https://img.shields.io/badge/rust-1.60.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) + +### Documentation + +[Module documentation with examples](https://docs.rs/regex). 
+The module documentation also includes a comprehensive description of the +syntax supported. + +Documentation with examples for the various matching functions and iterators +can be found on the +[`Regex` type](https://docs.rs/regex/*/regex/struct.Regex.html). + +### Usage + +To bring this crate into your repository, either add `regex` to your +`Cargo.toml`, or run `cargo add regex`. + +Here's a simple example that matches a date in YYYY-MM-DD format and prints the +year, month and day: + +```rust +use regex::Regex; + +fn main() { + let re = Regex::new(r"(?x) +(?P\d{4}) # the year +- +(?P\d{2}) # the month +- +(?P\d{2}) # the day +").unwrap(); + let caps = re.captures("2010-03-14").unwrap(); + + assert_eq!("2010", &caps["year"]); + assert_eq!("03", &caps["month"]); + assert_eq!("14", &caps["day"]); +} +``` + +If you have lots of dates in text that you'd like to iterate over, then it's +easy to adapt the above example with an iterator: + +```rust +use regex::Regex; + +const TO_SEARCH: &'static str = " +On 2010-03-14, foo happened. On 2014-10-14, bar happened. +"; + +fn main() { + let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); + + for caps in re.captures_iter(TO_SEARCH) { + // Note that all of the unwraps are actually OK for this regex + // because the only way for the regex to match is if all of the + // capture groups match. This is not true in general though! + println!("year: {}, month: {}, day: {}", + caps.get(1).unwrap().as_str(), + caps.get(2).unwrap().as_str(), + caps.get(3).unwrap().as_str()); + } +} +``` + +This example outputs: + +```text +year: 2010, month: 03, day: 14 +year: 2014, month: 10, day: 14 +``` + +### Usage: Avoid compiling the same regex in a loop + +It is an anti-pattern to compile the same regular expression in a loop since +compilation is typically expensive. (It takes anywhere from a few microseconds +to a few **milliseconds** depending on the size of the regex.) 
Not only is +compilation itself expensive, but this also prevents optimizations that reuse +allocations internally to the matching engines. + +In Rust, it can sometimes be a pain to pass regular expressions around if +they're used from inside a helper function. Instead, we recommend using the +[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that +regular expressions are compiled exactly once. + +For example: + +```rust,ignore +use regex::Regex; + +fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref RE: Regex = Regex::new("...").unwrap(); + } + RE.is_match(text) +} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previous compilation. + +### Usage: match regular expressions on `&[u8]` + +The main API of this crate (`regex::Regex`) requires the caller to pass a +`&str` for searching. In Rust, an `&str` is required to be valid UTF-8, which +means the main API can't be used for searching arbitrary bytes. + +To match on arbitrary bytes, use the `regex::bytes::Regex` API. The API +is identical to the main API, except that it takes an `&[u8]` to search +on instead of an `&str`. By default, `.` will match any *byte* using +`regex::bytes::Regex`, while `.` will match any *UTF-8 encoded Unicode scalar +value* using the main API. + +This example shows how to find all null-terminated strings in a slice of bytes: + +```rust +use regex::bytes::Regex; + +let re = Regex::new(r"(?P[^\x00]+)\x00").unwrap(); +let text = b"foo\x00bar\x00baz\x00"; + +// Extract all of the strings without the null terminator from each match. +// The unwrap is OK here since a match requires the `cstr` capture to match. 
+let cstrs: Vec<&[u8]> = + re.captures_iter(text) + .map(|c| c.name("cstr").unwrap().as_bytes()) + .collect(); +assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +``` + +Notice here that the `[^\x00]+` will match any *byte* except for `NUL`. When +using the main API, `[^\x00]+` would instead match any valid UTF-8 sequence +except for `NUL`. + +### Usage: match multiple regular expressions simultaneously + +This demonstrates how to use a `RegexSet` to match multiple (possibly +overlapping) regular expressions in a single scan of the search text: + +```rust +use regex::RegexSet; + +let set = RegexSet::new(&[ + r"\w+", + r"\d+", + r"\pL+", + r"foo", + r"bar", + r"barfoo", + r"foobar", +]).unwrap(); + +// Iterate over and collect all of the matches. +let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +assert_eq!(matches, vec![0, 2, 3, 4, 6]); + +// You can also test whether a particular regex matched: +let matches = set.matches("foobar"); +assert!(!matches.matched(5)); +assert!(matches.matched(6)); +``` + +### Usage: enable SIMD optimizations + +SIMD optimizations are enabled automatically on Rust stable 1.27 and newer. +For nightly versions of Rust, this requires a recent version with the SIMD +features stabilized. + + +### Usage: a regular expression parser + +This repository contains a crate that provides a well tested regular expression +parser, abstract syntax and a high-level intermediate representation for +convenient analysis. It provides no facilities for compilation or execution. +This may be useful if you're implementing your own regex engine or otherwise +need to do analysis on the syntax of a regular expression. It is otherwise not +recommended for general use. + +[Documentation `regex-syntax`.](https://docs.rs/regex-syntax) + + +### Crate features + +This crate comes with several features that permit tweaking the trade off +between binary size, compilation time and runtime performance. 
Users of this +crate can selectively disable Unicode tables, or choose from a variety of +optimizations performed by this crate to disable. + +When all of these features are disabled, runtime match performance may be much +worse, but if you're matching on short strings, or if high performance isn't +necessary, then such a configuration is perfectly serviceable. To disable +all such features, use the following `Cargo.toml` dependency configuration: + +```toml +[dependencies.regex] +version = "1.3" +default-features = false +# regex currently requires the standard library, you must re-enable it. +features = ["std"] +``` + +This will reduce the dependency tree of `regex` down to a single crate +(`regex-syntax`). + +The full set of features one can disable are +[in the "Crate features" section of the documentation](https://docs.rs/regex/*/#crate-features). + + +### Minimum Rust version policy + +This crate's minimum supported `rustc` version is `1.60.0`. + +The current **tentative** policy is that the minimum Rust version required +to use this crate can be increased in minor version updates. For example, if +regex 1.0 requires Rust 1.20.0, then regex 1.0.z for all values of `z` will +also require Rust 1.20.0 or newer. However, regex 1.y for `y > 0` may require a +newer minimum version of Rust. + +In general, this crate will be conservative with respect to the minimum +supported version of Rust. + + +### License + +This project is licensed under either of + + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + https://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or + https://opensource.org/licenses/MIT) + +at your option. + +The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode +License Agreement +([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)). 
diff --git a/regex-1.8.4/UNICODE.md b/regex-1.8.4/UNICODE.md new file mode 100644 index 0000000000000..df7d21ed97411 --- /dev/null +++ b/regex-1.8.4/UNICODE.md @@ -0,0 +1,259 @@ +# Unicode conformance + +This document describes the regex crate's conformance to Unicode's +[UTS#18](https://unicode.org/reports/tr18/) +report, which lays out 3 levels of support: Basic, Extended and Tailored. + +Full support for Level 1 ("Basic Unicode Support") is provided with two +exceptions: + +1. Line boundaries are not Unicode aware. Namely, only the `\n` + (`END OF LINE`) character is recognized as a line boundary. +2. The compatibility properties specified by + [RL1.2a](https://unicode.org/reports/tr18/#RL1.2a) + are ASCII-only definitions. + +Little to no support is provided for either Level 2 or Level 3. For the most +part, this is because the features are either complex/hard to implement, or at +the very least, very difficult to implement without sacrificing performance. +For example, tackling canonical equivalence such that matching worked as one +would expect regardless of normalization form would be a significant +undertaking. This is at least partially a result of the fact that this regex +engine is based on finite automata, which admits less flexibility normally +associated with backtracking implementations. + + +## RL1.1 Hex Notation + +[UTS#18 RL1.1](https://unicode.org/reports/tr18/#Hex_notation) + +Hex Notation refers to the ability to specify a Unicode code point in a regular +expression via its hexadecimal code point representation. This is useful in +environments that have poor Unicode font rendering or if you need to express a +code point that is not normally displayable. 
All forms of hexadecimal notation +are supported:
Property name aliases can be found in Unicode's +[`PropertyAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt) +file, while property value aliases can be found in Unicode's +[`PropertyValueAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt) +file. + +The syntax supported is also consistent with the UTS#18 recommendation: + +* `\p{Greek}` selects the `Greek` script. Equivalent expressions follow: + `\p{sc:Greek}`, `\p{Script:Greek}`, `\p{Sc=Greek}`, `\p{script=Greek}`, + `\P{sc!=Greek}`. Similarly for `General_Category` (or `gc` for short) and + `Script_Extensions` (or `scx` for short). +* `\p{age:3.2}` selects all code points in Unicode 3.2. +* `\p{Alphabetic}` selects the "alphabetic" property and can be abbreviated + via `\p{alpha}` (for example). +* Single letter variants for properties with single letter abbreviations. + For example, `\p{Letter}` can be equivalently written as `\pL`. + +The following is a list of all properties supported by the regex crate (starred +properties correspond to properties required by RL1.2): + +* `General_Category` \* (including `Any`, `ASCII` and `Assigned`) +* `Script` \* +* `Script_Extensions` \* +* `Age` +* `ASCII_Hex_Digit` +* `Alphabetic` \* +* `Bidi_Control` +* `Case_Ignorable` +* `Cased` +* `Changes_When_Casefolded` +* `Changes_When_Casemapped` +* `Changes_When_Lowercased` +* `Changes_When_Titlecased` +* `Changes_When_Uppercased` +* `Dash` +* `Default_Ignorable_Code_Point` \* +* `Deprecated` +* `Diacritic` +* `Emoji` +* `Emoji_Presentation` +* `Emoji_Modifier` +* `Emoji_Modifier_Base` +* `Emoji_Component` +* `Extended_Pictographic` +* `Extender` +* `Grapheme_Base` +* `Grapheme_Cluster_Break` +* `Grapheme_Extend` +* `Hex_Digit` +* `IDS_Binary_Operator` +* `IDS_Trinary_Operator` +* `ID_Continue` +* `ID_Start` +* `Join_Control` +* `Logical_Order_Exception` +* `Lowercase` \* +* `Math` +* `Noncharacter_Code_Point` \* +* `Pattern_Syntax` +* `Pattern_White_Space` +* 
`Prepended_Concatenation_Mark` +* `Quotation_Mark` +* `Radical` +* `Regional_Indicator` +* `Sentence_Break` +* `Sentence_Terminal` +* `Soft_Dotted` +* `Terminal_Punctuation` +* `Unified_Ideograph` +* `Uppercase` \* +* `Variation_Selector` +* `White_Space` \* +* `Word_Break` +* `XID_Continue` +* `XID_Start` + + +## RL1.2a Compatibility Properties + +[UTS#18 RL1.2a](https://unicode.org/reports/tr18/#RL1.2a) + +The regex crate only provides ASCII definitions of the +[compatibility properties documented in UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties) +(sans the `\X` class, for matching grapheme clusters, which isn't provided +at all). This is because it seems to be consistent with most other regular +expression engines, and in particular, because these are often referred to as +"ASCII" or "POSIX" character classes. + +Note that the `\w`, `\s` and `\d` character classes **are** Unicode aware. +Their traditional ASCII definition can be used by disabling Unicode. That is, +`[[:word:]]` and `(?-u)\w` are equivalent. + + +## RL1.3 Subtraction and Intersection + +[UTS#18 RL1.3](https://unicode.org/reports/tr18/#Subtraction_and_Intersection) + +The regex crate provides full support for nested character classes, along with +union, intersection (`&&`), difference (`--`) and symmetric difference (`~~`) +operations on arbitrary character classes. + +For example, to match all non-ASCII letters, you could use either +`[\p{Letter}--\p{Ascii}]` (difference) or `[\p{Letter}&&[^\p{Ascii}]]` +(intersecting the negation). + + +## RL1.4 Simple Word Boundaries + +[UTS#18 RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries) + +The regex crate provides basic Unicode aware word boundary assertions. A word +boundary assertion can be written as `\b`, or `\B` as its negation. A word +boundary negation corresponds to a zero-width match, where its adjacent +characters correspond to word and non-word, or non-word and word characters. 
+ +Conformance in this case chooses to define word character in the same way that +the `\w` character class is defined: a code point that is a member of one of +the following classes: + +* `\p{Alphabetic}` +* `\p{Join_Control}` +* `\p{gc:Mark}` +* `\p{gc:Decimal_Number}` +* `\p{gc:Connector_Punctuation}` + +In particular, this differs slightly from the +[prescription given in RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries) +but is permissible according to +[UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +Namely, it is convenient and simpler to have `\w` and `\b` be in sync with +one another. + +Finally, Unicode word boundaries can be disabled, which will cause ASCII word +boundaries to be used instead. That is, `\b` is a Unicode word boundary while +`(?-u)\b` is an ASCII-only word boundary. This can occasionally be beneficial +if performance is important, since the implementation of Unicode word +boundaries is currently sub-optimal on non-ASCII text. + + +## RL1.5 Simple Loose Matches + +[UTS#18 RL1.5](https://unicode.org/reports/tr18/#Simple_Loose_Matches) + +The regex crate provides full support for case insensitive matching in +accordance with RL1.5. That is, it uses the "simple" case folding mapping. The +"simple" mapping was chosen because of a key convenient property: every +"simple" mapping is a mapping from exactly one code point to exactly one other +code point. This makes case insensitive matching of character classes, for +example, straight-forward to implement. + +When case insensitive mode is enabled (e.g., `(?i)[a]` is equivalent to `a|A`), +then all character classes are case folded as well. + + +## RL1.6 Line Boundaries + +[UTS#18 RL1.6](https://unicode.org/reports/tr18/#Line_Boundaries) + +The regex crate only provides support for recognizing the `\n` (`END OF LINE`) +character as a line boundary.
This choice was made mostly for implementation +convenience, and to avoid performance cliffs that Unicode word boundaries are +subject to. + +Ideally, it would be nice to at least support `\r\n` as a line boundary as +well, and in theory, this could be done efficiently. + + +## RL1.7 Code Points + +[UTS#18 RL1.7](https://unicode.org/reports/tr18/#Supplementary_Characters) + +The regex crate provides full support for Unicode code point matching. Namely, +the fundamental atom of any match is always a single code point. + +Given Rust's strong ties to UTF-8, the following guarantees are also provided: + +* All matches are reported on valid UTF-8 code unit boundaries. That is, any + match range returned by the public regex API is guaranteed to successfully + slice the string that was searched. +* By consequence of the above, it is impossible to match surrogate code points. + No support for UTF-16 is provided, so this is never necessary. + +Note that when Unicode mode is disabled, the fundamental atom of matching is +no longer a code point but a single byte. When Unicode mode is disabled, many +Unicode features are disabled as well. For example, `(?-u)\pL` is not a valid +regex but `\pL(?-u)\xFF` (matches any Unicode `Letter` followed by the literal +byte `\xFF`) is.
diff --git a/regex-1.8.4/examples/regexdna-input.txt b/regex-1.8.4/examples/regexdna-input.txt new file mode 100644 index 0000000000000..fb23263397d9e --- /dev/null +++ b/regex-1.8.4/examples/regexdna-input.txt @@ -0,0 +1,1671 @@ +>ONE Homo sapiens alu +GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA +TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT +AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG +GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG +CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA +GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA +TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG +AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA +GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT +AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC +AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG +GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC +CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG +AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT +TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA +TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT +GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG +TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT +CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG +CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG +TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA +CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG +AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG +GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC +TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA +TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA +GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT 
+GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC +ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT +TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC +CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG +CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG +GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC +CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT +GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC +GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA +GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA +GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA +GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG +AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT +CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA +GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA +AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC +GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT +ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG +GAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATC +GCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGC +GGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGG +TCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAA +AAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAG +GAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACT +CCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCC +TGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAG +ACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGC +GTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGA +ACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGA +CAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCA +CTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCA +ACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCG 
+CCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGG +AGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTC +CGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCG +AGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACC +CCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAG +CTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAG +CCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGG +CCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATC +ACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAA +AAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGC +TGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCC +ACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGG +CTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGG +AGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATT +AGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAA +TCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGC +CTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAA +TCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAG +CCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGT +GGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCG +GGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAG +CGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTG +GGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATG +GTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGT +AATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTT +GCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCT +CAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCG +GGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTC +TCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACT +CGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAG +ATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGG +CGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTG 
+AGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATA +CAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGG +CAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGC +ACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCAC +GCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTC +GAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCG +GGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCT +TGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGG +CGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCA +GCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGG +CCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGC +GCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGG +CGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGA +CTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGG +CCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAA +ACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCC +CAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGT +GAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAA +AGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGG +ATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTAC +TAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGA +GGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGC +GCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGG +TGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTC +AGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAA +ATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGA +GAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC +AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTG +TAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGAC +CAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGT +GGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAAC +CCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACA 
+GAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACT +TTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAAC +ATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCC +TGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAG +GTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCG +TCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAG +GCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCC +GTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCT +ACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCC +GAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCC +GGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCAC +CTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAA +ATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTG +AGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCAC +TGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCT +CACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAG +TTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAG +CCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATC +GCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCT +GGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATC +CCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCC +TGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGG +CGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG +AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCG +AGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGG +AGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGT +GAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAA +TCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGC +AGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCA +AAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGG +CGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTC +TACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCG 
+GGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGAT +CGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCG +CGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAG +GTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACA +AAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCA +GGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCAC +TCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGC +CTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGA +GACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGG +CGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTG +AACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCG +ACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGC +ACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCC +AACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGC +GCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCG +GAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACT +CCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCC +GAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAAC +CCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA +GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGA +GCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAG +GCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGAT +CACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTA +AAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGG +CTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGC +CACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTG +GCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAG +GAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAT +TAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGA +ATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAG +CCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTA +ATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCA 
+GCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGG +TGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCC +GGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGA +GCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTT +GGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACAT +GGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTG +TAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGT +TGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTC +TCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGC +GGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGT +CTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTAC +TCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGA +GATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGG +GCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCT +GAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT +ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAG +GCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTG +CACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCA +CGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTT +CGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCC +GGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGC +TTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGG +GCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCC +AGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTG +GCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCG +CGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAG +GCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAG +ACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAG +GCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGA +AACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATC +CCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAG +TGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAA 
+AAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCG +GATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTA +CTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGG +AGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCG +CGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCG +GTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGT +CAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAA +AATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGG +AGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTC +CAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCT +GTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA +CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCG +TGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAA +CCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGAC +AGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCAC +TTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAA +CATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGC +CTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGA +GGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCC +GTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGA +GGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCC +CGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGC +TACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGC +CGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGC +CGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCA +CCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAA +AATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCT +GAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCA +CTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGC +TCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGA +GTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTA +GCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAAT 
+CGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCC +TGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAAT +CCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGC +CTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTG +GCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGG +GAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGC +GAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG +GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGG +TGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTA +ATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTG +CAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTC +AAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGG +GCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCT +CTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTC +GGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGA +TCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGC +GCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGA +GGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATAC +AAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGC +AGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCA +CTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACG +CCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCG +AGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGG +GCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTT +GAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGC +GACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAG +CACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGC +CAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCG +CGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGC +GGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGAC +TCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGC +CGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAA 
+CCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCC +AGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTG +AGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA +GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA +TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT +AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG +GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG +CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA +GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA +TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG +AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA +GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT +AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC +AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG +GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC +CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG +AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT +TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA +TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT +GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG +TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT +CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG +CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG +TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA +CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG +AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG +GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC +TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA +TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA +GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT +GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC 
+ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT +TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC +CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG +CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG +GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC +CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT +GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC +GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA +GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA +GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA +GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG +AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT +CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA +GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA +AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC +GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT +ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG +GAGGCTGAGGCAGGAGAATC +>TWO IUB ambiguity codes +cttBtatcatatgctaKggNcataaaSatgtaaaDcDRtBggDtctttataattcBgtcg +tactDtDagcctatttSVHtHttKtgtHMaSattgWaHKHttttagacatWatgtRgaaa +NtactMcSMtYtcMgRtacttctWBacgaaatatagScDtttgaagacacatagtVgYgt +cattHWtMMWcStgttaggKtSgaYaaccWStcgBttgcgaMttBYatcWtgacaYcaga +gtaBDtRacttttcWatMttDBcatWtatcttactaBgaYtcttgttttttttYaaScYa +HgtgttNtSatcMtcVaaaStccRcctDaataataStcYtRDSaMtDttgttSagtRRca +tttHatSttMtWgtcgtatSSagactYaaattcaMtWatttaSgYttaRgKaRtccactt +tattRggaMcDaWaWagttttgacatgttctacaaaRaatataataaMttcgDacgaSSt +acaStYRctVaNMtMgtaggcKatcttttattaaaaagVWaHKYagtttttatttaacct +tacgtVtcVaattVMBcttaMtttaStgacttagattWWacVtgWYagWVRctDattBYt +gtttaagaagattattgacVatMaacattVctgtBSgaVtgWWggaKHaatKWcBScSWa +accRVacacaaactaccScattRatatKVtactatatttHttaagtttSKtRtacaaagt +RDttcaaaaWgcacatWaDgtDKacgaacaattacaRNWaatHtttStgttattaaMtgt +tgDcgtMgcatBtgcttcgcgaDWgagctgcgaggggVtaaScNatttacttaatgacag 
+cccccacatYScaMgtaggtYaNgttctgaMaacNaMRaacaaacaKctacatagYWctg +ttWaaataaaataRattagHacacaagcgKatacBttRttaagtatttccgatctHSaat +actcNttMaagtattMtgRtgaMgcataatHcMtaBSaRattagttgatHtMttaaKagg +YtaaBataSaVatactWtataVWgKgttaaaacagtgcgRatatacatVtHRtVYataSa +KtWaStVcNKHKttactatccctcatgWHatWaRcttactaggatctataDtDHBttata +aaaHgtacVtagaYttYaKcctattcttcttaataNDaaggaaaDYgcggctaaWSctBa +aNtgctggMBaKctaMVKagBaactaWaDaMaccYVtNtaHtVWtKgRtcaaNtYaNacg +gtttNattgVtttctgtBaWgtaattcaagtcaVWtactNggattctttaYtaaagccgc +tcttagHVggaYtgtNcDaVagctctctKgacgtatagYcctRYHDtgBattDaaDgccK +tcHaaStttMcctagtattgcRgWBaVatHaaaataYtgtttagMDMRtaataaggatMt +ttctWgtNtgtgaaaaMaatatRtttMtDgHHtgtcattttcWattRSHcVagaagtacg +ggtaKVattKYagactNaatgtttgKMMgYNtcccgSKttctaStatatNVataYHgtNa +BKRgNacaactgatttcctttaNcgatttctctataScaHtataRagtcRVttacDSDtt +aRtSatacHgtSKacYagttMHtWataggatgactNtatSaNctataVtttRNKtgRacc +tttYtatgttactttttcctttaaacatacaHactMacacggtWataMtBVacRaSaatc +cgtaBVttccagccBcttaRKtgtgcctttttRtgtcagcRttKtaaacKtaaatctcac +aattgcaNtSBaaccgggttattaaBcKatDagttactcttcattVtttHaaggctKKga +tacatcBggScagtVcacattttgaHaDSgHatRMaHWggtatatRgccDttcgtatcga +aacaHtaagttaRatgaVacttagattVKtaaYttaaatcaNatccRttRRaMScNaaaD +gttVHWgtcHaaHgacVaWtgttScactaagSgttatcttagggDtaccagWattWtRtg +ttHWHacgattBtgVcaYatcggttgagKcWtKKcaVtgaYgWctgYggVctgtHgaNcV +taBtWaaYatcDRaaRtSctgaHaYRttagatMatgcatttNattaDttaattgttctaa +ccctcccctagaWBtttHtBccttagaVaatMcBHagaVcWcagBVttcBtaYMccagat +gaaaaHctctaacgttagNWRtcggattNatcRaNHttcagtKttttgWatWttcSaNgg +gaWtactKKMaacatKatacNattgctWtatctaVgagctatgtRaHtYcWcttagccaa +tYttWttaWSSttaHcaaaaagVacVgtaVaRMgattaVcDactttcHHggHRtgNcctt +tYatcatKgctcctctatVcaaaaKaaaagtatatctgMtWtaaaacaStttMtcgactt +taSatcgDataaactaaacaagtaaVctaggaSccaatMVtaaSKNVattttgHccatca +cBVctgcaVatVttRtactgtVcaattHgtaaattaaattttYtatattaaRSgYtgBag +aHSBDgtagcacRHtYcBgtcacttacactaYcgctWtattgSHtSatcataaatataHt +cgtYaaMNgBaatttaRgaMaatatttBtttaaaHHKaatctgatWatYaacttMctctt +ttVctagctDaaagtaVaKaKRtaacBgtatccaaccactHHaagaagaaggaNaaatBW 
+attccgStaMSaMatBttgcatgRSacgttVVtaaDMtcSgVatWcaSatcttttVatag +ttactttacgatcaccNtaDVgSRcgVcgtgaacgaNtaNatatagtHtMgtHcMtagaa +attBgtataRaaaacaYKgtRccYtatgaagtaataKgtaaMttgaaRVatgcagaKStc +tHNaaatctBBtcttaYaBWHgtVtgacagcaRcataWctcaBcYacYgatDgtDHccta +aagacYRcaggattHaYgtKtaatgcVcaataMYacccatatcacgWDBtgaatcBaata +cKcttRaRtgatgaBDacggtaattaaYtataStgVHDtDctgactcaaatKtacaatgc +gYatBtRaDatHaactgtttatatDttttaaaKVccYcaaccNcBcgHaaVcattHctcg +attaaatBtatgcaaaaatYMctSactHatacgaWacattacMBgHttcgaatVaaaaca +BatatVtctgaaaaWtctRacgBMaatSgRgtgtcgactatcRtattaScctaStagKga +DcWgtYtDDWKRgRtHatRtggtcgaHgggcgtattaMgtcagccaBggWVcWctVaaat +tcgNaatcKWagcNaHtgaaaSaaagctcYctttRVtaaaatNtataaccKtaRgtttaM +tgtKaBtRtNaggaSattHatatWactcagtgtactaKctatttgRYYatKatgtccgtR +tttttatttaatatVgKtttgtatgtNtataRatWYNgtRtHggtaaKaYtKSDcatcKg +taaYatcSRctaVtSMWtVtRWHatttagataDtVggacagVcgKWagBgatBtaaagNc +aRtagcataBggactaacacRctKgttaatcctHgDgttKHHagttgttaatgHBtatHc +DaagtVaBaRccctVgtgDtacRHSctaagagcggWYaBtSaKtHBtaaactYacgNKBa +VYgtaacttagtVttcttaatgtBtatMtMtttaattaatBWccatRtttcatagVgMMt +agctStKctaMactacDNYgKYHgaWcgaHgagattacVgtttgtRaSttaWaVgataat +gtgtYtaStattattMtNgWtgttKaccaatagNYttattcgtatHcWtctaaaNVYKKt +tWtggcDtcgaagtNcagatacgcattaagaccWctgcagcttggNSgaNcHggatgtVt +catNtRaaBNcHVagagaaBtaaSggDaatWaatRccaVgggStctDaacataKttKatt +tggacYtattcSatcttagcaatgaVBMcttDattctYaaRgatgcattttNgVHtKcYR +aatRKctgtaaacRatVSagctgtWacBtKVatctgttttKcgtctaaDcaagtatcSat +aWVgcKKataWaYttcccSaatgaaaacccWgcRctWatNcWtBRttYaattataaNgac +acaatagtttVNtataNaYtaatRaVWKtBatKagtaatataDaNaaaaataMtaagaaS +tccBcaatNgaataWtHaNactgtcDtRcYaaVaaaaaDgtttRatctatgHtgttKtga +aNSgatactttcgagWaaatctKaaDaRttgtggKKagcDgataaattgSaacWaVtaNM +acKtcaDaaatttctRaaVcagNacaScRBatatctRatcctaNatWgRtcDcSaWSgtt +RtKaRtMtKaatgttBHcYaaBtgatSgaSWaScMgatNtctcctatttctYtatMatMt +RRtSaattaMtagaaaaStcgVgRttSVaScagtgDtttatcatcatacRcatatDctta +tcatVRtttataaHtattcYtcaaaatactttgVctagtaaYttagatagtSYacKaaac +gaaKtaaatagataatSatatgaaatSgKtaatVtttatcctgKHaatHattagaaccgt 
+YaaHactRcggSBNgtgctaaBagBttgtRttaaattYtVRaaaattgtaatVatttctc +ttcatgBcVgtgKgaHaaatattYatagWacNctgaaMcgaattStagWaSgtaaKagtt +ttaagaDgatKcctgtaHtcatggKttVDatcaaggtYcgccagNgtgcVttttagagat +gctaccacggggtNttttaSHaNtatNcctcatSaaVgtactgBHtagcaYggYVKNgta +KBcRttgaWatgaatVtagtcgattYgatgtaatttacDacSctgctaaaStttaWMagD +aaatcaVYctccgggcgaVtaaWtStaKMgDtttcaaMtVgBaatccagNaaatcYRMBg +gttWtaaScKttMWtYataRaDBMaDataatHBcacDaaKDactaMgagttDattaHatH +taYatDtattDcRNStgaatattSDttggtattaaNSYacttcDMgYgBatWtaMagact +VWttctttgYMaYaacRgHWaattgRtaagcattctMKVStatactacHVtatgatcBtV +NataaBttYtSttacKgggWgYDtgaVtYgatDaacattYgatggtRDaVDttNactaSa +MtgNttaacaaSaBStcDctaccacagacgcaHatMataWKYtaYattMcaMtgSttDag +cHacgatcaHttYaKHggagttccgatYcaatgatRaVRcaagatcagtatggScctata +ttaNtagcgacgtgKaaWaactSgagtMYtcttccaKtStaacggMtaagNttattatcg +tctaRcactctctDtaacWYtgaYaSaagaWtNtatttRacatgNaatgttattgWDDcN +aHcctgaaHacSgaataaRaataMHttatMtgaSDSKatatHHaNtacagtccaYatWtc +actaactatKDacSaStcggataHgYatagKtaatKagStaNgtatactatggRHacttg +tattatgtDVagDVaRctacMYattDgtttYgtctatggtKaRSttRccRtaaccttaga +gRatagSaaMaacgcaNtatgaaatcaRaagataatagatactcHaaYKBctccaagaRa +BaStNagataggcgaatgaMtagaatgtcaKttaaatgtaWcaBttaatRcggtgNcaca +aKtttScRtWtgcatagtttWYaagBttDKgcctttatMggNttattBtctagVtacata +aaYttacacaaRttcYtWttgHcaYYtaMgBaBatctNgcDtNttacgacDcgataaSat +YaSttWtcctatKaatgcagHaVaacgctgcatDtgttaSataaaaYSNttatagtaNYt +aDaaaNtggggacttaBggcHgcgtNtaaMcctggtVtaKcgNacNtatVaSWctWtgaW +cggNaBagctctgaYataMgaagatBSttctatacttgtgtKtaattttRagtDtacata +tatatgatNHVgBMtKtaKaNttDHaagatactHaccHtcatttaaagttVaMcNgHata +tKtaNtgYMccttatcaaNagctggacStttcNtggcaVtattactHaSttatgNMVatt +MMDtMactattattgWMSgtHBttStStgatatRaDaagattttctatMtaaaaaggtac +taaVttaSacNaatactgMttgacHaHRttgMacaaaatagttaatatWKRgacDgaRta +tatttattatcYttaWtgtBRtWatgHaaattHataagtVaDtWaVaWtgStcgtMSgaS +RgMKtaaataVacataatgtaSaatttagtcgaaHtaKaatgcacatcggRaggSKctDc +agtcSttcccStYtccRtctctYtcaaKcgagtaMttttcRaYDttgttatctaatcata +NctctgctatcaMatactataggDaHaaSttMtaDtcNatataattctMcStaaBYtaNa 
+gatgtaatHagagSttgWHVcttatKaYgDctcttggtgttMcRaVgSgggtagacaata +aDtaattSaDaNaHaBctattgNtaccaaRgaVtKNtaaYggHtaKKgHcatctWtctDt +ttctttggSDtNtaStagttataaacaattgcaBaBWggHgcaaaBtYgctaatgaaatW +cDcttHtcMtWWattBHatcatcaaatctKMagtDNatttWaBtHaaaNgMttaaStagt +tctctaatDtcRVaYttgttMtRtgtcaSaaYVgSWDRtaatagctcagDgcWWaaaBaa +RaBctgVgggNgDWStNaNBKcBctaaKtttDcttBaaggBttgaccatgaaaNgttttt +tttatctatgttataccaaDRaaSagtaVtDtcaWatBtacattaWacttaSgtattggD +gKaaatScaattacgWcagKHaaccaYcRcaRttaDttRtttHgaHVggcttBaRgtccc +tDatKaVtKtcRgYtaKttacgtatBtStaagcaattaagaRgBagSaattccSWYttta +ttVaataNctgHgttaaNBgcVYgtRtcccagWNaaaacaDNaBcaaaaRVtcWMgBagM +tttattacgDacttBtactatcattggaaatVccggttRttcatagttVYcatYaSHaHc +ttaaagcNWaHataaaRWtctVtRYtagHtaaaYMataHYtNBctNtKaatattStgaMc +BtRgctaKtgcScSttDgYatcVtggaaKtaagatWccHccgKYctaNNctacaWctttt +gcRtgtVcgaKttcMRHgctaHtVaataaDtatgKDcttatBtDttggNtacttttMtga +acRattaaNagaactcaaaBBVtcDtcgaStaDctgaaaSgttMaDtcgttcaccaaaag +gWtcKcgSMtcDtatgtttStaaBtatagDcatYatWtaaaBacaKgcaDatgRggaaYc +taRtccagattDaWtttggacBaVcHtHtaacDacYgtaatataMagaatgHMatcttat +acgtatttttatattacHactgttataMgStYaattYaccaattgagtcaaattaYtgta +tcatgMcaDcgggtcttDtKgcatgWRtataatatRacacNRBttcHtBgcRttgtgcgt +catacMtttBctatctBaatcattMttMYgattaaVYatgDaatVagtattDacaacDMa +tcMtHcccataagatgBggaccattVWtRtSacatgctcaaggggYtttDtaaNgNtaaB +atggaatgtctRtaBgBtcNYatatNRtagaacMgagSaSDDSaDcctRagtVWSHtVSR +ggaacaBVaccgtttaStagaacaMtactccagtttVctaaRaaHttNcttagcaattta +ttaatRtaaaatctaacDaBttggSagagctacHtaaRWgattcaaBtctRtSHaNtgta +cattVcaHaNaagtataccacaWtaRtaaVKgMYaWgttaKggKMtKcgWatcaDatYtK +SttgtacgaccNctSaattcDcatcttcaaaDKttacHtggttHggRRaRcaWacaMtBW +VHSHgaaMcKattgtaRWttScNattBBatYtaNRgcggaagacHSaattRtttcYgacc +BRccMacccKgatgaacttcgDgHcaaaaaRtatatDtatYVtttttHgSHaSaatagct +NYtaHYaVYttattNtttgaaaYtaKttWtctaNtgagaaaNctNDctaaHgttagDcRt +tatagccBaacgcaRBtRctRtggtaMYYttWtgataatcgaataattattataVaaaaa +ttacNRVYcaaMacNatRttcKatMctgaagactaattataaYgcKcaSYaatMNctcaa +cgtgatttttBacNtgatDccaattattKWWcattttatatatgatBcDtaaaagttgaa 
+VtaHtaHHtBtataRBgtgDtaataMttRtDgDcttattNtggtctatctaaBcatctaR +atgNacWtaatgaagtcMNaacNgHttatactaWgcNtaStaRgttaaHacccgaYStac +aaaatWggaYaWgaattattcMaactcBKaaaRVNcaNRDcYcgaBctKaacaaaaaSgc +tccYBBHYaVagaatagaaaacagYtctVccaMtcgtttVatcaatttDRtgWctagtac +RttMctgtDctttcKtWttttataaatgVttgBKtgtKWDaWagMtaaagaaattDVtag +gttacatcatttatgtcgMHaVcttaBtVRtcgtaYgBRHatttHgaBcKaYWaatcNSc +tagtaaaaatttacaatcactSWacgtaatgKttWattagttttNaggtctcaagtcact +attcttctaagKggaataMgtttcataagataaaaatagattatDgcBVHWgaBKttDgc +atRHaagcaYcRaattattatgtMatatattgHDtcaDtcaaaHctStattaatHaccga +cNattgatatattttgtgtDtRatagSacaMtcRtcattcccgacacSattgttKaWatt +NHcaacttccgtttSRtgtctgDcgctcaaMagVtBctBMcMcWtgtaacgactctcttR +ggRKSttgYtYatDccagttDgaKccacgVatWcataVaaagaataMgtgataaKYaaat +cHDaacgataYctRtcYatcgcaMgtNttaBttttgatttaRtStgcaacaaaataccVg +aaDgtVgDcStctatatttattaaaaRKDatagaaagaKaaYYcaYSgKStctccSttac +agtcNactttDVttagaaagMHttRaNcSaRaMgBttattggtttaRMggatggcKDgWR +tNaataataWKKacttcKWaaagNaBttaBatMHtccattaacttccccYtcBcYRtaga +ttaagctaaYBDttaNtgaaaccHcaRMtKtaaHMcNBttaNaNcVcgVttWNtDaBatg +ataaVtcWKcttRggWatcattgaRagHgaattNtatttctctattaattaatgaDaaMa +tacgttgggcHaYVaaNaDDttHtcaaHtcVVDgBVagcMacgtgttaaBRNtatRtcag +taagaggtttaagacaVaaggttaWatctccgtVtaDtcDatttccVatgtacNtttccg +tHttatKgScBatgtVgHtYcWagcaKtaMYaaHgtaattaSaHcgcagtWNaatNccNN +YcacgVaagaRacttctcattcccRtgtgtaattagcSttaaStWaMtctNNcSMacatt +ataaactaDgtatWgtagtttaagaaaattgtagtNagtcaataaatttgatMMYactaa +tatcggBWDtVcYttcDHtVttatacYaRgaMaacaStaatcRttttVtagaDtcacWat +ttWtgaaaagaaagNRacDtttStVatBaDNtaactatatcBSMcccaSttccggaMatg +attaaWatKMaBaBatttgataNctgttKtVaagtcagScgaaaDggaWgtgttttKtWt +atttHaatgtagttcactaaKMagttSYBtKtaYgaactcagagRtatagtVtatcaaaW +YagcgNtaDagtacNSaaYDgatBgtcgataacYDtaaactacagWDcYKaagtttatta +gcatcgagttKcatDaattgattatDtcagRtWSKtcgNtMaaaaacaMttKcaWcaaSV +MaaaccagMVtaMaDtMaHaBgaacataBBVtaatVYaNSWcSgNtDNaaKacacBttta +tKtgtttcaaHaMctcagtaacgtcgYtactDcgcctaNgagagcYgatattttaaattt +ccattttacatttDaaRctattttWctttacgtDatYtttcagacgcaaVttagtaaKaa 
+aRtgVtccataBggacttatttgtttaWNtgttVWtaWNVDaattgtatttBaagcBtaa +BttaaVatcHcaVgacattccNggtcgacKttaaaRtagRtctWagaYggtgMtataatM +tgaaRttattttgWcttNtDRRgMDKacagaaaaggaaaRStcccagtYccVattaNaaK +StNWtgacaVtagaagcttSaaDtcacaacgDYacWDYtgtttKatcVtgcMaDaSKStV +cgtagaaWaKaagtttcHaHgMgMtctataagBtKaaaKKcactggagRRttaagaBaaN +atVVcgRcKSttDaactagtSttSattgttgaaRYatggttVttaataaHttccaagDtg +atNWtaagHtgcYtaactRgcaatgMgtgtRaatRaNaacHKtagactactggaatttcg +ccataacgMctRgatgttaccctaHgtgWaYcactcacYaattcttaBtgacttaaacct +gYgaWatgBttcttVttcgttWttMcNYgtaaaatctYgMgaaattacNgaHgaacDVVM +tttggtHtctaaRgtacagacgHtVtaBMNBgattagcttaRcttacaHcRctgttcaaD +BggttKaacatgKtttYataVaNattccgMcgcgtagtRaVVaattaKaatggttRgaMc +agtatcWBttNtHagctaatctagaaNaaacaYBctatcgcVctBtgcaaagDgttVtga +HtactSNYtaaNccatgtgDacgaVtDcgKaRtacDcttgctaagggcagMDagggtBWR +tttSgccttttttaacgtcHctaVtVDtagatcaNMaVtcVacatHctDWNaataRgcgt +aVHaggtaaaaSgtttMtattDgBtctgatSgtRagagYtctSaKWaataMgattRKtaa +catttYcgtaacacattRWtBtcggtaaatMtaaacBatttctKagtcDtttgcBtKYYB +aKttctVttgttaDtgattttcttccacttgSaaacggaaaNDaattcYNNaWcgaaYat +tttMgcBtcatRtgtaaagatgaWtgaccaYBHgaatagataVVtHtttVgYBtMctaMt +cctgaDcYttgtccaaaRNtacagcMctKaaaggatttacatgtttaaWSaYaKttBtag +DacactagctMtttNaKtctttcNcSattNacttggaacaatDagtattRtgSHaataat +gccVgacccgatactatccctgtRctttgagaSgatcatatcgDcagWaaHSgctYYWta +tHttggttctttatVattatcgactaagtgtagcatVgtgHMtttgtttcgttaKattcM +atttgtttWcaaStNatgtHcaaaDtaagBaKBtRgaBgDtSagtatMtaacYaatYtVc +KatgtgcaacVaaaatactKcRgtaYtgtNgBBNcKtcttaccttKgaRaYcaNKtactt +tgagSBtgtRagaNgcaaaNcacagtVtttHWatgttaNatBgtttaatNgVtctgaata +tcaRtattcttttttttRaaKcRStctcggDgKagattaMaaaKtcaHacttaataataK +taRgDtKVBttttcgtKaggHHcatgttagHggttNctcgtatKKagVagRaaaggaaBt +NatttVKcRttaHctaHtcaaatgtaggHccaBataNaNaggttgcWaatctgatYcaaa +HaatWtaVgaaBttagtaagaKKtaaaKtRHatMaDBtBctagcatWtatttgWttVaaa +ScMNattRactttgtYtttaaaagtaagtMtaMaSttMBtatgaBtttaKtgaatgagYg +tNNacMtcNRacMMHcttWtgtRtctttaacaacattattcYaMagBaacYttMatcttK +cRMtgMNccattaRttNatHaHNaSaaHMacacaVaatacaKaSttHatattMtVatWga 
+ttttttaYctttKttHgScWaacgHtttcaVaaMgaacagNatcgttaacaaaaagtaca +HBNaattgttKtcttVttaaBtctgctacgBgcWtttcaggacacatMgacatcccagcg +gMgaVKaBattgacttaatgacacacaaaaaatRKaaBctacgtRaDcgtagcVBaacDS +BHaaaaSacatatacagacRNatcttNaaVtaaaataHattagtaaaaSWccgtatWatg +gDttaactattgcccatcttHaSgYataBttBaactattBtcHtgatcaataSttaBtat +KSHYttWggtcYtttBttaataccRgVatStaHaKagaatNtagRMNgtcttYaaSaact +cagDSgagaaYtMttDtMRVgWKWtgMaKtKaDttttgactatacataatcNtatNaHat +tVagacgYgatatatttttgtStWaaatctWaMgagaRttRatacgStgattcttaagaD +taWccaaatRcagcagaaNKagtaaDggcgccBtYtagSBMtactaaataMataBSacRM +gDgattMMgtcHtcaYDtRaDaacggttDaggcMtttatgttaNctaattaVacgaaMMt +aatDccSgtattgaRtWWaccaccgagtactMcgVNgctDctaMScatagcgtcaactat +acRacgHRttgctatttaatgaattataYKttgtaagWgtYttgcHgMtaMattWaWVta +RgcttgYgttBHtYataSccStBtgtagMgtDtggcVaaSBaatagDttgBgtctttctc +attttaNagtHKtaMWcYactVcgcgtatMVtttRacVagDaatcttgctBBcRDgcaac +KttgatSKtYtagBMagaRtcgBattHcBWcaactgatttaatttWDccatttatcgagS +KaWttataHactaHMttaatHtggaHtHagaatgtKtaaRactgtttMatacgatcaagD +gatKaDctataMggtHDtggHacctttRtatcttYattttgacttgaaSaataaatYcgB +aaaaccgNatVBttMacHaKaataagtatKgtcaagactcttaHttcggaattgttDtct +aaccHttttWaaatgaaatataaaWattccYDtKtaaaacggtgaggWVtctattagtga +ctattaagtMgtttaagcatttgSgaaatatccHaaggMaaaattttcWtatKctagDtY +tMcctagagHcactttactatacaaacattaacttaHatcVMYattYgVgtMttaaRtga +aataaDatcaHgtHHatKcDYaatcttMtNcgatYatgSaMaNtcttKcWataScKggta +tcttacgcttWaaagNatgMgHtctttNtaacVtgttcMaaRatccggggactcMtttaY +MtcWRgNctgNccKatcttgYDcMgattNYaRagatHaaHgKctcataRDttacatBatc +cattgDWttatttaWgtcggagaaaaatacaatacSNtgggtttccttacSMaagBatta +caMaNcactMttatgaRBacYcYtcaaaWtagctSaacttWgDMHgaggatgBVgcHaDt +ggaactttggtcNatNgtaKaBcccaNtaagttBaacagtatacDYttcctNgWgcgSMc +acatStctHatgRcNcgtacacaatRttMggaNKKggataaaSaYcMVcMgtaMaHtgat +tYMatYcggtcttcctHtcDccgtgRatcattgcgccgatatMaaYaataaYSggatagc +gcBtNtaaaScaKgttBgagVagttaKagagtatVaactaSacWactSaKatWccaKaaa +atBKgaaKtDMattttgtaaatcRctMatcaaMagMttDgVatggMaaWgttcgaWatga +aatttgRtYtattaWHKcRgctacatKttctaccaaHttRatctaYattaaWatVNccat 
+NgagtcKttKataStRaatatattcctRWatDctVagttYDgSBaatYgttttgtVaatt +taatagcagMatRaacttBctattgtMagagattaaactaMatVtHtaaatctRgaaaaa +aaatttWacaacaYccYDSaattMatgaccKtaBKWBattgtcaagcHKaagttMMtaat +ttcKcMagNaaKagattggMagaggtaatttYacatcWaaDgatMgKHacMacgcVaaca +DtaDatatYggttBcgtatgWgaSatttgtagaHYRVacaRtctHaaRtatgaactaata +tctSSBgggaaHMWtcaagatKgagtDaSatagttgattVRatNtctMtcSaagaSHaat +aNataataRaaRgattctttaataaagWaRHcYgcatgtWRcttgaaggaMcaataBRaa +ccagStaaacNtttcaatataYtaatatgHaDgcStcWttaacctaRgtYaRtataKtgM +ttttatgactaaaatttacYatcccRWtttHRtattaaatgtttatatttgttYaatMca +RcSVaaDatcgtaYMcatgtagacatgaaattgRtcaaYaaYtRBatKacttataccaNa +aattVaBtctggacaagKaaYaaatatWtMtatcYaaVNtcgHaactBaagKcHgtctac +aatWtaDtSgtaHcataHtactgataNctRgttMtDcDttatHtcgtacatcccaggStt +aBgtcacacWtccNMcNatMVaVgtccDYStatMaccDatggYaRKaaagataRatttHK +tSaaatDgataaacttaHgttgVBtcttVttHgDacgaKatgtatatNYataactctSat +atatattgcHRRYttStggaactHgttttYtttaWtatMcttttctatctDtagVHYgMR +BgtHttcctaatYRttKtaagatggaVRataKDctaMtKBNtMtHNtWtttYcVtattMc +gRaacMcctNSctcatttaaagDcaHtYccSgatgcaatYaaaaDcttcgtaWtaattct +cgttttScttggtaatctttYgtctaactKataHacctMctcttacHtKataacacagcN +RatgKatttttSaaatRYcgDttaMRcgaaattactMtgcgtaagcgttatBtttttaat +taagtNacatHgttcRgacKcBBtVgatKttcgaBaatactDRgtRtgaNacWtcacYtt +aaKcgttctHaKttaNaMgWgWaggtctRgaKgWttSttBtDcNtgtttacaaatYcDRt +gVtgcctattcNtctaaaDMNttttNtggctgagaVctDaacVtWccaagtaacacaNct +gaScattccDHcVBatcgatgtMtaatBgHaatDctMYgagaatgYWKcctaatNaStHa +aaKccgHgcgtYaaYtattgtStgtgcaaRtattaKatattagaWVtcaMtBagttatta +gNaWHcVgcaattttDcMtgtaRHVYtHtctgtaaaaHVtMKacatcgNaatttMatatg +ttgttactagWYtaRacgataKagYNKcattataNaRtgaacKaYgcaaYYacaNccHat +MatDcNgtHttRaWttagaaDcaaaaaatagggtKDtStaDaRtaVtHWKNtgtattVct +SVgRgataDaRaWataBgaagaaKtaataaYgDcaStaNgtaDaaggtattHaRaWMYaY +aWtggttHYgagVtgtgcttttcaaDKcagVcgttagacNaaWtagtaataDttctggtt +VcatcataaagtgKaaaNaMtaBBaattaatWaattgctHaVKaSgDaaVKaHtatatat +HatcatSBagNgHtatcHYMHgttDgtaHtBttWatcgtttaRaattgStKgSKNWKatc +agDtctcagatttctRtYtBatBgHHtKaWtgYBgacVVWaKtacKcDttKMaKaVcggt 
+gttataagaataaHaatattagtataatMHgttYgaRttagtaRtcaaVatacggtcMcg +agtaaRttacWgactKRYataaaagSattYaWgagatYagKagatgSaagKgttaatMgg +tataatgttWYttatgagaaacctNVataatHcccKtDctcctaatactggctHggaSag +gRtKHaWaattcgSatMatttagaggcYtctaMcgctcataSatatgRagacNaaDagga +VBagaYttKtacNaKgtSYtagttggaWcatcWttaatctatgaVtcgtgtMtatcaYcg +tRccaaYgDctgcMgtgtWgacWtgataacacgcgctBtgttaKtYDtatDcatcagKaV +MctaatcttgVcaaRgcRMtDcgattaHttcaNatgaatMtactacVgtRgatggaWttt +actaaKatgagSaaKggtaNtactVaYtaaKRagaacccacaMtaaMtKtatBcttgtaa +WBtMctaataaVcDaaYtcRHBtcgttNtaaHatttBNgRStVDattBatVtaagttaYa +tVattaagaBcacggtSgtVtatttaRattgatgtaHDKgcaatattKtggcctatgaWD +KRYcggattgRctatNgatacaatMNttctgtcRBYRaaaHctNYattcHtaWcaattct +BtMKtVgYataatMgYtcagcttMDataVtggRtKtgaatgccNcRttcaMtRgattaac +attRcagcctHtWMtgtDRagaKaBtgDttYaaaaKatKgatctVaaYaacWcgcatagB +VtaNtRtYRaggBaaBtgKgttacataagagcatgtRattccacttaccatRaaatgWgD +aMHaYVgVtaSctatcgKaatatattaDgacccYagtgtaYNaaatKcagtBRgagtcca +tgKgaaaccBgaagBtgSttWtacgatWHaYatcgatttRaaNRgcaNaKVacaNtDgat +tgHVaatcDaagcgtatgcNttaDataatcSataaKcaataaHWataBtttatBtcaKtK +tatagttaDgSaYctacaRatNtaWctSaatatttYaKaKtaccWtatcRagacttaYtt +VcKgSDcgagaagatccHtaattctSttatggtKYgtMaHagVaBRatttctgtRgtcta +tgggtaHKgtHacHtSYacgtacacHatacKaaBaVaccaDtatcSaataaHaagagaat +ScagactataaRttagcaaVcaHataKgDacatWccccaagcaBgagWatctaYttgaaa +tctVNcYtttWagHcgcgcDcVaaatgttKcHtNtcaatagtgtNRaactttttcaatgg +WgBcgDtgVgtttctacMtaaataaaRggaaacWaHttaRtNtgctaaRRtVBctYtVta +tDcattDtgaccYatagatYRKatNYKttNgcctagtaWtgaactaMVaacctgaStttc +tgaKVtaaVaRKDttVtVctaDNtataaaDtccccaagtWtcgatcactDgYaBcatcct +MtVtacDaaBtYtMaKNatNtcaNacgDatYcatcgcaRatWBgaacWttKttagYtaat +tcggttgSWttttDWctttacYtatatWtcatDtMgtBttgRtVDggttaacYtacgtac +atgaattgaaWcttMStaDgtatattgaDtcRBcattSgaaVBRgagccaaKtttcDgcg +aSMtatgWattaKttWtgDBMaggBBttBaatWttRtgcNtHcgttttHtKtcWtagHSt +aacagttgatatBtaWSaWggtaataaMttaKacDaatactcBttcaatatHttcBaaSa +aatYggtaRtatNtHcaatcaHtagVtgtattataNggaMtcttHtNagctaaaggtaga +YctMattNaMVNtcKtactBKcaHHcBttaSagaKacataYgctaKaYgttYcgacWVtt 
+WtSagcaacatcccHaccKtcttaacgaKttcacKtNtacHtatatRtaaatacactaBt +ttgaHaRttggttWtatYagcatYDatcggagagcWBataagRtacctataRKgtBgatg +aDatataSttagBaHtaatNtaDWcWtgtaattacagKttcNtMagtattaNgtctcgtc +ctcttBaHaKcKccgtRcaaYagSattaagtKataDatatatagtcDtaacaWHcaKttD +gaaRcgtgYttgtcatatNtatttttatggccHtgDtYHtWgttatYaacaattcaWtat +NgctcaaaSttRgctaatcaaatNatcgtttaBtNNVtgttataagcaaagattBacgtD +atttNatttaaaDcBgtaSKgacgtagataatttcHMVNttgttBtDtgtaWKaaRMcKM +tHtaVtagataWctccNNaSWtVaHatctcMgggDgtNHtDaDttatatVWttgttattt +aacctttcacaaggaSaDcggttttttatatVtctgVtaacaStDVaKactaMtttaSNa +gtgaaattaNacttSKctattcctctaSagKcaVttaagNaVcttaVaaRNaHaaHttat +gtHttgtgatMccaggtaDcgaccgtWgtWMtttaHcRtattgScctatttKtaaccaag +tYagaHgtWcHaatgccKNRtttagtMYSgaDatctgtgaWDtccMNcgHgcaaacNDaa +aRaStDWtcaaaaHKtaNBctagBtgtattaactaattttVctagaatggcWSatMaccc +ttHttaSgSgtgMRcatRVKtatctgaaaccDNatYgaaVHNgatMgHRtacttaaaRta +tStRtDtatDttYatattHggaBcttHgcgattgaKcKtttcRataMtcgaVttWacatN +catacctRataDDatVaWNcggttgaHtgtMacVtttaBHtgagVttMaataattatgtt +cttagtttgtgcDtSatttgBtcaacHattaaBagVWcgcaSYttMgcttacYKtVtatc +aYaKctgBatgcgggcYcaaaaacgNtctagKBtattatctttKtaVttatagtaYtRag +NtaYataaVtgaatatcHgcaaRataHtacacatgtaNtgtcgYatWMatttgaactacR +ctaWtWtatacaatctBatatgYtaagtatgtgtatSttactVatcttYtaBcKgRaSgg +RaaaaatgcagtaaaWgtaRgcgataatcBaataccgtatttttccatcNHtatWYgatH +SaaaDHttgctgtccHtggggcctaataatttttctatattYWtcattBtgBRcVttaVM +RSgctaatMagtYtttaaaaatBRtcBttcaaVtaacagctccSaaSttKNtHtKYcagc +agaaaccccRtttttaaDcDtaStatccaagcgctHtatcttaDRYgatDHtWcaaaBcW +gKWHttHataagHacgMNKttMKHccaYcatMVaacgttaKgYcaVaaBtacgcaacttt +MctaaHaatgtBatgagaSatgtatgSRgHgWaVWgataaatatttccKagVgataattW +aHNcYggaaatgctHtKtaDtctaaagtMaatVDVactWtSaaWaaMtaHtaSKtcBRaN +cttStggtBttacNagcatagRgtKtgcgaacaacBcgKaatgataagatgaaaattgta +ctgcgggtccHHWHaaNacaBttNKtKtcaaBatatgctaHNgtKcDWgtttatNgVDHg +accaacWctKaaggHttgaRgYaatHcaBacaatgagcaaattactgtaVaaYaDtagat +tgagNKggtggtgKtWKaatacagDRtatRaMRtgattDggtcaaYRtatttNtagaDtc +acaaSDctDtataatcgtactaHttatacaatYaacaaHttHatHtgcgatRRttNgcat 
+SVtacWWgaaggagtatVMaVaaattScDDKNcaYBYaDatHgtctatBagcaacaagaa +tgagaaRcataaKNaRtBDatcaaacgcattttttaaBtcSgtacaRggatgtMNaattg +gatatWtgagtattaaaVctgcaYMtatgatttttYgaHtgtcttaagWBttHttgtctt +attDtcgtatWtataataSgctaHagcDVcNtaatcaagtaBDaWaDgtttagYctaNcc +DtaKtaHcttaataacccaRKtacaVaatNgcWRaMgaattatgaBaaagattVYaHMDc +aDHtcRcgYtcttaaaWaaaVKgatacRtttRRKYgaatacaWVacVcRtatMacaBtac +tggMataaattttHggNagSctacHgtBagcgtcgtgattNtttgatSaaggMttctttc +ttNtYNagBtaaacaaatttMgaccttacataattgYtcgacBtVMctgStgMDtagtaR +ctHtatgttcatatVRNWataDKatWcgaaaaagttaaaagcacgHNacgtaatctttMR +tgacttttDacctataaacgaaatatgattagaactccSYtaBctttaataacWgaaaYa +tagatgWttcatKtNgatttttcaagHtaYgaaRaDaagtaggagcttatVtagtctttc +attaaaatcgKtattaRttacagVaDatgcatVgattgggtctttHVtagKaaRBtaHta +aggccccaaaaKatggtttaMWgtBtaaacttcactttKHtcgatctccctaYaBacMgt +cttBaBaNgcgaaacaatctagtHccHtKttcRtRVttccVctttcatacYagMVtMcag +aMaaacaataBctgYtaatRaaagattaaccatVRatHtaRagcgcaBcgDttStttttc +VtttaDtKgcaaWaaaaatSccMcVatgtKgtaKgcgatatgtagtSaaaDttatacaaa +catYaRRcVRHctKtcgacKttaaVctaDaatgttMggRcWaacttttHaDaKaDaBctg +taggcgtttaHBccatccattcNHtDaYtaataMttacggctNVaacDattgatatttta +cVttSaattacaaRtataNDgacVtgaacataVRttttaDtcaaacataYDBtttaatBa +DtttYDaDaMccMttNBttatatgagaaMgaNtattHccNataattcaHagtgaaggDga +tgtatatatgYatgaStcataaBStWacgtcccataRMaaDattggttaaattcMKtctM +acaBSactcggaatDDgatDgcWctaacaccgggaVcacWKVacggtaNatatacctMta +tgatagtgcaKagggVaDtgtaacttggagtcKatatcgMcttRaMagcattaBRaStct +YSggaHYtacaactMBaagDcaBDRaaacMYacaHaattagcattaaaHgcgctaaggSc +cKtgaaKtNaBtatDDcKBSaVtgatVYaagVtctSgMctacgttaacWaaattctSgtD +actaaStaaattgcagBBRVctaatatacctNttMcRggctttMttagacRaHcaBaacV +KgaataHttttMgYgattcYaNRgttMgcVaaacaVVcDHaatttgKtMYgtatBtVVct +WgVtatHtacaaHttcacgatagcagtaaNattBatatatttcVgaDagcggttMaagtc +ScHagaaatgcYNggcgtttttMtStggtRatctacttaaatVVtBacttHNttttaRca +aatcacagHgagagtMgatcSWaNRacagDtatactaaDKaSRtgattctccatSaaRtt +aaYctacacNtaRtaactggatgaccYtacactttaattaattgattYgttcagDtNKtt +agDttaaaaaaaBtttaaNaYWKMBaaaacVcBMtatWtgBatatgaacVtattMtYatM 
+NYDKNcKgDttDaVtaaaatgggatttctgtaaatWtctcWgtVVagtcgRgacttcccc +taDcacagcRcagagtgtWSatgtacatgttaaSttgtaaHcgatgggMagtgaacttat +RtttaVcaccaWaMgtactaatSSaHtcMgaaYtatcgaaggYgggcgtgaNDtgttMNg +aNDMtaattcgVttttaacatgVatgtWVMatatcaKgaaattcaBcctccWcttgaaWH +tWgHtcgNWgaRgctcBgSgaattgcaaHtgattgtgNagtDttHHgBttaaWcaaWagc +aSaHHtaaaVctRaaMagtaDaatHtDMtcVaWMtagSagcttHSattaacaaagtRacM +tRtctgttagcMtcaBatVKtKtKacgagaSNatSactgtatatcBctgagVtYactgta +aattaaaggcYgDHgtaacatSRDatMMccHatKgttaacgactKtgKagtcttcaaHRV +tccttKgtSataatttacaactggatDNgaacttcaRtVaagDcaWatcBctctHYatHa +DaaatttagYatSatccaWtttagaaatVaacBatHcatcgtacaatatcgcNYRcaata +YaRaYtgattVttgaatgaVaactcRcaNStgtgtattMtgaggtNttBaDRcgaaaagc +tNgBcWaWgtSaDcVtgVaatMKBtttcgtttctaaHctaaagYactgMtatBDtcStga +ccgtSDattYaataHctgggaYYttcggttaWaatctggtRagWMaDagtaacBccacta +cgHWMKaatgatWatcctgHcaBaSctVtcMtgtDttacctaVgatYcWaDRaaaaRtag +atcgaMagtggaRaWctctgMgcWttaagKBRtaaDaaWtctgtaagYMttactaHtaat +cttcataacggcacBtSgcgttNHtgtHccatgttttaaagtatcgaKtMttVcataYBB +aKtaMVaVgtattNDSataHcagtWMtaggtaSaaKgttgBtVtttgttatcatKcgHac +acRtctHatNVagSBgatgHtgaRaSgttRcctaacaaattDNttgacctaaYtBgaaaa +tagttattactcttttgatgtNNtVtgtatMgtcttRttcatttgatgacacttcHSaaa +ccaWWDtWagtaRDDVNacVaRatgttBccttaatHtgtaaacStcVNtcacaSRttcYa +gacagaMMttttgMcNttBcgWBtactgVtaRttctccaaYHBtaaagaBattaYacgat +ttacatctgtaaMKaRYtttttactaaVatWgctBtttDVttctggcDaHaggDaagtcg +aWcaagtagtWttHtgKtVataStccaMcWcaagataagatcactctHatgtcYgaKcat +cagatactaagNSStHcctRRNtattgtccttagttagMVgtatagactaactctVcaat +MctgtttgtgttgccttatWgtaBVtttctggMcaaKgDWtcgtaaYStgSactatttHg +atctgKagtagBtVacRaagRtMctatgggcaaaKaaaatacttcHctaRtgtDcttDat +taggaaatttcYHaRaaBttaatggcacKtgctHVcaDcaaaVDaaaVcgMttgtNagcg +taDWgtcgttaatDgKgagcSatatcSHtagtagttggtgtHaWtaHKtatagctgtVga +ttaBVaatgaataagtaatVatSttaHctttKtttgtagttaccttaatcgtagtcctgB +cgactatttVcMacHaaaggaatgDatggKtaHtgStatattaaSagctWcctccRtata +BaDYcgttgcNaagaggatRaaaYtaWgNtSMcaatttactaacatttaaWttHtatBat +tgtcgacaatNgattgcNgtMaaaKaBDattHacttggtRtttaYaacgVactBtaBaKt 
+gBttatgVttgtVttcaatcWcNctDBaaBgaDHacBttattNtgtDtatttVSaaacag +gatgcRatSgtaSaNtgBatagttcHBgcBBaaattaHgtDattatDaKaatBaaYaaMa +ataaataKtttYtagtBgMatNcatgtttgaNagtgttgtgKaNaSagtttgaSMaYBca +aaacDStagttVacaaaaactaaWttBaagtctgtgcgtMgtaattctcctacctcaNtt +taaccaaaaVtBcacataacaccccBcWMtatVtggaatgaWtcaaWaaaaaaaaWtDta +atatRcctDWtcctaccMtVVatKttaWaaKaaatataaagScHBagaggBaSMtaWaVt +atattactSaaaKNaactatNatccttgaYctattcaaaVgatttYHcRagattttaSat +aggttattcVtaaagaKgtattattKtRttNcggcRgtgtgtWYtaacHgKatKgatYta +cYagDtWcHBDctctgRaYKaYagcactKcacSaRtBttttBHKcMtNtcBatttatttt +tgSatVgaaagaWtcDtagDatatgMacaacRgatatatgtttgtKtNRaatatNatgYc +aHtgHataacKtgagtagtaacYttaNccaaatHcacaacaVDtagtaYtccagcattNt +acKtBtactaaagaBatVtKaaHBctgStgtBgtatgaSNtgDataaccctgtagcaBgt +gatcttaDataStgaMaccaSBBgWagtacKcgattgaDgNNaaaacacagtSatBacKD +gcgtataBKcatacactaSaatYtYcDaactHttcatRtttaatcaattataRtttgtaa +gMcgNttcatcBtYBagtNWNMtSHcattcRctttttRWgaKacKttgggagBcgttcgc +MaWHtaatactgtctctatttataVgtttaBScttttaBMaNaatMacactYtBMggtHa +cMagtaRtctgcatttaHtcaaaatttgagKtgNtactBacaHtcgtatttctMaSRagc +agttaatgtNtaaattgagagWcKtaNttagVtacgatttgaatttcgRtgtWcVatcgt +taaDVctgtttBWgaccagaaagtcSgtVtatagaBccttttcctaaattgHtatcggRa +ttttcaaggcYSKaagWaWtRactaaaacccBatMtttBaatYtaagaactSttcgaaSc +aatagtattgaccaagtgttttctaacatgtttNVaatcaaagagaaaNattaaRtttta +VaaaccgcaggNMtatattVctcaagaggaacgBgtttaacaagttcKcYaatatactaa +ccBaaaSggttcNtattctagttRtBacgScVctcaatttaatYtaaaaaaatgSaatga +tagaMBRatgRcMcgttgaWHtcaVYgaatYtaatctttYttatRaWtctgBtDcgatNa +tcKaBaDgatgtaNatWKctccgatattaacattNaaacDatgBgttctgtDtaaaMggt +gaBaSHataacgccSctaBtttaRBtcNHcDatcDcctagagtcRtaBgWttDRVHagat +tYatgtatcWtaHtttYcattWtaaagtctNgtStggRNcgcggagSSaaagaaaatYcH +DtcgctttaatgYcKBVSgtattRaYBaDaaatBgtatgaHtaaRaRgcaSWNtagatHa +acttNctBtcaccatctMcatattccaSatttgcgaDagDgtatYtaaaVDtaagtttWV +aagtagYatRttaagDcNgacKBcScagHtattatcDaDactaaaaaYgHttBcgaDttg +gataaaKSRcBMaBcgaBSttcWtgNBatRaccgattcatttataacggHVtaattcaca +agagVttaaRaatVVRKcgWtVgacctgDgYaaHaWtctttcacMagggatVgactagMa 
+aataKaaNWagKatagNaaWtaaaatttgaattttatttgctaaVgaHatBatcaaBWcB +gttcMatcgBaaNgttcgSNaggSaRtttgHtRtattaNttcDcatSaVttttcgaaaaa +ttgHatctaRaggSaNatMDaaatDcacgattttagaHgHaWtYgattaatHNSttatMS +gggNtcKtYatRggtttgtMWVtttaYtagcagBagHaYagttatatggtBacYcattaR +SataBatMtttaaatctHcaaaSaaaagttNSaaWcWRccRtKaagtBWtcaaattSttM +tattggaaaccttaacgttBtWatttatatWcDaatagattcctScacctaagggRaaYt +aNaatgVtBcttaaBaacaMVaaattatStYgRcctgtactatcMcVKatttcgSgatRH +MaaaHtagtaaHtVgcaaataatatcgKKtgccaatBNgaaWcVttgagttaKatagttc +aggKDatDtattgaKaVcaKtaataDataataHSaHcattagttaatRVYcNaHtaRcaa +ggtNHcgtcaaccaBaaagYtHWaaaRcKgaYaaDttgcWYtataRgaatatgtYtgcKt +aNttWacatYHctRaDtYtattcBttttatcSataYaYgttWaRagcacHMgtttHtYtt +YaatcggtatStttcgtRSattaaDaKMaatatactaNBaWgctacacYtgaYVgtgHta +aaRaaRgHtagtWattataaaSDaaWtgMattatcgaaaagtaYRSaWtSgNtBgagcRY +aMDtactaacttaWgtatctagacaagNtattHggataatYttYatcataDcgHgttBtt +ctttVttgccgaaWtaaaacgKgtatctaaaaaNtccDtaDatBMaMggaatNKtatBaa +atVtccRaHtaSacataHattgtttKVYattcataVaattWtcgtgMttcttKtgtctaa +cVtatctatatBRataactcgKatStatattcatHHRttKtccaacgtgggtgRgtgaMt +attattggctatcgtgacMtRcBDtcttgtactaatRHttttaagatcgVMDStattatY +BtttDttgtBtNttgRcMtYtgBacHaWaBaatDKctaagtgaaactaatgRaaKgatcc +aagNaaaatattaggWNtaagtatacttttKcgtcggSYtcttgRctataYcttatataa +agtatattaatttataVaacacaDHatctatttttKYVatHRactttaBHccaWagtact +BtcacgaVgcgttRtttttttSVgtSagtBaaattctgaHgactcttgMcattttagVta +agaattHctHtcaDaaNtaacRggWatagttcgtSttgaDatcNgNagctagDgatcNtt +KgttgtaDtctttRaaYStRatDtgMggactSttaDtagSaVtBDttgtDgccatcacaM +attaaaMtNacaVcgSWcVaaDatcaHaatgaattaMtatccVtctBtaattgtWattat +BRcWcaatgNNtactWYtDaKttaaatcactcagtRaaRgatggtKgcgccaaHgaggat +StattYcaNMtcaBttacttatgagDaNtaMgaaWtgtttcttctaHtMNgttatctaWW +atMtBtaaatagDVatgtBYtatcggcttaagacMRtaHScgatatYgRDtcattatSDa +HggaaataNgaWSRRaaaBaatagBattaDctttgHWNttacaataaaaaaatacggttt +gHgVtaHtWMttNtBtctagtMcgKMgHgYtataHaNagWtcaacYattaataYRgtaWK +gaBctataaccgatttaHaNBRaRaMtccggtNgacMtctcatttgcaattcWgMactta +caaDaaNtactWatVtttagccttMaatcagVaagtctVaaDaBtattaattaYtNaYtg 
+gattaKtaKctYaMtattYgatattataatKtVgDcttatatNBtcgttgtStttttMag +aggttaHYSttcKgtcKtDNtataagttataagSgttatDtRttattgttttSNggRtca +aKMNatgaatattgtBWtaMacctgggYgaSgaagYataagattacgagaatBtggtRcV +HtgYggaDgaYaKagWagctatagacgaaHgtWaNgacttHRatVaWacKYtgRVNgVcS +gRWctacatcKSactctgWYtBggtataagcttNRttVtgRcaWaaatDMatYattaact +ttcgaagRatSctgccttgcRKaccHtttSNVagtagHagBagttagaccaRtataBcca +taatSHatRtcHagacBWatagcaMtacaRtgtgaaBatctKRtScttccaNaatcNgta +atatWtcaMgactctBtWtaaNactHaaaaRctcgcatggctMcaaNtcagaaaaacaca +gtggggWttRttagtaagaVctVMtcgaatcttcMaaaHcaHBttcgattatgtcaDagc +YRtBtYcgacMgtDcagcgaNgttaataatagcagKYYtcgtaBtYctMaRtaRtDagaa +aacacatgYaBttgattattcgaaNttBctSataaMataWRgaHtttccgtDgaYtatgg +tDgHKgMtatttVtMtVagttaRatMattRagataaccctKctMtSttgaHagtcStcta +tttccSagatgttccacgaggYNttHRacgattcDatatDcataaaatBBttatcgaHtN +HaaatatDNaggctgaNcaaggagttBttMgRagVatBcRtaWgatgBtSgaKtcgHttt +gaatcaaDaHttcSBgHcagtVaaSttDcagccgttNBtgttHagYtattctttRWaaVt +SttcatatKaaRaaaNacaVtVctMtSDtDtRHRcgtaatgctcttaaatSacacaatcg +HattcaWcttaaaatHaaatcNctWttaNMcMtaKctVtcctaagYgatgatcYaaaRac +tctaRDaYagtaacgtDgaggaaatctcaaacatcaScttcKttNtaccatNtaNataca +tttHaaDHgcaDatMWaaBttcRggctMaagctVYcacgatcaDttatYtaatcKatWat +caatVYtNagatttgattgaYttttYgacttVtcKaRagaaaHVgDtaMatKYagagttN +atWttaccNtYtcDWgSatgaRgtMatgKtcgacaagWtacttaagtcgKtgatccttNc +ttatagMatHVggtagcgHctatagccctYttggtaattKNaacgaaYatatVctaataM +aaaYtgVtcKaYtaataacagaatHcacVagatYWHttagaaSMaatWtYtgtaaagNaa +acaVgaWtcacNWgataNttcaSagctMDaRttgNactaccgataMaaatgtttattDtc +aagacgctDHYYatggttcaagccNctccttcMctttagacBtaaWtaWVHggaaaaNat +ttaDtDtgctaaHHtMtatNtMtagtcatttgcaaaRatacagRHtatDNtgtDgaatVg +tVNtcaaatYBMaaaagcaKgtgatgatMgWWMaHttttMgMagatDtataaattaacca +actMtacataaattgRataatacgBtKtaataattRgtatDagDtcRDacctatRcagag +cSHatNtcaScNtttggacNtaaggaccgtgKNttgttNcttgaaRgYgRtNtcagttBc +ttttcHtKtgcttYaaNgYagtaaatgaatggWaMattBHtatctatSgtcYtgcHtaat +tHgaaMtHcagaaSatggtatgccaHBtYtcNattWtgtNgctttaggtttgtWatNtgH +tgcDttactttttttgcNtactKtWRaVcttcatagtgSNKaNccgaataaBttataata 
+YtSagctttaaatSttggctaaKSaatRccgWHgagDttaaatcatgagMtcgagtVtaD +ggaBtatttgDacataaacgtagYRagBWtgDStKDgatgaagttcattatttaKWcata +aatWRgatataRgttRacaaNKttNtKagaaYaStaactScattattaacgatttaaatg +DtaattagatHgaYataaactatggggatVHtgccgtNgatNYcaStRtagaccacWcaM +tatRagHgVactYtWHtcttcatgatWgagaKggagtatgaWtDtVtNaNtcgYYgtaaa +ctttaDtBactagtaDctatagtaatatttatatataacgHaaaRagKattSagttYtSt +>THREE Homo sapiens frequency +agagagacgatgaaaattaatcgtcaatacgctggcgaacactgagggggacccaatgct +cttctcggtctaaaaaggaatgtgtcagaaattggtcagttcaaaagtagaccggatctt +tgcggagaacaattcacggaacgtagcgttgggaaatatcctttctaccacacatcggat +tttcgccctctcccattatttattgtgttctcacatagaattattgtttagacatccctc +gttgtatggagagttgcccgagcgtaaaggcataatccatataccgccgggtgagtgacc +tgaaattgtttttagttgggatttcgctatggattagcttacacgaagagattctaatgg +tactataggataattataatgctgcgtggcgcagtacaccgttacaaacgtcgttcgcat +atgtggctaacacggtgaaaatacctacatcgtatttgcaatttcggtcgtttcatagag +cgcattgaattactcaaaaattatatatgttgattatttgattagactgcgtggaaagaa +ggggtactcaagccatttgtaaaagctgcatctcgcttaagtttgagagcttacattagt +ctatttcagtcttctaggaaatgtctgtgtgagtggttgtcgtccataggtcactggcat +atgcgattcatgacatgctaaactaagaaagtagattactattaccggcatgcctaatgc +gattgcactgctatgaaggtgcggacgtcgcgcccatgtagccctgataataccaatact +tacatttggtcagcaattctgacattatacctagcacccataaatttactcagacttgag +gacaggctcttggagtcgatcttctgtttgtatgcatgtgatcatatagatgaataagcg +atgcgactagttagggcatagtatagatctgtgtatacagttcagctgaacgtccgcgag +tggaagtacagctgagatctatcctaaaatgcaaccatatcgttcacacatgatatgaac +ccagggggaaacattgagttcagttaaattggcagcgaatcccccaagaagaaggcggag +tgacgttgaacgggcttatggtttttcagtacttcctccgtataagttgagcgaaatgta +aacagaataatcgttgtgttaacaacattaaaatcgcggaatatgatgagaatacacagt +gtgagcatttcacttgtaaaatatctttggtagaacttactttgctttaaatatgttaaa +ccgatctaataatctacaaaacggtagattttgcctagcacattgcgtccttctctattc +agatagaggcaatactcagaaggttttatccaaagcactgtgttgactaacctaagtttt +agtctaataatcatgattgattataggtgccgtggactacatgactcgtccacaaataat +acttagcagatcagcaattggccaagcacccgacttttatttaatggttgtgcaatagtc 
+cagattcgtattcgggactctttcaaataatagtttcctggcatctaagtaagaaaagct +cataaggaagcgatattatgacacgctcttccgccgctgttttgaaacttgagtattgct +cgtccgaaattgagggtcacttcaaaatttactgagaagacgaagatcgactaaagttaa +aatgctagtccacagttggtcaagttgaattcatccacgagttatatagctattttaatt +tatagtcgagtgtacaaaaaacatccacaataagatttatcttagaataacaacccccgt +atcatcgaaatcctccgttatggcctgactcctcgagcttatagcatttgtgctggcgct +cttgccaggaacttgctcgcgaggtggtgacgagtgagatgatcagtttcattatgatga +tacgattttatcgcgactagttaatcatcatagcaagtaaaatttgaattatgtcattat +catgctccattaacaggttatttaattgatactgacgaaattttttcacaatgggttttc +tagaatttaatatcagtaattgaagccttcataggggtcctactagtatcctacacgacg +caggtccgcagtatcctggagggacgtgttactgattaaaagggtcaaaggaatgaaggc +tcacaatgttacctgcttcaccatagtgagccgatgagttttacattagtactaaatccc +aaatcatactttacgatgaggcttgctagcgctaaagagaatacatacaccaccacatag +aattgttagcgatgatatcaaatagactcctggaagtgtcagggggaaactgttcaatat +ttcgtccacaggactgaccaggcatggaaaagactgacgttggaaactataccatctcac +gcccgacgcttcactaattgatgatccaaaaaatatagcccggattcctgattagcaaag +ggttcacagagaaagatattatcgacgtatatcccaaaaaacagacgtaatgtgcatctt +cgaatcgggatgaatacttgtatcataaaaatgtgacctctagtatacaggttaatgtta +gtgatacacaatactcgtgggccatgggttctcaaataaaatgtaatattgcgtcgatca +ctcacccacgtatttggtctaattatgttttatttagtgacaatccaatagataaccggt +cctattaagggctatatttttagcgaccacgcgtttaaacaaaggattgtatgtagatgg +taccagtttaattgccagtgggcaatcctaagcaaaatgagattctatcctaaagtttgg +gcttgatataagatttcggatgtatgggttttataatcgttggagagctcaatcatgagc +taatacatggatttcgctacctcaccgagagaccttgcatgaagaattctaaccaaaagt +ttaataggccggattggattgagttaattaagaccttgttcagtcatagtaaaaaccctt +aaattttaccgattgacaaagtgagcagtcgcaataccctatgcgaaacgcctcgatagt +gactaggtatacaaggtttttgagttcctttgaaatagttaactaatttaaaattaatta +acgacatggaaatcacagaacctaatgctttgtaggagttatttatgctgtttactgcct +ctacaaccctaataaagcagtcctaagaatgaaacgcatcttttagttcagaaagtggta +tccagggtggtcaatttaataaattcaacatcgggtctcaggatattcggtcatataatt +tattaagggctcttcgagtcttactctgagtgaaattggaaacagtcatccttttcgttg +tgaggcatcttacaccgctatcgatatacaatgcattccaccgcggtgtcccgtacacaa 
+ggaaacttgttaccttggggatataagaaaactcacacgtctcattattaaactgagtac +aatttttgcacgagaaagtaatgcaatacaatatgatgaaagccagctaatgaaaaggga +tggaacgcacctcggatctgttgcactggattaaaatccgattatttttaaaaatattca +gtgctagagcatatcaggtctacttttttatctggtatgtaaagcccacggagcgatagt +gagatccttacgactcaacgaaaagttataacataactcccgttagccaaagcccaatcc +cgattactgccctaccctaacgtctgccatctaaatatcgaacttgttatgatcaatgtg +actacctcccaccctttccccttcatttgttccactggggataagctagcgttttcagaa +tcaatgcaataagaatagccaattgtctcacttcatcagagctcttggcaattccaggcg +ctacgtggttctggaatatattcatttttcaaatagtaatacgtttagtgttgctattgt +ctacacgtttggatattacgttatgtgagcggacatcaatagttgtctaactctttagta +agccagagatagcactcttagcgaatggataccatcttccataagtttagttaatagtcc +gaaacaactgcttcgagcatatttgaacctccttgtaggcaaatagcctcttcaaagcaa +tcttactaatagatagagtttgttttaagggactactagaaatgggacaatcttaatagt +atgacctaaactgacatttaaagatatatccaggtggcaagcataaagatcattgcgcca +cctccaccgtgggattacttatcagtcgatatcctatatgctaagtttgcgacggcagaa +tacaaactaagctgagttgatgctaaccttacctatgataccccattggaccggttaaca +gccctacttattccaaataaaagaacttttatgctgtagaagctattatagtgatgcctg +gtaacttcagtatattaaaatgacacacatacgccatatagagctcctggaactttgaat +aatgagcgaacttcgaagttgaagagcaagaaaccatatgtcacggttgcctaaagcccg +gtaaccagacatgtgctatcattgatcattatcgaggttttcataaccttgacccattat +cggctgtgcgcggacaagtacttaaatcactagtttcttcacctgcttatcggtaagaaa +taaggttggcaaagaatcgcataagacggacgtagagccgcagcgttgtgcgagtccagg +tgcatgcgcagcaataggattttaaattttgttccatttttaatttagccgtaaggatgt +ccgtaaatgattgaaaattggattcaatctttgggcctatgctactggaacctgatcgac +aaaatttcaaacatacgttaactccgaaagaccgtatttttgcggctagaatagtcagtc +gcttggagccatataccttaccacttaaacgacgtgctcctgtagttgaaatataaacag +aacacaaagactaccgatcatatcaactgaagatctttgtaactttgaggcgaagcaccc +tcttcgagacaactaagagtaaagtaccgggcgccgcaaggagtcgattgggaccctaaa +tcttgacgaattgctaagaggctcagagctaccactgtaatttctctagagcccataata +aatgaacgatacatccgtaggtagcacctaagggattataatggaagccaaatgcagtta +ataatattatatactggcgtacacgattcgacggatctctcacatagtgattcacgaccc +ccccctttgattgacacagcgtcagcattttgcaagaacgatcttctgcatagggtgcgc 
+caccgtaaggatgacgtcgaagctacaactgggtataatttaccatgcttccctgatgct +gagtgcaatacactaagaatgagtttttaccccatatcaccagtatttgttctgttattg +cgaagaaatggctatgctgagttggcgactaaagtcacccatcctttttattaggtaacc +ccctcccttaaactaactgatttgctggagctgccctgcatacatatactttatcattta +tggacgtccgtgacgcttattatccaccatagtcgatatgctacacggattcattaatgg +atcgtaggagtttaagttatatttactaagatcggtctcggctactatcccgccttaccc +ggcgctatttacggccatttttaatatattgacggtaattattcctatggtttcgaccgc +acgtccttggacaagaaagaatggcaaaaaaaatgtaaaagaaaaaaaatattgagtccc +taccatcatataaaaaatatgtgatgagtaacttgacgaaatgttagtggttattaaaga +ctatctattacaccttttgttttctgtcgtagtatattaaagtctagaagccttacagga +aaatcagggttatacagccgatactccgcagcatgaatcatcgaggaggtgtcctaccat +cgcgccttgtaatcttgtctgtgtatactgtatttagaccttttatacaaagtaaatatc +tcggctttatgtgattgggaggggcctactcaaacatgatgacttgacctaataatcact +gtgcgggcgtcttatgactagctattccttgaaatccaccaccaaatggttaatatgtaa +aaactttgacgatgaaacaaggtgaatgtgtagttactttgtgtaattagctgcgtcgag +cattgcttgtaaaaccgtcaatcgcacacgttacttccataaaatttctacgaatacacc +cttcttaaaaaaaacgtaggaattcacgagtttaacaaacgataactgtataaagtggaa +gtccgaagaaagcagatgcccgaactactcgaagatgtttcgttttcttaaccatagggg +cttcttaatggcccactacgcacattttgttcaagcccgagagggacatccccattacgg +gagtattactaaaactgttccgtaatacgttcagcaagggatgaaaaaggccactgctca +agttattgacgtgggagtattacatcggaagcctgaatcccacactatgatggtctgtac +aggcctagggactgcgtctagacggtattaccggcttctaatcatacgatcgtgagtctt +aacgggaagtaaggctcacacctaccccaaaccatttatctatgtaagtataaaattgtg +cgtaagtgttcaaagtggacaataaagacgtggcaaaaacccccgcacataagccgcttt +agatttcacaaataccaatgcggttaaaaacatccttgagtcgtacatacaccatactcg +cgttaaacggatataacagaagataataaatccggatgtggagtcggtgtaactatagaa +agccaagtgaaataatgcttaccagtcatttagctatacggctttcatttcatgtcaaga +gggtggagtttgacctgtacagttgatatatcaccgatacttagaactcacctaaagcta +aaattgctcgcagcgtgtaatccgcatattacaaacaatagatgggattcattatacata +agacacgatgatctgctttttcaggttgcgagatgttgcctatcgtcaatcgagtcctgc +cttacaccacttaaacaaaagtattgacagggaacctattttcgaggtattatatagtcc +agcttgaatatcaatttgacagttaacctagtgaaaatcagtaagaggaaatacgccaca 
+ttctccagtgaaattctacgggttatcgtctagtccaactatcaattataactcacgaga +tataagtaaattctcgtacttggcctgatttttattatactttggatccttagtaaacag +gaagggagaaaccttcaacgaaaaacactggattttgttttactctcaaagctcttatat +gacggaaataccctgtcaagtcttaactttattactagactaatgaaatgggcttggggt +ggccagaatcatagtacaatttagcggatacactattcggactttcctatcggctgtctg +gttggataagtatggggactaataggctagacatacctatacttaaactatacaggcgtc +atctatctctgcaactttggagttccctgatgttctcccgccctttgggttcacatcttc +tataccgacacccctaataacgattagtttgtgggttagagtaaattaatacggttaata +ttaatgtatcgttgaaaagctggtgtcgccaataaggtaaccggctaggcagagtatatg +tcacgaagtataactaccctaatgataagctgtaggaataaaattaatgctgtctctaag +cgaagagatatttccgactctgttttaatgacgaatctcattacttctgacttgcaaatg +ttcaatatggcacggtttcacggcacctttgtgacgcatataatgaacttagaagattat +aacgacggaactttatatgataatccgttacgattaaagaatctgttaaatatcataatg +gcattcagttctagaccgtgcatcatggtaaacttactttctctgcatggcgacatacat +ttcgctattcaaattcgcgtgtggttacacccactcgcacctttggaatattaagagaag +atgatcagaaaatccattcgctcaatttttctgacgtacgtctaatttatcctaggagac +aaatcgttttatgtctctcacatttttgaagaaaggttcgagagacaatactcaggtcct +gaactgctagaagatactcggtggagcgtggcaacaatgaaaaactcgtgacataaatga +atgatacttttccaagttcagttaagtgaatatgtttaacatacccggcttttcgatctt +aagctgacgctggacgtgcgagtaatgtcagtctcttacatacactagtgactccaagtt +tcgtcaaaaacgccccctcccttctcgagcccactcacgctatgtattgacgcgaacttg +ttcgggatcagacttttcaggagttcggtcgcgtgtccctatgtgctaatatataagtta +gatcgcattagatgctaatctgaatacttatagacgaccttcaacgagaacgggtaccac +cttgaggctagagttaggtgtgaaacgacaggtagggacatataaaatttgagtgcggct +ttagttaagggtttaattacctactcaaacatcacgctcgcgcccttcgtacgtaatcga +ccatctagaggctaaggggactgtactaggtagtgattaatgatatcctagacgcacgtg +ccttagatcttcagactctgatggtccgcgatcaccgtaattgtagtcctccaactcgat +cactttgttggcgtcaaagaaattacgatatctaaatacttataatacaataaccaagga +tgagaatgactcatcgcgttggagttatattgcttgaagttctatggaatgaaagcacgt +tatctgccgtcccaatatctccagtgagctaattcattggacggtccactttgatcaatc +cccgaggagatgttcggacactttagtctgtaacacttagcgttgagaccacgaacaatt +gattactcagtcttgaaggtgttttccaaagttcattttaaataagactacgataggcct 
+ttcctattgatataaactacccggctctgttgttcgtgtgagtcgtacttctctgtgttt +ttctgattatagcaagattcgattcttagtgtaaacagcgatttttatttgacccgtcaa +tgagaagcgcataggatctaagcaaaattatcaagttgtgccacaaggtaagatctttcc +agttattgcaggtaggatgtatcccacgttgatagtatgaggtctgacgtcaactgtcta +ggagagttgaccgcgtgcgggtacaccggatttgcatcgatgttgagaacgcagaactcc +cactgtcgtggcggcgttcctgatatttagcaagaggcgttgataaagccctcatcatct +agatctcgacctcatctgccctcttgctccatcattttctacacagactactttcctatc +tacgttagtataattgctttctatcttagtatcatttagagcttctccgtcaacaggttc +gtgctattaaagttagtacgaaagggacaacttgtagcaacgcatttaatcggttttcga +ctacttcgcacaaaatcagataaagaagtttgtcattctattagacattgaattgcgcaa +ttgacttgtaccacttatgatcgaacactgaatcaagactgtgattaactaaaatagaca +agccactatatcaactaataaaaacgcccctggtggtcgaacatagttgactacaggata +attaattggactggagccattacattctctacaatcgtatcacttcccaagtagacaact +ttgaccttgtagtttcatgtacaaaaaaatgctttcgcaggagcacattggtagttcaat +agtttcatgggaacctcttgagccgtcttctgtgggtgtgttcggatagtaggtactgat +aaagtcgtgtcgctttcgatgagagggaattcaccggaaaacaccttggttaacaggata +gtctatgtaaacttcgagacatgtttaagagttaccagcttaatccacggtgctctacta +gtatcatcagctgtcttgcctcgcctagaaatatgcattctatcgttatcctatcaacgg +ttgccgtactgagcagccttattgtggaagagtaatatataaatgtagtcttgtctttac +gaagcagacgtaagtaataatgacttggaataccaaaactaaacatagtggattatcata +ctcaagaactctccagataaataacagtttttacgatacgtcaccaatgagcttaaagat +taggatcctcaaaactgatacaaacgctaattcatttgttattggatccagtatcagtta +aactgaatggagtgaagattgtagaatgttgttctggcctcgcatggggtctaggtgata +tacaatttctcatacttacacggtagtggaaatctgattctagcttcgtagctgactata +ctcaaggaaccactgctcaaggtaggagactagttccgaccctacagtcaaagtggccga +agcttaaactatagactagttgttaaatgctgatttcaagatatcatctatatacagttt +ggacaattatgtgtgcgaaactaaaattcatgctattcagatggatttcacttatgcctt +agaaacagatattgcccgagctcaatcaacagttttagccggaaacaatcgaagcatagg +gacaatgtatcttttcctaaattgccatgtgcagatttctgagtgtcacgaagcgcataa +tagaatcttgtgttgcctcaactcgttgaaaagtttaaaacaatcgcagcagtctttttg +gggtctactgtgtgtttgcaaaataactgaaagaaacgcttgaacaactctgaagtagct +cgagtactcattaaagtgtaacacattagtgaatatcggccaatgaaccaaacgcttccc 
+ggtacgctatctctctcatcgggaggcgatgtgcaggttatctacgaaagcatcccttta +cgttgagagtgtcgatgcatgaacctcattgtaacaatagcccagcaaattctcatacgt +gcctcagggtccgggcgtactcctccatggaagggcgcgcatctagtgttataccaactc +gctttttaactactatgctgtagttctacaggcatagtggccagtattttctaacttctc +tggatagatgctctcactcctcatccatcacggcttcagtttacgtcttacttgcttgtt +cagcaacggatggaggcattaagtatcttcactgttccctaaaattgctgttcaatatca +aagtaaggacgatacagggaaagctcaagcacactcattgaatactgccccagttgcaac +ctcacttaatctgacaaaaataatgactactctaagtgttgcggaagcagtctcttccac +gagcttgtctgtatcacttcgtataggcatgtaactcgatagacacgaacaccgagtgag +aaactatattcttgcttccgtgtgtgtgacaccaggtaattgatgcggatataagctgga +gatcactcacgcccacacaaggcgctgctacctctttattccaatgtgtaagaatttgct +aacttcatttctagaccgcagctttgcggtcataatttcacggtacggacccttgggtta +gagacttgataacacacttcgcagtttccaccgcgcacatgttttagtggcttctaacat +agaatttttgttgtgacataaagagtgcgtgggagacttgcccgaccgttaagccataat +caattgaaagccccgtgagtcacatctaattggttgtactgcgcatttagctatccttta +gctgactcgaagagattcgattcctaatataggttaattagatggctgccgcgcgaagta +aaacgtgaaaaacgtagtgcgcagatctgcataactcgcgcttaattacttatgagtagt +tccaagttcgctacgttatgagagagattggaattaagcaaatatgttttatggtgattt +tgggatgagaaggactgctaagtacggctactaaacaaatttctaaaaccgccatctacc +ttatcttggagacatttaagttgtatatgtcactagtctagcttttgtctgtgggacgcg +ttctcggaatgagggaaatgcaagagccgattcatcaaatgcttatctaagaaagtagtg +gactattacaccaagcacgaatgccagggaactgctttcttgctcaggacctcgcgacaa +ggtaccccgcataagtcctagaattacatttggtcagcaatgctgacatttgaccgtgaa +aacataattttaatcagaaggcagctcacccgcttgctctagatcttatctttgtatgaa +tgtcagaatttactgcaatatccgttccgaatagtgagggcttagtatagttctctgtat +acaggtcacatcaaactccccctgtcctagtacagctctgagctttaattaattgcatac +atttccttcaatcatcagatgaaaacaccgcgaatcatgctcttctcgtatagggcaaga +gaagcaacaaacaactagcccgactcacgttcatccgccgtatccttgttcagttcttac +tccgtattaggtcagcgaaatctaatcagaataatcggtcgcgtatcaaaattaaaatcc +cgcttgaggttgacaattaaaacgctgagcagttatcggctattagatagtggggtgaaa +gtaattggctggaattatgttaaaacgtgatattaagctaaaatacgctacttgttgccg +acctaattcagtcattcgatattcagttagagccaagaataacaagcttgtataaattga 
+acggggtgcactaaacgatgtgttactctaatattcagcttggagtatacctgaaggcga +attcatgtatcggccaataataagacgttgaagatcacaatttggactagcaaaagaagg +tgatttatgcgtggggattgagtccactgtacgagtacggtctctggaaaattataggtt +cagggaatataaggaagtaaagataattaccaagagatttttggtatcgctatgacccag +aggtgttctaacgtctgttttgatccgcagaatttctgcctcaatgcatatttgacggac +ttgaactagagcctctaaagttaaatggcgacgcaactgttcctaaacttcaattattac +tactctttttttcctagggtattgtagaggccagtggacaaaataaatcaaatttaagat +gtttcggacattaacatcccccgtagcatagaaatcatcagttatccaatctctcatcga +gcttttacaatttctgctggcgctatggacagcatatgccgcgagacctccgcaagactc +acttgatcactgtaagtatcttcattagaggttagagcctatagttaagctgctgaccta +gtaaaattggtattttctaattttattgctcaagttaaaggttagtgaagggataatgac +gttatttttgaacaatgggttgtattcaattttatatcacgaatggaacccttcattccc +ggcataatactagacgacacgaacaagctccgatctatcagccaggcacgtgttaaggtt +taattccggcaaaccaatgaagcatcaaaaggtgacctgatgcaacttagggtcacgatg +agtttttcaggactacttattacctattaataagttaacatgagccttcataccccgtaa +gacaatacatactccaccaattagaattctgagccatcttatctttttgtatcatcgaag +ggtatggccgaataggttaattagttactcctaacgtctctacaggcatgcatttgacgc +accttcgaaaatagtcaatctctcgccacacgcgtctagtatgcagcatcaaaaatatag +tccacggtttccggattaccaaacgcggcaaagagaaacattgtatcgacggagataact +taatacagaaggaaggggcatcttcgaatacggatgaataattctatctgtttattctga +catcttgttttcaggttaatcttacgcattcaaatgacgcctgccccatgcgtgcgcaat +tattttctaatattgacgagagcaatctcactccttttgggtctatttatgttttattga +ggcacaagcctatacagaacaggtactattaaggccgtgagtgtgagactcaaaccgtgg +aaacaaaggatgggttgttcttggtacaagttttagtgcatgtgggcaatccttaccaaa +atcagatgctatccttaactttgggctgcatttaagatggcggttggaggcctgtgagaa +tcctgcgtgtcatctttaatgaccgaattcatccatgtagattcagatcacacactcatt +ccttgatgttgtctaaacaaaagttgttgtggacgcattggagggagttaagtaacaact +tgggatcgcatacttataaaaattatatgttaaactttcacaaacgctgaagtccaaagt +aactagcccaaacgcctcgagagtcactaggtattaatggtgtttgagttcctgtgaaat +agtgttcgaaggtaaaatttatgtaccaaatcgaaagaacacttaataaggcttgcttgc +acggaggtatgatgtttactgactctacaaccctaattttccagtacgtacattcattcc +aataggttagttctcaaagtgctatacaggctcctcaattgatgatatgcttcagccgct 
+ctatggatattagctcattttatttaggaagcccgcttagaggcttactatgagggaaat +gccaaaatgtcatacttttcggtgtgtcccatatgacaccgctttacatagaatttgaat +taaaacgcgctctcccgttcactaccatacttggtaccgtgcgcatattacatatagata +taggatcattttttaaagctgtactaggtttgatcgacaatcttatgctatactatatga +tgtaaccctcataatcaataccgatcgtacgatcctagcataggtggcaagcgattttat +gccgattattgtgttaaatagtctgtgagtgtgattatcagggctacgttggtagagggg +ttgtatagacctcgcacacattgtgacatacttaacaatatacgaaaactgatataataa +atccccttacccaaacaccaatcccgttgaatcaactaccataacgtctcccatataaat +tgcctacttgtttgcataaatctgaatacataacaccattgcaccttcttgtgttccaat +cccgttaagattgccttgtcagatgatatgcaagaacaatagcatttgctagcaattatt +aacagctcttcgaattgcctccacataacgcgggagggtatattttaatttggcaaatac +taagtactgttggcgtcatatgctattaacggttggatattaagttatgtcagccgtaag +caagagtgggcgaaatattttgttacccagtgagagcactcttagagtttggatacaata +ggccatatgttgacttaagaggacgtaactacgccgtacaccattgttcaaccgacttct +tggcaaatagaatcgtattagcaatcttaagaatagagacacgttcgtgttagggtatac +tacaaatccgaaaatcttaagaggatcacctaaactgaaatttatacatatttcaacgtg +gatagatttaacataattcagccacctccaacctgggagtaattttcagtagatttacta +gatgattagtggcccaacgcacttgactatataagatctggggatcctaacctgacctat +gagacaaaattggaaacgttaacagcccttatgtgtacaaagaaaagtaagttgttgctg +ttcaacagatgatagtcatgacgcgtaacttcactatagtaaattgaaacaaatacgcaa +tttagacagaatggtacggtcatgaatgacagtaattcgaagtgctagaccaacttaaaa +taggtaaacgtgcccgaaaccccccttaacagaaagctgctatcatggtgcagtatcgac +gtgttcagaaacttgtaacttttgagcaggtccgagcacatggaagtatatcacgtgttt +ctgaaccggcttatccctaagatatatccgtcgcaaactttcgatttagtcccacgtaga +gcccaagcgttgtgcgactccacgtgcatgcccagaaatacgagtttaaatttggttaca +tggttaattttgaccgaagcatcgcactttatgattgataattggattcaatatgtcgcc +ctatgcgaatgcaacatgatccacaatttggctataagacgtttaatccgtatcacactt +tgtttgcggctagtatagtaacgcccgtgcaccaagagtcagtaacaattataagtactc +cgcaggtacttcaaatataaaaactaatcaaacacgacccatatgatcatctgaagatat +ttggaactttctcgacaaccaccctcgtactcaatacttacactaatcgacaggcacacg +caacgtgtacagtcgcaccatattgagtcaagatttgcttagtggcgatgagcgtacacg +cttatttctctagtcacaattagttatctacgagacatcacgagggagcaaataagcgat 
+gttatggctacacataggcacgtatgaatatgatataagccagttaaacagtcgaaccat +cgagcaaattctcatgcaccaacccacacgttgaggcacaaagagtaagctgtttgaatg +taacttcttctgctgagcgggccccaacgtaaggatcaactagaagagaaaactcggtat +tagtttaaatgcgtcacggagcatgagtgcatttcactaagaatgtctgtgtaaccaata +taacatctatttgttatctgattgcctacttatggctttgcggtcgtggcgactaatgtc +tccaatccttttgaggtcggtaccaactccctttaaattacgctgtgcaggctcatgcac +tgcatacatatacggtagcaggtagggacctcacgcacccttattataatcaatagtagt +tatcagtcaacgaggcaggaatgctgaggtcgaggtgttggtatattttctatgtgccgt +ctaggcgactatcacgcattaccaggcgagatttaagccaattttgaatatagtcaacgt +aatttttactatgggttccaccgaaacgccttgcacaactaagaatcccataaaatatcg +atatcaaataaaagattgtgtcaataccttcatatatattttttcggttgactaacgtga +actaaggttaggggttttgtatgtctatataggaaacagtttcttttctgtcctacttta +gtaaagtcttcaagccttactccaaaatcacggtgattaagccgttactcagcagcatga +ttctgcctgctcgggtcctaaaatccagccttgtaagagtcgctgtgtattagctaggga +gacctttgttaaaaaggatatatcgcggcgggatgtgagtgcgtggcgcatactcaatct +tcagctcgtgtcattataatatctctcccccacgcttttcactagatatgccgtgtaagc +aaacaccttatgcttaatttcgaaaatattggtacttgaaaaaagctgtaggggtactta +atgtctggtaggagatcaggagagaattgagtgtaaaaccgtaaagccctcacctgactt +catgtaaatggcttagaagactccatgatttaataaatactacgaaggaaagactggatc +taaagataactctagtaaggccaactcccttcaatgctgttgccagttataatccaagag +ctgtccttttctgaaccatagcggcttctgaagcgaactagaagcaaagttggttctagc +cagacagccacataccctgtacgggtgtattactaaaactggtccggtattagttcacca +agggaggaattaggcaaaggatctaggtatgcaagtcggagtattacatccctaccctga +atccatcaataggttcctctgtactggccttcgcaatgagtattcaaggttgtacagccg +tataataataagatagtgactatgaacgggaagtaacccgctcaccttccccaaaacatt +gttatatctaagtattaaagtctgccgtagtgttaatactcgaaaataaacaactggcaa +attacaccgcacttaagccgcttttgatttatatttttccaatgcgcttttaaaaataat +tcagtcctacatactaattaagacccttaaacggagatatcacaagttaagttttaacca +tctcgactaggtggaactatagatacccaactcaatttatcattacctgtaatgttccta +gaaggattgcatttcatgtcaagacggtggagtttcacagcgaaacttcagtgtgaacag +attctgagaaatcacctaaacctattagtcagagcacccggttagaaccagttgtcaaaa +aatagagcggttgcatgagacagaagtaacgatgagatccgttgtaacgttgagacatct 
+ggcctatcgtcaatacagtcctcccttaaaaatatttttaaatactaggcaaacccaaca +taggttagtcctatgtgatacgccacatggtatatcattttgtaacgttacctagggata +atcaggaagtggaattacgcaaaagtagacagtgaaatgcttagggttatagtctagtcc +aaagataaaggataaagcacgtcagagaactatattagccgaatgggaatcattgttagg +agactgtggatcatgtctaaaaagcaacgcagaaacagtcatcgaaaaaatctcgttttt +gtttgaatctaaaagagctttgatgaccgatagtacctgtatactagttactgtattacg +tgtctaatgatttcggattggggtccccagaatcagacgtcattgtagacgattcaagtt +taccaatttaatttcccagctctccttggagaactatcgccaataattgcagtcactttc +cttttctgaaacgataaagccgtcagagttctctgcaacgttggacttacctgaggttct +aacccactttcggttctaatagtagttaacgacacaacgaataacctttactgtggggct +ttcacgatattttttcgcttattattaatggttacgtcataagctggtgtccaaattaag +gttaccggcttcgcagagtagttgtatccaagtataacttccctaatcataagatcgagg +tagaaaattaatgctgtctctaaccgaacagatatgtcccactatgtggtatggacgttg +ctaattacttctgaagggaaattggtcattatggatacgtgtctaccatcaggtcggacg +cagatatggttctgtcttcagttgatccaccgttctttataggataataactgacgatta +aagattatggtaaatagattaagccaattctcttcttgtcagtgaagcatccttaactga +cttgctctgcagcccctcatacatttagctattcaaagtaccggctcgtttcaaactctc +ccacctttggaagaggttgtcaacttgataagtatatcatttacagcattttttcggacg +tacctctaatgtttcattgcagaaaattagttttttctatcgcacattttgcaagtaacg +ttagagacacaattatctgcgaatgaactgctagatctgacgaccgggagcctcgcaaat +atcaaaaaagactgacatatatcaaggagtcgttgacaagtgctggtaagtcaattggtt +tatctgtcccggcgtttcgatcttaagctgaccatgcacggcagagtaatgtcactctcg +ttcttacaagtctgtctccaagggtcggcaaaaaagacccctccattctcgagcccactc +acgatatgtagggacgacaacttgtgcggcttatgaattgtctggactgcgggcgagggt +ccatatctccgaagttagaagggacatacctttagatgataagatcaattcttattgacg +aaattcatccacaacggggaacaacttcaccctagacttacgtctgaaaagacacctagc +gtcttataaaaggtcagtgccccgtttcgtaaggctggaattacctacgcaaacttaaac +ctcgcgcccttccttacgtatcgacaagatagaggctatcgcgaatgtactacggaggca +tgaatcatatactagaaccaagtgcctgtgatattaacaagatgatccgacgcgagcacc +gtaattctaggcataaaactccagcaatttgggggccgaaaacaaatgacgttagctaat +taattatatgacatgatcaaaggaggtcaatcacgcatcgagttcgacgtatattcattg +aacttcgtgcgtttgaaagaaacttttatgaaggcaaaattgatcctgtctcctatttca 
+tgcgtacctcctagttgataattccccgagcagtggttaggacacttttgtcggtatcaa +gttccggtctcaaaacgtaaaattctgtaatctgtatggatggtctgtgaattagttaat +ttttatgaagtcgtcgagacgcagttcctattgatttattctaaacggagatgtgcttcg +tgggactcggaagtagatctgtgtttatgattattgctactttagatgctgactgttaac +tccgtgttgtttttcaaccgtatatcacaaccgaattggatagaacctatagtttcaagt +tctgccacaaggtatcatatttacagttagtgctggttgcttctttcaaacgtggtgagt +ttgtgctatcacgtcaacggtagagctcagtggaccgagtgcgcgttcaaccctgttcca +gagagggtgtgatagcacatataccacgctcgtcgaggcgttcatgatagtttgcaagag +ccggtgttaaacacatattattattgttatccaactaatcggacctatgcataaagcatt +gtctaaacagaataattgcctatatacggtagttttagtgatttatatcttagtatcagt +tagagcttcgaactcttcaggttcctcatatttaacgttcttcgaaagcgaaaacttcta +caaacgaatgtaagcggttttccaagtagtacctataaatcacagaaagatctgtctcag +tatagttgaaatggtattcagctagtgacgtgtaccaattatcatagttcactcaagcaa +gacgctcattaacgaatatagacaagacactatatcatataataaaaaagaacatggtgc +tcgaacatagttgaattcaccatattgaaggggaatgctgacatgtaattcgctactaga +cgatcaattccctacttgtcaaagttgaactggtacgttcttggaattaaatatgattgc +gctggaccaaattgcgacttcttgagtttcagggcaaacgattgagccggaggatgtccg +tctcttacctttcttgcttatgataaacgacggtccctgtacatcactgggaattctcag +caaaaataattgggtaaatcgagactcgatgtattcggccacaaaggtgttagacgttaa +agattattcaacggggcgataataggatcataaccggtatgcaagcgcattgaaagagcc +atgagatccttatccgataaacgctgcacggtatgtgcagccttattgtcgatcacgaat +ttataaatgtagtctgggctgtaagttgaagacctaagttataatgaagtgcaataccaa +atcgattcatagtggattatcagactcaagatatctcctgataaattacagttgttaaga +tacggataaaatgagatttaagattagcagcctctaatctgtttcaatcccgttggaatg +tggtatgcgatcaaggttaagttaaaatcaagcctgtcttcagtcttgattcttgttctg +ccatcgcatgcggtctacgtgagttaatatgtagcttacgttctagcttgtgctaatctg +agtatagattcgtagaggaatattatcaagcttccacgcctcaacgtacgtgtattggtc +acacaagacactaaaagtggaagtagcgtaaactatagtctagttgttaaatgctcagtt +cttgttatattcgatatactcttggctaatttatgtctgagtatataaaattaatgatat +taacttgcatttcacggatcccttagaaaaagattttgaccgagcgcattataaacggtt +acaccgaatcaatagaagcatacccaatagctttctttgaatttattgcctgcgcaactt +ggctgactctctagatccgaataattctatatggtcgtgacgaaactagttcattactgt 
+ttaaaatgccaacatgtcttttgggccgataatggctctttgcaaaattactcaatgata +cgattgatcaaagcggtagttgctagtggtagcatgtaagtctatcaaatgtctgattat +ccgaaaatcttccaaaagagtccacgtaccatatctatctcatagcgacgcgaggggaac +cttatctaactatcattccatttaccgggtgactctcgatgcaggatccgattgggataa +attgcccagaaatggctcattcctgactaagggtaaggccgttctcagcaagggaacccc +gcgaatctaggcttataccatctagattgttaactacttgcctgtagttctacagccata +ctggacagttgtttctaaatgatcgggattcatgctagcactcctctgaatgcaccgcgt +aagtttaactattacgtccgtgggcagataaggatggaggctgtatgtatcttaactgtt +acctaatatggctggtaattatcaaagtaaggaccttaatgccatagcgctagcaatcgc +tttgtatactgaccatgtgccaacctctcttaatctgtaaaatataatgtcttagctaac +tgtggacgatcatgtctctgcctagagcttcgctgtatcaattcctatagccagcgtact +agtgacacaacaacaccgtgtgagaaaagatattagtccttacgtctgtctctctacagc +ttattgatgaggattgaacatggacatatagctccccctcaaaagcagatgctacctctt +tattccattctcgaacatttgccgaacttaatttcgacaaacctgaggtcacgtcttaat +ttatcggtaacgtcacgtccctttgagactggataaatatattaccaggggccaacgagc +aattgttggaggcgcttctataatacaaggtgtcttgtcaaagaaagacggcgtgcgtct +cgtgcaactcacttaaccaatattaatgtgaaacccccctctctcacatcttatgcggtg +tactgccctggtacatttcctgtacaggactccaacagtgtagattcctaagatagctgt +tggagttgcctcacgccagatcgaaaaactgaataaactagtgagctgagctgcagaaat +accgcttaattacttatgactagttcaaagggacctacgtgatgtcagacattgcaagga +agaaattaggtttgtgcgtcattttggctggactagcactccttacttcccctactattc +aaatgtcgtaaacagcatgagacaggatcgtgctgacatttaaggtctattgggaacgag +gctacctttggtcgcgcgctcgcgttctccgaatgaccgaaatgcatgagcacagtatgc +aattgcttatagatctaaggtctggtcgttgaaaccaagcacgtaggcctgggaaatcag +ttcttcctcagcaactacacaaaagcgtccaagcattagtacttgtagtaaatgtccgaa +cctatgcgctcatttgaaagtcaaaaaatatttttaagcagtaggcacctaacccgattc +ctctacttagtagctttctttgattctcagaattgactgcaatatcactgcacaattctg +tgccattactagacttctctgtattaacgtctcatcttactaacactcgcctaggacaca +tctgagagtgaagtatttcaatacatttactgaaatcttcagttctaaaatccccgaata +aggctcttatcggtttggccaacacaagaaaaaaacttcttgcaccactcaccttcatac +gcaggagcctggggaacttagtaataactatttcggcagacaaagcttataacaagttgc +cggcgcgtataatatttaaaagaccccttgagctgctcaattaaaacgctcacctggtat 
+aggctattagatagtgccgtcttagtaaggggcgggaattatcggataaactgatatttt +gataaaataaccgacttgttcacgacataagtcactaaggagattttatctttctccaaa +gtatatcttccttggataatttcaaagcgctgcaatttaagttctgttactagtttatgc +tgctgggaggtgaccggaaggcgtagtaatctagaggcaaattataagaagttcatcata +tcattttcgactacaaaaacaaggtgttgtatgccggcgcattgtgtaaactggacgagt +accctagatggaaaattatacgttaagccaagatttcgatgtaatgataattacctacac +atttttgctatccataggaacaagagctgttctataggctcgtggcatacgaacatttgc +tgccgctatgaatattggaagctcttcaactacagactctattcttaattgccgtcgaaa +atgggccgaatcggctattattaatactcggtttttccgaggggattgttgtcgacagtc +gtaattattattaatattgatgttggtgaggtcatttaaatacaaccttgcagacaatga +ataagggatccaatctctcatactccttttacaattgctcatgcccctatgcaaacctta +tgccgccacacctccgcaactctctcttctgaactgtaagtagcttcattactggtttga +gactatactgaagctgatgacattctaaaatggctattttcgaatgtgattcataatgtt +tatcgtttgggatggcagaatcacgttatttttgatatagcccgggtattctattgtata +gaacgtatgctacaagtcattccccgaagaagactagaagtaaacaacatgcgaccatcg +ttaagccacgcaaggctgtagctttatttcccgataacctatcttccataaatagcggac +agcaggatactgacgctcaacatcagtggttatggtctaatttttaacttttaataaggt +aacttcagcaggcatacacagtaactctttaatttataatcaaattagaagtctgacact +tcttatatttttctatcatccaacgcgatcgcccattagcttattgtgttactaataacg +tatctaaaccaatccttttcaagctactgcctatattgtcaatatatacaaacaacagga +tagtaggctgcttaaaaaatattgtcaaccgtgtacgctttacaatacccggaaatcaca +aactttgtagacaacgagtgaaatttatacactacgaagggccagcgtacaagacccatg +aattaggcgatatgtttattctgacatattggtttatccttaatctgtcgctgtaaaatg +aagccgcccccatccctgcgaattttttttcgaagattcacgactgaaatataaatacgt +ttggctatatttatgttggagggaggcaatagcctttactgttaaccgaagatttagcca +gtgagtgtgacactaaaacactggaataaatgcaggcgttcttctgggtaaaaggtttag +tcaatctcgcctataagttcatatagctctggatataattatctggcccatgcatttatc +atggcgcttggtgccctgtgtgaagccggcctctcatattgaaggtccgaagtattccat +gtacattaagatcactctctcattcatgcatcttggcttaacaaatctggttgtccaagc +tttccaggcacgtatggtacaaattcggatcgaatacttataaaaatgatatgttaaact +gtctaaaacgctcatctacaaagtaaagtgcactaaccaatagagtctcaagaccgtgta +atgctggtgcactgaatgtgtaatacggttagaagggattagttatgttacaaatccatt 
+gaaaacttaagaagcattgcgtgctcggagggtgcatcttttatcaagagactaacatta +ttttcaacgacgtacatgctttacaatagggtacttatcaaacgccgagaaacgcgccta +tagtgatgttatgattatgacccgatatccattggaccgaattttatgtaggttcccagc +gtactcgcgtaatatctcggtattgccataatgtaatacttgtcggtctctcccagatga +aaaagcgttacagagtatttcaatgaaaaacagcgcgcaacgtcaatacctttaggggta +acggccgctgatttcatatagatatacgataagttggtatagctctactaggtggcatcc +acaatcgttgcatttactatagctggttacaatcataatctataccgttccttacatact +accatagcgggatagcgtttttttgccgttgattgggtttaagaggatgtcagtctcatt +atatccgattcggtgggagagccgttgttttcaaatcgcacactttgtgacataatgtac +aagataacaaaactgatataagatataaactgtcaatatcaccttgacacttgaatcaaa +gtaaattaactcgcaaatataatttgactaattgggtgcagatttctcaattaataaaaa +aatggcaccggatgggcttacaagccccttatcattcacttgtatcatgatttccaagaa +caatagaatttgctagcaagtatgaacagagattcgaattgcatccacagtacgccggag +cgtttattttaatgtggatatgacgatgtactgttggcggcatttgctagtaaccggtcc +ttatttacgtagcgcacacgtaagcatgtctgggagaaatatggtggtacaatctcagag +aaagattacagtttggtttaaataggacttatcgggtcggaagtggaacttaataagcag +tacacaattgggcaacagacgtcttgcctattacaataggattacaatgcgttagatttc +agacacgttcgtgtttggctattcgtcaattccctaaatagttagacgatcaactattat +caaagtgattctttgttcatcctccattcatgtaacagatggcacactacgcataacgcc +gaggaattttaacgagatttaagagagcagttcgggcacaacccacttgactttataaca +gctcggcagcataaacggtaatatgtgacaaatttccaaacgttataagaacgtatgtgt +acttagaaaactaagtggttcatgttcaacagatgtgacgcagcaagcctaacttatcta +ttggttttgctataaaagaacaaagttacacagaatcctaagggcttgtttcacacttat +gcctagtgcttcaccatcttaaaatagcgaaaccggcacgaatcaaaccttaaaacaatg +cgcagatattggtgatggtgactccgggtatgataatggtaactgttgaccagcgcccac +ctcatcgaagtatagaaagtggttaggataaggatgagaccgaacttatttccggccata +actttagattttctacctagtacacaacatcagggcggacacgaaaccgccatcacatca +tataccaggtttaatttgcttaatgggggaagtgtcaacgaaccttcgaactttagcagg +catatggccattatatatggccccagagcagaatgctacagcagacaaaatttggattta +tgtagtttaatacctatcaaacttggtgtgaccatacttgtctaacgacagtgcacaaag +tgtaagttacaattattactactcagcagcttctgcaatgataaaatcttatcatacacg +tcacatatgataatatctacttagggggaacgggctccacaacctacatagtactcaata 
+cttacactattcgacaggcacaccaaacctgtacagtcccaaaagattgagtcaactttg +cagtactgcagatcacagtaatagcttagttagcgagtcaaaattagttttctacgagac +tgcacgaccgtgcaaatttccgatgtgttggctacaaatagcaacgtatgaatttgtttg +aagccacgtaaactgtacaaccttagagataagtctcaggctactaaaaacacgttgtgg +cactaacaggatcatggttgattcttacttattcggctgaccggcccaataagtaacctt +caactagaacagaataatcgggagtagtttaattcagtcaaggtgcaggtctcattgtaa +ctaacaagctctgtgtaaccaagttaaaatcgttttcttagcggattccctacttatgga +tttgagctcgtccacaatattcgatacaagaagtttgtggtccgtaacaacgaaatttta +attacgctgtgcagcctcatccaaggaattaatagaaggttgatggtaggctccgaacgc +tccatgattataatcaagtggactgtgcagtaaacgaggaaggtatcctgacgtcgtggt +gttcgtttttgttatttgtgccctatacgagtagataaaccatgaacagcacagtgtgaa +cccatggttgattttaggctaccttatttttaatttccgttacacagaaacgaattccac +aactaacatgccattaatttttcgatatcttataaaagatggtcgaaattcattcattta +ttttttttcggttctcgaaagtcaactaagctgtcgcgttttgtttctctttagaggtaa +aagtggctttgatctcctacgtttggatactagtcaaccattactccatttgatccgtga +gtatcacctgtctaacatccagcattatgactcctcggcgaagaaaagacacacttctta +gagtcgatgtgtattagctagggacacagttgtttaatacgatagtgagcccagggaggg +cagtgcgtcccccagtagatttattcagctagtgtaagtataagatatctcacccacgag +gttcaagtgatatgcagtcttagaataatacttatcctgaatttcgatattatgggtact +tcaataatccgctagcgctactttatgtctcgttggacagcaggacacatggcagtctta +aacactaaagacatcacctgaatgaatgtaatgggattacaagaatcaatgaggtattat +atacgacgtaggaaactctggatatatacagtaatctagttacgccatcgcacttcattc +ctctggaaacttagaagacatcagctgtacgtggaggaaccagacccccgtatgtagcca +aatagaaccaaagttgcttatacaaacacacccaatgacaatggaccgctggagttcgta +aactcggaacgtagtactgcacaaacccagcatttagcaataggagctacgtatgcaact +cccacgtggtaataccttcaagctatcaatatataggtgcctagctaatcgcattcgcaa +gcagtattcaagcttgtaaaccagtataataattacagaggctctatgaaacccaacttt +ccagctaaaagtcccaattaaatggttatttcgtacttttaaagtcgcccgttctgttat +tacgcgaattgattctactccaaaattaaacacaaattatcaaccgtttcatttatattt +gtcaatgcagctgtttaaaataaggctctactaaattataattaagacacttattaccag +atttctctagttaagtttgaaccagctcgactaccgcgaaagatacattcccttctctat +ttttcagttcatctatgggtcagagaagcattgaatttattctattcaccctcgtcgttc 
+acagcgaatcgtcagtgtgatcagtgtatgagaaatatcctaaaccgtttagtcagacca +cacgcttagaacaagtggtctaaaaagactgccctggaaggagtaagaagtatacagctg +atccggtgtatccttcagtcatctgccctatactaattacacgacgcaaggaaaaatagg +tttattttctaggcaaacccttcataggtgactccgatgtgttacgaatcatgcttgaga +atgtgctatcgttaccgacggataataacgatctccaatgaaccaaatgtagaatgtcta +ttgattacccttttactattcgacttagagataggagatagaacctcagtgtactttttt +agccgaatgggaatctttgggaggtgaatggccataaggtcgtaaatccaaccctcttaa +agtcttccatattatatcgttgttcgtggaatcgataacagatttgttgacccatagtaa +atgtatactagtttatgttgtaagtgtagattgttttccgattgccgtccaaactttatg +tcgtaattgtagaccagtaaagttgaccaaggtaagtgcccagcgatcctgcgagatcga +tcgccaatttttccagtcactgtaagtgtaggtttagataaagccgtatgagttatatca +taagggcctcggaaagcagcttcgaaccaaagttcccttataatagtagtttaactataa +aagtatatactggtctgtcgccctttcacgatttgttttaccggtttatgaagcgttacg +tcattagagcggctccaatttaaggttaacggcttccatgtgtagttgtatacaaggata +acttaaagtatctgttcagcgagctagttaagttatcctcgatagaacacaactcagagg +tcccaagatcgggtttgcaacttgctaatttattctcaaggcaaattgggaattatcgat +acctgtataccataaggtcgctcgatgtgatgcttatgtcttctggtgatcctaccttag +ttagtgctgattaacggaacattaatgtttatcgttttgagatttagccaattctctgat +tctaactcaagatgccttatctgacgtgctatgcagcccctaagtattttacattgtaat +aggacacgctcctttaaaactcgccaaaaggtcgttgtggttctctactggttaactata +taatttacagctttgttgagctagttcctctttggtttaagtcctcaatattagttggtt +cgagcgataagttggctagttaccttagtcactatattagatccgaatgttatgcttcat +ctgaagaccgccaccctccaaaatttcttttaagactcacttattgcaaggtgtaggtga +attcggctcgtttctcaagtggtgtatctgtacacgagtttccatattttcatcaacagc +caccgcacacttatgtcactctaggtattaaaagtcgctctacaaggggacgcaattaag +aaacagacatgctagtcaaaaataaacatagcgaggcaccactaattcggccgcttatca +atgggatgctctgcgcgagacgcgccagagctcagtagttagttcggacatacatttact +tcagatgatcaattagttttctacaaatgcttactctaccccgaaaaaagtcaccagact +cttacgtctctttagtatccttccgtcttatataaggtcagtcccccgtttcggtaccct +ggaatttactaagaataatgaaacagcccccaaggacgtacgtttacaaatgatagacca +gatcgcctagcttattccgacgcatgttgcatagaattgaaccaacggaatgtgagagta +actagatgagccgaccacagcacccgtttgcgtcgcagaatacgcctgatagttcggcca 
+cgaaatcatatgtcctttgagtattaagtatttgtaatgatcaatcgagctcaagcaagc +ttacacttcctcggatattcagggaacttagtgcctttgaaagatacgttgatcaacgaa +aaattgataatggctcatatggaatgcctacctcatagtgctgaattaacacagcactgc +ggacctaacttttcgaggtttcaagttcacgtctcaaaacctaataggctggaatatgta +gggatcctcggtgaatttgtgattgggtttgttgtagtactgaccaagtgaatattcttt +ttttctaaaagcagatctgctgccgggcactacgaaggagatctctgtgtatcattattg +cttcttgacatgatgactcttaaatcactgtgggtgtgcaaaacgatagcacaacccaat +tcgatagtacatattgttgatacttcgcactaaaccgttcatatttaaaggttgtgctcc +ttccttcgttaaatactggtgacttggtcctatctactattagctagacctctggggaac +cacgcccccgtaaaacctgtgcaagagagggggtcatacatcttagacatcgcgcctcca +ccagggaagcattgggtgattgaccaggtgtgtaacaaatatgattattcttatactaat +attagcaaagatgcataatgatttgtattaaatgtataattgaattgataagggtctttt +agtcagtgatagagtagtataaggtagacattagaactcttaaccggacgcagatttttc +ggtcttagtaagccaattagtcgacaaaacaaggtaagagcggttactagtagtacctat +aatgcactgaatcttcggtcgaagtatagttctaatgctatgcagattgtgacggcgaca +aatgttcagacttatatcatgaaacaagctcttgtaagtattgacaaatgaaaagattga +atatttttaaatacaaaatgcgcctacttattaggggaattaaccagattgaaggccaat +cctcacatgtaatgagataatagacgataaatgaaattcttgtaatagttgaactgctac +gtgatgggtattatatatgattgagatcctccaattgccgacgtcttgtcttgatgccca +aaagattgtcaacgaggagctccctcgcgtacctgtcgtccgtatcataaacgacgcgac +atgtacagcactccgaagtataagcaataataatgcgggtaatccagactagatcttttc +ggactcaatgcggtttcacggtaaacatgattaataccggagagtagtcgagcttatcag +cgatgcaagcgaattcattgtgccaggagatacgttgcagataaaaccggcaacgtatgt +caacaagttttggcgatctcgttgtttgtattcgacgaggcgcgggaacttcaagaacta +tcgtatattcaagtccattaccttttagtttcagactggtggagctgactaaagttatat +catcattttgtacactggtttagttaacgataatttcagatttaacatgaccagacgata +atcgctgtatatccagttggaatgtggtttgccagaaaggttaacttataatcaagcctc +tcttcagtcttgattcgtcgtatcccatccattgcgctatacctcagtgtatttggagct +gtagttataccgtgtgctaagatcagtagacatgacgagagcaatattatctaccttaca +agcatcaacggacgtctagtcggaacaaaagactctaaaactcgaacttcaggttaatat +actatagttctgtattcagcagttattcttatattcgatattatcttgcctattggatgt +ctgactttagtatattaatcatagtatctgccatgtaaaggtgccagtactaaatctgtt 
+tcacagtgcgaattataaacggttacaaccattaaagacaacaagaccctatagctttat +ttgaattttgtcaatgcgcaacttggagctcgcgatacatcccaattagtctatagggtc +gggacgattctacggcatttctggttataatgacaacatggattgtggcccgagaatcgc +tctttcattaattaagcaatcattacagtcttataagcgctacttccgagtggtagcagg +taactcgatataaggtcgcatgagccgaatagcttaaaaaacaggccaccgaacattgat +agagaataccgaccacagcgcaacctttgattactttcattaaattgtacggctcactcg +acatcaagcttaagattgcgataatgtgaactcaaatggatcagtactgaagaaccgtaa +cccacttcgcagaaagcgtacccagagaagatacgctgttacaatatacagggtgaaatt +attgcctgttcttcgtaaccatttcgccaaacttggttagaaatgatagccattcatgat +agaaataagctgaatgataccagtatctttaactatgtagtcagggggaagataacgatg +gtccatgtatgtttctgatatgtgacagtattggccgcgtaatttgctaacgaagctact +taatgcctttgagcttcatatagatttctttaatcaaaatcggcaaaaagatagtatgag +ctataatatatgctagtagagaactctggaccatcatctatatgaatactgattcgagcg +tgcaattactttagcctgcgtactactgactctacaaaacactctgagataagtttgtag +tcagtaagtcgctctctataaaccttttggatgaccattgtacagccacttatagatccc +aataaatagcacaggagacagagtttttcaatgctcgatcatttgccgatagtattttcg +tctaacctcagggcacctattatttgatacctaacctaacggccctttcacaatggagaa +atatatgacatcgggacaaacacaaatggtgggtggccaggagatatgacatggtggcgt +ctctaagaaacacggactccctctaggcaaactcacgtaaccaattttaatgtcaaacaa +aacgctcgaaaagattttgccgtgtaatgacctggtacattgactggtcaggaatacatc +actgtagttgccgtagtgtcctgttggtgttccatcaagacacatcgtataacgcaattt +acgacggacatcagatcaagttatacagattatttaagtatcacgtgtgcattgggacat +aagggatctcacacatgccttggaacatttttgctttgtgccgctttttcgctgcactac +caatccttacttaccagtatattcaaaggtcgttaacagaatgagaaaggttagggctct +aagttatcgtcgattgggatagacgagacatttgcgagcgccctccacggatacgaatct +cccatatcaatgtgaactggatgctatgcagtttagttcttacgtctcctagtggtaaaa +atcaaagtagcactcgcatagcagttattcagaacctaatacacaaaaccgtcaaacatt +ttctaattctaggtatgggccgatcataggagctaaggtgaaactcataaatgttttgtt +agatctagcatcctaaaaagatgcatatactgagtagctggcgtgcattctctcaattgt +atcctttttaactgaactagtcggtcccatttcgtgactgagatctattaaccgataaga +ttaataacactcgcattcgtatcagctcagagtgaagtttttcaataatttgactgatat +attaacttctaaaataaccctttaagcctcggatccgtttcccaatcacatcaaaaattc 
+ttattccaactatctacggattaacaacgtgcatggggatcgtagtaagaacttgttccg +atcactttgagtatatcaagttgacggcccggttattattgaatagaaacattcacctgc +taaattaaataccgcacatcggatacccgatttcagagggccgtcttactaagggcaggc +tttgttcggtttaactgagatgttcattattttacagtatgcttcaactaatatgtaacg +aaggacagtggatctgtctccatagtagatcttcagtcgtgaatttcataccgctcctat +ttaagttcgcgttcgagttgttgatcatggcacgtgaaagcaacccctagtattctagac +gaaaattttttctagttcatctgataatttgccaattcaaaaacaaccgctggtttcccg +gcgcattctctaaaatggaagtcgaacctagagccattatttgtcggtaacccatgagtt +ccttcttttcagaagttaatacactgtggtcctatacagaggaaaaacagcggttatata +cgatcgtggcataacaacattggatcaagatagcaatttggctacctattctaattctca +ctagattcggtattccactacaatatcggcagattaggattggatgaataatcggtgttt +aagtccggttgcgtctccaatctcctaatttttattaatattgatcttggtgacctattg +taaataaaaacttcaagactttgaataacggtgaaaagatagaagactcatttgaaaatg +gatcatccacagatccaaacattagcaagacactaatccccaactagctattctgatcgc +gatcgtgctgcagtactcctgtcacaatagtctgttcatgatctaattctttttgggctt +tgttcgatggtgattcagaatctttatccggtcgcttccctgtagctactttgtggggat +attgcccggggattatagggttgagatcgtttcctaaaagtatttaaaccaagtagactt +caactaaactacatcagaacatcgtgaagacaccatacgcggtacctttatttaccgata +acatttcttcaagaaataccggtaagcagcataatgaccctaaacagctcggggtatcgt +cgtagttttaaattttatttaggttactgctcaaggaataaaaactaactatttaattta +taataatattacaaggctcacactgattagatttgtctataagacttcgcgatcccccat +taccggattgtcttaagaataaactagataaaccatgcattttctagataaggcctttag +tctaattagatacaaaaaacacgatagttgcatccttaatttattgtgtcaaacctggaa +ccttttaattacccgcaaatcactttatgtcgagactacctctgaaatttattatctacc +taccgcatgaggacttgaaccatcttgtaggagttatgtttattagctaagattcgttta +tcctgtagcggtccatgtatattcaacaagcaaaaagcactcagaattgtttttagttga +gtcaagactgatatataaataagtttccctagttttttcgtggtgggacgatattgaatt +gaatcttaaccgaagagtttcccactctgtcgcacaataatacacgccaatatttccagc +cctgcttatgccttaatcggttactcaatctcccattgaagttcattttgatctgcatag +aagtttcgggcccagccttttttctgccaccttcctccaagctctgtagacgcactctaa +gattgatgctcacatgtattaattctacattaacataaatatataagtcatgcatcttcg +agtaaaatatctggttctccaacatgtcctggcacgtatcgttataatgcccatacatgt 
+agtattaaaatgattgggttaactggatattaagatcatcgaaattgtaaagtcaaatta +acaatactgtctcaagaccgtgtattcctcgtgctcggaagggctattacgcttacttcc +gttttggtatcttaatatgactttcaaaaattaagttgcagtgagtcctacctgcgtgca +tcggttagcaagagtataaaagttgtttaaacgaactacttgctttacaataccggtcgt +atatatcgccgtgaatccagaagattgtcttctttggattatcaaccgagatcctgtgga +ccgatgttttgggaccttcacagaggactccaggtagagctcgcttttgcattaatctaa +gaattgtacctctctaaaagatctaaaacagtgaatgtgtatttcatggaaaaacacaga +gaaacgtaaattactttaggccgaaaggcacatgagttattatacatatacgagatggtg +gtatacatcgaattcggggcatacactatagttgcattgtatttagctgctttaaataat +atgatattaccttccttacataagacattaccggcataccctggttttcaacttgtgggg +ctttttgacgatcgcactctcatttgatccgagtagggcggtgacccctgcttttcaaat +acaaaaatttcgctatgaaggtaatagattacttttcgctgttatgatagaaacggtaaa +tttaaaattgaaacttctagaaaagtaaagtaacgagaaatgattttgtgaataatgcgg +tcatgattgcgcaagtaagaaaaaaaggcaaaaggatgcgcggaatagaaacttatcagt +cacgggtatcttgatttcattcttcttgtcaattgccgacataggatgaaatcagattcc +aatgcaatacacagtaacccccacccttgattgtaatgtcgatttgaagttgtacgcgtc +gacgaagtggatagtatacgggccttttgtacggtgcgatcaactatgaatctcggcgag +ttagatggtcgtacaatctcacacatagaggtcacttgcctgtaatgacgaattttcggc +taggtactcgaactttattagaagtaaaaatgtgggcaaaagaaggattccattttacaa +gacgattacaatgagttacatgtctctcaacgtagtctttccctagtagtctttgaacta +tttaggtactccagaaaattttagcaaagggtttctgtgtgaatccgccattcatgttta +tgatggaacaataagaataacgccctcgtatgttatcgacagtgaagtcagcagttcggc +caaaaacatattcaatttagtacagatccccagaagttaagctaagtgctctaaaatggc +ctaaacggttatcaaagtaggtctaattactatactaacgggtgcatcgtaataactgct +gtcgatgcaacactatatgatagtgtcgttttgctatatatgtacaatgtgacaaagaag +ccttagcgattcttgcaaacttaggacttcggattctcaatcttaaatgtccgaaaacgc +aaagattcaaaaatttaatctatgagcagatatgcctgatggtgactacgcgtatgttaa +ggctaaatgttgacaaccgcacacataatcgaactattgatagtcgggagcataaccagg +tgaacgtactttgttcacgacatttattgacatgttctaaatacgtctcaaaatcacggc +gcactagaaaacgcaatcaaatcattgtcctggtttaagggccgtaatgccggtagtgtc +aaacttcatgagaactttagctggcttttggccagtatttagggaccaagagcactagcc +ttaagctgaatattttgccatttatctactgttataactttaaaacttggtggcaccaga 
+cttgtcgatacacacgcatcaatctgtaacgtaaaaggtttactaagaacaagcgtagga +attgagtttatattatatttaaactaaaagatgatattagcttctgagggcgatagggct +ccaaatcataaagaggaatatattattacacgattagaaacccacaacatacctcgaatc +gcccaaaagtttgacgaaacttggcagtactccacatctcagtaatacagttgggagagt +ctcaaatgttgttttattactcaatgaaccaccctcataatttcactgctgttccattaa +atttgcaaacgatcatttgctttgaagaaacgtaaaatcgacaaaattacagataagtag +atgcataataaaaaaaactgctcgctataacacgatcatcgtgcattcttacttaggagc +atcacccgcacaataacgtaccttaaactacaacactattagaccgagtactgtaattca +cgaaagctcaagctcgcattgtaaagaacttgctctctcgtaaaatgtgataatagtttg +cggagaggattcaattattttccattgcacctactccactagattcgataaaagaaggtg +gtcctcccttaaaaagaaatgttaagtaacatcggaaccataagcaaagcatgtaagtga +accgtcatccttccctaagaaacataaaggtttttaataatgtcgactgtgaactataac +tgcatcctttcctgacctactccggttccttgttgttatttctgaacgagaccagtagat +aaacaatgtaaaccacagtgggtaccaatggtgcatgtgacgctaccgttgttttaagtg +cccgtacaaacataagaagtcataatcttacttgaaattaattttgccttttattttttt +tcaggctcgaaattaatgatttgttttttttgaccttctagttacgctaatatgcggtcg +cctgtggtttctattgagtcctataacgggatgggatctaatacgtttggttactagtaa +acaaggtataaatttgataccggagtatcaactgtataacatcaagctttatgactcata +cgcgaagtaatgacacaaggctttcaggagatcgcgagtacagagccactaaggggtgta +ttacgatagtgacaccaccgagcgcactcactccccaagtagatttatgatcctacgcta +agtattagatatataaccaaagaggttctagtcagtgcaactcttagaataataattagc +cggttttgcctttttaggcctaatgcaatattcagctagcccttatgtatctcgcgttcc +acagcaccactcatggcacgcgtttaaactaatcaaatataatctatgaatgttatgcca +gtacttgaataaatcaggttttttataagtccttgcatactctcgttatatactgttaga +gtcttaccccatagaaattctttcatctgcaaacttagaagaattctcagctacggggag +cataaagtccccaggatgttgacaaatacaacaaatgtggcttatacaaacactccatat +gaaaatcgaaccctcgtggtagttttagccgaaccttgtacggataaatccctccatttt +ccaatagcagatacctatcctactacctcgtggtattaaattaaagcttgaaatatagag +ctgcatagcttatccaattcccaagcacgagtctaccgtcgtaaccacgatttgatttac +agacgctagagcaaacccatctttaaacatataagtaaaaattaaagggtgagtgcgtac +gtgtttactagcaacttcgcttattaagacaattgtttataagccataattaaaaacata +tgttcaacaggttcattgatatttgtaattgcacaggtttttaataaggatctacgtaag 
+tataatgaacaaactttttaccagagttatattctgtactttgaaaatgctcctctaccg +ccttagagactttcaattagattttttgcagttaatctatgcgtaagtgaaccatgcaag +ggatgcgattcaaccgcctcgtgctaaccctatcgtctgtctcataactgtaggtctaat +ataattttcagttttcgaacacataaccctttgaaaatctgctatttaatgtctcacctg +catgcactatcttctatactgctcagaacggctatacgtcactatgctccaagtgacgat +ttaaacgaagcaaggaataataggtttattttagtgcaaaacaattaagtgcggactacg +tgctctttacaataagccttgtgattgggctataggttaagtcccatattaacgatctcc +aatgtacaaaatcgacaatcgctttgcattacccggttactagtcgaattacagatagct +gttagatactcactctaattttggacaacaatcccaatcttggggtcgtctatcgcctga +agctcgtaaatccttccatcttaaacgattacatattatagacttgttcggggtagagat +atcacagttgtgcaaacattgtaaatcgatactagtttatgttggtagtctagttgcttt +taccattccccgaaaaacttgatctactatttcgacaacagtaaacttgaactaggtaag +tgaaaacagagaatgcctcatagtgccactatttgtccactatatgtaagtgtagcttta +cataatccactatgactgagatcattacggcctaggaaagcagcgtagaaaaaaagggcc +cggatattacgactgtaactataaaactagttactggtagcgcgccatgtatagatttgt +tttaccggttgtggttgcgttaacgaatttcagccgcgaaaattgatccgttaaccagtc +catctcgacttctataaaacgataaagtaaagttgatgttcagcctccttcttatggttg +catcgagagtacactactcagtgggaaatagatcggggttcctacttcagattgtattat +ctaggcaattgccgattgtgccatacctggataaaataagctacctacatgtgatgctta +tctattatcgtcatactaccttagggtgtcctgttgaacgctacattaatctttagccgt +ttgagatgttccaatggataggagtctaacgcatgatgaagtttaggaaggcagagcatc +ccactaagtatgtgacagtgtatttcgaaacgagacgttataaatagaaaaaaggtcctt +ctggttctattctgctgaactattgaatggaaagattggttgacctacgtactatttgct +tgaagtcatcaatttgacggggtgagagacatatggtgcatactttacggactctatatt +ttagatcagaagcttagcagtcttctctacaccccctcacgacataattgcttttaagaa +tctatgtttgattcctctacgggaattcggatccgttcgcatgtgcggtttatctaaacc +aggggacatatgttcagctaaagcatacgaacactttgctaactagacgtatgtatagta +gctataaatcccgacgatatttacaaaaagaaatgagactcaaatatatacatagcgacc +ctacacttattcgcaccctgatctaggcgatcctagcacccacacccgaaagtgagcact +agtgtcttccgtattaaatttactgcagttgagattttagttgtctactaaggattactc +taacccgtaataaggatcaagactcggtactagctttactatcattccctatgtgttttc +ctaactcacaagggtacgtaccagcctatgtaattacaataatgataaagacacaaagga 
+agtaactttacaaatgagtctccagttacactagcttagtccctcccatcttgctttgaa +gtctaaatacgcaatctctgaggatatacagcagaagaacactcataacgttggagtcca +agaattagactcatagggcccccaacatttaatatgtactgtgagtttgaaggtgttcta +ttgttaattcctgctcttgatacatgacacgtactccgtgtttaaggcttcggactgact +ttctttcataagttgagcaacgaaaatttcagaatcgataagttggattcactaactaat +acggctgattgaaaactccactccggacctatatggtcgacctttatacgtaaccgatat +aaaacttataggctggtatatcgagccttcctagcgcaatttcggatggggtttcttcta +ctactcaacaacggaatagtctttgtttagtaaaccagagctcaggacgcccaatacgta +ggagagcgctgtggagcatgtgtcattatggactggagcactcttaaatcactctgcgtg +tgctaaacgatagatcataacatgtcctgagtaaattttcttgatacgtcgcaatatacc +gttattagttaaacgttctcatccgtcatgcgtgaaatacggctgtcgtgctcagatata +ctattagcgactcatctcgcctaacacgcacacgtataaactcggaatgactgccgctct +tacatattagaaatacagactacaccacggaagcattgggtcattctcaaccgctgtata +aaagatgattagtcttataataagattaccaaagaggcagaatcatgggtagtaaatcta +ttattcaagtgattaccgtcgtgtaggcagggagtgaggacgagatggtactcaggacaa +atattaaccggacgaagtggtttacgtcgtactttcactattagtagtaaatacaaggta +acaccggggaatagtactaaatataatgatatctatcttcgggagaacgagtcgtctatt +gctttgaacattctcaaggcgtaaaatgtgctgacttatagcatgatacaaccgattgtt +acttttgtctattcaaaagattgaatagttttttatacaaaagccgcatacttatgacgg +ctagtatacagtttcatcccctagcatcaatgctatggacagtattgaacttataggaaa +ttcttctaatagggcaaatccgtcgtgatgcctattttttttcagtcacatcctcaaatg +gcactagtattgtcgggatcccattaacaggctcaaccacgagctcacgcgaggacatgt +agtccgtatctttaacgaagcgacagcgacagaactcccatggataaccaattataaggc +ccgtaatcctctagacatcgtttaccaataaatccgctttctccgtaatcatgttgaata +ccccagagtagtccagatgataaccgatgaaacacaagtctttctcaatgcacttacggt +gaacttattaccgccaacgtagctcatcaaggttgcgacatctagttgtgtgtttgcgac +gagcccagcgaacttcatcaactttcgtatattcaacgccttgtaattttactttaagac +gcctggtgatgtagattcttagataatcagtttgttatcggctgtactttaccataattt +cacaggtttcaggtcaagaagattatagctgtatatacagttccatgctcggtgcacaga +aacgtgatcggataataatcaatcgcttatgtcgtctttaggcgtatccaatacatgccc +cgataccgcagtgtatttcgacatgtaggtataccgtcgcatttgagctcgagtcaggac +gtcagctagattagattccttaatagaatataccgacctctagtccgaactaaactatag 
+ataacgccaacttcaggttaattgtctagtcgtctgtttgcagatgggattcttagatga +gtgagtatcggccatattggttcgagcactttagtttttgatgcataggatatgcaatgt +atagctgaaagtactttatctgtttcaaactcacattgattaaaccggtaaacctttaaa +gactacaagaaaatattcagtgagggcaattttgtcaatcacaatcttccagctagagat +acttcacaatttgtcttgaggctacgcaacattagacggattttcgcgttttattgaaat +aatcgaggggcccaagagtatccatagttcattttgtaagatttctttacaggcttatta +cagcttcttcagactcctacatgcttacgagttatatgctagcatgtgaacaatagatta +atatacaggaaaacgtacattgagagagatgaccctacacagcgcaaccgttgagtactt +tcattaaagggtaacgctctcgagacagcatccttaagatggccttattgtcaaatcatt +tgcagaagtacgcaagatccctaaccaacgtagaagaatccctacaaacacatgagacgc +ggtgaaaatagacagggtgttagtattcaatcttcggagtatcaatttcgccaatcttgg +tgagaaagcataccctttcttcagagaaagaagatcaatcataacactatctttaacgag +gtacgcacgcgcatcattacctgcctccatggatctttaggatagcggaaagtattggca +gcgtattgtgatttcgttcctactttatcaatttcacattcatatacatgtcttttatca +aaatcgccaataagataggatgagctatattagatgctagtagagttcgcgccaacatca +tcgataggaatactcaggacagcgtgataggacttttcaatccctaatactctctataat +tataactctctcttaagtttggaggcagtaacgcgctctatataatcagtttgctgcacc +attcttcagcctctgatacatacaaataaattccacagcagtaagagggtttaattgaga +catcttgggaacttaggattttactctaacatcaccgaaacgattattggataccgtacc +taaacgaactttctcaaggcagtaatataggacatccgcaataacacaaatgctgcctcc +ccaggagttatgtcttcctggaggctatatcttacacccactcactataggcaaactaaa +gtttaaatgttgattgtctaaaaaaaagatagataagagttggccggcgtagcacatgcg +aaagtgaatcgtaagctataattctctggacttgaagttctgtcctgttcctctgcaaga +aacaaacttcctttaaagctatttacgacgcacatctcagcaagttataaacatgttgga +agtttctagtcggaattcccaaagaacggatctatctaatgcattcctacatttttcctg +tctgccgatggtgccatcctattcaaagaatttcttaaaagtagattaaatgggactttt +aacaatgagtaaccttacgcctctaagggttcctcgagtgccatacaccagtcaggtccg +agccacatacacggagaacattctaacatagcattctcaactcgatcatttgcaggttac +ttctttcctatcctagtgctaaaaatcatacttgcaatcccatagcacggattaagaacc +taagaaacaattcagtaaaacatgttcgaattcttggtatgggaacatcattgcagctat +ggtctaacgcattaatgtttgggtacatcttccatcatataaacaggaagagtctgacga +cagggagtgcttgcgatcatgtctatcattgtgaaatcaaattgtagctcacatgtcgtc 
+tatgagagcgtgtatccgataagatttagaaaaatagaagtcgtataagatctcactgaa +cttttgaatgaatgtgaagcatatatgatctgctttaataaaactttatccataggatac +gtttccaaatcaattcaataattattagtcaaaatagataaggatgaacaacctgaaggc +cgatcggacgtagaaagtggtcccatcactttgagttgatattgttgaaccacacgttat +tatggttttcaaacagtctcaggatattgtatatacagataatccgataccagttgtctg +acgcccctcttacgtaccccaccctttgtgacgtttaaagcagttgttcagtattttaaa +ctaggcggcaactaatttggaaagaagcacagtggatatgtctaaattcttgttattcag +gcctgaatttaatacaccgcatagttaacttcgcggtagagttgttcatcatgcctcctc +taagctaccacttctatgatacaccaatagttgttctacggaatctgataattggccaag +tcataaacttccgctgcgttcaacccccttgctcgaatatccaactcgaaaagacagcct +tttggtgtccggaacaaatcagttacttcttttctgatgttaattctctgtggtcagata +cagaccaaaaactccgcggatttaccatcctccaagaacaaatttgcatcaacatagcat +tttggctacatattctaagtctcaatagtttaggttttcaactacattatcccaacatta +ggattggaggaataatagctgggtaagtccccttgcgtctacaatcgactattttttatg +aatatgcttctgccgcacctatggttattaaaaaagtcatgactttgaagaaccctgaaa +agatagatgaatcaggtgtaatggcagcagccaaagagcatataattagcaacactctaa +gaacattatagatatgatgatagcgatcgtcatgatgttatccggtcacaatagtagctt +catcagctaattcgttttgccagtggtgacttgcgctggaagaatcgttatacggtccct +tccctcttgatacggtgggggcttattcaaccgcgtggattgggttgtcatacttgcatt +aaacgatgtaaaccatctagtagtcaactatactaaatcacaaaatagtgatcaatacat +acccgcttcatggttttaaccatttaattgattaaagatattccgctaagaaccattatc +tacctaaactgatcgccgtatcctagtagtttgaaatttgatgtaccgtaatgatcaacg +aagtaaaacgttatattgtatgtagaataataggtcttggagctaaatgatgtgattggt +agtgaagacttacccttacaactttaccggtttctcggaagaatatactagagaatcaat +gcatgggctacataagcactttagtctaatgagataaaaaatacacgagtcttccatcat +gaattttttgtcgaaaaactcgaacctggtaatttaaaccatatatctttatgtcgtcaa +taactctcatatgttttatataacttcccaatcacgacttgtaactgcttgttcgactga +gctgtttgagctatgaggccgggatccggttgagctacatctatttgctacaagaaaaat +gaaagcacatttgttgggagttctggctacactcatagagaaataagtggcccgagtggg +tgcggcctgcctccatattcaagtgtatcttaaaccaagtggttccaacgctcgcgctaa +agaattaaagcctttatttcctccacggagtagcccgtaatccggttcgaaagagaccat +tgaagttaattttcatatccagtgaagtttaggcacaagcatgtgttctgccacatgcct 
+caaagcgctcttcaaccaagatatgattcatcctaacttcgatgaatgcgtctgtaacat +aaatatagaaggaatgattcggcgagttaattttcgccttctccaacatggcatccctac +gttcgttataaggaccatacatgtaggttttaaaggtttgcggttaatcgatatttacat +catagaaattctatagtcaaatttacaagactctagatactcactcgttgcagccggcta +ggaagcgctttgtaccttacttcccttttcgttgcgtaatatgaatttcatatagtaagt +tcaaggcactcatacctccgtgaagagggtagatagactattaaagttgtttaatagtac +gtattgatggaaatgacccgtaggagatttaccactcaatccacaagattcgctgctgtg +cattatcaaaacagtgcatgtcgaaacatgggttgggtccttcaaacacgaatccaggta +gagatacctttgcaattttt diff --git a/regex-1.8.4/examples/regexdna-output.txt b/regex-1.8.4/examples/regexdna-output.txt new file mode 100644 index 0000000000000..d36baa5be8d2f --- /dev/null +++ b/regex-1.8.4/examples/regexdna-output.txt @@ -0,0 +1,13 @@ +agggtaaa|tttaccct 0 +[cgt]gggtaaa|tttaccc[acg] 3 +a[act]ggtaaa|tttacc[agt]t 9 +ag[act]gtaaa|tttac[agt]ct 8 +agg[act]taaa|ttta[agt]cct 10 +aggg[acg]aaa|ttt[cgt]ccct 3 +agggt[cgt]aa|tt[acg]accct 4 +agggta[cgt]a|t[acg]taccct 3 +agggtaa[cgt]|[acg]ttaccct 5 + +101745 +100000 +133640 diff --git a/regex-1.8.4/examples/shootout-regex-dna-bytes.rs b/regex-1.8.4/examples/shootout-regex-dna-bytes.rs new file mode 100644 index 0000000000000..773fd9ba8d774 --- /dev/null +++ b/regex-1.8.4/examples/shootout-regex-dna-bytes.rs @@ -0,0 +1,68 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! 
regex { + ($re:expr) => { + ::regex::bytes::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = Vec::with_capacity(51 * (1 << 20)); + io::stdin().read_to_end(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (regex!("B"), &b"(c|g|t)"[..]), + (regex!("D"), &b"(a|g|t)"[..]), + (regex!("H"), &b"(a|c|t)"[..]), + (regex!("K"), &b"(g|t)"[..]), + (regex!("M"), &b"(a|c)"[..]), + (regex!("N"), &b"(a|c|g|t)"[..]), + (regex!("R"), &b"(a|g)"[..]), + (regex!("S"), &b"(c|g)"[..]), + (regex!("V"), &b"(a|c|g)"[..]), + (regex!("W"), &b"(a|t)"[..]), + (regex!("Y"), &b"(c|t)"[..]), + ]; + let mut seq = seq; + for (re, replacement) in substs { + seq = re.replace_all(&seq, replacement).into_owned(); + } + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/regex-1.8.4/examples/shootout-regex-dna-cheat.rs b/regex-1.8.4/examples/shootout-regex-dna-cheat.rs new file mode 100644 index 0000000000000..1bde7ab1ff07f --- /dev/null +++ b/regex-1.8.4/examples/shootout-regex-dna-cheat.rs @@ -0,0 +1,90 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the 
Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +// This technically solves the problem posed in the `regex-dna` benchmark, but +// it cheats by combining all of the replacements into a single regex and +// replacing them with a single linear scan. i.e., it re-implements +// `replace_all`. As a result, this is around 25% faster. ---AG + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (b'B', "(c|g|t)"), + (b'D', "(a|g|t)"), + (b'H', "(a|c|t)"), + (b'K', "(g|t)"), + (b'M', "(a|c)"), + (b'N', "(a|c|g|t)"), + (b'R', "(a|g)"), + (b'S', "(c|g)"), + (b'V', "(a|c|g)"), + (b'W', "(a|t)"), + (b'Y', "(c|t)"), + ]; // combined into one regex in `replace_all` + let seq = replace_all(&seq, substs); + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} + +fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { + let mut replacements = vec![""; 256]; + let mut 
alternates = vec![]; + for (re, replacement) in substs { + replacements[re as usize] = replacement; + alternates.push((re as char).to_string()); + } + + let re = regex!(&alternates.join("|")); + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); + } + new.push_str(&text[last_match..]); + new +} diff --git a/regex-1.8.4/examples/shootout-regex-dna-replace.rs b/regex-1.8.4/examples/shootout-regex-dna-replace.rs new file mode 100644 index 0000000000000..20694e06f307d --- /dev/null +++ b/regex-1.8.4/examples/shootout-regex-dna-replace.rs @@ -0,0 +1,17 @@ +use std::io::{self, Read}; + +macro_rules! regex { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).build().unwrap().into_regex() + }}; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + println!("original: {}, replaced: {}", ilen, seq.len()); +} diff --git a/regex-1.8.4/examples/shootout-regex-dna-single-cheat.rs b/regex-1.8.4/examples/shootout-regex-dna-single-cheat.rs new file mode 100644 index 0000000000000..70a979c6d445c --- /dev/null +++ b/regex-1.8.4/examples/shootout-regex-dna-single-cheat.rs @@ -0,0 +1,75 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; + +macro_rules! 
regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + for re in variants { + println!("{} {}", re.to_string(), re.find_iter(&seq).count()); + } + + let substs = vec![ + (b'B', "(c|g|t)"), + (b'D', "(a|g|t)"), + (b'H', "(a|c|t)"), + (b'K', "(g|t)"), + (b'M', "(a|c)"), + (b'N', "(a|c|g|t)"), + (b'R', "(a|g)"), + (b'S', "(c|g)"), + (b'V', "(a|c|g)"), + (b'W', "(a|t)"), + (b'Y', "(c|t)"), + ]; // combined into one regex in `replace_all` + let seq = replace_all(&seq, substs); + + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} + +fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { + let mut replacements = vec![""; 256]; + let mut alternates = vec![]; + for (re, replacement) in substs { + replacements[re as usize] = replacement; + alternates.push((re as char).to_string()); + } + + let re = regex!(&alternates.join("|")); + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); + } + new.push_str(&text[last_match..]); + new +} diff --git a/regex-1.8.4/examples/shootout-regex-dna-single.rs b/regex-1.8.4/examples/shootout-regex-dna-single.rs new file mode 100644 index 0000000000000..b4740596004e7 --- /dev/null +++ 
b/regex-1.8.4/examples/shootout-regex-dna-single.rs @@ -0,0 +1,57 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + for re in variants { + println!("{} {}", re.to_string(), re.find_iter(&seq).count()); + } + + let substs = vec![ + (regex!("B"), "(c|g|t)"), + (regex!("D"), "(a|g|t)"), + (regex!("H"), "(a|c|t)"), + (regex!("K"), "(g|t)"), + (regex!("M"), "(a|c)"), + (regex!("N"), "(a|c|g|t)"), + (regex!("R"), "(a|g)"), + (regex!("S"), "(c|g)"), + (regex!("V"), "(a|c|g)"), + (regex!("W"), "(a|t)"), + (regex!("Y"), "(c|t)"), + ]; + let mut seq = seq; + for (re, replacement) in substs { + seq = re.replace_all(&seq, replacement).into_owned(); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/regex-1.8.4/examples/shootout-regex-dna.rs b/regex-1.8.4/examples/shootout-regex-dna.rs new file mode 100644 index 0000000000000..b96518e4c4325 --- /dev/null +++ b/regex-1.8.4/examples/shootout-regex-dna.rs @@ -0,0 +1,68 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// 
contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(51 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (regex!("B"), "(c|g|t)"), + (regex!("D"), "(a|g|t)"), + (regex!("H"), "(a|c|t)"), + (regex!("K"), "(g|t)"), + (regex!("M"), "(a|c)"), + (regex!("N"), "(a|c|g|t)"), + (regex!("R"), "(a|g)"), + (regex!("S"), "(c|g)"), + (regex!("V"), "(a|c|g)"), + (regex!("W"), "(a|t)"), + (regex!("Y"), "(c|t)"), + ]; + let mut seq = seq; + for (re, replacement) in substs { + seq = re.replace_all(&seq, replacement).into_owned(); + } + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/regex-1.8.4/record/README.md b/regex-1.8.4/record/README.md new file mode 100644 index 0000000000000..432b06ab9a64a --- /dev/null +++ b/regex-1.8.4/record/README.md @@ -0,0 +1,4 @@ +This directory contains various recordings of results. 
These are committed to +the repository so that they can be compared over time. (At the time of writing, +there is no tooling for facilitating this comparison. It has to be done +manually.) diff --git a/regex-1.8.4/record/compile-test/2023-04-19_1.7.3.csv b/regex-1.8.4/record/compile-test/2023-04-19_1.7.3.csv new file mode 100644 index 0000000000000..af62da10acf13 --- /dev/null +++ b/regex-1.8.4/record/compile-test/2023-04-19_1.7.3.csv @@ -0,0 +1,11 @@ +name,crate,revision,profile,duration,size,relative-size +regex__dev__std_perf_unicode,regex,9582040009,dev,1.824209152s,3434992,3113064 +regex__dev__std,regex,9582040009,dev,1.206314935s,1362392,1040464 +regex__dev__std_perf,regex,9582040009,dev,1.543583435s,2726384,2404456 +regex__dev__std_unicode,regex,9582040009,dev,1.490095643s,2066904,1744976 +regex__dev__std_unicode-case_unicode-perl,regex,9582040009,dev,1.292011694s,1812952,1491024 +regex__release__std_perf_unicode,regex,9582040009,release,2.398133563s,1616216,1294368 +regex__release__std,regex,9582040009,release,1.413680252s,694592,372744 +regex__release__std_perf,regex,9582040009,release,2.341496191s,1124696,802848 +regex__release__std_unicode,regex,9582040009,release,1.671407822s,1190208,868360 +regex__release__std_unicode-case_unicode-perl,regex,9582040009,release,1.441712198s,932160,610312 diff --git a/regex-1.8.4/record/compile-test/2023-04-20_master.csv b/regex-1.8.4/record/compile-test/2023-04-20_master.csv new file mode 100644 index 0000000000000..4c3e916740d72 --- /dev/null +++ b/regex-1.8.4/record/compile-test/2023-04-20_master.csv @@ -0,0 +1,11 @@ +name,crate,revision,profile,duration,size,relative-size +regex__dev__std_perf_unicode,regex,f1f99af2bc,dev,1.834267609s,3799536,3477608 +regex__dev__std,regex,f1f99af2bc,dev,1.263958602s,1427928,1106000 +regex__dev__std_perf,regex,f1f99af2bc,dev,1.631302845s,3234288,2912360 +regex__dev__std_unicode,regex,f1f99af2bc,dev,1.550536696s,1997272,1675344 
+regex__dev__std_unicode-case_unicode-perl,regex,f1f99af2bc,dev,1.341622852s,1739224,1417296 +regex__release__std_perf_unicode,regex,f1f99af2bc,release,2.475080323s,1755480,1433632 +regex__release__std,regex,f1f99af2bc,release,1.45990031s,731456,409608 +regex__release__std_perf,regex,f1f99af2bc,release,2.421787211s,1259864,938016 +regex__release__std_unicode,regex,f1f99af2bc,release,1.693972619s,1227072,905224 +regex__release__std_unicode-case_unicode-perl,regex,f1f99af2bc,release,1.528003306s,969024,647176 diff --git a/regex-1.8.4/record/compile-test/README.md b/regex-1.8.4/record/compile-test/README.md new file mode 100644 index 0000000000000..7291d5d376d02 --- /dev/null +++ b/regex-1.8.4/record/compile-test/README.md @@ -0,0 +1,27 @@ +This directory contains the results of compilation tests. Specifically, +the results are from testing both the from scratch compilation time and +relative binary size increases of various features for both the `regex` and +`regex-automata` crates. + +Here's an example of how to run these tests for just the `regex` crate. You'll +need the `regex-cli` command installed, which can be found in the `regex-cli` +directory in the root of this repository. + +This must be run in the root of a checkout of this repository. + +``` +$ mkdir /tmp/regex-compile-test +$ regex-cli compile-test ./ /tmp/regex-compile-test | tee record/compile-test/2023-04-19_1.7.3.csv +``` + +You can then look at the results using a tool like [`xsv`][xsv]: + +``` +$ xsv table record/compile-test/2023-04-19_1.7.3.csv +``` + +Note that the relative binary size is computed by building a "baseline" hello +world program, and then subtracting that from the size of a binary that uses +the regex crate. 
+ +[xsv]: https://github.com/BurntSushi/xsv diff --git a/regex-1.8.4/rustfmt.toml b/regex-1.8.4/rustfmt.toml new file mode 100644 index 0000000000000..aa37a218b97e5 --- /dev/null +++ b/regex-1.8.4/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/regex-1.8.4/src/backtrack.rs b/regex-1.8.4/src/backtrack.rs new file mode 100644 index 0000000000000..4d83856ca004b --- /dev/null +++ b/regex-1.8.4/src/backtrack.rs @@ -0,0 +1,282 @@ +// This is the backtracking matching engine. It has the same exact capability +// as the full NFA simulation, except it is artificially restricted to small +// regexes on small inputs because of its memory requirements. +// +// In particular, this is a *bounded* backtracking engine. It retains worst +// case linear time by keeping track of the states that it has visited (using a +// bitmap). Namely, once a state is visited, it is never visited again. Since a +// state is keyed by `(instruction index, input index)`, we have that its time +// complexity is `O(mn)` (i.e., linear in the size of the search text). +// +// The backtracking engine can beat out the NFA simulation on small +// regexes/inputs because it doesn't have to keep track of multiple copies of +// the capture groups. In benchmarks, the backtracking engine is roughly twice +// as fast as the full NFA simulation. Note though that its performance doesn't +// scale, even if you're willing to live with the memory requirements. Namely, +// the bitset has to be zeroed on each execution, which becomes quite expensive +// on large bitsets. + +use crate::exec::ProgramCache; +use crate::input::{Input, InputAt}; +use crate::prog::{InstPtr, Program}; +use crate::re_trait::Slot; + +type Bits = u32; + +const BIT_SIZE: usize = 32; +const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB + +/// Returns true iff the given regex and input should be executed by this +/// engine with reasonable memory usage. 
+pub fn should_exec(num_insts: usize, text_len: usize) -> bool { + // Total memory usage in bytes is determined by: + // + // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) + // + // The actual limit picked is pretty much a heuristic. + // See: https://github.com/rust-lang/regex/issues/215 + let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; + size <= MAX_SIZE_BYTES +} + +/// A backtracking matching engine. +#[derive(Debug)] +pub struct Bounded<'a, 'm, 'r, 's, I> { + prog: &'r Program, + input: I, + matches: &'m mut [bool], + slots: &'s mut [Slot], + m: &'a mut Cache, +} + +/// Shared cached state between multiple invocations of a backtracking engine +/// in the same thread. +#[derive(Clone, Debug)] +pub struct Cache { + jobs: Vec, + visited: Vec, +} + +impl Cache { + /// Create new empty cache for the backtracking engine. + pub fn new(_prog: &Program) -> Self { + Cache { jobs: vec![], visited: vec![] } + } +} + +/// A job is an explicit unit of stack space in the backtracking engine. +/// +/// The "normal" representation is a single state transition, which corresponds +/// to an NFA state and a character in the input. However, the backtracking +/// engine must keep track of old capture group values. We use the explicit +/// stack to do it. +#[derive(Clone, Copy, Debug)] +enum Job { + Inst { ip: InstPtr, at: InputAt }, + SaveRestore { slot: usize, old_pos: Option }, +} + +impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { + /// Execute the backtracking matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. 
+ pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &'m mut [bool], + slots: &'s mut [Slot], + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.backtrack; + let start = input.at(start); + let mut b = Bounded { prog, input, matches, slots, m: cache }; + b.exec_(start, end) + } + + /// Clears the cache such that the backtracking engine can be executed + /// on some input of fixed length. + fn clear(&mut self) { + // Reset the job memory so that we start fresh. + self.m.jobs.clear(); + + // Now we need to clear the bit state set. + // We do this by figuring out how much space we need to keep track + // of the states we've visited. + // Then we reset all existing allocated space to 0. + // Finally, we request more space if we need it. + // + // This is all a little circuitous, but doing this using unchecked + // operations doesn't seem to have a measurable impact on performance. + // (Probably because backtracking is limited to such small + // inputs/regexes in the first place.) + let visited_len = + (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) + / BIT_SIZE; + self.m.visited.truncate(visited_len); + for v in &mut self.m.visited { + *v = 0; + } + if visited_len > self.m.visited.len() { + let len = self.m.visited.len(); + self.m.visited.reserve_exact(visited_len - len); + for _ in 0..(visited_len - len) { + self.m.visited.push(0); + } + } + } + + /// Start backtracking at the given position in the input, but also look + /// for literal prefixes. + fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { + self.clear(); + // If this is an anchored regex at the beginning of the input, then + // we're either already done or we only need to try backtracking once. 
+ if self.prog.is_anchored_start { + return if !at.is_start() { false } else { self.backtrack(at) }; + } + let mut matched = false; + loop { + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + matched = self.backtrack(at) || matched; + if matched && self.prog.matches.len() == 1 { + return true; + } + if at.pos() >= end { + break; + } + at = self.input.at(at.next_pos()); + } + matched + } + + /// The main backtracking loop starting at the given input position. + fn backtrack(&mut self, start: InputAt) -> bool { + // N.B. We use an explicit stack to avoid recursion. + // To avoid excessive pushing and popping, most transitions are handled + // in the `step` helper function, which only pushes to the stack when + // there's a capture or a branch. + let mut matched = false; + self.m.jobs.push(Job::Inst { ip: 0, at: start }); + while let Some(job) = self.m.jobs.pop() { + match job { + Job::Inst { ip, at } => { + if self.step(ip, at) { + // Only quit if we're matching one regex. + // If we're matching a regex set, then mush on and + // try to find other matches (if we want them). + if self.prog.matches.len() == 1 { + return true; + } + matched = true; + } + } + Job::SaveRestore { slot, old_pos } => { + if slot < self.slots.len() { + self.slots[slot] = old_pos; + } + } + } + } + matched + } + + fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { + use crate::prog::Inst::*; + loop { + // This loop is an optimization to avoid constantly pushing/popping + // from the stack. Namely, if we're pushing a job only to run it + // next, avoid the push and just mutate `ip` (and possibly `at`) + // in place. 
+ if self.has_visited(ip, at) { + return false; + } + match self.prog[ip] { + Match(slot) => { + if slot < self.matches.len() { + self.matches[slot] = true; + } + return true; + } + Save(ref inst) => { + if let Some(&old_pos) = self.slots.get(inst.slot) { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. + self.m.jobs.push(Job::SaveRestore { + slot: inst.slot, + old_pos, + }); + self.slots[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); + ip = inst.goto1; + } + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } else { + return false; + } + } + Char(ref inst) => { + if inst.c == at.char() { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + continue; + } + } + return false; + } + } + } + } + + fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { + let k = ip * (self.input.len() + 1) + at.pos(); + let k1 = k / BIT_SIZE; + let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); + if self.m.visited[k1] & k2 == 0 { + self.m.visited[k1] |= k2; + false + } else { + true + } + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} diff --git a/regex-1.8.4/src/compile.rs b/regex-1.8.4/src/compile.rs new file mode 100644 index 0000000000000..23e63ec8914f8 --- /dev/null +++ b/regex-1.8.4/src/compile.rs @@ -0,0 +1,1333 @@ +use std::collections::HashMap; +use std::fmt; +use 
std::iter; +use std::result; +use std::sync::Arc; + +use regex_syntax::hir::{self, Hir, Look}; +use regex_syntax::is_word_byte; +use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; + +use crate::prog::{ + EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, + InstSave, InstSplit, Program, +}; + +use crate::Error; + +type Result = result::Result; +type ResultOrEmpty = result::Result, Error>; + +#[derive(Debug)] +struct Patch { + hole: Hole, + entry: InstPtr, +} + +/// A compiler translates a regular expression AST to a sequence of +/// instructions. The sequence of instructions represents an NFA. +// `Compiler` is only public via the `internal` module, so avoid deriving +// `Debug`. +#[allow(missing_debug_implementations)] +pub struct Compiler { + insts: Vec, + compiled: Program, + capture_name_idx: HashMap, + num_exprs: usize, + size_limit: usize, + suffix_cache: SuffixCache, + utf8_seqs: Option, + byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. + extra_inst_bytes: usize, +} + +impl Compiler { + /// Create a new regular expression compiler. + /// + /// Various options can be set before calling `compile` on an expression. 
+ pub fn new() -> Self { + Compiler { + insts: vec![], + compiled: Program::new(), + capture_name_idx: HashMap::new(), + num_exprs: 0, + size_limit: 10 * (1 << 20), + suffix_cache: SuffixCache::new(1000), + utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), + byte_classes: ByteClassSet::new(), + extra_inst_bytes: 0, + } + } + + /// The size of the resulting program is limited by size_limit. If + /// the program approximately exceeds the given size (in bytes), then + /// compilation will stop and return an error. + pub fn size_limit(mut self, size_limit: usize) -> Self { + self.size_limit = size_limit; + self + } + + /// If bytes is true, then the program is compiled as a byte based + /// automaton, which incorporates UTF-8 decoding into the machine. If it's + /// false, then the automaton is Unicode scalar value based, e.g., an + /// engine utilizing such an automaton is responsible for UTF-8 decoding. + /// + /// The specific invariant is that when returning a byte based machine, + /// the neither the `Char` nor `Ranges` instructions are produced. + /// Conversely, when producing a Unicode scalar value machine, the `Bytes` + /// instruction is never produced. + /// + /// Note that `dfa(true)` implies `bytes(true)`. + pub fn bytes(mut self, yes: bool) -> Self { + self.compiled.is_bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.compiled.only_utf8 = yes; + self + } + + /// When set, the machine returned is suitable for use in the DFA matching + /// engine. + /// + /// In particular, this ensures that if the regex is not anchored in the + /// beginning, then a preceding `.*?` is included in the program. (The NFA + /// based engines handle the preceding `.*?` explicitly, which is difficult + /// or impossible in the DFA engine.) 
+ pub fn dfa(mut self, yes: bool) -> Self { + self.compiled.is_dfa = yes; + self + } + + /// When set, the machine returned is suitable for matching text in + /// reverse. In particular, all concatenations are flipped. + pub fn reverse(mut self, yes: bool) -> Self { + self.compiled.is_reverse = yes; + self + } + + /// Compile a regular expression given its AST. + /// + /// The compiler is guaranteed to succeed unless the program exceeds the + /// specified size limit. If the size limit is exceeded, then compilation + /// stops and returns an error. + pub fn compile(mut self, exprs: &[Hir]) -> result::Result { + debug_assert!(!exprs.is_empty()); + self.num_exprs = exprs.len(); + if exprs.len() == 1 { + self.compile_one(&exprs[0]) + } else { + self.compile_many(exprs) + } + } + + fn compile_one(mut self, expr: &Hir) -> result::Result { + if self.compiled.only_utf8 + && expr.properties().look_set().contains(Look::WordAsciiNegate) + { + return Err(Error::Syntax( + "ASCII-only \\B is not allowed in Unicode regexes \ + because it may result in invalid UTF-8 matches" + .to_string(), + )); + } + // If we're compiling a forward DFA and we aren't anchored, then + // add a `.*?` before the first capture group. + // Other matching engines handle this by baking the logic into the + // matching engine itself. 
+ let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; + self.compiled.is_anchored_start = + expr.properties().look_set_prefix().contains(Look::Start); + self.compiled.is_anchored_end = + expr.properties().look_set_suffix().contains(Look::End); + if self.compiled.needs_dotstar() { + dotstar_patch = self.c_dotstar()?; + self.compiled.start = dotstar_patch.entry; + } + self.compiled.captures = vec![None]; + let patch = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); + if self.compiled.needs_dotstar() { + self.fill(dotstar_patch.hole, patch.entry); + } else { + self.compiled.start = patch.entry; + } + self.fill_to_next(patch.hole); + self.compiled.matches = vec![self.insts.len()]; + self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_explicit_captures_len(); + self.compile_finish() + } + + fn compile_many( + mut self, + exprs: &[Hir], + ) -> result::Result { + debug_assert!(exprs.len() > 1); + + self.compiled.is_anchored_start = exprs + .iter() + .all(|e| e.properties().look_set_prefix().contains(Look::Start)); + self.compiled.is_anchored_end = exprs + .iter() + .all(|e| e.properties().look_set_suffix().contains(Look::End)); + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; + if self.compiled.needs_dotstar() { + dotstar_patch = self.c_dotstar()?; + self.compiled.start = dotstar_patch.entry; + } else { + self.compiled.start = 0; // first instruction is always split + } + self.fill_to_next(dotstar_patch.hole); + + let mut prev_hole = Hole::None; + for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let Patch { hole, entry } = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + prev_hole = self.fill_split(split, Some(entry), None); + } + let i = exprs.len() - 1; + let Patch { 
hole, entry } = + self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); + self.fill(prev_hole, entry); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + self.compile_finish() + } + + fn compile_finish(mut self) -> result::Result { + self.compiled.insts = + self.insts.into_iter().map(|inst| inst.unwrap()).collect(); + self.compiled.byte_classes = self.byte_classes.byte_classes(); + self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); + Ok(self.compiled) + } + + /// Compile expr into self.insts, returning a patch on success, + /// or an error if we run out of memory. + /// + /// All of the c_* methods of the compiler share the contract outlined + /// here. + /// + /// The main thing that a c_* method does is mutate `self.insts` + /// to add a list of mostly compiled instructions required to execute + /// the given expression. `self.insts` contains MaybeInsts rather than + /// Insts because there is some backpatching required. + /// + /// The `Patch` value returned by each c_* method provides metadata + /// about the compiled instructions emitted to `self.insts`. The + /// `entry` member of the patch refers to the first instruction + /// (the entry point), while the `hole` member contains zero or + /// more offsets to partial instructions that need to be backpatched. + /// The c_* routine can't know where its list of instructions are going to + /// jump to after execution, so it is up to the caller to patch + /// these jumps to point to the right place. So compiling some + /// expression, e, we would end up with a situation that looked like: + /// + /// ```text + /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] 
+ /// ^ ^ ^ + /// | \ / + /// entry \ / + /// hole + /// ``` + /// + /// To compile two expressions, e1 and e2, concatenated together we + /// would do: + /// + /// ```ignore + /// let patch1 = self.c(e1); + /// let patch2 = self.c(e2); + /// ``` + /// + /// while leaves us with a situation that looks like + /// + /// ```text + /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] + /// ^ ^ ^ ^ + /// | | | | + /// entry1 hole1 entry2 hole2 + /// ``` + /// + /// Then to merge the two patches together into one we would backpatch + /// hole1 with entry2 and return a new patch that enters at entry1 + /// and has hole2 for a hole. In fact, if you look at the c_concat + /// method you will see that it does exactly this, though it handles + /// a list of expressions rather than just the two that we use for + /// an example. + /// + /// Ok(None) is returned when an expression is compiled to no + /// instruction, and so no patch.entry value makes sense. + fn c(&mut self, expr: &Hir) -> ResultOrEmpty { + use crate::prog; + use regex_syntax::hir::HirKind::*; + + self.check_size()?; + match *expr.kind() { + Empty => self.c_empty(), + Literal(hir::Literal(ref bytes)) => { + if self.compiled.is_reverse { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + self.c_literal(&bytes) + } else { + self.c_literal(bytes) + } + } + Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), + Class(hir::Class::Bytes(ref cls)) => { + if self.compiled.uses_bytes() { + self.c_class_bytes(cls.ranges()) + } else { + assert!(cls.is_ascii()); + let mut char_ranges = vec![]; + for r in cls.iter() { + let (s, e) = (r.start() as char, r.end() as char); + char_ranges.push(hir::ClassUnicodeRange::new(s, e)); + } + self.c_class(&char_ranges) + } + } + Look(ref look) => match *look { + hir::Look::Start if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) + } + hir::Look::Start => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End if 
self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), + hir::Look::StartLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartCRLF | hir::Look::EndCRLF => { + return Err(Error::Syntax( + "CRLF-aware line anchors are not supported yet" + .to_string(), + )); + } + hir::Look::WordAscii => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + hir::Look::WordAsciiNegate => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + hir::Look::WordUnicode => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. 
+ self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) + } + hir::Look::WordUnicodeNegate => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) + } + }, + Capture(hir::Capture { index, ref name, ref sub }) => { + if index as usize >= self.compiled.captures.len() { + let name = match *name { + None => None, + Some(ref boxed_str) => Some(boxed_str.to_string()), + }; + self.compiled.captures.push(name.clone()); + if let Some(name) = name { + self.capture_name_idx.insert(name, index as usize); + } + } + self.c_capture(2 * index as usize, sub) + } + Concat(ref es) => { + if self.compiled.is_reverse { + self.c_concat(es.iter().rev()) + } else { + self.c_concat(es) + } + } + Alternation(ref es) => self.c_alternate(&**es), + Repetition(ref rep) => self.c_repeat(rep), + } + } + + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). + self.extra_inst_bytes += std::mem::size_of::(); + Ok(None) + } + + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { + if self.num_exprs > 1 || self.compiled.is_dfa { + // Don't ever compile Save instructions for regex sets because + // they are never used. 
They are also never used in DFA programs + // because DFAs can't handle captures. + self.c(expr) + } else { + let entry = self.insts.len(); + let hole = self.push_hole(InstHole::Save { slot: first_slot }); + let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); + self.fill(hole, patch.entry); + self.fill_to_next(patch.hole); + let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); + Ok(Some(Patch { hole, entry })) + } + } + + fn c_dotstar(&mut self) -> Result { + let hir = if self.compiled.only_utf8() { + Hir::dot(hir::Dot::AnyChar) + } else { + Hir::dot(hir::Dot::AnyByte) + }; + Ok(self + .c(&Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: false, + sub: Box::new(hir), + }))? + .unwrap()) + } + + fn c_char(&mut self, c: char) -> ResultOrEmpty { + if self.compiled.uses_bytes() { + if c.is_ascii() { + let b = c as u8; + let hole = + self.push_hole(InstHole::Bytes { start: b, end: b }); + self.byte_classes.set_range(b, b); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } else { + self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) + } + } else { + let hole = self.push_hole(InstHole::Char { c }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + } + + fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { + use std::mem::size_of; + + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } + if self.compiled.uses_bytes() { + Ok(Some(CompileClass { c: self, ranges }.compile()?)) + } else { + let ranges: Vec<(char, char)> = + ranges.iter().map(|r| (r.start(), r.end())).collect(); + let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { + self.push_hole(InstHole::Char { c: ranges[0].0 }) + } else { + self.extra_inst_bytes += + ranges.len() * (size_of::() * 2); + self.push_hole(InstHole::Ranges { ranges }) + }; + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + } + + fn c_byte(&mut self, b: u8) -> ResultOrEmpty 
{ + self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) + } + + fn c_class_bytes( + &mut self, + ranges: &[hir::ClassBytesRange], + ) -> ResultOrEmpty { + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } + + let first_split_entry = self.insts.len(); + let mut holes = vec![]; + let mut prev_hole = Hole::None; + for r in &ranges[0..ranges.len() - 1] { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let next = self.insts.len(); + self.byte_classes.set_range(r.start(), r.end()); + holes.push(self.push_hole(InstHole::Bytes { + start: r.start(), + end: r.end(), + })); + prev_hole = self.fill_split(split, Some(next), None); + } + let next = self.insts.len(); + let r = &ranges[ranges.len() - 1]; + self.byte_classes.set_range(r.start(), r.end()); + holes.push( + self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), + ); + self.fill(prev_hole, next); + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) + } + + fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { + let hole = self.push_hole(InstHole::EmptyLook { look }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + + fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { + match core::str::from_utf8(bytes) { + Ok(string) => { + let mut it = string.chars(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(ch) => { + if let Some(p) = self.c_char(ch)? { + break p; + } + } + } + }; + for ch in it { + if let Some(p) = self.c_char(ch)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + Err(_) => { + assert!(self.compiled.uses_bytes()); + let mut it = bytes.iter().copied(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(byte) => { + if let Some(p) = self.c_byte(byte)? 
{ + break p; + } + } + } + }; + for byte in it { + if let Some(p) = self.c_byte(byte)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + } + } + + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty + where + I: IntoIterator, + { + let mut exprs = exprs.into_iter(); + let Patch { mut hole, entry } = loop { + match exprs.next() { + None => return self.c_empty(), + Some(e) => { + if let Some(p) = self.c(e)? { + break p; + } + } + } + }; + for e in exprs { + if let Some(p) = self.c(e)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + + fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { + debug_assert!( + exprs.len() >= 2, + "alternates must have at least 2 exprs" + ); + + // Initial entry point is always the first split. + let first_split_entry = self.insts.len(); + + // Save up all of the holes from each alternate. They will all get + // patched to point to the same location. + let mut holes = vec![]; + + // true indicates that the hole is a split where we want to fill + // the second branch. + let mut prev_hole = (Hole::None, false); + for e in &exprs[0..exprs.len() - 1] { + if prev_hole.1 { + let next = self.insts.len(); + self.fill_split(prev_hole.0, None, Some(next)); + } else { + self.fill_to_next(prev_hole.0); + } + let split = self.push_split_hole(); + if let Some(Patch { hole, entry }) = self.c(e)? { + holes.push(hole); + prev_hole = (self.fill_split(split, Some(entry), None), false); + } else { + let (split1, split2) = split.dup_one(); + holes.push(split1); + prev_hole = (split2, true); + } + } + if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { + holes.push(hole); + if prev_hole.1 { + self.fill_split(prev_hole.0, None, Some(entry)); + } else { + self.fill(prev_hole.0, entry); + } + } else { + // We ignore prev_hole.1. 
When it's true, it means we have two + // empty branches both pushing prev_hole.0 into holes, so both + // branches will go to the same place anyway. + holes.push(prev_hole.0); + } + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) + } + + fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { + match (rep.min, rep.max) { + (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy), + (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy), + (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy), + (min, None) => { + self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min) + } + (min, Some(max)) => { + self.c_repeat_range(&rep.sub, rep.greedy, min, max) + } + } + } + + fn c_repeat_zero_or_one( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let split_entry = self.insts.len(); + let split = self.push_split_hole(); + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + let holes = vec![hole_rep, split_hole]; + Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) + } + + fn c_repeat_zero_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let split_entry = self.insts.len(); + let split = self.push_split_hole(); + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? 
{ + Some(p) => p, + None => return self.pop_split_hole(), + }; + + self.fill(hole_rep, split_entry); + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + Ok(Some(Patch { hole: split_hole, entry: split_entry })) + } + + fn c_repeat_one_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return Ok(None), + }; + self.fill_to_next(hole_rep); + let split = self.push_split_hole(); + + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + Ok(Some(Patch { hole: split_hole, entry: entry_rep })) + } + + fn c_repeat_range_min_or_more( + &mut self, + expr: &Hir, + greedy: bool, + min: u32, + ) -> ResultOrEmpty { + let min = u32_to_usize(min); + // Using next_inst() is ok, because we can't return it (concat would + // have to return Some(_) while c_repeat_range_min_or_more returns + // None). + let patch_concat = self + .c_concat(iter::repeat(expr).take(min))? + .unwrap_or_else(|| self.next_inst()); + if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { + self.fill(patch_concat.hole, patch_rep.entry); + Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) + } else { + Ok(None) + } + } + + fn c_repeat_range( + &mut self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> ResultOrEmpty { + let (min, max) = (u32_to_usize(min), u32_to_usize(max)); + debug_assert!(min <= max); + let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; + if min == max { + return Ok(patch_concat); + } + // Same reasoning as in c_repeat_range_min_or_more (we know that min < + // max at this point). 
+ let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); + let initial_entry = patch_concat.entry; + // It is much simpler to compile, e.g., `a{2,5}` as: + // + // aaa?a?a? + // + // But you end up with a sequence of instructions like this: + // + // 0: 'a' + // 1: 'a', + // 2: split(3, 4) + // 3: 'a' + // 4: split(5, 6) + // 5: 'a' + // 6: split(7, 8) + // 7: 'a' + // 8: MATCH + // + // This is *incredibly* inefficient because the splits end + // up forming a chain, which has to be resolved everything a + // transition is followed. + let mut holes = vec![]; + let mut prev_hole = patch_concat.hole; + for _ in min..max { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let Patch { hole, entry } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + prev_hole = hole; + if greedy { + holes.push(self.fill_split(split, Some(entry), None)); + } else { + holes.push(self.fill_split(split, None, Some(entry))); + } + } + holes.push(prev_hole); + Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) + } + + /// Can be used as a default value for the c_* functions when the call to + /// c_function is followed by inserting at least one instruction that is + /// always executed after the ones written by the c* function. 
+ fn next_inst(&self) -> Patch { + Patch { hole: Hole::None, entry: self.insts.len() } + } + + fn fill(&mut self, hole: Hole, goto: InstPtr) { + match hole { + Hole::None => {} + Hole::One(pc) => { + self.insts[pc].fill(goto); + } + Hole::Many(holes) => { + for hole in holes { + self.fill(hole, goto); + } + } + } + } + + fn fill_to_next(&mut self, hole: Hole) { + let next = self.insts.len(); + self.fill(hole, next); + } + + fn fill_split( + &mut self, + hole: Hole, + goto1: Option, + goto2: Option, + ) -> Hole { + match hole { + Hole::None => Hole::None, + Hole::One(pc) => match (goto1, goto2) { + (Some(goto1), Some(goto2)) => { + self.insts[pc].fill_split(goto1, goto2); + Hole::None + } + (Some(goto1), None) => { + self.insts[pc].half_fill_split_goto1(goto1); + Hole::One(pc) + } + (None, Some(goto2)) => { + self.insts[pc].half_fill_split_goto2(goto2); + Hole::One(pc) + } + (None, None) => unreachable!( + "at least one of the split \ + holes must be filled" + ), + }, + Hole::Many(holes) => { + let mut new_holes = vec![]; + for hole in holes { + new_holes.push(self.fill_split(hole, goto1, goto2)); + } + if new_holes.is_empty() { + Hole::None + } else if new_holes.len() == 1 { + new_holes.pop().unwrap() + } else { + Hole::Many(new_holes) + } + } + } + } + + fn push_compiled(&mut self, inst: Inst) { + self.insts.push(MaybeInst::Compiled(inst)); + } + + fn push_hole(&mut self, inst: InstHole) -> Hole { + let hole = self.insts.len(); + self.insts.push(MaybeInst::Uncompiled(inst)); + Hole::One(hole) + } + + fn push_split_hole(&mut self) -> Hole { + let hole = self.insts.len(); + self.insts.push(MaybeInst::Split); + Hole::One(hole) + } + + fn pop_split_hole(&mut self) -> ResultOrEmpty { + self.insts.pop(); + Ok(None) + } + + fn check_size(&self) -> result::Result<(), Error> { + use std::mem::size_of; + + let size = + self.extra_inst_bytes + (self.insts.len() * size_of::()); + if size > self.size_limit { + Err(Error::CompiledTooBig(self.size_limit)) + } else { + Ok(()) + } 
+ } +} + +#[derive(Debug)] +enum Hole { + None, + One(InstPtr), + Many(Vec), +} + +impl Hole { + fn dup_one(self) -> (Self, Self) { + match self { + Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), + Hole::None | Hole::Many(_) => { + unreachable!("must be called on single hole") + } + } + } +} + +#[derive(Clone, Debug)] +enum MaybeInst { + Compiled(Inst), + Uncompiled(InstHole), + Split, + Split1(InstPtr), + Split2(InstPtr), +} + +impl MaybeInst { + fn fill(&mut self, goto: InstPtr) { + let maybeinst = match *self { + MaybeInst::Split => MaybeInst::Split1(goto), + MaybeInst::Uncompiled(ref inst) => { + MaybeInst::Compiled(inst.fill(goto)) + } + MaybeInst::Split1(goto1) => { + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1, + goto2: goto, + })) + } + MaybeInst::Split2(goto2) => { + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto, + goto2, + })) + } + _ => unreachable!( + "not all instructions were compiled! \ + found uncompiled instruction: {:?}", + self + ), + }; + *self = maybeinst; + } + + fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { + let filled = match *self { + MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Compiled(filled); + } + + fn half_fill_split_goto1(&mut self, goto1: InstPtr) { + let half_filled = match *self { + MaybeInst::Split => goto1, + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Split1(half_filled); + } + + fn half_fill_split_goto2(&mut self, goto2: InstPtr) { + let half_filled = match *self { + MaybeInst::Split => goto2, + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Split2(half_filled); + } + + fn unwrap(self) -> Inst { + match self { + MaybeInst::Compiled(inst) => inst, + _ => unreachable!( + 
"must be called on a compiled instruction, \ + instead it was called on: {:?}", + self + ), + } + } +} + +#[derive(Clone, Debug)] +enum InstHole { + Save { slot: usize }, + EmptyLook { look: EmptyLook }, + Char { c: char }, + Ranges { ranges: Vec<(char, char)> }, + Bytes { start: u8, end: u8 }, +} + +impl InstHole { + fn fill(&self, goto: InstPtr) -> Inst { + match *self { + InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), + InstHole::EmptyLook { look } => { + Inst::EmptyLook(InstEmptyLook { goto, look }) + } + InstHole::Char { c } => Inst::Char(InstChar { goto, c }), + InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { + goto, + ranges: ranges.clone().into_boxed_slice(), + }), + InstHole::Bytes { start, end } => { + Inst::Bytes(InstBytes { goto, start, end }) + } + } + } +} + +struct CompileClass<'a, 'b> { + c: &'a mut Compiler, + ranges: &'b [hir::ClassUnicodeRange], +} + +impl<'a, 'b> CompileClass<'a, 'b> { + fn compile(mut self) -> Result { + let mut holes = vec![]; + let mut initial_entry = None; + let mut last_split = Hole::None; + let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); + self.c.suffix_cache.clear(); + + for (i, range) in self.ranges.iter().enumerate() { + let is_last_range = i + 1 == self.ranges.len(); + utf8_seqs.reset(range.start(), range.end()); + let mut it = (&mut utf8_seqs).peekable(); + loop { + let utf8_seq = match it.next() { + None => break, + Some(utf8_seq) => utf8_seq, + }; + if is_last_range && it.peek().is_none() { + let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; + holes.push(hole); + self.c.fill(last_split, entry); + last_split = Hole::None; + if initial_entry.is_none() { + initial_entry = Some(entry); + } + } else { + if initial_entry.is_none() { + initial_entry = Some(self.c.insts.len()); + } + self.c.fill_to_next(last_split); + last_split = self.c.push_split_hole(); + let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; + holes.push(hole); + last_split = + self.c.fill_split(last_split, 
Some(entry), None); + } + } + } + self.c.utf8_seqs = Some(utf8_seqs); + Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) + } + + fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { + if self.c.compiled.is_reverse { + self.c_utf8_seq_(seq) + } else { + self.c_utf8_seq_(seq.into_iter().rev()) + } + } + + fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result + where + I: IntoIterator, + { + // The initial instruction for each UTF-8 sequence should be the same. + let mut from_inst = ::std::usize::MAX; + let mut last_hole = Hole::None; + for byte_range in seq { + let key = SuffixCacheKey { + from_inst, + start: byte_range.start, + end: byte_range.end, + }; + { + let pc = self.c.insts.len(); + if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { + from_inst = cached_pc; + continue; + } + } + self.c.byte_classes.set_range(byte_range.start, byte_range.end); + if from_inst == ::std::usize::MAX { + last_hole = self.c.push_hole(InstHole::Bytes { + start: byte_range.start, + end: byte_range.end, + }); + } else { + self.c.push_compiled(Inst::Bytes(InstBytes { + goto: from_inst, + start: byte_range.start, + end: byte_range.end, + })); + } + from_inst = self.c.insts.len().checked_sub(1).unwrap(); + debug_assert!(from_inst < ::std::usize::MAX); + } + debug_assert!(from_inst < ::std::usize::MAX); + Ok(Patch { hole: last_hole, entry: from_inst }) + } +} + +/// `SuffixCache` is a simple bounded hash map for caching suffix entries in +/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. +/// The set of byte ranges looks like this: +/// +/// [0-7F] +/// [C2-DF][80-BF] +/// [E0][A0-BF][80-BF] +/// [E1-EC][80-BF][80-BF] +/// [ED][80-9F][80-BF] +/// [EE-EF][80-BF][80-BF] +/// +/// Each line above translates to one alternate in the compiled regex program. +/// However, all but one of the alternates end in the same suffix, which is +/// a waste of an instruction. The suffix cache facilitates reusing them across +/// alternates. 
+/// +/// Note that a HashMap could be trivially used for this, but we don't need its +/// overhead. Some small bounded space (LRU style) is more than enough. +/// +/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), +/// except it uses hashes as original indices and then compares full keys for +/// validation against `dense` array. +#[derive(Debug)] +struct SuffixCache { + sparse: Box<[usize]>, + dense: Vec, +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +struct SuffixCacheEntry { + key: SuffixCacheKey, + pc: InstPtr, +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +struct SuffixCacheKey { + from_inst: InstPtr, + start: u8, + end: u8, +} + +impl SuffixCache { + fn new(size: usize) -> Self { + SuffixCache { + sparse: vec![0usize; size].into(), + dense: Vec::with_capacity(size), + } + } + + fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option { + let hash = self.hash(&key); + let pos = &mut self.sparse[hash]; + if let Some(entry) = self.dense.get(*pos) { + if entry.key == key { + return Some(entry.pc); + } + } + *pos = self.dense.len(); + self.dense.push(SuffixCacheEntry { key, pc }); + None + } + + fn clear(&mut self) { + self.dense.clear(); + } + + fn hash(&self, suffix: &SuffixCacheKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const FNV_PRIME: u64 = 1_099_511_628_211; + let mut h = 14_695_981_039_346_656_037; + h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); + (h as usize) % self.sparse.len() + } +} + +struct ByteClassSet([bool; 256]); + +impl ByteClassSet { + fn new() -> Self { + ByteClassSet([false; 256]) + } + + fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0[start as usize - 1] = true; + } + self.0[end as usize] = true; 
+ } + + fn set_word_boundary(&mut self) { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. + let iswb = is_word_byte; + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { + b2 += 1; + } + self.set_range(b1 as u8, (b2 - 1) as u8); + b1 = b2; + } + } + + fn byte_classes(&self) -> Vec { + // N.B. If you're debugging the DFA, it's useful to simply return + // `(0..256).collect()`, which effectively removes the byte classes + // and makes the transitions easier to read. + // (0usize..256).map(|x| x as u8).collect() + let mut byte_classes = vec![0; 256]; + let mut class = 0u8; + let mut i = 0; + loop { + byte_classes[i] = class as u8; + if i >= 255 { + break; + } + if self.0[i] { + class = class.checked_add(1).unwrap(); + } + i += 1; + } + byte_classes + } +} + +impl fmt::Debug for ByteClassSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() + } +} + +fn u32_to_usize(n: u32) -> usize { + // In case usize is less than 32 bits, we need to guard against overflow. + // On most platforms this compiles to nothing. + // TODO Use `std::convert::TryFrom` once it's stable. 
+ if (n as u64) > (::std::usize::MAX as u64) { + panic!("BUG: {} is too big to be pointer sized", n) + } + n as usize +} + +#[cfg(test)] +mod tests { + use super::ByteClassSet; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::new(); + set.set_range(b'a', b'z'); + let classes = set.byte_classes(); + assert_eq!(classes[0], 0); + assert_eq!(classes[1], 0); + assert_eq!(classes[2], 0); + assert_eq!(classes[b'a' as usize - 1], 0); + assert_eq!(classes[b'a' as usize], 1); + assert_eq!(classes[b'm' as usize], 1); + assert_eq!(classes[b'z' as usize], 1); + assert_eq!(classes[b'z' as usize + 1], 2); + assert_eq!(classes[254], 2); + assert_eq!(classes[255], 2); + + let mut set = ByteClassSet::new(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes[0], 0); + assert_eq!(classes[1], 0); + assert_eq!(classes[2], 0); + assert_eq!(classes[3], 1); + assert_eq!(classes[4], 2); + assert_eq!(classes[5], 2); + assert_eq!(classes[6], 2); + assert_eq!(classes[7], 3); + assert_eq!(classes[255], 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::new(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.byte_classes().len(), 256); + } +} diff --git a/regex-1.8.4/src/dfa.rs b/regex-1.8.4/src/dfa.rs new file mode 100644 index 0000000000000..78ed71021ee74 --- /dev/null +++ b/regex-1.8.4/src/dfa.rs @@ -0,0 +1,1945 @@ +/*! +The DFA matching engine. + +A DFA provides faster matching because the engine is in exactly one state at +any point in time. In the NFA, there may be multiple active states, and +considerable CPU cycles are spent shuffling them around. In finite automata +speak, the DFA follows epsilon transitions in the regex far less than the NFA. + +A DFA is a classic trade off between time and space. The NFA is slower, but +its memory requirements are typically small and predictable. 
The DFA is faster, +but given the right regex and the right input, the number of states in the +DFA can grow exponentially. To mitigate this space problem, we do two things: + +1. We implement an *online* DFA. That is, the DFA is constructed from the NFA + during a search. When a new state is computed, it is stored in a cache so + that it may be reused. An important consequence of this implementation + is that states that are never reached for a particular input are never + computed. (This is impossible in an "offline" DFA which needs to compute + all possible states up front.) +2. If the cache gets too big, we wipe it and continue matching. + +In pathological cases, a new state can be created for every byte of input. +(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.) +In this case, performance regresses to slightly slower than the full NFA +simulation, in large part because the cache becomes useless. If the cache +is wiped too frequently, the DFA quits and control falls back to one of the +NFA simulations. + +Because of the "lazy" nature of this DFA, the inner matching loop is +considerably more complex than one might expect out of a DFA. A number of +tricks are employed to make it fast. Tread carefully. + +N.B. While this implementation is heavily commented, Russ Cox's series of +articles on regexes is strongly recommended: +(As is the DFA implementation in RE2, which heavily influenced this +implementation.) +*/ + +use std::collections::HashMap; +use std::fmt; +use std::iter::repeat; +use std::mem; +use std::sync::Arc; + +use crate::exec::ProgramCache; +use crate::prog::{Inst, Program}; +use crate::sparse::SparseSet; + +/// Return true if and only if the given program can be executed by a DFA. +/// +/// Generally, a DFA is always possible. A pathological case where it is not +/// possible is if the number of NFA states exceeds `u32::MAX`, in which case, +/// this function will return false. 
+/// +/// This function will also return false if the given program has any Unicode +/// instructions (Char or Ranges) since the DFA operates on bytes only. +pub fn can_exec(insts: &Program) -> bool { + use crate::prog::Inst::*; + // If for some reason we manage to allocate a regex program with more + // than i32::MAX instructions, then we can't execute the DFA because we + // use 32 bit instruction pointer deltas for memory savings. + // If i32::MAX is the largest positive delta, + // then -i32::MAX == i32::MIN + 1 is the largest negative delta, + // and we are OK to use 32 bits. + if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize { + return false; + } + for inst in insts { + match *inst { + Char(_) | Ranges(_) => return false, + EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {} + } + } + true +} + +/// A reusable cache of DFA states. +/// +/// This cache is reused between multiple invocations of the same regex +/// program. (It is not shared simultaneously between threads. If there is +/// contention, then new caches are created.) +#[derive(Debug)] +pub struct Cache { + /// Group persistent DFA related cache state together. The sparse sets + /// listed below are used as scratch space while computing uncached states. + inner: CacheInner, + /// qcur and qnext are ordered sets with constant time + /// addition/membership/clearing-whole-set and linear time iteration. They + /// are used to manage the sets of NFA states in DFA states when computing + /// cached DFA states. In particular, the order of the NFA states matters + /// for leftmost-first style matching. Namely, when computing a cached + /// state, the set of NFA states stops growing as soon as the first Match + /// instruction is observed. + qcur: SparseSet, + qnext: SparseSet, +} + +/// `CacheInner` is logically just a part of Cache, but groups together fields +/// that aren't passed as function parameters throughout search. 
(This split +/// is mostly an artifact of the borrow checker. It is happily paid.) +#[derive(Debug)] +struct CacheInner { + /// A cache of pre-compiled DFA states, keyed by the set of NFA states + /// and the set of empty-width flags set at the byte in the input when the + /// state was observed. + /// + /// A StatePtr is effectively a `*State`, but to avoid various inconvenient + /// things, we just pass indexes around manually. The performance impact of + /// this is probably an instruction or two in the inner loop. However, on + /// 64 bit, each StatePtr is half the size of a *State. + compiled: StateMap, + /// The transition table. + /// + /// The transition table is laid out in row-major order, where states are + /// rows and the transitions for each state are columns. At a high level, + /// given state `s` and byte `b`, the next state can be found at index + /// `s * 256 + b`. + /// + /// This is, of course, a lie. A StatePtr is actually a pointer to the + /// *start* of a row in this table. When indexing in the DFA's inner loop, + /// this removes the need to multiply the StatePtr by the stride. Yes, it + /// matters. This reduces the number of states we can store, but: the + /// stride is rarely 256 since we define transitions in terms of + /// *equivalence classes* of bytes. Each class corresponds to a set of + /// bytes that never discriminate a distinct path through the DFA from each + /// other. + trans: Transitions, + /// A set of cached start states, which are limited to the number of + /// permutations of flags set just before the initial byte of input. (The + /// index into this vec is a `EmptyFlags`.) + /// + /// N.B. A start state can be "dead" (i.e., no possible match), so we + /// represent it with a StatePtr. + start_states: Vec, + /// Stack scratch space used to follow epsilon transitions in the NFA. + /// (This permits us to avoid recursion.) + /// + /// The maximum stack size is the number of NFA states. 
+ stack: Vec, + /// The total number of times this cache has been flushed by the DFA + /// because of space constraints. + flush_count: u64, + /// The total heap size of the DFA's cache. We use this to determine when + /// we should flush the cache. + size: usize, + /// Scratch space used when building instruction pointer lists for new + /// states. This helps amortize allocation. + insts_scratch_space: Vec, +} + +/// The transition table. +/// +/// It is laid out in row-major order, with states as rows and byte class +/// transitions as columns. +/// +/// The transition table is responsible for producing valid `StatePtrs`. A +/// `StatePtr` points to the start of a particular row in this table. When +/// indexing to find the next state this allows us to avoid a multiplication +/// when computing an index into the table. +#[derive(Clone)] +struct Transitions { + /// The table. + table: Vec, + /// The stride. + num_byte_classes: usize, +} + +/// Fsm encapsulates the actual execution of the DFA. +#[derive(Debug)] +pub struct Fsm<'a> { + /// prog contains the NFA instruction opcodes. DFA execution uses either + /// the `dfa` instructions or the `dfa_reverse` instructions from + /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have + /// Unicode opcodes that cannot be executed by the DFA.) + prog: &'a Program, + /// The start state. We record it here because the pointer may change + /// when the cache is wiped. + start: StatePtr, + /// The current position in the input. + at: usize, + /// Should we quit after seeing the first match? e.g., When the caller + /// uses `is_match` or `shortest_match`. + quit_after_match: bool, + /// The last state that matched. + /// + /// When no match has occurred, this is set to STATE_UNKNOWN. + /// + /// This is only useful when matching regex sets. The last match state + /// is useful because it contains all of the match instructions seen, + /// thereby allowing us to enumerate which regexes in the set matched. 
+ last_match_si: StatePtr, + /// The input position of the last cache flush. We use this to determine + /// if we're thrashing in the cache too often. If so, the DFA quits so + /// that we can fall back to the NFA algorithm. + last_cache_flush: usize, + /// All cached DFA information that is persisted between searches. + cache: &'a mut CacheInner, +} + +/// The result of running the DFA. +/// +/// Generally, the result is either a match or not a match, but sometimes the +/// DFA runs too slowly because the cache size is too small. In that case, it +/// gives up with the intent of falling back to the NFA algorithm. +/// +/// The DFA can also give up if it runs out of room to create new states, or if +/// it sees non-ASCII bytes in the presence of a Unicode word boundary. +#[derive(Clone, Debug)] +pub enum Result { + Match(T), + NoMatch(usize), + Quit, +} + +impl Result { + /// Returns true if this result corresponds to a match. + pub fn is_match(&self) -> bool { + match *self { + Result::Match(_) => true, + Result::NoMatch(_) | Result::Quit => false, + } + } + + /// Maps the given function onto T and returns the result. + /// + /// If this isn't a match, then this is a no-op. + #[cfg(feature = "perf-literal")] + pub fn map U>(self, mut f: F) -> Result { + match self { + Result::Match(t) => Result::Match(f(t)), + Result::NoMatch(x) => Result::NoMatch(x), + Result::Quit => Result::Quit, + } + } + + /// Sets the non-match position. + /// + /// If this isn't a non-match, then this is a no-op. + fn set_non_match(self, at: usize) -> Result { + match self { + Result::NoMatch(_) => Result::NoMatch(at), + r => r, + } + } +} + +/// `State` is a DFA state. It contains an ordered set of NFA states (not +/// necessarily complete) and a smattering of flags. +/// +/// The flags are packed into the first byte of data. +/// +/// States don't carry their transitions. Instead, transitions are stored in +/// a single row-major table. 
+/// +/// Delta encoding is used to store the instruction pointers. +/// The first instruction pointer is stored directly starting +/// at data[1], and each following pointer is stored as an offset +/// to the previous one. If a delta is in the range -127..127, +/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8) +/// is coded as a flag, followed by 4 bytes encoding the delta. +#[derive(Clone, Eq, Hash, PartialEq)] +struct State { + data: Arc<[u8]>, +} + +/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes +/// an NFA state). +/// +/// Throughout this library, this is usually set to `usize`, but we force a +/// `u32` here for the DFA to save on space. +type InstPtr = u32; + +/// Adds ip to data using delta encoding with respect to prev. +/// +/// After completion, `data` will contain `ip` and `prev` will be set to `ip`. +fn push_inst_ptr(data: &mut Vec, prev: &mut InstPtr, ip: InstPtr) { + let delta = (ip as i32) - (*prev as i32); + write_vari32(data, delta); + *prev = ip; +} + +struct InstPtrs<'a> { + base: usize, + data: &'a [u8], +} + +impl<'a> Iterator for InstPtrs<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + if self.data.is_empty() { + return None; + } + let (delta, nread) = read_vari32(self.data); + let base = self.base as i32 + delta; + debug_assert!(base >= 0); + debug_assert!(nread > 0); + self.data = &self.data[nread..]; + self.base = base as usize; + Some(self.base) + } +} + +impl State { + fn flags(&self) -> StateFlags { + StateFlags(self.data[0]) + } + + fn inst_ptrs(&self) -> InstPtrs<'_> { + InstPtrs { base: 0, data: &self.data[1..] } + } +} + +/// `StatePtr` is a 32 bit pointer to the start of a row in the transition +/// table. +/// +/// It has many special values. There are two types of special values: +/// sentinels and flags. +/// +/// Sentinels corresponds to special states that carry some kind of +/// significance. There are three such states: unknown, dead and quit states. 
+/// +/// Unknown states are states that haven't been computed yet. They indicate +/// that a transition should be filled in that points to either an existing +/// cached state or a new state altogether. In general, an unknown state means +/// "follow the NFA's epsilon transitions." +/// +/// Dead states are states that can never lead to a match, no matter what +/// subsequent input is observed. This means that the DFA should quit +/// immediately and return the longest match it has found thus far. +/// +/// Quit states are states that imply the DFA is not capable of matching the +/// regex correctly. Currently, this is only used when a Unicode word boundary +/// exists in the regex *and* a non-ASCII byte is observed. +/// +/// The other type of state pointer is a state pointer with special flag bits. +/// There are two flags: a start flag and a match flag. The lower bits of both +/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX` +/// mask). +/// +/// The start flag means that the state is a start state, and therefore may be +/// subject to special prefix scanning optimizations. +/// +/// The match flag means that the state is a match state, and therefore the +/// current position in the input (while searching) should be recorded. +/// +/// The above exists mostly in the service of making the inner loop fast. +/// In particular, the inner *inner* loop looks something like this: +/// +/// ```ignore +/// while state <= STATE_MAX and i < len(text): +/// state = state.next[i] +/// ``` +/// +/// This is nice because it lets us execute a lazy DFA as if it were an +/// entirely offline DFA (i.e., with very few instructions). The loop will +/// quit only when we need to examine a case that needs special attention. +type StatePtr = u32; + +/// An unknown state means that the state has not been computed yet, and that +/// the only way to progress is to compute it. 
+const STATE_UNKNOWN: StatePtr = 1 << 31; + +/// A dead state means that the state has been computed and it is known that +/// once it is entered, no future match can ever occur. +const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1; + +/// A quit state means that the DFA came across some input that it doesn't +/// know how to process correctly. The DFA should quit and another matching +/// engine should be run in its place. +const STATE_QUIT: StatePtr = STATE_DEAD + 1; + +/// A start state is a state that the DFA can start in. +/// +/// Note that start states have their lower bits set to a state pointer. +const STATE_START: StatePtr = 1 << 30; + +/// A match state means that the regex has successfully matched. +/// +/// Note that match states have their lower bits set to a state pointer. +const STATE_MATCH: StatePtr = 1 << 29; + +/// The maximum state pointer. This is useful to mask out the "valid" state +/// pointer from a state with the "start" or "match" bits set. +/// +/// It doesn't make sense to use this with unknown, dead or quit state +/// pointers, since those pointers are sentinels and never have their lower +/// bits set to anything meaningful. +const STATE_MAX: StatePtr = STATE_MATCH - 1; + +/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the +/// special EOF sentinel value. +#[derive(Copy, Clone, Debug)] +struct Byte(u16); + +/// A set of flags for zero-width assertions. +#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)] +struct EmptyFlags { + start: bool, + end: bool, + start_line: bool, + end_line: bool, + word_boundary: bool, + not_word_boundary: bool, +} + +/// A set of flags describing various configurations of a DFA state. This is +/// represented by a `u8` so that it is compact. +#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)] +struct StateFlags(u8); + +impl Cache { + /// Create new empty cache for the DFA engine. + pub fn new(prog: &Program) -> Self { + // We add 1 to account for the special EOF byte. 
+ let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; + let starts = vec![STATE_UNKNOWN; 256]; + let mut cache = Cache { + inner: CacheInner { + compiled: StateMap::new(num_byte_classes), + trans: Transitions::new(num_byte_classes), + start_states: starts, + stack: vec![], + flush_count: 0, + size: 0, + insts_scratch_space: vec![], + }, + qcur: SparseSet::new(prog.insts.len()), + qnext: SparseSet::new(prog.insts.len()), + }; + cache.inner.reset_size(); + cache + } +} + +impl CacheInner { + /// Resets the cache size to account for fixed costs, such as the program + /// and stack sizes. + fn reset_size(&mut self) { + self.size = (self.start_states.len() * mem::size_of::()) + + (self.stack.len() * mem::size_of::()); + } +} + +impl<'a> Fsm<'a> { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn forward( + prog: &'a Program, + cache: &ProgramCache, + quit_after_match: bool, + text: &[u8], + at: usize, + ) -> Result { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn reverse( + prog: &'a Program, + cache: &ProgramCache, + quit_after_match: bool, + text: &[u8], + at: usize, + ) -> Result { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa_reverse; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let 
(empty_flags, state_flags) = dfa.start_flags_reverse(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn forward_many( + prog: &'a Program, + cache: &ProgramCache, + matches: &mut [bool], + text: &[u8], + at: usize, + ) -> Result { + debug_assert!(matches.len() == prog.matches.len()); + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match: false, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text); + if result.is_match() { + if matches.len() == 1 { + matches[0] = true; + } else { + debug_assert!(dfa.last_match_si != STATE_UNKNOWN); + debug_assert!(dfa.last_match_si != STATE_DEAD); + for ip in dfa.state(dfa.last_match_si).inst_ptrs() { + if let Inst::Match(slot) = dfa.prog[ip] { + matches[slot] = true; + } + } + } + } + result + } + + /// Executes the DFA on a forward NFA. + /// + /// {qcur,qnext} are scratch ordered sets which may be non-empty. 
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_at( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + text: &[u8], + ) -> Result { + // For the most part, the DFA is basically: + // + // last_match = null + // while current_byte != EOF: + // si = current_state.next[current_byte] + // if si is match + // last_match = si + // return last_match + // + // However, we need to deal with a few things: + // + // 1. This is an *online* DFA, so the current state's next list + // may not point to anywhere yet, so we must go out and compute + // them. (They are then cached into the current state's next list + // to avoid re-computation.) + // 2. If we come across a state that is known to be dead (i.e., never + // leads to a match), then we can quit early. + // 3. If the caller just wants to know if a match occurs, then we + // can quit as soon as we know we have a match. (Full leftmost + // first semantics require continuing on.) + // 4. If we're in the start state, then we can use a pre-computed set + // of prefix literals to skip quickly along the input. + // 5. After the input is exhausted, we run the DFA on one symbol + // that stands for EOF. This is useful for handling empty width + // assertions. + // 6. We can't actually do state.next[byte]. Instead, we have to do + // state.next[byte_classes[byte]], which permits us to keep the + // 'next' list very small. + // + // Since there's a bunch of extra stuff we need to consider, we do some + // pretty hairy tricks to get the inner loop to run as fast as + // possible. + debug_assert!(!self.prog.is_reverse); + + // The last match is the currently known ending match position. It is + // reported as an index to the most recent byte that resulted in a + // transition to a match state and is always stored in capture slot `1` + // when searching forwards. Its maximum value is `text.len()`. 
+ let mut result = Result::NoMatch(self.at); + let (mut prev_si, mut next_si) = (self.start, self.start); + let mut at = self.at; + while at < text.len() { + // This is the real inner loop. We take advantage of special bits + // set in the state pointer to determine whether a state is in the + // "common" case or not. Specifically, the common case is a + // non-match non-start non-dead state that has already been + // computed. So long as we remain in the common case, this inner + // loop will chew through the input. + // + // We also unroll the loop 4 times to amortize the cost of checking + // whether we've consumed the entire input. We are also careful + // to make sure that `prev_si` always represents the previous state + // and `next_si` always represents the next state after the loop + // exits, even if it isn't always true inside the loop. + while next_si <= STATE_MAX && at < text.len() { + // Argument for safety is in the definition of next_si. + prev_si = unsafe { self.next_si(next_si, text, at) }; + at += 1; + if prev_si > STATE_MAX || at + 2 >= text.len() { + mem::swap(&mut prev_si, &mut next_si); + break; + } + next_si = unsafe { self.next_si(prev_si, text, at) }; + at += 1; + if next_si > STATE_MAX { + break; + } + prev_si = unsafe { self.next_si(next_si, text, at) }; + at += 1; + if prev_si > STATE_MAX { + mem::swap(&mut prev_si, &mut next_si); + break; + } + next_si = unsafe { self.next_si(prev_si, text, at) }; + at += 1; + } + if next_si & STATE_MATCH > 0 { + // A match state is outside of the common case because it needs + // special case analysis. In particular, we need to record the + // last position as having matched and possibly quit the DFA if + // we don't need to keep matching. + next_si &= !STATE_MATCH; + result = Result::Match(at - 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + prev_si = next_si; + + // This permits short-circuiting when matching a regex set. 
+ // In particular, if this DFA state contains only match states, + // then it's impossible to extend the set of matches since + // match states are final. Therefore, we can quit. + if self.prog.matches.len() > 1 { + let state = self.state(next_si); + let just_matches = + state.inst_ptrs().all(|ip| self.prog[ip].is_match()); + if just_matches { + return result; + } + } + + // Another inner loop! If the DFA stays in this particular + // match state, then we can rip through all of the input + // very quickly, and only recording the match location once + // we've left this particular state. + let cur = at; + while (next_si & !STATE_MATCH) == prev_si + && at + 2 < text.len() + { + // Argument for safety is in the definition of next_si. + next_si = unsafe { + self.next_si(next_si & !STATE_MATCH, text, at) + }; + at += 1; + } + if at > cur { + result = Result::Match(at - 2); + } + } else if next_si & STATE_START > 0 { + // A start state isn't in the common case because we may + // want to do quick prefix scanning. If the program doesn't + // have a detected prefix, then start states are actually + // considered common and this case is never reached. + debug_assert!(self.has_prefix()); + next_si &= !STATE_START; + prev_si = next_si; + at = match self.prefix_at(text, at) { + None => return Result::NoMatch(text.len()), + Some(i) => i, + }; + } else if next_si >= STATE_UNKNOWN { + if next_si == STATE_QUIT { + return Result::Quit; + } + // Finally, this corresponds to the case where the transition + // entered a state that can never lead to a match or a state + // that hasn't been computed yet. The latter being the "slow" + // path. + let byte = Byte::byte(text[at - 1]); + // We no longer care about the special bits in the state + // pointer. + prev_si &= STATE_MAX; + // Record where we are. This is used to track progress for + // determining whether we should quit if we've flushed the + // cache too much. 
+ self.at = at; + next_si = match self.next_state(qcur, qnext, prev_si, byte) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(at), + Some(si) => si, + }; + debug_assert!(next_si != STATE_UNKNOWN); + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at - 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + } + prev_si = next_si; + } else { + prev_si = next_si; + } + } + + // Run the DFA once more on the special EOF sentinel value. + // We don't care about the special bits in the state pointer any more, + // so get rid of them. + prev_si &= STATE_MAX; + prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(text.len()), + Some(si) => si & !STATE_START, + }; + debug_assert!(prev_si != STATE_UNKNOWN); + if prev_si & STATE_MATCH > 0 { + prev_si &= !STATE_MATCH; + self.last_match_si = prev_si; + result = Result::Match(text.len()); + } + result + } + + /// Executes the DFA on a reverse NFA. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_at_reverse( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + text: &[u8], + ) -> Result { + // The comments in `exec_at` above mostly apply here too. The main + // difference is that we move backwards over the input and we look for + // the longest possible match instead of the leftmost-first match. + // + // N.B. The code duplication here is regrettable. Efforts to improve + // it without sacrificing performance are welcome. ---AG + debug_assert!(self.prog.is_reverse); + let mut result = Result::NoMatch(self.at); + let (mut prev_si, mut next_si) = (self.start, self.start); + let mut at = self.at; + while at > 0 { + while next_si <= STATE_MAX && at > 0 { + // Argument for safety is in the definition of next_si. 
+ at -= 1; + prev_si = unsafe { self.next_si(next_si, text, at) }; + if prev_si > STATE_MAX || at <= 4 { + mem::swap(&mut prev_si, &mut next_si); + break; + } + at -= 1; + next_si = unsafe { self.next_si(prev_si, text, at) }; + if next_si > STATE_MAX { + break; + } + at -= 1; + prev_si = unsafe { self.next_si(next_si, text, at) }; + if prev_si > STATE_MAX { + mem::swap(&mut prev_si, &mut next_si); + break; + } + at -= 1; + next_si = unsafe { self.next_si(prev_si, text, at) }; + } + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at + 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + prev_si = next_si; + let cur = at; + while (next_si & !STATE_MATCH) == prev_si && at >= 2 { + // Argument for safety is in the definition of next_si. + at -= 1; + next_si = unsafe { + self.next_si(next_si & !STATE_MATCH, text, at) + }; + } + if at < cur { + result = Result::Match(at + 2); + } + } else if next_si >= STATE_UNKNOWN { + if next_si == STATE_QUIT { + return Result::Quit; + } + let byte = Byte::byte(text[at]); + prev_si &= STATE_MAX; + self.at = at; + next_si = match self.next_state(qcur, qnext, prev_si, byte) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(at), + Some(si) => si, + }; + debug_assert!(next_si != STATE_UNKNOWN); + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at + 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + } + prev_si = next_si; + } else { + prev_si = next_si; + } + } + + // Run the DFA once more on the special EOF sentinel value. 
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(0), + Some(si) => si, + }; + debug_assert!(prev_si != STATE_UNKNOWN); + if prev_si & STATE_MATCH > 0 { + prev_si &= !STATE_MATCH; + self.last_match_si = prev_si; + result = Result::Match(0); + } + result + } + + /// next_si transitions to the next state, where the transition input + /// corresponds to text[i]. + /// + /// This elides bounds checks, and is therefore not safe. + #[cfg_attr(feature = "perf-inline", inline(always))] + unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr { + // What is the argument for safety here? + // We have three unchecked accesses that could possibly violate safety: + // + // 1. The given byte of input (`text[i]`). + // 2. The class of the byte of input (`classes[text[i]]`). + // 3. The transition for the class (`trans[si + cls]`). + // + // (1) is only safe when calling next_si is guarded by + // `i < text.len()`. + // + // (2) is the easiest case to guarantee since `text[i]` is always a + // `u8` and `self.prog.byte_classes` always has length `u8::MAX`. + // (See `ByteClassSet.byte_classes` in `compile.rs`.) + // + // (3) is only safe if (1)+(2) are safe. Namely, the transitions + // of every state are defined to have length equal to the number of + // byte classes in the program. Therefore, a valid class leads to a + // valid transition. (All possible transitions are valid lookups, even + // if it points to a state that hasn't been computed yet.) (3) also + // relies on `si` being correct, but StatePtrs should only ever be + // retrieved from the transition table, which ensures they are correct. 
+ debug_assert!(i < text.len()); + let b = *text.get_unchecked(i); + debug_assert!((b as usize) < self.prog.byte_classes.len()); + let cls = *self.prog.byte_classes.get_unchecked(b as usize); + self.cache.trans.next_unchecked(si, cls as usize) + } + + /// Computes the next state given the current state and the current input + /// byte (which may be EOF). + /// + /// If STATE_DEAD is returned, then there is no valid state transition. + /// This implies that no permutation of future input can lead to a match + /// state. + /// + /// STATE_UNKNOWN can never be returned. + fn exec_byte( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + mut si: StatePtr, + b: Byte, + ) -> Option { + use crate::prog::Inst::*; + + // Initialize a queue with the current DFA state's NFA states. + qcur.clear(); + for ip in self.state(si).inst_ptrs() { + qcur.insert(ip); + } + + // Before inspecting the current byte, we may need to also inspect + // whether the position immediately preceding the current byte + // satisfies the empty assertions found in the current state. + // + // We only need to do this step if there are any empty assertions in + // the current state. + let is_word_last = self.state(si).flags().is_word(); + let is_word = b.is_ascii_word(); + if self.state(si).flags().has_empty() { + // Compute the flags immediately preceding the current byte. + // This means we only care about the "end" or "end line" flags. + // (The "start" flags are computed immediately following the + // current byte and are handled below.) + let mut flags = EmptyFlags::default(); + if b.is_eof() { + flags.end = true; + flags.end_line = true; + } else if b.as_byte().map_or(false, |b| b == b'\n') { + flags.end_line = true; + } + if is_word_last == is_word { + flags.not_word_boundary = true; + } else { + flags.word_boundary = true; + } + // Now follow epsilon transitions from every NFA state, but make + // sure we only follow transitions that satisfy our flags. 
+ qnext.clear(); + for &ip in &*qcur { + self.follow_epsilons(usize_to_u32(ip), qnext, flags); + } + mem::swap(qcur, qnext); + } + + // Now we set flags for immediately after the current byte. Since start + // states are processed separately, and are the only states that can + // have the StartText flag set, we therefore only need to worry about + // the StartLine flag here. + // + // We do also keep track of whether this DFA state contains a NFA state + // that is a matching state. This is precisely how we delay the DFA + // matching by one byte in order to process the special EOF sentinel + // byte. Namely, if this DFA state containing a matching NFA state, + // then it is the *next* DFA state that is marked as a match. + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n'); + if b.is_ascii_word() { + state_flags.set_word(); + } + // Now follow all epsilon transitions again, but only after consuming + // the current byte. + qnext.clear(); + for &ip in &*qcur { + match self.prog[ip as usize] { + // These states never happen in a byte-based program. + Char(_) | Ranges(_) => unreachable!(), + // These states are handled when following epsilon transitions. + Save(_) | Split(_) | EmptyLook(_) => {} + Match(_) => { + state_flags.set_match(); + if !self.continue_past_first_match() { + break; + } else if self.prog.matches.len() > 1 + && !qnext.contains(ip as usize) + { + // If we are continuing on to find other matches, + // then keep a record of the match states we've seen. 
+ qnext.insert(ip); + } + } + Bytes(ref inst) => { + if b.as_byte().map_or(false, |b| inst.matches(b)) { + self.follow_epsilons( + inst.goto as InstPtr, + qnext, + empty_flags, + ); + } + } + } + } + + let cache = if b.is_eof() && self.prog.matches.len() > 1 { + // If we're processing the last byte of the input and we're + // matching a regex set, then make the next state contain the + // previous states transitions. We do this so that the main + // matching loop can extract all of the match instructions. + mem::swap(qcur, qnext); + // And don't cache this state because it's totally bunk. + false + } else { + true + }; + + // We've now built up the set of NFA states that ought to comprise the + // next DFA state, so try to find it in the cache, and if it doesn't + // exist, cache it. + // + // N.B. We pass `&mut si` here because the cache may clear itself if + // it has gotten too full. When that happens, the location of the + // current state may change. + let mut next = + match self.cached_state(qnext, state_flags, Some(&mut si)) { + None => return None, + Some(next) => next, + }; + if (self.start & !STATE_START) == next { + // Start states can never be match states since all matches are + // delayed by one byte. + debug_assert!(!self.state(next).flags().is_match()); + next = self.start_ptr(next); + } + if next <= STATE_MAX && self.state(next).flags().is_match() { + next |= STATE_MATCH; + } + debug_assert!(next != STATE_UNKNOWN); + // And now store our state in the current state's next list. + if cache { + let cls = self.byte_class(b); + self.cache.trans.set_next(si, cls, next); + } + Some(next) + } + + /// Follows the epsilon transitions starting at (and including) `ip`. The + /// resulting states are inserted into the ordered set `q`. + /// + /// Conditional epsilon transitions (i.e., empty width assertions) are only + /// followed if they are satisfied by the given flags, which should + /// represent the flags set at the current location in the input. 
+ /// + /// If the current location corresponds to the empty string, then only the + /// end line and/or end text flags may be set. If the current location + /// corresponds to a real byte in the input, then only the start line + /// and/or start text flags may be set. + /// + /// As an exception to the above, when finding the initial state, any of + /// the above flags may be set: + /// + /// If matching starts at the beginning of the input, then start text and + /// start line should be set. If the input is empty, then end text and end + /// line should also be set. + /// + /// If matching starts after the beginning of the input, then only start + /// line should be set if the preceding byte is `\n`. End line should never + /// be set in this case. (Even if the following byte is a `\n`, it will + /// be handled in a subsequent DFA state.) + fn follow_epsilons( + &mut self, + ip: InstPtr, + q: &mut SparseSet, + flags: EmptyFlags, + ) { + use crate::prog::EmptyLook::*; + use crate::prog::Inst::*; + + // We need to traverse the NFA to follow epsilon transitions, so avoid + // recursion with an explicit stack. + self.cache.stack.push(ip); + while let Some(mut ip) = self.cache.stack.pop() { + // Try to munch through as many states as possible without + // pushes/pops to the stack. + loop { + // Don't visit states we've already added. + if q.contains(ip as usize) { + break; + } + q.insert(ip as usize); + match self.prog[ip as usize] { + Char(_) | Ranges(_) => unreachable!(), + Match(_) | Bytes(_) => { + break; + } + EmptyLook(ref inst) => { + // Only follow empty assertion states if our flags + // satisfy the assertion. 
+ match inst.look { + StartLine if flags.start_line => { + ip = inst.goto as InstPtr; + } + EndLine if flags.end_line => { + ip = inst.goto as InstPtr; + } + StartText if flags.start => { + ip = inst.goto as InstPtr; + } + EndText if flags.end => { + ip = inst.goto as InstPtr; + } + WordBoundaryAscii if flags.word_boundary => { + ip = inst.goto as InstPtr; + } + NotWordBoundaryAscii + if flags.not_word_boundary => + { + ip = inst.goto as InstPtr; + } + WordBoundary if flags.word_boundary => { + ip = inst.goto as InstPtr; + } + NotWordBoundary if flags.not_word_boundary => { + ip = inst.goto as InstPtr; + } + StartLine | EndLine | StartText | EndText + | WordBoundaryAscii | NotWordBoundaryAscii + | WordBoundary | NotWordBoundary => { + break; + } + } + } + Save(ref inst) => { + ip = inst.goto as InstPtr; + } + Split(ref inst) => { + self.cache.stack.push(inst.goto2 as InstPtr); + ip = inst.goto1 as InstPtr; + } + } + } + } + } + + /// Find a previously computed state matching the given set of instructions + /// and is_match bool. + /// + /// The given set of instructions should represent a single state in the + /// NFA along with all states reachable without consuming any input. + /// + /// The is_match bool should be true if and only if the preceding DFA state + /// contains an NFA matching state. The cached state produced here will + /// then signify a match. (This enables us to delay a match by one byte, + /// in order to account for the EOF sentinel byte.) + /// + /// If the cache is full, then it is wiped before caching a new state. + /// + /// The current state should be specified if it exists, since it will need + /// to be preserved if the cache clears itself. (Start states are + /// always saved, so they should not be passed here.) It takes a mutable + /// pointer to the index because if the cache is cleared, the state's + /// location may change. 
+ fn cached_state( + &mut self, + q: &SparseSet, + mut state_flags: StateFlags, + current_state: Option<&mut StatePtr>, + ) -> Option { + // If we couldn't come up with a non-empty key to represent this state, + // then it is dead and can never lead to a match. + // + // Note that inst_flags represent the set of empty width assertions + // in q. We use this as an optimization in exec_byte to determine when + // we should follow epsilon transitions at the empty string preceding + // the current byte. + let key = match self.cached_state_key(q, &mut state_flags) { + None => return Some(STATE_DEAD), + Some(v) => v, + }; + // In the cache? Cool. Done. + if let Some(si) = self.cache.compiled.get_ptr(&key) { + return Some(si); + } + // If the cache has gotten too big, wipe it. + if self.approximate_size() > self.prog.dfa_size_limit + && !self.clear_cache_and_save(current_state) + { + // Ooops. DFA is giving up. + return None; + } + // Allocate room for our state and add it. + self.add_state(key) + } + + /// Produces a key suitable for describing a state in the DFA cache. + /// + /// The key invariant here is that equivalent keys are produced for any two + /// sets of ordered NFA states (and toggling of whether the previous NFA + /// states contain a match state) that do not discriminate a match for any + /// input. + /// + /// Specifically, q should be an ordered set of NFA states and is_match + /// should be true if and only if the previous NFA states contained a match + /// state. + fn cached_state_key( + &mut self, + q: &SparseSet, + state_flags: &mut StateFlags, + ) -> Option { + use crate::prog::Inst::*; + + // We need to build up enough information to recognize pre-built states + // in the DFA. Generally speaking, this includes every instruction + // except for those which are purely epsilon transitions, e.g., the + // Save and Split instructions. 
+ // + // Empty width assertions are also epsilon transitions, but since they + // are conditional, we need to make them part of a state's key in the + // cache. + + let mut insts = + mem::replace(&mut self.cache.insts_scratch_space, vec![]); + insts.clear(); + // Reserve 1 byte for flags. + insts.push(0); + + let mut prev = 0; + for &ip in q { + let ip = usize_to_u32(ip); + match self.prog[ip as usize] { + Char(_) | Ranges(_) => unreachable!(), + Save(_) | Split(_) => {} + Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip), + EmptyLook(_) => { + state_flags.set_empty(); + push_inst_ptr(&mut insts, &mut prev, ip) + } + Match(_) => { + push_inst_ptr(&mut insts, &mut prev, ip); + if !self.continue_past_first_match() { + break; + } + } + } + } + // If we couldn't transition to any other instructions and we didn't + // see a match when expanding NFA states previously, then this is a + // dead state and no amount of additional input can transition out + // of this state. + let opt_state = if insts.len() == 1 && !state_flags.is_match() { + None + } else { + let StateFlags(f) = *state_flags; + insts[0] = f; + Some(State { data: Arc::from(&*insts) }) + }; + self.cache.insts_scratch_space = insts; + opt_state + } + + /// Clears the cache, but saves and restores current_state if it is not + /// none. + /// + /// The current state must be provided here in case its location in the + /// cache changes. + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache_and_save( + &mut self, + current_state: Option<&mut StatePtr>, + ) -> bool { + if self.cache.compiled.is_empty() { + // Nothing to clear... + return true; + } + match current_state { + None => self.clear_cache(), + Some(si) => { + let cur = self.state(*si).clone(); + if !self.clear_cache() { + return false; + } + // The unwrap is OK because we just cleared the cache and + // therefore know that the next state pointer won't exceed + // STATE_MAX. 
+ *si = self.restore_state(cur).unwrap(); + true + } + } + } + + /// Wipes the state cache, but saves and restores the current start state. + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache(&mut self) -> bool { + // Bail out of the DFA if we're moving too "slowly." + // A heuristic from RE2: assume the DFA is too slow if it is processing + // 10 or fewer bytes per state. + // Additionally, we permit the cache to be flushed a few times before + // caling it quits. + let nstates = self.cache.compiled.len(); + if self.cache.flush_count >= 3 + && self.at >= self.last_cache_flush + && (self.at - self.last_cache_flush) <= 10 * nstates + { + return false; + } + // Update statistics tracking cache flushes. + self.last_cache_flush = self.at; + self.cache.flush_count += 1; + + // OK, actually flush the cache. + let start = self.state(self.start & !STATE_START).clone(); + let last_match = if self.last_match_si <= STATE_MAX { + Some(self.state(self.last_match_si).clone()) + } else { + None + }; + self.cache.reset_size(); + self.cache.trans.clear(); + self.cache.compiled.clear(); + for s in &mut self.cache.start_states { + *s = STATE_UNKNOWN; + } + // The unwraps are OK because we just cleared the cache and therefore + // know that the next state pointer won't exceed STATE_MAX. + let start_ptr = self.restore_state(start).unwrap(); + self.start = self.start_ptr(start_ptr); + if let Some(last_match) = last_match { + self.last_match_si = self.restore_state(last_match).unwrap(); + } + true + } + + /// Restores the given state back into the cache, and returns a pointer + /// to it. + fn restore_state(&mut self, state: State) -> Option { + // If we've already stored this state, just return a pointer to it. + // None will be the wiser. + if let Some(si) = self.cache.compiled.get_ptr(&state) { + return Some(si); + } + self.add_state(state) + } + + /// Returns the next state given the current state si and current byte + /// b. 
{qcur,qnext} are used as scratch space for storing ordered NFA + /// states. + /// + /// This tries to fetch the next state from the cache, but if that fails, + /// it computes the next state, caches it and returns a pointer to it. + /// + /// The pointer can be to a real state, or it can be STATE_DEAD. + /// STATE_UNKNOWN cannot be returned. + /// + /// None is returned if a new state could not be allocated (i.e., the DFA + /// ran out of space and thinks it's running too slowly). + fn next_state( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + si: StatePtr, + b: Byte, + ) -> Option { + if si == STATE_DEAD { + return Some(STATE_DEAD); + } + match self.cache.trans.next(si, self.byte_class(b)) { + STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), + STATE_QUIT => None, + nsi => Some(nsi), + } + } + + /// Computes and returns the start state, where searching begins at + /// position `at` in `text`. If the state has already been computed, + /// then it is pulled from the cache. If the state hasn't been cached, + /// then it is computed, cached and a pointer to it is returned. + /// + /// This may return STATE_DEAD but never STATE_UNKNOWN. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state( + &mut self, + q: &mut SparseSet, + empty_flags: EmptyFlags, + state_flags: StateFlags, + ) -> Option { + // Compute an index into our cache of start states based on the set + // of empty/state flags set at the current position in the input. We + // don't use every flag since not all flags matter. For example, since + // matches are delayed by one byte, start states can never be match + // states. 
+ let flagi = { + (((empty_flags.start as u8) << 0) + | ((empty_flags.end as u8) << 1) + | ((empty_flags.start_line as u8) << 2) + | ((empty_flags.end_line as u8) << 3) + | ((empty_flags.word_boundary as u8) << 4) + | ((empty_flags.not_word_boundary as u8) << 5) + | ((state_flags.is_word() as u8) << 6)) as usize + }; + match self.cache.start_states[flagi] { + STATE_UNKNOWN => {} + si => return Some(si), + } + q.clear(); + let start = usize_to_u32(self.prog.start); + self.follow_epsilons(start, q, empty_flags); + // Start states can never be match states because we delay every match + // by one byte. Given an empty string and an empty match, the match + // won't actually occur until the DFA processes the special EOF + // sentinel byte. + let sp = match self.cached_state(q, state_flags, None) { + None => return None, + Some(sp) => self.start_ptr(sp), + }; + self.cache.start_states[flagi] = sp; + Some(sp) + } + + /// Computes the set of starting flags for the given position in text. + /// + /// This should only be used when executing the DFA forwards over the + /// input. + fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) { + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start = at == 0; + empty_flags.end = text.is_empty(); + empty_flags.start_line = at == 0 || text[at - 1] == b'\n'; + empty_flags.end_line = text.is_empty(); + + let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word(); + if is_word_last { + state_flags.set_word(); + } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } + (empty_flags, state_flags) + } + + /// Computes the set of starting flags for the given position in text. + /// + /// This should only be used when executing the DFA in reverse over the + /// input. 
+ fn start_flags_reverse( + &self, + text: &[u8], + at: usize, + ) -> (EmptyFlags, StateFlags) { + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start = at == text.len(); + empty_flags.end = text.is_empty(); + empty_flags.start_line = at == text.len() || text[at] == b'\n'; + empty_flags.end_line = text.is_empty(); + + let is_word_last = + at < text.len() && Byte::byte(text[at]).is_ascii_word(); + let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + if is_word_last { + state_flags.set_word(); + } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } + (empty_flags, state_flags) + } + + /// Returns a reference to a State given a pointer to it. + fn state(&self, si: StatePtr) -> &State { + self.cache.compiled.get_state(si).unwrap() + } + + /// Adds the given state to the DFA. + /// + /// This allocates room for transitions out of this state in + /// self.cache.trans. The transitions can be set with the returned + /// StatePtr. + /// + /// If None is returned, then the state limit was reached and the DFA + /// should quit. + fn add_state(&mut self, state: State) -> Option { + // This will fail if the next state pointer exceeds STATE_PTR. In + // practice, the cache limit will prevent us from ever getting here, + // but maybe callers will set the cache size to something ridiculous... + let si = match self.cache.trans.add() { + None => return None, + Some(si) => si, + }; + // If the program has a Unicode word boundary, then set any transitions + // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a + // transition, then it will quit and an alternative matching engine + // will take over. 
+ if self.prog.has_unicode_word_boundary { + for b in 128..256 { + let cls = self.byte_class(Byte::byte(b as u8)); + self.cache.trans.set_next(si, cls, STATE_QUIT); + } + } + // Finally, put our actual state on to our heap of states and index it + // so we can find it later. + self.cache.size += self.cache.trans.state_heap_size() + + state.data.len() + + (2 * mem::size_of::()) + + mem::size_of::(); + self.cache.compiled.insert(state, si); + // Transition table and set of states and map should all be in sync. + debug_assert!( + self.cache.compiled.len() == self.cache.trans.num_states() + ); + Some(si) + } + + /// Quickly finds the next occurrence of any literal prefixes in the regex. + /// If there are no literal prefixes, then the current position is + /// returned. If there are literal prefixes and one could not be found, + /// then None is returned. + /// + /// This should only be called when the DFA is in a start state. + fn prefix_at(&self, text: &[u8], at: usize) -> Option { + self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s) + } + + /// Returns the number of byte classes required to discriminate transitions + /// in each state. + /// + /// invariant: num_byte_classes() == len(State.next) + fn num_byte_classes(&self) -> usize { + // We add 1 to account for the special EOF byte. + (self.prog.byte_classes[255] as usize + 1) + 1 + } + + /// Given an input byte or the special EOF sentinel, return its + /// corresponding byte class. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn byte_class(&self, b: Byte) -> usize { + match b.as_byte() { + None => self.num_byte_classes() - 1, + Some(b) => self.u8_class(b), + } + } + + /// Like byte_class, but explicitly for u8s. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn u8_class(&self, b: u8) -> usize { + self.prog.byte_classes[b as usize] as usize + } + + /// Returns true if the DFA should continue searching past the first match. 
+ /// + /// Leftmost first semantics in the DFA are preserved by not following NFA + /// transitions after the first match is seen. + /// + /// On occasion, we want to avoid leftmost first semantics to find either + /// the longest match (for reverse search) or all possible matches (for + /// regex sets). + fn continue_past_first_match(&self) -> bool { + self.prog.is_reverse || self.prog.matches.len() > 1 + } + + /// Returns true if there is a prefix we can quickly search for. + fn has_prefix(&self) -> bool { + !self.prog.is_reverse + && !self.prog.prefixes.is_empty() + && !self.prog.is_anchored_start + } + + /// Sets the STATE_START bit in the given state pointer if and only if + /// we have a prefix to scan for. + /// + /// If there's no prefix, then it's a waste to treat the start state + /// specially. + fn start_ptr(&self, si: StatePtr) -> StatePtr { + if self.has_prefix() { + si | STATE_START + } else { + si + } + } + + /// Approximate size returns the approximate heap space currently used by + /// the DFA. It is used to determine whether the DFA's state cache needs to + /// be wiped. Namely, it is possible that for certain regexes on certain + /// inputs, a new state could be created for every byte of input. (This is + /// bad for memory use, so we bound it with a cache.) + fn approximate_size(&self) -> usize { + self.cache.size + } +} + +/// An abstraction for representing a map of states. The map supports two +/// different ways of state lookup. One is fast constant time access via a +/// state pointer. The other is a hashmap lookup based on the DFA's +/// constituent NFA states. +/// +/// A DFA state internally uses an Arc such that we only need to store the +/// set of NFA states on the heap once, even though we support looking up +/// states by two different means. A more natural way to express this might +/// use raw pointers, but an Arc is safe and effectively achieves the same +/// thing. 
+#[derive(Debug)] +struct StateMap { + /// The keys are not actually static but rely on always pointing to a + /// buffer in `states` which will never be moved except when clearing + /// the map or on drop, in which case the keys of this map will be + /// removed before + map: HashMap, + /// Our set of states. Note that `StatePtr / num_byte_classes` indexes + /// this Vec rather than just a `StatePtr`. + states: Vec, + /// The number of byte classes in the DFA. Used to index `states`. + num_byte_classes: usize, +} + +impl StateMap { + fn new(num_byte_classes: usize) -> StateMap { + StateMap { map: HashMap::new(), states: vec![], num_byte_classes } + } + + fn len(&self) -> usize { + self.states.len() + } + + fn is_empty(&self) -> bool { + self.states.is_empty() + } + + fn get_ptr(&self, state: &State) -> Option { + self.map.get(state).cloned() + } + + fn get_state(&self, si: StatePtr) -> Option<&State> { + self.states.get(si as usize / self.num_byte_classes) + } + + fn insert(&mut self, state: State, si: StatePtr) { + self.map.insert(state.clone(), si); + self.states.push(state); + } + + fn clear(&mut self) { + self.map.clear(); + self.states.clear(); + } +} + +impl Transitions { + /// Create a new transition table. + /// + /// The number of byte classes corresponds to the stride. Every state will + /// have `num_byte_classes` slots for transitions. + fn new(num_byte_classes: usize) -> Transitions { + Transitions { table: vec![], num_byte_classes } + } + + /// Returns the total number of states currently in this table. + fn num_states(&self) -> usize { + self.table.len() / self.num_byte_classes + } + + /// Allocates room for one additional state and returns a pointer to it. + /// + /// If there's no more room, None is returned. 
+ fn add(&mut self) -> Option { + let si = self.table.len(); + if si > STATE_MAX as usize { + return None; + } + self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes)); + Some(usize_to_u32(si)) + } + + /// Clears the table of all states. + fn clear(&mut self) { + self.table.clear(); + } + + /// Sets the transition from (si, cls) to next. + fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) { + self.table[si as usize + cls] = next; + } + + /// Returns the transition corresponding to (si, cls). + fn next(&self, si: StatePtr, cls: usize) -> StatePtr { + self.table[si as usize + cls] + } + + /// The heap size, in bytes, of a single state in the transition table. + fn state_heap_size(&self) -> usize { + self.num_byte_classes * mem::size_of::() + } + + /// Like `next`, but uses unchecked access and is therefore not safe. + unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr { + debug_assert!((si as usize) < self.table.len()); + debug_assert!(cls < self.num_byte_classes); + *self.table.get_unchecked(si as usize + cls) + } +} + +impl StateFlags { + fn is_match(&self) -> bool { + self.0 & 0b0000_0001 > 0 + } + + fn set_match(&mut self) { + self.0 |= 0b0000_0001; + } + + fn is_word(&self) -> bool { + self.0 & 0b0000_0010 > 0 + } + + fn set_word(&mut self) { + self.0 |= 0b0000_0010; + } + + fn has_empty(&self) -> bool { + self.0 & 0b0000_0100 > 0 + } + + fn set_empty(&mut self) { + self.0 |= 0b0000_0100; + } +} + +impl Byte { + fn byte(b: u8) -> Self { + Byte(b as u16) + } + fn eof() -> Self { + Byte(256) + } + fn is_eof(&self) -> bool { + self.0 == 256 + } + + fn is_ascii_word(&self) -> bool { + let b = match self.as_byte() { + None => return false, + Some(b) => b, + }; + match b { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true, + _ => false, + } + } + + fn as_byte(&self) -> Option { + if self.is_eof() { + None + } else { + Some(self.0 as u8) + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut 
fmt::Formatter<'_>) -> fmt::Result { + let ips: Vec = self.inst_ptrs().collect(); + f.debug_struct("State") + .field("flags", &self.flags()) + .field("insts", &ips) + .finish() + } +} + +impl fmt::Debug for Transitions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut fmtd = f.debug_map(); + for si in 0..self.num_states() { + let s = si * self.num_byte_classes; + let e = s + self.num_byte_classes; + fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e])); + } + fmtd.finish() + } +} + +struct TransitionsRow<'a>(&'a [StatePtr]); + +impl<'a> fmt::Debug for TransitionsRow<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut fmtd = f.debug_map(); + for (b, si) in self.0.iter().enumerate() { + match *si { + STATE_UNKNOWN => {} + STATE_DEAD => { + fmtd.entry(&vb(b as usize), &"DEAD"); + } + si => { + fmtd.entry(&vb(b as usize), &si.to_string()); + } + } + } + fmtd.finish() + } +} + +impl fmt::Debug for StateFlags { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StateFlags") + .field("is_match", &self.is_match()) + .field("is_word", &self.is_word()) + .field("has_empty", &self.has_empty()) + .finish() + } +} + +/// Helper function for formatting a byte as a nice-to-read escaped string. 
+fn vb(b: usize) -> String { + use std::ascii::escape_default; + + if b > ::std::u8::MAX as usize { + "EOF".to_owned() + } else { + let escaped = escape_default(b as u8).collect::>(); + String::from_utf8_lossy(&escaped).into_owned() + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} + +#[allow(dead_code)] // useful for debugging +fn show_state_ptr(si: StatePtr) -> String { + let mut s = format!("{:?}", si & STATE_MAX); + if si == STATE_UNKNOWN { + s = format!("{} (unknown)", s); + } + if si == STATE_DEAD { + s = format!("{} (dead)", s); + } + if si == STATE_QUIT { + s = format!("{} (quit)", s); + } + if si & STATE_START > 0 { + s = format!("{} (start)", s); + } + if si & STATE_MATCH > 0 { + s = format!("{} (match)", s); + } + s +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_vari32(data: &mut Vec, n: i32) { + let mut un = (n as u32) << 1; + if n < 0 { + un = !un; + } + write_varu32(data, un) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_vari32(data: &[u8]) -> (i32, usize) { + let (un, i) = read_varu32(data); + let mut n = (un >> 1) as i32; + if un & 1 != 0 { + n = !n; + } + (n, i) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_varu32(data: &mut Vec, mut n: u32) { + while n >= 0b1000_0000 { + data.push((n as u8) | 0b1000_0000); + n >>= 7; + } + data.push(n as u8); +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_varu32(data: &[u8]) -> (u32, usize) { + let mut n: u32 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return (n | ((b as u32) << shift), i + 1); + } + n |= ((b as u32) & 0b0111_1111) << shift; + shift += 7; + } + (0, 0) +} + +#[cfg(test)] +mod tests { + + use super::{ + push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, + 
State, StateFlags, + }; + use quickcheck::{quickcheck, Gen, QuickCheck}; + use std::sync::Arc; + + #[test] + fn prop_state_encode_decode() { + fn p(mut ips: Vec, flags: u8) -> bool { + // It looks like our encoding scheme can't handle instruction + // pointers at or above 2**31. We should fix that, but it seems + // unlikely to occur in real code due to the amount of memory + // required for such a state machine. So for now, we just clamp + // our test data. + for ip in &mut ips { + if *ip >= 1 << 31 { + *ip = (1 << 31) - 1; + } + } + let mut data = vec![flags]; + let mut prev = 0; + for &ip in ips.iter() { + push_inst_ptr(&mut data, &mut prev, ip); + } + let state = State { data: Arc::from(&data[..]) }; + + let expected: Vec = + ips.into_iter().map(|ip| ip as usize).collect(); + let got: Vec = state.inst_ptrs().collect(); + expected == got && state.flags() == StateFlags(flags) + } + QuickCheck::new() + .gen(Gen::new(10_000)) + .quickcheck(p as fn(Vec, u8) -> bool); + } + + #[test] + fn prop_read_write_u32() { + fn p(n: u32) -> bool { + let mut buf = vec![]; + write_varu32(&mut buf, n); + let (got, nread) = read_varu32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(u32) -> bool); + } + + #[test] + fn prop_read_write_i32() { + fn p(n: i32) -> bool { + let mut buf = vec![]; + write_vari32(&mut buf, n); + let (got, nread) = read_vari32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(i32) -> bool); + } +} diff --git a/regex-1.8.4/src/error.rs b/regex-1.8.4/src/error.rs new file mode 100644 index 0000000000000..6c341f604ba61 --- /dev/null +++ b/regex-1.8.4/src/error.rs @@ -0,0 +1,89 @@ +use std::fmt; +use std::iter::repeat; + +/// An error that occurred during parsing or compiling a regular expression. +#[derive(Clone, PartialEq)] +pub enum Error { + /// A syntax error. + Syntax(String), + /// The compiled program exceeded the set size + /// limit. 
The argument is the size limit imposed by + /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even + /// when not configured explicitly, it defaults to a reasonable limit. + /// + /// If you're getting this error, it occurred because your regex has been + /// compiled to an intermediate state that is too big. It is important to + /// note that exceeding this limit does _not_ mean the regex is too big to + /// _work_, but rather, the regex is big enough that it may wind up being + /// surprisingly slow when used in a search. In other words, this error is + /// meant to be a practical heuristic for avoiding a performance footgun, + /// and especially so for the case where the regex pattern is coming from + /// an untrusted source. + /// + /// There are generally two ways to move forward if you hit this error. + /// The first is to find some way to use a smaller regex. The second is to + /// increase the size limit via `RegexBuilder::size_limit`. However, if + /// your regex pattern is not from a trusted source, then neither of these + /// approaches may be appropriate. Instead, you'll have to determine just + /// how big of a regex you want to allow. + CompiledTooBig(usize), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ::std::error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. 
+ #[allow(deprecated)] + fn description(&self) -> &str { + match *self { + Error::Syntax(ref err) => err, + Error::CompiledTooBig(_) => "compiled program too big", + Error::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Syntax(ref err) => err.fmt(f), + Error::CompiledTooBig(limit) => write!( + f, + "Compiled regex exceeds size limit of {} bytes.", + limit + ), + Error::__Nonexhaustive => unreachable!(), + } + } +} + +// We implement our own Debug implementation so that we show nicer syntax +// errors when people use `Regex::new(...).unwrap()`. It's a little weird, +// but the `Syntax` variant is already storing a `String` anyway, so we might +// as well format it nicely. +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Syntax(ref err) => { + let hr: String = repeat('~').take(79).collect(); + writeln!(f, "Syntax(")?; + writeln!(f, "{}", hr)?; + writeln!(f, "{}", err)?; + writeln!(f, "{}", hr)?; + write!(f, ")")?; + Ok(()) + } + Error::CompiledTooBig(limit) => { + f.debug_tuple("CompiledTooBig").field(&limit).finish() + } + Error::__Nonexhaustive => { + f.debug_tuple("__Nonexhaustive").finish() + } + } + } +} diff --git a/regex-1.8.4/src/exec.rs b/regex-1.8.4/src/exec.rs new file mode 100644 index 0000000000000..d44908c66e8d3 --- /dev/null +++ b/regex-1.8.4/src/exec.rs @@ -0,0 +1,1759 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::panic::AssertUnwindSafe; +use std::sync::Arc; + +#[cfg(feature = "perf-literal")] +use aho_corasick::{AhoCorasick, MatchKind}; +use regex_syntax::hir::literal; +use regex_syntax::hir::{Hir, Look}; +use regex_syntax::ParserBuilder; + +use crate::backtrack; +use crate::compile::Compiler; +#[cfg(feature = "perf-dfa")] +use crate::dfa; +use crate::error::Error; +use crate::input::{ByteInput, CharInput}; +use crate::literal::LiteralSearcher; 
+use crate::pikevm; +use crate::pool::{Pool, PoolGuard}; +use crate::prog::Program; +use crate::re_builder::RegexOptions; +use crate::re_bytes; +use crate::re_set; +use crate::re_trait::{Locations, RegularExpression, Slot}; +use crate::re_unicode; +use crate::utf8::next_utf8; + +/// `Exec` manages the execution of a regular expression. +/// +/// In particular, this manages the various compiled forms of a single regular +/// expression and the choice of which matching engine to use to execute a +/// regular expression. +#[derive(Debug)] +pub struct Exec { + /// All read only state. + ro: Arc, + /// A pool of reusable values for the various matching engines. + /// + /// Note that boxing this value is not strictly necessary, but it is an + /// easy way to ensure that T does not bloat the stack sized used by a pool + /// in the case where T is big. And this turns out to be the case at the + /// time of writing for regex's use of this pool. At the time of writing, + /// the size of a Regex on the stack is 856 bytes. Boxing this value + /// reduces that size to 16 bytes. + pool: Box>, +} + +/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This +/// means it is no longer Sync, but we can now avoid the overhead of +/// synchronization to fetch the cache. +#[derive(Debug)] +pub struct ExecNoSync<'c> { + /// All read only state. + ro: &'c Arc, + /// Caches for the various matching engines. + cache: PoolGuard<'c, ProgramCache>, +} + +/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. +#[derive(Debug)] +pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); + +/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such +/// state is determined at compile time and never changes during search. +#[derive(Debug)] +struct ExecReadOnly { + /// The original regular expressions given by the caller to compile. + res: Vec, + /// A compiled program that is used in the NFA simulation and backtracking. 
+ /// It can be byte-based or Unicode codepoint based. + /// + /// N.B. It is not possibly to make this byte-based from the public API. + /// It is only used for testing byte based programs in the NFA simulations. + nfa: Program, + /// A compiled byte based program for DFA execution. This is only used + /// if a DFA can be executed. (Currently, only word boundary assertions are + /// not supported.) Note that this program contains an embedded `.*?` + /// preceding the first capture group, unless the regex is anchored at the + /// beginning. + #[allow(dead_code)] + dfa: Program, + /// The same as above, except the program is reversed (and there is no + /// preceding `.*?`). This is used by the DFA to find the starting location + /// of matches. + #[allow(dead_code)] + dfa_reverse: Program, + /// A set of suffix literals extracted from the regex. + /// + /// Prefix literals are stored on the `Program`, since they are used inside + /// the matching engines. + #[allow(dead_code)] + suffixes: LiteralSearcher, + /// An Aho-Corasick automaton with leftmost-first match semantics. + /// + /// This is only set when the entire regex is a simple unanchored + /// alternation of literals. We could probably use it more circumstances, + /// but this is already hacky enough in this architecture. + /// + /// N.B. We use u32 as a state ID representation under the assumption that + /// if we were to exhaust the ID space, we probably would have long + /// surpassed the compilation size limit. + #[cfg(feature = "perf-literal")] + ac: Option, + /// match_type encodes as much upfront knowledge about how we're going to + /// execute a search as possible. + match_type: MatchType, +} + +/// Facilitates the construction of an executor by exposing various knobs +/// to control how a regex is executed and what kinds of resources it's +/// permitted to use. +// `ExecBuilder` is only public via the `internal` module, so avoid deriving +// `Debug`. 
+#[allow(missing_debug_implementations)] +pub struct ExecBuilder { + options: RegexOptions, + match_type: Option, + bytes: bool, + only_utf8: bool, +} + +/// Parsed represents a set of parsed regular expressions and their detected +/// literals. +struct Parsed { + exprs: Vec, + prefixes: literal::Seq, + suffixes: literal::Seq, + bytes: bool, +} + +impl ExecBuilder { + /// Create a regex execution builder. + /// + /// This uses default settings for everything except the regex itself, + /// which must be provided. Further knobs can be set by calling methods, + /// and then finally, `build` to actually create the executor. + pub fn new(re: &str) -> Self { + Self::new_many(&[re]) + } + + /// Like new, but compiles the union of the given regular expressions. + /// + /// Note that when compiling 2 or more regular expressions, capture groups + /// are completely unsupported. (This means both `find` and `captures` + /// won't work.) + pub fn new_many(res: I) -> Self + where + S: AsRef, + I: IntoIterator, + { + let mut opts = RegexOptions::default(); + opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); + Self::new_options(opts) + } + + /// Create a regex execution builder. + pub fn new_options(opts: RegexOptions) -> Self { + ExecBuilder { + options: opts, + match_type: None, + bytes: false, + only_utf8: true, + } + } + + /// Set the matching engine to be automatically determined. + /// + /// This is the default state and will apply whatever optimizations are + /// possible, such as running a DFA. + /// + /// This overrides whatever was previously set via the `nfa` or + /// `bounded_backtracking` methods. + pub fn automatic(mut self) -> Self { + self.match_type = None; + self + } + + /// Sets the matching engine to use the NFA algorithm no matter what + /// optimizations are possible. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `bounded_backtracking` methods. 
+ pub fn nfa(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); + self + } + + /// Sets the matching engine to use a bounded backtracking engine no + /// matter what optimizations are possible. + /// + /// One must use this with care, since the bounded backtracking engine + /// uses memory proportion to `len(regex) * len(text)`. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `nfa` methods. + pub fn bounded_backtracking(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); + self + } + + /// Compiles byte based programs for use with the NFA matching engines. + /// + /// By default, the NFA engines match on Unicode scalar values. They can + /// be made to use byte based programs instead. In general, the byte based + /// programs are slower because of a less efficient encoding of character + /// classes. + /// + /// Note that this does not impact DFA matching engines, which always + /// execute on bytes. + pub fn bytes(mut self, yes: bool) -> Self { + self.bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.only_utf8 = yes; + self + } + + /// Set the Unicode flag. + pub fn unicode(mut self, yes: bool) -> Self { + self.options.unicode = yes; + self + } + + /// Parse the current set of patterns into their AST and extract literals. + fn parse(&self) -> Result { + let mut exprs = Vec::with_capacity(self.options.pats.len()); + let mut prefixes = Some(literal::Seq::empty()); + let mut suffixes = Some(literal::Seq::empty()); + let mut bytes = false; + let is_set = self.options.pats.len() > 1; + // If we're compiling a regex set and that set has any anchored + // expressions, then disable all literal optimizations. 
+ for pat in &self.options.pats { + let mut parser = ParserBuilder::new() + .octal(self.options.octal) + .case_insensitive(self.options.case_insensitive) + .multi_line(self.options.multi_line) + .dot_matches_new_line(self.options.dot_matches_new_line) + .swap_greed(self.options.swap_greed) + .ignore_whitespace(self.options.ignore_whitespace) + .unicode(self.options.unicode) + .utf8(self.only_utf8) + .nest_limit(self.options.nest_limit) + .build(); + let expr = + parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; + let props = expr.properties(); + // This used to just check whether the HIR matched valid UTF-8 + // or not, but in regex-syntax 0.7, we changed our definition of + // "matches valid UTF-8" to exclude zero-width matches. And in + // particular, previously, we considered WordAsciiNegate (that + // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our + // matcher engines were built under this assumption and fixing + // them is not worth it with the imminent plan to switch over to + // regex-automata. So for now, we retain the previous behavior by + // just explicitly treating the presence of a negated ASCII word + // boundary as forcing use to use a byte oriented automaton. + bytes = bytes + || !props.is_utf8() + || props.look_set().contains(Look::WordAsciiNegate); + + if cfg!(feature = "perf-literal") { + if !props.look_set_prefix().contains(Look::Start) + && props.look_set().contains(Look::Start) + { + // Partial anchors unfortunately make it hard to use + // prefixes, so disable them. + prefixes = None; + } else if is_set + && props.look_set_prefix_any().contains(Look::Start) + { + // Regex sets with anchors do not go well with literal + // optimizations. + prefixes = None; + } else if props.look_set_prefix_any().contains_word() { + // The new literal extractor ignores look-around while + // the old one refused to extract prefixes from regexes + // that began with a \b. 
These old creaky regex internals + // can't deal with it, so we drop it. + prefixes = None; + } else if props.look_set_prefix_any().contains(Look::StartLF) { + // Similar to the reasoning for word boundaries, this old + // regex engine can't handle literal prefixes with '(?m:^)' + // at the beginning of a regex. + prefixes = None; + } + + if !props.look_set_suffix().contains(Look::End) + && props.look_set().contains(Look::End) + { + // Partial anchors unfortunately make it hard to use + // suffixes, so disable them. + suffixes = None; + } else if is_set + && props.look_set_suffix_any().contains(Look::End) + { + // Regex sets with anchors do not go well with literal + // optimizations. + suffixes = None; + } else if props.look_set_suffix_any().contains_word() { + // See the prefix case for reasoning here. + suffixes = None; + } else if props.look_set_suffix_any().contains(Look::EndLF) { + // See the prefix case for reasoning here. + suffixes = None; + } + + let (mut pres, mut suffs) = + if prefixes.is_none() && suffixes.is_none() { + (literal::Seq::infinite(), literal::Seq::infinite()) + } else { + literal_analysis(&expr) + }; + // These old creaky regex internals can't handle cases where + // the literal sequences are exact but there are look-around + // assertions. So we make sure the sequences are inexact if + // there are look-around assertions anywhere. This forces the + // regex engines to run instead of assuming that a literal + // match implies an overall match. 
+ if !props.look_set().is_empty() { + pres.make_inexact(); + suffs.make_inexact(); + } + prefixes = prefixes.and_then(|mut prefixes| { + prefixes.union(&mut pres); + Some(prefixes) + }); + suffixes = suffixes.and_then(|mut suffixes| { + suffixes.union(&mut suffs); + Some(suffixes) + }); + } + exprs.push(expr); + } + Ok(Parsed { + exprs, + prefixes: prefixes.unwrap_or_else(literal::Seq::empty), + suffixes: suffixes.unwrap_or_else(literal::Seq::empty), + bytes, + }) + } + + /// Build an executor that can run a regular expression. + pub fn build(self) -> Result { + // Special case when we have no patterns to compile. + // This can happen when compiling a regex set. + if self.options.pats.is_empty() { + let ro = Arc::new(ExecReadOnly { + res: vec![], + nfa: Program::new(), + dfa: Program::new(), + dfa_reverse: Program::new(), + suffixes: LiteralSearcher::empty(), + #[cfg(feature = "perf-literal")] + ac: None, + match_type: MatchType::Nothing, + }); + let pool = ExecReadOnly::new_pool(&ro); + return Ok(Exec { ro, pool }); + } + let parsed = self.parse()?; + let mut nfa = Compiler::new() + .size_limit(self.options.size_limit) + .bytes(self.bytes || parsed.bytes) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa_reverse = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .reverse(true) + .compile(&parsed.exprs)?; + + #[cfg(feature = "perf-literal")] + let ac = self.build_aho_corasick(&parsed); + nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); + dfa.prefixes = nfa.prefixes.clone(); + dfa.dfa_size_limit = self.options.dfa_size_limit; + dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; + + let mut ro = ExecReadOnly { + res: self.options.pats, + nfa, + dfa, + dfa_reverse, + suffixes: LiteralSearcher::suffixes(parsed.suffixes), + #[cfg(feature = 
"perf-literal")] + ac, + match_type: MatchType::Nothing, + }; + ro.match_type = ro.choose_match_type(self.match_type); + + let ro = Arc::new(ro); + let pool = ExecReadOnly::new_pool(&ro); + Ok(Exec { ro, pool }) + } + + #[cfg(feature = "perf-literal")] + fn build_aho_corasick(&self, parsed: &Parsed) -> Option { + if parsed.exprs.len() != 1 { + return None; + } + let lits = match alternation_literals(&parsed.exprs[0]) { + None => return None, + Some(lits) => lits, + }; + // If we have a small number of literals, then let Teddy handle + // things (see literal/mod.rs). + if lits.len() <= 32 { + return None; + } + Some( + AhoCorasick::builder() + .match_kind(MatchKind::LeftmostFirst) + .build(&lits) + // This should never happen because we'd long exceed the + // compilation limit for regexes first. + .expect("AC automaton too big"), + ) + } +} + +impl<'c> RegularExpression for ExecNoSyncStr<'c> { + type Text = str; + + fn slots_len(&self) -> usize { + self.0.slots_len() + } + + fn next_after_empty(&self, text: &str, i: usize) -> usize { + next_utf8(text.as_bytes(), i) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &str, start: usize) -> Option { + self.0.shortest_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.is_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + self.0.find_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn captures_read_at( + &self, + locs: &mut Locations, + text: &str, + start: usize, + ) -> Option<(usize, usize)> { + self.0.captures_read_at(locs, text.as_bytes(), start) + } +} + +impl<'c> RegularExpression for ExecNoSync<'c> { + type Text = [u8]; + + /// Returns the number of capture slots in the regular expression. 
(There + /// are two slots for every capture group, corresponding to possibly empty + /// start and end locations of the capture.) + fn slots_len(&self) -> usize { + self.ro.nfa.captures.len() * 2 + } + + fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { + i + 1 + } + + /// Returns the end of a match location, possibly occurring before the + /// end location of the correct leftmost-first match. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &[u8], start: usize) -> Option { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).map(|(_, e)| e) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(end) => Some(end), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len() - start, + ) { + dfa::Result::Match(_) => Some(text.len()), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(e) => Some(e), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), + MatchType::Nothing => None, + } + } + + /// Returns true if and only if the regex matches text. + /// + /// For single regular expressions, this is equivalent to calling + /// shortest_match(...).is_some(). 
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &[u8], start: usize) -> bool { + if !self.is_anchor_end_match(text) { + return false; + } + // We need to do this dance because shortest_match relies on the NFA + // filling in captures[1], but a RegexSet has no captures. In other + // words, a RegexSet can't (currently) use shortest_match. ---AG + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).is_some() + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len() - start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), + MatchType::Nothing => false, + } + } + + /// Finds the start and end location of the leftmost-first match, starting + /// at the given location. 
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => self.find_literals(ty, text, start), + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + }, + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + MatchType::Nfa(ty) => self.find_nfa(ty, text, start), + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with find") + } + } + } + + /// Finds the start and end location of the leftmost-first match and also + /// fills in all matching capture groups. + /// + /// The number of capture slots given should be equal to the total number + /// of capture slots in the compiled program. + /// + /// Note that the first two slots always correspond to the start and end + /// locations of the overall match. 
+ fn captures_read_at( + &self, + locs: &mut Locations, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let slots = locs.as_slots(); + for slot in slots.iter_mut() { + *slot = None; + } + // If the caller unnecessarily uses this, then we try to save them + // from themselves. + match slots.len() { + 0 => return self.find_at(text, start), + 2 => { + return self.find_at(text, start).map(|(s, e)| { + slots[0] = Some(s); + slots[1] = Some(e); + (s, e) + }); + } + _ => {} // fallthrough + } + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).and_then(|(s, e)| { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ) + }) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => { + if self.ro.nfa.is_anchored_start { + self.captures_nfa(slots, text, start) + } else { + match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.captures_nfa(slots, text, start) + } + } + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + MatchType::Nfa(ty) => { + self.captures_nfa_type(ty, slots, text, start, text.len()) 
+ } + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with captures") + } + } + } +} + +impl<'c> ExecNoSync<'c> { + /// Finds the leftmost-first match using only literal search. + #[cfg(feature = "perf-literal")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_literals( + &self, + ty: MatchLiteralType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + use self::MatchLiteralType::*; + match ty { + Unanchored => { + let lits = &self.ro.nfa.prefixes; + lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) + } + AnchoredStart => { + let lits = &self.ro.nfa.prefixes; + if start == 0 || !self.ro.nfa.is_anchored_start { + lits.find_start(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } else { + None + } + } + AnchoredEnd => { + let lits = &self.ro.suffixes; + lits.find_end(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } + AhoCorasick => self + .ro + .ac + .as_ref() + .unwrap() + .find(&text[start..]) + .map(|m| (start + m.start(), start + m.end())), + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_forward( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + let end = match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + start, + ) { + NoMatch(i) => return NoMatch(i), + Quit => return Quit, + Match(end) if start == end => return Match((start, start)), + Match(end) => end, + }; + // Now run the DFA in reverse to find the start of the match. 
+ match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + end - start, + ) { + Match(s) => Match((start + s, end)), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA, + /// but assumes the regex is anchored at the end and therefore starts at + /// the end of the regex and matches in reverse. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_anchored_reverse( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + text.len() - start, + ) { + Match(s) => Match((start + s, text.len())), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the end of the shortest match using only the DFA. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result { + dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result { + match self.exec_dfa_reverse_suffix(text, start) { + None => self.shortest_dfa(text, start), + Some(r) => r.map(|(_, end)| end), + } + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. It also reports the start of the match. + /// + /// Note that if None is returned, then the optimization gave up to avoid + /// worst case quadratic behavior. 
A forward scanning DFA should be tried + /// next. + /// + /// If a match is returned and the full leftmost-first match is desired, + /// then a forward scan starting from the beginning of the match must be + /// done. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_dfa_reverse_suffix( + &self, + text: &[u8], + original_start: usize, + ) -> Option> { + use crate::dfa::Result::*; + + let lcs = self.ro.suffixes.lcs(); + debug_assert!(lcs.len() >= 1); + let mut start = original_start; + let mut end = start; + let mut last_literal = start; + while end <= text.len() { + last_literal += match lcs.find(&text[last_literal..]) { + None => return Some(NoMatch(text.len())), + Some(i) => i, + }; + end = last_literal + lcs.len(); + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..end], + end - start, + ) { + Match(0) | NoMatch(0) => return None, + Match(i) => return Some(Match((start + i, end))), + NoMatch(i) => { + start += i; + last_literal += 1; + continue; + } + Quit => return Some(Quit), + } + } + Some(NoMatch(text.len())) + } + + /// Finds the leftmost-first match (start and end) using only the DFA + /// by scanning for suffix literals. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + + let match_start = match self.exec_dfa_reverse_suffix(text, start) { + None => return self.find_dfa_forward(text, start), + Some(Match((start, _))) => start, + Some(r) => return r, + }; + // At this point, we've found a match. 
The only way to quit now + // without a match is if the DFA gives up (seems unlikely). + // + // Now run the DFA forwards to find the proper end of the match. + // (The suffix literal match can only indicate the earliest + // possible end location, which may appear before the end of the + // leftmost-first match.) + match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + match_start, + ) { + NoMatch(_) => panic!("BUG: reverse match implies forward match"), + Quit => Quit, + Match(e) => Match((match_start, e)), + } + } + + /// Executes the NFA engine to return whether there is a match or not. + /// + /// Ideally, we could use shortest_nfa(...).is_some() and get the same + /// performance characteristics, but regex sets don't have captures, which + /// shortest_nfa depends on. + #[cfg(feature = "perf-dfa")] + fn match_nfa(&self, text: &[u8], start: usize) -> bool { + self.match_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like match_nfa, but allows specification of the type of NFA engine. + fn match_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> bool { + self.exec_nfa( + ty, + &mut [false], + &mut [], + true, + false, + text, + start, + text.len(), + ) + } + + /// Finds the shortest match using an NFA. + #[cfg(feature = "perf-dfa")] + fn shortest_nfa(&self, text: &[u8], start: usize) -> Option { + self.shortest_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like shortest_nfa, but allows specification of the type of NFA engine. + fn shortest_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + true, + true, + text, + start, + text.len(), + ) { + slots[1] + } else { + None + } + } + + /// Like find, but executes an NFA engine. 
+ fn find_nfa( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + false, + false, + text, + start, + text.len(), + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + /// Like find_nfa, but fills in captures. + /// + /// `slots` should have length equal to `2 * nfa.captures.len()`. + #[cfg(feature = "perf-dfa")] + fn captures_nfa( + &self, + slots: &mut [Slot], + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + start, + text.len(), + ) + } + + /// Like captures_nfa, but allows specification of type of NFA engine. + fn captures_nfa_type( + &self, + ty: MatchNfaType, + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> Option<(usize, usize)> { + if self.exec_nfa( + ty, + &mut [false], + slots, + false, + false, + text, + start, + end, + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + fn exec_nfa( + &self, + mut ty: MatchNfaType, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + quit_after_match_with_pos: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + use self::MatchNfaType::*; + if let Auto = ty { + if backtrack::should_exec(self.ro.nfa.len(), text.len()) { + ty = Backtrack; + } else { + ty = PikeVM; + } + } + // The backtracker can't return the shortest match position as it is + // implemented today. So if someone calls `shortest_match` and we need + // to run an NFA, then use the PikeVM. + if quit_after_match_with_pos || ty == PikeVM { + self.exec_pikevm( + matches, + slots, + quit_after_match, + text, + start, + end, + ) + } else { + self.exec_backtrack(matches, slots, text, start, end) + } + } + + /// Always run the NFA algorithm. 
+ fn exec_pikevm( + &self, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + CharInput::new(text), + start, + end, + ) + } + } + + /// Always runs the NFA using bounded backtracking. + fn exec_backtrack( + &self, + matches: &mut [bool], + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + CharInput::new(text), + start, + end, + ) + } + } + + /// Finds which regular expressions match the given text. + /// + /// `matches` should have length equal to the number of regexes being + /// searched. + /// + /// This is only useful when one wants to know which regexes in a set + /// match some text. 
+ pub fn many_matches_at( + &self, + matches: &mut [bool], + text: &[u8], + start: usize, + ) -> bool { + use self::MatchType::*; + if !self.is_anchor_end_match(text) { + return false; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + Literal(ty) => { + debug_assert_eq!(matches.len(), 1); + matches[0] = self.find_literals(ty, text, start).is_some(); + matches[0] + } + #[cfg(feature = "perf-dfa")] + Dfa | DfaAnchoredReverse | DfaMany => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + Nfa(ty) => self.exec_nfa( + ty, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + Nothing => false, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_anchor_end_match(&self, text: &[u8]) -> bool { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { + true + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { + // Only do this check if the haystack is big (>1MB). 
+ if text.len() > (1 << 20) && ro.nfa.is_anchored_end { + let lcs = ro.suffixes.lcs(); + if lcs.len() >= 1 && !lcs.is_suffix(text) { + return false; + } + } + true + } + + imp(&self.ro, text) + } + + pub fn capture_name_idx(&self) -> &Arc> { + &self.ro.nfa.capture_name_idx + } +} + +impl<'c> ExecNoSyncStr<'c> { + pub fn capture_name_idx(&self) -> &Arc> { + self.0.capture_name_idx() + } +} + +impl Exec { + /// Get a searcher that isn't Sync. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher(&self) -> ExecNoSync<'_> { + ExecNoSync { + ro: &self.ro, // a clone is too expensive here! (and not needed) + cache: self.pool.get(), + } + } + + /// Get a searcher that isn't Sync and can match on &str. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { + ExecNoSyncStr(self.searcher()) + } + + /// Build a Regex from this executor. + pub fn into_regex(self) -> re_unicode::Regex { + re_unicode::Regex::from(self) + } + + /// Build a RegexSet from this executor. + pub fn into_regex_set(self) -> re_set::unicode::RegexSet { + re_set::unicode::RegexSet::from(self) + } + + /// Build a Regex from this executor that can match arbitrary bytes. + pub fn into_byte_regex(self) -> re_bytes::Regex { + re_bytes::Regex::from(self) + } + + /// Build a RegexSet from this executor that can match arbitrary bytes. + pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { + re_set::bytes::RegexSet::from(self) + } + + /// The original regular expressions given by the caller that were + /// compiled. + pub fn regex_strings(&self) -> &[String] { + &self.ro.res + } + + /// Return a slice of capture names. + /// + /// Any capture that isn't named is None. + pub fn capture_names(&self) -> &[Option] { + &self.ro.nfa.captures + } + + /// Return a reference to named groups mapping (from group name to + /// group position). 
+ pub fn capture_name_idx(&self) -> &Arc> { + &self.ro.nfa.capture_name_idx + } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. + pub fn static_captures_len(&self) -> Option { + self.ro.nfa.static_captures_len + } +} + +impl Clone for Exec { + fn clone(&self) -> Exec { + let pool = ExecReadOnly::new_pool(&self.ro); + Exec { ro: self.ro.clone(), pool } + } +} + +impl ExecReadOnly { + fn choose_match_type(&self, hint: Option) -> MatchType { + if let Some(MatchType::Nfa(_)) = hint { + return hint.unwrap(); + } + // If the NFA is empty, then we'll never match anything. + if self.nfa.insts.is_empty() { + return MatchType::Nothing; + } + if let Some(literalty) = self.choose_literal_match_type() { + return literalty; + } + if let Some(dfaty) = self.choose_dfa_match_type() { + return dfaty; + } + // We're so totally hosed. + MatchType::Nfa(MatchNfaType::Auto) + } + + /// If a plain literal scan can be used, then a corresponding literal + /// search type is returned. + fn choose_literal_match_type(&self) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly) -> Option { + None + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly) -> Option { + // If our set of prefixes is complete, then we can use it to find + // a match in lieu of a regex engine. This doesn't quite work well + // in the presence of multiple regexes, so only do it when there's + // one. + // + // TODO(burntsushi): Also, don't try to match literals if the regex + // is partially anchored. We could technically do it, but we'd need + // to create two sets of literals: all of them and then the subset + // that aren't anchored. We would then only search for all of them + // when at the beginning of the input and use the subset in all + // other cases. 
+ if ro.res.len() != 1 { + return None; + } + if ro.ac.is_some() { + return Some(MatchType::Literal( + MatchLiteralType::AhoCorasick, + )); + } + if ro.nfa.prefixes.complete() { + return if ro.nfa.is_anchored_start { + Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) + } else { + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + if ro.suffixes.complete() { + return if ro.nfa.is_anchored_end { + Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) + } else { + // This case shouldn't happen. When the regex isn't + // anchored, then complete prefixes should imply complete + // suffixes. + // + // The above is wrong! This case can happen. While + // complete prefixes should imply complete suffixes + // here, that doesn't necessarily mean we have a useful + // prefix matcher! It could be the case that the literal + // searcher decided the prefixes---even though they are + // "complete"---weren't good enough and thus created an + // empty matcher. If that happens and we return Unanchored + // here, then we'll end up using that matcher, which is + // very bad because it matches at every position. So... + // return None. + None + }; + } + None + } + + imp(self) + } + + /// If a DFA scan can be used, then choose the appropriate DFA strategy. + fn choose_dfa_match_type(&self) -> Option { + #[cfg(not(feature = "perf-dfa"))] + fn imp(_: &ExecReadOnly) -> Option { + None + } + + #[cfg(feature = "perf-dfa")] + fn imp(ro: &ExecReadOnly) -> Option { + if !dfa::can_exec(&ro.dfa) { + return None; + } + // Regex sets require a slightly specialized path. + if ro.res.len() >= 2 { + return Some(MatchType::DfaMany); + } + // If the regex is anchored at the end but not the start, then + // just match in reverse from the end of the haystack. 
+ if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { + return Some(MatchType::DfaAnchoredReverse); + } + #[cfg(feature = "perf-literal")] + { + // If there's a longish suffix literal, then it might be faster + // to look for that first. + if ro.should_suffix_scan() { + return Some(MatchType::DfaSuffix); + } + } + // Fall back to your garden variety forward searching lazy DFA. + Some(MatchType::Dfa) + } + + imp(self) + } + + /// Returns true if the program is amenable to suffix scanning. + /// + /// When this is true, as a heuristic, we assume it is OK to quickly scan + /// for suffix literals and then do a *reverse* DFA match from any matches + /// produced by the literal scan. (And then followed by a forward DFA + /// search, since the previously found suffix literal maybe not actually be + /// the end of a match.) + /// + /// This is a bit of a specialized optimization, but can result in pretty + /// big performance wins if 1) there are no prefix literals and 2) the + /// suffix literals are pretty rare in the text. (1) is obviously easy to + /// account for but (2) is harder. As a proxy, we assume that longer + /// strings are generally rarer, so we only enable this optimization when + /// we have a meaty suffix. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + fn should_suffix_scan(&self) -> bool { + if self.suffixes.is_empty() { + return false; + } + let lcs_len = self.suffixes.lcs().char_len(); + lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() + } + + fn new_pool(ro: &Arc) -> Box> { + let ro = ro.clone(); + Box::new(Pool::new(Box::new(move || { + AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) + }))) + } +} + +#[derive(Clone, Copy, Debug)] +enum MatchType { + /// A single or multiple literal search. This is only used when the regex + /// can be decomposed into a literal search. + #[cfg(feature = "perf-literal")] + Literal(MatchLiteralType), + /// A normal DFA search. 
+ #[cfg(feature = "perf-dfa")] + Dfa, + /// A reverse DFA search starting from the end of a haystack. + #[cfg(feature = "perf-dfa")] + DfaAnchoredReverse, + /// A reverse DFA search with suffix literal scanning. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix, + /// Use the DFA on two or more regular expressions. + #[cfg(feature = "perf-dfa")] + DfaMany, + /// An NFA variant. + Nfa(MatchNfaType), + /// No match is ever possible, so don't ever try to search. + Nothing, +} + +#[derive(Clone, Copy, Debug)] +#[cfg(feature = "perf-literal")] +enum MatchLiteralType { + /// Match literals anywhere in text. + Unanchored, + /// Match literals only at the start of text. + AnchoredStart, + /// Match literals only at the end of text. + AnchoredEnd, + /// Use an Aho-Corasick automaton. This requires `ac` to be Some on + /// ExecReadOnly. + AhoCorasick, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MatchNfaType { + /// Choose between Backtrack and PikeVM. + Auto, + /// NFA bounded backtracking. + /// + /// (This is only set by tests, since it never makes sense to always want + /// backtracking.) + Backtrack, + /// The Pike VM. + /// + /// (This is only set by tests, since it never makes sense to always want + /// the Pike VM.) + PikeVM, +} + +/// `ProgramCache` maintains reusable allocations for each matching engine +/// available to a particular program. +/// +/// We declare this as unwind safe since it's a cache that's only used for +/// performance purposes. If a panic occurs, it is (or should be) always safe +/// to continue using the same regex object. 
+pub type ProgramCache = AssertUnwindSafe>; + +#[derive(Debug)] +pub struct ProgramCacheInner { + pub pikevm: pikevm::Cache, + pub backtrack: backtrack::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa: dfa::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa_reverse: dfa::Cache, +} + +impl ProgramCacheInner { + fn new(ro: &ExecReadOnly) -> Self { + ProgramCacheInner { + pikevm: pikevm::Cache::new(&ro.nfa), + backtrack: backtrack::Cache::new(&ro.nfa), + #[cfg(feature = "perf-dfa")] + dfa: dfa::Cache::new(&ro.dfa), + #[cfg(feature = "perf-dfa")] + dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), + } + } +} + +/// Alternation literals checks if the given HIR is a simple alternation of +/// literals, and if so, returns them. Otherwise, this returns None. +#[cfg(feature = "perf-literal")] +fn alternation_literals(expr: &Hir) -> Option>> { + use regex_syntax::hir::{HirKind, Literal}; + + // This is pretty hacky, but basically, if `is_alternation_literal` is + // true, then we can make several assumptions about the structure of our + // HIR. This is what justifies the `unreachable!` statements below. + // + // This code should be refactored once we overhaul this crate's + // optimization pipeline, because this is a terribly inflexible way to go + // about things. 
+ + if !expr.properties().is_alternation_literal() { + return None; + } + let alts = match *expr.kind() { + HirKind::Alternation(ref alts) => alts, + _ => return None, // one literal isn't worth it + }; + + let mut lits = vec![]; + for alt in alts { + let mut lit = vec![]; + match *alt.kind() { + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } + HirKind::Concat(ref exprs) => { + for e in exprs { + match *e.kind() { + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } + _ => unreachable!("expected literal, got {:?}", e), + } + } + } + _ => unreachable!("expected literal or concat, got {:?}", alt), + } + lits.push(lit); + } + Some(lits) +} + +#[cfg(not(feature = "perf-literal"))] +fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) { + (literal::Seq::infinite(), literal::Seq::infinite()) +} + +#[cfg(feature = "perf-literal")] +fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) { + const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)]; + + let mut prefixes = literal::Extractor::new() + .kind(literal::ExtractKind::Prefix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match prefixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + prefixes.keep_first_bytes(keep); + prefixes.minimize_by_preference(); + } + + let mut suffixes = literal::Extractor::new() + .kind(literal::ExtractKind::Suffix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match suffixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + suffixes.keep_last_bytes(keep); + suffixes.minimize_by_preference(); + } + + (prefixes, suffixes) +} + +#[cfg(test)] +mod test { + #[test] + fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new("^S") + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + 
.map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new("^S") + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = vec![83, 83]; + + let s1 = backtrack_bytes_re.split(&input); + let s2 = default_bytes_re.split(&input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } + + #[test] + fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bounded_backtracking() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = "**"; + + let s1 = backtrack_bytes_re.split(input); + let s2 = default_bytes_re.split(input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } +} diff --git a/regex-1.8.4/src/expand.rs b/regex-1.8.4/src/expand.rs new file mode 100644 index 0000000000000..98fafc949f847 --- /dev/null +++ b/regex-1.8.4/src/expand.rs @@ -0,0 +1,247 @@ +use std::str; + +use crate::find_byte::find_byte; + +use crate::re_bytes; +use crate::re_unicode; + +pub fn expand_str( + caps: &re_unicode::Captures<'_>, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = 
&replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or("")); + } + Ref::Named(name) => { + dst.push_str( + caps.name(name).map(|m| m.as_str()).unwrap_or(""), + ); + } + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures<'_>, + mut replacement: &[u8], + dst: &mut Vec, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""), + ); + } + } + } + dst.extend(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref(replacement: &[u8]) -> Option> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = + str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. 
+ let cap = match str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + 
find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); +} diff --git a/regex-1.8.4/src/find_byte.rs b/regex-1.8.4/src/find_byte.rs new file mode 100644 index 0000000000000..e95f72afb9412 --- /dev/null +++ b/regex-1.8.4/src/find_byte.rs @@ -0,0 +1,18 @@ +/// Searches for the given needle in the given haystack. +/// +/// If the perf-literal feature is enabled, then this uses the super optimized +/// memchr crate. Otherwise, it uses the naive byte-at-a-time implementation. +pub fn find_byte(needle: u8, haystack: &[u8]) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(needle: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == needle) + } + + #[cfg(feature = "perf-literal")] + fn imp(needle: u8, haystack: &[u8]) -> Option { + use memchr::memchr; + memchr(needle, haystack) + } + + imp(needle, haystack) +} diff --git a/regex-1.8.4/src/freqs.rs b/regex-1.8.4/src/freqs.rs new file mode 100644 index 0000000000000..fcffa95fb55ba --- /dev/null +++ b/regex-1.8.4/src/freqs.rs @@ -0,0 +1,261 @@ +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly + +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' 
+ 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' 
+ 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/regex-1.8.4/src/input.rs b/regex-1.8.4/src/input.rs new file mode 100644 index 0000000000000..df6c3e0c91039 --- /dev/null +++ b/regex-1.8.4/src/input.rs @@ -0,0 +1,432 @@ +use std::char; +use std::cmp::Ordering; +use std::fmt; +use std::ops; +use std::u32; + +use crate::literal::LiteralSearcher; +use crate::prog::InstEmptyLook; +use crate::utf8::{decode_last_utf8, decode_utf8}; + +/// Represents a location in the input. +#[derive(Clone, Copy, Debug)] +pub struct InputAt { + pos: usize, + c: Char, + byte: Option, + len: usize, +} + +impl InputAt { + /// Returns true iff this position is at the beginning of the input. + pub fn is_start(&self) -> bool { + self.pos == 0 + } + + /// Returns true iff this position is past the end of the input. 
+ pub fn is_end(&self) -> bool { + self.c.is_none() && self.byte.is_none() + } + + /// Returns the character at this position. + /// + /// If this position is just before or after the input, then an absent + /// character is returned. + pub fn char(&self) -> Char { + self.c + } + + /// Returns the byte at this position. + pub fn byte(&self) -> Option { + self.byte + } + + /// Returns the UTF-8 width of the character at this position. + pub fn len(&self) -> usize { + self.len + } + + /// Returns whether the UTF-8 width of the character at this position + /// is zero. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the byte offset of this position. + pub fn pos(&self) -> usize { + self.pos + } + + /// Returns the byte offset of the next position in the input. + pub fn next_pos(&self) -> usize { + self.pos + self.len + } +} + +/// An abstraction over input used in the matching engines. +pub trait Input: fmt::Debug { + /// Return an encoding of the position at byte offset `i`. + fn at(&self, i: usize) -> InputAt; + + /// Return the Unicode character occurring next to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn next_char(&self, at: InputAt) -> Char; + + /// Return the Unicode character occurring previous to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn previous_char(&self, at: InputAt) -> Char; + + /// Return true if the given empty width instruction matches at the + /// input position given. + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + + /// Scan the input for a matching prefix. + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option; + + /// The number of bytes in the input. + fn len(&self) -> usize; + + /// Whether the input is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the given input as a sequence of bytes. 
+ fn as_bytes(&self) -> &[u8]; +} + +impl<'a, T: Input> Input for &'a T { + fn at(&self, i: usize) -> InputAt { + (**self).at(i) + } + + fn next_char(&self, at: InputAt) -> Char { + (**self).next_char(at) + } + + fn previous_char(&self, at: InputAt) -> Char { + (**self).previous_char(at) + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + (**self).is_empty_match(at, empty) + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { + (**self).prefix_at(prefixes, at) + } + + fn len(&self) -> usize { + (**self).len() + } + + fn as_bytes(&self) -> &[u8] { + (**self).as_bytes() + } +} + +/// An input reader over characters. +#[derive(Clone, Copy, Debug)] +pub struct CharInput<'t>(&'t [u8]); + +impl<'t> CharInput<'t> { + /// Return a new character input reader for the given string. + pub fn new(s: &'t [u8]) -> CharInput<'t> { + CharInput(s) + } +} + +impl<'t> ops::Deref for CharInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.0 + } +} + +impl<'t> Input for CharInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); + InputAt { pos: i, c, byte: None, len: c.len_utf8() } + } + } + + fn next_char(&self, at: InputAt) -> Char { + at.char() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } 
+ NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn as_bytes(&self) -> &[u8] { + self.0 + } +} + +/// An input reader over bytes. +#[derive(Clone, Copy, Debug)] +pub struct ByteInput<'t> { + text: &'t [u8], + only_utf8: bool, +} + +impl<'t> ByteInput<'t> { + /// Return a new byte-based input reader for the given string. + pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { + ByteInput { text, only_utf8 } + } +} + +impl<'t> ops::Deref for ByteInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.text + } +} + +impl<'t> Input for ByteInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + InputAt { + pos: i, + c: None.into(), + byte: self.get(i).cloned(), + len: 1, + } + } + } + + fn next_char(&self, at: InputAt) -> Char { + decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let 
(c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.text.len() + } + + fn as_bytes(&self) -> &[u8] { + self.text + } +} + +/// An inline representation of `Option`. +/// +/// This eliminates the need to do case analysis on `Option` to determine +/// ordinality with other characters. +/// +/// (The `Option` is not related to encoding. Instead, it is used in the +/// matching engines to represent the beginning and ending boundaries of the +/// search text.) +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Char(u32); + +impl fmt::Debug for Char { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match char::from_u32(self.0) { + None => write!(f, "Empty"), + Some(c) => write!(f, "{:?}", c), + } + } +} + +impl Char { + /// Returns true iff the character is absent. 
+ #[inline] + pub fn is_none(self) -> bool { + self.0 == u32::MAX + } + + /// Returns the length of the character's UTF-8 encoding. + /// + /// If the character is absent, then `1` is returned. + #[inline] + pub fn len_utf8(self) -> usize { + char::from_u32(self.0).map_or(1, |c| c.len_utf8()) + } + + /// Returns true iff the character is a word character. + /// + /// If the character is absent, then false is returned. + pub fn is_word_char(self) -> bool { + // is_word_character can panic if the Unicode data for \w isn't + // available. However, our compiler ensures that if a Unicode word + // boundary is used, then the data must also be available. If it isn't, + // then the compiler returns an error. + char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) + } + + /// Returns true iff the byte is a word byte. + /// + /// If the byte is absent, then false is returned. + pub fn is_word_byte(self) -> bool { + match char::from_u32(self.0) { + Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), + None | Some(_) => false, + } + } +} + +impl From for Char { + fn from(c: char) -> Char { + Char(c as u32) + } +} + +impl From> for Char { + fn from(c: Option) -> Char { + c.map_or(Char(u32::MAX), |c| c.into()) + } +} + +impl PartialEq for Char { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other as u32 + } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &Char) -> bool { + *self as u32 == other.0 + } +} + +impl PartialOrd for Char { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + self.0.partial_cmp(&(*other as u32)) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &Char) -> Option { + (*self as u32).partial_cmp(&other.0) + } +} diff --git a/regex-1.8.4/src/lib.rs b/regex-1.8.4/src/lib.rs new file mode 100644 index 0000000000000..861edcf43aecd --- /dev/null +++ b/regex-1.8.4/src/lib.rs @@ -0,0 +1,806 @@ +/*! 
+This crate provides a library for parsing, compiling, and executing regular +expressions. Its syntax is similar to Perl-style regular expressions, but lacks +a few features like look around and backreferences. In exchange, all searches +execute in linear time with respect to the size of the regular expression and +search text. + +This crate's documentation provides some simple examples, describes +[Unicode support](#unicode) and exhaustively lists the +[supported syntax](#syntax). + +For more specific details on the API for regular expressions, please see the +documentation for the [`Regex`](struct.Regex.html) type. + +# Usage + +This crate is [on crates.io](https://crates.io/crates/regex) and can be +used by adding `regex` to your dependencies in your project's `Cargo.toml`. + +```toml +[dependencies] +regex = "1" +``` + +# Example: find a date + +General use of regular expressions in this package involves compiling an +expression and then using it to search, split or replace text. For example, +to confirm that some text resembles a date: + +```rust +use regex::Regex; +let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +assert!(re.is_match("2014-01-01")); +``` + +Notice the use of the `^` and `$` anchors. In this crate, every expression +is executed with an implicit `.*?` at the beginning and end, which allows +it to match anywhere in the text. Anchors can be used to ensure that the +full text matches an expression. + +This example also demonstrates the utility of +[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals) +in Rust, which +are just like regular strings except they are prefixed with an `r` and do +not process any escape sequences. For example, `"\\d"` is the same +expression as `r"\d"`. + +# Example: Avoid compiling the same regex in a loop + +It is an anti-pattern to compile the same regular expression in a loop +since compilation is typically expensive. 
(It takes anywhere from a few +microseconds to a few **milliseconds** depending on the size of the +regex.) Not only is compilation itself expensive, but this also prevents +optimizations that reuse allocations internally to the matching engines. + +In Rust, it can sometimes be a pain to pass regular expressions around if +they're used from inside a helper function. Instead, we recommend using the +[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that +regular expressions are compiled exactly once. + +For example: + +```rust +use lazy_static::lazy_static; +use regex::Regex; + +fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref RE: Regex = Regex::new("...").unwrap(); + } + RE.is_match(text) +} + +fn main() {} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previous compilation. + +# Example: iterating over capture groups + +This crate provides convenient iterators for matching an expression +repeatedly against a search string to find successive non-overlapping +matches. For example, to find all dates in a string and be able to access +them by their component pieces: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); +let text = "2012-03-14, 2013-01-01 and 2014-07-05"; +for cap in re.captures_iter(text) { + println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); +} +// Output: +// Month: 03 Day: 14 Year: 2012 +// Month: 01 Day: 01 Year: 2013 +// Month: 07 Day: 05 Year: 2014 +# } +``` + +Notice that the year is in the capture group indexed at `1`. This is +because the *entire match* is stored in the capture group at index `0`. + +# Example: replacement with named capture groups + +Building on the previous example, perhaps we'd like to rearrange the date +formats. This can be done with text replacement. 
But to make the code +clearer, we can *name* our capture groups and use those names as variables +in our replacement text: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +The `replace` methods are actually polymorphic in the replacement, which +provides more flexibility than is seen here. (See the documentation for +`Regex::replace` for more details.) + +Note that if your regex gets complicated, you can use the `x` flag to +enable insignificant whitespace mode, which also lets you write comments: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?x) + (?P<y>\d{4}) # the year + - + (?P<m>\d{2}) # the month + - + (?P<d>\d{2}) # the day +").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +If you wish to match against whitespace in this mode, you can still use `\s`, +`\n`, `\t`, etc. For escaping a single space character, you can escape it +directly with `\ `, use its hex character code `\x20` or temporarily disable +the `x` flag, e.g., `(?-x: )`. + +# Example: match multiple regular expressions simultaneously + +This demonstrates how to use a `RegexSet` to match multiple (possibly +overlapping) regular expressions in a single scan of the search text: + +```rust +use regex::RegexSet; + +let set = RegexSet::new(&[ + r"\w+", + r"\d+", + r"\pL+", + r"foo", + r"bar", + r"barfoo", + r"foobar", +]).unwrap(); + +// Iterate over and collect all of the matches. 
+let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +assert_eq!(matches, vec![0, 2, 3, 4, 6]); + +// You can also test whether a particular regex matched: +let matches = set.matches("foobar"); +assert!(!matches.matched(5)); +assert!(matches.matched(6)); +``` + +# Pay for what you use + +With respect to searching text with a regular expression, there are three +questions that can be asked: + +1. Does the text match this expression? +2. If so, where does it match? +3. Where did the capturing groups match? + +Generally speaking, this crate could provide a function to answer only #3, +which would subsume #1 and #2 automatically. However, it can be significantly +more expensive to compute the location of capturing group matches, so it's best +not to do it if you don't need to. + +Therefore, only use what you need. For example, don't use `find` if you +only need to test if an expression matches a string. (Use `is_match` +instead.) + +# Unicode + +This implementation executes regular expressions **only** on valid UTF-8 +while exposing match locations as byte indices into the search string. (To +relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) +Conceptually, the regex engine works by matching a haystack as if it were a +sequence of Unicode scalar values. + +Only simple case folding is supported. Namely, when matching +case-insensitively, the characters are first mapped using the "simple" case +folding rules defined by Unicode. + +Regular expressions themselves are **only** interpreted as a sequence of +Unicode scalar values. This means you can use Unicode characters directly +in your expression: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)Δ+").unwrap(); +let mat = re.find("ΔδΔ").unwrap(); +assert_eq!((mat.start(), mat.end()), (0, 6)); +# } +``` + +Most features of the regular expressions in this crate are Unicode aware. 
Here +are some examples: + +* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`. + (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.) +* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms + of whitespace categorized by Unicode. +* `\b` matches a Unicode word boundary. +* Negated character classes like `[^a]` match all Unicode scalar values except + for `a`. +* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only + recognize `\n` and not any of the other forms of line terminators defined + by Unicode. + +Unicode general categories, scripts, script extensions, ages and a smattering +of boolean properties are available as character classes. For example, you can +match a sequence of numerals, Greek or Cherokee letters: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); +let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +assert_eq!((mat.start(), mat.end()), (3, 23)); +# } +``` + +For a more detailed breakdown of Unicode support with respect to +[UTS#18](https://unicode.org/reports/tr18/), +please see the +[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md) +document in the root of the regex repository. + +# Opt out of Unicode support + +The `bytes` sub-module provides a `Regex` type that can be used to match +on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with +the main `Regex` type. However, this behavior can be disabled by turning +off the `u` flag, even if doing so could result in matching invalid UTF-8. +For example, when the `u` flag is disabled, `.` will match any byte instead +of any Unicode scalar value. + +Disabling the `u` flag is also possible with the standard `&str`-based `Regex` +type, but it is only allowed where the UTF-8 invariant is maintained. 
For +example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an +`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte +`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based +regexes. + +Finally, since Unicode support requires bundling large Unicode data +tables, this crate exposes knobs to disable the compilation of those +data tables, which can be useful for shrinking binary size and reducing +compilation times. For details on how to do that, see the section on [crate +features](#crate-features). + +# Syntax + +The syntax supported in this crate is documented below. + +Note that the regular expression parser and abstract syntax are exposed in +a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). + +## Matching one character + +
+.             any character except new line (includes new line with s flag)
+\d            digit (\p{Nd})
+\D            not digit
+\pX           Unicode character class identified by a one-letter name
+\p{Greek}     Unicode character class (general category or script)
+\PX           Negated Unicode character class identified by a one-letter name
+\P{Greek}     negated Unicode character class (general category or script)
+
+ +### Character classes + +
+[xyz]         A character class matching either x, y or z (union).
+[^xyz]        A character class matching any character except x, y and z.
+[a-z]         A character class matching any character in range a-z.
+[[:alpha:]]   ASCII character class ([A-Za-z])
+[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
+[x[^xyz]]     Nested/grouping character class (matching any character except y and z)
+[a-y&&xyz]    Intersection (matching x or y)
+[0-9&&[^4]]   Subtraction using intersection and negation (matching 0-9 except 4)
+[0-9--4]      Direct subtraction (matching 0-9 except 4)
+[a-g~~b-h]    Symmetric difference (matching `a` and `h` only)
+[\[\]]        Escaping in character classes (matching [ or ])
+
+ +Any named character class may appear inside a bracketed `[...]` character +class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII +digit. `[\p{Greek}&&\pL]` matches Greek letters. + +Precedence in character classes, from most binding to least: + +1. Ranges: `a-cd` == `[a-c]d` +2. Union: `ab&&bc` == `[ab]&&[bc]` +3. Intersection: `^a-z&&b` == `^[a-z&&b]` +4. Negation + +## Composites + +
+xy    concatenation (x followed by y)
+x|y   alternation (x or y, prefer x)
+
+ +This example shows how an alternation works, and what it means to prefer a +branch in the alternation over subsequent branches. + +``` +use regex::Regex; + +let haystack = "samwise"; +// If 'samwise' comes first in our alternation, then it is +// preferred as a match, even if the regex engine could +// technically detect that 'sam' led to a match earlier. +let re = Regex::new(r"samwise|sam").unwrap(); +assert_eq!("samwise", re.find(haystack).unwrap().as_str()); +// But if 'sam' comes first, then it will match instead. +// In this case, it is impossible for 'samwise' to match +// because 'sam' is a prefix of it. +let re = Regex::new(r"sam|samwise").unwrap(); +assert_eq!("sam", re.find(haystack).unwrap().as_str()); +``` + +## Repetitions + +
+x*        zero or more of x (greedy)
+x+        one or more of x (greedy)
+x?        zero or one of x (greedy)
+x*?       zero or more of x (ungreedy/lazy)
+x+?       one or more of x (ungreedy/lazy)
+x??       zero or one of x (ungreedy/lazy)
+x{n,m}    at least n x and at most m x (greedy)
+x{n,}     at least n x (greedy)
+x{n}      exactly n x
+x{n,m}?   at least n x and at most m x (ungreedy/lazy)
+x{n,}?    at least n x (ungreedy/lazy)
+x{n}?     exactly n x
+
+ +## Empty matches + +
+^     the beginning of text (or start-of-line with multi-line mode)
+$     the end of text (or end-of-line with multi-line mode)
+\A    only the beginning of text (even with multi-line mode enabled)
+\z    only the end of text (even with multi-line mode enabled)
+\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B    not a Unicode word boundary
+
+ +The empty regex is valid and matches the empty string. For example, the empty +regex matches `abc` at positions `0`, `1`, `2` and `3`. + +## Grouping and flags + +
+(exp)          numbered capture group (indexed by opening parenthesis)
+(?P<name>exp)  named (also numbered) capture group (names must be alpha-numeric)
+(?<name>exp)   named (also numbered) capture group (names must be alpha-numeric)
+(?:exp)        non-capturing group
+(?flags)       set flags within current group
+(?flags:exp)   set flags for exp (non-capturing)
+
+ +Capture group names must be any sequence of alpha-numeric Unicode codepoints, +in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or +an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic` +Unicode property, while numeric codepoints correspond to the union of the +`Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + +Flags are each a single character. For example, `(?x)` sets the flag `x` +and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +the `x` flag and clears the `y` flag. + +All flags are by default disabled unless stated otherwise. They are: + +
+i     case-insensitive: letters match both upper and lower case
+m     multi-line mode: ^ and $ match begin/end of line
+s     allow . to match \n
+U     swap the meaning of x* and x*?
+u     Unicode support (enabled by default)
+x     verbose mode, ignores whitespace and allows line comments (starting with `#`)
+
+ +Note that in verbose mode, whitespace is ignored everywhere, including within +character classes. To insert whitespace, use its escaped form or a hex literal. +For example, `\ ` or `\x20` for an ASCII space. + +Flags can be toggled within a pattern. Here's an example that matches +case-insensitively for the first part but case-sensitively for the second part: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); +let cap = re.captures("AaAaAbbBBBb").unwrap(); +assert_eq!(&cap[0], "AaAaAbb"); +# } +``` + +Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +`b`. + +Multi-line mode means `^` and `$` no longer match just at the beginning/end of +the input, but at the beginning/end of lines: + +``` +# use regex::Regex; +let re = Regex::new(r"(?m)^line \d+").unwrap(); +let m = re.find("line one\nline 2\n").unwrap(); +assert_eq!(m.as_str(), "line 2"); +``` + +Note that `^` matches after new lines, even at the end of input: + +``` +# use regex::Regex; +let re = Regex::new(r"(?m)^").unwrap(); +let m = re.find_iter("test\n").last().unwrap(); +assert_eq!((m.start(), m.end()), (5, 5)); +``` + +Here is an example that uses an ASCII word boundary instead of a Unicode +word boundary: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); +let cap = re.captures("$$abc$$").unwrap(); +assert_eq!(&cap[0], "abc"); +# } +``` + +## Escape sequences + +
+\*          literal *, works for any punctuation character: \.+*?()|[]{}^$
+\a          bell (\x07)
+\f          form feed (\x0C)
+\t          horizontal tab
+\n          new line
+\r          carriage return
+\v          vertical tab (\x0B)
+\123        octal character code (up to three digits) (when enabled)
+\x7F        hex character code (exactly two digits)
+\x{10FFFF}  any hex character code corresponding to a Unicode code point
+\u007F      hex character code (exactly four digits)
+\u{7F}      any hex character code corresponding to a Unicode code point
+\U0000007F  hex character code (exactly eight digits)
+\U{7F}      any hex character code corresponding to a Unicode code point
+
+ +## Perl character classes (Unicode friendly) + +These classes are based on the definitions provided in +[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties): + +
+\d     digit (\p{Nd})
+\D     not digit
+\s     whitespace (\p{White_Space})
+\S     not whitespace
+\w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
+\W     not word character
+
+ +## ASCII character classes + +
+[[:alnum:]]    alphanumeric ([0-9A-Za-z])
+[[:alpha:]]    alphabetic ([A-Za-z])
+[[:ascii:]]    ASCII ([\x00-\x7F])
+[[:blank:]]    blank ([\t ])
+[[:cntrl:]]    control ([\x00-\x1F\x7F])
+[[:digit:]]    digits ([0-9])
+[[:graph:]]    graphical ([!-~])
+[[:lower:]]    lower case ([a-z])
+[[:print:]]    printable ([ -~])
+[[:punct:]]    punctuation ([!-/:-@\[-`{-~])
+[[:space:]]    whitespace ([\t\n\v\f\r ])
+[[:upper:]]    upper case ([A-Z])
+[[:word:]]     word characters ([0-9A-Za-z_])
+[[:xdigit:]]   hex digit ([0-9A-Fa-f])
+
+ +# Crate features + +By default, this crate tries pretty hard to make regex matching both as fast +as possible and as correct as it can be, within reason. This means that there +is a lot of code dedicated to performance, the handling of Unicode data and the +Unicode data itself. Overall, this leads to more dependencies, larger binaries +and longer compile times. This trade off may not be appropriate in all cases, +and indeed, even when all Unicode and performance features are disabled, one +is still left with a perfectly serviceable regex engine that will work well +in many cases. + +This crate exposes a number of features for controlling that trade off. Some +of these features are strictly performance oriented, such that disabling them +won't result in a loss of functionality, but may result in worse performance. +Other features, such as the ones controlling the presence or absence of Unicode +data, can result in a loss of functionality. For example, if one disables the +`unicode-case` feature (described below), then compiling the regex `(?i)a` +will fail since Unicode case insensitivity is enabled by default. Instead, +callers must use `(?i-u)a` instead to disable Unicode case folding. Stated +differently, enabling or disabling any of the features below can only add or +subtract from the total set of valid regular expressions. Enabling or disabling +a feature will never modify the match semantics of a regular expression. + +All features below are enabled by default. + +### Ecosystem features + +* **std** - + When enabled, this will cause `regex` to use the standard library. Currently, + disabling this feature will always result in a compilation error. It is + intended to add `alloc`-only support to regex in the future. + +### Performance features + +* **perf** - + Enables all performance related features. This feature is enabled by default + and will always cover all features that improve performance, even if more + are added in the future. 
+* **perf-dfa** - + Enables the use of a lazy DFA for matching. The lazy DFA is used to compile + portions of a regex to a very fast DFA on an as-needed basis. This can + result in substantial speedups, usually by an order of magnitude on large + haystacks. The lazy DFA does not bring in any new dependencies, but it can + make compile times longer. +* **perf-inline** - + Enables the use of aggressive inlining inside match routines. This reduces + the overhead of each match. The aggressive inlining, however, increases + compile times and binary size. +* **perf-literal** - + Enables the use of literal optimizations for speeding up matches. In some + cases, literal optimizations can result in speedups of _several_ orders of + magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies. +* **perf-cache** - + This feature used to enable a faster internal cache at the cost of using + additional dependencies, but this is no longer an option. A fast internal + cache is now used unconditionally with no additional dependencies. This may + change in the future. + +### Unicode features + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). 
+* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. + + +# Untrusted input + +This crate can handle both untrusted regular expressions and untrusted +search text. + +Untrusted regular expressions are handled by capping the size of a compiled +regular expression. +(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).) +Without this, it would be trivial for an attacker to exhaust your system's +memory with expressions like `a{100}{100}{100}`. + +Untrusted search text is allowed because the matching engine(s) in this +crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search +text`), which means there's no way to cause exponential blow-up like with +some other regular expression engines. (We pay for this by disallowing +features like arbitrary look-ahead and backreferences.) 
+ +When a DFA is used, pathological cases with exponential state blow-up are +avoided by constructing the DFA lazily or in an "online" manner. Therefore, +at most one new state can be created for each byte of input. This satisfies +our time complexity guarantees, but can lead to memory growth +proportional to the size of the input. As a stopgap, the DFA is only +allowed to store a fixed number of states. When the limit is reached, its +states are wiped and the search continues on, possibly duplicating previous work. If +the limit is reached too frequently, it gives up and hands control off to +another matching engine with fixed memory requirements. +(The DFA size limit can also be tweaked. See +[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).) +*/ + +#![deny(missing_docs)] +#![cfg_attr(feature = "pattern", feature(pattern))] +#![warn(missing_debug_implementations)] +#![allow(unused_imports)] +#![allow(elided_lifetimes_in_paths)] +#![allow(rustc::default_hash_types)] +#![allow(rustc::potential_query_instability)] + +#[cfg(not(feature = "std"))] +compile_error!("`std` feature is currently required to build this crate"); + +// To check README's example +// TODO: Re-enable this once the MSRV is 1.43 or greater. +// See: https://github.com/rust-lang/regex/issues/684 +// See: https://github.com/rust-lang/regex/issues/685 +// #[cfg(doctest)] +// doc_comment::doctest!("../README.md"); + +#[cfg(feature = "std")] +pub use crate::error::Error; +#[cfg(feature = "std")] +pub use crate::re_builder::set_unicode::*; +#[cfg(feature = "std")] +pub use crate::re_builder::unicode::*; +#[cfg(feature = "std")] +pub use crate::re_set::unicode::*; +#[cfg(feature = "std")] +pub use crate::re_unicode::{ + escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, + Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, + SplitN, SubCaptureMatches, +}; + +/** +Match regular expressions on arbitrary bytes. 
+ +This module provides a nearly identical API to the one found in the +top-level of this crate. There are two important differences: + +1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>` +is used where `String` would have been used. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes. + +# Example: match null terminated string + +This shows how to find all null-terminated strings in a slice of bytes: + +```rust +# use regex::bytes::Regex; +let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap(); +let text = b"foo\x00bar\x00baz\x00"; + +// Extract all of the strings without the null terminator from each match. +// The unwrap is OK here since a match requires the `cstr` capture to match. +let cstrs: Vec<&[u8]> = + re.captures_iter(text) + .map(|c| c.name("cstr").unwrap().as_bytes()) + .collect(); +assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +``` + +# Example: selectively enable Unicode support + +This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded +string (e.g., to extract a title from a Matroska file): + +```rust +# use std::str; +# use regex::bytes::Regex; +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); +let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; +let caps = re.captures(text).unwrap(); + +// Notice that despite the `.*` at the end, it will only match valid UTF-8 +// because Unicode mode was enabled with the `u` flag. Without the `u` flag, +// the `.*` would match the rest of the bytes. +let mat = caps.get(1).unwrap(); +assert_eq!((7, 10), (mat.start(), mat.end())); + +// If there was a match, Unicode mode guarantees that `title` is valid UTF-8.
+let title = str::from_utf8(&caps[1]).unwrap(); +assert_eq!("☃", title); +``` + +In general, if the Unicode flag is enabled in a capture group and that capture +is part of the overall match, then the capture is *guaranteed* to be valid +UTF-8. + +# Syntax + +The supported syntax is pretty much the same as the syntax for Unicode +regular expressions with a few changes that make sense for matching arbitrary +bytes: + +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. +2. In ASCII compatible mode, neither Unicode scalar values nor Unicode +character classes are allowed. +3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) +revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps +to `[[:digit:]]` and `\s` maps to `[[:space:]]`. +4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to +determine whether a byte is a word byte or not. +5. Hexadecimal notation can be used to specify arbitrary bytes instead of +Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the +literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that +matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when +enabled. +6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the +`s` flag is additionally enabled, `.` matches any byte. + +# Performance + +In general, one should expect performance on `&[u8]` to be roughly similar to +performance on `&str`. 
+*/ + +#[cfg(feature = "std")] +pub mod bytes { + pub use crate::re_builder::bytes::*; + pub use crate::re_builder::set_bytes::*; + pub use crate::re_bytes::*; + pub use crate::re_set::bytes::*; +} + +mod backtrack; +mod compile; +#[cfg(feature = "perf-dfa")] +mod dfa; +mod error; +mod exec; +mod expand; +mod find_byte; +mod input; +mod literal; +#[cfg(feature = "pattern")] +mod pattern; +mod pikevm; +mod pool; +mod prog; +mod re_builder; +mod re_bytes; +mod re_set; +mod re_trait; +mod re_unicode; +mod sparse; +mod utf8; + +/// The `internal` module exists to support suspicious activity, such as +/// testing different matching engines and supporting the `regex-debug` CLI +/// utility. +#[doc(hidden)] +#[cfg(feature = "std")] +pub mod internal { + pub use crate::compile::Compiler; + pub use crate::exec::{Exec, ExecBuilder}; + pub use crate::input::{Char, CharInput, Input, InputAt}; + pub use crate::literal::LiteralSearcher; + pub use crate::prog::{EmptyLook, Inst, InstRanges, Program}; +} diff --git a/regex-1.8.4/src/literal/imp.rs b/regex-1.8.4/src/literal/imp.rs new file mode 100644 index 0000000000000..75fa6e37b27be --- /dev/null +++ b/regex-1.8.4/src/literal/imp.rs @@ -0,0 +1,413 @@ +use std::mem; + +use aho_corasick::{self, packed, AhoCorasick}; +use memchr::{memchr, memchr2, memchr3, memmem}; +use regex_syntax::hir::literal::{Literal, Seq}; + +/// A prefix extracted from a compiled regular expression. +/// +/// A regex prefix is a set of literal strings that *must* be matched at the +/// beginning of a regex in order for the entire regex to match. Similarly +/// for a regex suffix. +#[derive(Clone, Debug)] +pub struct LiteralSearcher { + complete: bool, + lcp: Memmem, + lcs: Memmem, + matcher: Matcher, +} + +#[derive(Clone, Debug)] +enum Matcher { + /// No literals. (Never advances through the input.) + Empty, + /// A set of four or more single byte literals. + Bytes(SingleByteSet), + /// A single substring, using vector accelerated routines when available. 
+ Memmem(Memmem), + /// An Aho-Corasick automaton. + AC { ac: AhoCorasick, lits: Vec }, + /// A packed multiple substring searcher, using SIMD. + /// + /// Note that Aho-Corasick will actually use this packed searcher + /// internally automatically, however, there is some overhead associated + /// with going through the Aho-Corasick machinery. So using the packed + /// searcher directly results in some gains. + Packed { s: packed::Searcher, lits: Vec }, +} + +impl LiteralSearcher { + /// Returns a matcher that never matches and never advances the input. + pub fn empty() -> Self { + Self::new(Seq::infinite(), Matcher::Empty) + } + + /// Returns a matcher for literal prefixes from the given set. + pub fn prefixes(lits: Seq) -> Self { + let matcher = Matcher::prefixes(&lits); + Self::new(lits, matcher) + } + + /// Returns a matcher for literal suffixes from the given set. + pub fn suffixes(lits: Seq) -> Self { + let matcher = Matcher::suffixes(&lits); + Self::new(lits, matcher) + } + + fn new(lits: Seq, matcher: Matcher) -> Self { + LiteralSearcher { + complete: lits.is_exact(), + lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")), + lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")), + matcher, + } + } + + /// Returns true if all matches comprise the entire regular expression. + /// + /// This does not necessarily mean that a literal match implies a match + /// of the regular expression. For example, the regular expression `^a` + /// is comprised of a single complete literal `a`, but the regular + /// expression demands that it only match at the beginning of a string. + pub fn complete(&self) -> bool { + self.complete && !self.is_empty() + } + + /// Find the position of a literal in `haystack` if it exists. 
+ #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { + use self::Matcher::*; + match self.matcher { + Empty => Some((0, 0)), + Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), + Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + AC { ref ac, .. } => { + ac.find(haystack).map(|m| (m.start(), m.end())) + } + Packed { ref s, .. } => { + s.find(haystack).map(|m| (m.start(), m.end())) + } + } + } + + /// Like find, except matches must start at index `0`. + pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[0..lit.len()] { + return Some((0, lit.len())); + } + } + None + } + + /// Like find, except matches must end at index `haystack.len()`. + pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[haystack.len() - lit.len()..] { + return Some((haystack.len() - lit.len(), haystack.len())); + } + } + None + } + + /// Returns an iterator over all literals to be matched. + pub fn iter(&self) -> LiteralIter<'_> { + match self.matcher { + Matcher::Empty => LiteralIter::Empty, + Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), + Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), + Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), + Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), + } + } + + /// Returns a matcher for the longest common prefix of this matcher. + pub fn lcp(&self) -> &Memmem { + &self.lcp + } + + /// Returns a matcher for the longest common suffix of this matcher. + pub fn lcs(&self) -> &Memmem { + &self.lcs + } + + /// Returns true iff this prefix is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of prefixes in this machine. 
+ pub fn len(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.dense.len(), + Memmem(_) => 1, + AC { ref ac, .. } => ac.patterns_len(), + Packed { ref lits, .. } => lits.len(), + } + } + + /// Return the approximate heap usage of literals in bytes. + pub fn approximate_size(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.approximate_size(), + Memmem(ref single) => single.approximate_size(), + AC { ref ac, .. } => ac.memory_usage(), + Packed { ref s, .. } => s.memory_usage(), + } + } +} + +impl Matcher { + fn prefixes(lits: &Seq) -> Self { + let sset = SingleByteSet::prefixes(lits); + Matcher::new(lits, sset) + } + + fn suffixes(lits: &Seq) -> Self { + let sset = SingleByteSet::suffixes(lits); + Matcher::new(lits, sset) + } + + fn new(lits: &Seq, sset: SingleByteSet) -> Self { + if lits.is_empty() || lits.min_literal_len() == Some(0) { + return Matcher::Empty; + } + let lits = match lits.literals() { + None => return Matcher::Empty, + Some(members) => members, + }; + if sset.dense.len() >= 26 { + // Avoid trying to match a large number of single bytes. + // This is *very* sensitive to a frequency analysis comparison + // between the bytes in sset and the composition of the haystack. + // No matter the size of sset, if its members all are rare in the + // haystack, then it'd be worth using it. How to tune this... IDK. 
+ // ---AG + return Matcher::Empty; + } + if sset.complete { + return Matcher::Bytes(sset); + } + if lits.len() == 1 { + return Matcher::Memmem(Memmem::new(lits[0].as_bytes())); + } + + let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect(); + let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; + if lits.len() <= 100 && !is_aho_corasick_fast { + let mut builder = packed::Config::new() + .match_kind(packed::MatchKind::LeftmostFirst) + .builder(); + if let Some(s) = builder.extend(&pats).build() { + return Matcher::Packed { s, lits: lits.to_owned() }; + } + } + let ac = AhoCorasick::builder() + .match_kind(aho_corasick::MatchKind::LeftmostFirst) + .kind(Some(aho_corasick::AhoCorasickKind::DFA)) + .build(&pats) + .unwrap(); + Matcher::AC { ac, lits: lits.to_owned() } + } +} + +#[derive(Debug)] +pub enum LiteralIter<'a> { + Empty, + Bytes(&'a [u8]), + Single(&'a [u8]), + AC(&'a [Literal]), + Packed(&'a [Literal]), +} + +impl<'a> Iterator for LiteralIter<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + match *self { + LiteralIter::Empty => None, + LiteralIter::Bytes(ref mut many) => { + if many.is_empty() { + None + } else { + let next = &many[0..1]; + *many = &many[1..]; + Some(next) + } + } + LiteralIter::Single(ref mut one) => { + if one.is_empty() { + None + } else { + let next = &one[..]; + *one = &[]; + Some(next) + } + } + LiteralIter::AC(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(next.as_bytes()) + } + } + LiteralIter::Packed(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(next.as_bytes()) + } + } + } + } +} + +#[derive(Clone, Debug)] +struct SingleByteSet { + sparse: Vec, + dense: Vec, + complete: bool, + all_ascii: bool, +} + +impl SingleByteSet { + fn new() -> SingleByteSet { + SingleByteSet { + sparse: vec![false; 256], + dense: vec![], + complete: true, + all_ascii: true, + } + 
} + + fn prefixes(lits: &Seq) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.as_bytes().get(0) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + fn suffixes(lits: &Seq) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.as_bytes().last() { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + /// Faster find that special cases certain sizes to use memchr. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find(&self, text: &[u8]) -> Option { + match self.dense.len() { + 0 => None, + 1 => memchr(self.dense[0], text), + 2 => memchr2(self.dense[0], self.dense[1], text), + 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), + _ => self._find(text), + } + } + + /// Generic find that works on any sized set. + fn _find(&self, haystack: &[u8]) -> Option { + for (i, &b) in haystack.iter().enumerate() { + if self.sparse[b as usize] { + return Some(i); + } + } + None + } + + fn approximate_size(&self) -> usize { + (self.dense.len() * mem::size_of::()) + + (self.sparse.len() * mem::size_of::()) + } +} + +/// A simple wrapper around the memchr crate's memmem implementation. +/// +/// The API this exposes mirrors the API of previous substring searchers that +/// this supplanted. 
+#[derive(Clone, Debug)] +pub struct Memmem { + finder: memmem::Finder<'static>, + char_len: usize, +} + +impl Memmem { + fn new(pat: &[u8]) -> Memmem { + Memmem { + finder: memmem::Finder::new(pat).into_owned(), + char_len: char_len_lossy(pat), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option { + self.finder.find(haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn is_suffix(&self, text: &[u8]) -> bool { + if text.len() < self.len() { + return false; + } + &text[text.len() - self.len()..] == self.finder.needle() + } + + pub fn len(&self) -> usize { + self.finder.needle().len() + } + + pub fn char_len(&self) -> usize { + self.char_len + } + + fn approximate_size(&self) -> usize { + self.finder.needle().len() * mem::size_of::() + } +} + +fn char_len_lossy(bytes: &[u8]) -> usize { + String::from_utf8_lossy(bytes).chars().count() +} diff --git a/regex-1.8.4/src/literal/mod.rs b/regex-1.8.4/src/literal/mod.rs new file mode 100644 index 0000000000000..b9fb77aed9130 --- /dev/null +++ b/regex-1.8.4/src/literal/mod.rs @@ -0,0 +1,55 @@ +pub use self::imp::*; + +#[cfg(feature = "perf-literal")] +mod imp; + +#[allow(missing_docs)] +#[cfg(not(feature = "perf-literal"))] +mod imp { + use regex_syntax::hir::literal::Seq; + + #[derive(Clone, Debug)] + pub struct LiteralSearcher(()); + + impl LiteralSearcher { + pub fn empty() -> Self { + LiteralSearcher(()) + } + + pub fn prefixes(_: Seq) -> Self { + LiteralSearcher(()) + } + + pub fn suffixes(_: Seq) -> Self { + LiteralSearcher(()) + } + + pub fn complete(&self) -> bool { + false + } + + pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn is_empty(&self) -> bool { + true + } + + pub fn len(&self) -> usize { + 0 + } + + pub fn 
approximate_size(&self) -> usize { + 0 + } + } +} diff --git a/regex-1.8.4/src/pattern.rs b/regex-1.8.4/src/pattern.rs new file mode 100644 index 0000000000000..00549e510628a --- /dev/null +++ b/regex-1.8.4/src/pattern.rs @@ -0,0 +1,63 @@ +use std::str::pattern::{Pattern, SearchStep, Searcher}; + +use crate::re_unicode::{Matches, Regex}; + +#[derive(Debug)] +pub struct RegexSearcher<'r, 't> { + haystack: &'t str, + it: Matches<'r, 't>, + last_step_end: usize, + next_match: Option<(usize, usize)>, +} + +impl<'r, 't> Pattern<'t> for &'r Regex { + type Searcher = RegexSearcher<'r, 't>; + + fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> { + RegexSearcher { + haystack, + it: self.find_iter(haystack), + last_step_end: 0, + next_match: None, + } + } +} + +unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { + #[inline] + fn haystack(&self) -> &'t str { + self.haystack + } + + #[inline] + fn next(&mut self) -> SearchStep { + if let Some((s, e)) = self.next_match { + self.next_match = None; + self.last_step_end = e; + return SearchStep::Match(s, e); + } + match self.it.next() { + None => { + if self.last_step_end < self.haystack().len() { + let last = self.last_step_end; + self.last_step_end = self.haystack().len(); + SearchStep::Reject(last, self.haystack().len()) + } else { + SearchStep::Done + } + } + Some(m) => { + let (s, e) = (m.start(), m.end()); + if s == self.last_step_end { + self.last_step_end = e; + SearchStep::Match(s, e) + } else { + self.next_match = Some((s, e)); + let last = self.last_step_end; + self.last_step_end = s; + SearchStep::Reject(last, s) + } + } + } + } +} diff --git a/regex-1.8.4/src/pikevm.rs b/regex-1.8.4/src/pikevm.rs new file mode 100644 index 0000000000000..8c9eac2d39abc --- /dev/null +++ b/regex-1.8.4/src/pikevm.rs @@ -0,0 +1,360 @@ +// This module implements the Pike VM. That is, it guarantees linear time +// search of a regex on any text with memory use proportional to the size of +// the regex. 
+// +// It is equal in power to the backtracking engine in this crate, except the +// backtracking engine is typically faster on small regexes/texts at the +// expense of a bigger memory footprint. +// +// It can do more than the DFA can (specifically, record capture locations +// and execute Unicode word boundary assertions), but at a slower speed. +// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding +// epsilon transitions. That is, the Pike VM engine can be in multiple states +// at once where as the DFA is only ever in one state at a time. +// +// Therefore, the Pike VM is generally treated as the fallback when the other +// matching engines either aren't feasible to run or are insufficient. + +use std::mem; + +use crate::exec::ProgramCache; +use crate::input::{Input, InputAt}; +use crate::prog::{InstPtr, Program}; +use crate::re_trait::Slot; +use crate::sparse::SparseSet; + +/// An NFA simulation matching engine. +#[derive(Debug)] +pub struct Fsm<'r, I> { + /// The sequence of opcodes (among other things) that is actually executed. + /// + /// The program may be byte oriented or Unicode codepoint oriented. + prog: &'r Program, + /// An explicit stack used for following epsilon transitions. (This is + /// borrowed from the cache.) + stack: &'r mut Vec, + /// The input to search. + input: I, +} + +/// A cached allocation that can be reused on each execution. +#[derive(Clone, Debug)] +pub struct Cache { + /// A pair of ordered sets for tracking NFA states. + clist: Threads, + nlist: Threads, + /// An explicit stack used for following epsilon transitions. + stack: Vec, +} + +/// An ordered set of NFA states and their captures. +#[derive(Clone, Debug)] +struct Threads { + /// An ordered set of opcodes (each opcode is an NFA state). + set: SparseSet, + /// Captures for every NFA state. + /// + /// It is stored in row-major order, where the columns are the capture + /// slots and the rows are the states. 
+ caps: Vec, + /// The number of capture slots stored per thread. (Every capture has + /// two slots.) + slots_per_thread: usize, +} + +/// A representation of an explicit stack frame when following epsilon +/// transitions. This is used to avoid recursion. +#[derive(Clone, Debug)] +enum FollowEpsilon { + /// Follow transitions at the given instruction pointer. + IP(InstPtr), + /// Restore the capture slot with the given position in the input. + Capture { slot: usize, pos: Slot }, +} + +impl Cache { + /// Create a new allocation used by the NFA machine to record execution + /// and captures. + pub fn new(_prog: &Program) -> Self { + Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } + } +} + +impl<'r, I: Input> Fsm<'r, I> { + /// Execute the NFA matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.pikevm; + cache.clist.resize(prog.len(), prog.captures.len()); + cache.nlist.resize(prog.len(), prog.captures.len()); + let at = input.at(start); + Fsm { prog, stack: &mut cache.stack, input }.exec_( + &mut cache.clist, + &mut cache.nlist, + matches, + slots, + quit_after_match, + at, + end, + ) + } + + fn exec_( + &mut self, + mut clist: &mut Threads, + mut nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + mut at: InputAt, + end: usize, + ) -> bool { + let mut matched = false; + let mut all_matched = false; + clist.set.clear(); + nlist.set.clear(); + 'LOOP: loop { + if clist.set.is_empty() { + // Three ways to bail out when our current set of threads is + // empty. + // + // 1. We have a match---so we're done exploring any possible + // alternatives. Time to quit. 
(We can't do this if we're + // looking for matches for multiple regexes, unless we know + // they all matched.) + // + // 2. If the expression starts with a '^' we can terminate as + // soon as the last thread dies. + if (matched && matches.len() <= 1) + || all_matched + || (!at.is_start() && self.prog.is_anchored_start) + { + break; + } + + // 3. If there's a literal prefix for the program, try to + // jump ahead quickly. If it can't be found, then we can + // bail out early. + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + } + + // This simulates a preceding '.*?' for every regex by adding + // a state starting at the current position in the input for the + // beginning of the program only if we don't already have a match. + if clist.set.is_empty() + || (!self.prog.is_anchored_start && !all_matched) + { + self.add(&mut clist, slots, 0, at); + } + // The previous call to "add" actually inspects the position just + // before the current character. For stepping through the machine, + // we want to look at the current character, so we advance the + // input. + let at_next = self.input.at(at.next_pos()); + for i in 0..clist.set.len() { + let ip = clist.set[i]; + if self.step( + &mut nlist, + matches, + slots, + clist.caps(ip), + ip, + at, + at_next, + ) { + matched = true; + all_matched = all_matched || matches.iter().all(|&b| b); + if quit_after_match { + // If we only care if a match occurs (not its + // position), then we can quit right now. + break 'LOOP; + } + if self.prog.matches.len() == 1 { + // We don't need to check the rest of the threads + // in this set because we've matched something + // ("leftmost-first"). However, we still need to check + // threads in the next set to support things like + // greedy matching. + // + // This is only true on normal regexes. For regex sets, + // we need to mush on to observe other matches.
+ break; + } + } + } + if at.pos() >= end { + break; + } + at = at_next; + mem::swap(clist, nlist); + nlist.set.clear(); + } + matched + } + + /// Step through the input, one token (byte or codepoint) at a time. + /// + /// nlist is the set of states that will be processed on the next token + /// in the input. + /// + /// caps is the set of captures passed by the caller of the NFA. They are + /// written to only when a match state is visited. + /// + /// thread_caps is the set of captures set for the current NFA state, ip. + /// + /// at and at_next are the current and next positions in the input. at or + /// at_next may be EOF. + fn step( + &mut self, + nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + at_next: InputAt, + ) -> bool { + use crate::prog::Inst::*; + match self.prog[ip] { + Match(match_slot) => { + if match_slot < matches.len() { + matches[match_slot] = true; + } + for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + true + } + Char(ref inst) => { + if inst.c == at.char() { + self.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + self.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + self.add(nlist, thread_caps, inst.goto, at_next); + } + } + false + } + EmptyLook(_) | Save(_) | Split(_) => false, + } + } + + /// Follows epsilon transitions and adds them for processing to nlist, + /// starting at and including ip. 
+ fn add( + &mut self, + nlist: &mut Threads, + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + ) { + self.stack.push(FollowEpsilon::IP(ip)); + while let Some(frame) = self.stack.pop() { + match frame { + FollowEpsilon::IP(ip) => { + self.add_step(nlist, thread_caps, ip, at); + } + FollowEpsilon::Capture { slot, pos } => { + thread_caps[slot] = pos; + } + } + } + } + + /// A helper function for add that avoids excessive pushing to the stack. + fn add_step( + &mut self, + nlist: &mut Threads, + thread_caps: &mut [Option], + mut ip: usize, + at: InputAt, + ) { + // Instead of pushing and popping to the stack, we mutate ip as we + // traverse the set of states. We only push to the stack when we + // absolutely need recursion (restoring captures or following a + // branch). + use crate::prog::Inst::*; + loop { + // Don't visit states we've already added. + if nlist.set.contains(ip) { + return; + } + nlist.set.insert(ip); + match self.prog[ip] { + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } + } + Save(ref inst) => { + if inst.slot < thread_caps.len() { + self.stack.push(FollowEpsilon::Capture { + slot: inst.slot, + pos: thread_caps[inst.slot], + }); + thread_caps[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.stack.push(FollowEpsilon::IP(inst.goto2)); + ip = inst.goto1; + } + Match(_) | Char(_) | Ranges(_) | Bytes(_) => { + let t = &mut nlist.caps(ip); + for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + return; + } + } + } + } +} + +impl Threads { + fn new() -> Self { + Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } + } + + fn resize(&mut self, num_insts: usize, ncaps: usize) { + if num_insts == self.set.capacity() { + return; + } + self.slots_per_thread = ncaps * 2; + self.set = SparseSet::new(num_insts); + self.caps = vec![None; self.slots_per_thread * num_insts]; + } + + fn caps(&mut self, pc: usize) -> &mut [Option] { + 
let i = pc * self.slots_per_thread; + &mut self.caps[i..i + self.slots_per_thread] + } +} diff --git a/regex-1.8.4/src/pool.rs b/regex-1.8.4/src/pool.rs new file mode 100644 index 0000000000000..6a6f15b1942e9 --- /dev/null +++ b/regex-1.8.4/src/pool.rs @@ -0,0 +1,333 @@ +// This module provides a relatively simple thread-safe pool of reusable +// objects. For the most part, it's implemented by a stack represented by a +// Mutex>. It has one small trick: because unlocking a mutex is somewhat +// costly, in the case where a pool is accessed by the first thread that tried +// to get a value, we bypass the mutex. Here are some benchmarks showing the +// difference. +// +// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s) +// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s) +// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s) +// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s) +// +// (1) represents our baseline: the master branch at the time of writing when +// using the 'thread_local' crate to implement the pool below. +// +// (2) represents a naive pool implemented completely via Mutex>. There +// is no special trick for bypassing the mutex. +// +// (3) is the same as (2), except it uses Mutex>>. It is twice as +// fast because a Box is much smaller than the T we use with a Pool in this +// crate. So pushing and popping a Box from a Vec is quite a bit faster +// than for T. +// +// (4) is the same as (3), but with the trick for bypassing the mutex in the +// case of the first-to-get thread. +// +// Why move off of thread_local? Even though (4) is a hair faster than (1) +// above, this was not the main goal. The main goal was to move off of +// thread_local and find a way to *simply* re-capture some of its speed for +// regex's specific case. So again, why move off of it? The *primary* reason is +// because of memory leaks. See https://github.com/rust-lang/regex/issues/362 +// for example. (Why do I want it to be simple? 
Well, I suppose what I mean is, +// "use as much safe code as possible to minimize risk and be as sure as I can +// be that it is correct.") +// +// My guess is that the thread_local design is probably not appropriate for +// regex since its memory usage scales to the number of active threads that +// have used a regex, whereas the pool below scales to the number of threads +// that simultaneously use a regex. While neither case permits contraction, +// since we own the pool data structure below, we can add contraction if a +// clear use case pops up in the wild. More pressingly though, it seems that +// there are at least some use case patterns where one might have many threads +// sitting around that might have used a regex at one point. While thread_local +// does try to reuse space previously used by a thread that has since stopped, +// its maximal memory usage still scales with the total number of active +// threads. In contrast, the pool below scales with the total number of threads +// *simultaneously* using the pool. The hope is that this uses less memory +// overall. And if it doesn't, we can hopefully tune it somehow. +// +// It seems that these sorts of conditions happen frequently +// in FFI inside of other more "managed" languages. This was +// mentioned in the issue linked above, and also mentioned here: +// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users +// confirm that disabling the use of thread_local resolves the leak. +// +// There were other weaker reasons for moving off of thread_local as well. +// Namely, at the time, I was looking to reduce dependencies. And for something +// like regex, maintenance can be simpler when we own the full dependency tree. + +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; + +/// An atomic counter used to allocate thread IDs.
+static COUNTER: AtomicUsize = AtomicUsize::new(1); + +thread_local!( + /// A thread local used to assign an ID to a thread. + static THREAD_ID: usize = { + let next = COUNTER.fetch_add(1, Ordering::Relaxed); + // SAFETY: We cannot permit the reuse of thread IDs since reusing a + // thread ID might result in more than one thread "owning" a pool, + // and thus, permit accessing a mutable value from multiple threads + // simultaneously without synchronization. The intent of this panic is + // to be a sanity check. It is not expected that the thread ID space + // will actually be exhausted in practice. + // + // This checks that the counter never wraps around, since atomic + // addition wraps around on overflow. + if next == 0 { + panic!("regex: thread ID allocation space exhausted"); + } + next + }; +); + +/// The type of the function used to create values in a pool when the pool is +/// empty and the caller requests one. +type CreateFn = + Box T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>; + +/// A simple thread safe pool for reusing values. +/// +/// Getting a value out comes with a guard. When that guard is dropped, the +/// value is automatically put back in the pool. +/// +/// A Pool impls Sync when T is Send (even if it's not Sync). This means +/// that T can use interior mutability. This is possible because a pool is +/// guaranteed to provide a value to exactly one thread at any time. +/// +/// Currently, a pool never contracts in size. Its size is proportional to the +/// number of simultaneous uses. +pub struct Pool { + /// A stack of T values to hand out. These are used when a Pool is + /// accessed by a thread that didn't create it. + stack: Mutex>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: CreateFn, + /// The ID of the thread that owns this pool. The owner is the thread + /// that makes the first call to 'get'. 
When the owner calls 'get', it + /// gets 'owner_val' directly instead of returning a T from 'stack'. + /// See comments elsewhere for details, but this is intended to be an + /// optimization for the common case that makes getting a T faster. + /// + /// It is initialized to a value of zero (an impossible thread ID) as a + /// sentinel to indicate that it is unowned. + owner: AtomicUsize, + /// A value to return when the caller is the thread that owns the Pool, + /// i.e., the first thread that called 'get'. + owner_val: T, +} + +// SAFETY: Since we want to use a Pool from multiple threads simultaneously +// behind an Arc, we need for it to be Sync. In cases where T is Sync, Pool +// would be Sync. However, since we use a Pool to store mutable scratch space, +// we wind up using a T that has interior mutability and is thus itself not +// Sync. So what we *really* want is for our Pool to be Sync even when T is +// not Sync (but is at least Send). +// +// The only non-sync aspect of a Pool is its 'owner_val' field, which is used +// to implement faster access to a pool value in the common case of a pool +// being accessed by the thread that owns it. The 'stack' field +// is also shared, but a Mutex<T> where T: Send is already Sync. So we only +// need to worry about 'owner_val'. +// +// The key is to guarantee that 'owner_val' can only ever be accessed from one +// thread. In our implementation below, we guarantee this by only returning the +// 'owner_val' when the ID of the current thread matches the ID stored in +// 'owner', which is claimed exactly once by the first thread to call 'get'. +// Since this can only ever be one thread, it follows +// that only one thread can access 'owner_val' at any point in time. Thus, it +// is safe to declare that Pool is Sync when T is Send. +// +// NOTE: The owning thread is the *first* +// thread that gets a value out of a Pool,
which is not necessarily the +// thread that created the Pool. 'owner' starts out as the sentinel value zero +// and is set with an atomic compare-and-exchange in 'get_slow', so at most one +// thread can ever become the owner. +// +// If there is a way to achieve our performance goals using safe code, then +// I would very much welcome a patch. As it stands, the implementation below +// tries to balance safety with performance. The case where a Regex is used +// from multiple threads simultaneously will suffer a bit since getting a cache +// will require unlocking a mutex. +unsafe impl Sync for Pool {} + +impl ::std::fmt::Debug for Pool { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + f.debug_struct("Pool") + .field("stack", &self.stack) + .field("owner", &self.owner) + .field("owner_val", &self.owner_val) + .finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +/// +/// The purpose of the guard is to use RAII to automatically put the value back +/// in the pool once it's dropped. +#[derive(Debug)] +pub struct PoolGuard<'a, T: Send> { + /// The pool that this guard is attached to. + pool: &'a Pool, + /// This is None when the guard represents the special "owned" value. In + /// which case, the value is retrieved from 'pool.owner_val'. + value: Option>, +} + +impl Pool { + /// Create a new pool. The given closure is used to create values in the + /// pool when necessary. + pub fn new(create: CreateFn) -> Pool { + let owner = AtomicUsize::new(0); + let owner_val = create(); + Pool { stack: Mutex::new(vec![]), create, owner, owner_val } + } + + /// Get a value from the pool. The caller is guaranteed to have exclusive + /// access to the given value. + /// + /// Note that there is no guarantee provided about which value in the + /// pool is returned. That is, calling get, dropping the guard (causing + /// the value to go back into the pool) and then calling get again is NOT + /// guaranteed to return the same value received in the first get call.
+ #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn get(&self) -> PoolGuard<'_, T> { + // Our fast path checks if the caller is the thread that "owns" this + // pool. Or stated differently, whether it is the first thread that + // tried to extract a value from the pool. If it is, then we can return + // a T to the caller without going through a mutex. + // + // SAFETY: We must guarantee that only one thread gets access to this + // value. Since a thread is uniquely identified by the THREAD_ID thread + // local, it follows that is the caller's thread ID is equal to the + // owner, then only one thread may receive this value. + let caller = THREAD_ID.with(|id| *id); + let owner = self.owner.load(Ordering::Relaxed); + if caller == owner { + return self.guard_owned(); + } + self.get_slow(caller, owner) + } + + /// This is the "slow" version that goes through a mutex to pop an + /// allocated value off a stack to return to the caller. (Or, if the stack + /// is empty, a new value is created.) + /// + /// If the pool has no owner, then this will set the owner. + #[cold] + fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> { + use std::sync::atomic::Ordering::Relaxed; + + if owner == 0 { + // The sentinel 0 value means this pool is not yet owned. We + // try to atomically set the owner. If we do, then this thread + // becomes the owner and we can return a guard that represents + // the special T for the owner. + let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed); + if res.is_ok() { + return self.guard_owned(); + } + } + let mut stack = self.stack.lock().unwrap(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + self.guard_stack(value) + } + + /// Puts a value back into the pool. Callers don't need to call this. Once + /// the guard that's returned by 'get' is dropped, it is put back into the + /// pool automatically. 
+ fn put(&self, value: Box) { + let mut stack = self.stack.lock().unwrap(); + stack.push(value); + } + + /// Create a guard that represents the special owned T. + fn guard_owned(&self) -> PoolGuard<'_, T> { + PoolGuard { pool: self, value: None } + } + + /// Create a guard that contains a value from the pool's stack. + fn guard_stack(&self, value: Box) -> PoolGuard<'_, T> { + PoolGuard { pool: self, value: Some(value) } + } +} + +impl<'a, T: Send> PoolGuard<'a, T> { + /// Return the underlying value. + pub fn value(&self) -> &T { + match self.value { + None => &self.pool.owner_val, + Some(ref v) => &**v, + } + } +} + +impl<'a, T: Send> Drop for PoolGuard<'a, T> { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn drop(&mut self) { + if let Some(value) = self.value.take() { + self.pool.put(value); + } + } +} + +#[cfg(test)] +mod tests { + use std::panic::{RefUnwindSafe, UnwindSafe}; + + use super::*; + + #[test] + fn oibits() { + use crate::exec::ProgramCache; + + fn has_oibits() {} + has_oibits::>(); + } + + // Tests that Pool implements the "single owner" optimization. That is, the + // thread that first accesses the pool gets its own copy, while all other + // threads get distinct copies. + #[test] + fn thread_owner_optimization() { + use std::cell::RefCell; + use std::sync::Arc; + + let pool: Arc>>> = + Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a'])))); + pool.get().value().borrow_mut().push('x'); + + let pool1 = pool.clone(); + let t1 = std::thread::spawn(move || { + let guard = pool1.get(); + let v = guard.value(); + v.borrow_mut().push('y'); + }); + + let pool2 = pool.clone(); + let t2 = std::thread::spawn(move || { + let guard = pool2.get(); + let v = guard.value(); + v.borrow_mut().push('z'); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + + // If we didn't implement the single owner optimization, then one of + // the threads above is likely to have mutated the [a, x] vec that + // we stuffed in the pool before spawning the threads. 
But since + // neither thread was first to access the pool, and because of the + // optimization, we should be guaranteed that neither thread mutates + // the special owned pool value. + // + // (Technically this is an implementation detail and not a contract of + // Pool's API.) + assert_eq!(vec!['a', 'x'], *pool.get().value().borrow()); + } +} diff --git a/regex-1.8.4/src/prog.rs b/regex-1.8.4/src/prog.rs new file mode 100644 index 0000000000000..100862cf1b3a4 --- /dev/null +++ b/regex-1.8.4/src/prog.rs @@ -0,0 +1,451 @@ +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fmt; +use std::mem; +use std::ops::Deref; +use std::slice; +use std::sync::Arc; + +use crate::input::Char; +use crate::literal::LiteralSearcher; + +/// `InstPtr` represents the index of an instruction in a regex program. +pub type InstPtr = usize; + +/// Program is a sequence of instructions and various facts about those +/// instructions. +#[derive(Clone)] +pub struct Program { + /// A sequence of instructions that represents an NFA. + pub insts: Vec, + /// Pointers to each Match instruction in the sequence. + /// + /// This is always length 1 unless this program represents a regex set. + pub matches: Vec, + /// The ordered sequence of all capture groups extracted from the AST. + /// Unnamed groups are `None`. + pub captures: Vec>, + /// Pointers to all named capture groups into `captures`. + pub capture_name_idx: Arc>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option, + /// A pointer to the start instruction. This can vary depending on how + /// the program was compiled. For example, programs for use with the DFA + /// engine have a `.*?` inserted at the beginning of unanchored regular + /// expressions. The actual starting point of the program is after the + /// `.*?`. + pub start: InstPtr, + /// A set of equivalence classes for discriminating bytes in the compiled + /// program.
+ pub byte_classes: Vec, + /// When true, this program can only match valid UTF-8. + pub only_utf8: bool, + /// When true, this program uses byte range instructions instead of Unicode + /// range instructions. + pub is_bytes: bool, + /// When true, the program is compiled for DFA matching. For example, this + /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored + /// regexes. + pub is_dfa: bool, + /// When true, the program matches text in reverse (for use only in the + /// DFA). + pub is_reverse: bool, + /// Whether the regex must match from the start of the input. + pub is_anchored_start: bool, + /// Whether the regex must match at the end of the input. + pub is_anchored_end: bool, + /// Whether this program contains a Unicode word boundary instruction. + pub has_unicode_word_boundary: bool, + /// A possibly empty machine for very quickly matching prefix literals. + pub prefixes: LiteralSearcher, + /// A limit on the size of the cache that the DFA is allowed to use while + /// matching. + /// + /// The cache limit specifies approximately how much space we're willing to + /// give to the state cache. Once the state cache exceeds the size, it is + /// wiped and all states must be re-computed. + /// + /// Note that this value does not impact correctness. It can be set to 0 + /// and the DFA will run just fine. (It will only ever store exactly one + /// state in the cache, and will likely run very slowly, but it will work.) + /// + /// Also note that this limit is *per thread of execution*. That is, + /// if the same regex is used to search text across multiple threads + /// simultaneously, then the DFA cache is not shared. Instead, copies are + /// made. + pub dfa_size_limit: usize, +} + +impl Program { + /// Creates an empty instruction sequence. Fields are given default + /// values. 
+ pub fn new() -> Self { + Program { + insts: vec![], + matches: vec![], + captures: vec![], + capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, + start: 0, + byte_classes: vec![0; 256], + only_utf8: true, + is_bytes: false, + is_dfa: false, + is_reverse: false, + is_anchored_start: false, + is_anchored_end: false, + has_unicode_word_boundary: false, + prefixes: LiteralSearcher::empty(), + dfa_size_limit: 2 * (1 << 20), + } + } + + /// If pc is an index to a no-op instruction (like Save), then return the + /// next pc that is not a no-op instruction. + pub fn skip(&self, mut pc: usize) -> usize { + loop { + match self[pc] { + Inst::Save(ref i) => pc = i.goto, + _ => return pc, + } + } + } + + /// Return true if and only if an execution engine at instruction `pc` will + /// always lead to a match. + pub fn leads_to_match(&self, pc: usize) -> bool { + if self.matches.len() > 1 { + // If we have a regex set, then we have more than one ending + // state, so leading to one of those states is generally + // meaningless. + return false; + } + match self[self.skip(pc)] { + Inst::Match(_) => true, + _ => false, + } + } + + /// Returns true if the current configuration demands that an implicit + /// `.*?` be prepended to the instruction sequence. + pub fn needs_dotstar(&self) -> bool { + self.is_dfa && !self.is_reverse && !self.is_anchored_start + } + + /// Returns true if this program uses Byte instructions instead of + /// Char/Range instructions. + pub fn uses_bytes(&self) -> bool { + self.is_bytes || self.is_dfa + } + + /// Returns true if this program exclusively matches valid UTF-8 bytes. + /// + /// That is, if an invalid UTF-8 byte is seen, then no match is possible. + pub fn only_utf8(&self) -> bool { + self.only_utf8 + } + + /// Return the approximate heap usage of this instruction sequence in + /// bytes. 
+ pub fn approximate_size(&self) -> usize { + // The only instruction that uses heap space is Ranges (for + // Unicode codepoint programs) to store non-overlapping codepoint + // ranges. To keep this operation constant time, we ignore them. + (self.len() * mem::size_of::()) + + (self.matches.len() * mem::size_of::()) + + (self.captures.len() * mem::size_of::>()) + + (self.capture_name_idx.len() + * (mem::size_of::() + mem::size_of::())) + + (self.byte_classes.len() * mem::size_of::()) + + self.prefixes.approximate_size() + } +} + +impl Deref for Program { + type Target = [Inst]; + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn deref(&self) -> &Self::Target { + &*self.insts + } +} + +impl fmt::Debug for Program { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::Inst::*; + + fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { + if goto == cur + 1 { + fmtd + } else { + format!("{} (goto: {})", fmtd, goto) + } + } + + fn visible_byte(b: u8) -> String { + use std::ascii::escape_default; + let escaped = escape_default(b).collect::>(); + String::from_utf8_lossy(&escaped).into_owned() + } + + for (pc, inst) in self.iter().enumerate() { + match *inst { + Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, + Save(ref inst) => { + let s = format!("{:04} Save({})", pc, inst.slot); + write!(f, "{}", with_goto(pc, inst.goto, s))?; + } + Split(ref inst) => { + write!( + f, + "{:04} Split({}, {})", + pc, inst.goto1, inst.goto2 + )?; + } + EmptyLook(ref inst) => { + let s = format!("{:?}", inst.look); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Char(ref inst) => { + let s = format!("{:?}", inst.c); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Ranges(ref inst) => { + let ranges = inst + .ranges + .iter() + .map(|r| format!("{:?}-{:?}", r.0, r.1)) + .collect::>() + .join(", "); + write!( + f, + "{:04} {}", + pc, + with_goto(pc, inst.goto, ranges) + )?; + } + Bytes(ref inst) => { + let s = 
format!( + "Bytes({}, {})", + visible_byte(inst.start), + visible_byte(inst.end) + ); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + } + if pc == self.start { + write!(f, " (start)")?; + } + writeln!(f)?; + } + Ok(()) + } +} + +impl<'a> IntoIterator for &'a Program { + type Item = &'a Inst; + type IntoIter = slice::Iter<'a, Inst>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Inst is an instruction code in a Regex program. +/// +/// Regrettably, a regex program either contains Unicode codepoint +/// instructions (Char and Ranges) or it contains byte instructions (Bytes). +/// A regex program can never contain both. +/// +/// It would be worth investigating splitting this into two distinct types and +/// then figuring out how to make the matching engines polymorphic over those +/// types without sacrificing performance. +/// +/// Other than the benefit of moving invariants into the type system, another +/// benefit is the decreased size. If we remove the `Char` and `Ranges` +/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to +/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` +/// variant.) Given that byte based machines are typically much bigger than +/// their Unicode analogues (because they can decode UTF-8 directly), this ends +/// up being a pretty significant savings. +#[derive(Clone, Debug)] +pub enum Inst { + /// Match indicates that the program has reached a match state. + /// + /// The number in the match corresponds to the Nth logical regular + /// expression in this program. This index is always 0 for normal regex + /// programs. Values greater than 0 appear when compiling regex sets, and + /// each match instruction gets its own unique value. The value corresponds + /// to the Nth regex in the set. + Match(usize), + /// Save causes the program to save the current location of the input in + /// the slot indicated by InstSave. 
+ Save(InstSave), + /// Split causes the program to diverge to one of two paths in the + /// program, preferring goto1 in InstSplit. + Split(InstSplit), + /// EmptyLook represents a zero-width assertion in a regex program. A + /// zero-width assertion does not consume any of the input text. + EmptyLook(InstEmptyLook), + /// Char requires the regex program to match the character in InstChar at + /// the current position in the input. + Char(InstChar), + /// Ranges requires the regex program to match the character at the current + /// position in the input with one of the ranges specified in InstRanges. + Ranges(InstRanges), + /// Bytes is like Ranges, except it expresses a single byte range. It is + /// used in conjunction with Split instructions to implement multi-byte + /// character classes. + Bytes(InstBytes), +} + +impl Inst { + /// Returns true if and only if this is a match instruction. + pub fn is_match(&self) -> bool { + match *self { + Inst::Match(_) => true, + _ => false, + } + } +} + +/// Representation of the Save instruction. +#[derive(Clone, Debug)] +pub struct InstSave { + /// The next location to execute in the program. + pub goto: InstPtr, + /// The capture slot (there are two slots for every capture in a regex, + /// including the zeroth capture for the entire match). + pub slot: usize, +} + +/// Representation of the Split instruction. +#[derive(Clone, Debug)] +pub struct InstSplit { + /// The first instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto1: InstPtr, + /// The second instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto2: InstPtr, +} + +/// Representation of the `EmptyLook` instruction. +#[derive(Clone, Debug)] +pub struct InstEmptyLook { + /// The next location to execute in the program if this instruction + /// succeeds. 
+ pub goto: InstPtr, + /// The type of zero-width assertion to check. + pub look: EmptyLook, +} + +/// The set of zero-width match instructions. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum EmptyLook { + /// Start of line or input. + StartLine, + /// End of line or input. + EndLine, + /// Start of input. + StartText, + /// End of input. + EndText, + /// Word character on one side and non-word character on other. + WordBoundary, + /// Word character on both sides or non-word character on both sides. + NotWordBoundary, + /// ASCII word boundary. + WordBoundaryAscii, + /// Not ASCII word boundary. + NotWordBoundaryAscii, +} + +/// Representation of the Char instruction. +#[derive(Clone, Debug)] +pub struct InstChar { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The character to test. + pub c: char, +} + +/// Representation of the Ranges instruction. +#[derive(Clone, Debug)] +pub struct InstRanges { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The set of Unicode scalar value ranges to test. + pub ranges: Box<[(char, char)]>, +} + +impl InstRanges { + /// Tests whether the given input character matches this instruction. + pub fn matches(&self, c: Char) -> bool { + // This speeds up the `match_class_unicode` benchmark by checking + // some common cases quickly without binary search. e.g., Matching + // a Unicode class on predominantly ASCII text. + for r in self.ranges.iter().take(4) { + if c < r.0 { + return false; + } + if c <= r.1 { + return true; + } + } + self.ranges + .binary_search_by(|r| { + if r.1 < c { + Ordering::Less + } else if r.0 > c { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .is_ok() + } + + /// Return the number of distinct characters represented by all of the + /// ranges. 
+ pub fn num_chars(&self) -> usize { + self.ranges + .iter() + .map(|&(s, e)| 1 + (e as u32) - (s as u32)) + .sum::() as usize + } +} + +/// Representation of the Bytes instruction. +#[derive(Clone, Debug)] +pub struct InstBytes { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The start (inclusive) of this byte range. + pub start: u8, + /// The end (inclusive) of this byte range. + pub end: u8, +} + +impl InstBytes { + /// Returns true if and only if the given byte is in this range. + pub fn matches(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +#[cfg(test)] +mod test { + #[test] + #[cfg(target_pointer_width = "64")] + fn test_size_of_inst() { + use std::mem::size_of; + + use super::Inst; + + assert_eq!(32, size_of::()); + } +} diff --git a/regex-1.8.4/src/re_builder.rs b/regex-1.8.4/src/re_builder.rs new file mode 100644 index 0000000000000..ee6383690d787 --- /dev/null +++ b/regex-1.8.4/src/re_builder.rs @@ -0,0 +1,421 @@ +/// The set of user configurable options for compiling zero or more regexes. +#[derive(Clone, Debug)] +#[allow(missing_docs)] +pub struct RegexOptions { + pub pats: Vec, + pub size_limit: usize, + pub dfa_size_limit: usize, + pub nest_limit: u32, + pub case_insensitive: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub octal: bool, +} + +impl Default for RegexOptions { + fn default() -> Self { + RegexOptions { + pats: vec![], + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + nest_limit: 250, + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + octal: false, + } + } +} + +macro_rules! 
define_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use super::RegexOptions; + use crate::error::Error; + use crate::exec::ExecBuilder; + + use crate::$regex_mod::Regex; + + /// A configurable builder for a regular expression. + /// + /// A builder can be used to configure how the regex is built, for example, by + /// setting the default flags (which can be overridden in the expression + /// itself) or setting various limits. + #[derive(Debug)] + pub struct RegexBuilder(RegexOptions); + + impl RegexBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new(pattern: &str) -> RegexBuilder { + let mut builder = RegexBuilder(RegexOptions::default()); + builder.0.pats.push(pattern.to_owned()); + builder + } + + /// Compile the regular expression using this builder's current options. + /// + /// The builder is only borrowed (its options are cloned), so it can be + /// reused after this call. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(Regex::from) + } + + /// Set the value for the case insensitive (`i`) flag. + /// + /// When enabled, letters in the pattern will match both upper case and + /// lower case variants. + pub fn case_insensitive( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + /// + /// When enabled, `^` matches the beginning of lines and `$` matches the + /// end of lines. + /// + /// By default, they match beginning/end of the input.
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" when Unicode is disabled and + /// means "any valid UTF-8 encoding of any Unicode scalar value" when + /// Unicode is enabled. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + /// + /// When enabled, a pattern like `a*` is lazy (tries to find shortest + /// match) and `a*?` is greedy (tries to find longest match). + /// + /// By default, `a*` is greedy and `a*?` is lazy. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + /// + /// When enabled, whitespace such as new lines and spaces will be ignored + /// between expressions of the pattern, and `#` can be used to start a + /// comment until the next new line. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + /// + /// Enabled by default. When disabled, character classes such as `\w` only + /// match ASCII word characters instead of all Unicode word characters. + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. 
+ /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit( + &mut self, + limit: usize, + ) -> &mut RegexBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + limit: usize, + ) -> &mut RegexBuilder { + self.0.dfa_size_limit = limit; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. 
While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.0.nest_limit = limit; + self + } + } + } + }; +} + +define_builder!(bytes, re_bytes, false); +define_builder!(unicode, re_unicode, true); + +macro_rules! define_set_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use super::RegexOptions; + use crate::error::Error; + use crate::exec::ExecBuilder; + + use crate::re_set::$regex_mod::RegexSet; + + /// A configurable builder for a set of regular expressions. + /// + /// A builder can be used to configure how the regexes are built, for example, + /// by setting the default flags (which can be overridden in the expression + /// itself) or setting various limits. + #[derive(Debug)] + pub struct RegexSetBuilder(RegexOptions); + + impl RegexSetBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. 
+ pub fn new(patterns: I) -> RegexSetBuilder + where + S: AsRef, + I: IntoIterator, + { + let mut builder = RegexSetBuilder(RegexOptions::default()); + for pat in patterns { + builder.0.pats.push(pat.as_ref().to_owned()); + } + builder + } + + /// Consume the builder and compile the regular expressions into a set. + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(RegexSet::from) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet` + /// expressions and means "any Unicode scalar value" for `regex::RegexSet` + /// expressions. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. 
For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit( + &mut self, + limit: usize, + ) -> &mut RegexSetBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + limit: usize, + ) -> &mut RegexSetBuilder { + self.0.dfa_size_limit = limit; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. 
+ /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. 
+ pub fn nest_limit( + &mut self, + limit: u32, + ) -> &mut RegexSetBuilder { + self.0.nest_limit = limit; + self + } + } + } + }; +} + +define_set_builder!(set_bytes, bytes, false); +define_set_builder!(set_unicode, unicode, true); diff --git a/regex-1.8.4/src/re_bytes.rs b/regex-1.8.4/src/re_bytes.rs new file mode 100644 index 0000000000000..e3a3b019b5501 --- /dev/null +++ b/regex-1.8.4/src/re_bytes.rs @@ -0,0 +1,1372 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt; +use std::iter::FusedIterator; +use std::ops::{Index, Range}; +use std::str::FromStr; +use std::sync::Arc; + +use crate::find_byte::find_byte; + +use crate::error::Error; +use crate::exec::{Exec, ExecNoSync}; +use crate::expand::expand_bytes; +use crate::re_builder::bytes::RegexBuilder; +use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t [u8], + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range { + self.start..self.end + } + + /// Returns the matched text. 
+ #[inline] + pub fn as_bytes(&self) -> &'t [u8] { + &self.text[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { + Match { text: haystack, start, end } + } +} + +impl<'t> std::fmt::Debug for Match<'t> { + /// Formats this match as a `Match` struct with `start`, `end` and `bytes` + /// fields. The `bytes` field is shown as a `&str` when the matched bytes + /// are valid UTF-8 and as a raw byte slice otherwise. + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let mut fmt = f.debug_struct("Match"); + fmt.field("start", &self.start).field("end", &self.end); + if let Ok(s) = std::str::from_utf8(self.as_bytes()) { + fmt.field("bytes", &s); + } else { + // FIXME: It would be nice if this could be printed as a string + // with invalid UTF-8 replaced with hex escapes. An alloc would + // probably be okay if that makes it easier, but regex-automata does + // (at time of writing) have internal routines that do this. So + // maybe we should expose them. + fmt.field("bytes", &self.as_bytes()); + } + fmt.finish() + } +} + +impl<'t> From> for Range { + fn from(m: Match<'t>) -> Range { + m.range() + } +} + +/// A compiled regular expression for matching arbitrary bytes. +/// +/// It can be used to search, split or replace text. All searching is done with +/// an implicit `.*?` at the beginning and end of an expression. To force an +/// expression to match the whole string (or a prefix or a suffix), you must +/// use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// Like the `Regex` type in the parent module, matches with this regex return +/// byte offsets into the search text. **Unlike** the parent `Regex` type, +/// these byte offsets may not correspond to UTF-8 sequence boundaries since +/// the regexes in this module can match arbitrary bytes. +#[derive(Clone)] +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression.
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// A constructor for Regex from an Exec. +/// +/// This is hidden because Exec isn't actually part of the public API. +#[doc(hidden)] +impl From for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex in the + /// string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 ASCII word + /// bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &[u8]) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. 
+ /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// ASCII word bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); + /// # } + /// ``` + pub fn find<'t>(&self, text: &'t [u8]) -> Option> { + self.find_at(text, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 ASCII + /// word bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"Retroactively relinquishing remunerations is reprehensible."; + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); + /// } + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { + Matches(self.0.searcher().find_iter(text)) + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. 
+ /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]); + /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], b"Citizen Kane"); + /// assert_eq!(&caps[2], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941"); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`.
Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { + self.captures_at(text, 0) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. + /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # use std::str; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// let title = str::from_utf8(&caps["title"]).unwrap(); + /// let year = str::from_utf8(&caps["year"]).unwrap(); + /// println!("Movie: {:?}, Released: {:?}", title, year); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t [u8], + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher().captures_iter(text)) + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. 
+ /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect(); + /// assert_eq!(fields, vec![ + /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], + /// ]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t [u8], + limit: usize, + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit } + } + + /// Replaces the leftmost-first match with the replacement provided. The + /// replacement can be a regular byte string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced byte string. + /// + /// If no match is found, then a copy of the byte string is returned + /// unchanged. 
+ /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. For example, `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal byte string: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, a + /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the + /// captures corresponding to a match.
This allows one to access capturing + /// group matches easily: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # use regex::bytes::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { + /// let mut replacement = caps[2].to_owned(); + /// replacement.push(b' '); + /// replacement.extend(&caps[1]); + /// replacement + /// }); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]); + /// assert_eq!(result, &b"deep_fried"[..]); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. 
+ /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// use regex::bytes::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); + /// assert_eq!(result, &b"$2 $last"[..]); + /// # } + /// ``` + pub fn replace<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. 
+ pub fn replacen<'t, R: Replacer>( + &self, + text: &'t [u8], + limit: usize, + mut rep: R, + ) -> Cow<'t, [u8]> { + // The fast path, which we use if the replacer reports (via + // `no_expansion`) a fixed replacement. Only overall match offsets are + // needed here, so `find_iter` is used instead of `captures_iter`. + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + // No match at all: hand the input back borrowed, without copying. + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + new.extend_from_slice(&text[last_match..m.start()]); + new.extend_from_slice(&rep); + last_match = m.end(); + // A limit of 0 means "replace every match"; otherwise stop once + // `limit` matches have been replaced. + if limit > 0 && i >= limit - 1 { + break; + } + } + // Copy whatever follows the last replaced match. + new.extend_from_slice(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.extend_from_slice(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + // Same limit handling as in the fast path above. + if limit > 0 && i >= limit - 1 { + break; + } + } + new.extend_from_slice(&text[last_match..]); + Cow::Owned(new) + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred.
This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &[u8]) -> Option<usize> { + self.shortest_match_at(text, 0) + } + + /// Returns the same as shortest_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn shortest_match_at( + &self, + text: &[u8], + start: usize, + ) -> Option<usize> { + self.0.searcher().shortest_match_at(text, start) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { + self.0.searcher().is_match_at(text, start) + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher() + .find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. 
+ /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// If the search succeeds, this returns the overall match, which is always + /// equivalent to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as `captures_read`, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias.
+ #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.0.capture_names().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.capture_names().len() + } + + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } +} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a tuple of integers corresponding to the start and end +/// of the match. The indices are byte offsets. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. 
+#[derive(Debug)] +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); + +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; + + fn next(&mut self) -> Option<Match<'t>> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) + } +} + +impl<'r, 't> FusedIterator for Matches<'r, 't> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. +#[derive(Debug)] +pub struct CaptureMatches<'r, 't>( + re_trait::CaptureMatches<'t, ExecNoSync<'r>>, +); + +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) + } +} + +impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +#[derive(Debug)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + +/// Yields at most `N` substrings delimited by a regular expression match. 
+/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +#[derive(Debug)] +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitN<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + if self.n == 0 { + return None; + } + + self.n -= 1; + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.n)) + } +} + +impl<'r, 't> FusedIterator for SplitN<'r, 't> {} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> FusedIterator for CaptureNames<'r> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. 
+/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `read_captures`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. 
The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// Captures represents a group of captured byte strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched byte string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t [u8], + locs: re_trait::Locations, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. 
+ /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures(b"abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); + /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(text1, &b"123"[..]); + /// assert_eq!(text2, &b""[..]); + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'t>> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option<Match<'t>> { + self.named_groups.get(name).and_then(|&i| self.get(i)) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, it: self.locs.iter() } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. 
+ /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of valid UTF-8 bytes is permitted. If the + /// sequence does not refer to a capture group name in the corresponding + /// regex, then it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { + expand_bytes(self, replacement, dst) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. + #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'t> fmt::Debug for Captures<'t> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 't>(&'c Captures<'t>); + +impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s + } + + fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec<u8> = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() + } + + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. 
+ let slot_to_name: HashMap<&usize, &String> = + self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for (slot, m) in self.0.locs.iter().enumerate() { + let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = [u8]; + + fn index(&self, i: usize) -> &[u8] { + self.get(i) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = [u8]; + + fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { + self.name(name) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. 
+/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 't> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, +} + +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option<Match<'t>>; + + fn next(&mut self) -> Option<Option<Match<'t>>> { + self.it + .next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) + } +} + +impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} + +/// Replacer describes types that can be used to replace matches in a byte +/// string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&[u8]` along with other +/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend(&caps[0])`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>); + + /// Return a fixed unchanging replacement byte string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &[u8], + /// mut rep: R, + /// ) -> Vec<u8> { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + self.0.replace_append(caps, dst) + } + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a [u8] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl Replacer for Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut 
self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { + let s = t.as_ref(); + match find_byte(b'$', s) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<[u8]>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal byte string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal byte string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t [u8]); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/regex-1.8.4/src/re_set.rs b/regex-1.8.4/src/re_set.rs new file mode 100644 index 0000000000000..7c8253f0caa95 --- /dev/null +++ b/regex-1.8.4/src/re_set.rs @@ -0,0 +1,518 @@ +macro_rules! define_set { + ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, + $(#[$doc_regexset_example:meta])* ) => { + pub mod $name { + use std::fmt; + use std::iter; + use std::slice; + use std::vec; + + use crate::error::Error; + use crate::exec::Exec; + use crate::re_builder::$builder_mod::RegexSetBuilder; + use crate::re_trait::RegularExpression; + +/// Match multiple (possibly overlapping) regular expressions in a single scan. +/// +/// A regex set corresponds to the union of two or more regular expressions. 
+/// That is, a regex set will match text where at least one of its +/// constituent regular expressions matches. A regex set as its formulated here +/// provides a touch more power: it will also report *which* regular +/// expressions in the set match. Indeed, this is the key difference between +/// regex sets and a single `Regex` with many alternates, since only one +/// alternate can match at a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a +/// regex set is constructed from those regexes, then searching the text +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the text. The key advantage of using a regex set is that it +/// will report the matching regexes using a *single pass through the text*. +/// If one has hundreds or thousands of regexes to match repeatedly (like a URL +/// router for a complex web application or a user agent matcher), then a regex +/// set can realize huge performance gains. +/// +/// # Example +/// +/// This shows how the above two regexes (for matching email addresses and +/// domains) might work: +/// +$(#[$doc_regexset_example])* +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```text +/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +/// +/// # Limitations +/// +/// Regex sets are limited to answering the following two questions: +/// +/// 1. Does any regex in the set match? 
+/// 2. If so, which regexes in the set match? +/// +/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) +/// instead of (2) since the matching engines can stop after the first match +/// is found. +/// +/// You cannot directly extract [`Match`][crate::Match] or +/// [`Captures`][crate::Captures] objects from a regex set. If you need these +/// operations, the recommended approach is to compile each pattern in the set +/// independently and scan the exact same input a second time with those +/// independently compiled patterns: +/// +/// ```rust +/// use regex::{Regex, RegexSet}; +/// +/// let patterns = ["foo", "bar"]; +/// // Both patterns will match different ranges of this string. +/// let text = "barfoo"; +/// +/// // Compile a set matching any of our patterns. +/// let set = RegexSet::new(&patterns).unwrap(); +/// // Compile each pattern independently. +/// let regexes: Vec<_> = set.patterns().iter() +/// .map(|pat| Regex::new(pat).unwrap()) +/// .collect(); +/// +/// // Match against the whole set first and identify the individual +/// // matching patterns. +/// let matches: Vec<&str> = set.matches(text).into_iter() +/// // Dereference the match index to get the corresponding +/// // compiled pattern. +/// .map(|match_idx| ®exes[match_idx]) +/// // To get match locations or any other info, we then have to search +/// // the exact same text again, using our separately-compiled pattern. +/// .map(|pat| pat.find(text).unwrap().as_str()) +/// .collect(); +/// +/// // Matches arrive in the order the constituent patterns were declared, +/// // not the order they appear in the input. +/// assert_eq!(vec!["foo", "bar"], matches); +/// ``` +/// +/// # Performance +/// +/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, +/// search takes `O(mn)` time, where `m` is proportional to the size of the +/// regex set and `n` is proportional to the length of the search text. 
+#[derive(Clone)] +pub struct RegexSet(Exec); + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// ``` + pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + where S: AsRef<str>, I: IntoIterator<Item=S> { + RegexSetBuilder::new(exprs).build() + } + + /// Create a new empty regex set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::empty(); + /// assert!(set.is_empty()); + /// ``` + pub fn empty() -> RegexSet { + RegexSetBuilder::new(&[""; 0]).build().unwrap() + } + + /// Returns true if and only if one of the regexes in this set matches + /// the text given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. 
+ /// + /// # Example + /// + /// Tests whether a set matches some text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// assert!(!set.is_match("☃")); + /// ``` + pub fn is_match(&self, text: $text_ty) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { + self.0.searcher().is_match_at($as_bytes(text), start) + } + + /// Returns the set of regular expressions that match in the given text. + /// + /// The set returned contains the index of each regular expression that + /// matches in the given text. The index is in correspondence with the + /// order of regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. 
+ /// + /// # Example + /// + /// Tests which regular expressions match the given text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches("foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + pub fn matches(&self, text: $text_ty) -> SetMatches { + let mut matches = vec![false; self.0.regex_strings().len()]; + let any = self.read_matches_at(&mut matches, text, 0); + SetMatches { + matched_any: any, + matches: matches, + } + } + + /// Returns the same as matches, but starts the search at the given + /// offset and stores the matches into the slice given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// `matches` must have a length that is at least the number of regexes + /// in this set. + /// + /// This method returns true if and only if at least one member of + /// `matches` is true after executing the set against `text`. + #[doc(hidden)] + pub fn read_matches_at( + &self, + matches: &mut [bool], + text: $text_ty, + start: usize, + ) -> bool { + self.0.searcher().many_matches_at(matches, $as_bytes(text), start) + } + + /// Returns the total number of regular expressions in this set. + pub fn len(&self) -> usize { + self.0.regex_strings().len() + } + + /// Returns `true` if this set contains no regular expressions. + pub fn is_empty(&self) -> bool { + self.0.regex_strings().is_empty() + } + + /// Returns the patterns that this set will match on. 
+ /// + /// This function can be used to determine the pattern for a match. The + /// slice returned has exactly as many patterns givens to this regex set, + /// and the order of the slice is the same as the order of the patterns + /// provided to the set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set + /// .matches("foobar") + /// .into_iter() + /// .map(|match_idx| &set.patterns()[match_idx]) + /// .collect(); + /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); + /// ``` + pub fn patterns(&self) -> &[String] { + self.0.regex_strings() + } +} + +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + +/// A set of matches returned by a regex set. +#[derive(Clone, Debug)] +pub struct SetMatches { + matched_any: bool, + matches: Vec<bool>, +} + +impl SetMatches { + /// Whether this set contains any matches. + pub fn matched_any(&self) -> bool { + self.matched_any + } + + /// Whether the regex at the given index matched. + /// + /// The index for a regex is determined by its insertion order upon the + /// initial construction of a `RegexSet`, starting at `0`. + /// + /// # Panics + /// + /// If `regex_index` is greater than or equal to `self.len()`. + pub fn matched(&self, regex_index: usize) -> bool { + self.matches[regex_index] + } + + /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. + pub fn len(&self) -> usize { + self.matches.len() + } + + /// Returns an iterator over indexes in the regex that matched. 
+ /// + /// This will always produces matches in ascending order of index, where + /// the index corresponds to the index of the regex that matched with + /// respect to its position when initially building the set. + pub fn iter(&self) -> SetMatchesIter<'_> { + SetMatchesIter((&*self.matches).into_iter().enumerate()) + } +} + +impl IntoIterator for SetMatches { + type IntoIter = SetMatchesIntoIter; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + SetMatchesIntoIter(self.matches.into_iter().enumerate()) + } +} + +impl<'a> IntoIterator for &'a SetMatches { + type IntoIter = SetMatchesIter<'a>; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// An owned iterator over the set of matches from a regex set. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Debug)] +pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); + +impl Iterator for SetMatchesIntoIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl DoubleEndedIterator for SetMatchesIntoIter { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } +} + +impl iter::FusedIterator for SetMatchesIntoIter {} + +/// A borrowed iterator over the set of matches from a regex set. +/// +/// The lifetime `'a` refers to the lifetime of a `SetMatches` value. 
+/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Clone, Debug)] +pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); + +impl<'a> Iterator for SetMatchesIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } +} + +impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} + +#[doc(hidden)] +impl From<Exec> for RegexSet { + fn from(exec: Exec) -> Self { + RegexSet(exec) + } +} + +impl fmt::Debug for RegexSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "RegexSet({:?})", self.0.regex_strings()) + } +} + +#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } +#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } + } + } +} + +define_set! { + unicode, + set_unicode, + &str, + as_bytes_str, +/// ```rust +/// # use regex::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match("foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. 
+/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches("example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} + +define_set! { + bytes, + set_bytes, + &[u8], + as_bytes_bytes, +/// ```rust +/// # use regex::bytes::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match(b"foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} diff --git a/regex-1.8.4/src/re_trait.rs b/regex-1.8.4/src/re_trait.rs new file mode 100644 index 0000000000000..505810c848cb2 --- /dev/null +++ b/regex-1.8.4/src/re_trait.rs @@ -0,0 +1,294 @@ +use std::fmt; +use std::iter::FusedIterator; + +/// Slot is a single saved capture location. Note that there are two slots for +/// every capture in a regular expression (one slot each for the start and end +/// of the capture). +pub type Slot = Option<usize>; + +/// Locations represents the offsets of each capturing group in a regex for +/// a single match. +/// +/// Unlike `Captures`, a `Locations` value only stores offsets. +#[doc(hidden)] +#[derive(Clone, Debug)] +pub struct Locations(Vec<Slot>); + +impl Locations { + /// Returns the start and end positions of the Nth capture group. 
Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?); + match (self.0.get(s), self.0.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter(&self) -> SubCapturesPosIter<'_> { + SubCapturesPosIter { idx: 0, locs: self } + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + pub fn len(&self) -> usize { + self.0.len() / 2 + } + + /// Return the individual slots as a slice. + pub(crate) fn as_slots(&mut self) -> &mut [Slot] { + &mut self.0 + } +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'c` is the lifetime of the captures. 
+#[derive(Clone, Debug)] +pub struct SubCapturesPosIter<'c> { + idx: usize, + locs: &'c Locations, +} + +impl<'c> Iterator for SubCapturesPosIter<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option<Option<(usize, usize)>> { + if self.idx >= self.locs.len() { + return None; + } + let x = match self.locs.pos(self.idx) { + None => Some(None), + Some((s, e)) => Some(Some((s, e))), + }; + self.idx += 1; + x + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.locs.len() - self.idx; + (len, Some(len)) + } + + fn count(self) -> usize { + self.len() + } +} + +impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} + +impl<'c> FusedIterator for SubCapturesPosIter<'c> {} + +/// `RegularExpression` describes types that can implement regex searching. +/// +/// This trait is my attempt at reducing code duplication and to standardize +/// the internal API. Specific duplication that is avoided are the `find` +/// and `capture` iterators, which are slightly tricky. +/// +/// It's not clear whether this trait is worth it, and it also isn't +/// clear whether it's useful as a public trait or not. Methods like +/// `next_after_empty` reak of bad design, but the rest of the methods seem +/// somewhat reasonable. One particular thing this trait would expose would be +/// the ability to start the search of a regex anywhere in a haystack, which +/// isn't possible in the current public API. +pub trait RegularExpression: Sized + fmt::Debug { + /// The type of the haystack. + type Text: ?Sized + fmt::Debug; + + /// The number of capture slots in the compiled regular expression. This is + /// always two times the number of capture groups (two slots per group). + fn slots_len(&self) -> usize; + + /// Allocates fresh space for all capturing groups in this regex. + fn locations(&self) -> Locations { + Locations(vec![None; self.slots_len()]) + } + + /// Returns the position of the next character after `i`. 
+ /// + /// For example, a haystack with type `&[u8]` probably returns `i+1`, + /// whereas a haystack with type `&str` probably returns `i` plus the + /// length of the next UTF-8 sequence. + fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; + + /// Returns the location of the shortest match. + fn shortest_match_at( + &self, + text: &Self::Text, + start: usize, + ) -> Option<usize>; + + /// Returns whether the regex matches the text given. + fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; + + /// Returns the leftmost-first match location if one exists. + fn find_at( + &self, + text: &Self::Text, + start: usize, + ) -> Option<(usize, usize)>; + + /// Returns the leftmost-first match location if one exists, and also + /// fills in any matching capture slot locations. + fn captures_read_at( + &self, + locs: &mut Locations, + text: &Self::Text, + start: usize, + ) -> Option<(usize, usize)>; + + /// Returns an iterator over all non-overlapping successive leftmost-first + /// matches. + fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { + Matches { re: self, text, last_end: 0, last_match: None } + } + + /// Returns an iterator over all non-overlapping successive leftmost-first + /// matches with captures. + fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { + CaptureMatches(self.find_iter(text)) + } +} + +/// An iterator over all non-overlapping successive leftmost-first matches. +#[derive(Debug)] +pub struct Matches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + re: R, + text: &'t R::Text, + last_end: usize, + last_match: Option<usize>, +} + +impl<'t, R> Matches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + /// Return the text being searched. + pub fn text(&self) -> &'t R::Text { + self.text + } + + /// Return the underlying regex. 
+ pub fn regex(&self) -> &R { + &self.re + } +} + +impl<'t, R> Iterator for Matches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + if self.last_end > self.text.as_ref().len() { + return None; + } + let (s, e) = match self.re.find_at(self.text, self.last_end) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = self.re.next_after_empty(self.text, e); + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(e) == self.last_match { + return self.next(); + } + } else { + self.last_end = e; + } + self.last_match = Some(e); + Some((s, e)) + } +} + +impl<'t, R> FusedIterator for Matches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ +} + +/// An iterator over all non-overlapping successive leftmost-first matches with +/// captures. +#[derive(Debug)] +pub struct CaptureMatches<'t, R>(Matches<'t, R>) +where + R: RegularExpression, + R::Text: 't; + +impl<'t, R> CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + /// Return the text being searched. + pub fn text(&self) -> &'t R::Text { + self.0.text() + } + + /// Return the underlying regex. 
+ pub fn regex(&self) -> &R { + self.0.regex() + } +} + +impl<'t, R> Iterator for CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ + type Item = Locations; + + fn next(&mut self) -> Option<Locations> { + if self.0.last_end > self.0.text.as_ref().len() { + return None; + } + let mut locs = self.0.re.locations(); + let (s, e) = match self.0.re.captures_read_at( + &mut locs, + self.0.text, + self.0.last_end, + ) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + self.0.last_end = self.0.re.next_after_empty(self.0.text, e); + if Some(e) == self.0.last_match { + return self.next(); + } + } else { + self.0.last_end = e; + } + self.0.last_match = Some(e); + Some(locs) + } +} + +impl<'t, R> FusedIterator for CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ +} diff --git a/regex-1.8.4/src/re_unicode.rs b/regex-1.8.4/src/re_unicode.rs new file mode 100644 index 0000000000000..57689086dc429 --- /dev/null +++ b/regex-1.8.4/src/re_unicode.rs @@ -0,0 +1,1415 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt; +use std::iter::FusedIterator; +use std::ops::{Index, Range}; +use std::str::FromStr; +use std::sync::Arc; + +use crate::find_byte::find_byte; + +use crate::error::Error; +use crate::exec::{Exec, ExecNoSyncStr}; +use crate::expand::expand_str; +use crate::re_builder::unicode::RegexBuilder; +use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + regex_syntax::escape(text) +} + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. 
+#[derive(Copy, Clone, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t str, + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range<usize> { + self.start..self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_str(&self) -> &'t str { + &self.text[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { + Match { text: haystack, start, end } + } +} + +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Match") + .field("start", &self.start) + .field("end", &self.end) + .field("string", &self.as_str()) + .finish() + } +} + +impl<'t> From<Match<'t>> for &'t str { + fn from(m: Match<'t>) -> &'t str { + m.as_str() + } +} + +impl<'t> From<Match<'t>> for Range<usize> { + fn from(m: Match<'t>) -> Range<usize> { + m.range() + } +} + +/// A compiled regular expression for matching Unicode strings. +/// +/// It is represented as either a sequence of bytecode instructions (dynamic) +/// or as a specialized Rust function (native). It can be used to search, split +/// or replace text. All searching is done with an implicit `.*?` at the +/// beginning and end of an expression. 
To force an expression to match the +/// whole string (or a prefix or a suffix), you must use an anchor like `^` or +/// `$` (or `\A` and `\z`). +/// +/// While this crate will handle Unicode strings (whether in the regular +/// expression or in the search text), all positions returned are **byte +/// indices**. Every byte index is guaranteed to be at a Unicode code point +/// boundary. +/// +/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a +/// compiled regular expression and text to search, respectively. +/// +/// The only methods that allocate new strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// pointers into the string given. +/// +/// # Examples +/// +/// Find the location of a US phone number: +/// +/// ```rust +/// # use regex::Regex; +/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let mat = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!((mat.start(), mat.end()), (7, 19)); +/// ``` +/// +/// # Using the `std::str::pattern` methods with `Regex` +/// +/// > **Note**: This section requires that this crate is compiled with the +/// > `pattern` Cargo feature enabled, which **requires nightly Rust**. +/// +/// Since `Regex` implements `Pattern`, you can use regexes with methods +/// defined on `&str`. For example, `is_match`, `find`, `find_iter` +/// and `split` can be replaced with `str::contains`, `str::find`, +/// `str::match_indices` and `str::split`. 
+/// +/// Here are some examples: +/// +/// ```rust,ignore +/// # use regex::Regex; +/// let re = Regex::new(r"\d+").unwrap(); +/// let haystack = "a111b222c"; +/// +/// assert!(haystack.contains(&re)); +/// assert_eq!(haystack.find(&re), Some(1)); +/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(), +/// vec![(1, "111"), (5, "222")]); +/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); +/// ``` +#[derive(Clone)] +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +#[doc(hidden)] +impl From<Exec> for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result<Regex, Error> { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result<Regex, Error> { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex in the + /// string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. 
+ /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 + /// Unicode word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &str) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// Unicode word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!(mat.start(), 2); + /// assert_eq!(mat.end(), 15); + /// # } + /// ``` + pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> { + self.find_at(text, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. 
+ /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 Unicode + /// word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "Retroactively relinquishing remunerations is reprehensible."; + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); + /// } + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { + Matches(self.0.searcher_str().find_iter(text)) + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], "Citizen Kane"); + /// assert_eq!(&caps[2], "1941"); + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. 
Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { + self.captures_at(text, 0) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. 
+ /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// println!("Movie: {:?}, Released: {:?}", + /// &caps["title"], &caps["year"]); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t str, + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher_str().captures_iter(text)) + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. 
+ /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec!("Hey", "How", "are you?")); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t str, + limit: usize, + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit } + } + + /// Replaces the leftmost-first match with the replacement provided. + /// The replacement can be a regular string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced string. + /// + /// If no match is found, then a copy of the string is returned unchanged. + /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. 
+ /// In typical usage, this can just be a normal string: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace("1078910", ""), "1010"); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, + /// a closure of type `|&Captures| -> String` provides direct access to the + /// captures corresponding to a match. This allows one to access + /// capturing group matches easily: + /// + /// ```rust + /// # use regex::Regex; + /// # use regex::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { + /// format!("{} {}", &caps[2], &caps[1]) + /// }); + /// assert_eq!(result, "Bruce Springsteen"); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", "$first $last"); + /// assert_eq!(result, "Bruce Springsteen"); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. 
+ /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace("deep fried", "${first}_$second"); + /// assert_eq!(result, "deep_fried"); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// use regex::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); + /// assert_eq!(result, "$2 $last"); + /// # } + /// ``` + pub fn replace<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement string. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement string. 
+ pub fn replacen<'t, R: Replacer>( + &self, + text: &'t str, + limit: usize, + mut rep: R, + ) -> Cow<'t, str> { + // If we know that the replacement doesn't have any capture expansions, + // then we can use the fast path. The fast path can make a tremendous + // difference: + // + // 1) We use `find_iter` instead of `captures_iter`. Not asking for + // captures generally makes the regex engines faster. + // 2) We don't need to look up all of the capture groups and do + // replacements inside the replacement string. We just push it + // at each match and be done with it. + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + new.push_str(&text[last_match..m.start()]); + new.push_str(&rep); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.push_str(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.push_str(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.push_str(&text[last_match..]); + Cow::Owned(new) + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. 
In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &str) -> Option<usize> { + self.shortest_match_at(text, 0) + } + + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. + pub fn shortest_match_at( + &self, + text: &str, + start: usize, + ) -> Option<usize> { + self.0.searcher_str().shortest_match_at(text, start) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.searcher_str().is_match_at(text, start) + } + + /// Returns the same as find, but starts the search at the given + /// offset. 
+ /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher_str() + .find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalence to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. 
+ pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher_str() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.0.capture_names().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.capture_names().len() + } + + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } +} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. 
+#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> FusedIterator for CaptureNames<'r> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. +#[derive(Debug)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. 
+#[derive(Debug)] +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitN<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + if self.n == 0 { + return None; + } + + self.n -= 1; + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.n)) + } +} + +impl<'r, 't> FusedIterator for SplitN<'r, 't> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `read_captures`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. 
+/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. 
+ #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// Captures represents a group of captured strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched string is *also* available via the `name` +/// method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t str, + locs: re_trait::Locations, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or("", |m| m.as_str()); + /// let text2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(text1, "123"); + /// assert_eq!(text2, ""); + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'t>> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. 
+ pub fn name(&self, name: &str) -> Option<Match<'t>> { + self.named_groups.get(name).and_then(|&i| self.get(i)) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, it: self.locs.iter() } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of characters is permitted. If the sequence + /// does not refer to a capture group name in the corresponding regex, then + /// it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) + } + + /// Returns the total number of capture groups (even if they didn't match). 
+ /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. + #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'t> fmt::Debug for Captures<'t> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 't>(&'c Captures<'t>); + +impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. + let slot_to_name: HashMap<&usize, &String> = + self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for (slot, m) in self.0.locs.iter().enumerate() { + let m = m.map(|(s, e)| &self.0.text[s..e]); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = str; + + fn index(&self, i: usize) -> &str { + self.get(i) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. 
+/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = str; + + fn index<'a>(&'a self, name: &'i str) -> &'a str { + self.name(name) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 't> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, +} + +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option<Match<'t>>; + + fn next(&mut self) -> Option<Option<Match<'t>>> { + self.it + .next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {} + +impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. 
+#[derive(Debug)] +pub struct CaptureMatches<'r, 't>( + re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>, +); + +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) + } +} + +impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a `Match` value. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>); + +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; + + fn next(&mut self) -> Option<Match<'t>> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) + } +} + +impl<'r, 't> FusedIterator for Matches<'r, 't> {} + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` along with other +/// variants of string types and `FnMut(&Captures) -> String` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.push_str(caps.get(0).unwrap().as_str())`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); + + /// Return a fixed unchanging replacement string. 
+ /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. + /// + /// # Example + /// + /// ``` + /// use regex::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &str, + /// mut rep: R, + /// ) -> String { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). 
+#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.0.replace_append(caps, dst) + } + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl Replacer for String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { + let s = t.as_ref(); + match find_byte(b'$', s.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<str>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + dst.push_str((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal string replacement. 
+/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t str); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/regex-1.8.4/src/sparse.rs b/regex-1.8.4/src/sparse.rs new file mode 100644 index 0000000000000..98b726613d16c --- /dev/null +++ b/regex-1.8.4/src/sparse.rs @@ -0,0 +1,84 @@ +use std::fmt; +use std::ops::Deref; +use std::slice; + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse allocations, so the initial allocation cost is bareable. However, +/// its other properties listed above are extremely useful. +#[derive(Clone)] +pub struct SparseSet { + /// Dense contains the instruction pointers in the order in which they + /// were inserted. + dense: Vec<usize>, + /// Sparse maps instruction pointers to their location in dense. + /// + /// An instruction pointer is in the set if and only if + /// sparse[ip] < dense.len() && ip == dense[sparse[ip]]. 
+ sparse: Box<[usize]>, +} + +impl SparseSet { + pub fn new(size: usize) -> SparseSet { + SparseSet { + dense: Vec::with_capacity(size), + sparse: vec![0; size].into_boxed_slice(), + } + } + + pub fn len(&self) -> usize { + self.dense.len() + } + + pub fn is_empty(&self) -> bool { + self.dense.is_empty() + } + + pub fn capacity(&self) -> usize { + self.dense.capacity() + } + + pub fn insert(&mut self, value: usize) { + let i = self.len(); + assert!(i < self.capacity()); + self.dense.push(value); + self.sparse[value] = i; + } + + pub fn contains(&self, value: usize) -> bool { + let i = self.sparse[value]; + self.dense.get(i) == Some(&value) + } + + pub fn clear(&mut self) { + self.dense.clear(); + } +} + +impl fmt::Debug for SparseSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "SparseSet({:?})", self.dense) + } +} + +impl Deref for SparseSet { + type Target = [usize]; + + fn deref(&self) -> &Self::Target { + &self.dense + } +} + +impl<'a> IntoIterator for &'a SparseSet { + type Item = &'a usize; + type IntoIter = slice::Iter<'a, usize>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} diff --git a/regex-1.8.4/src/testdata/LICENSE b/regex-1.8.4/src/testdata/LICENSE new file mode 100644 index 0000000000000..f47dbf4c449bc --- /dev/null +++ b/regex-1.8.4/src/testdata/LICENSE @@ -0,0 +1,19 @@ +The following license covers testregex.c and all associated test data. 
+ +Permission is hereby granted, free of charge, to any person obtaining a +copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do +so, subject to the following disclaimer: + +THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/regex-1.8.4/src/testdata/README b/regex-1.8.4/src/testdata/README new file mode 100644 index 0000000000000..6efc2dad33774 --- /dev/null +++ b/regex-1.8.4/src/testdata/README @@ -0,0 +1,17 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +The LICENSE in this directory corresponds to the LICENSE that the data was +released under. + +The tests themselves were modified for RE2/Go. A couple were modified further +by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. +(Yes, it seems like RE2/Go includes failing test cases.) This may or may not +have been a bad idea, but I think being consistent with an established Regex +library is worth something. 
+ +Note that these files are read by 'scripts/regex-match-tests.py' and turned +into Rust tests found in 'regex_macros/tests/matches.rs'. + diff --git a/regex-1.8.4/src/testdata/basic.dat b/regex-1.8.4/src/testdata/basic.dat new file mode 100644 index 0000000000000..632e1bb4165db --- /dev/null +++ b/regex-1.8.4/src/testdata/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) 
aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) 
+BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] 
Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E 
((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/regex-1.8.4/src/testdata/nullsubexpr.dat b/regex-1.8.4/src/testdata/nullsubexpr.dat new file mode 100644 index 0000000000000..2e18fbb917070 --- /dev/null +++ b/regex-1.8.4/src/testdata/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) 
RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? 
aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/regex-1.8.4/src/testdata/repetition.dat b/regex-1.8.4/src/testdata/repetition.dat new file mode 100644 index 0000000000000..3bb2121180005 --- /dev/null +++ b/regex-1.8.4/src/testdata/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 
+ +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). 
+ +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E 
(ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/regex-1.8.4/src/utf8.rs b/regex-1.8.4/src/utf8.rs new file mode 100644 index 0000000000000..2dfd2c0d1d9c3 --- /dev/null +++ b/regex-1.8.4/src/utf8.rs @@ -0,0 +1,264 @@ +/// A few elementary UTF-8 encoding and decoding functions used by the matching +/// engines. +/// +/// In an ideal world, the matching engines operate on `&str` and we can just +/// lean on the standard library for all our UTF-8 needs. However, to support +/// byte based regexes (that can match on arbitrary bytes which may contain +/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. +/// The standard library doesn't really recognize this use case, so we have +/// to build it out ourselves. +/// +/// Should this be factored out into a separate crate? It seems independently +/// useful. There are other crates that already exist (e.g., `utf-8`) that have +/// overlapping use cases. Not sure what to do. +use std::char; + +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO: u8 = 0b1100_0000; +const TAG_THREE: u8 = 0b1110_0000; +const TAG_FOUR: u8 = 0b1111_0000; + +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. +pub fn next_utf8(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i + 1, + Some(&b) => b, + }; + let inc = if b <= 0x7F { + 1 + } else if b <= 0b110_11111 { + 2 + } else if b <= 0b1110_1111 { + 3 + } else { + 4 + }; + i + inc +} + +/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. +/// +/// If no valid UTF-8 sequence could be found, then `None` is returned. +/// Otherwise, the decoded codepoint and the number of bytes read is returned. +/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be +/// 1, 2, 3 or 4. 
+/// +/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a +/// codepoint that is out of range (surrogate codepoints are out of range) or +/// is not the shortest possible UTF-8 sequence for that codepoint. +#[inline] +pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { + let b0 = match src.get(0) { + None => return None, + Some(&b) if b <= 0x7F => return Some((b as char, 1)), + Some(&b) => b, + }; + match b0 { + 0b110_00000..=0b110_11111 => { + if src.len() < 2 { + return None; + } + let b1 = src[1]; + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); + match cp { + 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), + _ => None, + } + } + 0b1110_0000..=0b1110_1111 => { + if src.len() < 3 { + return None; + } + let (b1, b2) = (src[1], src[2]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_THREE) as u32) << 12 + | ((b1 & !TAG_CONT) as u32) << 6 + | ((b2 & !TAG_CONT) as u32); + match cp { + // char::from_u32 will disallow surrogate codepoints. + 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), + _ => None, + } + } + 0b11110_000..=0b11110_111 => { + if src.len() < 4 { + return None; + } + let (b1, b2, b3) = (src[1], src[2], src[3]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + if 0b11_000000 & b3 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_FOUR) as u32) << 18 + | ((b1 & !TAG_CONT) as u32) << 12 + | ((b2 & !TAG_CONT) as u32) << 6 + | ((b3 & !TAG_CONT) as u32); + match cp { + 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), + _ => None, + } + } + _ => None, + } +} + +/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead +/// of the first. 
+pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { + if src.is_empty() { + return None; + } + let mut start = src.len() - 1; + if src[start] <= 0x7F { + return Some((src[start] as char, 1)); + } + while start > src.len().saturating_sub(4) { + start -= 1; + if is_start_byte(src[start]) { + break; + } + } + match decode_utf8(&src[start..]) { + None => None, + Some((_, n)) if n < src.len() - start => None, + Some((cp, n)) => Some((cp, n)), + } +} + +fn is_start_byte(b: u8) -> bool { + b & 0b11_000000 != 0b1_0000000 +} + +#[cfg(test)] +mod tests { + use std::str; + + use quickcheck::quickcheck; + + use super::{ + decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, + }; + + #[test] + fn prop_roundtrip() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_roundtrip_last() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, got_len) = + decode_last_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_encode_matches_std() { + fn p(cp: char) -> bool { + let mut got = [0; 4]; + let n = cp.encode_utf8(&mut got).len(); + let expected = cp.to_string(); + &got[..n] == expected.as_bytes() + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); + let expected_cp = + str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_last_matches_std() { + fn 
p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); + let expected_cp = str::from_utf8(&tmp[..n]) + .unwrap() + .chars() + .rev() + .next() + .unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn reject_invalid() { + // Invalid start byte + assert_eq!(decode_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); + // Invalid continuation byte. + assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); + // Bad lengths + assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); + assert_eq!( + decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]), + None + ); + } + + #[test] + fn reject_invalid_last() { + // Invalid start byte + assert_eq!(decode_last_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); + // Bad lengths + assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!( + decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]), + None + ); + assert_eq!( + decode_last_utf8( + &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',] + ), + None + ); + } +} diff --git a/regex-1.8.4/test b/regex-1.8.4/test new file mode 100755 index 0000000000000..b10564f12803b --- /dev/null +++ b/regex-1.8.4/test @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +# This is a convenience script for running a broad swath of tests across 
+# features. We don't test the complete space, since the complete space is quite +# large. Hopefully once we migrate the test suite to better infrastructure +# (like regex-automata), we'll be able to test more of the space. +echo "===== DEFAULT FEATURES ===" +cargo test + +echo "===== DOC TESTS ===" +cargo test --doc + +features=( + "std" + "std unicode" + "std unicode-perl" + "std perf" + "std perf-cache" + "std perf-dfa" + "std perf-inline" + "std perf-literal" +) +for f in "${features[@]}"; do + echo "===== FEATURE: $f (default) ===" + cargo test --test default --no-default-features --features "$f" + echo "===== FEATURE: $f (default-bytes) ===" + cargo test --test default-bytes --no-default-features --features "$f" +done diff --git a/regex-1.8.4/tests/api.rs b/regex-1.8.4/tests/api.rs new file mode 100644 index 0000000000000..c7250a8a3a5d8 --- /dev/null +++ b/regex-1.8.4/tests/api.rs @@ -0,0 +1,234 @@ +#[test] +fn empty_regex_empty_match() { + let re = regex!(""); + assert_eq!(vec![(0, 0)], findall!(re, "")); +} + +#[test] +fn empty_regex_nonempty_match() { + let re = regex!(""); + assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc")); +} + +#[test] +fn one_zero_length_match() { + let re = regex!(r"[0-9]*"); + assert_eq!(vec![(0, 0), (1, 2), (3, 4)], findall!(re, "a1b2")); +} + +#[test] +fn many_zero_length_match() { + let re = regex!(r"[0-9]*"); + assert_eq!( + vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)], + findall!(re, "a1bbb2") + ); +} + +#[test] +fn many_sequential_zero_length_match() { + let re = regex!(r"[0-9]?"); + assert_eq!( + vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)], + findall!(re, "a12b3c") + ); +} + +#[test] +fn quoted_bracket_set() { + let re = regex!(r"([\x{5b}\x{5d}])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); + let re = regex!(r"([\[\]])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn first_range_starts_with_left_bracket() { + let re = regex!(r"([\[-z])"); + assert_eq!(vec![(0, 1), 
(1, 2)], findall!(re, "[]")); +} + +#[test] +fn range_ends_with_escape() { + let re = regex!(r"([\[-\x{5d}])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn empty_match_find_iter() { + let re = regex!(r".*?"); + assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc")); +} + +#[test] +fn empty_match_captures_iter() { + let re = regex!(r".*?"); + let ms: Vec<_> = re + .captures_iter(text!("abc")) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); +} + +#[test] +fn capture_names() { + let re = regex!(r"(.)(?P<a>.)"); + assert_eq!(3, re.captures_len()); + assert_eq!((3, Some(3)), re.capture_names().size_hint()); + assert_eq!( + vec![None, None, Some("a")], + re.capture_names().collect::<Vec<_>>() + ); +} + +#[test] +fn regex_string() { + assert_eq!(r"[a-zA-Z0-9]+", regex!(r"[a-zA-Z0-9]+").as_str()); + assert_eq!(r"[a-zA-Z0-9]+", &format!("{}", regex!(r"[a-zA-Z0-9]+"))); + assert_eq!(r"[a-zA-Z0-9]+", &format!("{:?}", regex!(r"[a-zA-Z0-9]+"))); +} + +#[test] +fn capture_index() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + assert_eq!(&cap[0], t!("abc")); + assert_eq!(&cap[1], t!("abc")); + assert_eq!(&cap["name"], t!("abc")); +} + +#[test] +#[should_panic] +#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)] +fn capture_index_panic_usize() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + let _ = cap[2]; +} + +#[test] +#[should_panic] +#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)] +fn capture_index_panic_name() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + let _ = cap["bad name"]; +} + +#[test] +fn capture_index_lifetime() { + // This is a test of whether the types on `caps["..."]` are general + // enough. If not, this will fail to typecheck. 
+ fn inner(s: &str) -> usize { + let re = regex!(r"(?P<number>[0-9]+)"); + let caps = re.captures(t!(s)).unwrap(); + caps["number"].len() + } + assert_eq!(3, inner("123")); +} + +#[test] +fn capture_misc() { + let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)"); + let cap = re.captures(t!("abc")).unwrap(); + + assert_eq!(5, cap.len()); + + assert_eq!((0, 3), { + let m = cap.get(0).unwrap(); + (m.start(), m.end()) + }); + assert_eq!(None, cap.get(2)); + assert_eq!((2, 3), { + let m = cap.get(4).unwrap(); + (m.start(), m.end()) + }); + + assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap())); + assert_eq!(None, cap.get(2)); + assert_eq!(t!("c"), match_text!(cap.get(4).unwrap())); + + assert_eq!(None, cap.name("a")); + assert_eq!(t!("c"), match_text!(cap.name("b").unwrap())); +} + +#[test] +fn sub_capture_matches() { + let re = regex!(r"([a-z])(([a-z])|([0-9]))"); + let cap = re.captures(t!("a5")).unwrap(); + let subs: Vec<_> = cap.iter().collect(); + + assert_eq!(5, subs.len()); + assert!(subs[0].is_some()); + assert!(subs[1].is_some()); + assert!(subs[2].is_some()); + assert!(subs[3].is_none()); + assert!(subs[4].is_some()); + + assert_eq!(t!("a5"), match_text!(subs[0].unwrap())); + assert_eq!(t!("a"), match_text!(subs[1].unwrap())); + assert_eq!(t!("5"), match_text!(subs[2].unwrap())); + assert_eq!(t!("5"), match_text!(subs[4].unwrap())); +} + +expand!(expand1, r"(?-u)(?P<foo>\w+)", "abc", "$foo", "abc"); +expand!(expand2, r"(?-u)(?P<foo>\w+)", "abc", "$0", "abc"); +expand!(expand3, r"(?-u)(?P<foo>\w+)", "abc", "$1", "abc"); +expand!(expand4, r"(?-u)(?P<foo>\w+)", "abc", "$$1", "$1"); +expand!(expand5, r"(?-u)(?P<foo>\w+)", "abc", "$$foo", "$foo"); +expand!(expand6, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$b$a", "123abc"); +expand!(expand7, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "z$bz$az", "z"); +expand!( + expand8, + r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", + ".$b.$a.", + ".123.abc." 
+); +expand!( + expand9, + r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", + " $b $a ", + " 123 abc " +); +expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", ""); + +expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%"); +expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc["); +expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{"); +expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}"); +expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%"); +expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%"); +expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc["); +expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "["); +expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "["); +expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "["); +expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "["); + +split!( + split1, + r"(?-u)\s+", + "a b\nc\td\n\t e", + &[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")] +); +split!( + split2, + r"(?-u)\b", + "a b c", + &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c"), t!("")] +); +split!(split3, r"a$", "a", &[t!(""), t!("")]); +split!(split_none, r"-", r"a", &[t!("a")]); +split!(split_trailing_blank, r"-", r"a-", &[t!("a"), t!("")]); +split!(split_trailing_blanks, r"-", r"a--", &[t!("a"), t!(""), t!("")]); +split!(split_empty, r"-", r"", &[t!("")]); + +splitn!(splitn_below_limit, r"-", r"a", 2, &[t!("a")]); +splitn!(splitn_at_limit, r"-", r"a-b", 2, &[t!("a"), t!("b")]); +splitn!(splitn_above_limit, r"-", r"a-b-c", 2, &[t!("a"), t!("b-c")]); +splitn!(splitn_zero_limit, r"-", r"a-b", 0, empty_vec!()); +splitn!(splitn_trailing_blank, r"-", r"a-", 2, &[t!("a"), t!("")]); +splitn!(splitn_trailing_separator, r"-", r"a--", 2, &[t!("a"), t!("-")]); +splitn!(splitn_empty, r"-", r"", 1, &[t!("")]); diff --git a/regex-1.8.4/tests/api_str.rs b/regex-1.8.4/tests/api_str.rs new file mode 100644 index 0000000000000..480116da739fe --- 
/dev/null +++ b/regex-1.8.4/tests/api_str.rs @@ -0,0 +1,34 @@ +// These tests don't really make sense with the bytes API, so we only test them +// on the Unicode API. + +#[test] +fn empty_match_unicode_find_iter() { + // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries + // even when we're susceptible to empty width matches. + let re = regex!(r".*?"); + assert_eq!( + vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], + findall!(re, "Ⅰ1Ⅱ2") + ); +} + +#[test] +fn empty_match_unicode_captures_iter() { + // Same as empty_match_unicode_find_iter, but tests capture iteration. + let re = regex!(r".*?"); + let ms: Vec<_> = re + .captures_iter(text!("Ⅰ1Ⅱ2")) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms); +} + +#[test] +fn match_as_str() { + let re = regex!(r"fo+"); + let caps = re.captures("barfoobar").unwrap(); + assert_eq!(caps.get(0).map(|m| m.as_str()), Some("foo")); + assert_eq!(caps.get(0).map(From::from), Some("foo")); + assert_eq!(caps.get(0).map(Into::into), Some("foo")); +} diff --git a/regex-1.8.4/tests/bytes.rs b/regex-1.8.4/tests/bytes.rs new file mode 100644 index 0000000000000..d05f138edfa5e --- /dev/null +++ b/regex-1.8.4/tests/bytes.rs @@ -0,0 +1,107 @@ +// These are tests specifically crafted for regexes that can match arbitrary +// bytes. + +// A silly wrapper to make it possible to write and match raw bytes. 
+struct R<'a>(&'a [u8]); +impl<'a> R<'a> { + fn as_bytes(&self) -> &'a [u8] { + self.0 + } +} + +mat!(word_boundary, r"(?-u) \b", " δ", None); +#[cfg(feature = "unicode-perl")] +mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); +mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(word_not_boundary_unicode, r" \B", " δ", None); + +mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); +mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); +mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4))); + +// The first `(.+)` matches two Unicode codepoints, but can't match the 5th +// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and +// matches. +mat!( + mixed1, + r"(.+)(?-u)(.+)", + R(b"\xCE\x93\xCE\x94\xFF"), + Some((0, 5)), + Some((0, 4)), + Some((4, 5)) +); + +mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); +mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); +#[cfg(feature = "unicode-case")] +mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); +mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); + +mat!(negate_unicode, r"[^a]", "δ", Some((0, 2))); +mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1))); + +// This doesn't match in a normal Unicode regex because the implicit preceding +// `.*?` is Unicode aware. +mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2))); +mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2))); + +// Have fun with null bytes. +mat!( + null_bytes, + r"(?-u)(?P<cstr>[^\x00]+)\x00", + R(b"foo\x00"), + Some((0, 4)), + Some((0, 3)) +); + +// Test that lookahead operators work properly in the face of invalid UTF-8. 
+// See: https://github.com/rust-lang/regex/issues/277 +matiter!( + invalidutf8_anchor1, + r"(?-u)\xcc?^", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (0, 0) +); +matiter!( + invalidutf8_anchor2, + r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (22, 22) +); +matiter!( + invalidutf8_anchor3, + r"(?-u)^|ddp\xff\xffdddddlQd@\x80", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (0, 0) +); + +// See https://github.com/rust-lang/regex/issues/303 +#[test] +fn negated_full_byte_range() { + assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err()); +} + +matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!( + word_boundary_ascii2, + r"(?-u:\B)", + "0\u{7EF5E}", + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); + +// See: https://github.com/rust-lang/regex/issues/264 +mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); + +// See: https://github.com/rust-lang/regex/issues/271 +mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8))); diff --git a/regex-1.8.4/tests/consistent.rs b/regex-1.8.4/tests/consistent.rs new file mode 100644 index 0000000000000..722f2a51a0b6e --- /dev/null +++ b/regex-1.8.4/tests/consistent.rs @@ -0,0 +1,238 @@ +use regex::internal::ExecBuilder; + +/// Given a regex, check if all of the backends produce the same +/// results on a number of different inputs. +/// +/// For now this just throws quickcheck at the problem, which +/// is not very good because it only really tests half of the +/// problem space. It is pretty unlikely that a random string +/// will match any given regex, so this will probably just +/// be checking that the different backends fail in the same +/// way. 
This is still worthwhile to test, but is definitely not +/// the whole story. +/// +/// TODO(ethan): In order to cover the other half of the problem +/// space, we should generate a random matching string by inspecting +/// the AST of the input regex. The right way to do this probably +/// involves adding a custom Arbitrary instance around a couple +/// of newtypes. That way we can respect the quickcheck size hinting +/// and shrinking and whatnot. +pub fn backends_are_consistent(re: &str) -> Result<u64, String> { + let standard_backends = vec![ + ( + "bounded_backtracking_re", + ExecBuilder::new(re) + .bounded_backtracking() + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "pikevm_re", + ExecBuilder::new(re) + .nfa() + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "default_re", + ExecBuilder::new(re) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ]; + + let utf8bytes_backends = vec![ + ( + "bounded_backtracking_utf8bytes_re", + ExecBuilder::new(re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "pikevm_utf8bytes_re", + ExecBuilder::new(re) + .nfa() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "default_utf8bytes_re", + ExecBuilder::new(re) + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ]; + + let bytes_backends = vec![ + ( + "bounded_backtracking_bytes_re", + ExecBuilder::new(re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "pikevm_bytes_re", + ExecBuilder::new(re) + .nfa() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "default_bytes_re", + 
ExecBuilder::new(re) + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err))?, + ), + ]; + + Ok(string_checker::check_backends(&standard_backends)? + + string_checker::check_backends(&utf8bytes_backends)? + + bytes_checker::check_backends(&bytes_backends)?) +} + +// +// A consistency checker parameterized by the input type (&str or &[u8]). +// + +macro_rules! checker { + ($module_name:ident, $regex_type:path, $mk_input:expr) => { + mod $module_name { + use quickcheck; + use quickcheck::{Arbitrary, TestResult}; + + pub fn check_backends( + backends: &[(&str, $regex_type)], + ) -> Result<u64, String> { + let mut total_passed = 0; + for regex in backends[1..].iter() { + total_passed += quickcheck_regex_eq(&backends[0], regex)?; + } + + Ok(total_passed) + } + + fn quickcheck_regex_eq( + &(name1, ref re1): &(&str, $regex_type), + &(name2, ref re2): &(&str, $regex_type), + ) -> Result<u64, String> { + quickcheck::QuickCheck::new() + .quicktest(RegexEqualityTest::new( + re1.clone(), + re2.clone(), + )) + .map_err(|err| { + format!( + "{}(/{}/) and {}(/{}/) are inconsistent.\ + QuickCheck Err: {:?}", + name1, re1, name2, re2, err + ) + }) + } + + struct RegexEqualityTest { + re1: $regex_type, + re2: $regex_type, + } + impl RegexEqualityTest { + fn new(re1: $regex_type, re2: $regex_type) -> Self { + RegexEqualityTest { re1: re1, re2: re2 } + } + } + + impl quickcheck::Testable for RegexEqualityTest { + fn result(&self, gen: &mut quickcheck::Gen) -> TestResult { + let input = $mk_input(gen); + let input = &input; + + if self.re1.find(&input) != self.re2.find(input) { + return TestResult::error(format!( + "find mismatch input={:?}", + input + )); + } + + let cap1 = self.re1.captures(input); + let cap2 = self.re2.captures(input); + match (cap1, cap2) { + (None, None) => {} + (Some(cap1), Some(cap2)) => { + for (c1, c2) in cap1.iter().zip(cap2.iter()) { + if c1 != c2 { + return TestResult::error(format!( + "captures mismatch 
input={:?}", + input + )); + } + } + } + _ => { + return TestResult::error(format!( + "captures mismatch input={:?}", + input + )) + } + } + + let fi1 = self.re1.find_iter(input); + let fi2 = self.re2.find_iter(input); + for (m1, m2) in fi1.zip(fi2) { + if m1 != m2 { + return TestResult::error(format!( + "find_iter mismatch input={:?}", + input + )); + } + } + + let ci1 = self.re1.captures_iter(input); + let ci2 = self.re2.captures_iter(input); + for (cap1, cap2) in ci1.zip(ci2) { + for (c1, c2) in cap1.iter().zip(cap2.iter()) { + if c1 != c2 { + return TestResult::error(format!( + "captures_iter mismatch input={:?}", + input + )); + } + } + } + + let s1 = self.re1.split(input); + let s2 = self.re2.split(input); + for (chunk1, chunk2) in s1.zip(s2) { + if chunk1 != chunk2 { + return TestResult::error(format!( + "split mismatch input={:?}", + input + )); + } + } + + TestResult::from_bool(true) + } + } + } // mod + }; // rule case +} // macro_rules! + +checker!(string_checker, ::regex::Regex, |gen| String::arbitrary(gen)); +checker!(bytes_checker, ::regex::bytes::Regex, |gen| Vec::<u8>::arbitrary( + gen +)); diff --git a/regex-1.8.4/tests/crates_regex.rs b/regex-1.8.4/tests/crates_regex.rs new file mode 100644 index 0000000000000..200ec27b2d251 --- /dev/null +++ b/regex-1.8.4/tests/crates_regex.rs @@ -0,0 +1,3287 @@ +// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py' +// on 2018-06-20 09:56:32.820354. 
+ +// autoshutdown-0.1.0: r"\s*(\d+)(\w)\s*" +consistent!(autoshutdown_0, r"\s*(\d+)(\w)\s*"); + +// epub-1.1.1: r"/" +consistent!(epub_0, r"/"); + +// rpi-info-0.2.0: "^Revision\t+: ([0-9a-fA-F]+)" +consistent!(rpi_info_0, "^Revision\t+: ([0-9a-fA-F]+)"); + +// rpi-info-0.2.0: "Serial\t+: ([0-9a-fA-F]+)" +consistent!(rpi_info_1, "Serial\t+: ([0-9a-fA-F]+)"); + +// pnet_macros-0.21.0: r"^u([0-9]+)(be|le|he)?$" +consistent!(pnet_macros_0, r"^u([0-9]+)(be|le|he)?$"); + +// iban_validate-1.0.3: r"^[A-Z]{2}\d{2}[A-Z\d]{1,30}$" +consistent!(iban_validate_0, r"^[A-Z]{2}\d{2}[A-Z\d]{1,30}$"); + +// markifier-0.1.0: r".*\[(?P<percent>.+)%.*\].*" +consistent!(markifier_0, r".*\[(?P<percent>.+)%.*\].*"); + +// mallumo-0.3.0: r"(#include) (\S*)(.*)" +consistent!(mallumo_0, r"(#include) (\S*)(.*)"); + +// mallumo-0.3.0: r"(ERROR: \d+:)(\d+)(: )(.+)" +consistent!(mallumo_1, r"(ERROR: \d+:)(\d+)(: )(.+)"); + +// mallumo-0.3.0: r"(\d+\()(\d+)(?:\) : )(.+)" +consistent!(mallumo_2, r"(\d+\()(\d+)(?:\) : )(.+)"); + +// magnet_more-0.0.1: r"(.+?)(\[.*?\])?" 
+consistent!(magnet_more_0, r"(.+?)(\[.*?\])?"); + +// magnet_app-0.0.1: r":(?P<k>[a-zA-Z_]+)" +consistent!(magnet_app_0, r":(?P<k>[a-zA-Z_]+)"); + +// yubibomb-0.2.0: r"^\d{6}(?:\s*,\s*\d{6})*$" +consistent!(yubibomb_0, r"^\d{6}(?:\s*,\s*\d{6})*$"); + +// multirust-rs-0.0.4: r"[\\/]([^\\/?]+)(\?.*)?$" +consistent!(multirust_rs_0, r"[\\/]([^\\/?]+)(\?.*)?$"); + +// hueclient-0.3.2: "\"[a-z]*\":null" +consistent!(hueclient_0, "\"[a-z]*\":null"); + +// hueclient-0.3.2: ",+" +consistent!(hueclient_1, ",+"); + +// hueclient-0.3.2: ",\\}" +consistent!(hueclient_2, ",\\}"); + +// hueclient-0.3.2: "\\{," +consistent!(hueclient_3, "\\{,"); + +// aerial-0.1.0: r"[a-zA-Z_\$][a-zA-Z_0-9]*" +consistent!(aerial_0, r"[a-zA-Z_\$][a-zA-Z_0-9]*"); + +// aerial-0.1.0: r"thi[sng]+" +consistent!(aerial_1, r"thi[sng]+"); + +// rvue-0.1.0: r"(.+)\s+\((.+?)\)" +consistent!(rvue_0, r"(.+)\s+\((.+?)\)"); + +// rvue-0.1.0: r"([\d\.]+)\s*out\s*of\s*([\d\.]+)" +consistent!(rvue_1, r"([\d\.]+)\s*out\s*of\s*([\d\.]+)"); + +// rvue-0.1.0: r"^([\d\.]+)\s*(?:\(\))?$" +consistent!(rvue_2, r"^([\d\.]+)\s*(?:\(\))?$"); + +// rvue-0.1.0: r"([\d\.]+)\s*Points\s*Possible" +consistent!(rvue_3, r"([\d\.]+)\s*Points\s*Possible"); + +// rvue-0.1.0: r"([\d\.]+)\s*/\s*([\d\.]+)" +consistent!(rvue_4, r"([\d\.]+)\s*/\s*([\d\.]+)"); + +// rvsim-0.1.0: r"_?([_a-z0-9]+)\s*:\s*([_a-z0-9]+)\s*[,)]" +consistent!(rvsim_0, r"_?([_a-z0-9]+)\s*:\s*([_a-z0-9]+)\s*[,)]"); + +// nereon-0.1.4: "(.*[^\\\\])\\{\\}(.*)" +consistent!(nereon_0, "(.*[^\\\\])\\{\\}(.*)"); + +// next_episode-0.3.0: r"((?i)^(.+).s(\d+)e(\d+).*)$" +consistent!(next_episode_0, r"((?i)^(.+).s(\d+)e(\d+).*)$"); + +// migrant_lib-0.19.2: r"[^a-z0-9-]+" +consistent!(migrant_lib_0, r"[^a-z0-9-]+"); + +// migrant_lib-0.19.2: r"[0-9]{14}_[a-z0-9-]+" +consistent!(migrant_lib_1, r"[0-9]{14}_[a-z0-9-]+"); + +// migrant_lib-0.19.2: r"([0-9]{14}_)?[a-z0-9-]+" +consistent!(migrant_lib_2, r"([0-9]{14}_)?[a-z0-9-]+"); + +// minipre-0.2.0: "$_" +consistent!(minipre_0, 
"$_"); + +// minifier-0.0.13: r">\s+<" +consistent!(minifier_0, r">\s+<"); + +// minifier-0.0.13: r"\s{2,}|[\r\n]" +consistent!(minifier_1, r"\s{2,}|[\r\n]"); + +// minifier-0.0.13: r"<(style|script)[\w|\s].*?>" +consistent!(minifier_2, r"<(style|script)[\w|\s].*?>"); + +// minifier-0.0.13: "<!--(.|\n)*?-->" +consistent!(minifier_3, "<!--(.|\n)*?-->"); + +// minifier-0.0.13: r"<\w.*?>" +consistent!(minifier_4, r"<\w.*?>"); + +// minifier-0.0.13: r" \s+|\s +" +consistent!(minifier_5, r" \s+|\s +"); + +// minifier-0.0.13: r"\w\s+\w" +consistent!(minifier_6, r"\w\s+\w"); + +// minifier-0.0.13: r"'\s+>" +consistent!(minifier_7, r"'\s+>"); + +// minifier-0.0.13: r"\d\s+>" +consistent!(minifier_8, r"\d\s+>"); + +// ggp-rs-0.1.2: r"(?P<relation>\([^)]+\))|(?P<prop>[a-zA-Z0-9_]+)" +consistent!(ggp_rs_0, r"(?P<relation>\([^)]+\))|(?P<prop>[a-zA-Z0-9_]+)"); + +// ggp-rs-0.1.2: r"\((.*)\)." +consistent!(ggp_rs_1, r"\((.*)\)."); + +// poe-superfilter-0.2.0: "[A-Za-z0-9_]" +consistent!(poe_superfilter_0, "[A-Za-z0-9_]"); + +// poke-a-mango-0.5.0: r"(\d+)x(\d+)" +consistent!(poke_a_mango_0, r"(\d+)x(\d+)"); + +// pop3-rs-0.1.0: r"(?P<nmsg>\d+) (?P<size>\d+)" +consistent!(pop3_rs_0, r"(?P<nmsg>\d+) (?P<size>\d+)"); + +// pop3-rs-0.1.0: r"(?P<msgid>\d+) (?P<uidl>[\x21-\x7E]{1,70})" +consistent!(pop3_rs_1, r"(?P<msgid>\d+) (?P<uidl>[\x21-\x7E]{1,70})"); + +// pop3-rs-0.1.0: r"(<.*>)\r\n$" +consistent!(pop3_rs_2, r"(<.*>)\r\n$"); + +// pop3-rs-0.1.0: r"^(?P<status>\+OK|-ERR) (?P<statustext>.*)" +consistent!(pop3_rs_3, r"^(?P<status>\+OK|-ERR) (?P<statustext>.*)"); + +// pop3-1.0.6: r"^\.\r\n$" +consistent!(pop3_0, r"^\.\r\n$"); + +// pop3-1.0.6: r"\+OK(.*)" +consistent!(pop3_1, r"\+OK(.*)"); + +// pop3-1.0.6: r"-ERR(.*)" +consistent!(pop3_2, r"-ERR(.*)"); + +// pop3-1.0.6: r"\+OK (\d+) (\d+)\r\n" +consistent!(pop3_3, r"\+OK (\d+) (\d+)\r\n"); + +// pop3-1.0.6: r"(\d+) ([\x21-\x7e]+)\r\n" +consistent!(pop3_4, r"(\d+) ([\x21-\x7e]+)\r\n"); + +// pop3-1.0.6: r"\+OK (\d+) 
([\x21-\x7e]+)\r\n" +consistent!(pop3_5, r"\+OK (\d+) ([\x21-\x7e]+)\r\n"); + +// pop3-1.0.6: r"(\d+) (\d+)\r\n" +consistent!(pop3_6, r"(\d+) (\d+)\r\n"); + +// pop3-1.0.6: r"\+OK (\d+) (\d+)\r\n" +consistent!(pop3_7, r"\+OK (\d+) (\d+)\r\n"); + +// polk-1.1.3: "github:(\\w+)/?(\\w+)?" +consistent!(polk_0, "github:(\\w+)/?(\\w+)?"); + +// geochunk-0.1.5: "^[0-9]{5}" +consistent!(geochunk_0, "^[0-9]{5}"); + +// generic-dns-update-1.1.4: r"((?:(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?)\.){3}(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?))" +consistent!(generic_dns_update_0, r"((?:(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?)\.){3}(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?))"); + +// generic-dns-update-1.1.4: r"((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(([0-9A-Fa-f]{1,4}:){0,5}:((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(::([0-9A-Fa-f]{1,4}:){0,5}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))" +consistent!(generic_dns_update_1, 
r"((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(([0-9A-Fa-f]{1,4}:){0,5}:((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(::([0-9A-Fa-f]{1,4}:){0,5}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))"); + +// generic-dns-update-1.1.4: r"<value><string>([0-9.]*)</string></value>" +consistent!( + generic_dns_update_2, + r"<value><string>([0-9.]*)</string></value>" +); + +// generic-dns-update-1.1.4: r"<int>([0-9]+)</int>" +consistent!(generic_dns_update_3, r"<int>([0-9]+)</int>"); + +// generic-dns-update-1.1.4: r"<int>([0-9]+)</int>" +consistent!(generic_dns_update_4, r"<int>([0-9]+)</int>"); + +// generic-dns-update-1.1.4: r"<boolean>([0-1]*)</boolean>" +consistent!(generic_dns_update_5, r"<boolean>([0-1]*)</boolean>"); + +// generate-nix-pkg-0.3.0: r"(\d*)\.(\d*)\.(\d*)(-(\S*))?" +consistent!(generate_nix_pkg_0, r"(\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// generate-nix-pkg-0.3.0: r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?" 
+consistent!(generate_nix_pkg_1, r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// genact-0.6.0: r"arch/([a-z0-9_])+/" +consistent!(genact_0, r"arch/([a-z0-9_])+/"); + +// genact-0.6.0: r"arch/([a-z0-9_])+/" +consistent!(genact_1, r"arch/([a-z0-9_])+/"); + +// cron_rs-0.1.6: r"^\s*((\*(/\d+)?)|[0-9-,/]+)(\s+((\*(/\d+)?)|[0-9-,/]+)){4,5}\s*$" +consistent!( + cron_rs_0, + r"^\s*((\*(/\d+)?)|[0-9-,/]+)(\s+((\*(/\d+)?)|[0-9-,/]+)){4,5}\s*$" +); + +// systemfd-0.3.0: r"^([a-zA-Z]+)::(.+)$" +consistent!(systemfd_0, r"^([a-zA-Z]+)::(.+)$"); + +// symbolic-debuginfo-5.0.2: "__?hidden#\\d+_" +consistent!(symbolic_debuginfo_0, "__?hidden#\\d+_"); + +// symbolic-minidump-5.0.2: r"^Linux ([^ ]+) (.*) \w+(?: GNU/Linux)?$" +consistent!(symbolic_minidump_0, r"^Linux ([^ ]+) (.*) \w+(?: GNU/Linux)?$"); + +// graphql-idl-parser-0.1.1: "^(?u:\\#)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+" +consistent!(graphql_idl_parser_0, "^(?u:\\#)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+"); + +// graphql-idl-parser-0.1.1: "^(?u:=)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+" +consistent!(graphql_idl_parser_1, "^(?u:=)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+"); + +// graphql-idl-parser-0.1.1: "^(?u:[A-Z_-_a-z])(?u:[0-9A-Z_-_a-z])*" +consistent!(graphql_idl_parser_2, "^(?u:[A-Z_-_a-z])(?u:[0-9A-Z_-_a-z])*"); + +// graphql-idl-parser-0.1.1: "^(?u:!)" +consistent!(graphql_idl_parser_3, "^(?u:!)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\()" +consistent!(graphql_idl_parser_4, "^(?u:\\()"); + +// graphql-idl-parser-0.1.1: "^(?u:\\))" +consistent!(graphql_idl_parser_5, 
"^(?u:\\))"); + +// graphql-idl-parser-0.1.1: "^(?u:,)" +consistent!(graphql_idl_parser_6, "^(?u:,)"); + +// graphql-idl-parser-0.1.1: "^(?u::)" +consistent!(graphql_idl_parser_7, "^(?u::)"); + +// graphql-idl-parser-0.1.1: "^(?u:@)" +consistent!(graphql_idl_parser_8, "^(?u:@)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\[)" +consistent!(graphql_idl_parser_9, "^(?u:\\[)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\])" +consistent!(graphql_idl_parser_10, "^(?u:\\])"); + +// graphql-idl-parser-0.1.1: "^(?u:enum)" +consistent!(graphql_idl_parser_11, "^(?u:enum)"); + +// graphql-idl-parser-0.1.1: "^(?u:implements)" +consistent!(graphql_idl_parser_12, "^(?u:implements)"); + +// graphql-idl-parser-0.1.1: "^(?u:input)" +consistent!(graphql_idl_parser_13, "^(?u:input)"); + +// graphql-idl-parser-0.1.1: "^(?u:interface)" +consistent!(graphql_idl_parser_14, "^(?u:interface)"); + +// graphql-idl-parser-0.1.1: "^(?u:scalar)" +consistent!(graphql_idl_parser_15, "^(?u:scalar)"); + +// graphql-idl-parser-0.1.1: "^(?u:type)" +consistent!(graphql_idl_parser_16, "^(?u:type)"); + +// graphql-idl-parser-0.1.1: "^(?u:union)" +consistent!(graphql_idl_parser_17, "^(?u:union)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\{)" +consistent!(graphql_idl_parser_18, "^(?u:\\{)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\})" +consistent!(graphql_idl_parser_19, "^(?u:\\})"); + +// grimoire-0.1.0: r"(?s)/\*(?P<config>.*?)\*/" +consistent!(grimoire_0, r"(?s)/\*(?P<config>.*?)\*/"); + +// phonenumber-0.2.0+8.9.0: r"[\d]+(?:[~\x{2053}\x{223C}\x{FF5E}][\d]+)?" 
+consistent!(phonenumber_0, r"[\d]+(?:[~\x{2053}\x{223C}\x{FF5E}][\d]+)?"); + +// phonenumber-0.2.0+8.9.0: r"[, \[\]]" +consistent!(phonenumber_1, r"[, \[\]]"); + +// phonenumber-0.2.0+8.9.0: r"[\\/] *x" +consistent!(phonenumber_2, r"[\\/] *x"); + +// phonenumber-0.2.0+8.9.0: r"[[\P{N}&&\P{L}]&&[^#]]+$" +consistent!(phonenumber_3, r"[[\P{N}&&\P{L}]&&[^#]]+$"); + +// phonenumber-0.2.0+8.9.0: r"(?:.*?[A-Za-z]){3}.*" +consistent!(phonenumber_4, r"(?:.*?[A-Za-z]){3}.*"); + +// phonenumber-0.2.0+8.9.0: r"(\D+)" +consistent!(phonenumber_5, r"(\D+)"); + +// phonenumber-0.2.0+8.9.0: r"(\$\d)" +consistent!(phonenumber_6, r"(\$\d)"); + +// phonenumber-0.2.0+8.9.0: r"\(?\$1\)?" +consistent!(phonenumber_7, r"\(?\$1\)?"); + +// phone_number-0.1.0: r"\D" +consistent!(phone_number_0, r"\D"); + +// phone_number-0.1.0: r"^0+" +consistent!(phone_number_1, r"^0+"); + +// phone_number-0.1.0: r"^89" +consistent!(phone_number_2, r"^89"); + +// phone_number-0.1.0: r"^8+" +consistent!(phone_number_3, r"^8+"); + +// phile-0.1.4: r"^ *(\^_*\^) *$" +consistent!(phile_0, r"^ *(\^_*\^) *$"); + +// phile-0.1.4: r"^[_\p{XID_Start}]$" +consistent!(phile_1, r"^[_\p{XID_Start}]$"); + +// phile-0.1.4: r"^\p{XID_Continue}$" +consistent!(phile_2, r"^\p{XID_Continue}$"); + +// uritemplate-0.1.2: "%25(?P<hex>[0-9a-fA-F][0-9a-fA-F])" +consistent!(uritemplate_0, "%25(?P<hex>[0-9a-fA-F][0-9a-fA-F])"); + +// urdf-rs-0.4.2: "^package://(\\w+)/" +consistent!(urdf_rs_0, "^package://(\\w+)/"); + +// url-match-0.1.7: r"(?P<key>[?&.])" +consistent!(url_match_0, r"(?P<key>[?&.])"); + +// url-match-0.1.7: r":(?P<key>[a-zA-Z0-9_-]+)" +consistent!(url_match_1, r":(?P<key>[a-zA-Z0-9_-]+)"); + +// tsm-sys-0.1.0: r"hello world" +consistent!(tsm_sys_0, r"hello world"); + +// deb-version-0.1.0: "^(?:(?:(?:\\d+:).+)|(?:[^:]+))$" +consistent!(deb_version_0, "^(?:(?:(?:\\d+:).+)|(?:[^:]+))$"); + +// debcargo-2.1.0: r"^(?i)(a|an|the)\s+" +consistent!(debcargo_0, r"^(?i)(a|an|the)\s+"); + +// debcargo-2.1.0: 
r"^(?i)(rust\s+)?(implementation|library|tool|crate)\s+(of|to|for)\s+" +consistent!( + debcargo_1, + r"^(?i)(rust\s+)?(implementation|library|tool|crate)\s+(of|to|for)\s+" +); + +// feaders-0.2.0: r"^.*\.h$" +consistent!(feaders_0, r"^.*\.h$"); + +// feaders-0.2.0: r"^.*\.c$" +consistent!(feaders_1, r"^.*\.c$"); + +// feaders-0.2.0: r"^.*\.hpp$" +consistent!(feaders_2, r"^.*\.hpp$"); + +// feaders-0.2.0: r"^.*\.cc$" +consistent!(feaders_3, r"^.*\.cc$"); + +// feaders-0.2.0: r"^.*\.cpp$" +consistent!(feaders_4, r"^.*\.cpp$"); + +// hyperscan-0.1.6: r"CPtr\(\w+\)" +consistent!(hyperscan_0, r"CPtr\(\w+\)"); + +// hyperscan-0.1.6: r"^Version:\s(\d\.\d\.\d)\sFeatures:\s+(\w+)?\sMode:\s(\w+)$" +consistent!( + hyperscan_1, + r"^Version:\s(\d\.\d\.\d)\sFeatures:\s+(\w+)?\sMode:\s(\w+)$" +); + +// hyperscan-0.1.6: r"RawDatabase<Block>\{db: \w+\}" +consistent!(hyperscan_2, r"RawDatabase<Block>\{db: \w+\}"); + +// hyperscan-0.1.6: r"RawSerializedDatabase\{p: \w+, len: \d+\}" +consistent!(hyperscan_3, r"RawSerializedDatabase\{p: \w+, len: \d+\}"); + +// ucd-parse-0.1.1: r"[0-9A-F]+" +consistent!(ucd_parse_0, r"[0-9A-F]+"); + +// afsort-0.2.0: r".*" +consistent!(afsort_0, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_1, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_2, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_3, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_4, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_5, r".*"); + +// afsort-0.2.0: r"^[a-z]+$" +consistent!(afsort_6, r"^[a-z]+$"); + +// afsort-0.2.0: r"^[a-z]+$" +consistent!(afsort_7, r"^[a-z]+$"); + +// tin-summer-1.21.4: r"(\.git|\.pijul|_darcs|\.hg)$" +consistent!(tin_summer_0, r"(\.git|\.pijul|_darcs|\.hg)$"); + +// tin-drummer-1.0.1: r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" 
+consistent!(tin_drummer_0, r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// tin-drummer-1.0.1: r".*?\.(stats|conf|h|out|cache.*|dat|pc|info|\.js)$" +consistent!( + tin_drummer_1, + r".*?\.(stats|conf|h|out|cache.*|dat|pc|info|\.js)$" +); + +// tin-drummer-1.0.1: r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(tin_drummer_2, r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// tin-drummer-1.0.1: r".*?\.(stats|conf|h|out|cache.*|\.js)$" +consistent!(tin_drummer_3, r".*?\.(stats|conf|h|out|cache.*|\.js)$"); + +// tin-drummer-1.0.1: r"(\.git|\.pijul|_darcs|\.hg)$" +consistent!(tin_drummer_4, r"(\.git|\.pijul|_darcs|\.hg)$"); + +// tin-drummer-1.0.1: r".*?\.(dyn_o|out|d|hi|dyn_hi|dump-.*|p_hi|p_o|prof|tix)$" +consistent!( + tin_drummer_5, + r".*?\.(dyn_o|out|d|hi|dyn_hi|dump-.*|p_hi|p_o|prof|tix)$" +); + +// tin-drummer-1.0.1: r".*?\.(ibc)$" +consistent!(tin_drummer_6, r".*?\.(ibc)$"); + +// tin-drummer-1.0.1: r"\.stack-work|dist-newstyle" +consistent!(tin_drummer_7, r"\.stack-work|dist-newstyle"); + +// timmy-0.3.0: r"_NET_WM_PID\(CARDINAL\) = (\d+)" +consistent!(timmy_0, r"_NET_WM_PID\(CARDINAL\) = (\d+)"); + +// timmy-0.3.0: r"today|yesterday|now" +consistent!(timmy_1, r"today|yesterday|now"); + +// timmy-0.3.0: r"(?P<day>\d{1,2})/(?P<month>\d{1,2})(/(?P<year>\d{4}|\d{2}))?" +consistent!( + timmy_2, + r"(?P<day>\d{1,2})/(?P<month>\d{1,2})(/(?P<year>\d{4}|\d{2}))?" 
+); + +// timmy-0.3.0: r"(?P<n>\d+) (days?|ds?)(?P<ago>( ago)?)" +consistent!(timmy_3, r"(?P<n>\d+) (days?|ds?)(?P<ago>( ago)?)"); + +// timmy-0.3.0: r"(?P<hr>\d{2}):(?P<mins>\d{2})" +consistent!(timmy_4, r"(?P<hr>\d{2}):(?P<mins>\d{2})"); + +// tinfo-0.5.0: r"^(\d+): \d+ windows \(.*\) \[\d+x\d+\]( \(attached\))?" +consistent!( + tinfo_0, + r"^(\d+): \d+ windows \(.*\) \[\d+x\d+\]( \(attached\))?" +); + +// tinfo-0.5.0: r"^(\d+):(\d+): (.*) \((\d+) panes\) \[(\d+)x(\d+)\]" +consistent!(tinfo_1, r"^(\d+):(\d+): (.*) \((\d+) panes\) \[(\d+)x(\d+)\]"); + +// timespan-0.0.4: r"(?:\\\{start\\\}|\\\{end\\\})" +consistent!(timespan_0, r"(?:\\\{start\\\}|\\\{end\\\})"); + +// timespan-0.0.4: r"(.*)\s+-\s+(.*)" +consistent!(timespan_1, r"(.*)\s+-\s+(.*)"); + +// timespan-0.0.4: r"(.*)\s+(\w+)$" +consistent!(timespan_2, r"(.*)\s+(\w+)$"); + +// timespan-0.0.4: r"(.*)\s+(\w+)$" +consistent!(timespan_3, r"(.*)\s+(\w+)$"); + +// timespan-0.0.4: r"(.*)\s+-\s+(.*)" +consistent!(timespan_4, r"(.*)\s+-\s+(.*)"); + +// titlecase-0.10.0: r"[[:lower:]]" +consistent!(titlecase_0, r"[[:lower:]]"); + +// tight-0.1.3: r"^\d+ (day|week|month|year)s?$" +consistent!(tight_0, r"^\d+ (day|week|month|year)s?$"); + +// tight-0.1.3: r"^\d+ (day|week|month|year)s?$" +consistent!(tight_1, r"^\d+ (day|week|month|year)s?$"); + +// yaml-0.2.1: r"^[-+]?(0|[1-9][0-9_]*)$" +consistent!(yaml_0, r"^[-+]?(0|[1-9][0-9_]*)$"); + +// yaml-0.2.1: r"^([-+]?)0o?([0-7_]+)$" +consistent!(yaml_1, r"^([-+]?)0o?([0-7_]+)$"); + +// yaml-0.2.1: r"^([-+]?)0x([0-9a-fA-F_]+)$" +consistent!(yaml_2, r"^([-+]?)0x([0-9a-fA-F_]+)$"); + +// yaml-0.2.1: r"^([-+]?)0b([0-1_]+)$" +consistent!(yaml_3, r"^([-+]?)0b([0-1_]+)$"); + +// yaml-0.2.1: r"^([-+]?)(\.[0-9]+|[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?)$" +consistent!( + yaml_4, + r"^([-+]?)(\.[0-9]+|[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?)$" +); + +// yaml-0.2.1: r"^[+]?(\.inf|\.Inf|\.INF)$" +consistent!(yaml_5, r"^[+]?(\.inf|\.Inf|\.INF)$"); + +// yaml-0.2.1: r"^-(\.inf|\.Inf|\.INF)$" 
+consistent!(yaml_6, r"^-(\.inf|\.Inf|\.INF)$"); + +// yaml-0.2.1: r"^(\.nan|\.NaN|\.NAN)$" +consistent!(yaml_7, r"^(\.nan|\.NaN|\.NAN)$"); + +// yaml-0.2.1: r"^(null|Null|NULL|~)$" +consistent!(yaml_8, r"^(null|Null|NULL|~)$"); + +// yaml-0.2.1: r"^(true|True|TRUE|yes|Yes|YES)$" +consistent!(yaml_9, r"^(true|True|TRUE|yes|Yes|YES)$"); + +// yaml-0.2.1: r"^(false|False|FALSE|no|No|NO)$" +consistent!(yaml_10, r"^(false|False|FALSE|no|No|NO)$"); + +// kefia-0.1.0: r"(?m)^(\S+)/(\S+) (\S+)(?: \((.*)\))?$" +consistent!(kefia_0, r"(?m)^(\S+)/(\S+) (\S+)(?: \((.*)\))?$"); + +// risp-0.7.0: "^(\\s+|;.*?(\n|$))+" +consistent!(risp_0, "^(\\s+|;.*?(\n|$))+"); + +// risp-0.7.0: "^\".*?\"" +consistent!(risp_1, "^\".*?\""); + +// risp-0.7.0: r"^[^\s\{\}()\[\]]+" +consistent!(risp_2, r"^[^\s\{\}()\[\]]+"); + +// risp-0.7.0: r"^-?\d+" +consistent!(risp_3, r"^-?\d+"); + +// ripgrep-0.8.1: "^([0-9]+)([KMG])?$" +consistent!(ripgrep_0, "^([0-9]+)([KMG])?$"); + +// riquid-0.0.1: r"^\w+" +consistent!(riquid_0, r"^\w+"); + +// riquid-0.0.1: r"^\d+" +consistent!(riquid_1, r"^\d+"); + +// recursive_disassembler-2.1.2: r"\A(0x)?([a-fA-F0-9]+)\z" +consistent!(recursive_disassembler_0, r"\A(0x)?([a-fA-F0-9]+)\z"); + +// remake-0.1.0: r"^[a-zA-Z_][a-zA-Z0-9_]*" +consistent!(remake_0, r"^[a-zA-Z_][a-zA-Z0-9_]*"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_0, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_1, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_2, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_3, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_4, 
r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_5, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{2})\)" +consistent!(regex_decode_6, r"'(?P<title>[^']+)'\s+\((?P<year>\d{2})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_7, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_8, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_9, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_10, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_11, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_12, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_13, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-cache-0.2.0: "[0-9]{3}-[0-9]{3}-[0-9]{4}" +consistent!(regex_cache_0, "[0-9]{3}-[0-9]{3}-[0-9]{4}"); + +// regex-cache-0.2.0: r"^\d+$" +consistent!(regex_cache_1, r"^\d+$"); + +// regex-cache-0.2.0: r"^[a-z]+$" +consistent!(regex_cache_2, r"^[a-z]+$"); + +// regex-cache-0.2.0: r"^\d+$" +consistent!(regex_cache_3, r"^\d+$"); + +// regex-cache-0.2.0: r"^\d+$" +consistent!(regex_cache_4, r"^\d+$"); + +// regex_dfa-0.5.0: r"\d{4}-\d{2}-\d{2}" +consistent!(regex_dfa_0, r"\d{4}-\d{2}-\d{2}"); + +// reaper-2.0.0: r"^[0-9\p{L} _\\.]{3,16}$" +consistent!(reaper_0, r"^[0-9\p{L} _\\.]{3,16}$"); + +// retdec-0.1.0: 
r"^attachment; filename=(.+)$" +consistent!(retdec_0, r"^attachment; filename=(.+)$"); + +// renvsubst-0.1.2: r"(\\)(?P<head>\$[0-9A-Za-z_{])" +consistent!(renvsubst_0, r"(\\)(?P<head>\$[0-9A-Za-z_{])"); + +// renvsubst-0.1.2: r"\$([[:word:]]+)" +consistent!(renvsubst_1, r"\$([[:word:]]+)"); + +// renvsubst-0.1.2: r"\$\{([[:word:]]+)\}" +consistent!(renvsubst_2, r"\$\{([[:word:]]+)\}"); + +// rexpect-0.3.0: r"'[a-z]+'" +consistent!(rexpect_0, r"'[a-z]+'"); + +// rexpect-0.3.0: r"^\d{4}-\d{2}-\d{2}$" +consistent!(rexpect_1, r"^\d{4}-\d{2}-\d{2}$"); + +// rexpect-0.3.0: r"-\d{2}-" +consistent!(rexpect_2, r"-\d{2}-"); + +// luther-0.1.0: "^a(b|c)c*$" +consistent!(luther_0, "^a(b|c)c*$"); + +// little_boxes-1.6.0: r"(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]" +consistent!(little_boxes_0, r"(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]"); + +// libimagentrytag-0.8.0: "^[a-zA-Z]([a-zA-Z0-9_-]*)$" +consistent!(libimagentrytag_0, "^[a-zA-Z]([a-zA-Z0-9_-]*)$"); + +// libimaginteraction-0.8.0: r"^[Yy](\n?)$" +consistent!(libimaginteraction_0, r"^[Yy](\n?)$"); + +// libimaginteraction-0.8.0: r"^[Nn](\n?)$" +consistent!(libimaginteraction_1, r"^[Nn](\n?)$"); + +// libimagutil-0.8.0: "^(?P<KEY>([^=]*))=(.*)$" +consistent!(libimagutil_0, "^(?P<KEY>([^=]*))=(.*)$"); + +// libimagutil-0.8.0: "(.*)=(\"(?P<QVALUE>([^\"]*))\"|(?P<VALUE>(.*)))$" +consistent!(libimagutil_1, "(.*)=(\"(?P<QVALUE>([^\"]*))\"|(?P<VALUE>(.*)))$"); + +// linux_ip-0.1.0: r"\s+" +consistent!(linux_ip_0, r"\s+"); + +// linux_ip-0.1.0: r"\s*[\n\r]+\s*" +consistent!(linux_ip_1, r"\s*[\n\r]+\s*"); + +// linux_ip-0.1.0: r"^([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$" +consistent!(linux_ip_2, r"^([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$"); + +// linux_ip-0.1.0: r"^([0-9a-fA-F\.:/]+|default)\s+via\s+([a-z0-9\.:]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$" +consistent!(linux_ip_3, r"^([0-9a-fA-F\.:/]+|default)\s+via\s+([a-z0-9\.:]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$"); + +// linux_ip-0.1.0: r"^(blackhole)\s+([0-9a-fA-F\.:/]+)$" 
+consistent!(linux_ip_4, r"^(blackhole)\s+([0-9a-fA-F\.:/]+)$"); + +// linux_ip-0.1.0: r"^(unreachable)\s+([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s+(.*)$" +consistent!( + linux_ip_5, + r"^(unreachable)\s+([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s+(.*)$" +); + +// linux_ip-0.1.0: r"\s*[\n\r]+\s*" +consistent!(linux_ip_6, r"\s*[\n\r]+\s*"); + +// linux_ip-0.1.0: r"^\d+:\s+([a-zA-Z0-9\.-]+)(@\S+)*:\s+(.*)$" +consistent!(linux_ip_7, r"^\d+:\s+([a-zA-Z0-9\.-]+)(@\S+)*:\s+(.*)$"); + +// linux_ip-0.1.0: r"\s*link/ether\s+([a-f0-9:]+)\s+.*" +consistent!(linux_ip_8, r"\s*link/ether\s+([a-f0-9:]+)\s+.*"); + +// linux_ip-0.1.0: r"\s*inet[6]*\s+([0-9a-f:\./]+)\s+.*" +consistent!(linux_ip_9, r"\s*inet[6]*\s+([0-9a-f:\./]+)\s+.*"); + +// linky-0.1.4: r"[^\w -]" +consistent!(linky_0, r"[^\w -]"); + +// linky-0.1.4: r"^(.*):(\d+): [^ ]* ([^ ]*)$" +consistent!(linky_1, r"^(.*):(\d+): [^ ]* ([^ ]*)$"); + +// limonite-0.2.1: r"^(\d{4}-\d{2}-\d{2})-(\d{3})-(.+)$" +consistent!(limonite_0, r"^(\d{4}-\d{2}-\d{2})-(\d{3})-(.+)$"); + +// process-queue-0.1.1: r"^[a-zA-Z]+$" +consistent!(process_queue_0, r"^[a-zA-Z]+$"); + +// pronghorn-0.1.2: r"^\{([a-zA-Z_]+)\}$" +consistent!(pronghorn_0, r"^\{([a-zA-Z_]+)\}$"); + +// protocol-ftp-client-0.1.1: "(?m:^(\\d{3}) (.+)\r$)" +consistent!(protocol_ftp_client_0, "(?m:^(\\d{3}) (.+)\r$)"); + +// protocol-ftp-client-0.1.1: "\"(.+)\"" +consistent!(protocol_ftp_client_1, "\"(.+)\""); + +// protocol-ftp-client-0.1.1: "(\\w+) [Tt]ype: (\\w+)" +consistent!(protocol_ftp_client_2, "(\\w+) [Tt]ype: (\\w+)"); + +// protocol-ftp-client-0.1.1: "(?m:^(\\d{3})-.+\r$)" +consistent!(protocol_ftp_client_3, "(?m:^(\\d{3})-.+\r$)"); + +// protocol-ftp-client-0.1.1: "Entering Passive Mode \\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+),(\\d+)\\)" +consistent!( + protocol_ftp_client_4, + "Entering Passive Mode \\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+),(\\d+)\\)" +); + +// protocol-ftp-client-0.1.1: "(?m:^(.+)\r$)" +consistent!(protocol_ftp_client_5, "(?m:^(.+)\r$)"); + +// 
protocol-ftp-client-0.1.1: "^([d-])(?:[rwx-]{3}){3} +\\d+ +\\w+ +\\w+ +(\\d+) +(.+) +(.+)$" +consistent!( + protocol_ftp_client_6, + "^([d-])(?:[rwx-]{3}){3} +\\d+ +\\w+ +\\w+ +(\\d+) +(.+) +(.+)$" +); + +// article-date-extractor-0.1.1: r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})" +consistent!(article_date_extractor_0, r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})"); + +// article-date-extractor-0.1.1: r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date" +consistent!( + article_date_extractor_1, + r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date" +); + +// arthas_plugin-0.1.1: r"type\((.*)\)" +consistent!(arthas_plugin_0, r"type\((.*)\)"); + +// arthas_plugin-0.1.1: r"Vec<(.*)>" +consistent!(arthas_plugin_1, r"Vec<(.*)>"); + +// arthas_plugin-0.1.1: r"Option<(.*)>" +consistent!(arthas_plugin_2, r"Option<(.*)>"); + +// arthas_plugin-0.1.1: r"HashMap<[a-z0-9A-Z]+, *(.*)>" +consistent!(arthas_plugin_3, r"HashMap<[a-z0-9A-Z]+, *(.*)>"); + +// arthas_derive-0.1.0: "Vec *< *(.*) *>" +consistent!(arthas_derive_0, "Vec *< *(.*) *>"); + +// arthas_derive-0.1.0: r"Option *< *(.*) *>" +consistent!(arthas_derive_1, r"Option *< *(.*) *>"); + +// arthas_derive-0.1.0: r"HashMap *< *[a-z0-9A-Z]+ *, *(.*) *>" +consistent!(arthas_derive_2, r"HashMap *< *[a-z0-9A-Z]+ *, *(.*) *>"); + +// arpabet-0.2.0: r"^([\w\-\(\)\.']+)\s+([^\s].*)\s*$" +consistent!(arpabet_0, r"^([\w\-\(\)\.']+)\s+([^\s].*)\s*$"); + +// arpabet-0.2.0: r"^;;;\s+" +consistent!(arpabet_1, r"^;;;\s+"); + +// glossy_codegen-0.2.0: r"/\*.*?\*/|//.*" +consistent!(glossy_codegen_0, r"/\*.*?\*/|//.*"); + +// glossy_codegen-0.2.0: "^\\s*#\\s*include\\s+<([:print:]+)>\\s*$" +consistent!(glossy_codegen_1, "^\\s*#\\s*include\\s+<([:print:]+)>\\s*$"); + +// glossy_codegen-0.2.0: "^\\s*#\\s*include\\s+\"([:print:]+)\"\\s*$" +consistent!(glossy_codegen_2, 
"^\\s*#\\s*include\\s+\"([:print:]+)\"\\s*$"); + +// glossy_codegen-0.2.0: r"^\s*#\s*version\s+(\d+)" +consistent!(glossy_codegen_3, r"^\s*#\s*version\s+(\d+)"); + +// glossy_codegen-0.2.0: r"^\s*$" +consistent!(glossy_codegen_4, r"^\s*$"); + +// gluster-1.0.1: r"(?P<addr>via \S+)" +consistent!(gluster_0, r"(?P<addr>via \S+)"); + +// gluster-1.0.1: r"(?P<src>src \S+)" +consistent!(gluster_1, r"(?P<src>src \S+)"); + +// gl_helpers-0.1.7: r"(.*)\[\d+\]" +consistent!(gl_helpers_0, r"(.*)\[\d+\]"); + +// gl_helpers-0.1.7: r"(\d+).(\d+)" +consistent!(gl_helpers_1, r"(\d+).(\d+)"); + +// glr-parser-0.0.1: r"(?P<c>[\\\.\+\*\?\(\)\|\[\]\{\}\^\$])" +consistent!(glr_parser_0, r"(?P<c>[\\\.\+\*\?\(\)\|\[\]\{\}\^\$])"); + +// glr-parser-0.0.1: r"^\w+$" +consistent!(glr_parser_1, r"^\w+$"); + +// glr-parser-0.0.1: "'[^']+'" +consistent!(glr_parser_2, "'[^']+'"); + +// hoodlum-0.5.0: r"(?m)//.*" +consistent!(hoodlum_0, r"(?m)//.*"); + +// form-checker-0.2.2: r"^1\d{10}$" +consistent!(form_checker_0, r"^1\d{10}$"); + +// form-checker-0.2.2: r"(?i)^[\w.%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$" +consistent!(form_checker_1, r"(?i)^[\w.%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$"); + +// wikibase-0.2.0: r"(?P<user_agent>[a-zA-Z0-9-_]+/[0-9\.]+)" +consistent!(wikibase_0, r"(?P<user_agent>[a-zA-Z0-9-_]+/[0-9\.]+)"); + +// wifiscanner-0.3.6: r"Cell [0-9]{2,} - Address:" +consistent!(wifiscanner_0, r"Cell [0-9]{2,} - Address:"); + +// wifiscanner-0.3.6: r"([0-9a-zA-Z]{1}[0-9a-zA-Z]{1}[:]{1}){5}[0-9a-zA-Z]{1}[0-9a-zA-Z]{1}" +consistent!( + wifiscanner_1, + r"([0-9a-zA-Z]{1}[0-9a-zA-Z]{1}[:]{1}){5}[0-9a-zA-Z]{1}[0-9a-zA-Z]{1}" +); + +// wifiscanner-0.3.6: r"Signal level=(\d+)/100" +consistent!(wifiscanner_2, r"Signal level=(\d+)/100"); + +// bbcode-1.0.2: r"(?s)\[b\](.*?)\[/b\]" +consistent!(bbcode_0, r"(?s)\[b\](.*?)\[/b\]"); + +// bbcode-1.0.2: r"(?s)\[i\](.*?)\[/i\]" +consistent!(bbcode_1, r"(?s)\[i\](.*?)\[/i\]"); + +// bbcode-1.0.2: r"(?s)\[u\](.*?)\[/u\]" +consistent!(bbcode_2, 
r"(?s)\[u\](.*?)\[/u\]"); + +// bbcode-1.0.2: r"(?s)\[s\](.*?)\[/s\]" +consistent!(bbcode_3, r"(?s)\[s\](.*?)\[/s\]"); + +// bbcode-1.0.2: r"(?s)\[size=(\d+)](.*?)\[/size\]" +consistent!(bbcode_4, r"(?s)\[size=(\d+)](.*?)\[/size\]"); + +// bbcode-1.0.2: r"(?s)\[color=(.+)](.*?)\[/color\]" +consistent!(bbcode_5, r"(?s)\[color=(.+)](.*?)\[/color\]"); + +// bbcode-1.0.2: r"(?s)\[center\](.*?)\[/center\]" +consistent!(bbcode_6, r"(?s)\[center\](.*?)\[/center\]"); + +// bbcode-1.0.2: r"(?s)\[left\](.*?)\[/left\]" +consistent!(bbcode_7, r"(?s)\[left\](.*?)\[/left\]"); + +// bbcode-1.0.2: r"(?s)\[right\](.*?)\[/right\]" +consistent!(bbcode_8, r"(?s)\[right\](.*?)\[/right\]"); + +// bbcode-1.0.2: r"(?s)\[table\](.*?)\[/table\]" +consistent!(bbcode_9, r"(?s)\[table\](.*?)\[/table\]"); + +// bbcode-1.0.2: r"(?s)\[td\](.*?)\[/td\]" +consistent!(bbcode_10, r"(?s)\[td\](.*?)\[/td\]"); + +// bbcode-1.0.2: r"(?s)\[tr\](.*?)\[/tr\]" +consistent!(bbcode_11, r"(?s)\[tr\](.*?)\[/tr\]"); + +// bbcode-1.0.2: r"(?s)\[th\](.*?)\[/th\]" +consistent!(bbcode_12, r"(?s)\[th\](.*?)\[/th\]"); + +// bbcode-1.0.2: r"(?s)\[url\](.*?)\[/url\]" +consistent!(bbcode_13, r"(?s)\[url\](.*?)\[/url\]"); + +// bbcode-1.0.2: r"(?s)\[url=(.+)\](.*?)\[/url\]" +consistent!(bbcode_14, r"(?s)\[url=(.+)\](.*?)\[/url\]"); + +// bbcode-1.0.2: r"(?s)\[quote\](.*?)\[/quote\]" +consistent!(bbcode_15, r"(?s)\[quote\](.*?)\[/quote\]"); + +// bbcode-1.0.2: r"(?s)\[quote=(.+)\](.*?)\[/quote\]" +consistent!(bbcode_16, r"(?s)\[quote=(.+)\](.*?)\[/quote\]"); + +// bbcode-1.0.2: r"(?s)\[img=(\d+)x(\d+)(\b.*)?\](.*?)\[/img\]" +consistent!(bbcode_17, r"(?s)\[img=(\d+)x(\d+)(\b.*)?\](.*?)\[/img\]"); + +// bbcode-1.0.2: r"(?s)\[img=(.+)(\b.*)?\](.*?)\[/img\]" +consistent!(bbcode_18, r"(?s)\[img=(.+)(\b.*)?\](.*?)\[/img\]"); + +// bbcode-1.0.2: r"(?s)\[img(\b.*)?\](.*?)\[/img\]" +consistent!(bbcode_19, r"(?s)\[img(\b.*)?\](.*?)\[/img\]"); + +// bbcode-1.0.2: r"(?s)\[ol\](.*?)\[/ol\]" +consistent!(bbcode_20, 
r"(?s)\[ol\](.*?)\[/ol\]"); + +// bbcode-1.0.2: r"(?s)\[ul\](.*?)\[/ul\]" +consistent!(bbcode_21, r"(?s)\[ul\](.*?)\[/ul\]"); + +// bbcode-1.0.2: r"(?s)\[list\](.*?)\[/list\]" +consistent!(bbcode_22, r"(?s)\[list\](.*?)\[/list\]"); + +// bbcode-1.0.2: r"(?s)\[youtube\](.*?)\[/youtube\]" +consistent!(bbcode_23, r"(?s)\[youtube\](.*?)\[/youtube\]"); + +// bbcode-1.0.2: r"(?s)\[youtube=(\d+)x(\d+)\](.*?)\[/youtube\]" +consistent!(bbcode_24, r"(?s)\[youtube=(\d+)x(\d+)\](.*?)\[/youtube\]"); + +// bbcode-1.0.2: r"(?s)\[li\](.*?)\[/li\]" +consistent!(bbcode_25, r"(?s)\[li\](.*?)\[/li\]"); + +// block-utils-0.5.0: r"loop\d+" +consistent!(block_utils_0, r"loop\d+"); + +// block-utils-0.5.0: r"ram\d+" +consistent!(block_utils_1, r"ram\d+"); + +// block-utils-0.5.0: r"md\d+" +consistent!(block_utils_2, r"md\d+"); + +// kvvliveapi-0.1.0: r"^([1-9]) min$" +consistent!(kvvliveapi_0, r"^([1-9]) min$"); + +// rfc822_sanitizer-0.3.3: r"(\d{2}):(\d{2}):(\d{2})" +consistent!(rfc822_sanitizer_0, r"(\d{2}):(\d{2}):(\d{2})"); + +// rfc822_sanitizer-0.3.3: r"(\d{1,2}):(\d{1,2}):(\d{1,2})" +consistent!(rfc822_sanitizer_1, r"(\d{1,2}):(\d{1,2}):(\d{1,2})"); + +// faker-0.0.4: r"[2-9]" +consistent!(faker_0, r"[2-9]"); + +// faker-0.0.4: r"[1-9]" +consistent!(faker_1, r"[1-9]"); + +// faker-0.0.4: r"[0-9]" +consistent!(faker_2, r"[0-9]"); + +// faker-0.0.4: r"\d{10}" +consistent!(faker_3, r"\d{10}"); + +// faker-0.0.4: r"\d{1}" +consistent!(faker_4, r"\d{1}"); + +// faker-0.0.4: r"^\w+" +consistent!(faker_5, r"^\w+"); + +// faker-0.0.4: r"^\w+" +consistent!(faker_6, r"^\w+"); + +// faker-0.0.4: r"^(\w+\.? ?){2,3}$" +consistent!(faker_7, r"^(\w+\.? 
?){2,3}$"); + +// faker-0.0.4: r"^[A-Z][a-z]+\.?$" +consistent!(faker_8, r"^[A-Z][a-z]+\.?$"); + +// faker-0.0.4: r"^[A-Z][A-Za-z]*\.?$" +consistent!(faker_9, r"^[A-Z][A-Za-z]*\.?$"); + +// faker-0.0.4: r"http://lorempixel.com/100/100/\w+" +consistent!(faker_10, r"http://lorempixel.com/100/100/\w+"); + +// faker-0.0.4: r"http://lorempixel.com/100/100/cats" +consistent!(faker_11, r"http://lorempixel.com/100/100/cats"); + +// fancy-regex-0.1.0: "(?i:ß)" +consistent!(fancy_regex_0, "(?i:ß)"); + +// fancy-regex-0.1.0: "(?i:\\x{0587})" +consistent!(fancy_regex_1, "(?i:\\x{0587})"); + +// fancy-regex-0.1.0: "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})" +consistent!(fancy_regex_2, "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})"); + +// fancy-prompt-0.1.5: r"/([^/])[^/]+/" +consistent!(fancy_prompt_0, r"/([^/])[^/]+/"); + +// fancy-prompt-0.1.5: r"^([^:]+):.*?(?::([^:]+))?$" +consistent!(fancy_prompt_1, r"^([^:]+):.*?(?::([^:]+))?$"); + +// fanta-0.2.0: r"^(/?__\w+__)/(.*)" +consistent!(fanta_0, r"^(/?__\w+__)/(.*)"); + +// fanta-cli-0.1.1: r"(.)([A-Z])" +consistent!(fanta_cli_0, r"(.)([A-Z])"); + +// fanta-cli-0.1.1: "\\{:[^\\s]+\\}" +consistent!(fanta_cli_1, "\\{:[^\\s]+\\}"); + +// amethyst_tools-0.7.1: "(?P<last>[^\r])\n" +consistent!(amethyst_tools_0, "(?P<last>[^\r])\n"); + +// amigo-0.3.1: r"^-?\d+(\.\d)?" +consistent!(amigo_0, r"^-?\d+(\.\d)?"); + +// amigo-0.3.1: r"^[a-zA-Z_]+[\w-]*[!?_]?" 
+consistent!(amigo_1, r"^[a-zA-Z_]+[\w-]*[!?_]?"); + +// amigo-0.3.1: r"^\(" +consistent!(amigo_2, r"^\("); + +// amigo-0.3.1: r"^\)" +consistent!(amigo_3, r"^\)"); + +// amigo-0.3.1: r"^\s+" +consistent!(amigo_4, r"^\s+"); + +// ethcore-logger-1.12.0: "\x1b\\[[^m]+m" +consistent!(ethcore_logger_0, "\x1b\\[[^m]+m"); + +// dash2html-1.0.1: r"__.*?__" +consistent!(dash2html_0, r"__.*?__"); + +// dash2html-1.0.1: r"(?i)@(?:time|clipboard|cursor|date)" +consistent!(dash2html_1, r"(?i)@(?:time|clipboard|cursor|date)"); + +// os_type-2.0.0: r"^Microsoft Windows \[Version\s(\d+\.\d+\.\d+)\]$" +consistent!(os_type_0, r"^Microsoft Windows \[Version\s(\d+\.\d+\.\d+)\]$"); + +// os_type-2.0.0: r"ProductName:\s([\w\s]+)\n" +consistent!(os_type_1, r"ProductName:\s([\w\s]+)\n"); + +// os_type-2.0.0: r"ProductVersion:\s(\w+\.\w+\.\w+)" +consistent!(os_type_2, r"ProductVersion:\s(\w+\.\w+\.\w+)"); + +// os_type-2.0.0: r"BuildVersion:\s(\w+)" +consistent!(os_type_3, r"BuildVersion:\s(\w+)"); + +// os_type-2.0.0: r"(\w+) Linux release" +consistent!(os_type_4, r"(\w+) Linux release"); + +// os_type-2.0.0: r"release\s([\w\.]+)" +consistent!(os_type_5, r"release\s([\w\.]+)"); + +// os_type-2.0.0: r"Distributor ID:\s(\w+)" +consistent!(os_type_6, r"Distributor ID:\s(\w+)"); + +// os_type-2.0.0: r"Release:\s([\w\.]+)" +consistent!(os_type_7, r"Release:\s([\w\.]+)"); + +// bindgen-0.37.0: r"typename type\-parameter\-\d+\-\d+::.+" +consistent!(bindgen_0, r"typename type\-parameter\-\d+\-\d+::.+"); + +// imap-0.8.1: "^+(.*)\r\n" +consistent!(imap_0, "^+(.*)\r\n"); + +// image-base64-0.1.0: r"^ffd8ffe0" +consistent!(image_base64_0, r"^ffd8ffe0"); + +// image-base64-0.1.0: r"^89504e47" +consistent!(image_base64_1, r"^89504e47"); + +// image-base64-0.1.0: r"^47494638" +consistent!(image_base64_2, r"^47494638"); + +// json-pointer-0.3.2: "^(/([^/~]|~[01])*)*$" +consistent!(json_pointer_0, "^(/([^/~]|~[01])*)*$"); + +// json-pointer-0.3.2: "^#(/([^/~%]|~[01]|%[0-9a-fA-F]{2})*)*$" 
+consistent!(json_pointer_1, "^#(/([^/~%]|~[01]|%[0-9a-fA-F]{2})*)*$"); + +// mysql_common-0.7.0: r"^5.5.5-(\d{1,2})\.(\d{1,2})\.(\d{1,3})-MariaDB" +consistent!(mysql_common_0, r"^5.5.5-(\d{1,2})\.(\d{1,2})\.(\d{1,3})-MariaDB"); + +// mysql_common-0.7.0: r"^(\d{1,2})\.(\d{1,2})\.(\d{1,3})(.*)" +consistent!(mysql_common_1, r"^(\d{1,2})\.(\d{1,2})\.(\d{1,3})(.*)"); + +// government_id-0.1.0: r"^[0-9]{4}[0-9A-Z]{2}[0-9]{3}$" +consistent!(government_id_0, r"^[0-9]{4}[0-9A-Z]{2}[0-9]{3}$"); + +// ohmers-0.1.1: r"UniqueIndexViolation: (\w+)" +consistent!(ohmers_0, r"UniqueIndexViolation: (\w+)"); + +// eliza-1.0.0: r"(.*) you are (.*)" +consistent!(eliza_0, r"(.*) you are (.*)"); + +// eliza-1.0.0: r"(.*) you are (.*)" +consistent!(eliza_1, r"(.*) you are (.*)"); + +// eliza-1.0.0: r"(.*) you are (.*)" +consistent!(eliza_2, r"(.*) you are (.*)"); + +// chema-0.0.5: "^\\s*\\*" +consistent!(chema_0, "^\\s*\\*"); + +// chema-0.0.5: "^\\s*@(\\w+)\\s+(.*)" +consistent!(chema_1, "^\\s*@(\\w+)\\s+(.*)"); + +// chord3-0.3.0: r"^\s*#" +consistent!(chord3_0, r"^\s*#"); + +// chord3-0.3.0: r"\{(?P<cmd>\w+)(?::?\s*(?P<arg>.*))?\}" +consistent!(chord3_1, r"\{(?P<cmd>\w+)(?::?\s*(?P<arg>.*))?\}"); + +// chord3-0.3.0: r"\{(eot|end_of_tab):?\s*" +consistent!(chord3_2, r"\{(eot|end_of_tab):?\s*"); + +// chord3-0.3.0: r"([^\[]*)(?:\[([^\]]*)\])?" 
+consistent!(chord3_3, r"([^\[]*)(?:\[([^\]]*)\])?"); + +// checkmail-0.1.1: "^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$" +consistent!(checkmail_0, "^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$"); + +// cntk-0.2.1: r"\b\w\w+\b" +consistent!(cntk_0, r"\b\w\w+\b"); + +// cntk-0.2.1: r"\b\w\w+\b" +consistent!(cntk_1, r"\b\w\w+\b"); + +// cniguru-0.1.0: r"\(id: (\d+)\)" +consistent!(cniguru_0, r"\(id: (\d+)\)"); + +// upm_lib-0.3.0: r"^(\d+)\.(\d+)\.(\d+)(?:-([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?(?:\+([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?$" +consistent!(upm_lib_0, r"^(\d+)\.(\d+)\.(\d+)(?:-([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?(?:\+([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?$"); + +// avro-0.2.1: r"^\s*(\*+(\s+))?" +consistent!(avro_0, r"^\s*(\*+(\s+))?"); + +// avro-0.2.1: r"^\s*(\*+)?" +consistent!(avro_1, r"^\s*(\*+)?"); + +// nomi-0.0.2: "[0-9]+" +consistent!(nomi_0, "[0-9]+"); + +// nodes-0.1.0: "([0-9]+)@(?:nodes|n)?:([^@]+)?" +consistent!(nodes_0, "([0-9]+)@(?:nodes|n)?:([^@]+)?"); + +// not-stakkr-1.0.0: r"(?i)in (\d+) (second|minute|hour|day|week)s?" 
+consistent!(not_stakkr_0, r"(?i)in (\d+) (second|minute|hour|day|week)s?"); + +// notetxt-0.0.1: "^([A-Za-z0-9 -_:]+)\n-+\n" +consistent!(notetxt_0, "^([A-Za-z0-9 -_:]+)\n-+\n"); + +// nail-0.1.0-pre.0: r"^-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?$" +consistent!(nail_0, r"^-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?$"); + +// nail-0.1.0-pre.0: r"^-?[0-9]+$" +consistent!(nail_1, r"^-?[0-9]+$"); + +// askalono-0.2.0: r"[^\w\s\pP]+" +consistent!(askalono_0, r"[^\w\s\pP]+"); + +// askalono-0.2.0: r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+" +consistent!(askalono_1, r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+"); + +// askalono-0.2.0: r"\p{Pd}+" +consistent!(askalono_2, r"\p{Pd}+"); + +// askalono-0.2.0: r"\p{Ps}+" +consistent!(askalono_3, r"\p{Ps}+"); + +// askalono-0.2.0: r"\p{Pe}+" +consistent!(askalono_4, r"\p{Pe}+"); + +// askalono-0.2.0: r"\p{Pc}+" +consistent!(askalono_5, r"\p{Pc}+"); + +// askalono-0.2.0: r"[©Ⓒⓒ]" +consistent!(askalono_6, r"[©Ⓒⓒ]"); + +// askalono-0.2.0: r"[\r\n\v\f]" +consistent!(askalono_7, r"[\r\n\v\f]"); + +// askalono-0.2.0: r"\n{3,}" +consistent!(askalono_8, r"\n{3,}"); + +// askalono-0.2.0: r"[^\w\s]+" +consistent!(askalono_9, r"[^\w\s]+"); + +// askalono-0.2.0: r"\s+" +consistent!(askalono_10, r"\s+"); + +// assembunny_plus-0.0.3: r"[^0-9a-zA-Z_]" +consistent!(assembunny_plus_0, r"[^0-9a-zA-Z_]"); + +// assembunny_plus-0.0.3: r"[0-9]" +consistent!(assembunny_plus_1, r"[0-9]"); + +// salt-compressor-0.4.0: r"(?m)^Minion (\S*) did not respond\. No job will be sent\.$" +consistent!( + salt_compressor_0, + r"(?m)^Minion (\S*) did not respond\. 
No job will be sent\.$" +); + +// sabisabi-0.4.1: r"</?[^>]+?>" +consistent!(sabisabi_0, r"</?[^>]+?>"); + +// sabisabi-0.4.1: r"\([^)]*\)" +consistent!(sabisabi_1, r"\([^)]*\)"); + +// sassers-0.13.5-h28: "@import \"([^\"]*)\";" +consistent!(sassers_0, "@import \"([^\"]*)\";"); + +// shadowsocks-0.6.2: r"[A-Za-z\d-]{1,63}$" +consistent!(shadowsocks_0, r"[A-Za-z\d-]{1,63}$"); + +// shkeleton-0.1.5: "[abc]+" +consistent!(shkeleton_0, "[abc]+"); + +// shellwords-0.1.0: r"([^A-Za-z0-9_\-.,:/@\n])" +consistent!(shellwords_0, r"([^A-Za-z0-9_\-.,:/@\n])"); + +// shellwords-0.1.0: r"\n" +consistent!(shellwords_1, r"\n"); + +// shush-0.1.5: "(?P<num>[0-9]+)(?P<units>[dhms])" +consistent!(shush_0, "(?P<num>[0-9]+)(?P<units>[dhms])"); + +// woothee-0.8.0: r"(?:Chrome|CrMo|CriOS)/([.0-9]+)" +consistent!(woothee_0, r"(?:Chrome|CrMo|CriOS)/([.0-9]+)"); + +// woothee-0.8.0: r"Vivaldi/([.0-9]+)" +consistent!(woothee_1, r"Vivaldi/([.0-9]+)"); + +// woothee-0.8.0: r"Firefox/([.0-9]+)" +consistent!(woothee_2, r"Firefox/([.0-9]+)"); + +// woothee-0.8.0: r"^Mozilla/[.0-9]+ \((?:Mobile|Tablet);(?:.*;)? rv:([.0-9]+)\) Gecko/[.0-9]+ Firefox/[.0-9]+$" +consistent!(woothee_3, r"^Mozilla/[.0-9]+ \((?:Mobile|Tablet);(?:.*;)? 
rv:([.0-9]+)\) Gecko/[.0-9]+ Firefox/[.0-9]+$"); + +// woothee-0.8.0: r"FxiOS/([.0-9]+)" +consistent!(woothee_4, r"FxiOS/([.0-9]+)"); + +// woothee-0.8.0: r"\(([^;)]+);FOMA;" +consistent!(woothee_5, r"\(([^;)]+);FOMA;"); + +// woothee-0.8.0: r"jig browser[^;]+; ([^);]+)" +consistent!(woothee_6, r"jig browser[^;]+; ([^);]+)"); + +// woothee-0.8.0: r"(?i)rss(?:reader|bar|[-_ /;()]|[ +]*/)" +consistent!(woothee_7, r"(?i)rss(?:reader|bar|[-_ /;()]|[ +]*/)"); + +// woothee-0.8.0: r"(?i)(?:bot|crawler|spider)(?:[-_ ./;@()]|$)" +consistent!(woothee_8, r"(?i)(?:bot|crawler|spider)(?:[-_ ./;@()]|$)"); + +// woothee-0.8.0: r"(?i)(?:feed|web) ?parser" +consistent!(woothee_9, r"(?i)(?:feed|web) ?parser"); + +// woothee-0.8.0: r"(?i)watch ?dog" +consistent!(woothee_10, r"(?i)watch ?dog"); + +// woothee-0.8.0: r"Edge/([.0-9]+)" +consistent!(woothee_11, r"Edge/([.0-9]+)"); + +// woothee-0.8.0: r"MSIE ([.0-9]+);" +consistent!(woothee_12, r"MSIE ([.0-9]+);"); + +// woothee-0.8.0: r"Version/([.0-9]+)" +consistent!(woothee_13, r"Version/([.0-9]+)"); + +// woothee-0.8.0: r"Opera[/ ]([.0-9]+)" +consistent!(woothee_14, r"Opera[/ ]([.0-9]+)"); + +// woothee-0.8.0: r"OPR/([.0-9]+)" +consistent!(woothee_15, r"OPR/([.0-9]+)"); + +// woothee-0.8.0: r"Version/([.0-9]+)" +consistent!(woothee_16, r"Version/([.0-9]+)"); + +// woothee-0.8.0: r"(?:SoftBank|Vodafone|J-PHONE)/[.0-9]+/([^ /;()]+)" +consistent!(woothee_17, r"(?:SoftBank|Vodafone|J-PHONE)/[.0-9]+/([^ /;()]+)"); + +// woothee-0.8.0: r"Trident/([.0-9]+);" +consistent!(woothee_18, r"Trident/([.0-9]+);"); + +// woothee-0.8.0: r" rv:([.0-9]+)" +consistent!(woothee_19, r" rv:([.0-9]+)"); + +// woothee-0.8.0: r"IEMobile/([.0-9]+);" +consistent!(woothee_20, r"IEMobile/([.0-9]+);"); + +// woothee-0.8.0: r"(?:WILLCOM|DDIPOCKET);[^/]+/([^ /;()]+)" +consistent!(woothee_21, r"(?:WILLCOM|DDIPOCKET);[^/]+/([^ /;()]+)"); + +// woothee-0.8.0: r"Windows ([ .a-zA-Z0-9]+)[;\\)]" +consistent!(woothee_22, r"Windows ([ .a-zA-Z0-9]+)[;\\)]"); + +// 
woothee-0.8.0: r"^Phone(?: OS)? ([.0-9]+)" +consistent!(woothee_23, r"^Phone(?: OS)? ([.0-9]+)"); + +// woothee-0.8.0: r"iP(hone;|ad;|od) .*like Mac OS X" +consistent!(woothee_24, r"iP(hone;|ad;|od) .*like Mac OS X"); + +// woothee-0.8.0: r"Version/([.0-9]+)" +consistent!(woothee_25, r"Version/([.0-9]+)"); + +// woothee-0.8.0: r"rv:(\d+\.\d+\.\d+)" +consistent!(woothee_26, r"rv:(\d+\.\d+\.\d+)"); + +// woothee-0.8.0: r"FreeBSD ([^;\)]+);" +consistent!(woothee_27, r"FreeBSD ([^;\)]+);"); + +// woothee-0.8.0: r"CrOS ([^\)]+)\)" +consistent!(woothee_28, r"CrOS ([^\)]+)\)"); + +// woothee-0.8.0: r"Android[- ](\d+\.\d+(?:\.\d+)?)" +consistent!(woothee_29, r"Android[- ](\d+\.\d+(?:\.\d+)?)"); + +// woothee-0.8.0: r"PSP \(PlayStation Portable\); ([.0-9]+)\)" +consistent!(woothee_30, r"PSP \(PlayStation Portable\); ([.0-9]+)\)"); + +// woothee-0.8.0: r"PLAYSTATION 3;? ([.0-9]+)\)" +consistent!(woothee_31, r"PLAYSTATION 3;? ([.0-9]+)\)"); + +// woothee-0.8.0: r"PlayStation Vita ([.0-9]+)\)" +consistent!(woothee_32, r"PlayStation Vita ([.0-9]+)\)"); + +// woothee-0.8.0: r"PlayStation 4 ([.0-9]+)\)" +consistent!(woothee_33, r"PlayStation 4 ([.0-9]+)\)"); + +// woothee-0.8.0: r"BB10(?:.+)Version/([.0-9]+) " +consistent!(woothee_34, r"BB10(?:.+)Version/([.0-9]+) "); + +// woothee-0.8.0: r"BlackBerry(?:\d+)/([.0-9]+) " +consistent!(woothee_35, r"BlackBerry(?:\d+)/([.0-9]+) "); + +// woothee-0.8.0: r"; CPU(?: iPhone)? OS (\d+_\d+(?:_\d+)?) like Mac OS X" +consistent!( + woothee_36, + r"; CPU(?: iPhone)? OS (\d+_\d+(?:_\d+)?) 
like Mac OS X" +); + +// woothee-0.8.0: r"Mac OS X (10[._]\d+(?:[._]\d+)?)(?:\)|;)" +consistent!(woothee_37, r"Mac OS X (10[._]\d+(?:[._]\d+)?)(?:\)|;)"); + +// woothee-0.8.0: r"^(?:Apache-HttpClient/|Jakarta Commons-HttpClient/|Java/)" +consistent!( + woothee_38, + r"^(?:Apache-HttpClient/|Jakarta Commons-HttpClient/|Java/)" +); + +// woothee-0.8.0: r"[- ]HttpClient(/|$)" +consistent!(woothee_39, r"[- ]HttpClient(/|$)"); + +// woothee-0.8.0: r"^(?:PHP|WordPress|CakePHP|PukiWiki|PECL::HTTP)(?:/| |$)" +consistent!( + woothee_40, + r"^(?:PHP|WordPress|CakePHP|PukiWiki|PECL::HTTP)(?:/| |$)" +); + +// woothee-0.8.0: r"(?:PEAR HTTP_Request|HTTP_Request)(?: class|2)" +consistent!(woothee_41, r"(?:PEAR HTTP_Request|HTTP_Request)(?: class|2)"); + +// woothee-0.8.0: r"(?:Rome Client |UnwindFetchor/|ia_archiver |Summify |PostRank/)" +consistent!( + woothee_42, + r"(?:Rome Client |UnwindFetchor/|ia_archiver |Summify |PostRank/)" +); + +// woothee-0.8.0: r"Sleipnir/([.0-9]+)" +consistent!(woothee_43, r"Sleipnir/([.0-9]+)"); + +// word_replace-0.0.3: r"@@[a-z|A-Z|\d]+@@" +consistent!(word_replace_0, r"@@[a-z|A-Z|\d]+@@"); + +// wordcount-0.1.0: r"\w+" +consistent!(wordcount_0, r"\w+"); + +// just-0.3.12: "^([^=]+)=(.*)$" +consistent!(just_0, "^([^=]+)=(.*)$"); + +// emote-0.1.0: r":[a-zA-Z_]+?:" +consistent!(emote_0, r":[a-zA-Z_]+?:"); + +// emojicons-1.0.1: r":([a-zA-Z0-9_+-]+):" +consistent!(emojicons_0, r":([a-zA-Z0-9_+-]+):"); + +// git2_codecommit-0.1.2: r"git-codecommit\.([a-z0-9-]+)\.amazonaws\.com" +consistent!( + git2_codecommit_0, + r"git-codecommit\.([a-z0-9-]+)\.amazonaws\.com" +); + +// git-workarea-3.1.2: r"^submodule\.(?P<name>.*)\.(?P<key>[^=]*)=(?P<value>.*)$" +consistent!( + git_workarea_0, + r"^submodule\.(?P<name>.*)\.(?P<key>[^=]*)=(?P<value>.*)$" +); + +// git-shell-enforce-directory-1.0.0: r"^(?P<command>git-(?:receive|upload)-pack) '(?P<path>.+)'$" +consistent!( + git_shell_enforce_directory_0, + r"^(?P<command>git-(?:receive|upload)-pack) 
'(?P<path>.+)'$" +); + +// git-journal-1.6.3: r"[ \n]:(.*?):" +consistent!(git_journal_0, r"[ \n]:(.*?):"); + +// git-find-0.3.2: r"^git@(?P<host>[[:alnum:]\._-]+):(?P<path>[[:alnum:]\._\-/]+).git$" +consistent!( + git_find_0, + r"^git@(?P<host>[[:alnum:]\._-]+):(?P<path>[[:alnum:]\._\-/]+).git$" +); + +// gitlab-api-0.6.0: r"private_token=\w{20}" +consistent!(gitlab_api_0, r"private_token=\w{20}"); + +// td-client-0.7.0: "^(http://|https://)" +consistent!(td_client_0, "^(http://|https://)"); + +// karaconv-0.3.0: r"--(?P<type>[a-zA-Z]+)-- (?P<contents>.*)" +consistent!(karaconv_0, r"--(?P<type>[a-zA-Z]+)-- (?P<contents>.*)"); + +// katana-1.0.2: r"(?P<comp>et al\.)(?:\.)" +consistent!(katana_0, r"(?P<comp>et al\.)(?:\.)"); + +// katana-1.0.2: r"\.{3}" +consistent!(katana_1, r"\.{3}"); + +// katana-1.0.2: r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)" +consistent!(katana_2, r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)"); + +// katana-1.0.2: r"\s\.(?P<nums>[0-9]+)" +consistent!(katana_3, r"\s\.(?P<nums>[0-9]+)"); + +// katana-1.0.2: r"(?:[A-Za-z]\.){2,}" +consistent!(katana_4, r"(?:[A-Za-z]\.){2,}"); + +// katana-1.0.2: r"(?P<init>[A-Z])(?P<point>\.)" +consistent!(katana_5, r"(?P<init>[A-Z])(?P<point>\.)"); + +// katana-1.0.2: r"(?P<title>[A-Z][a-z]{1,3})(\.)" +consistent!(katana_6, r"(?P<title>[A-Z][a-z]{1,3})(\.)"); + +// katana-1.0.2: r"&==&(?P<p>[.!?])" +consistent!(katana_7, r"&==&(?P<p>[.!?])"); + +// katana-1.0.2: r"&\^&(?P<p>[.!?])" +consistent!(katana_8, r"&\^&(?P<p>[.!?])"); + +// katana-1.0.2: r"&\*\*&(?P<p>[.!?])" +consistent!(katana_9, r"&\*\*&(?P<p>[.!?])"); + +// katana-1.0.2: r"&=&(?P<p>[.!?])" +consistent!(katana_10, r"&=&(?P<p>[.!?])"); + +// katana-1.0.2: r"&##&(?P<p>[.!?])" +consistent!(katana_11, r"&##&(?P<p>[.!?])"); + +// katana-1.0.2: r"&\$&(?P<p>[.!?])" +consistent!(katana_12, r"&\$&(?P<p>[.!?])"); + +// kailua_syntax-1.1.0: r"@(?:_|\d+(?:/\d+(?:-\d+)?)?)" +consistent!(kailua_syntax_0, r"@(?:_|\d+(?:/\d+(?:-\d+)?)?)"); + +// kailua_syntax-1.1.0: 
r"<(\d+)>" +consistent!(kailua_syntax_1, r"<(\d+)>"); + +// ftp-3.0.1: r"\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\)" +consistent!(ftp_0, r"\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\)"); + +// ftp-3.0.1: r"\b(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})\b" +consistent!(ftp_1, r"\b(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})\b"); + +// ftp-3.0.1: r"\s+(\d+)\s*$" +consistent!(ftp_2, r"\s+(\d+)\s*$"); + +// vat-0.1.0: r"<countryCode>(.*?)</countryCode>" +consistent!(vat_0, r"<countryCode>(.*?)</countryCode>"); + +// vat-0.1.0: r"<vatNumber>(.*?)</vatNumber>" +consistent!(vat_1, r"<vatNumber>(.*?)</vatNumber>"); + +// vat-0.1.0: r"<name>(.*?)</name>" +consistent!(vat_2, r"<name>(.*?)</name>"); + +// vat-0.1.0: r"<address>(?s)(.*?)(?-s)</address>" +consistent!(vat_3, r"<address>(?s)(.*?)(?-s)</address>"); + +// vat-0.1.0: r"<valid>(true|false)</valid>" +consistent!(vat_4, r"<valid>(true|false)</valid>"); + +// vat-0.1.0: r"^ATU\d{8}$" +consistent!(vat_5, r"^ATU\d{8}$"); + +// vat-0.1.0: r"^BE0?\d{9, 10}$" +consistent!(vat_6, r"^BE0?\d{9, 10}$"); + +// vat-0.1.0: r"^BG\d{9,10}$" +consistent!(vat_7, r"^BG\d{9,10}$"); + +// vat-0.1.0: r"^HR\d{11}$" +consistent!(vat_8, r"^HR\d{11}$"); + +// vat-0.1.0: r"^CY\d{8}[A-Z]$" +consistent!(vat_9, r"^CY\d{8}[A-Z]$"); + +// vat-0.1.0: r"^CZ\d{8,10}$" +consistent!(vat_10, r"^CZ\d{8,10}$"); + +// vat-0.1.0: r"^DK\d{8}$" +consistent!(vat_11, r"^DK\d{8}$"); + +// vat-0.1.0: r"^EE\d{9}$" +consistent!(vat_12, r"^EE\d{9}$"); + +// vat-0.1.0: r"^FI\d{8}$" +consistent!(vat_13, r"^FI\d{8}$"); + +// vat-0.1.0: r"^FR[A-HJ-NP-Z0-9][A-HJ-NP-Z0-9]\d{9}$" +consistent!(vat_14, r"^FR[A-HJ-NP-Z0-9][A-HJ-NP-Z0-9]\d{9}$"); + +// vat-0.1.0: r"^DE\d{9}$" +consistent!(vat_15, r"^DE\d{9}$"); + +// vat-0.1.0: r"^EL\d{9}$" +consistent!(vat_16, r"^EL\d{9}$"); + +// vat-0.1.0: r"^HU\d{8}$" +consistent!(vat_17, r"^HU\d{8}$"); + +// vat-0.1.0: r"^IE\d[A-Z0-9\+\*]\d{5}[A-Z]{1,2}$" +consistent!(vat_18, r"^IE\d[A-Z0-9\+\*]\d{5}[A-Z]{1,2}$"); + +// vat-0.1.0: r"^IT\d{11}$" 
+consistent!(vat_19, r"^IT\d{11}$"); + +// vat-0.1.0: r"^LV\d{11}$" +consistent!(vat_20, r"^LV\d{11}$"); + +// vat-0.1.0: r"^LT(\d{9}|\d{12})$" +consistent!(vat_21, r"^LT(\d{9}|\d{12})$"); + +// vat-0.1.0: r"^LU\d{8}$" +consistent!(vat_22, r"^LU\d{8}$"); + +// vat-0.1.0: r"^MT\d{8}$" +consistent!(vat_23, r"^MT\d{8}$"); + +// vat-0.1.0: r"^NL\d{9}B\d{2}$" +consistent!(vat_24, r"^NL\d{9}B\d{2}$"); + +// vat-0.1.0: r"^PL\d{10}$" +consistent!(vat_25, r"^PL\d{10}$"); + +// vat-0.1.0: r"^PT\d{9}$" +consistent!(vat_26, r"^PT\d{9}$"); + +// vat-0.1.0: r"^RO\d{2,10}$" +consistent!(vat_27, r"^RO\d{2,10}$"); + +// vat-0.1.0: r"^SK\d{10}$" +consistent!(vat_28, r"^SK\d{10}$"); + +// vat-0.1.0: r"^SI\d{8}$" +consistent!(vat_29, r"^SI\d{8}$"); + +// vat-0.1.0: r"^ES[A-Z0-9]\d{7}[A-Z0-9]$" +consistent!(vat_30, r"^ES[A-Z0-9]\d{7}[A-Z0-9]$"); + +// vat-0.1.0: r"^SE\d{10}01$" +consistent!(vat_31, r"^SE\d{10}01$"); + +// vat-0.1.0: r"^(GB(GD|HA)\d{3}|GB\d{9}|GB\d{12})$" +consistent!(vat_32, r"^(GB(GD|HA)\d{3}|GB\d{9}|GB\d{12})$"); + +// eve-0.1.1: r"\{\{(.*)\}\}" +consistent!(eve_0, r"\{\{(.*)\}\}"); + +// egc-0.1.2: "^mio" +consistent!(egc_0, "^mio"); + +// pew-0.2.3: "" +consistent!(pew_0, ""); + +// pew-0.2.3: "" +consistent!(pew_1, ""); + +// mob-0.4.3: "y" +consistent!(mob_0, "y"); + +// lit-0.2.8: "@([a-z]+)" +consistent!(lit_0, "@([a-z]+)"); + +// lit-0.2.8: "([A-Z-]+):(.*)" +consistent!(lit_1, "([A-Z-]+):(.*)"); + +// lit-0.2.8: "^[a-zA-Z_][a-zA-Z0-9_]*$" +consistent!(lit_2, "^[a-zA-Z_][a-zA-Z0-9_]*$"); + +// avm-1.0.1: r"\d+\.\d+\.\d+" +consistent!(avm_0, r"\d+\.\d+\.\d+"); + +// avm-1.0.1: r"\d+\.\d+\.\d+" +consistent!(avm_1, r"\d+\.\d+\.\d+"); + +// orm-0.2.0: r"^Vec<(.+)>$" +consistent!(orm_0, r"^Vec<(.+)>$"); + +// sgf-0.1.5: r"\\(\r\n|\n\r|\n|\r)" +consistent!(sgf_0, r"\\(\r\n|\n\r|\n|\r)"); + +// sgf-0.1.5: r"\\(.)" +consistent!(sgf_1, r"\\(.)"); + +// sgf-0.1.5: r"\r\n|\n\r|\n|\r" +consistent!(sgf_2, r"\r\n|\n\r|\n|\r"); + +// sgf-0.1.5: r"([\]\\:])" 
+consistent!(sgf_3, r"([\]\\:])"); + +// dok-0.2.0: "^Bearer realm=\"(.+?)\",service=\"(.+?)\",scope=\"(.+?)\"$" +consistent!( + dok_0, + "^Bearer realm=\"(.+?)\",service=\"(.+?)\",scope=\"(.+?)\"$" +); + +// d20-0.1.0: r"([+-]?\s*\d+[dD]\d+|[+-]?\s*\d+)" +consistent!(d20_0, r"([+-]?\s*\d+[dD]\d+|[+-]?\s*\d+)"); + +// dvb-0.3.0: "E" +consistent!(dvb_0, "E"); + +// dvb-0.3.0: "^F" +consistent!(dvb_1, "^F"); + +// dvb-0.3.0: "^S" +consistent!(dvb_2, "^S"); + +// ger-0.2.0: r"Change-Id: (I[a-f0-9]{40})$" +consistent!(ger_0, r"Change-Id: (I[a-f0-9]{40})$"); + +// ger-0.2.0: r"(refs|ref|fix|fixes|close|closes)\s+([A-Z]{2,5}-[0-9]{1,5})$" +consistent!( + ger_1, + r"(refs|ref|fix|fixes|close|closes)\s+([A-Z]{2,5}-[0-9]{1,5})$" +); + +// n5-0.2.1: r"(\d+)(\.(\d+))?(\.(\d+))?(.*)" +consistent!(n5_0, r"(\d+)(\.(\d+))?(\.(\d+))?(.*)"); + +// po-0.1.4: r"[A-Za-z0-9]" +consistent!(po_0, r"[A-Za-z0-9]"); + +// carnix-0.8.5: "path is (‘|')?([^’'\n]*)(’|')?" +consistent!(carnix_0, "path is (‘|')?([^’'\n]*)(’|')?"); + +// carnix-0.8.5: r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?(.*)?" +consistent!(carnix_1, r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?(.*)?"); + +// carnix-0.8.5: r"(\d*)\.(\d*)\.(\d*)(-(\S*))?" +consistent!(carnix_2, r"(\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// carnix-0.8.5: r"(\S*)-(\d*)\.(\d*)\.(\d*)(-(\S*))?" 
+consistent!(carnix_3, r"(\S*)-(\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// caseless-0.2.1: r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$" +consistent!(caseless_0, r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$"); + +// caseless-0.2.1: r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);" +consistent!(caseless_1, r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"); + +// cabot-0.2.0: "\r?\n\r?\n" +consistent!(cabot_0, "\r?\n\r?\n"); + +// cabot-0.2.0: "\r?\n" +consistent!(cabot_1, "\r?\n"); + +// card-validate-2.2.1: r"^600" +consistent!(card_validate_0, r"^600"); + +// card-validate-2.2.1: r"^5019" +consistent!(card_validate_1, r"^5019"); + +// card-validate-2.2.1: r"^4" +consistent!(card_validate_2, r"^4"); + +// card-validate-2.2.1: r"^(5[1-5]|2[2-7])" +consistent!(card_validate_3, r"^(5[1-5]|2[2-7])"); + +// card-validate-2.2.1: r"^3[47]" +consistent!(card_validate_4, r"^3[47]"); + +// card-validate-2.2.1: r"^3[0689]" +consistent!(card_validate_5, r"^3[0689]"); + +// card-validate-2.2.1: r"^6([045]|22)" +consistent!(card_validate_6, r"^6([045]|22)"); + +// card-validate-2.2.1: r"^(62|88)" +consistent!(card_validate_7, r"^(62|88)"); + +// card-validate-2.2.1: r"^35" +consistent!(card_validate_8, r"^35"); + +// card-validate-2.2.1: r"^[0-9]+$" +consistent!(card_validate_9, r"^[0-9]+$"); + +// cargo-testify-0.3.0: r"\d{1,} passed.*filtered out" +consistent!(cargo_testify_0, r"\d{1,} passed.*filtered out"); + +// cargo-testify-0.3.0: r"error(:|\[).*" +consistent!(cargo_testify_1, r"error(:|\[).*"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_0, r"<(.*?)>"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_1, r"<(.*?)>"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_2, r"<(.*?)>"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_3, r"<(.*?)>"); + +// cargo-incremental-0.1.23: r"(?m)^incremental: re-using (\d+) out of (\d+) modules$" +consistent!( + cargo_incremental_0, + r"(?m)^incremental: re-using (\d+) out of (\d+) modules$" +); + +// cargo-incremental-0.1.23: 
"(?m)(warning|error): (.*)\n --> ([^:]:\\d+:\\d+)$" +consistent!( + cargo_incremental_1, + "(?m)(warning|error): (.*)\n --> ([^:]:\\d+:\\d+)$" +); + +// cargo-incremental-0.1.23: r"(?m)^test (.*) \.\.\. (\w+)" +consistent!(cargo_incremental_2, r"(?m)^test (.*) \.\.\. (\w+)"); + +// cargo-incremental-0.1.23: r"(?m)(\d+) passed; (\d+) failed; (\d+) ignored; \d+ measured" +consistent!( + cargo_incremental_3, + r"(?m)(\d+) passed; (\d+) failed; (\d+) ignored; \d+ measured" +); + +// cargo-testjs-0.1.2: r"^[^-]+-[0-9a-f]+\.js$" +consistent!(cargo_testjs_0, r"^[^-]+-[0-9a-f]+\.js$"); + +// cargo-tarpaulin-0.6.2: r"\s*//" +consistent!(cargo_tarpaulin_0, r"\s*//"); + +// cargo-tarpaulin-0.6.2: r"/\*" +consistent!(cargo_tarpaulin_1, r"/\*"); + +// cargo-tarpaulin-0.6.2: r"\*/" +consistent!(cargo_tarpaulin_2, r"\*/"); + +// cargo-culture-kit-0.1.0: r"^fo" +consistent!(cargo_culture_kit_0, r"^fo"); + +// cargo-screeps-0.1.3: "\\s+" +consistent!(cargo_screeps_0, "\\s+"); + +// cargo-brew-0.1.4: r"`(\S+) v([0-9.]+)" +consistent!(cargo_brew_0, r"`(\S+) v([0-9.]+)"); + +// cargo-release-0.10.2: "^\\[.+\\]" +consistent!(cargo_release_0, "^\\[.+\\]"); + +// cargo-release-0.10.2: "^\\[\\[.+\\]\\]" +consistent!(cargo_release_1, "^\\[\\[.+\\]\\]"); + +// cargo-edit-0.3.0-beta.1: r"^https://github.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +consistent!( + cargo_edit_0, + r"^https://github.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +); + +// cargo-edit-0.3.0-beta.1: r"^https://gitlab.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +consistent!( + cargo_edit_1, + r"^https://gitlab.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +); + +// cargo-disassemble-0.1.1: ".*" +consistent!(cargo_disassemble_0, ".*"); + +// cargo-demangle-0.1.2: r"(?m)(?P<symbol>_ZN[0-9]+.*E)" +consistent!(cargo_demangle_0, r"(?m)(?P<symbol>_ZN[0-9]+.*E)"); + +// cargo-coverage-annotations-0.1.5: r"^\s*\}(?:\)*;?|\s*else\s*\{)$" +consistent!(cargo_coverage_annotations_0, 
r"^\s*\}(?:\)*;?|\s*else\s*\{)$"); + +// cargo-urlcrate-1.0.1: "[\u{001b}\u{009b}][\\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><]" +consistent!(cargo_urlcrate_0, "[\u{001b}\u{009b}][\\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><]"); + +// cargo-script-0.2.8: r"^\s*\*( |$)" +consistent!(cargo_script_0, r"^\s*\*( |$)"); + +// cargo-script-0.2.8: r"^(\s+)" +consistent!(cargo_script_1, r"^(\s+)"); + +// cargo-script-0.2.8: r"/\*|\*/" +consistent!(cargo_script_2, r"/\*|\*/"); + +// cargo-script-0.2.8: r"^\s*//!" +consistent!(cargo_script_3, r"^\s*//!"); + +// cargo-script-0.2.8: r"^#![^\[].*?(\r\n|\n)" +consistent!(cargo_script_4, r"^#![^\[].*?(\r\n|\n)"); + +// cargo-update-1.5.2: r"cargo-install-update\.exe-v.+" +consistent!(cargo_update_0, r"cargo-install-update\.exe-v.+"); + +// canteen-0.4.1: r"^<(?:(int|uint|str|float|path):)?([\w_][a-zA-Z0-9_]*)>$" +consistent!( + canteen_0, + r"^<(?:(int|uint|str|float|path):)?([\w_][a-zA-Z0-9_]*)>$" +); + +// thruster-cli-0.1.3: r"(.)([A-Z])" +consistent!(thruster_cli_0, r"(.)([A-Z])"); + +// thieves-cant-0.1.0: "([Z]+)$" +consistent!(thieves_cant_0, "([Z]+)$"); + +// codeowners-0.1.3: r"^@\S+/\S+" +consistent!(codeowners_0, r"^@\S+/\S+"); + +// codeowners-0.1.3: r"^@\S+" +consistent!(codeowners_1, r"^@\S+"); + +// codeowners-0.1.3: r"^\S+@\S+" +consistent!(codeowners_2, r"^\S+@\S+"); + +// conserve-0.4.2: r"^b0000 {21} complete 20[-0-9T:+]+\s +\d+s\n$" +consistent!(conserve_0, r"^b0000 {21} complete 20[-0-9T:+]+\s +\d+s\n$"); + +// commodore-0.3.0: r"(?P<greeting>\S+?) (?P<name>\S+?)$" +consistent!(commodore_0, r"(?P<greeting>\S+?) 
(?P<name>\S+?)$"); + +// corollary-0.3.0: r"([ \t]*)```haskell([\s\S]*?)```" +consistent!(corollary_0, r"([ \t]*)```haskell([\s\S]*?)```"); + +// corollary-0.3.0: r"\b((?:a|b|t)\d*)\b" +consistent!(corollary_1, r"\b((?:a|b|t)\d*)\b"); + +// colorizex-0.1.3: "NB" +consistent!(colorizex_0, "NB"); + +// colorstring-0.0.1: r"(?i)\[[a-z0-9_-]+\]" +consistent!(colorstring_0, r"(?i)\[[a-z0-9_-]+\]"); + +// colorstring-0.0.1: r"^(?i)(\[[a-z0-9_-]+\])+" +consistent!(colorstring_1, r"^(?i)(\[[a-z0-9_-]+\])+"); + +// cosmogony-0.3.0: "name:(.+)" +consistent!(cosmogony_0, "name:(.+)"); + +// cobalt-bin-0.12.1: r"(?m:^ {0,3}\[[^\]]+\]:.+$)" +consistent!(cobalt_bin_0, r"(?m:^ {0,3}\[[^\]]+\]:.+$)"); + +// comrak-0.2.12: r"[^\p{L}\p{M}\p{N}\p{Pc} -]" +consistent!(comrak_0, r"[^\p{L}\p{M}\p{N}\p{Pc} -]"); + +// content-blocker-0.2.3: "" +consistent!(content_blocker_0, ""); + +// content-blocker-0.2.3: "(?i)hi" +consistent!(content_blocker_1, "(?i)hi"); + +// content-blocker-0.2.3: "http[s]?://domain.org" +consistent!(content_blocker_2, "http[s]?://domain.org"); + +// content-blocker-0.2.3: "(?i)http[s]?://domain.org" +consistent!(content_blocker_3, "(?i)http[s]?://domain.org"); + +// content-blocker-0.2.3: "http://domain.org" +consistent!(content_blocker_4, "http://domain.org"); + +// content-blocker-0.2.3: "http://domain.org" +consistent!(content_blocker_5, "http://domain.org"); + +// content-blocker-0.2.3: "ad.html" +consistent!(content_blocker_6, "ad.html"); + +// content-blocker-0.2.3: "ad.html" +consistent!(content_blocker_7, "ad.html"); + +// content-blocker-0.2.3: "http://domain.org" +consistent!(content_blocker_8, "http://domain.org"); + +// content-blocker-0.2.3: "http://domain.org/nocookies.sjs" +consistent!(content_blocker_9, "http://domain.org/nocookies.sjs"); + +// content-blocker-0.2.3: "http://domain.org/nocookies.sjs" +consistent!(content_blocker_10, "http://domain.org/nocookies.sjs"); + +// content-blocker-0.2.3: "http://domain.org/hideme.jpg" 
+consistent!(content_blocker_11, "http://domain.org/hideme.jpg"); + +// content-blocker-0.2.3: "http://domain.org/ok.html" +consistent!(content_blocker_12, "http://domain.org/ok.html"); + +// content-blocker-0.2.3: "http://domain.org/ok.html\\?except_this=1" +consistent!(content_blocker_13, "http://domain.org/ok.html\\?except_this=1"); + +// victoria-dom-0.1.2: "[A-Za-z0-9=]" +consistent!(victoria_dom_0, "[A-Za-z0-9=]"); + +// numbat-1.0.0: r"^nsq://" +consistent!(numbat_0, r"^nsq://"); + +// airkorea-0.1.2: r"[\s\t\r\n]" +consistent!(airkorea_0, r"[\s\t\r\n]"); + +// airkorea-0.1.2: r"([\{\[,])|([\}\]])" +consistent!(airkorea_1, r"([\{\[,])|([\}\]])"); + +// airkorea-0.1.2: r"[^.\d]+$" +consistent!(airkorea_2, r"[^.\d]+$"); + +// rofl-0.0.1: r"\b" +// consistent!(rofl_0, r"\b"); + +// rogcat-0.2.15: r"--------- beginning of.*" +consistent!(rogcat_0, r"--------- beginning of.*"); + +// rogcat-0.2.15: r"a|e|i|o|u" +consistent!(rogcat_1, r"a|e|i|o|u"); + +// rogcat-0.2.15: r"^(\d+)([kMG])$" +consistent!(rogcat_2, r"^(\d+)([kMG])$"); + +// media_filename-0.1.4: "\\.([A-Za-z0-9]{2,4})$" +consistent!(media_filename_0, "\\.([A-Za-z0-9]{2,4})$"); + +// media_filename-0.1.4: "([0-9]{3,4}p|[0-9]{3,4}x[0-9]{3,4})" +consistent!(media_filename_1, "([0-9]{3,4}p|[0-9]{3,4}x[0-9]{3,4})"); + +// media_filename-0.1.4: "(?:^\\[([^]]+)\\]|- ?([^-]+)$)" +consistent!(media_filename_2, "(?:^\\[([^]]+)\\]|- ?([^-]+)$)"); + +// media_filename-0.1.4: "(?:[eE]([0-9]{2,3})|[^0-9A-Za-z]([0-9]{2,3})(?:v[0-9])?[^0-9A-Za-z])" +consistent!( + media_filename_3, + "(?:[eE]([0-9]{2,3})|[^0-9A-Za-z]([0-9]{2,3})(?:v[0-9])?[^0-9A-Za-z])" +); + +// media_filename-0.1.4: "[sS]([0-9]{1,2})" +consistent!(media_filename_4, "[sS]([0-9]{1,2})"); + +// media_filename-0.1.4: "((?i)(?:PPV.)?[HP]DTV|(?:HD)?CAM|BRRIP|[^a-z]TS[^a-z]|(?:PPV )?WEB.?DL(?: DVDRip)?|HDRip|DVDRip|CamRip|W[EB]BRip|BluRay|BD|DVD|DvDScr|hdtv)" +consistent!(media_filename_5, "((?i)(?:PPV.)?[HP]DTV|(?:HD)?CAM|BRRIP|[^a-z]TS[^a-z]|(?:PPV 
)?WEB.?DL(?: DVDRip)?|HDRip|DVDRip|CamRip|W[EB]BRip|BluRay|BD|DVD|DvDScr|hdtv)"); + +// media_filename-0.1.4: "((19[0-9]|20[01])[0-9])" +consistent!(media_filename_6, "((19[0-9]|20[01])[0-9])"); + +// media_filename-0.1.4: "((?i)xvid|x264|h\\.?264)" +consistent!(media_filename_7, "((?i)xvid|x264|h\\.?264)"); + +// media_filename-0.1.4: "((?i)MP3|DD5\\.?1|Dual[- ]Audio|LiNE|DTS|AAC(?:\\.?2\\.0)?|AC3(?:\\.5\\.1)?)" +consistent!(media_filename_8, "((?i)MP3|DD5\\.?1|Dual[- ]Audio|LiNE|DTS|AAC(?:\\.?2\\.0)?|AC3(?:\\.5\\.1)?)"); + +// media_filename-0.1.4: "\\[([0-9A-F]{8})\\]" +consistent!(media_filename_9, "\\[([0-9A-F]{8})\\]"); + +// termimage-0.3.2: r"(\d+)[xX](\d+)" +consistent!(termimage_0, r"(\d+)[xX](\d+)"); + +// teensy-0.1.0: r".*(\d{4}-\d{2}-\d{2}).*" +consistent!(teensy_0, r".*(\d{4}-\d{2}-\d{2}).*"); + +// telescreen-0.1.3: r"<@(.+)>" +consistent!(telescreen_0, r"<@(.+)>"); + +// tempus_fugit-0.4.4: r"^(\d+)" +consistent!(tempus_fugit_0, r"^(\d+)"); + +// fselect-0.4.1: "(\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)" +consistent!(fselect_0, "(\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)"); + +// fselect-0.4.1: "(%|_|\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)" +consistent!(fselect_1, "(%|_|\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)"); + +// fs_eventbridge-0.1.0: r"^([A-Z]+)(?:\s(.+))?\s*" +consistent!(fs_eventbridge_0, r"^([A-Z]+)(?:\s(.+))?\s*"); + +// joseki-0.0.1: r"(\w{1,2})\[(.+?)\]" +consistent!(joseki_0, r"(\w{1,2})\[(.+?)\]"); + +// tweetr-0.2.1: r"(?i)in (\d+) (second|minute|hour|day|week)s?" 
+consistent!(tweetr_0, r"(?i)in (\d+) (second|minute|hour|day|week)s?"); + +// bullet_core-0.1.1: "^(?u:[0-9])+" +consistent!(bullet_core_0, "^(?u:[0-9])+"); + +// bullet_core-0.1.1: "^(?u:[0-9])+(?u:\\.)(?u:[0-9])+" +consistent!(bullet_core_1, "^(?u:[0-9])+(?u:\\.)(?u:[0-9])+"); + +// bullet_core-0.1.1: "^(?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-
𪘀])+" +consistent!(bullet_core_2, "^(?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+"); + +// bullet_core-0.1.1: 
"^(?u:d/d)((?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+)" +consistent!(bullet_core_3, 
"^(?u:d/d)((?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+)"); + +// bullet_core-0.1.1: "^(?u:\\()" +consistent!(bullet_core_4, "^(?u:\\()"); + +// bullet_core-0.1.1: "^(?u:\\))" +consistent!(bullet_core_5, "^(?u:\\))"); + +// bullet_core-0.1.1: "^(?u:\\*)" +consistent!(bullet_core_6, "^(?u:\\*)"); + +// bullet_core-0.1.1: "^(?u:\\+)" 
+consistent!(bullet_core_7, "^(?u:\\+)"); + +// bullet_core-0.1.1: "^(?u:,)" +consistent!(bullet_core_8, "^(?u:,)"); + +// bullet_core-0.1.1: "^(?u:\\-)" +consistent!(bullet_core_9, "^(?u:\\-)"); + +// bullet_core-0.1.1: "^(?u:/)" +consistent!(bullet_core_10, "^(?u:/)"); + +// bullet_core-0.1.1: "^(?u:\\[)" +consistent!(bullet_core_11, "^(?u:\\[)"); + +// bullet_core-0.1.1: "^(?u:\\])" +consistent!(bullet_core_12, "^(?u:\\])"); + +// bullet_core-0.1.1: "^(?u:\\^)" +consistent!(bullet_core_13, "^(?u:\\^)"); + +// bullet_core-0.1.1: "^(?u:·)" +consistent!(bullet_core_14, "^(?u:·)"); + +// actix-web-0.6.13: "//+" +consistent!(actix_web_0, "//+"); + +// actix-web-0.6.13: "//+" +consistent!(actix_web_1, "//+"); + +// althea_kernel_interface-0.1.0: r"(\S*) .* (\S*) (REACHABLE|STALE|DELAY)" +consistent!( + althea_kernel_interface_0, + r"(\S*) .* (\S*) (REACHABLE|STALE|DELAY)" +); + +// althea_kernel_interface-0.1.0: r"-s (.*) --ip6-dst (.*)/.* bcnt = (.*)" +consistent!( + althea_kernel_interface_1, + r"-s (.*) --ip6-dst (.*)/.* bcnt = (.*)" +); + +// alcibiades-0.3.0: r"\buci(?:\s|$)" +consistent!(alcibiades_0, r"\buci(?:\s|$)"); + +// ruma-identifiers-0.11.0: r"\A[a-z0-9._=-]+\z" +consistent!(ruma_identifiers_0, r"\A[a-z0-9._=-]+\z"); + +// rusqbin-0.2.3: r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})$" +consistent!(rusqbin_0, r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})$"); + +// rusqbin-0.2.3: r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})/requests/?$" +consistent!(rusqbin_1, r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})/requests/?$"); + +// rust-install-0.0.4: r"^(nightly|beta|stable)(?:-(\d{4}-\d{2}-\d{2}))?$" +consistent!( + rust_install_0, + r"^(nightly|beta|stable)(?:-(\d{4}-\d{2}-\d{2}))?$" +); + +// rust_inbox-0.0.5: "^+(.*)\r\n" +consistent!(rust_inbox_0, "^+(.*)\r\n"); + +// 
rust_inbox-0.0.5: r"^\* CAPABILITY (.*)\r\n" +consistent!(rust_inbox_1, r"^\* CAPABILITY (.*)\r\n"); + +// rust_inbox-0.0.5: r"^([a-zA-Z0-9]+) (OK|NO|BAD)(.*)" +consistent!(rust_inbox_2, r"^([a-zA-Z0-9]+) (OK|NO|BAD)(.*)"); + +// rust_inbox-0.0.5: r"^\* (\d+) EXISTS\r\n" +consistent!(rust_inbox_3, r"^\* (\d+) EXISTS\r\n"); + +// rust_inbox-0.0.5: r"^\* (\d+) RECENT\r\n" +consistent!(rust_inbox_4, r"^\* (\d+) RECENT\r\n"); + +// rust_inbox-0.0.5: r"^\* FLAGS (.+)\r\n" +consistent!(rust_inbox_5, r"^\* FLAGS (.+)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[UNSEEN (\d+)\](.*)\r\n" +consistent!(rust_inbox_6, r"^\* OK \[UNSEEN (\d+)\](.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[UIDVALIDITY (\d+)\](.*)\r\n" +consistent!(rust_inbox_7, r"^\* OK \[UIDVALIDITY (\d+)\](.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[UIDNEXT (\d+)\](.*)\r\n" +consistent!(rust_inbox_8, r"^\* OK \[UIDNEXT (\d+)\](.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[PERMANENTFLAGS (.+)\](.*)\r\n" +consistent!(rust_inbox_9, r"^\* OK \[PERMANENTFLAGS (.+)\](.*)\r\n"); + +// rustml-0.0.7: r"^[a-z]+ (\d+)$" +consistent!(rustml_0, r"^[a-z]+ (\d+)$"); + +// rustml-0.0.7: r"^[a-z]+ (\d+)$" +consistent!(rustml_1, r"^[a-z]+ (\d+)$"); + +// rustml-0.0.7: r"^[a-z]+ (\d+)$" +consistent!(rustml_2, r"^[a-z]+ (\d+)$"); + +// rustfmt-0.10.0: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*" +consistent!(rustfmt_0, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"); + +// rustfmt-core-0.4.0: r"(^\s*$)|(^\s*//\s*rustfmt-[^:]+:\s*\S+)" +consistent!(rustfmt_core_0, r"(^\s*$)|(^\s*//\s*rustfmt-[^:]+:\s*\S+)"); + +// rustfmt-core-0.4.0: r"^## `([^`]+)`" +consistent!(rustfmt_core_1, r"^## `([^`]+)`"); + +// rustfmt-core-0.4.0: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*" +consistent!(rustfmt_core_2, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"); + +// rustfmt-core-0.4.0: r"\s;" +consistent!(rustfmt_core_3, r"\s;"); + +// rust-enum-derive-0.4.0: r"^(0x)?([:digit:]+)$" +consistent!(rust_enum_derive_0, r"^(0x)?([:digit:]+)$"); + +// rust-enum-derive-0.4.0: 
r"^([:digit:]+)[:space:]*<<[:space:]*([:digit:]+)$" +consistent!( + rust_enum_derive_1, + r"^([:digit:]+)[:space:]*<<[:space:]*([:digit:]+)$" +); + +// rust-enum-derive-0.4.0: r"^[:space:]*([[:alnum:]_]+)([:space:]*=[:space:]*([:graph:]+))?[:space:]*," +consistent!(rust_enum_derive_2, r"^[:space:]*([[:alnum:]_]+)([:space:]*=[:space:]*([:graph:]+))?[:space:]*,"); + +// rust-enum-derive-0.4.0: r"^#define[:space:]+([:graph:]+)[:space:]+([:graph:]+)" +consistent!( + rust_enum_derive_3, + r"^#define[:space:]+([:graph:]+)[:space:]+([:graph:]+)" +); + +// rustsourcebundler-0.2.0: r"^\s*pub mod (.+);$" +consistent!(rustsourcebundler_0, r"^\s*pub mod (.+);$"); + +// rustsourcebundler-0.2.0: r"^\s*pub mod (.+);$" +consistent!(rustsourcebundler_1, r"^\s*pub mod (.+);$"); + +// rustfmt-nightly-0.8.2: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*" +consistent!(rustfmt_nightly_0, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"); + +// rustfmt-nightly-0.8.2: r"\s;" +consistent!(rustfmt_nightly_1, r"\s;"); + +// rustache-0.1.0: r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)" +consistent!(rustache_0, r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)"); + +// rustfilt-0.2.0: r"_ZN[\$\._[:alnum:]]*" +consistent!(rustfilt_0, r"_ZN[\$\._[:alnum:]]*"); + +// rustache-lists-0.1.2: r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)" +consistent!(rustache_lists_0, r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)"); + +// rural-0.7.3: "(.+)=(.+)" +consistent!(rural_0, "(.+)=(.+)"); + +// rural-0.7.3: "(.*):(.+)" +consistent!(rural_1, "(.*):(.+)"); + +// rural-0.7.3: "(.+):=(.+)" +consistent!(rural_2, "(.+):=(.+)"); + +// rural-0.7.3: "(.*)==(.+)" +consistent!(rural_3, "(.*)==(.+)"); + +// rusoto_credential-0.11.0: r"^\[([^\]]+)\]$" +consistent!(rusoto_credential_0, r"^\[([^\]]+)\]$"); + +// rumblebars-0.3.0: "([:blank:]*)$" +consistent!(rumblebars_0, "([:blank:]*)$"); + +// rumblebars-0.3.0: 
"(\r?\n)[:blank:]*(\\{\\{~?[#!/](?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z" +consistent!(rumblebars_1, "(\r?\n)[:blank:]*(\\{\\{~?[#!/](?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z"); + +// rumblebars-0.3.0: "(\r?\n[:blank:]*)(\\{\\{~?>(?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z" +consistent!( + rumblebars_2, + "(\r?\n[:blank:]*)(\\{\\{~?>(?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z" +); + +// rumblebars-0.3.0: "((?:[:blank:]|\r?\n)*)(\r?\n)[:blank:]*$" +consistent!(rumblebars_3, "((?:[:blank:]|\r?\n)*)(\r?\n)[:blank:]*$"); + +// rumblebars-0.3.0: "^([:blank:]*\r?\n)(.*)" +consistent!(rumblebars_4, "^([:blank:]*\r?\n)(.*)"); + +// diesel_cli-1.3.1: r"(?P<stamp>[\d-]*)_hello" +consistent!(diesel_cli_0, r"(?P<stamp>[\d-]*)_hello"); + +// dishub-0.1.1: r"(\d+)s" +consistent!(dishub_0, r"(\d+)s"); + +// spreadsheet_textconv-0.1.0: r"\n" +consistent!(spreadsheet_textconv_0, r"\n"); + +// spreadsheet_textconv-0.1.0: r"\r" +consistent!(spreadsheet_textconv_1, r"\r"); + +// spreadsheet_textconv-0.1.0: r"\t" +consistent!(spreadsheet_textconv_2, r"\t"); + +// split_aud-0.1.0: r"DELAY (-?\d+)ms" +consistent!(split_aud_0, r"DELAY (-?\d+)ms"); + +// split_aud-0.1.0: r"Trim\((\d+), ?(\d+)\)" +consistent!(split_aud_1, r"Trim\((\d+), ?(\d+)\)"); + +// spotrust-0.0.5: r"spotify:[a-z]+:[a-zA-Z0-9]+" +consistent!(spotrust_0, r"spotify:[a-z]+:[a-zA-Z0-9]+"); + +// spaceslugs-0.1.0: r"[^\x00-\x7F]" +consistent!(spaceslugs_0, r"[^\x00-\x7F]"); + +// spaceslugs-0.1.0: r"[']+" +consistent!(spaceslugs_1, r"[']+"); + +// spaceslugs-0.1.0: r"\W+" +consistent!(spaceslugs_2, r"\W+"); + +// spaceslugs-0.1.0: r"[ ]+" +consistent!(spaceslugs_3, r"[ ]+"); + +// space_email_api-0.1.1: "PHPSESSID=([0-9a-f]+)" +consistent!(space_email_api_0, "PHPSESSID=([0-9a-f]+)"); + +// lorikeet-0.7.0: "[^0-9.,]" +consistent!(lorikeet_0, "[^0-9.,]"); + +// claude-0.3.0: r"^(?:\b|(-)?)(\p{Currency_Symbol})?((?:(?:\d{1,3}[\.,])+\d{3})|\d+)(?:[\.,](\d{2}))?\b$" +consistent!(claude_0, 
r"^(?:\b|(-)?)(\p{Currency_Symbol})?((?:(?:\d{1,3}[\.,])+\d{3})|\d+)(?:[\.,](\d{2}))?\b$"); + +// clam-0.1.6: r"<%=\s*(.+?)\s*%>" +consistent!(clam_0, r"<%=\s*(.+?)\s*%>"); + +// classifier-0.0.3: r"(\s)" +consistent!(classifier_0, r"(\s)"); + +// click-0.3.2: r"(-----BEGIN .*-----\n)((?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)*\n)+)(-----END .*-----)" +consistent!(click_0, r"(-----BEGIN .*-----\n)((?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)*\n)+)(-----END .*-----)"); + +// click-0.3.2: r"-----BEGIN PRIVATE KEY-----" +consistent!(click_1, r"-----BEGIN PRIVATE KEY-----"); + +// ultrastar-txt-0.1.2: r"#([A-Z3a-z]*):(.*)" +consistent!(ultrastar_txt_0, r"#([A-Z3a-z]*):(.*)"); + +// ultrastar-txt-0.1.2: "^-\\s?(-?[0-9]+)\\s*$" +consistent!(ultrastar_txt_1, "^-\\s?(-?[0-9]+)\\s*$"); + +// ultrastar-txt-0.1.2: "^-\\s?(-?[0-9]+)\\s+(-?[0-9]+)" +consistent!(ultrastar_txt_2, "^-\\s?(-?[0-9]+)\\s+(-?[0-9]+)"); + +// ultrastar-txt-0.1.2: "^(.)\\s*(-?[0-9]+)\\s+(-?[0-9]+)\\s+(-?[0-9]+)\\s?(.*)" +consistent!( + ultrastar_txt_3, + "^(.)\\s*(-?[0-9]+)\\s+(-?[0-9]+)\\s+(-?[0-9]+)\\s?(.*)" +); + +// ultrastar-txt-0.1.2: "^P\\s?(-?[0-9]+)" +consistent!(ultrastar_txt_4, "^P\\s?(-?[0-9]+)"); + +// db-accelerate-2.0.0: r"^template\.add($|\..+$)" +consistent!(db_accelerate_0, r"^template\.add($|\..+$)"); + +// db-accelerate-2.0.0: r"^template\.sub($|\..+$)" +consistent!(db_accelerate_1, r"^template\.sub($|\..+$)"); + +// sterling-0.3.0: r"(\d+)([cegps])" +consistent!(sterling_0, r"(\d+)([cegps])"); + +// stache-0.2.0: r"[^\w]" +consistent!(stache_0, r"[^\w]"); + +// strukt-0.1.0: "\"([<>]?)([xcbB\\?hHiIlLqQfdspP]*)\"" +consistent!(strukt_0, "\"([<>]?)([xcbB\\?hHiIlLqQfdspP]*)\""); + +// steamid-ng-0.3.1: r"^STEAM_([0-4]):([0-1]):([0-9]{1,10})$" +consistent!(steamid_ng_0, r"^STEAM_([0-4]):([0-1]):([0-9]{1,10})$"); + +// steamid-ng-0.3.1: r"^\[([AGMPCgcLTIUai]):([0-4]):([0-9]{1,10})(:([0-9]+))?\]$" +consistent!( + steamid_ng_1, + 
r"^\[([AGMPCgcLTIUai]):([0-4]):([0-9]{1,10})(:([0-9]+))?\]$" +); + +// strscan-0.1.1: r"^\w+" +consistent!(strscan_0, r"^\w+"); + +// strscan-0.1.1: r"^\s+" +consistent!(strscan_1, r"^\s+"); + +// strscan-0.1.1: r"^\w+" +consistent!(strscan_2, r"^\w+"); + +// strscan-0.1.1: r"^\s+" +consistent!(strscan_3, r"^\s+"); + +// strscan-0.1.1: r"^(\w+)\s+" +consistent!(strscan_4, r"^(\w+)\s+"); + +// tk-carbon-0.2.0: r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$" +consistent!(tk_carbon_0, r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$"); + +// tk-carbon-0.2.0: r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$" +consistent!(tk_carbon_1, r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$"); + +// evalrs-0.0.10: r"extern\s+crate\s+([a-z0-9_]+)\s*;(\s*//(.+))?" +consistent!(evalrs_0, r"extern\s+crate\s+([a-z0-9_]+)\s*;(\s*//(.+))?"); + +// evalrs-0.0.10: r"(?m)^# " +consistent!(evalrs_1, r"(?m)^# "); + +// evalrs-0.0.10: r"(?m)^\s*fn +main *\( *\)" +consistent!(evalrs_2, r"(?m)^\s*fn +main *\( *\)"); + +// evalrs-0.0.10: r"(extern\s+crate\s+[a-z0-9_]+\s*;)" +consistent!(evalrs_3, r"(extern\s+crate\s+[a-z0-9_]+\s*;)"); + +// gate_build-0.5.0: "(.*)_t([0-9]+)" +consistent!(gate_build_0, "(.*)_t([0-9]+)"); + +// rake-0.1.1: r"[^\P{P}-]|\s+-\s+" +consistent!(rake_0, r"[^\P{P}-]|\s+-\s+"); + +// rafy-0.2.1: r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*" +consistent!(rafy_0, r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*"); + +// raven-0.2.1: r"^(?P<protocol>.*?)://(?P<public_key>.*?):(?P<secret_key>.*?)@(?P<host>.*?)/(?P<path>.*/)?(?P<project_id>.*)$" +consistent!(raven_0, r"^(?P<protocol>.*?)://(?P<public_key>.*?):(?P<secret_key>.*?)@(?P<host>.*?)/(?P<path>.*/)?(?P<project_id>.*)$"); + +// rargs-0.2.0: r"\{[[:space:]]*[^{}]*[[:space:]]*\}" +consistent!(rargs_0, r"\{[[:space:]]*[^{}]*[[:space:]]*\}"); + +// rargs-0.2.0: r"^\{[[:space:]]*(?P<name>[[:word:]]*)[[:space:]]*\}$" +consistent!(rargs_1, 
r"^\{[[:space:]]*(?P<name>[[:word:]]*)[[:space:]]*\}$"); + +// rargs-0.2.0: r"^\{[[:space:]]*(?P<num>-?\d+)[[:space:]]*\}$" +consistent!(rargs_2, r"^\{[[:space:]]*(?P<num>-?\d+)[[:space:]]*\}$"); + +// rargs-0.2.0: r"^\{(?P<left>-?\d*)?\.\.(?P<right>-?\d*)?(?::(?P<sep>.*))?\}$" +consistent!( + rargs_3, + r"^\{(?P<left>-?\d*)?\.\.(?P<right>-?\d*)?(?::(?P<sep>.*))?\}$" +); + +// rargs-0.2.0: r"(.*?)[[:space:]]+|(.*?)$" +consistent!(rargs_4, r"(.*?)[[:space:]]+|(.*?)$"); + +// indradb-lib-0.15.0: r"[a-zA-Z0-9]{8}" +consistent!(indradb_lib_0, r"[a-zA-Z0-9]{8}"); + +// fungi-lang-0.1.50: r"::" +consistent!(fungi_lang_0, r"::"); + +// nickel-0.10.1: "/hello/(?P<name>[a-zA-Z]+)" +consistent!(nickel_0, "/hello/(?P<name>[a-zA-Z]+)"); + +// nickel-0.10.1: "/hello/(?P<name>[a-zA-Z]+)" +consistent!(nickel_1, "/hello/(?P<name>[a-zA-Z]+)"); + +// pact_verifier-0.4.0: r"\{(\w+)\}" +consistent!(pact_verifier_0, r"\{(\w+)\}"); + +// pact_matching-0.4.1: "application/.*json" +consistent!(pact_matching_0, "application/.*json"); + +// pact_matching-0.4.1: "application/json.*" +consistent!(pact_matching_1, "application/json.*"); + +// pact_matching-0.4.1: "application/.*xml" +consistent!(pact_matching_2, "application/.*xml"); + +// pangu-0.2.0: "([\"'\\(\\[\\{{<\u{201c}])(\\s*)(.+?)(\\s*)([\"'\\)\\]\\}}>\u{201d}])" +consistent!( + pangu_0, + "([\"'\\(\\[\\{{<\u{201c}])(\\s*)(.+?)(\\s*)([\"'\\)\\]\\}}>\u{201d}])" +); + +// pangu-0.2.0: "([\\(\\[\\{{<\u{201c}]+)(\\s*)(.+?)(\\s*)([\\)\\]\\}}>\u{201d}]+)" +consistent!( + pangu_1, + "([\\(\\[\\{{<\u{201c}]+)(\\s*)(.+?)(\\s*)([\\)\\]\\}}>\u{201d}]+)" +); + +// parser-haskell-0.2.0: r"\{-[\s\S]*?-\}" +consistent!(parser_haskell_0, r"\{-[\s\S]*?-\}"); + +// parser-haskell-0.2.0: r"(?m);+\s*$" +consistent!(parser_haskell_1, r"(?m);+\s*$"); + +// parser-haskell-0.2.0: r"(?m)^#(if|ifn?def|endif|else|include|elif).*" +consistent!(parser_haskell_2, r"(?m)^#(if|ifn?def|endif|else|include|elif).*"); + +// parser-haskell-0.2.0: 
r"'([^'\\]|\\[A-Z]{1,3}|\\.)'" +consistent!(parser_haskell_3, r"'([^'\\]|\\[A-Z]{1,3}|\\.)'"); + +// parser-haskell-0.2.0: r"forall\s+(.*?)\." +consistent!(parser_haskell_4, r"forall\s+(.*?)\."); + +// html2md-0.2.1: "\\s{2,}" +consistent!(html2md_0, "\\s{2,}"); + +// html2md-0.2.1: "\\n{2,}" +consistent!(html2md_1, "\\n{2,}"); + +// html2md-0.2.1: "(?m)(\\S) $" +consistent!(html2md_2, "(?m)(\\S) $"); + +// html2md-0.2.1: "(?m)^[-*] " +consistent!(html2md_3, "(?m)^[-*] "); + +// ovpnfile-0.1.2: r"#.*$" +consistent!(ovpnfile_0, r"#.*$"); + +// ovpnfile-0.1.2: r"^<(\S+)>" +consistent!(ovpnfile_1, r"^<(\S+)>"); + +// ovpnfile-0.1.2: r"^</(\S+)>" +consistent!(ovpnfile_2, r"^</(\S+)>"); + +// screenruster-saver-fractal-0.1.1: r"#([:xdigit:]{2})([:xdigit:]{2})([:xdigit:]{2})" +consistent!( + screenruster_saver_fractal_0, + r"#([:xdigit:]{2})([:xdigit:]{2})([:xdigit:]{2})" +); + +// scarlet-0.2.2: r"rgb\((?: *(\d{1,3}),)(?: *(\d{1,3}),)(?: *(\d{1,3}))\)" +consistent!( + scarlet_0, + r"rgb\((?: *(\d{1,3}),)(?: *(\d{1,3}),)(?: *(\d{1,3}))\)" +); + +// cpp_to_rust_generator-0.2.0: r"^([\w:]+)<(.+)>$" +consistent!(cpp_to_rust_generator_0, r"^([\w:]+)<(.+)>$"); + +// cpp_to_rust_generator-0.2.0: r"^type-parameter-(\d+)-(\d+)$" +consistent!(cpp_to_rust_generator_1, r"^type-parameter-(\d+)-(\d+)$"); + +// cpp_to_rust_generator-0.2.0: r"^([\w~]+)<[^<>]+>$" +consistent!(cpp_to_rust_generator_2, r"^([\w~]+)<[^<>]+>$"); + +// cpp_to_rust_generator-0.2.0: r"(signals|Q_SIGNALS)\s*:" +consistent!(cpp_to_rust_generator_3, r"(signals|Q_SIGNALS)\s*:"); + +// cpp_to_rust_generator-0.2.0: r"(slots|Q_SLOTS)\s*:" +consistent!(cpp_to_rust_generator_4, r"(slots|Q_SLOTS)\s*:"); + +// cpp_to_rust_generator-0.2.0: r"(public|protected|private)\s*:" +consistent!(cpp_to_rust_generator_5, r"(public|protected|private)\s*:"); + +// cpp_to_rust-0.5.3: r"^([\w:]+)<(.+)>$" +consistent!(cpp_to_rust_0, r"^([\w:]+)<(.+)>$"); + +// cpp_to_rust-0.5.3: r"^type-parameter-(\d+)-(\d+)$" +consistent!(cpp_to_rust_1, 
r"^type-parameter-(\d+)-(\d+)$"); + +// cpp_to_rust-0.5.3: r"^([\w~]+)<[^<>]+>$" +consistent!(cpp_to_rust_2, r"^([\w~]+)<[^<>]+>$"); + +// cpp_to_rust-0.5.3: r"(signals|Q_SIGNALS)\s*:" +consistent!(cpp_to_rust_3, r"(signals|Q_SIGNALS)\s*:"); + +// cpp_to_rust-0.5.3: r"(slots|Q_SLOTS)\s*:" +consistent!(cpp_to_rust_4, r"(slots|Q_SLOTS)\s*:"); + +// cpp_to_rust-0.5.3: r"(public|protected|private)\s*:" +consistent!(cpp_to_rust_5, r"(public|protected|private)\s*:"); + +// fritzbox_logs-0.2.0: "(\\d{2}\\.\\d{2}\\.\\d{2}) (\\d{2}:\\d{2}:\\d{2}) (.*)" +consistent!( + fritzbox_logs_0, + "(\\d{2}\\.\\d{2}\\.\\d{2}) (\\d{2}:\\d{2}:\\d{2}) (.*)" +); + +// fractal-matrix-api-3.29.0: r"mxc://(?P<server>[^/]+)/(?P<media>.+)" +consistent!(fractal_matrix_api_0, r"mxc://(?P<server>[^/]+)/(?P<media>.+)"); + +// smtp2go-0.1.4: r"^api-[a-zA-Z0-9]{32}$" +consistent!(smtp2go_0, r"^api-[a-zA-Z0-9]{32}$"); + +// pusher-0.3.1: r"^[-a-zA-Z0-9_=@,.;]+$" +consistent!(pusher_0, r"^[-a-zA-Z0-9_=@,.;]+$"); + +// pusher-0.3.1: r"\A\d+\.\d+\z" +consistent!(pusher_1, r"\A\d+\.\d+\z"); + +// bakervm-0.9.0: r"^\.(.+?) +?(.+)$" +consistent!(bakervm_0, r"^\.(.+?) +?(.+)$"); + +// bakervm-0.9.0: r"^\.([^\s]+)$" +consistent!(bakervm_1, r"^\.([^\s]+)$"); + +// bakervm-0.9.0: r"^include! +([^\s]+)$" +consistent!(bakervm_2, r"^include! 
+([^\s]+)$"); + +// bakervm-0.9.0: r"^@(\d+)$" +consistent!(bakervm_3, r"^@(\d+)$"); + +// bakervm-0.9.0: r"^true|false$" +consistent!(bakervm_4, r"^true|false$"); + +// bakervm-0.9.0: r"^(-?\d+)?\.[0-9]+$" +consistent!(bakervm_5, r"^(-?\d+)?\.[0-9]+$"); + +// bakervm-0.9.0: r"^(-?\d+)?$" +consistent!(bakervm_6, r"^(-?\d+)?$"); + +// bakervm-0.9.0: r"^#([0-9abcdefABCDEF]{6})$" +consistent!(bakervm_7, r"^#([0-9abcdefABCDEF]{6})$"); + +// bakervm-0.9.0: r"^'(.)'$" +consistent!(bakervm_8, r"^'(.)'$"); + +// bakervm-0.9.0: r"^\$vi\((\d+)\)$" +consistent!(bakervm_9, r"^\$vi\((\d+)\)$"); + +// bakervm-0.9.0: r"^\$key\((\d+)\)$" +consistent!(bakervm_10, r"^\$key\((\d+)\)$"); + +// banana-0.0.2: "(?P<type>[A-Z^']+) (?P<route>[^']+) HTTP/(?P<http>[^']+)" +consistent!( + banana_0, + "(?P<type>[A-Z^']+) (?P<route>[^']+) HTTP/(?P<http>[^']+)" +); + +// serial-key-2.0.0: r"[A-F0-9]{8}" +consistent!(serial_key_0, r"[A-F0-9]{8}"); + +// serde-hjson-0.8.1: "[\\\\\"\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]" +consistent!(serde_hjson_0, "[\\\\\"\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"); + +// serde-hjson-0.8.1: "[\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]" +consistent!(serde_hjson_1, "[\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"); + +// serde-hjson-0.8.1: "'''|[\x00-\x09\x0b\x0c\x0e-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]" +consistent!(serde_hjson_2, 
"'''|[\x00-\x09\x0b\x0c\x0e-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"); + +// serde-odbc-0.1.0: r"/todos/(?P<id>\d+)" +consistent!(serde_odbc_0, r"/todos/(?P<id>\d+)"); + +// sentry-0.6.0: r"^(?:_<)?([a-zA-Z0-9_]+?)(?:\.\.|::)" +consistent!(sentry_0, r"^(?:_<)?([a-zA-Z0-9_]+?)(?:\.\.|::)"); + +// sentiment-0.1.1: r"[^a-zA-Z0 -]+" +consistent!(sentiment_0, r"[^a-zA-Z0 -]+"); + +// sentiment-0.1.1: r" {2,}" +consistent!(sentiment_1, r" {2,}"); + +// verilog-0.0.1: r"(?m)//.*" +consistent!(verilog_0, r"(?m)//.*"); + +// verex-0.2.2: "(?P<robot>C3PO)" +consistent!(verex_0, "(?P<robot>C3PO)"); + +// handlebars-0.32.4: ">|<|\"|&" +consistent!(handlebars_0, ">|<|\"|&"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789]{4}$" +consistent!(haikunator_0, r"^\w+-\w+-[0123456789]{4}$"); + +// haikunator-0.1.2: r"^\w+@\w+@[0123456789]{4}$" +consistent!(haikunator_1, r"^\w+@\w+@[0123456789]{4}$"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789abcdef]{4}$" +consistent!(haikunator_2, r"^\w+-\w+-[0123456789abcdef]{4}$"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789忠犬ハチ公]{10}$" +consistent!(haikunator_3, r"^\w+-\w+-[0123456789忠犬ハチ公]{10}$"); + +// haikunator-0.1.2: r"^\w+-\w+$" +consistent!(haikunator_4, r"^\w+-\w+$"); + +// haikunator-0.1.2: r"^\w+-\w+-[foo]{4}$" +consistent!(haikunator_5, r"^\w+-\w+-[foo]{4}$"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789忠犬ハチ公]{5}$" +consistent!(haikunator_6, r"^\w+-\w+-[0123456789忠犬ハチ公]{5}$"); + +// bobbin-cli-0.8.3: r"(.*)" +consistent!(bobbin_cli_0, r"(.*)"); + +// bobbin-cli-0.8.3: r"rustc (.*)" +consistent!(bobbin_cli_1, r"rustc (.*)"); + +// bobbin-cli-0.8.3: r"cargo (.*)" +consistent!(bobbin_cli_2, r"cargo (.*)"); + +// bobbin-cli-0.8.3: r"xargo (.*)\n" +consistent!(bobbin_cli_3, r"xargo (.*)\n"); + +// bobbin-cli-0.8.3: r"Open On-Chip Debugger (.*)" +consistent!(bobbin_cli_4, r"Open On-Chip Debugger (.*)"); + +// bobbin-cli-0.8.3: 
r"arm-none-eabi-gcc \(GNU Tools for ARM Embedded Processors[^\)]*\) (.*)" +consistent!( + bobbin_cli_5, + r"arm-none-eabi-gcc \(GNU Tools for ARM Embedded Processors[^\)]*\) (.*)" +); + +// bobbin-cli-0.8.3: r"(?m).*\nBasic Open Source SAM-BA Application \(BOSSA\) Version (.*)\n" +consistent!( + bobbin_cli_6, + r"(?m).*\nBasic Open Source SAM-BA Application \(BOSSA\) Version (.*)\n" +); + +// bobbin-cli-0.8.3: r"(?m)SEGGER J-Link Commander (.*)\n" +consistent!(bobbin_cli_7, r"(?m)SEGGER J-Link Commander (.*)\n"); + +// bobbin-cli-0.8.3: r"(?m)Teensy Loader, Command Line, Version (.*)\n" +consistent!(bobbin_cli_8, r"(?m)Teensy Loader, Command Line, Version (.*)\n"); + +// bobbin-cli-0.8.3: r"dfu-util (.*)\n" +consistent!(bobbin_cli_9, r"dfu-util (.*)\n"); + +// borsholder-0.9.1: r"^/static/[\w.]+$" +consistent!(borsholder_0, r"^/static/[\w.]+$"); + +// borsholder-0.9.1: r"^/timeline/([0-9]+)$" +consistent!(borsholder_1, r"^/timeline/([0-9]+)$"); + +// fblog-1.0.1: "\u{001B}\\[[\\d;]*[^\\d;]" +consistent!(fblog_0, "\u{001B}\\[[\\d;]*[^\\d;]"); + +// fblog-1.0.1: "\u{001B}\\[[\\d;]*[^\\d;]" +consistent!(fblog_1, "\u{001B}\\[[\\d;]*[^\\d;]"); + +// toml-query-0.6.0: r"^\[\d+\]$" +consistent!(toml_query_0, r"^\[\d+\]$"); + +// todo-txt-1.1.0: r" (?P<key>[^\s]+):(?P<value>[^\s^/]+)" +consistent!(todo_txt_0, r" (?P<key>[^\s]+):(?P<value>[^\s^/]+)"); + +// findr-0.1.5: r"\band\b" +consistent!(findr_0, r"\band\b"); + +// findr-0.1.5: r"\bor\b" +consistent!(findr_1, r"\bor\b"); + +// findr-0.1.5: r"\bnot\b" +consistent!(findr_2, r"\bnot\b"); + +// file-sniffer-3.0.1: r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(file_sniffer_0, 
r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// file-sniffer-3.0.1: r".*?\.(stats|conf|h|cache.*|dat|pc|info)$" +consistent!(file_sniffer_1, r".*?\.(stats|conf|h|cache.*|dat|pc|info)$"); + +// file-sniffer-3.0.1: r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(file_sniffer_2, r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// file-sniffer-3.0.1: r".*?\.(stats|conf|h|cache.*)$" +consistent!(file_sniffer_3, r".*?\.(stats|conf|h|cache.*)$"); + +// file-sniffer-3.0.1: r"(\.git|\.pijul|_darcs|\.hg)$" +consistent!(file_sniffer_4, r"(\.git|\.pijul|_darcs|\.hg)$"); + +// file_logger-0.1.0: "test" +consistent!(file_logger_0, "test"); + +// file_scanner-0.2.0: r"foo" +consistent!(file_scanner_0, r"foo"); + +// file_scanner-0.2.0: r"a+b" +consistent!(file_scanner_1, r"a+b"); + +// file_scanner-0.2.0: r"a[ab]*b" +consistent!(file_scanner_2, r"a[ab]*b"); + +// file_scanner-0.2.0: r"\s+" +consistent!(file_scanner_3, r"\s+"); + +// file_scanner-0.2.0: r"\s+" +consistent!(file_scanner_4, r"\s+"); + +// cellsplit-0.2.1: r"^\s*([^\s]+) %cellsplit<\d+>$" +consistent!(cellsplit_0, r"^\s*([^\s]+) %cellsplit<\d+>$"); + +// cellsplit-0.2.1: r"^\s*([^\s]+) %cellsplit<\d+>$" +consistent!(cellsplit_1, r"^\s*([^\s]+) %cellsplit<\d+>$"); + +// aterm-0.20.0: r"^[+\-]?[0-9]+" +consistent!(aterm_0, r"^[+\-]?[0-9]+"); + +// aterm-0.20.0: r"^[+\-]?[0-9]+\.[0-9]*([eE][+\-]?[0-9]+)?" 
+consistent!(aterm_1, r"^[+\-]?[0-9]+\.[0-9]*([eE][+\-]?[0-9]+)?"); + +// atarashii_imap-0.3.0: r"^[*] OK" +consistent!(atarashii_imap_0, r"^[*] OK"); + +// atarashii_imap-0.3.0: r"FLAGS\s\((.+)\)" +consistent!(atarashii_imap_1, r"FLAGS\s\((.+)\)"); + +// atarashii_imap-0.3.0: r"\[PERMANENTFLAGS\s\((.+)\)\]" +consistent!(atarashii_imap_2, r"\[PERMANENTFLAGS\s\((.+)\)\]"); + +// atarashii_imap-0.3.0: r"\[UIDVALIDITY\s(\d+)\]" +consistent!(atarashii_imap_3, r"\[UIDVALIDITY\s(\d+)\]"); + +// atarashii_imap-0.3.0: r"(\d+)\sEXISTS" +consistent!(atarashii_imap_4, r"(\d+)\sEXISTS"); + +// atarashii_imap-0.3.0: r"(\d+)\sRECENT" +consistent!(atarashii_imap_5, r"(\d+)\sRECENT"); + +// atarashii_imap-0.3.0: r"\[UNSEEN\s(\d+)\]" +consistent!(atarashii_imap_6, r"\[UNSEEN\s(\d+)\]"); + +// atarashii_imap-0.3.0: r"\[UIDNEXT\s(\d+)\]" +consistent!(atarashii_imap_7, r"\[UIDNEXT\s(\d+)\]"); + +// editorconfig-1.0.0: r"\\(\{|\})" +consistent!(editorconfig_0, r"\\(\{|\})"); + +// editorconfig-1.0.0: r"(^|[^\\])\\\|" +consistent!(editorconfig_1, r"(^|[^\\])\\\|"); + +// editorconfig-1.0.0: r"\[([^\]]*)$" +consistent!(editorconfig_2, r"\[([^\]]*)$"); + +// editorconfig-1.0.0: r"\[(.*/.*)\]" +consistent!(editorconfig_3, r"\[(.*/.*)\]"); + +// editorconfig-1.0.0: r"\{(-?\d+\\\.\\\.-?\d+)\}" +consistent!(editorconfig_4, r"\{(-?\d+\\\.\\\.-?\d+)\}"); + +// editorconfig-1.0.0: r"\{([^,]+)\}" +consistent!(editorconfig_5, r"\{([^,]+)\}"); + +// editorconfig-1.0.0: r"\{(([^\}].*)?(,|\|)(.*[^\\])?)\}" +consistent!(editorconfig_6, r"\{(([^\}].*)?(,|\|)(.*[^\\])?)\}"); + +// editorconfig-1.0.0: r"^/" +consistent!(editorconfig_7, r"^/"); + +// editorconfig-1.0.0: r"(^|[^\\])(\{|\})" +consistent!(editorconfig_8, r"(^|[^\\])(\{|\})"); + +// edmunge-1.0.0: "^#!.*\n" +consistent!(edmunge_0, "^#!.*\n"); + +// unicode_names2_macros-0.2.0: r"\\N\{(.*?)(?:\}|$)" +consistent!(unicode_names2_macros_0, r"\\N\{(.*?)(?:\}|$)"); + +// unidiff-0.2.1: r"^--- (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" 
+consistent!( + unidiff_0, + r"^--- (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +); + +// unidiff-0.2.1: r"^\+\+\+ (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +consistent!( + unidiff_1, + r"^\+\+\+ (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +); + +// unidiff-0.2.1: r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)" +consistent!(unidiff_2, r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)"); + +// unidiff-0.2.1: r"^(?P<line_type>[- \n\+\\]?)(?P<value>.*)" +consistent!(unidiff_3, r"^(?P<line_type>[- \n\+\\]?)(?P<value>.*)"); + +// slippy-map-tiles-0.13.1: "/?(?P<zoom>[0-9]?[0-9])/(?P<x>[0-9]{1,10})/(?P<y>[0-9]{1,10})(\\.[a-zA-Z]{3,4})?$" +consistent!(slippy_map_tiles_0, "/?(?P<zoom>[0-9]?[0-9])/(?P<x>[0-9]{1,10})/(?P<y>[0-9]{1,10})(\\.[a-zA-Z]{3,4})?$"); + +// slippy-map-tiles-0.13.1: r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$" +consistent!(slippy_map_tiles_1, r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) 
(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$"); + +// slippy-map-tiles-0.13.1: r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$" +consistent!(slippy_map_tiles_2, r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$"); + +// sonos-0.1.2: r"^https?://(.+?):1400/xml" +consistent!(sonos_0, r"^https?://(.+?):1400/xml"); + +// validator_derive-0.7.0: r"^[a-z]{2}$" +consistent!(validator_derive_0, r"^[a-z]{2}$"); + +// validator_derive-0.7.0: r"[a-z]{2}" +consistent!(validator_derive_1, r"[a-z]{2}"); + +// validator_derive-0.7.0: r"[a-z]{2}" +consistent!(validator_derive_2, r"[a-z]{2}"); + +// nginx-config-0.8.0: r"one of \d+ options" +consistent!(nginx_config_0, r"one of \d+ options"); + +// waltz-0.4.0: r"[\s,]" +consistent!(waltz_0, r"[\s,]"); + +// warheadhateus-0.2.1: r"^aws_access_key_id = (.*)" +consistent!(warheadhateus_0, r"^aws_access_key_id = (.*)"); + +// warheadhateus-0.2.1: r"^aws_secret_access_key = (.*)" +consistent!(warheadhateus_1, r"^aws_secret_access_key = (.*)"); + +// warheadhateus-0.2.1: r"^aws_access_key_id = (.*)" +consistent!(warheadhateus_2, r"^aws_access_key_id = (.*)"); + +// warheadhateus-0.2.1: r"^aws_secret_access_key = (.*)" +consistent!(warheadhateus_3, r"^aws_secret_access_key = (.*)"); + +// jieba-rs-0.2.2: r"([\u{4E00}-\u{9FD5}a-zA-Z0-9+#&\._%]+)" +consistent!(jieba_rs_0, r"([\u{4E00}-\u{9FD5}a-zA-Z0-9+#&\._%]+)"); + +// jieba-rs-0.2.2: r"(\r\n|\s)" +consistent!(jieba_rs_1, r"(\r\n|\s)"); + +// jieba-rs-0.2.2: "([\u{4E00}-\u{9FD5}]+)" +consistent!(jieba_rs_2, "([\u{4E00}-\u{9FD5}]+)"); + +// jieba-rs-0.2.2: r"[^a-zA-Z0-9+#\n]" +consistent!(jieba_rs_3, r"[^a-zA-Z0-9+#\n]"); + +// jieba-rs-0.2.2: r"([\u{4E00}-\u{9FD5}]+)" +consistent!(jieba_rs_4, r"([\u{4E00}-\u{9FD5}]+)"); + +// jieba-rs-0.2.2: 
r"([a-zA-Z0-9]+(?:.\d+)?%?)" +consistent!(jieba_rs_5, r"([a-zA-Z0-9]+(?:.\d+)?%?)"); + +// lalrpop-0.15.2: r"Span\([0-9 ,]*\)" +consistent!(lalrpop_0, r"Span\([0-9 ,]*\)"); + +// lalrpop-snap-0.15.2: r"Span\([0-9 ,]*\)" +consistent!(lalrpop_snap_0, r"Span\([0-9 ,]*\)"); + +// nlp-tokenize-0.1.0: r"[\S]+" +consistent!(nlp_tokenize_0, r"[\S]+"); + +// kbgpg-0.1.2: "[[:xdigit:]][70]" +consistent!(kbgpg_0, "[[:xdigit:]][70]"); + +// cdbd-0.1.1: r"^((?P<address>.*):)?(?P<port>\d+)$" +consistent!(cdbd_0, r"^((?P<address>.*):)?(?P<port>\d+)$"); + +// mbutiles-0.1.1: r"[\w\s=+-/]+\((\{(.|\n)*\})\);?" +consistent!(mbutiles_0, r"[\w\s=+-/]+\((\{(.|\n)*\})\);?"); + +// extrahop-0.2.5: r"^-\d+(?:ms|s|m|h|d|w|y)?$" +consistent!(extrahop_0, r"^-\d+(?:ms|s|m|h|d|w|y)?$"); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$" +consistent!(pippin_0, "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$"); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +consistent!( + pippin_1, + "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$" +consistent!(pippin_2, "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$"); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +consistent!( + pippin_3, + "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +); + +// pippin-0.1.0: "^.*pn(0|[1-9][0-9]*)(-ss(0|[1-9][0-9]*)(\\.pip|-cl(0|[1-9][0-9]*)\\.piplog))?$" +consistent!(pippin_4, "^.*pn(0|[1-9][0-9]*)(-ss(0|[1-9][0-9]*)(\\.pip|-cl(0|[1-9][0-9]*)\\.piplog))?$"); + +// pippin-0.1.0: "^(.*)-ss(?:0|[1-9][0-9]*)(?:\\.pip|-cl(?:0|[1-9][0-9]*)\\.piplog)$" +consistent!( + pippin_5, + "^(.*)-ss(?:0|[1-9][0-9]*)(?:\\.pip|-cl(?:0|[1-9][0-9]*)\\.piplog)$" +); + +// pinyin-0.3.0: r"(?i)[āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň]" +consistent!( + pinyin_0, + r"(?i)[āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň]" +); + +// pinyin-0.3.0: r"([aeoiuvnm])([0-4])$" +consistent!(pinyin_1, r"([aeoiuvnm])([0-4])$"); + +// duration-parser-0.2.0: 
r"(?P<value>\d+)(?P<units>[a-z])" +consistent!(duration_parser_0, r"(?P<value>\d+)(?P<units>[a-z])"); + +// dutree-0.2.7: r"^\d+\D?$" +consistent!(dutree_0, r"^\d+\D?$"); + +// djangohashers-0.3.0: r"^[A-Za-z0-9]*$" +consistent!(djangohashers_0, r"^[A-Za-z0-9]*$"); + +// rtag-0.3.5: r"^[A-Z][A-Z0-9]{2,}$" +consistent!(rtag_0, r"^[A-Z][A-Z0-9]{2,}$"); + +// rtag-0.3.5: r"^http://www\.emusic\.com" +consistent!(rtag_1, r"^http://www\.emusic\.com"); + +// rtag-0.3.5: r"^[A-Z][A-Z0-9]{2,}" +consistent!(rtag_2, r"^[A-Z][A-Z0-9]{2,}"); + +// rtag-0.3.5: r"(^[\x{0}|\x{feff}|\x{fffe}]*|[\x{0}|\x{feff}|\x{fffe}]*$)" +consistent!( + rtag_3, + r"(^[\x{0}|\x{feff}|\x{fffe}]*|[\x{0}|\x{feff}|\x{fffe}]*$)" +); + +// rtow-0.1.0: r"(\d+)[xX](\d+)" +consistent!(rtow_0, r"(\d+)[xX](\d+)"); + +// pleingres-sql-plugin-0.1.0: r"\$([a-zA-Z0-9_]+)" +consistent!(pleingres_sql_plugin_0, r"\$([a-zA-Z0-9_]+)"); + +// dono-2.0.0: "[\\n]+" +consistent!(dono_0, "[\\n]+"); + +// dono-2.0.0: "(?m)^\\n" +consistent!(dono_1, "(?m)^\\n"); + +// dono-2.0.0: "(?m)^\\n" +consistent!(dono_2, "(?m)^\\n"); + +// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{43}=\.ed25519$" +consistent!(ssb_common_0, r"^[0-9A-Za-z\+/]{43}=\.ed25519$"); + +// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{86}==\.ed25519$" +consistent!(ssb_common_1, r"^[0-9A-Za-z\+/]{86}==\.ed25519$"); + +// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{43}=\.sha256$" +consistent!(ssb_common_2, r"^[0-9A-Za-z\+/]{43}=\.sha256$"); + +// mozversion-0.1.3: r"^(?P<major>\d+)\.(?P<minor>\d+)(?:\.(?P<patch>\d+))?(?:(?P<pre0>[a-z]+)(?P<pre1>\d*))?$" +consistent!(mozversion_0, r"^(?P<major>\d+)\.(?P<minor>\d+)(?:\.(?P<patch>\d+))?(?:(?P<pre0>[a-z]+)(?P<pre1>\d*))?$"); + +// monger-0.5.6: r"^(\d+)\.(\d+)$" +consistent!(monger_0, r"^(\d+)\.(\d+)$"); + +// mongo_rub-0.0.2: r"^[rv]2\.6" +consistent!(mongo_rub_0, r"^[rv]2\.6"); + +// flow-0.3.5: "body value" +consistent!(flow_0, "body value"); + +// flow-0.3.5: "start marker" +consistent!(flow_1, "start marker"); + +// flow-0.3.5: "end 
marker" +consistent!(flow_2, "end marker"); + +// flow-0.3.5: "body value" +consistent!(flow_3, "body value"); + +// vobsub-0.2.3: "^([A-Za-z/ ]+): (.*)" +consistent!(vobsub_0, "^([A-Za-z/ ]+): (.*)"); + +// voidmap-1.1.2: r"#([^\s=]+)*" +consistent!(voidmap_0, r"#([^\s=]+)*"); + +// voidmap-1.1.2: r"#(\S+)*" +consistent!(voidmap_1, r"#(\S+)*"); + +// voidmap-1.1.2: r"#prio=(\d+)" +consistent!(voidmap_2, r"#prio=(\d+)"); + +// voidmap-1.1.2: r"\[(\S+)\]" +consistent!(voidmap_3, r"\[(\S+)\]"); + +// voidmap-1.1.2: r"#limit=(\d+)" +consistent!(voidmap_4, r"#limit=(\d+)"); + +// voidmap-1.1.2: r"#tagged=(\S+)" +consistent!(voidmap_5, r"#tagged=(\S+)"); + +// voidmap-1.1.2: r"#rev\b" +consistent!(voidmap_6, r"#rev\b"); + +// voidmap-1.1.2: r"#done\b" +consistent!(voidmap_7, r"#done\b"); + +// voidmap-1.1.2: r"#open\b" +consistent!(voidmap_8, r"#open\b"); + +// voidmap-1.1.2: r"#since=(\S+)" +consistent!(voidmap_9, r"#since=(\S+)"); + +// voidmap-1.1.2: r"#until=(\S+)" +consistent!(voidmap_10, r"#until=(\S+)"); + +// voidmap-1.1.2: r"#plot=(\S+)" +consistent!(voidmap_11, r"#plot=(\S+)"); + +// voidmap-1.1.2: r"#n=(\d+)" +consistent!(voidmap_12, r"#n=(\d+)"); + +// voidmap-1.1.2: r"(\S+)" +consistent!(voidmap_13, r"(\S+)"); + +// voidmap-1.1.2: r"(?P<y>\d+)y" +consistent!(voidmap_14, r"(?P<y>\d+)y"); + +// voidmap-1.1.2: r"(?P<m>\d+)m" +consistent!(voidmap_15, r"(?P<m>\d+)m"); + +// voidmap-1.1.2: r"(?P<w>\d+)w" +consistent!(voidmap_16, r"(?P<w>\d+)w"); + +// voidmap-1.1.2: r"(?P<d>\d+)d" +consistent!(voidmap_17, r"(?P<d>\d+)d"); + +// voidmap-1.1.2: r"(?P<h>\d+)h" +consistent!(voidmap_18, r"(?P<h>\d+)h"); + +// voidmap-1.1.2: r"C-(.)" +consistent!(voidmap_19, r"C-(.)"); + +// qt_generator-0.2.0: r"^\.\./qt[^/]+/" +consistent!(qt_generator_0, r"^\.\./qt[^/]+/"); + +// qt_generator-0.2.0: "(href|src)=\"([^\"]*)\"" +consistent!(qt_generator_1, "(href|src)=\"([^\"]*)\""); + +// kryptos-0.6.1: r"[01]{5}" +consistent!(kryptos_0, r"[01]{5}"); + +// cifar_10_loader-0.2.0: 
"data_batch_[1-5].bin" +consistent!(cifar_10_loader_0, "data_batch_[1-5].bin"); + +// cifar_10_loader-0.2.0: "test_batch.bin" +consistent!(cifar_10_loader_1, "test_batch.bin"); + +// circadian-0.6.0: r"^\d+.\d+s$" +consistent!(circadian_0, r"^\d+.\d+s$"); + +// circadian-0.6.0: r"^\d+:\d+$" +consistent!(circadian_1, r"^\d+:\d+$"); + +// circadian-0.6.0: r"^\d+:\d+m$" +consistent!(circadian_2, r"^\d+:\d+m$"); + +// cicada-0.8.1: r"!!" +consistent!(cicada_0, r"!!"); + +// cicada-0.8.1: r"^([^`]*)`([^`]+)`(.*)$" +consistent!(cicada_1, r"^([^`]*)`([^`]+)`(.*)$"); + +// cicada-0.8.1: r"\*+" +consistent!(cicada_2, r"\*+"); + +// cicada-0.8.1: r"([^\$]*)\$\{?([A-Za-z0-9\?\$_]+)\}?(.*)" +consistent!(cicada_3, r"([^\$]*)\$\{?([A-Za-z0-9\?\$_]+)\}?(.*)"); + +// cicada-0.8.1: r"^ *alias +([a-zA-Z0-9_\.-]+)=(.*)$" +consistent!(cicada_4, r"^ *alias +([a-zA-Z0-9_\.-]+)=(.*)$"); + +// vterm-sys-0.1.0: r"hi" +consistent!(vterm_sys_0, r"hi"); + +// skim-0.5.0: r".*?\t" +consistent!(skim_0, r".*?\t"); + +// skim-0.5.0: r".*?[\t ]" +consistent!(skim_1, r".*?[\t ]"); + +// skim-0.5.0: r"(\{-?[0-9.,q]*?})" +consistent!(skim_2, r"(\{-?[0-9.,q]*?})"); + +// skim-0.5.0: r"[ \t\n]+" +consistent!(skim_3, r"[ \t\n]+"); + +// skim-0.5.0: r"[ \t\n]+" +consistent!(skim_4, r"[ \t\n]+"); + +// skim-0.5.0: r"([^ |]+( +\| +[^ |]*)+)|( +)" +consistent!(skim_5, r"([^ |]+( +\| +[^ |]*)+)|( +)"); + +// skim-0.5.0: r" +\| +" +consistent!(skim_6, r" +\| +"); + +// skim-0.5.0: r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$" +consistent!(skim_7, r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$"); + +// skim-0.5.0: "," +consistent!(skim_8, ","); + +// skim-0.5.0: ".*?," +consistent!(skim_9, ".*?,"); + +// skim-0.5.0: ".*?," +consistent!(skim_10, ".*?,"); + +// skim-0.5.0: "," +consistent!(skim_11, ","); + +// skim-0.5.0: r"\x1B\[(?:([0-9]+;[0-9]+[Hf])|([0-9]+[ABCD])|(s|u|2J|K)|([0-9;]*m)|(=[0-9]+[hI]))" +consistent!(skim_12, 
r"\x1B\[(?:([0-9]+;[0-9]+[Hf])|([0-9]+[ABCD])|(s|u|2J|K)|([0-9;]*m)|(=[0-9]+[hI]))"); + +// egg-mode-text-1.14.7: r"[-_./]\z" +consistent!(egg_mode_text_0, r"[-_./]\z"); + +// java-properties-1.1.1: "^[ \t\r\n\x0c]*[#!]" +consistent!(java_properties_0, "^[ \t\r\n\x0c]*[#!]"); + +// java-properties-1.1.1: r"^[ \t\x0c]*[#!][^\r\n]*$" +consistent!(java_properties_1, r"^[ \t\x0c]*[#!][^\r\n]*$"); + +// java-properties-1.1.1: r"^([ \t\x0c]*[:=][ \t\x0c]*|[ \t\x0c]+)$" +consistent!(java_properties_2, r"^([ \t\x0c]*[:=][ \t\x0c]*|[ \t\x0c]+)$"); + +// ipaddress-0.1.2: r":.+\." +consistent!(ipaddress_0, r":.+\."); + +// ipaddress-0.1.2: r"\." +consistent!(ipaddress_1, r"\."); + +// ipaddress-0.1.2: r":" +consistent!(ipaddress_2, r":"); + +// iptables-0.2.2: r"v(\d+)\.(\d+)\.(\d+)" +consistent!(iptables_0, r"v(\d+)\.(\d+)\.(\d+)"); + +// rsure-0.8.1: r"^([^-]+)-(.*)\.dat\.gz$" +consistent!(rsure_0, r"^([^-]+)-(.*)\.dat\.gz$"); + +// rs-jsonpath-0.1.0: "^(.*?)(<=|<|==|>=|>)(.*?)$" +consistent!(rs_jsonpath_0, "^(.*?)(<=|<|==|>=|>)(.*?)$"); + +// oatie-0.3.0: r"(\n|^)(\w+):([\n\w\W]+?)(\n(?:\w)|(\n\]))" +consistent!(oatie_0, r"(\n|^)(\w+):([\n\w\W]+?)(\n(?:\w)|(\n\]))"); + +// weld-0.2.0: "#.*$" +consistent!(weld_0, "#.*$"); + +// weld-0.2.0: r"^[A-Za-z$_][A-Za-z0-9$_]*$" +consistent!(weld_1, r"^[A-Za-z$_][A-Za-z0-9$_]*$"); + +// weld-0.2.0: r"^[0-9]+[cC]$" +consistent!(weld_2, r"^[0-9]+[cC]$"); + +// weld-0.2.0: r"^0b[0-1]+[cC]$" +consistent!(weld_3, r"^0b[0-1]+[cC]$"); + +// weld-0.2.0: r"^0x[0-9a-fA-F]+[cC]$" +consistent!(weld_4, r"^0x[0-9a-fA-F]+[cC]$"); + +// weld-0.2.0: r"^[0-9]+$" +consistent!(weld_5, r"^[0-9]+$"); + +// weld-0.2.0: r"^0b[0-1]+$" +consistent!(weld_6, r"^0b[0-1]+$"); + +// weld-0.2.0: r"^0x[0-9a-fA-F]+$" +consistent!(weld_7, r"^0x[0-9a-fA-F]+$"); + +// weld-0.2.0: r"^[0-9]+[lL]$" +consistent!(weld_8, r"^[0-9]+[lL]$"); + +// weld-0.2.0: r"^0b[0-1]+[lL]$" +consistent!(weld_9, r"^0b[0-1]+[lL]$"); + +// weld-0.2.0: r"^0x[0-9a-fA-F]+[lL]$" 
+consistent!(weld_10, r"^0x[0-9a-fA-F]+[lL]$"); + +// webgl_generator-0.1.0: "([(, ])enum\\b" +consistent!(webgl_generator_0, "([(, ])enum\\b"); + +// webgl_generator-0.1.0: "\\bAcquireResourcesCallback\\b" +consistent!(webgl_generator_1, "\\bAcquireResourcesCallback\\b"); + +// weave-0.2.0: r"^(\d+)(,(\d+))?([acd]).*$" +consistent!(weave_0, r"^(\d+)(,(\d+))?([acd]).*$"); + +// wemo-0.0.12: r"<BinaryState>(\d)(\|-?\d+)*</BinaryState>" +consistent!(wemo_0, r"<BinaryState>(\d)(\|-?\d+)*</BinaryState>"); + +// webscale-0.9.4: r"(http[s]?://[^\s]+)" +consistent!(webscale_0, r"(http[s]?://[^\s]+)"); + +// svgrep-1.1.0: r"^\d+.*$" +consistent!(svgrep_0, r"^\d+.*$"); + +// ignore-0.4.2: r"^[\pL\pN]+$" +consistent!(ignore_0, r"^[\pL\pN]+$"); + +// ommui_string_patterns-0.1.2: r"^([A-Za-z][0-9A-Za-z_]*)?$" +consistent!(ommui_string_patterns_0, r"^([A-Za-z][0-9A-Za-z_]*)?$"); + +// ommui_string_patterns-0.1.2: r"^(\S+(?:.*\S)?)?$" +consistent!(ommui_string_patterns_1, r"^(\S+(?:.*\S)?)?$"); + +// opcua-types-0.3.0: "^(?P<min>[0-9]{1,10})(:(?P<max>[0-9]{1,10}))?$" +consistent!(opcua_types_0, "^(?P<min>[0-9]{1,10})(:(?P<max>[0-9]{1,10}))?$"); + +// opcua-types-0.3.0: r"^(ns=(?P<ns>[0-9]+);)?(?P<t>[isgb])=(?P<v>.+)$" +consistent!(opcua_types_1, r"^(ns=(?P<ns>[0-9]+);)?(?P<t>[isgb])=(?P<v>.+)$"); + +// open_read_later-1.1.1: r"^(.+?)\s*:\s*(.+)$" +consistent!(open_read_later_0, r"^(.+?)\s*:\s*(.+)$"); + +// youtube-downloader-0.1.0: r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*" +consistent!(youtube_downloader_0, r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*"); + +// yobot-0.1.1: "." +consistent!(yobot_0, "."); + +// yobot-0.1.1: r"." +consistent!(yobot_1, r"."); + +// yobot-0.1.1: r".+" +consistent!(yobot_2, r".+"); + +// yobot-0.1.1: r"." 
+consistent!(yobot_3, r"."); + +// ubiquity-0.1.5: r"foo" +consistent!(ubiquity_0, r"foo"); + +// ubiquity-0.1.5: r"/target/" +consistent!(ubiquity_1, r"/target/"); + +// ubiquity-0.1.5: r".DS_Store" +consistent!(ubiquity_2, r".DS_Store"); + +// qasm-1.0.0: r"//.*" +consistent!(qasm_0, r"//.*"); + +// drill-0.3.5: r"\{\{ *([a-z\._]+) *\}\}" +consistent!(drill_0, r"\{\{ *([a-z\._]+) *\}\}"); + +// queryst-2.0.0: r"^([^\]\[]+)" +consistent!(queryst_0, r"^([^\]\[]+)"); + +// queryst-2.0.0: r"(\[[^\]\[]*\])" +consistent!(queryst_1, r"(\[[^\]\[]*\])"); + +// qui-vive-0.1.0: r"^/(\w+)$" +consistent!(qui_vive_0, r"^/(\w+)$"); + +// qui-vive-0.1.0: r"^/key$" +consistent!(qui_vive_1, r"^/key$"); + +// qui-vive-0.1.0: r"^/key/(\w+)$" +consistent!(qui_vive_2, r"^/key/(\w+)$"); + +// qui-vive-0.1.0: r"^/url$" +consistent!(qui_vive_3, r"^/url$"); + +// qui-vive-0.1.0: r"^/url/(\w+)$" +consistent!(qui_vive_4, r"^/url/(\w+)$"); + +// qui-vive-0.1.0: r"^/inv$" +consistent!(qui_vive_5, r"^/inv$"); + +// qui-vive-0.1.0: r"^/inv/(\w+)$" +consistent!(qui_vive_6, r"^/inv/(\w+)$"); + +// subdiff-0.1.0: r"\b" +// consistent!(subdiff_0, r"\b"); + +// substudy-0.4.5: r"^(\d+)/(\d+)$" +consistent!(substudy_0, r"^(\d+)/(\d+)$"); + +// substudy-0.4.5: r"\s+" +consistent!(substudy_1, r"\s+"); + +// substudy-0.4.5: r"<[a-z/][^>]*>" +consistent!(substudy_2, r"<[a-z/][^>]*>"); + +// substudy-0.4.5: r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)" +consistent!(substudy_3, r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)"); + +// substudy-0.4.5: r"\s+" +consistent!(substudy_4, r"\s+"); + +// isbnid-0.1.3: r"^(\d(-| )?){9}(x|X|\d|(\d(-| )?){3}\d)$" +consistent!(isbnid_0, r"^(\d(-| )?){9}(x|X|\d|(\d(-| )?){3}\d)$"); + +// isbnid-0.1.3: r"[^0-9X]" +consistent!(isbnid_1, r"[^0-9X]"); + +// ispc-0.3.5: r"Intel\(r\) SPMD Program Compiler \(ispc\), (\d+\.\d+\.\d+)" +consistent!( + ispc_0, + r"Intel\(r\) SPMD Program Compiler \(ispc\), (\d+\.\d+\.\d+)" +); diff --git a/regex-1.8.4/tests/crazy.rs b/regex-1.8.4/tests/crazy.rs new file 
mode 100644 index 0000000000000..293ac1ae7280d --- /dev/null +++ b/regex-1.8.4/tests/crazy.rs @@ -0,0 +1,459 @@ +mat!(ascii_literal, r"a", "a", Some((0, 1))); + +// Some crazy expressions from regular-expressions.info. +mat!( + match_ranges, + r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 255", + Some((5, 8)) +); +mat!( + match_ranges_not, + r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 256", + None +); +mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))); +mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))); +mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))); +mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None); +mat!( + match_email, + r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail.com ", + Some((8, 26)) +); +mat!( + match_email_not, + r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail ", + None +); +mat!( + match_email_big, + r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", + "mine is jam.slam@gmail.com ", + Some((8, 26)) +); +mat!( + match_date1, + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-01-01", + Some((0, 10)) +); +mat!( + match_date2, + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-00-01", + None +); +mat!( + match_date3, + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-13-01", + None +); + +// Do some crazy dancing with the start/end assertions. 
+matiter!(match_start_end_empty, r"^$", "", (0, 0)); +matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0)); +matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0)); +matiter!(match_start_end_empty_rev, r"$^", "", (0, 0)); +matiter!( + match_start_end_empty_rep, + r"(?:^$)*", + "a\nb\nc", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); +matiter!( + match_start_end_empty_rep_rev, + r"(?:$^)*", + "a\nb\nc", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); + +// Test negated character classes. +mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3))); +mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3))); +mat!(negclass_letter_space, r"[^a[:space:]]", "a x", Some((2, 3))); +mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3))); +mat!(negclass_space, r"[^[:space:]]", " a", Some((1, 2))); +mat!(negclass_space_comma, r"[^,[:space:]]", ", a", Some((2, 3))); +mat!(negclass_comma_space, r"[^[:space:],]", " ,a", Some((2, 3))); +mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2))); + +// Test that repeated empty expressions don't loop forever. +mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2))); +mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2))); +mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2))); +mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2))); +mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2))); +mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2))); +mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2))); +mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2))); +mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2))); +mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2))); +mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2))); +mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2))); + +// Test that we handle various flavors of empty expressions. 
+matiter!(match_empty1, r"", "", (0, 0)); +matiter!(match_empty2, r"", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty3, r"()", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty4, r"()*", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty5, r"()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty6, r"()?", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty7, r"()()", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty8, r"()+|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty9, r"z|()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty10, r"()+|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty12, r"|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty13, r"b|", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty14, r"|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty15, r"z|", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty16, r"|", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty17, r"||", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty18, r"||z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty19, r"(?:)|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty20, r"b|(?:)", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty21, r"(?:|)", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty22, r"(?:|)|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2)); + +// Test that the DFA can handle pathological cases. +// (This should result in the DFA's cache being flushed too frequently, which +// should cause it to quit and fall back to the NFA algorithm.) 
+#[test] +fn dfa_handles_pathological_case() { + fn ones_and_zeroes(count: usize) -> String { + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + + let mut rng = SmallRng::from_entropy(); + let mut s = String::new(); + for _ in 0..count { + if rng.gen() { + s.push('1'); + } else { + s.push('0'); + } + } + s + } + + let re = regex!(r"[01]*1[01]{20}$"); + let text = { + let mut pieces = ones_and_zeroes(100_000); + pieces.push('1'); + pieces.push_str(&ones_and_zeroes(20)); + pieces + }; + assert!(re.is_match(text!(&*text))); +} + +#[test] +fn nest_limit_makes_it_parse() { + use regex::RegexBuilder; + + RegexBuilder::new( + r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? 
+ ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? 
+ )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#, + ) + .build() + .unwrap(); +} diff --git a/regex-1.8.4/tests/flags.rs b/regex-1.8.4/tests/flags.rs new file mode 100644 index 0000000000000..c33b82d43472b --- /dev/null +++ b/regex-1.8.4/tests/flags.rs @@ -0,0 +1,31 @@ +mat!(match_flag_case, "(?-u)(?i)abc", "ABC", Some((0, 3))); +mat!(match_flag_weird_case, "(?-u)(?i)a(?-i)bc", "Abc", Some((0, 3))); +mat!(match_flag_weird_case_not, "(?-u)(?i)a(?-i)bc", "ABC", None); +mat!(match_flag_case_dotnl, "(?-u)(?is)a(?u:.)", "A\n", Some((0, 2))); +mat!( + match_flag_case_dotnl_toggle, + "(?-u)(?is)a(?u:.)(?-is)a(?u:.)", + "A\nab", + Some((0, 4)) +); +mat!( + match_flag_case_dotnl_toggle_not, + "(?-u)(?is)a(?u:.)(?-is)a(?u:.)", + "A\na\n", + None +); +mat!( + match_flag_case_dotnl_toggle_ok, + "(?-u)(?is)a(?u:.)(?-is:a(?u:.))?", + "A\na\n", + Some((0, 2)) +); +mat!( + match_flag_multi, + r"(?-u)(?m)(?:^\d+$\n?)+", + "123\n456\n789", + Some((0, 11)) +); +mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))); +mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))); +mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))); diff --git a/regex-1.8.4/tests/fowler.rs b/regex-1.8.4/tests/fowler.rs new file mode 100644 index 0000000000000..7f56a758d3c6e --- /dev/null +++ b/regex-1.8.4/tests/fowler.rs @@ -0,0 +1,1588 @@ +// DO NOT EDIT. Automatically generated by 'scripts/regex-match-tests.py' +// on 2019-09-02 11:07:37.849994. 
+ +// Tests from basic.dat +mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))); +mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7))); +mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8))); +mat!(match_basic_6, r"\)", r"()", Some((1, 2))); +mat!(match_basic_7, r"a]", r"a]a", Some((0, 2))); +mat!(match_basic_9, r"\}", r"}", Some((0, 1))); +mat!(match_basic_10, r"\]", r"]", Some((0, 1))); +mat!(match_basic_12, r"]", r"]", Some((0, 1))); +mat!(match_basic_15, r"^a", r"ax", Some((0, 1))); +mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3))); +mat!(match_basic_17, r"a\^", r"a^", Some((0, 2))); +mat!(match_basic_18, r"a$", r"aa", Some((1, 2))); +mat!(match_basic_19, r"a\$", r"a$", Some((0, 2))); +mat!(match_basic_20, r"^$", r"", Some((0, 0))); +mat!(match_basic_21, r"$^", r"", Some((0, 0))); +mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2))); +mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1))); +mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0))); +mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4))); +mat!( + match_basic_26, + r"(ab|a)(bc|c)", + r"abc", + Some((0, 3)), + Some((0, 2)), + Some((2, 3)) +); +mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2))); +mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2))); +mat!( + match_basic_29, + r"(a*)(b?)(b+)b{3}", + r"aaabbbbbbb", + Some((0, 10)), + Some((0, 3)), + Some((3, 4)), + Some((4, 7)) +); +mat!( + match_basic_30, + r"(a*)(b{0,1})(b{1,})b{3}", + r"aaabbbbbbb", + Some((0, 10)), + Some((0, 3)), + Some((3, 4)), + Some((4, 7)) +); +mat!( + match_basic_32, + r"((a|a)|a)", + r"a", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_33, + r"(a*)(a|aa)", + r"aaaa", + Some((0, 4)), + Some((0, 3)), + Some((3, 4)) +); +mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4))); +mat!( + match_basic_35, + r"a(b)|c(d)|a(e)f", + r"aef", + Some((0, 3)), + None, + None, + Some((1, 2)) +); 
+mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1))); +mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1))); +mat!( + match_basic_38, + r"(a|b)c|a(b|c)", + r"ab", + Some((0, 2)), + None, + Some((1, 2)) +); +mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2))); +mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2))); +mat!( + match_basic_41, + r"(.a|.b).*|.*(.a|.b)", + r"xa", + Some((0, 2)), + Some((0, 2)) +); +mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2))); +mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2))); +mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2))); +mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8))); +mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9))); +mat!( + match_basic_47, + r"(aa|aaa)*|(a|aaaaa)", + r"aa", + Some((0, 2)), + Some((0, 2)) +); +mat!( + match_basic_48, + r"(a.|.a.)*|(a|.a...)", + r"aa", + Some((0, 2)), + Some((0, 2)) +); +mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3))); +mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4))); +mat!( + match_basic_51, + r"(?i)(?-u)(Ab|cD)*", + r"aBcD", + Some((0, 4)), + Some((2, 4)) +); +mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3))); +mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3))); +mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))); +mat!( + match_basic_55, + r":::1:::0:|:::1:1:0:", + r":::0:::1:::1:::0:", + Some((8, 17)) +); +mat!( + match_basic_56, + r":::1:::0:|:::1:1:1:", + r":::0:::1:::1:::0:", + Some((8, 17)) +); +mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1))); +mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3))); +mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3))); +mat!( + match_basic_65, + r" +", + r" +", + Some((0, 1)) +); +mat!( + match_basic_66, + r" +", + r" +", + Some((0, 1)) +); +mat!( + match_basic_67, + r"[^a]", + r" +", + Some((0, 1)) +); +mat!( + match_basic_68, + r" +a", + 
r" +a", + Some((0, 2)) +); +mat!( + match_basic_69, + r"(a)(b)(c)", + r"abc", + Some((0, 3)), + Some((0, 1)), + Some((1, 2)), + Some((2, 3)) +); +mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3))); +mat!( + match_basic_71, + r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", + r"feb 6,", + Some((0, 6)) +); +mat!( + match_basic_72, + r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", + r"2/7", + Some((0, 3)) +); +mat!( + match_basic_73, + r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", + r"feb 1,Feb 6", + Some((5, 11)) +); +mat!( + match_basic_74, + r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", + r"x", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_75, + r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", + r"xx", + Some((0, 2)), + Some((1, 2)), + Some((1, 2)) +); +mat!( + match_basic_76, + r"a?(ab|ba)*", + r"ababababababababababababababababababababababababababababababababababababababababa", + Some((0, 81)), + Some((79, 81)) +); +mat!( + match_basic_77, + r"abaa|abbaa|abbbaa|abbbbaa", + r"ababbabbbabbbabbbbabbbbaa", + Some((18, 25)) +); +mat!( + match_basic_78, + r"abaa|abbaa|abbbaa|abbbbaa", + r"ababbabbbabbbabbbbabaa", + Some((18, 22)) +); +mat!( + match_basic_79, + r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", + r"baaabbbabac", + Some((7, 11)) +); +mat!(match_basic_80, r".*", r"", Some((0, 2))); +mat!( + match_basic_81, + r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", + r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", + Some((53, 57)) +); +mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10))); +mat!(match_basic_84, r"^", r"", Some((0, 0))); +mat!(match_basic_85, r"$", r"", Some((0, 0))); +mat!(match_basic_86, r"^$", r"", Some((0, 0))); +mat!(match_basic_87, r"^a$", r"a", Some((0, 1))); +mat!(match_basic_88, r"abc", r"abc", Some((0, 3))); +mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4))); +mat!(match_basic_90, 
r"abc", r"ababc", Some((2, 5))); +mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3))); +mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3))); +mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4))); +mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6))); +mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4))); +mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6))); +mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4))); +mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3))); +mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3))); +mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3))); +mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3))); +mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4))); +mat!(match_basic_103, r"^", r"abc", Some((0, 0))); +mat!(match_basic_104, r"$", r"abc", Some((3, 3))); +mat!(match_basic_105, r"a.c", r"abc", Some((0, 3))); +mat!(match_basic_106, r"a.c", r"axc", Some((0, 3))); +mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5))); +mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3))); +mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3))); +mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 3))); +mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2))); +mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2))); +mat!(match_basic_113, r"a]", r"a]", Some((0, 2))); +mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3))); +mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3))); +mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3))); +mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3))); +mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2))); +mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2))); +mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3))); +mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2))); +mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4))); +mat!( + match_basic_123, + r"((a))", + r"abc", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_124, + r"(a)b(c)", + r"abc", + Some((0, 3)), + Some((0, 1)), + 
Some((2, 3)) +); +mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7))); +mat!(match_basic_126, r"a*", r"aaa", Some((0, 3))); +mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None); +mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0))); +mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None); +mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2))); +mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2))); +mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1))); +mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3))); +mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None); +mat!(match_basic_138, r"a*", r"", Some((0, 0))); +mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5))); +mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1))); +mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1))); +mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1))); +mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None); +mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7))); +mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3))); +mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2))); +mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4))); +mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3))); +mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2))); +mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1))); +mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3))); +mat!( + match_basic_153, + r"a([bc]*)(c*d)", + r"abcd", + Some((0, 4)), + Some((1, 3)), + Some((3, 4)) +); +mat!( + match_basic_154, + r"a([bc]+)(c*d)", + r"abcd", + Some((0, 4)), + Some((1, 3)), + Some((3, 4)) +); +mat!( + match_basic_155, + r"a([bc]*)(c+d)", + r"abcd", + Some((0, 4)), + Some((1, 2)), + Some((2, 4)) +); +mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7))); 
+mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2))); +mat!( + match_basic_158, + r"((a)(b)c)(d)", + r"abcd", + Some((0, 4)), + Some((0, 3)), + Some((0, 1)), + Some((1, 2)), + Some((3, 4)) +); +mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5))); +mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3))); +mat!( + match_basic_161, + r"(bc+d$|ef*g.|h?i(j|k))", + r"effgz", + Some((0, 5)), + Some((0, 5)) +); +mat!( + match_basic_162, + r"(bc+d$|ef*g.|h?i(j|k))", + r"ij", + Some((0, 2)), + Some((0, 2)), + Some((1, 2)) +); +mat!( + match_basic_163, + r"(bc+d$|ef*g.|h?i(j|k))", + r"reffgz", + Some((1, 6)), + Some((1, 6)) +); +mat!( + match_basic_164, + r"(((((((((a)))))))))", + r"a", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_165, + r"multiple words", + r"multiple words yeah", + Some((0, 14)) +); +mat!( + match_basic_166, + r"(.*)c(.*)", + r"abcde", + Some((0, 5)), + Some((0, 2)), + Some((3, 5)) +); +mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4))); +mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3))); +mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3))); +mat!( + match_basic_170, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Qaddafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!( + match_basic_171, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Mo'ammar Gadhafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_172, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Kaddafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!( + match_basic_173, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Qadhafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!( + match_basic_174, + r"M[ou]'?am+[ae]r 
.*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Gadafi", + Some((0, 14)), + None, + Some((10, 11)) +); +mat!( + match_basic_175, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Mu'ammar Qadafi", + Some((0, 15)), + None, + Some((11, 12)) +); +mat!( + match_basic_176, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Moamar Gaddafi", + Some((0, 14)), + None, + Some((9, 11)) +); +mat!( + match_basic_177, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Mu'ammar Qadhdhafi", + Some((0, 18)), + None, + Some((13, 15)) +); +mat!( + match_basic_178, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Khaddafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_179, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Ghaddafy", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_180, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Ghadafi", + Some((0, 15)), + None, + Some((11, 12)) +); +mat!( + match_basic_181, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Ghaddafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_182, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muamar Kaddafi", + Some((0, 14)), + None, + Some((9, 11)) +); +mat!( + match_basic_183, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Quathafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_184, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Gheddafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_185, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Moammar Khadafy", + Some((0, 15)), + None, + Some((11, 12)) +); +mat!( + match_basic_186, + r"M[ou]'?am+[ae]r 
.*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Moammar Qudhafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4))); +mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4))); +mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4))); +mat!( + match_basic_190, + r"^([^!.]+).att.com!(.+)$", + r"gryphon.att.com!eby", + Some((0, 19)), + Some((0, 7)), + Some((16, 19)) +); +mat!( + match_basic_191, + r"^([^!]+!)?([^!]+)$", + r"bas", + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_192, + r"^([^!]+!)?([^!]+)$", + r"bar!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_193, + r"^([^!]+!)?([^!]+)$", + r"foo!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_194, + r"^.+!([^!]+!)([^!]+)$", + r"foo!bar!bas", + Some((0, 11)), + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_195, + r"((foo)|(bar))!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_196, + r"((foo)|(bar))!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)), + None, + Some((4, 7)) +); +mat!( + match_basic_197, + r"((foo)|(bar))!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)), + Some((0, 3)) +); +mat!( + match_basic_198, + r"((foo)|bar)!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_199, + r"((foo)|bar)!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)) +); +mat!( + match_basic_200, + r"((foo)|bar)!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)), + Some((0, 3)) +); +mat!( + match_basic_201, + r"(foo|(bar))!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)), + Some((0, 3)) +); +mat!( + match_basic_202, + r"(foo|(bar))!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)), + Some((4, 7)) +); +mat!( + match_basic_203, + r"(foo|(bar))!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_204, + r"(foo|bar)!bas", + r"bar!bas", + Some((0, 
7)), + Some((0, 3)) +); +mat!( + match_basic_205, + r"(foo|bar)!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)) +); +mat!( + match_basic_206, + r"(foo|bar)!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_207, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"foo!bar!bas", + Some((0, 11)), + Some((0, 11)), + None, + None, + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_208, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"bas", + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_209, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"bar!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_210, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"foo!bar!bas", + Some((0, 11)), + None, + None, + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_211, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"foo!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_212, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"bas", + Some((0, 3)), + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_213, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"bar!bas", + Some((0, 7)), + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_214, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"foo!bar!bas", + Some((0, 11)), + Some((0, 11)), + None, + None, + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_215, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"foo!bas", + Some((0, 7)), + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4))); +mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4))); +mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4))); +mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4))); +mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4))); +mat!(match_basic_221, r"\\000", r"\000", Some((0, 4))); + 
+// Tests from nullsubexpr.dat +mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None); +mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0))); +mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0))); +mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_17, r"(a+)+", r"x", None); +mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None); +mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0))); +mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None); +mat!(match_nullsubexpr_33, r"([^b]*)*", 
r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!( + match_nullsubexpr_34, + r"([^b]*)*", + r"aaaaaab", + Some((0, 6)), + Some((0, 6)) +); +mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))); +mat!( + match_nullsubexpr_41, + r"([ab]*)*", + r"aaaabcde", + Some((0, 5)), + Some((0, 5)) +); +mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None); +mat!( + match_nullsubexpr_46, + r"([^ab]*)*", + r"ccccxx", + Some((0, 6)), + Some((0, 6)) +); +mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None); +mat!( + match_nullsubexpr_50, + r"((z)+|a)*", + r"zabcde", + Some((0, 2)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_69, + r"(a*)*(x)", + r"x", + Some((0, 1)), + None, + Some((0, 1)) +); +mat!( + match_nullsubexpr_70, + r"(a*)*(x)", + r"ax", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_71, + r"(a*)*(x)", + r"axa", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_73, + r"(a*)+(x)", + r"x", + Some((0, 1)), + Some((0, 0)), + Some((0, 1)) +); +mat!( + match_nullsubexpr_74, + r"(a*)+(x)", + r"ax", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_75, + r"(a*)+(x)", + r"axa", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_77, + r"(a*){2}(x)", + r"x", + Some((0, 1)), + Some((0, 0)), + Some((0, 1)) +); +mat!( + match_nullsubexpr_78, + r"(a*){2}(x)", + r"ax", + 
Some((0, 2)), + Some((1, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_79, + r"(a*){2}(x)", + r"axa", + Some((0, 2)), + Some((1, 1)), + Some((1, 2)) +); + +// Tests from repetition.dat +mat!(match_repetition_10, r"((..)|(.))", r"", None); +mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None); +mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None); +mat!(match_repetition_14, r"((..)|(.)){1}", r"", None); +mat!(match_repetition_15, r"((..)|(.)){2}", r"", None); +mat!(match_repetition_16, r"((..)|(.)){3}", r"", None); +mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0))); +mat!( + match_repetition_20, + r"((..)|(.))", + r"a", + Some((0, 1)), + Some((0, 1)), + None, + Some((0, 1)) +); +mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None); +mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None); +mat!( + match_repetition_24, + r"((..)|(.)){1}", + r"a", + Some((0, 1)), + Some((0, 1)), + None, + Some((0, 1)) +); +mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None); +mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None); +mat!( + match_repetition_28, + r"((..)|(.))*", + r"a", + Some((0, 1)), + Some((0, 1)), + None, + Some((0, 1)) +); +mat!( + match_repetition_30, + r"((..)|(.))", + r"aa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_31, + r"((..)|(.))((..)|(.))", + r"aa", + Some((0, 2)), + Some((0, 1)), + None, + Some((0, 1)), + Some((1, 2)), + None, + Some((1, 2)) +); +mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None); +mat!( + match_repetition_34, + r"((..)|(.)){1}", + r"aa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_35, + r"((..)|(.)){2}", + r"aa", + Some((0, 2)), + Some((1, 2)), + None, + Some((1, 2)) +); +mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None); +mat!( + match_repetition_38, + r"((..)|(.))*", + r"aa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + 
match_repetition_40, + r"((..)|(.))", + r"aaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_41, + r"((..)|(.))((..)|(.))", + r"aaa", + Some((0, 3)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 3)), + None, + Some((2, 3)) +); +mat!( + match_repetition_42, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaa", + Some((0, 3)), + Some((0, 1)), + None, + Some((0, 1)), + Some((1, 2)), + None, + Some((1, 2)), + Some((2, 3)), + None, + Some((2, 3)) +); +mat!( + match_repetition_44, + r"((..)|(.)){1}", + r"aaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_46, + r"((..)|(.)){2}", + r"aaa", + Some((0, 3)), + Some((2, 3)), + Some((0, 2)), + Some((2, 3)) +); +mat!( + match_repetition_47, + r"((..)|(.)){3}", + r"aaa", + Some((0, 3)), + Some((2, 3)), + None, + Some((2, 3)) +); +mat!( + match_repetition_50, + r"((..)|(.))*", + r"aaa", + Some((0, 3)), + Some((2, 3)), + Some((0, 2)), + Some((2, 3)) +); +mat!( + match_repetition_52, + r"((..)|(.))", + r"aaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_53, + r"((..)|(.))((..)|(.))", + r"aaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_54, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 3)), + None, + Some((2, 3)), + Some((3, 4)), + None, + Some((3, 4)) +); +mat!( + match_repetition_56, + r"((..)|(.)){1}", + r"aaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_57, + r"((..)|(.)){2}", + r"aaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_59, + r"((..)|(.)){3}", + r"aaaa", + Some((0, 4)), + Some((3, 4)), + Some((0, 2)), + Some((3, 4)) +); +mat!( + match_repetition_61, + r"((..)|(.))*", + r"aaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_63, + 
r"((..)|(.))", + r"aaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_64, + r"((..)|(.))((..)|(.))", + r"aaaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_65, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaaaa", + Some((0, 5)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None, + Some((4, 5)), + None, + Some((4, 5)) +); +mat!( + match_repetition_67, + r"((..)|(.)){1}", + r"aaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_68, + r"((..)|(.)){2}", + r"aaaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_70, + r"((..)|(.)){3}", + r"aaaaa", + Some((0, 5)), + Some((4, 5)), + Some((2, 4)), + Some((4, 5)) +); +mat!( + match_repetition_73, + r"((..)|(.))*", + r"aaaaa", + Some((0, 5)), + Some((4, 5)), + Some((2, 4)), + Some((4, 5)) +); +mat!( + match_repetition_75, + r"((..)|(.))", + r"aaaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_76, + r"((..)|(.))((..)|(.))", + r"aaaaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_77, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaaaaa", + Some((0, 6)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None, + Some((4, 6)), + Some((4, 6)), + None +); +mat!( + match_repetition_79, + r"((..)|(.)){1}", + r"aaaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_80, + r"((..)|(.)){2}", + r"aaaaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_81, + r"((..)|(.)){3}", + r"aaaaaa", + Some((0, 6)), + Some((4, 6)), + Some((4, 6)), + None +); +mat!( + match_repetition_83, + r"((..)|(.))*", + r"aaaaaa", + Some((0, 6)), + Some((4, 6)), + Some((4, 6)), + None +); +mat!( + match_repetition_90, + 
r"X(.?){0,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_91, + r"X(.?){1,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_92, + r"X(.?){2,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_93, + r"X(.?){3,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_94, + r"X(.?){4,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_95, + r"X(.?){5,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_96, + r"X(.?){6,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_97, + r"X(.?){7,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_98, + r"X(.?){8,}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_100, + r"X(.?){0,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_102, + r"X(.?){1,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_104, + r"X(.?){2,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_106, + r"X(.?){3,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_108, + r"X(.?){4,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_110, + r"X(.?){5,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_112, + r"X(.?){6,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_114, + r"X(.?){7,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_115, + r"X(.?){8,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_126, + r"(a|ab|c|bcd){0,}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_127, + r"(a|ab|c|bcd){1,}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); 
+mat!( + match_repetition_128, + r"(a|ab|c|bcd){2,}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!( + match_repetition_129, + r"(a|ab|c|bcd){3,}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None); +mat!( + match_repetition_131, + r"(a|ab|c|bcd){0,10}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_132, + r"(a|ab|c|bcd){1,10}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_133, + r"(a|ab|c|bcd){2,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!( + match_repetition_134, + r"(a|ab|c|bcd){3,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None); +mat!( + match_repetition_136, + r"(a|ab|c|bcd)*(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_137, + r"(a|ab|c|bcd)+(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_143, + r"(ab|a|c|bcd){0,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_145, + r"(ab|a|c|bcd){1,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_147, + r"(ab|a|c|bcd){2,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_149, + r"(ab|a|c|bcd){3,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None); +mat!( + match_repetition_152, + r"(ab|a|c|bcd){0,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_154, + r"(ab|a|c|bcd){1,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_156, + r"(ab|a|c|bcd){2,10}(d*)", + 
r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_158, + r"(ab|a|c|bcd){3,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None); +mat!( + match_repetition_161, + r"(ab|a|c|bcd)*(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_163, + r"(ab|a|c|bcd)+(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); diff --git a/regex-1.8.4/tests/macros.rs b/regex-1.8.4/tests/macros.rs new file mode 100644 index 0000000000000..e70e9489fd01e --- /dev/null +++ b/regex-1.8.4/tests/macros.rs @@ -0,0 +1,160 @@ +// Convenience macros. + +macro_rules! findall { + ($re:expr, $text:expr) => {{ + $re.find_iter(text!($text)) + .map(|m| (m.start(), m.end())).collect::<Vec<_>>() + }} +} + +// Macros for automatically producing tests. + +macro_rules! ismatch { + ($name:ident, $re:expr, $text:expr, $ismatch:expr) => { + #[test] + fn $name() { + let re = regex!($re); + assert_eq!($ismatch, re.is_match(text!($text))); + } + }; +} + +macro_rules! mat( + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<Option<_>> = vec![$($loc)+]; + let r = regex!($re); + let got: Vec<Option<_>> = match r.captures(text) { + Some(c) => { + assert!(r.is_match(text)); + assert!(r.shortest_match(text).is_some()); + r.capture_names() + .enumerate() + .map(|(i, _)| c.get(i).map(|m| (m.start(), m.end()))) + .collect() + } + None => vec![None], + }; + // The test set sometimes leave out capture groups, so truncate + // actual capture groups to match test set. + let mut sgot = &got[..]; + if sgot.len() > expected.len() { + sgot = &sgot[0..expected.len()] + } + if expected != sgot { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, sgot); + } + } + ); +); + +macro_rules! 
matiter( + ($name:ident, $re:expr, $text:expr) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<(usize, usize)> = vec![]; + let r = regex!($re); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); + if expected != got { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, got); + } + let captures_got: Vec<_> = + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + if captures_got != got { + panic!("For RE '{}' against '{:?}', \ + got '{:?}' using find_iter but got '{:?}' \ + using captures_iter", + $re, text, got, captures_got); + } + } + ); + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<_> = vec![$($loc)+]; + let r = regex!($re); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); + if expected != got { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, got); + } + let captures_got: Vec<_> = + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + if captures_got != got { + panic!("For RE '{}' against '{:?}', \ + got '{:?}' using find_iter but got '{:?}' \ + using captures_iter", + $re, text, got, captures_got); + } + } + ); +); + +macro_rules! matset { + ($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => { + #[test] + fn $name() { + let text = text!($text); + let set = regex_set!($res); + assert!(set.is_match(text)); + let expected = vec![$($match_index),*]; + let matches = set.matches(text); + assert!(matches.matched_any()); + let got: Vec<_> = matches.into_iter().collect(); + assert_eq!(expected, got); + } + } +} + +macro_rules! 
nomatset { + ($name:ident, $res:expr, $text:expr) => { + #[test] + fn $name() { + let text = text!($text); + let set = regex_set!($res); + assert!(!set.is_match(text)); + let matches = set.matches(text); + assert!(!matches.matched_any()); + assert_eq!(0, matches.into_iter().count()); + } + } +} + +macro_rules! split { + ($name:ident, $re:expr, $text:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let splitted: Vec<_> = re.split(t!($text)).collect(); + assert_eq!($expected, &*splitted); + } + } +} + +macro_rules! splitn { + ($name:ident, $re:expr, $text:expr, $limit:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let splitted: Vec<_> = re.splitn(t!($text), $limit).collect(); + assert_eq!($expected, &*splitted); + } + } +} diff --git a/regex-1.8.4/tests/macros_bytes.rs b/regex-1.8.4/tests/macros_bytes.rs new file mode 100644 index 0000000000000..3d6c8c3ac88e5 --- /dev/null +++ b/regex-1.8.4/tests/macros_bytes.rs @@ -0,0 +1,39 @@ +// Macros for use in writing tests generic over &str/&[u8]. +macro_rules! text { ($text:expr) => { $text.as_bytes() } } +macro_rules! t { ($re:expr) => { text!($re) } } +macro_rules! match_text { ($text:expr) => { $text.as_bytes() } } +macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } } +macro_rules! empty_vec { () => { <Vec<&[u8]>>::new() } } +macro_rules! bytes { ($text:expr) => { $text } } + +macro_rules! no_expand { + ($text:expr) => {{ + use regex::bytes::NoExpand; + NoExpand(text!($text)) + }} +} + +macro_rules! show { + ($text:expr) => {{ + use std::ascii::escape_default; + let mut s = vec![]; + for &b in bytes!($text) { + s.extend(escape_default(b)); + } + String::from_utf8(s).unwrap() + }} +} + +macro_rules! 
expand { + ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let cap = re.captures(t!($text)).unwrap(); + + let mut got = vec![]; + cap.expand(t!($expand), &mut got); + assert_eq!(show!(t!($expected)), show!(&*got)); + } + } +} diff --git a/regex-1.8.4/tests/macros_str.rs b/regex-1.8.4/tests/macros_str.rs new file mode 100644 index 0000000000000..7b7eb110c26b4 --- /dev/null +++ b/regex-1.8.4/tests/macros_str.rs @@ -0,0 +1,38 @@ +// Macros for use in writing tests generic over &str/&[u8]. +macro_rules! text { ($text:expr) => { $text } } +macro_rules! t { ($text:expr) => { text!($text) } } +macro_rules! match_text { ($text:expr) => { $text.as_str() } } +macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } } +macro_rules! empty_vec { () => { <Vec<&str>>::new() } } +macro_rules! bytes { ($text:expr) => { std::str::from_utf8($text.as_ref()).unwrap() } } + +macro_rules! no_expand { + ($text:expr) => {{ + use regex::NoExpand; + NoExpand(text!($text)) + }} +} + +macro_rules! show { ($text:expr) => { $text } } + +// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, +// but they should be unified in 1.0. Then we can move this macro back into +// tests/api.rs where it is used. ---AG +macro_rules! expand { + ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let cap = re.captures(t!($text)).unwrap(); + + let mut got = String::new(); + cap.expand(t!($expand), &mut got); + assert_eq!(show!(t!($expected)), show!(&*got)); + } + } +} + +#[cfg(feature = "pattern")] +macro_rules! searcher_expr { ($e:expr) => ($e) } +#[cfg(not(feature = "pattern"))] +macro_rules! 
searcher_expr { ($e:expr) => ({}) } diff --git a/regex-1.8.4/tests/misc.rs b/regex-1.8.4/tests/misc.rs new file mode 100644 index 0000000000000..314811e2528cc --- /dev/null +++ b/regex-1.8.4/tests/misc.rs @@ -0,0 +1,4 @@ +mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3))); +mat!(prefix_literal_nomatch, r"^abc", r"zabc", None); +mat!(one_literal_edge, r"abc", r"xxxxxab", None); +matiter!(terminates, r"a$", r"a", (0, 1)); diff --git a/regex-1.8.4/tests/multiline.rs b/regex-1.8.4/tests/multiline.rs new file mode 100644 index 0000000000000..62ee47b62bab9 --- /dev/null +++ b/regex-1.8.4/tests/multiline.rs @@ -0,0 +1,144 @@ +matiter!( + match_multi_1, + r"(?m)^[a-z]+$", + "abc\ndef\nxyz", + (0, 3), + (4, 7), + (8, 11) +); +matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz"); +matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", (0, 0), (4, 4), (8, 8)); +matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", (3, 3), (7, 7), (11, 11)); +matiter!( + match_multi_5, + r"(?m)^[a-z]", + "abc\ndef\nxyz", + (0, 1), + (4, 5), + (8, 9) +); +matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz"); +matiter!( + match_multi_7, + r"(?m)[a-z]$", + "abc\ndef\nxyz", + (2, 3), + (6, 7), + (10, 11) +); +matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz"); +matiter!(match_multi_9, r"(?m)^$", "", (0, 0)); + +matiter!( + match_multi_rep_1, + r"(?m)(?:^$)*", + "a\nb\nc", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); +matiter!( + match_multi_rep_2, + r"(?m)(?:^|a)+", + "a\naaa\n", + (0, 0), + (2, 2), + (3, 5), + (6, 6) +); +matiter!( + match_multi_rep_3, + r"(?m)(?:^|a)*", + "a\naaa\n", + (0, 1), + (2, 5), + (6, 6) +); +matiter!( + match_multi_rep_4, + r"(?m)(?:^[a-z])+", + "abc\ndef\nxyz", + (0, 1), + (4, 5), + (8, 9) +); +matiter!( + match_multi_rep_5, + r"(?m)(?:^[a-z]{3}\n?)+", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_6, + r"(?m)(?:^[a-z]{3}\n?)*", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_7, + r"(?m)(?:\n?[a-z]{3}$)+", + 
"abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_8, + r"(?m)(?:\n?[a-z]{3}$)*", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_9, + r"(?m)^*", + "\naa\n", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4) +); +matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", (0, 0), (1, 1), (4, 4)); +matiter!( + match_multi_rep_11, + r"(?m)$*", + "\naa\n", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4) +); +matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", (0, 0), (3, 3), (4, 4)); +matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", (0, 2), (5, 7)); +matiter!( + match_multi_rep_14, + r"(?m)(?:$\n)*", + "\n\naaa\n\n", + (0, 2), + (3, 3), + (4, 4), + (5, 7) +); +matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", (0, 2), (5, 7)); +matiter!( + match_multi_rep_16, + r"(?m)(?:^|$)+", + "\n\naaa\n\n", + (0, 0), + (1, 1), + (2, 2), + (5, 5), + (6, 6), + (7, 7) +); +matiter!( + match_multi_rep_17, + r"(?m)(?:$\n)*", + "\n\naaa\n\n", + (0, 2), + (3, 3), + (4, 4), + (5, 7) +); diff --git a/regex-1.8.4/tests/noparse.rs b/regex-1.8.4/tests/noparse.rs new file mode 100644 index 0000000000000..8ded1dce7b7fc --- /dev/null +++ b/regex-1.8.4/tests/noparse.rs @@ -0,0 +1,45 @@ +macro_rules! 
noparse( + ($name:ident, $re:expr) => ( + #[test] + fn $name() { + let re = $re; + match regex_new!(re) { + Err(_) => {}, + Ok(_) => panic!("Regex '{}' should cause a parse error.", re), + } + } + ); +); + +noparse!(fail_no_repeat_arg, "*"); +noparse!(fail_incomplete_escape, "\\"); +noparse!(fail_class_incomplete, "[A-"); +noparse!(fail_class_not_closed, "[A"); +noparse!(fail_class_no_begin, r"[\A]"); +noparse!(fail_class_no_end, r"[\z]"); +noparse!(fail_class_no_boundary, r"[\b]"); +noparse!(fail_open_paren, "("); +noparse!(fail_close_paren, ")"); +noparse!(fail_invalid_range, "[a-Z]"); +noparse!(fail_empty_capture_name, "(?P<>a)"); +noparse!(fail_bad_capture_name, "(?P<na-me>)"); +noparse!(fail_bad_flag, "(?a)a"); +noparse!(fail_too_big, "a{10000000}"); +noparse!(fail_counted_no_close, "a{1001"); +noparse!(fail_counted_decreasing, "a{2,1}"); +noparse!(fail_counted_nonnegative, "a{-1,1}"); +noparse!(fail_unfinished_cap, "(?"); +noparse!(fail_unfinished_escape, "\\"); +noparse!(fail_octal_digit, r"\8"); +noparse!(fail_hex_digit, r"\xG0"); +noparse!(fail_hex_short, r"\xF"); +noparse!(fail_hex_long_digits, r"\x{fffg}"); +noparse!(fail_flag_bad, "(?a)"); +noparse!(fail_flag_empty, "(?)"); +noparse!(fail_double_neg, "(?-i-i)"); +noparse!(fail_neg_empty, "(?i-)"); +noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)"); +noparse!(fail_range_end_no_class, "[a-[:lower:]]"); +noparse!(fail_range_end_no_begin, r"[a-\A]"); +noparse!(fail_range_end_no_end, r"[a-\z]"); +noparse!(fail_range_end_no_boundary, r"[a-\b]"); diff --git a/regex-1.8.4/tests/regression.rs b/regex-1.8.4/tests/regression.rs new file mode 100644 index 0000000000000..291062a77dd87 --- /dev/null +++ b/regex-1.8.4/tests/regression.rs @@ -0,0 +1,263 @@ +// See: https://github.com/rust-lang/regex/issues/48 +#[test] +fn invalid_regexes_no_crash() { + assert!(regex_new!("(*)").is_err()); + assert!(regex_new!("(?:?)").is_err()); + assert!(regex_new!("(?)").is_err()); + assert!(regex_new!("*").is_err()); +} + +// See: 
https://github.com/rust-lang/regex/issues/98 +#[test] +fn regression_many_repeat_stack_overflow() { + let re = regex!("^.{1,2500}"); + assert_eq!(vec![(0, 1)], findall!(re, "a")); +} + +// See: https://github.com/rust-lang/regex/issues/555 +#[test] +fn regression_invalid_repetition_expr() { + assert!(regex_new!("(?m){1,1}").is_err()); +} + +// See: https://github.com/rust-lang/regex/issues/527 +#[test] +fn regression_invalid_flags_expression() { + assert!(regex_new!("(((?x)))").is_ok()); +} + +// See: https://github.com/rust-lang/regex/issues/75 +mat!(regression_unsorted_binary_search_1, r"(?i-u)[a_]+", "A_", Some((0, 2))); +mat!(regression_unsorted_binary_search_2, r"(?i-u)[A_]+", "a_", Some((0, 2))); + +// See: https://github.com/rust-lang/regex/issues/99 +#[cfg(feature = "unicode-case")] +mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None); +#[cfg(feature = "unicode-case")] +mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None); + +// See: https://github.com/rust-lang/regex/issues/101 +mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/129 +#[test] +fn regression_captures_rep() { + let re = regex!(r"([a-f]){2}(?P<foo>[x-z])"); + let caps = re.captures(text!("abx")).unwrap(); + assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x")); +} + +// See: https://github.com/rust-lang/regex/issues/153 +mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1))); +mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); + +// See: https://github.com/rust-lang/regex/issues/169 +mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); + +// See: https://github.com/rust-lang/regex/issues/76 +#[cfg(all(feature = "unicode-case", feature = "unicode-gencat"))] +mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10))); + +// See: https://github.com/rust-lang/regex/issues/191 +mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", 
Some((0, 3))); + +// burntsushi was bad and didn't create an issue for this bug. +mat!(anchored_prefix1, r"^a[[:^space:]]", "a ", None); +mat!(anchored_prefix2, r"^a[[:^space:]]", "foo boo a ", None); +mat!(anchored_prefix3, r"^-[a-z]", "r-f", None); + +// See: https://github.com/rust-lang/regex/issues/204 +#[cfg(feature = "unicode-perl")] +split!( + split_on_word_boundary, + r"\b", + r"Should this (work?)", + &[ + t!(""), + t!("Should"), + t!(" "), + t!("this"), + t!(" ("), + t!("work"), + t!("?)") + ] +); +#[cfg(feature = "unicode-perl")] +matiter!( + word_boundary_dfa, + r"\b", + "a b c", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); + +// See: https://github.com/rust-lang/regex/issues/268 +matiter!(partial_anchor, r"^a|b", "ba", (0, 1)); + +// See: https://github.com/rust-lang/regex/issues/280 +ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false); +ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false); + +// See: https://github.com/rust-lang/regex/issues/289 +mat!(lits_unambiguous1, r"(ABC|CDA|BC)X", "CDAX", Some((0, 4))); + +// See: https://github.com/rust-lang/regex/issues/291 +mat!( + lits_unambiguous2, + r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$", + "CIMG2341", + Some((0, 8)), + Some((0, 4)), + None, + Some((0, 4)), + Some((4, 8)) +); + +// See: https://github.com/rust-lang/regex/issues/271 +mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4))); +mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4))); +mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4))); +#[cfg(feature = "unicode-perl")] +mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/321 +ismatch!(strange_anchor_non_complete_prefix, r"a^{2}", "", false); +ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false); + +// See: https://github.com/BurntSushi/ripgrep/issues/1203 +ismatch!(reverse_suffix1, r"[0-4][0-4][0-4]000", "153.230000", true); 
+ismatch!(reverse_suffix2, r"[0-9][0-9][0-9]000", "153.230000\n", true); +matiter!(reverse_suffix3, r"[0-9][0-9][0-9]000", "153.230000\n", (4, 10)); + +// See: https://github.com/rust-lang/regex/issues/334 +// See: https://github.com/rust-lang/regex/issues/557 +mat!( + captures_after_dfa_premature_end1, + r"a(b*(X|$))?", + "abcbX", + Some((0, 1)), + None, + None +); +mat!( + captures_after_dfa_premature_end2, + r"a(bc*(X|$))?", + "abcbX", + Some((0, 1)), + None, + None +); +mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz", Some((0, 0))); + +// See: https://github.com/rust-lang/regex/issues/437 +ismatch!( + literal_panic, + r"typename type\-parameter\-[0-9]+\-[0-9]+::.+", + "test", + false +); + +// See: https://github.com/rust-lang/regex/issues/533 +ismatch!( + blank_matches_nothing_between_space_and_tab, + r"[[:blank:]]", + "\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\ + \u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\ + \u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}", + false +); + +ismatch!( + inverted_blank_matches_everything_between_space_and_tab, + r"^[[:^blank:]]+$", + "\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\ + \u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\ + \u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}", + true +); + +// Tests that our Aho-Corasick optimization works correctly. It only +// kicks in when we have >32 literals. By "works correctly," we mean that +// leftmost-first match semantics are properly respected. That is, samwise +// should match, not sam. 
+mat!( + ahocorasick1, + "samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|\ + A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z", + "samwise", + Some((0, 7)) +); + +// See: https://github.com/BurntSushi/ripgrep/issues/1247 +#[test] +#[cfg(feature = "unicode-perl")] +fn regression_nfa_stops1() { + let re = ::regex::bytes::Regex::new(r"\bs(?:[ab])").unwrap(); + assert_eq!(0, re.find_iter(b"s\xE4").count()); +} + +// See: https://github.com/rust-lang/regex/issues/640 +#[cfg(feature = "unicode-case")] +matiter!( + flags_are_unset, + r"((?i)foo)|Bar", + "foo Foo bar Bar", + (0, 3), + (4, 7), + (12, 15) +); + +// See: https://github.com/rust-lang/regex/issues/659 +// +// Note that 'Ј' is not 'j', but cyrillic Je +// https://en.wikipedia.org/wiki/Je_(Cyrillic) +ismatch!(empty_group_match, r"()Ј01", "zЈ01", true); +matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); + +// See: https://github.com/rust-lang/regex/issues/862 +mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/981 +#[cfg(feature = "unicode")] +#[test] +fn regression_bad_word_boundary() { + let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap(); + let hay = "ubi-Darwin-x86_64.tar.gz"; + assert!(!re.is_match(text!(hay))); + let hay = "ubi-Windows-x86_64.zip"; + assert!(re.is_match(text!(hay))); +} + +// See: https://github.com/rust-lang/regex/issues/982 +#[cfg(feature = "unicode-perl")] +#[test] +fn regression_unicode_perl_not_enabled() { + let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?"; + let re = regex_new!(pat); + assert!(re.is_ok()); +} + +// See: https://github.com/rust-lang/regex/issues/995 +#[test] +fn regression_big_regex_overflow() { + let pat = r" {2147483516}{2147483416}{5}"; + let re = regex_new!(pat); + assert!(re.is_err()); +} + +#[test] +fn regression_complete_literals_suffix_incorrect() { + let 
needles = vec![ + "aA", "bA", "cA", "dA", "eA", "fA", "gA", "hA", "iA", "jA", "kA", + "lA", "mA", "nA", "oA", "pA", "qA", "rA", "sA", "tA", "uA", "vA", + "wA", "xA", "yA", "zA", + ]; + let pattern = needles.join("|"); + let re = regex!(&pattern); + let hay = "FUBAR"; + assert_eq!(0, re.find_iter(text!(hay)).count()); +} diff --git a/regex-1.8.4/tests/regression_fuzz.rs b/regex-1.8.4/tests/regression_fuzz.rs new file mode 100644 index 0000000000000..5f49530a72bec --- /dev/null +++ b/regex-1.8.4/tests/regression_fuzz.rs @@ -0,0 +1,40 @@ +// These tests are only run for the "default" test target because some of them +// can take quite a long time. Some of them take long enough that it's not +// practical to run them in debug mode. :-/ + +// See: https://oss-fuzz.com/testcase-detail/5673225499181056 +// +// Ignored by default since it takes too long in debug mode (almost a minute). +#[test] +#[ignore] +fn fuzz1() { + regex!(r"1}{55}{0}*{1}{55}{55}{5}*{1}{55}+{56}|;**"); +} + +// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26505 +// See: https://github.com/rust-lang/regex/issues/722 +#[test] +fn empty_any_errors_no_panic() { + assert!(regex_new!(r"\P{any}").is_err()); +} + +// This tests that a very large regex errors during compilation instead of +// using gratuitous amounts of memory. The specific problem is that the +// compiler wasn't accounting for the memory used by Unicode character classes +// correctly. +// +// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579 +#[test] +fn big_regex_fails_to_compile() { + let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}"; + assert!(regex_new!(pat).is_err()); +} + +// This was caught while on master but before a release went out(!). 
+// +// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58173 +#[test] +fn todo() { + let pat = "(?:z|xx)@|xx"; + assert!(regex_new!(pat).is_ok()); +} diff --git a/regex-1.8.4/tests/replace.rs b/regex-1.8.4/tests/replace.rs new file mode 100644 index 0000000000000..f23c575515770 --- /dev/null +++ b/regex-1.8.4/tests/replace.rs @@ -0,0 +1,248 @@ +macro_rules! replace( + ($name:ident, $which:ident, $re:expr, + $search:expr, $replace:expr, $result:expr) => ( + #[test] + fn $name() { + let re = regex!($re); + assert_eq!(re.$which(text!($search), $replace), text!($result)); + } + ); +); + +replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6"); +replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z"); +replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); +replace!( + groups, + replace, + r"([^ ]+)[ ]+([^ ]+)", + "w1 w2", + t!("$2 $1"), + "w2 w1" +); +replace!( + double_dollar, + replace, + r"([^ ]+)[ ]+([^ ]+)", + "w1 w2", + t!("$2 $$1"), + "w2 $1" +); +// replace!(adjacent_index, replace, +// r"([^aeiouy])ies$", "skies", t!("$1y"), "sky"); +replace!( + named, + replace_all, + r"(?P<first>[^ ]+)[ ]+(?P<last>[^ ]+)(?P<space>[ ]*)", + "w1 w2 w3 w4", + t!("$last $first$space"), + "w2 w1 w4 w3" +); +replace!( + trim, + replace_all, + "^[ \t]+|[ \t]+$", + " \t trim me\t \t", + t!(""), + "trim me" +); +replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); +// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b"); +replace!( + simple_expand, + replace_all, + r"([a-z]) ([a-z])", + "a b", + t!("$2 $1"), + "b a" +); +replace!( + literal_dollar1, + replace_all, + r"([a-z]+) ([a-z]+)", + "a b", + t!("$$1"), + "$1" +); +replace!( + literal_dollar2, + replace_all, + r"([a-z]+) ([a-z]+)", + "a b", + t!("$2 $$c $1"), + "b $c a" +); +replace!( + no_expand1, + replace, + r"([^ ]+)[ ]+([^ ]+)", + "w1 w2", + no_expand!("$2 $1"), + "$2 $1" +); +replace!( + no_expand2, + replace, + r"([^ ]+)[ ]+([^ ]+)", + "w1 
w2", + no_expand!("$$1"), + "$$1" +); +use_!(Captures); +replace!( + closure_returning_reference, + replace, + r"([0-9]+)", + "age: 26", + |captures: &Captures<'_>| { + match_text!(captures.get(1).unwrap())[0..1].to_owned() + }, + "age: 2" +); +replace!( + closure_returning_value, + replace, + r"[0-9]+", + "age: 26", + |_captures: &Captures<'_>| t!("Z").to_owned(), + "age: Z" +); + +// See https://github.com/rust-lang/regex/issues/314 +replace!( + match_at_start_replace_with_empty, + replace_all, + r"foo", + "foobar", + t!(""), + "bar" +); + +// See https://github.com/rust-lang/regex/issues/393 +replace!(single_empty_match, replace, r"^", "bar", t!("foo"), "foobar"); + +// See https://github.com/rust-lang/regex/issues/399 +replace!( + capture_longest_possible_name, + replace_all, + r"(.)", + "b", + t!("${1}a $1a"), + "ba " +); + +replace!( + impl_string, + replace, + r"[0-9]", + "age: 26", + t!("Z".to_string()), + "age: Z6" +); +replace!( + impl_string_ref, + replace, + r"[0-9]", + "age: 26", + t!(&"Z".to_string()), + "age: Z6" +); +replace!( + impl_cow_str_borrowed, + replace, + r"[0-9]", + "age: 26", + t!(std::borrow::Cow::<'_, str>::Borrowed("Z")), + "age: Z6" +); +replace!( + impl_cow_str_borrowed_ref, + replace, + r"[0-9]", + "age: 26", + t!(&std::borrow::Cow::<'_, str>::Borrowed("Z")), + "age: Z6" +); +replace!( + impl_cow_str_owned, + replace, + r"[0-9]", + "age: 26", + t!(std::borrow::Cow::<'_, str>::Owned("Z".to_string())), + "age: Z6" +); +replace!( + impl_cow_str_owned_ref, + replace, + r"[0-9]", + "age: 26", + t!(&std::borrow::Cow::<'_, str>::Owned("Z".to_string())), + "age: Z6" +); + +replace!( + impl_vec_u8, + replace, + r"[0-9]", + "age: 26", + bytes!(vec![b'Z']), + "age: Z6" +); +replace!( + impl_vec_u8_ref, + replace, + r"[0-9]", + "age: 26", + bytes!(&vec![b'Z']), + "age: Z6" +); +replace!( + impl_cow_slice_borrowed, + replace, + r"[0-9]", + "age: 26", + bytes!(std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])), + "age: Z6" +); +replace!( + 
impl_cow_slice_borrowed_ref, + replace, + r"[0-9]", + "age: 26", + bytes!(&std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])), + "age: Z6" +); +replace!( + impl_cow_slice_owned, + replace, + r"[0-9]", + "age: 26", + bytes!(std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])), + "age: Z6" +); +replace!( + impl_cow_slice_owned_ref, + replace, + r"[0-9]", + "age: 26", + bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])), + "age: Z6" +); + +#[test] +fn replacen_no_captures() { + let re = regex!(r"[0-9]"); + assert_eq!( + re.replacen(text!("age: 1234"), 2, t!("Z")), + text!("age: ZZ34") + ); +} + +#[test] +fn replacen_with_captures() { + let re = regex!(r"([0-9])"); + assert_eq!( + re.replacen(text!("age: 1234"), 2, t!("${1}Z")), + text!("age: 1Z2Z34") + ); +} diff --git a/regex-1.8.4/tests/searcher.rs b/regex-1.8.4/tests/searcher.rs new file mode 100644 index 0000000000000..3779f54c31432 --- /dev/null +++ b/regex-1.8.4/tests/searcher.rs @@ -0,0 +1,95 @@ +macro_rules! searcher { + ($name:ident, $re:expr, $haystack:expr) => ( + searcher!($name, $re, $haystack, vec vec![]); + ); + ($name:ident, $re:expr, $haystack:expr, $($steps:expr,)*) => ( + searcher!($name, $re, $haystack, vec vec![$($steps),*]); + ); + ($name:ident, $re:expr, $haystack:expr, $($steps:expr),*) => ( + searcher!($name, $re, $haystack, vec vec![$($steps),*]); + ); + ($name:ident, $re:expr, $haystack:expr, vec $expect_steps:expr) => ( + #[test] + #[allow(unused_imports)] + fn $name() { + searcher_expr! 
{{ + use std::str::pattern::{Pattern, Searcher}; + use std::str::pattern::SearchStep::{Match, Reject, Done}; + let re = regex!($re); + let mut se = re.into_searcher($haystack); + let mut got_steps = vec![]; + loop { + match se.next() { + Done => break, + step => { got_steps.push(step); } + } + } + assert_eq!(got_steps, $expect_steps); + }} + } + ); +} + +searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0)); +searcher!( + searcher_empty_regex, + r"", + "ab", + Match(0, 0), + Reject(0, 1), + Match(1, 1), + Reject(1, 2), + Match(2, 2) +); +searcher!(searcher_empty_haystack, r"\d", ""); +searcher!(searcher_one_match, r"\d", "5", Match(0, 1)); +searcher!(searcher_no_match, r"\d", "a", Reject(0, 1)); +searcher!( + searcher_two_adjacent_matches, + r"\d", + "56", + Match(0, 1), + Match(1, 2) +); +searcher!( + searcher_two_non_adjacent_matches, + r"\d", + "5a6", + Match(0, 1), + Reject(1, 2), + Match(2, 3) +); +searcher!(searcher_reject_first, r"\d", "a6", Reject(0, 1), Match(1, 2)); +searcher!( + searcher_one_zero_length_matches, + r"\d*", + "a1b2", + Match(0, 0), // ^ + Reject(0, 1), // a + Match(1, 2), // a1 + Reject(2, 3), // a1b + Match(3, 4), // a1b2 +); +searcher!( + searcher_many_zero_length_matches, + r"\d*", + "a1bbb2", + Match(0, 0), // ^ + Reject(0, 1), // a + Match(1, 2), // a1 + Reject(2, 3), // a1b + Match(3, 3), // a1bb + Reject(3, 4), // a1bb + Match(4, 4), // a1bbb + Reject(4, 5), // a1bbb + Match(5, 6), // a1bbba +); +searcher!( + searcher_unicode, + r".+?", + "Ⅰ1Ⅱ2", + Match(0, 3), + Match(3, 4), + Match(4, 7), + Match(7, 8) +); diff --git a/regex-1.8.4/tests/set.rs b/regex-1.8.4/tests/set.rs new file mode 100644 index 0000000000000..d1144d6623e6a --- /dev/null +++ b/regex-1.8.4/tests/set.rs @@ -0,0 +1,74 @@ +matset!(set1, &["a", "a"], "a", 0, 1); +matset!(set2, &["a", "a"], "ba", 0, 1); +matset!(set3, &["a", "b"], "a", 0); +matset!(set4, &["a", "b"], "b", 1); +matset!(set5, &["a|b", "b|a"], "b", 0, 1); +matset!(set6, &["foo", "oo"], 
"foo", 0, 1); +matset!(set7, &["^foo", "bar$"], "foo", 0); +matset!(set8, &["^foo", "bar$"], "foo bar", 0, 1); +matset!(set9, &["^foo", "bar$"], "bar", 1); +matset!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1); +matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); +matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); +matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1); +matset!(set14, &[r".*", "a"], "zzzzzz", 0); +matset!(set15, &[r"(?-u)\ba\b"], "hello a bye", 0); +matset!(set16, &["a"], "a", 0); +matset!(set17, &[".*a"], "a", 0); +matset!(set18, &["a", "β"], "β", 1); + +// regexes that match the empty string +matset!(setempty1, &["", "a"], "abc", 0, 1); +matset!(setempty2, &["", "b"], "abc", 0, 1); +matset!(setempty3, &["", "z"], "abc", 0); +matset!(setempty4, &["a", ""], "abc", 0, 1); +matset!(setempty5, &["b", ""], "abc", 0, 1); +matset!(setempty6, &["z", ""], "abc", 1); +matset!(setempty7, &["b", "(?:)"], "abc", 0, 1); +matset!(setempty8, &["(?:)", "b"], "abc", 0, 1); +matset!(setempty9, &["c(?:)", "b"], "abc", 0, 1); + +nomatset!(nset1, &["a", "a"], "b"); +nomatset!(nset2, &["^foo", "bar$"], "bar foo"); +nomatset!( + nset3, + { + let xs: &[&str] = &[]; + xs + }, + "a" +); +nomatset!(nset4, &[r"^rooted$", r"\.log$"], "notrooted"); + +// See: https://github.com/rust-lang/regex/issues/187 +#[test] +fn regression_subsequent_matches() { + let set = regex_set!(&["ab", "b"]); + let text = text!("ba"); + assert!(set.matches(text).matched(1)); + assert!(set.matches(text).matched(1)); +} + +#[test] +fn get_set_patterns() { + let set = regex_set!(&["a", "b"]); + assert_eq!(vec!["a", "b"], set.patterns()); +} + +#[test] +fn len_and_empty() { + let empty = regex_set!(&[""; 0]); + assert_eq!(empty.len(), 0); + assert!(empty.is_empty()); + + let not_empty = regex_set!(&["ab", "b"]); + assert_eq!(not_empty.len(), 2); + assert!(!not_empty.is_empty()); +} + +#[test] +fn default_set_is_empty() { + let set: regex::bytes::RegexSet = Default::default(); + assert_eq!(set.len(), 0); + 
assert!(set.is_empty()); +} diff --git a/regex-1.8.4/tests/shortest_match.rs b/regex-1.8.4/tests/shortest_match.rs new file mode 100644 index 0000000000000..f8b4fed15691d --- /dev/null +++ b/regex-1.8.4/tests/shortest_match.rs @@ -0,0 +1,14 @@ +macro_rules! shortmat { + ($name:ident, $re:expr, $text:expr, $shortest_match:expr) => { + #[test] + fn $name() { + let text = text!($text); + let re = regex!($re); + assert_eq!($shortest_match, re.shortest_match(text)); + } + }; +} + +shortmat!(t01, r"a+", r"aa", Some(1)); +// Test that the reverse suffix optimization gets it right. +shortmat!(t02, r".*(?:abcd)+", r"abcdabcd", Some(4)); diff --git a/regex-1.8.4/tests/suffix_reverse.rs b/regex-1.8.4/tests/suffix_reverse.rs new file mode 100644 index 0000000000000..774c9e85f0548 --- /dev/null +++ b/regex-1.8.4/tests/suffix_reverse.rs @@ -0,0 +1,6 @@ +mat!(t01, r".*abcd", r"abcd", Some((0, 4))); +mat!(t02, r".*(?:abcd)+", r"abcd", Some((0, 4))); +mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8))); +mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9))); +mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9))); +mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9))); diff --git a/regex-1.8.4/tests/test_backtrack.rs b/regex-1.8.4/tests/test_backtrack.rs new file mode 100644 index 0000000000000..fb934e2d8f266 --- /dev/null +++ b/regex-1.8.4/tests/test_backtrack.rs @@ -0,0 +1,56 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .build() + .map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .build() + .map(|e| e.into_regex_set()) + }}; +} + +macro_rules! 
regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/regex-1.8.4/tests/test_backtrack_bytes.rs b/regex-1.8.4/tests/test_backtrack_bytes.rs new file mode 100644 index 0000000000000..a59426c949c49 --- /dev/null +++ b/regex-1.8.4/tests/test_backtrack_bytes.rs @@ -0,0 +1,55 @@ +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. 
+include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_ascii; diff --git a/regex-1.8.4/tests/test_backtrack_utf8bytes.rs b/regex-1.8.4/tests/test_backtrack_utf8bytes.rs new file mode 100644 index 0000000000000..6d308e9e1c250 --- /dev/null +++ b/regex-1.8.4/tests/test_backtrack_utf8bytes.rs @@ -0,0 +1,58 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/regex-1.8.4/tests/test_crates_regex.rs b/regex-1.8.4/tests/test_crates_regex.rs new file mode 100644 index 0000000000000..a68160472762c --- /dev/null +++ b/regex-1.8.4/tests/test_crates_regex.rs @@ -0,0 +1,54 @@ +/* + * This test is a minimal version of <rofl_0> and <subdiff_0> + * + * Once this bug gets fixed, uncomment rofl_0 and subdiff_0 + * (in `tests/crates_regex.rs`). +#[test] +fn word_boundary_backtracking_default_mismatch() { + use regex::internal::ExecBuilder; + + let backtrack_re = ExecBuilder::new(r"\b") + .bounded_backtracking() + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_re = ExecBuilder::new(r"\b") + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = "䅅\\u{a0}"; + + let fi1 = backtrack_re.find_iter(input); + let fi2 = default_re.find_iter(input); + for (m1, m2) in fi1.zip(fi2) { + assert_eq!(m1, m2); + } +} +*/ + +mod consistent; + +mod crates_regex { + + macro_rules! 
consistent { + ($test_name:ident, $regex_src:expr) => { + #[test] + fn $test_name() { + use super::consistent::backends_are_consistent; + + if option_env!("RUST_REGEX_RANDOM_TEST").is_some() { + match backends_are_consistent($regex_src) { + Ok(_) => {} + Err(err) => panic!("{}", err), + } + } + } + }; + } + + include!("crates_regex.rs"); +} diff --git a/regex-1.8.4/tests/test_default.rs b/regex-1.8.4/tests/test_default.rs new file mode 100644 index 0000000000000..19a319af11275 --- /dev/null +++ b/regex-1.8.4/tests/test_default.rs @@ -0,0 +1,232 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +use regex; + +// Due to macro scoping rules, this definition only applies for the modules +// defined below. Effectively, it allows us to use the same tests for both +// native and dynamic regexes. +// +// This is also used to test the various matching engines. This one exercises +// the normal code path which automatically chooses the engine based on the +// regex and the input. Other dynamic tests explicitly set the engine to use. +macro_rules! regex_new { + ($re:expr) => {{ + use regex::Regex; + Regex::new($re) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::RegexSet; + RegexSet::new($re) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod misc; +mod multiline; +mod noparse; +mod regression; +mod regression_fuzz; +mod replace; +mod searcher; +mod set; +mod shortest_match; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; + +#[test] +fn disallow_non_utf8() { + assert!(regex::Regex::new(r"(?-u)\xFF").is_err()); + assert!(regex::Regex::new(r"(?-u).").is_err()); + assert!(regex::Regex::new(r"(?-u)[\xFF]").is_err()); + assert!(regex::Regex::new(r"(?-u)☃").is_err()); +} + +#[test] +fn disallow_octal() { + assert!(regex::Regex::new(r"\0").is_err()); +} + +#[test] +fn allow_octal() { + assert!(regex::RegexBuilder::new(r"\0").octal(true).build().is_ok()); +} + +#[test] +fn oibits() { + use regex::bytes; + use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder}; + use std::panic::{RefUnwindSafe, UnwindSafe}; + + fn assert_send<T: Send>() {} + fn assert_sync<T: Sync>() {} + fn assert_unwind_safe<T: UnwindSafe>() {} + fn assert_ref_unwind_safe<T: RefUnwindSafe>() {} + + assert_send::<Regex>(); + assert_sync::<Regex>(); + assert_unwind_safe::<Regex>(); + assert_ref_unwind_safe::<Regex>(); + assert_send::<RegexBuilder>(); + assert_sync::<RegexBuilder>(); + assert_unwind_safe::<RegexBuilder>(); + assert_ref_unwind_safe::<RegexBuilder>(); + + assert_send::<bytes::Regex>(); + assert_sync::<bytes::Regex>(); + assert_unwind_safe::<bytes::Regex>(); + assert_ref_unwind_safe::<bytes::Regex>(); + assert_send::<bytes::RegexBuilder>(); + assert_sync::<bytes::RegexBuilder>(); + assert_unwind_safe::<bytes::RegexBuilder>(); + assert_ref_unwind_safe::<bytes::RegexBuilder>(); + + assert_send::<RegexSet>(); + assert_sync::<RegexSet>(); + assert_unwind_safe::<RegexSet>(); + assert_ref_unwind_safe::<RegexSet>(); + assert_send::<RegexSetBuilder>(); + assert_sync::<RegexSetBuilder>(); 
+ assert_unwind_safe::<RegexSetBuilder>(); + assert_ref_unwind_safe::<RegexSetBuilder>(); + + assert_send::<bytes::RegexSet>(); + assert_sync::<bytes::RegexSet>(); + assert_unwind_safe::<bytes::RegexSet>(); + assert_ref_unwind_safe::<bytes::RegexSet>(); + assert_send::<bytes::RegexSetBuilder>(); + assert_sync::<bytes::RegexSetBuilder>(); + assert_unwind_safe::<bytes::RegexSetBuilder>(); + assert_ref_unwind_safe::<bytes::RegexSetBuilder>(); +} + +// See: https://github.com/rust-lang/regex/issues/568 +#[test] +fn oibits_regression() { + use regex::Regex; + use std::panic; + + let _ = panic::catch_unwind(|| Regex::new("a").unwrap()); +} + +// See: https://github.com/rust-lang/regex/issues/750 +#[test] +#[cfg(target_pointer_width = "64")] +fn regex_is_reasonably_small() { + use std::mem::size_of; + + use regex::bytes; + use regex::{Regex, RegexSet}; + + assert_eq!(16, size_of::<Regex>()); + assert_eq!(16, size_of::<RegexSet>()); + assert_eq!(16, size_of::<bytes::Regex>()); + assert_eq!(16, size_of::<bytes::RegexSet>()); +} + +// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 +// See: CVE-2022-24713 +// +// We test that our regex compiler will correctly return a "too big" error when +// we try to use a very large repetition on an *empty* sub-expression. +// +// At the time this test was written, the regex compiler does not represent +// empty sub-expressions with any bytecode instructions. In effect, it's an +// "optimization" to leave them out, since they would otherwise correspond +// to an unconditional JUMP in the regex bytecode (i.e., an unconditional +// epsilon transition in the NFA graph). Therefore, an empty sub-expression +// represents an interesting case for the compiler's size limits. Since it +// doesn't actually contribute any additional memory to the compiled regex +// instructions, the size limit machinery never detects it. 
Instead, it just +// dumbly tries to compile the empty sub-expression N times, where N is the +// repetition size. +// +// When N is very large, this will cause the compiler to essentially spin and +// do nothing for a decently large amount of time. It causes the regex to take +// quite a bit of time to compile, despite the concrete syntax of the regex +// being quite small. +// +// The degree to which this is actually a problem is somewhat of a judgment +// call. Some regexes simply take a long time to compile. But in general, you +// should be able to reasonably control this by setting lower or higher size +// limits on the compiled object size. But this mitigation doesn't work at all +// for this case. +// +// This particular test is somewhat narrow. It merely checks that regex +// compilation will, at some point, return a "too big" error. Before the +// fix landed, this test would eventually fail because the regex would be +// successfully compiled (after enough time elapsed). So while this test +// doesn't check that we exit in a reasonable amount of time, it does at least +// check that we are properly returning an error at some point. +#[test] +fn big_empty_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){4294967295}"); + assert!(result.is_err()); +} + +// Below is a "billion laughs" variant of the previous test case. +#[test] +fn big_empty_reps_chain_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}"); + assert!(result.is_err()); +} + +// Below is another situation where a zero-length sub-expression can be +// introduced. +#[test] +fn big_zero_reps_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"x{0}{4294967295}"); + assert!(result.is_err()); +} + +// Testing another case for completeness. 
+#[test] +fn empty_alt_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"(?:|){4294967295}"); + assert!(result.is_err()); +} + +// Regression test for: https://github.com/rust-lang/regex/issues/969 +#[test] +fn regression_i969() { + use regex::Regex; + + let re = Regex::new(r"c.*d\z").unwrap(); + assert_eq!(Some(6), re.shortest_match_at("ababcd", 4)); + assert_eq!(Some(6), re.find_at("ababcd", 4).map(|m| m.end())); +} diff --git a/regex-1.8.4/tests/test_default_bytes.rs b/regex-1.8.4/tests/test_default_bytes.rs new file mode 100644 index 0000000000000..f200596ba1872 --- /dev/null +++ b/regex-1.8.4/tests/test_default_bytes.rs @@ -0,0 +1,75 @@ +macro_rules! regex_new { + ($re:expr) => {{ + use regex::bytes::Regex; + Regex::new($re) + }}; +} + +macro_rules! regex_set_new { + ($res:expr) => {{ + use regex::bytes::RegexSet; + RegexSet::new($res) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +// A silly wrapper to make it possible to write and match raw bytes. +struct R<'a>(&'a [u8]); +impl<'a> R<'a> { + fn as_bytes(&self) -> &'a [u8] { + self.0 + } +} + +// See: https://github.com/rust-lang/regex/issues/321 +// +// These tests are here because they do not have the same behavior in every +// regex engine. 
+mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3))); +mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None); +mat!( + invalid_utf8_nfa3, + r".", + R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"), + Some((1, 3)) +); +mat!( + invalid_utf8_nfa4, + r"${2}ä", + R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"), + None +); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod shortest_match; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/regex-1.8.4/tests/test_nfa.rs b/regex-1.8.4/tests/test_nfa.rs new file mode 100644 index 0000000000000..e5a67d180aa31 --- /dev/null +++ b/regex-1.8.4/tests/test_nfa.rs @@ -0,0 +1,50 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).nfa().build().map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re).nfa().build().map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/regex-1.8.4/tests/test_nfa_bytes.rs b/regex-1.8.4/tests/test_nfa_bytes.rs new file mode 100644 index 0000000000000..0a10e032a2544 --- /dev/null +++ b/regex-1.8.4/tests/test_nfa_bytes.rs @@ -0,0 +1,55 @@ +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .nfa() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .nfa() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/regex-1.8.4/tests/test_nfa_utf8bytes.rs b/regex-1.8.4/tests/test_nfa_utf8bytes.rs new file mode 100644 index 0000000000000..36a572b5fce03 --- /dev/null +++ b/regex-1.8.4/tests/test_nfa_utf8bytes.rs @@ -0,0 +1,54 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! 
regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).nfa().bytes(true).build().map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .nfa() + .bytes(true) + .build() + .map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/regex-1.8.4/tests/unicode.rs b/regex-1.8.4/tests/unicode.rs new file mode 100644 index 0000000000000..d7dbdd31b8ea9 --- /dev/null +++ b/regex-1.8.4/tests/unicode.rs @@ -0,0 +1,254 @@ +mat!(uni_literal, r"☃", "☃", Some((0, 3))); +mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3))); +mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3))); +mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3))); +mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); +mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); +mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))); +mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); +mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); +mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); +mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))); +mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); + +// Test the Unicode friendliness of Perl character classes. 
+mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); +mat!(uni_perl_w_not, r"\w+", "⥡", None); +mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); +mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); +mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); +mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); +mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); +mat!(uni_perl_s_not, r"\s+", "☃", None); +mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); + +// And do the same for word boundaries. +mat!(uni_boundary_none, r"\d\b", "6δ", None); +mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); +mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1))); +mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); + +// Test general categories. +// +// We should test more, but there's a lot. Write a script to generate more of +// these tests. +mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter2, r"\p{gc=LC}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter3, r"\p{LC}", "A", Some((0, 3))); +mat!( + uni_class_gencat_close_punctuation, + r"\p{Close_Punctuation}", + "❯", + Some((0, 3)) +); +mat!( + uni_class_gencat_connector_punctuation, + r"\p{Connector_Punctuation}", + "⁀", + Some((0, 3)) +); +mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2))); +mat!( + uni_class_gencat_currency_symbol, + r"\p{Currency_Symbol}", + "£", + Some((0, 3)) +); +mat!( + uni_class_gencat_dash_punctuation, + r"\p{Dash_Punctuation}", + "〰", + Some((0, 3)) +); +mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "𑓙", Some((0, 4))); +mat!( + uni_class_gencat_enclosing_mark, + r"\p{Enclosing_Mark}", + "\u{A672}", + Some((0, 3)) +); +mat!( + uni_class_gencat_final_punctuation, + r"\p{Final_Punctuation}", + "⸡", + Some((0, 3)) +); +mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); +// See: https://github.com/rust-lang/regex/issues/719 +mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); 
+mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); +mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1))); +mat!( + uni_class_gencat_initial_punctuation, + r"\p{Initial_Punctuation}", + "⸜", + Some((0, 3)) +); +mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2))); +mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3))); +mat!( + uni_class_gencat_line_separator, + r"\p{Line_Separator}", + "\u{2028}", + Some((0, 3)) +); +mat!( + uni_class_gencat_lowercase_letter, + r"\p{Lowercase_Letter}", + "ϛ", + Some((0, 2)) +); +mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4))); +mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3))); +mat!( + uni_class_gencat_modifier_letter, + r"\p{Modifier_Letter}", + "𖭃", + Some((0, 4)) +); +mat!( + uni_class_gencat_modifier_symbol, + r"\p{Modifier_Symbol}", + "🏿", + Some((0, 4)) +); +mat!( + uni_class_gencat_nonspacing_mark, + r"\p{Nonspacing_Mark}", + "\u{1E94A}", + Some((0, 4)) +); +mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3))); +mat!( + uni_class_gencat_open_punctuation, + r"\p{Open_Punctuation}", + "⦅", + Some((0, 3)) +); +mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3))); +mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3))); +mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3))); +mat!( + uni_class_gencat_other_punctuation, + r"\p{Other_Punctuation}", + "𞥞", + Some((0, 4)) +); +mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3))); +mat!( + uni_class_gencat_paragraph_separator, + r"\p{Paragraph_Separator}", + "\u{2029}", + Some((0, 3)) +); +mat!( + uni_class_gencat_private_use, + r"\p{Private_Use}", + "\u{10FFFD}", + Some((0, 4)) +); +mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "𑁍", Some((0, 4))); +mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3))); +mat!( + uni_class_gencat_space_separator, + 
r"\p{Space_Separator}", + "\u{205F}", + Some((0, 3)) +); +mat!( + uni_class_gencat_spacing_mark, + r"\p{Spacing_Mark}", + "\u{16F7E}", + Some((0, 4)) +); +mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3))); +mat!( + uni_class_gencat_titlecase_letter, + r"\p{Titlecase_Letter}", + "ῼ", + Some((0, 3)) +); +mat!( + uni_class_gencat_unassigned, + r"\p{Unassigned}", + "\u{10FFFF}", + Some((0, 4)) +); +mat!( + uni_class_gencat_uppercase_letter, + r"\p{Uppercase_Letter}", + "Ꝋ", + Some((0, 3)) +); + +// Test a smattering of properties. +mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3))); +mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4))); +mat!( + uni_class_prop_picto1, + r"\p{extendedpictographic}", + "\u{1FA6E}", + Some((0, 4)) +); +mat!( + uni_class_prop_picto2, + r"\p{extendedpictographic}", + "\u{1FFFD}", + Some((0, 4)) +); + +// grapheme_cluster_break +mat!( + uni_class_gcb_prepend, + r"\p{grapheme_cluster_break=prepend}", + "\u{11D46}", + Some((0, 4)) +); +mat!( + uni_class_gcb_ri1, + r"\p{gcb=regional_indicator}", + "\u{1F1E6}", + Some((0, 4)) +); +mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4))); +mat!( + uni_class_gcb_ri3, + r"\p{gcb=regionalindicator}", + "\u{1F1FF}", + Some((0, 4)) +); +mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3))); +mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3))); + +// word_break +mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3))); +mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3))); +mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3))); +mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3))); +mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4))); + +// sentence_break +mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2))); +mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2))); +mat!(uni_class_sb3, r"\p{sb=Close}", 
"\u{FF60}", Some((0, 3))); +mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4))); +mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3))); + +// Test 'Vithkuqi' support, which was added in Unicode 14. +// See: https://github.com/rust-lang/regex/issues/877 +mat!( + uni_vithkuqi_literal_upper, + r"(?i)^\u{10570}$", + "\u{10570}", + Some((0, 4)) +); +mat!( + uni_vithkuqi_literal_lower, + r"(?i)^\u{10570}$", + "\u{10597}", + Some((0, 4)) +); +mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4))); +mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4))); diff --git a/regex-1.8.4/tests/word_boundary.rs b/regex-1.8.4/tests/word_boundary.rs new file mode 100644 index 0000000000000..7fe97a2974883 --- /dev/null +++ b/regex-1.8.4/tests/word_boundary.rs @@ -0,0 +1,89 @@ +// Many of these are cribbed from RE2's test suite. + +matiter!(wb1, r"\b", ""); +matiter!(wb2, r"\b", "a", (0, 0), (1, 1)); +matiter!(wb3, r"\b", "ab", (0, 0), (2, 2)); +matiter!(wb4, r"^\b", "ab", (0, 0)); +matiter!(wb5, r"\b$", "ab", (2, 2)); +matiter!(wb6, r"^\b$", "ab"); +matiter!(wb7, r"\bbar\b", "nobar bar foo bar", (6, 9), (14, 17)); +matiter!(wb8, r"a\b", "faoa x", (3, 4)); +matiter!(wb9, r"\bbar", "bar x", (0, 3)); +matiter!(wb10, r"\bbar", "foo\nbar x", (4, 7)); +matiter!(wb11, r"bar\b", "foobar", (3, 6)); +matiter!(wb12, r"bar\b", "foobar\nxxx", (3, 6)); +matiter!(wb13, r"(foo|bar|[A-Z])\b", "foo", (0, 3)); +matiter!(wb14, r"(foo|bar|[A-Z])\b", "foo\n", (0, 3)); +matiter!(wb15, r"\b(foo|bar|[A-Z])", "foo", (0, 3)); +matiter!(wb16, r"\b(foo|bar|[A-Z])\b", "X", (0, 1)); +matiter!(wb17, r"\b(foo|bar|[A-Z])\b", "XY"); +matiter!(wb18, r"\b(foo|bar|[A-Z])\b", "bar", (0, 3)); +matiter!(wb19, r"\b(foo|bar|[A-Z])\b", "foo", (0, 3)); +matiter!(wb20, r"\b(foo|bar|[A-Z])\b", "foo\n", (0, 3)); +matiter!(wb21, r"\b(foo|bar|[A-Z])\b", "ffoo bbar N x", (10, 11)); +matiter!(wb22, r"\b(fo|foo)\b", "fo", (0, 2)); +matiter!(wb23, r"\b(fo|foo)\b", "foo", (0, 3)); 
+matiter!(wb24, r"\b\b", ""); +matiter!(wb25, r"\b\b", "a", (0, 0), (1, 1)); +matiter!(wb26, r"\b$", ""); +matiter!(wb27, r"\b$", "x", (1, 1)); +matiter!(wb28, r"\b$", "y x", (3, 3)); +matiter!(wb29, r"\b.$", "x", (0, 1)); +matiter!(wb30, r"^\b(fo|foo)\b", "fo", (0, 2)); +matiter!(wb31, r"^\b(fo|foo)\b", "foo", (0, 3)); +matiter!(wb32, r"^\b$", ""); +matiter!(wb33, r"^\b$", "x"); +matiter!(wb34, r"^\b.$", "x", (0, 1)); +matiter!(wb35, r"^\b.\b$", "x", (0, 1)); +matiter!(wb36, r"^^^^^\b$$$$$", ""); +matiter!(wb37, r"^^^^^\b.$$$$$", "x", (0, 1)); +matiter!(wb38, r"^^^^^\b$$$$$", "x"); +matiter!(wb39, r"^^^^^\b\b\b.\b\b\b$$$$$", "x", (0, 1)); +matiter!(wb40, r"\b.+\b", "$$abc$$", (2, 5)); +matiter!(wb41, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + +matiter!(nb1, r"\Bfoo\B", "n foo xfoox that", (7, 10)); +matiter!(nb2, r"a\B", "faoa x", (1, 2)); +matiter!(nb3, r"\Bbar", "bar x"); +matiter!(nb4, r"\Bbar", "foo\nbar x"); +matiter!(nb5, r"bar\B", "foobar"); +matiter!(nb6, r"bar\B", "foobar\nxxx"); +matiter!(nb7, r"(foo|bar|[A-Z])\B", "foox", (0, 3)); +matiter!(nb8, r"(foo|bar|[A-Z])\B", "foo\n"); +matiter!(nb9, r"\B", "", (0, 0)); +matiter!(nb10, r"\B", "x"); +matiter!(nb11, r"\B(foo|bar|[A-Z])", "foo"); +matiter!(nb12, r"\B(foo|bar|[A-Z])\B", "xXy", (1, 2)); +matiter!(nb13, r"\B(foo|bar|[A-Z])\B", "XY"); +matiter!(nb14, r"\B(foo|bar|[A-Z])\B", "XYZ", (1, 2)); +matiter!(nb15, r"\B(foo|bar|[A-Z])\B", "abara", (1, 4)); +matiter!(nb16, r"\B(foo|bar|[A-Z])\B", "xfoo_", (1, 4)); +matiter!(nb17, r"\B(foo|bar|[A-Z])\B", "xfoo\n"); +matiter!(nb18, r"\B(foo|bar|[A-Z])\B", "foo bar vNX", (9, 10)); +matiter!(nb19, r"\B(fo|foo)\B", "xfoo", (1, 3)); +matiter!(nb20, r"\B(foo|fo)\B", "xfooo", (1, 4)); +matiter!(nb21, r"\B\B", "", (0, 0)); +matiter!(nb22, r"\B\B", "x"); +matiter!(nb23, r"\B$", "", (0, 0)); +matiter!(nb24, r"\B$", "x"); +matiter!(nb25, r"\B$", "y x"); +matiter!(nb26, r"\B.$", "x"); +matiter!(nb27, r"^\B(fo|foo)\B", "fo"); +matiter!(nb28, 
r"^\B(fo|foo)\B", "foo"); +matiter!(nb29, r"^\B", "", (0, 0)); +matiter!(nb30, r"^\B", "x"); +matiter!(nb31, r"^\B\B", "", (0, 0)); +matiter!(nb32, r"^\B\B", "x"); +matiter!(nb33, r"^\B$", "", (0, 0)); +matiter!(nb34, r"^\B$", "x"); +matiter!(nb35, r"^\B.$", "x"); +matiter!(nb36, r"^\B.\B$", "x"); +matiter!(nb37, r"^^^^^\B$$$$$", "", (0, 0)); +matiter!(nb38, r"^^^^^\B.$$$$$", "x"); +matiter!(nb39, r"^^^^^\B$$$$$", "x"); + +// These work for both Unicode and ASCII because all matches are reported as +// byte offsets, and « and » do not correspond to word boundaries at either +// the character or byte level. +matiter!(unicode1, r"\bx\b", "«x", (2, 3)); +matiter!(unicode2, r"\bx\b", "x»", (0, 1)); diff --git a/regex-1.8.4/tests/word_boundary_ascii.rs b/regex-1.8.4/tests/word_boundary_ascii.rs new file mode 100644 index 0000000000000..5a3cf1166cf54 --- /dev/null +++ b/regex-1.8.4/tests/word_boundary_ascii.rs @@ -0,0 +1,9 @@ +// ASCII word boundaries are completely oblivious to Unicode characters. +// For Unicode word boundaries, the tests are precisely inverted. +matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); +matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); + +// We still get Unicode word boundaries by default in byte regexes. +matiter!(unicode1, r"\bx\b", "áxβ"); +matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); diff --git a/regex-1.8.4/tests/word_boundary_unicode.rs b/regex-1.8.4/tests/word_boundary_unicode.rs new file mode 100644 index 0000000000000..c41355ffc4b88 --- /dev/null +++ b/regex-1.8.4/tests/word_boundary_unicode.rs @@ -0,0 +1,6 @@ +// Unicode word boundaries know about Unicode characters. +// For ASCII word boundaries, the tests are precisely inverted. +matiter!(unicode1, r"\bx\b", "áxβ"); +matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); + +matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));