From 27efef22246f4547de5289778fed8aabee230e4d Mon Sep 17 00:00:00 2001 From: Tim Thompson Date: Sat, 14 Aug 2021 17:38:17 +1000 Subject: [PATCH] feat #22: WIP on escape sequence handling, not as easy as I'd hoped because of HL7's full on spec. This will prob come in stages. --- benches/decoder.rs | 42 +++++++++++++++-- src/decoder/mod.rs | 110 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 126 insertions(+), 26 deletions(-) diff --git a/benches/decoder.rs b/benches/decoder.rs index 3f45745..a158cbd 100644 --- a/benches/decoder.rs +++ b/benches/decoder.rs @@ -1,9 +1,11 @@ use criterion::{criterion_group, criterion_main, Criterion}; use rusthl7::{decoder::*, separators::Separators}; +// Note that we're calling decode on a whole message here, although it would normally be on an individual field... +// this is just to make it work a bit harder on a larger dataset, not because it makes sense in a HL7 sense + fn no_sequences(c: &mut Criterion) { c.bench_function("No Escape Sequences", |b| { - let delims = Separators::default(); let decoder = EscapeSequence::new(delims); @@ -13,15 +15,47 @@ fn no_sequences(c: &mut Criterion) { }); } +fn no_sequences_but_backslash(c: &mut Criterion) { + c.bench_function("No Escape Sequences But Backslash", |b| { + let delims = Separators::default(); + let decoder = EscapeSequence::new(delims); + b.iter(|| { + let _ = decoder.decode(get_sample_message_with_backslash()); + }) + }); +} + +fn has_escape_sequences(c: &mut Criterion) { + c.bench_function("Has Escape Sequences", |b| { + let delims = Separators::default(); + let decoder = EscapeSequence::new(delims); + + b.iter(|| { + let _ = decoder.decode(get_sample_message_with_escape_sequences()); + }) + }); +} fn get_sample_message_no_sequence() -> &'static str { - "MSH|^~\\&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD 
DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|70_105|H|||F" + // note we've stripped the backslash from the MSH + "MSH|^~*&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|70_105|H|||F" +} + +fn get_sample_message_with_backslash() -> &'static str { + //there's a backslash down at char 487! + "MSH|^~\\&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|\\70_105|H|||F" } +fn get_sample_message_with_escape_sequences() -> &'static str { + //there's a backslash down at char 487! 
+ "MSH|^~\\&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||\\F\\555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|\\70_105|H|||F" +} criterion_group!( decoder, - no_sequences + no_sequences, + no_sequences_but_backslash, + has_escape_sequences ); -criterion_main!(decoder); \ No newline at end of file +criterion_main!(decoder); diff --git a/src/decoder/mod.rs b/src/decoder/mod.rs index dd7eaff..b0dd668 100644 --- a/src/decoder/mod.rs +++ b/src/decoder/mod.rs @@ -3,48 +3,88 @@ For more info see [here](https://www.lyniate.com/knowledge-hub/hl7-escape-sequences/) or [here](https://confluence.hl7australia.com/display/OOADRM20181/Appendix+1+Parsing+HL7v2#Appendix1ParsingHL7v2-Dealingwithreservedcharactersanddelimiters) + ## Details + This decoder will replace some, ** but not all ** of the standard HL7 escape sequences. Specifically, the following sequences are **NOT** replaced: + - `\H\` - Indicates the start of highlighted text, this is a consuming application problem and will not be replaced + - `\N\` - Indicates the end of highlighted text and resumption of normal text. 
This is a consuming application problem and will not be replaced + - `\Z...\` - Custom application escape sequences, these are custom (as are most `Z` items in HL7) and will not be replaced */ -use std::borrow::Cow; -use regex::{Match, Regex}; use crate::separators::Separators; - +use regex::Regex; +use std::borrow::Cow; pub struct EscapeSequence { delims: Separators, - regex: Regex + escape_char_regex: Regex, + field_regex: Regex, } impl<'a> EscapeSequence { - pub fn new(delims: Separators) -> EscapeSequence { - EscapeSequence { + let return_val = EscapeSequence { delims, - regex: Regex::new("[/]").unwrap() - } - } + escape_char_regex: Regex::new(r#"\\"#).unwrap(), + field_regex: Regex::new(r#"\\F\\"#).unwrap(), + }; + return_val + } - pub fn decode<S>(&self, input: S) -> Cow<'a, str> - where S: Into<Cow<'a, str>> + pub fn decode<S>(&self, input: S) -> Cow<'a, str> + where + S: Into<Cow<'a, str>>, { - let input = input.into(); - let first = self.regex.find(&input); // find the first escape sequence + //let first = input.find(self.delims.escape_char); + let first = self.escape_char_regex.find(&input); if first.is_some() { - input.into() - } else { // no escape char in the string at all, just return what we have - input.into() + // We know there's a backslash, so we need to process stuff + // I wanted to use regex as a simple(ish) if slow(ish) way to get started with this, but the requirement for a dynamic escaping char (typically `\`) + // which may/may not need doubling up for regex interpretation reasons is making that harder... + + let output = self + .field_regex + .replace_all(&input, self.delims.field.to_string()); + + Cow::Owned(output.into()) + + /* + // TODO: Awesome forwards-only string manip work... 
gotta start simple + + let first = first.unwrap().start(); + let mut output: Vec<u8> = Vec::from(input[0..first].as_bytes()); + output.reserve(input.len() - first); + + let rest = input[first+1..].bytes(); // the +1 skips the initial backslash that got us here + + let mut iter = rest.into_iter(); + + while let Some(c) = iter.next() { + match c { + b'F' => { + output.extend_from_slice(&self.field_bytes); + iter.next(); // eat the next slash //TODO: How do we know that was a slash? + } + _ => output.push(c), + } + } + + Cow::Owned(String::from_utf8(output).unwrap()) + */ + } else { + // no escape char in the string at all, just return what we have + input } - } } - #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; #[test] @@ -52,22 +92,48 @@ mod tests { let delims = Separators::default(); let escaper = EscapeSequence::new(delims); - let input = "There are no escape sequences here/there."; let output = escaper.decode(input); assert_eq!(output, input); } #[test] - fn test_decode_handles_field_sequence() { + fn test_decode_does_nothing_if_backslash_is_not_escape_sequence() { let delims = Separators::default(); let escaper = EscapeSequence::new(delims); + let input = r#"There are no escape sequences here\there."#; + let output = escaper.decode(input); + assert_eq!(output, input); + } - let input = "Escape this \\F\\ please"; + #[test] + fn test_decode_handles_field_sequence() { + let delims = Separators::default(); + let escaper = EscapeSequence::new(delims); + + let input = r#"Escape this \F\ please"#; let output = escaper.decode(input); assert_eq!(output, "Escape this | please"); } + #[test] + fn ensure_decode_does_not_eat_chars_it_shouldnt() { + let delims = Separators::default(); + let escaper = EscapeSequence::new(delims); + + let input = r#"Escape this \F please"#; + let output = escaper.decode(input); + assert_eq!(output, input); + } + + #[test] + fn ensure_decode_handles_custom_delims() { + let delims = Separators::from_str("MSH|!@#$|").unwrap(); 
let escaper = EscapeSequence::new(delims); -} \ No newline at end of file + let input = r#"Escape this #F# please"#; + let output = escaper.decode(input); + assert_eq!(output, "Escape this # please"); + } +}