Closes wokket#22: Support for HL7 escape sequence decoding (wokket#28)

* WIP: starting on an escape sequence decoder * feat wokket#22: WIP on escape sequence handling, not as easy as I'd hoped because of HL7's full on spec. This will prob come in stages. * feat wokket#22: Good progress on simple delimiter escape sequences * feat wokket#22: Ensure highlighted text sequences (\N\, \H\)are ignored * perf: better than halved the perf of the 'No escape sequence' benchmark using a regex(!) rather than a simple str.find() * feat: Added support for ignoring custom (\Zdd\) escape sequences * chore: docs pass, moved decoder into better module/location * perf: Moved to the regex for all searching ops, about a 15% improvement in the benchmark * docs * docs: Added demo example for info on how to use the library * docs: Docs pass * feat: Added support for \X..\ escape sequences * docs: Updated docs for \X\ sequences.
sempervictus · Aug 18, 2021 · fa9689c · fa9689c
1 parent a553051
commit fa9689c
Show file tree

Hide file tree

Showing 8 changed files with 478 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## 0.5.0
  - Add `query` functions to replace the string based `Index` impls in the previous version.  These are functionally identical to the string `Index` implementations, but avoid some lifetime issues (returning `&&str`) and have visible documentation.
+ - Add `EscapeSequence` struct to support decoding [escape sequences](https://www.lyniate.com/knowledge-hub/hl7-escape-sequences/) back to their original values.
 
 ## 0.4.0
  - Large change (thanks @sempervictus) to allow querying of message content by both numerical indexer and dot-notation string indexers

diff --git a/Cargo.toml b/Cargo.toml
@@ -12,11 +12,18 @@ name="rusthl7"
 path="src/lib.rs"
 
 [dependencies]
+hex = "0.4"
+log = "0.4"
+regex = "1.5"
 thiserror = "1.0"
 
 [dev-dependencies]
 criterion = "0.3"
 
 [[bench]]
 name = "simple_parse"
+harness = false
+
+[[bench]]
+name = "decoder"
 harness = false
diff --git a/benches/decoder.rs b/benches/decoder.rs
@@ -0,0 +1,74 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use rusthl7::{escape_sequence::*, separators::Separators};
+
+// Note that we're calling decode on a whole message here, although it would normally be on an individual field...
+// this is just to make it work a bit harder on a larger dataset, not because it makes sense in an HL7 sense
+
+/// Benchmarks `decode()` on a sample message that contains no backslash characters at all.
+fn no_sequences(c: &mut Criterion) {
+    c.bench_function("No Escape Sequences", |b| {
+        let delims = Separators::default();
+        let decoder = EscapeSequence::new(delims);
+
+        // Return the decoded value rather than discarding it with `let _ =`: criterion passes the
+        // closure's return value through `black_box`, so the optimiser can't elide the work.
+        b.iter(|| decoder.decode(get_sample_message_no_sequence()));
+    });
+}
+
+/// Benchmarks construction of the decoder.
+///
+/// We expect creation to be a little slower, as we init the regexes to make `decode()` calls faster.
+/// Amortizing this cost across multiple calls makes sense if we're caching the struct.
+fn create_struct(c: &mut Criterion) {
+    c.bench_function("Create struct", |b| {
+        let delims = Separators::default();
+
+        // Return the new struct rather than discarding it with `let _ =`: criterion passes the
+        // closure's return value through `black_box`, so the construction can't be optimised away.
+        b.iter(|| EscapeSequence::new(delims));
+    });
+}
+
+/// Benchmarks `decode()` on a sample message that contains backslashes but no complete escape sequence.
+fn no_sequences_but_backslash(c: &mut Criterion) {
+    c.bench_function("No Escape Sequences But Backslash", |b| {
+        let delims = Separators::default();
+        let decoder = EscapeSequence::new(delims);
+
+        // Return the decoded value rather than discarding it with `let _ =`: criterion passes the
+        // closure's return value through `black_box`, so the optimiser can't elide the work.
+        b.iter(|| decoder.decode(get_sample_message_with_backslash()));
+    });
+}
+
+/// Benchmarks `decode()` on a sample message that contains an `\F\` escape sequence.
+fn has_escape_sequences(c: &mut Criterion) {
+    c.bench_function("Has Escape Sequences", |b| {
+        let delims = Separators::default();
+        let decoder = EscapeSequence::new(delims);
+
+        // Return the decoded value rather than discarding it with `let _ =`: criterion passes the
+        // closure's return value through `black_box`, so the optimiser can't elide the work.
+        b.iter(|| decoder.decode(get_sample_message_with_escape_sequences()));
+    });
+}
+
+fn get_sample_message_no_sequence() -> &'static str {
+    // The MSH escape character (normally a backslash) is replaced with `*` here, so this sample contains no backslash at all
+    "MSH|^~*&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|70_105|H|||F"
+}
+
+fn get_sample_message_with_backslash() -> &'static str {
+    // Apart from the MSH escape character, there is a single unpaired backslash just before `70_105` in the OBX segment
+    "MSH|^~\\&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|\\70_105|H|||F"
+}
+
+fn get_sample_message_with_escape_sequences() -> &'static str {
+    // Contains a `\F\` escape sequence at the start of PID.3, plus the unpaired backslash before `70_105` in the OBX segment
+    "MSH|^~\\&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||\\F\\555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|\\70_105|H|||F"
+}
+
+// Register every benchmark above in a single `decoder` group and hand that group to criterion's entry point.
+criterion_group!(
+    decoder,
+    create_struct,
+    no_sequences,
+    no_sequences_but_backslash,
+    has_escape_sequences
+);
+criterion_main!(decoder);
diff --git a/examples/demo.rs b/examples/demo.rs
@@ -0,0 +1,34 @@
+/*!
+ A short example demonstrating one way to use this library for HL7 processing.
+*/
+
+use std::{convert::TryFrom, error::Error};
+use rusthl7::{escape_sequence::EscapeSequence, message::Message};
+
+fn main() -> Result<(), Box<dyn Error>> {
+    // In a real system the message would arrive over the wire from a remote service.
+    // Consider the hl7-mllp-codec crate (or similar) to make building that network layer easier.
+    let raw = get_sample_message();
+
+    // Turn the raw text into a structured, queryable message.
+    let message = Message::try_from(raw)?;
+
+    // `query` reaches deep into the message: PID.F11.C5 is field 11, component 5 of the PID segment.
+    let postcode = message.query("PID.F11.C5");
+    assert_eq!(postcode, "35292");
+
+    // Queried values may still contain escape sequences...
+    let encoded_practice = message.query("OBR.F23");
+    assert_eq!(encoded_practice, r#"Joes Obs \T\ Gynae"#);
+
+    // ...so run them through an `EscapeSequence` built from the message's own separators.
+    let decoder = EscapeSequence::new(message.get_separators());
+    let decoded_practice = decoder.decode(encoded_practice);
+    assert_eq!(decoded_practice, "Joes Obs & Gynae"); // the \T\ sequence became an ampersand
+
+    Ok(())
+}
+
+fn get_sample_message() -> &'static str {
+    // Sample ORU^R01 message; OBR.F23 holds `Joes Obs \T\ Gynae`, which the demo above decodes
+    "MSH|^~\\&|GHH LAB|ELAB-3|GHH OE|BLDG4|200202150930||ORU^R01|CNTRL-3456|P|2.4\rPID|||555-44-4444||EVERYWOMAN^EVE^E^^^^L|JONES|19620320|F|||153 FERNWOOD DR.^^STATESVILLE^OH^35292||(206)3345232|(206)752-121||||AC555444444||67-A4335^OH^20030520\rOBR|1|845439^GHH OE|1045813^GHH LAB|15545^GLUCOSE|||200202150730|||||||||555-55-5555^PRIMARY^PATRICIA P^^^^MD^^|||||||Joes Obs \\T\\ Gynae||F||||||444-44-4444^HIPPOCRATES^HOWARD H^^^^MD\rOBX|1|SN|1554-5^GLUCOSE^POST 12H CFST:MCNC:PT:SER/PLAS:QN||^182|mg/dl|70_105|H|||F"
+}
diff --git a/readme.md b/readme.md
@@ -5,21 +5,27 @@
 
 Totally kind of like production ready!
 
-The first cut was intended to parse from a multiline text blob into a tree of string slices, representing all the different facets of info.
 This second cut provides consistent structure down to the sub-sub-field, efficient accessors to shared string reference data, with standardized implementations of common functionality.
 
-Interpreting these facets (type conversion, determining which fields they represent etc) is a future problem.
+Interpreting these facets (type conversion, determining which fields they represent etc) is a future problem... there is **no plan whatsoever** for message conformance checks or anything of that nature.
+
+This library is trying to provide the _tooling_ you need to build robust HL7 based systems, without dictating _how_ you go about it.  There's no one-size-fits-all here, so we try to provide a box of separate tools rather than a full framework.
 
 ### Intended Features and Design Notes:
 - [x] Initially use hl7 default separator chars
 - [x] Use separator chars from the message
-- [X] Add support for sub-field (repeat/component/subcomponent) items
+- [X] Add support for sub-field (component/subcomponent) items
+    - [ ] Field repeats (via `~`) are currently missing ([#26](https://github.com/wokket/rust-hl7/issues/26))
 - [X] Initially, avoid any per-segment knowledge, requirement to read the spec too much etc.
     - Implementing all the segments, across all the hl7 versions, version-specific parsing etc is tooooo much while we're getting started.
-- [ ] Add Decoding/Encoding of special chars
+- [-] Add support for [HL7 escape sequences](https://www.lyniate.com/knowledge-hub/hl7-escape-sequences/) ([#22](https://github.com/wokket/rust-hl7/issues/22))
+    - [x] Decoding of the most common escape sequences including `\E\`, `\R\`, `\S\` & `\T\`
+    - [x] Correctly passes through `\H\`, `\N\` and custom `\Z..\` sequences unchanged
+    - [X] Decodes `\X..\` sequences for hex-encoded chars
+    - [ ] Support for various unicode sequences (`\C..\`, `\M..\`).  These are lower priority as [HL7 Australia considers them deprecated](https://confluence.hl7australia.com/display/OO/3+Datatypes#id-3Datatypes-3.1.1.6EscapesequencessupportingmultiplecharactersetsforFT,ST,andTXdatatypes)
 - [ ] Add tighter MSH as an exception to the above
 - [ ] The above allows us to parse everything as strings, and provide helper methods for type conversions as required.
-- [ ] Parse using a from_str() impl rather than a dedicated parser (idiomatic but no lifetimes)
+- [x] Parse a message using a `TryFrom<&str>` impl rather than a dedicated parser
 - [x] Index into messages using HL7 string index notation and binary methods
     - [x] Index into sub-fields using HL7 string index notation and binary methods
     - [X] Index into the segment enum using HL7 string index notation and binary methods