option for trimming fixed sized character data
pacman82 committed Aug 28, 2024
1 parent a6c57fe commit 8eaca1e
Showing 5 changed files with 53 additions and 20 deletions.
2 changes: 2 additions & 0 deletions src/reader.rs
@@ -119,6 +119,7 @@ pub fn choose_column_strategy(
col_index: u16,
buffer_allocation_options: BufferAllocationOptions,
map_value_errors_to_null: bool,
trim_fixed_sized_character_strings: bool,
) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
let strat: Box<dyn ReadStrategy + Send> = match field.data_type() {
ArrowDataType::Boolean => {
@@ -149,6 +150,7 @@ pub fn choose_column_strategy(
sql_type,
lazy_display_size,
buffer_allocation_options.max_text_size,
trim_fixed_sized_character_strings,
)?
}
ArrowDataType::Decimal128(precision, scale @ 0..) => {
17 changes: 14 additions & 3 deletions src/reader/odbc_reader.rs
@@ -209,6 +209,7 @@ pub struct OdbcReaderBuilder {
max_binary_size: Option<usize>,
map_value_errors_to_null: bool,
fallibale_allocations: bool,
trim_fixed_sized_character_strings: bool,
}

impl OdbcReaderBuilder {
@@ -230,6 +231,7 @@ impl OdbcReaderBuilder {
max_binary_size: None,
fallibale_allocations: false,
map_value_errors_to_null: false,
trim_fixed_sized_character_strings: false,
}
}

@@ -320,7 +322,11 @@ impl OdbcReaderBuilder {

/// If set to `true`, text in fixed sized character columns, e.g. CHAR, is trimmed of
/// whitespace before being converted into Arrow UTF-8 arrays. Default is `false`.
pub fn trim_fixed_sized_characters(&mut self, fixed_sized_character_strings_are_trimmed: bool) -> &mut Self{
pub fn trim_fixed_sized_characters(
&mut self,
fixed_sized_character_strings_are_trimmed: bool,
) -> &mut Self {
self.trim_fixed_sized_character_strings = fixed_sized_character_strings_are_trimmed;
self
}
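
For orientation, a minimal usage sketch of the new builder flag (not part of this commit). It assumes an already executed `odbc_api` cursor; the helper name and error handling are illustrative only.

```rust
use arrow_odbc::{arrow::record_batch::RecordBatch, odbc_api::Cursor, OdbcReaderBuilder};

/// Hypothetical helper: read every batch from an executed ODBC cursor,
/// trimming the padding whitespace carried by fixed sized CHAR / WCHAR columns.
fn read_all_trimmed(
    cursor: impl Cursor,
) -> Result<Vec<RecordBatch>, Box<dyn std::error::Error>> {
    let reader = OdbcReaderBuilder::new()
        .trim_fixed_sized_characters(true) // the option introduced in this commit
        .build(cursor)?;
    // `OdbcReader` yields `Result<RecordBatch, _>` items.
    let mut batches = Vec::new();
    for batch in reader {
        batches.push(batch?);
    }
    Ok(batches)
}
```

Variable-length types such as VARCHAR are unaffected; the flag only applies to the fixed sized character types handled in `src/reader/text.rs` below.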

@@ -361,8 +367,13 @@ impl OdbcReaderBuilder {
max_binary_size: self.max_binary_size,
fallibale_allocations: self.fallibale_allocations,
};
let converter =
ToRecordBatch::new(&mut cursor, self.schema.clone(), buffer_allocation_options, self.map_value_errors_to_null)?;
let converter = ToRecordBatch::new(
&mut cursor,
self.schema.clone(),
buffer_allocation_options,
self.map_value_errors_to_null,
self.trim_fixed_sized_character_strings
)?;
let bytes_per_row = converter.row_size_in_bytes();
let buffer_size_in_rows = self.buffer_size_in_rows(bytes_per_row)?;
let row_set_buffer =
48 changes: 33 additions & 15 deletions src/reader/text.rs
@@ -16,13 +16,19 @@ pub fn choose_text_strategy(
sql_type: OdbcDataType,
lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
max_text_size: Option<usize>,
trim_fixed_sized_character_strings: bool,
) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
let apply_buffer_limit = |len| match (len, max_text_size) {
(None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
(None, Some(limit)) => Ok(limit),
(Some(len), None) => Ok(len),
(Some(len), Some(limit)) => Ok(min(len, limit)),
};
let is_fixed_sized_char = matches!(
sql_type,
OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
);
let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
let strategy: Box<dyn ReadStrategy + Send> = if cfg!(target_os = "windows") {
let hex_len = sql_type
.utf16_len()
@@ -31,7 +37,7 @@ pub fn choose_text_strategy(
.transpose()
.map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
wide_text_strategy(hex_len)
wide_text_strategy(hex_len, trim)
} else {
let octet_len = sql_type
.utf8_len()
@@ -43,20 +49,18 @@ pub fn choose_text_strategy(
// So far only Linux users seemed to have complained about panics due to garbage indices?
// Linux usually would use UTF-8, so we only invest work in working around this for narrow
// strategies
narrow_text_strategy(octet_len)
narrow_text_strategy(octet_len, trim)
};

Ok(strategy)
}

fn wide_text_strategy(u16_len: usize) -> Box<dyn ReadStrategy + Send> {
Box::new(WideText::new(u16_len))
fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
Box::new(WideText::new(u16_len, trim))
}

fn narrow_text_strategy(
octet_len: usize,
) -> Box<dyn ReadStrategy + Send> {
Box::new(NarrowText::new(octet_len))
fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
Box::new(NarrowText::new(octet_len, trim))
}

/// Strategy requesting the text from the database as UTF-16 (Wide characters) and emitting it as
@@ -65,11 +69,13 @@ fn narrow_text_strategy(
pub struct WideText {
/// Maximum string length in u16, excluding terminating zero
max_str_len: usize,
/// Whether the string should be trimmed.
trim: bool,
}

impl WideText {
pub fn new(max_str_len: usize) -> Self {
Self { max_str_len }
pub fn new(max_str_len: usize, trim: bool) -> Self {
Self { max_str_len, trim }
}
}

@@ -96,7 +102,12 @@ impl ReadStrategy for WideText {
for c in decode_utf16(utf16.as_slice().iter().cloned()) {
buf_utf8.push(c.unwrap());
}
Some(&buf_utf8)
let slice = if self.trim {
buf_utf8.trim()
} else {
buf_utf8.as_str()
};
Some(slice)
} else {
None
};
@@ -109,11 +120,13 @@ pub struct NarrowText {
pub struct NarrowText {
/// Maximum string length in u8, excluding terminating zero
max_str_len: usize,
/// Whether the string should be trimmed.
trim: bool,
}

impl NarrowText {
pub fn new(max_str_len: usize) -> Self {
Self { max_str_len }
pub fn new(max_str_len: usize, trim: bool) -> Self {
Self { max_str_len, trim }
}
}

@@ -129,8 +142,13 @@ impl ReadStrategy for NarrowText {
let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
for value in view.iter() {
builder.append_option(value.map(|bytes| {
std::str::from_utf8(bytes)
.expect("ODBC driver had been expected to return valid utf8, but did not.")
let untrimmed = std::str::from_utf8(bytes)
.expect("ODBC driver had been expected to return valid utf8, but did not.");
if self.trim {
untrimmed.trim()
} else {
untrimmed
}
}));
}
Ok(Arc::new(builder.finish()))
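
Side note, not part of the diff: both read strategies delegate to Rust's `str::trim`, which strips leading as well as trailing whitespace, so padding on either side of the stored value is removed. A standalone check of that behavior:

```rust
fn main() {
    // `str::trim` removes leading and trailing whitespace alike, which is why the
    // integration test below now expects "123" for both " 123" and "123 ".
    assert_eq!("1234", "1234".trim());
    assert_eq!("123", " 123".trim());
    assert_eq!("123", "123 ".trim());
}
```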
2 changes: 2 additions & 0 deletions src/reader/to_record_batch.rs
@@ -28,6 +28,7 @@ impl ToRecordBatch {
schema: Option<SchemaRef>,
buffer_allocation_options: BufferAllocationOptions,
map_value_errors_to_null: bool,
trim_fixed_sized_character_strings: bool,
) -> Result<Self, Error> {
// Infer schema if not given by the user
let schema = if let Some(schema) = schema {
@@ -48,6 +49,7 @@ impl ToRecordBatch {
col_index,
buffer_allocation_options,
map_value_errors_to_null,
trim_fixed_sized_character_strings,
)
.map_err(|cause| cause.into_crate_error(field.name().clone(), index))
})
4 changes: 2 additions & 2 deletions tests/integration.rs
@@ -322,8 +322,8 @@ fn trim_fixed_sized_character_data() {
// Assert that the correct values are found within the arrow batch
let array_vals = array_any.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("1234", array_vals.value(0));
assert_eq!(" 123", array_vals.value(1));
assert_eq!("123 ", array_vals.value(2));
assert_eq!("123", array_vals.value(1));
assert_eq!("123", array_vals.value(2));
}

/// Fill a record batch of Strings from a nvarchar source column