option for trimming fixed sized character data
pacman82 committed Aug 28, 2024
1 parent a6c57fe commit 8eaca1e
Showing 5 changed files with 53 additions and 20 deletions.
2 changes: 2 additions & 0 deletions src/reader.rs
@@ -119,6 +119,7 @@ pub fn choose_column_strategy(
col_index: u16,
buffer_allocation_options: BufferAllocationOptions,
map_value_errors_to_null: bool,
trim_fixed_sized_character_strings: bool,
) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
let strat: Box<dyn ReadStrategy + Send> = match field.data_type() {
ArrowDataType::Boolean => {
@@ -149,6 +150,7 @@ pub fn choose_column_strategy(
sql_type,
lazy_display_size,
buffer_allocation_options.max_text_size,
trim_fixed_sized_character_strings,
)?
}
ArrowDataType::Decimal128(precision, scale @ 0..) => {
17 changes: 14 additions & 3 deletions src/reader/odbc_reader.rs
@@ -209,6 +209,7 @@ pub struct OdbcReaderBuilder {
max_binary_size: Option<usize>,
map_value_errors_to_null: bool,
fallibale_allocations: bool,
trim_fixed_sized_character_strings: bool,
}

impl OdbcReaderBuilder {
@@ -230,6 +231,7 @@ impl OdbcReaderBuilder {
max_binary_size: None,
fallibale_allocations: false,
map_value_errors_to_null: false,
trim_fixed_sized_character_strings: false,
}
}

@@ -320,7 +322,11 @@ impl OdbcReaderBuilder {

/// If set to `true`, text in fixed sized character columns, e.g. CHAR, is trimmed of
/// whitespace before being converted into Arrow UTF-8 arrays. Default is `false`.
pub fn trim_fixed_sized_characters(&mut self, fixed_sized_character_strings_are_trimmed: bool) -> &mut Self{
pub fn trim_fixed_sized_characters(
&mut self,
fixed_sized_character_strings_are_trimmed: bool,
) -> &mut Self {
self.trim_fixed_sized_character_strings = fixed_sized_character_strings_are_trimmed;
self
}
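
For orientation, a minimal usage sketch of the new builder flag (not part of this commit). It assumes an already executed `odbc_api` cursor; the helper name and error handling are illustrative only.

```rust
use arrow_odbc::{arrow::record_batch::RecordBatch, odbc_api::Cursor, OdbcReaderBuilder};

/// Hypothetical helper: read every batch from an executed ODBC cursor,
/// trimming the padding whitespace carried by fixed sized CHAR / WCHAR columns.
fn read_all_trimmed(
    cursor: impl Cursor,
) -> Result<Vec<RecordBatch>, Box<dyn std::error::Error>> {
    let reader = OdbcReaderBuilder::new()
        .trim_fixed_sized_characters(true) // the option introduced in this commit
        .build(cursor)?;
    // `OdbcReader` yields `Result<RecordBatch, _>` items.
    let mut batches = Vec::new();
    for batch in reader {
        batches.push(batch?);
    }
    Ok(batches)
}
```

Variable-length types such as VARCHAR are unaffected; the flag only applies to the fixed sized character types handled in `src/reader/text.rs` below.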

@@ -361,8 +367,13 @@ impl OdbcReaderBuilder {
max_binary_size: self.max_binary_size,
fallibale_allocations: self.fallibale_allocations,
};
let converter =
ToRecordBatch::new(&mut cursor, self.schema.clone(), buffer_allocation_options, self.map_value_errors_to_null)?;
let converter = ToRecordBatch::new(
&mut cursor,
self.schema.clone(),
buffer_allocation_options,
self.map_value_errors_to_null,
self.trim_fixed_sized_character_strings
)?;
let bytes_per_row = converter.row_size_in_bytes();
let buffer_size_in_rows = self.buffer_size_in_rows(bytes_per_row)?;
let row_set_buffer =
48 changes: 33 additions & 15 deletions src/reader/text.rs
@@ -16,13 +16,19 @@ pub fn choose_text_strategy(
sql_type: OdbcDataType,
lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
max_text_size: Option<usize>,
trim_fixed_sized_character_strings: bool,
) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
let apply_buffer_limit = |len| match (len, max_text_size) {
(None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
(None, Some(limit)) => Ok(limit),
(Some(len), None) => Ok(len),
(Some(len), Some(limit)) => Ok(min(len, limit)),
};
let is_fixed_sized_char = matches!(
sql_type,
OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
);
let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
let strategy: Box<dyn ReadStrategy + Send> = if cfg!(target_os = "windows") {
let hex_len = sql_type
.utf16_len()
@@ -31,7 +37,7 @@ pub fn choose_text_strategy(
.transpose()
.map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
wide_text_strategy(hex_len)
wide_text_strategy(hex_len, trim)
} else {
let octet_len = sql_type
.utf8_len()
@@ -43,20 +49,18 @@ pub fn choose_text_strategy(
// So far only Linux users seemed to have complained about panics due to garbage indices?
// Linux usually would use UTF-8, so we only invest work in working around this for narrow
// strategies
narrow_text_strategy(octet_len)
narrow_text_strategy(octet_len, trim)
};

Ok(strategy)
}

fn wide_text_strategy(u16_len: usize) -> Box<dyn ReadStrategy + Send> {
Box::new(WideText::new(u16_len))
fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
Box::new(WideText::new(u16_len, trim))
}

fn narrow_text_strategy(
octet_len: usize,
) -> Box<dyn ReadStrategy + Send> {
Box::new(NarrowText::new(octet_len))
fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
Box::new(NarrowText::new(octet_len, trim))
}

/// Strategy requesting the text from the database as UTF-16 (Wide characters) and emitting it as
@@ -65,11 +69,13 @@ fn narrow_text_strategy(
pub struct WideText {
/// Maximum string length in u16, excluding terminating zero
max_str_len: usize,
/// Whether the string should be trimmed.
trim: bool,
}

impl WideText {
pub fn new(max_str_len: usize) -> Self {
Self { max_str_len }
pub fn new(max_str_len: usize, trim: bool) -> Self {
Self { max_str_len, trim }
}
}

@@ -96,7 +102,12 @@ impl ReadStrategy for WideText {
for c in decode_utf16(utf16.as_slice().iter().cloned()) {
buf_utf8.push(c.unwrap());
}
Some(&buf_utf8)
let slice = if self.trim {
buf_utf8.trim()
} else {
buf_utf8.as_str()
};
Some(slice)
} else {
None
};
@@ -109,11 +120,13 @@ pub struct NarrowText {
pub struct NarrowText {
/// Maximum string length in u8, excluding terminating zero
max_str_len: usize,
/// Whether the string should be trimmed.
trim: bool,
}

impl NarrowText {
pub fn new(max_str_len: usize) -> Self {
Self { max_str_len }
pub fn new(max_str_len: usize, trim: bool) -> Self {
Self { max_str_len, trim }
}
}

@@ -129,8 +142,13 @@ impl ReadStrategy for NarrowText {
let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
for value in view.iter() {
builder.append_option(value.map(|bytes| {
std::str::from_utf8(bytes)
.expect("ODBC driver had been expected to return valid utf8, but did not.")
let untrimmed = std::str::from_utf8(bytes)
.expect("ODBC driver had been expected to return valid utf8, but did not.");
if self.trim {
untrimmed.trim()
} else {
untrimmed
}
}));
}
Ok(Arc::new(builder.finish()))
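
Side note, not part of the diff: both read strategies delegate to Rust's `str::trim`, which strips leading as well as trailing whitespace, so padding on either side of the stored value is removed. A standalone check of that behavior:

```rust
fn main() {
    // `str::trim` removes leading and trailing whitespace alike, which is why the
    // integration test below now expects "123" for both " 123" and "123 ".
    assert_eq!("1234", "1234".trim());
    assert_eq!("123", " 123".trim());
    assert_eq!("123", "123 ".trim());
}
```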
2 changes: 2 additions & 0 deletions src/reader/to_record_batch.rs
@@ -28,6 +28,7 @@ impl ToRecordBatch {
schema: Option<SchemaRef>,
buffer_allocation_options: BufferAllocationOptions,
map_value_errors_to_null: bool,
trim_fixed_sized_character_strings: bool,
) -> Result<Self, Error> {
// Infer schema if not given by the user
let schema = if let Some(schema) = schema {
@@ -48,6 +49,7 @@ impl ToRecordBatch {
col_index,
buffer_allocation_options,
map_value_errors_to_null,
trim_fixed_sized_character_strings,
)
.map_err(|cause| cause.into_crate_error(field.name().clone(), index))
})
4 changes: 2 additions & 2 deletions tests/integration.rs
@@ -322,8 +322,8 @@ fn trim_fixed_sized_character_data() {
// Assert that the correct values are found within the arrow batch
let array_vals = array_any.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("1234", array_vals.value(0));
assert_eq!(" 123", array_vals.value(1));
assert_eq!("123 ", array_vals.value(2));
assert_eq!("123", array_vals.value(1));
assert_eq!("123", array_vals.value(2));
}

/// Fill a record batch of Strings from a nvarchar source column