Skip to content

Commit

Permalink
Apply optimized casting when CastOptions.safe is false
Browse files Browse the repository at this point in the history
  • Loading branch information
viirya committed Jan 27, 2023
1 parent 9529101 commit 23acc63
Showing 1 changed file with 33 additions and 21 deletions.
54 changes: 33 additions & 21 deletions arrow-cast/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3429,27 +3429,39 @@ where
.as_any()
.downcast_ref::<GenericByteArray<GenericBinaryType<I>>>()
.unwrap();
Ok(Arc::new(
array
.iter()
.map(|maybe_value| match maybe_value {
Some(value) => {
let result = std::str::from_utf8(value);
if cast_options.safe {
Ok(result.ok())
} else {
Some(result.map_err(|_| {
ArrowError::CastError(
"Cannot cast binary to string".to_string(),
)
}))
.transpose()
}
}
None => Ok(None),
})
.collect::<Result<GenericByteArray<GenericStringType<O>>, _>>()?,
))

if !cast_options.safe {
let offsets = array.value_offsets();
let values = array.value_data();

// Validate the whole value buffer as UTF-8 in one pass, then check that
// every offset falls on a char boundary so no value splits a code point
let validated = std::str::from_utf8(values)
.map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?;
for offset in offsets.iter() {
if !validated.is_char_boundary(offset.as_usize()) {
return Err(ArrowError::CastError("Invalid UTF-8 sequence".to_string()));
}
}

let builder = array
.into_data()
.into_builder()
.data_type(GenericStringArray::<O>::DATA_TYPE);
// SAFETY:
// Validated UTF-8 above
Ok(Arc::new(GenericStringArray::<O>::from(unsafe {
builder.build_unchecked()
})))
} else {
Ok(Arc::new(
array
.iter()
.map(|maybe_value| {
maybe_value.and_then(|value| std::str::from_utf8(value).ok())
})
.collect::<GenericByteArray<GenericStringType<O>>>(),
))
}
}

/// Helper function to cast from one `ByteArrayType` to another and vice versa.
Expand Down

0 comments on commit 23acc63

Please sign in to comment.