Skip to content

Commit

Permalink
vlen change encoding names
Browse files Browse the repository at this point in the history
  • Loading branch information
LDeakin committed Jul 18, 2024
1 parent 280d0e8 commit db563a2
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 37 deletions.
4 changes: 2 additions & 2 deletions src/array/codec/array_to_bytes/vlen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ fn get_arrow32_bytes_and_offsets<'a>(
for (curr, next) in offsets.iter().tuple_windows() {
if next < curr || *next > u32::try_from(bytes.len()).unwrap() {
return Err(CodecError::Other(
"Invalid bytes offsets in vlen Arrow64 encoded chunk".to_string(),
"Invalid bytes offsets in vlen Offset64 encoded chunk".to_string(),
));

Check warning on line 87 in src/array/codec/array_to_bytes/vlen.rs

View check run for this annotation

Codecov / codecov/patch

src/array/codec/array_to_bytes/vlen.rs#L85-L87

Added lines #L85 - L87 were not covered by tests
}
}
Expand Down Expand Up @@ -124,7 +124,7 @@ fn get_arrow64_bytes_and_offsets<'a>(
for (curr, next) in offsets.iter().tuple_windows() {
if next < curr || *next > bytes.len() as u64 {
return Err(CodecError::Other(
"Invalid bytes offsets in vlen Arrow64 encoded chunk".to_string(),
"Invalid bytes offsets in vlen Offset64 encoded chunk".to_string(),
));

Check warning on line 128 in src/array/codec/array_to_bytes/vlen.rs

View check run for this annotation

Codecov / codecov/patch

src/array/codec/array_to_bytes/vlen.rs#L126-L128

Added lines #L126 - L128 were not covered by tests
}
}
Expand Down
14 changes: 7 additions & 7 deletions src/array/codec/array_to_bytes/vlen/vlen_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pub struct VlenCodec {

impl Default for VlenCodec {
fn default() -> Self {
Self::new(Encoding::Arrow64)
Self::new(Encoding::Offset64)
}
}

Expand Down Expand Up @@ -94,7 +94,7 @@ impl ArrayToBytesCodecTraits for VlenCodec {
debug_assert_eq!(1 + num_elements, offsets.len() as u64);

let data: Vec<u8> = match self.encoding {
Encoding::Arrow32 => {
Encoding::Offset32 => {
let bytes_start = offsets.len() * size_of::<u32>();
let mut data: Vec<u8> = Vec::with_capacity(bytes_start + bytes.len());
let num_elements = u32::try_from(num_elements).map_err(|_| {
Expand All @@ -117,7 +117,7 @@ impl ArrayToBytesCodecTraits for VlenCodec {
data.extend_from_slice(&bytes);
data
}
Encoding::Arrow64 => {
Encoding::Offset64 => {
let bytes_start = offsets.len() * size_of::<u64>();
let mut data: Vec<u8> = Vec::with_capacity(bytes_start + bytes.len());
data.extend_from_slice(num_elements.to_le_bytes().as_slice());
Expand All @@ -137,7 +137,7 @@ impl ArrayToBytesCodecTraits for VlenCodec {
data.extend_from_slice(&bytes);
data
}
Encoding::Interleaved32 => {
Encoding::Length32 => {
let mut data: Vec<u8> =
Vec::with_capacity(offsets.len() * size_of::<u32>() + bytes.len());
// Number of elements
Expand Down Expand Up @@ -167,17 +167,17 @@ impl ArrayToBytesCodecTraits for VlenCodec {
) -> Result<ArrayBytes<'a>, CodecError> {
let num_elements = decoded_representation.num_elements_usize();
match self.encoding {
Encoding::Arrow32 => {
Encoding::Offset32 => {
let (bytes, offsets) = super::get_arrow32_bytes_and_offsets(num_elements, &bytes)?;
let offsets = super::offsets_u32_to_usize(offsets);
Ok(ArrayBytes::new_vlen(bytes.to_vec(), offsets))
}
Encoding::Arrow64 => {
Encoding::Offset64 => {
let (bytes, offsets) = super::get_arrow64_bytes_and_offsets(num_elements, &bytes)?;
let offsets = super::offsets_u64_to_usize(offsets);
Ok(ArrayBytes::new_vlen(bytes.to_vec(), offsets))
}
Encoding::Interleaved32 => {
Encoding::Length32 => {
let (bytes, offsets) =
super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?;
Ok(ArrayBytes::new_vlen(bytes, offsets))
Expand Down
6 changes: 3 additions & 3 deletions src/array/codec/array_to_bytes/vlen/vlen_partial_decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,17 @@ fn decode_vlen_bytes<'a>(
if let Some(bytes) = bytes {
let num_elements = usize::try_from(shape.iter().product::<u64>()).unwrap();
match encoding {
Encoding::Arrow32 => {
Encoding::Offset32 => {
let (bytes, offsets) = super::get_arrow32_bytes_and_offsets(num_elements, &bytes)?;
let offsets = super::offsets_u32_to_usize(offsets);
extract_decoded_regions_arrow(decoded_regions, shape, bytes, &offsets)
}
Encoding::Arrow64 => {
Encoding::Offset64 => {
let (bytes, offsets) = super::get_arrow64_bytes_and_offsets(num_elements, &bytes)?;
let offsets = super::offsets_u64_to_usize(offsets);
extract_decoded_regions_arrow(decoded_regions, shape, bytes, &offsets)
}
Encoding::Interleaved32 => {
Encoding::Length32 => {
let (bytes, offsets) =
super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?;
extract_decoded_regions_arrow(decoded_regions, shape, &bytes, &offsets)
Expand Down
20 changes: 11 additions & 9 deletions src/metadata/v3/codec/vlen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,32 @@ use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug, Display)]
#[serde(rename_all = "lowercase")]
pub enum Encoding {
/// 32-bit offsets.
///
/// Modeled on the 32-bit Apache Arrow variable-size binary layout with the validity map elided, no padding, and unsigned offsets.
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout>.
///
/// Structure:
/// - a `u32` indicating the number of elements (length)
/// - length + 1 `u32` element byte offsets
/// - packed element bytes
Offset32,
/// 64-bit offsets.
///
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout>.
Arrow32,
/// Modeled on the 64-bit Apache Arrow variable-size binary layout with the validity map elided, no padding, and unsigned offsets.
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout>.
///
/// Structure:
/// - a `u64` indicating the number of elements (length)
/// - length + 1 `u64` element byte offsets
/// - packed element bytes
///
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout>.
Arrow64,
/// 32-bit interleaved. Matches the numcodecs `VLenUTF8` codec.
Offset64,
/// 32-bit interleaved lengths. Matches the numcodecs `VLenUTF8` codec.
///
/// Structure:
/// - a `u32` indicating the number of elements (length).
/// - interleaved `u32` element length (in bytes) and element bytes.
Interleaved32,
Length32,
}

/// The identifier for the `vlen` codec.
Expand Down Expand Up @@ -65,12 +67,12 @@ mod tests {

// #[test]
// fn codec_vlen_arrow32() {
// serde_json::from_str::<VlenCodecConfiguration>(r#"{"encoding":"arrow32"}"#).unwrap();
// serde_json::from_str::<VlenCodecConfiguration>(r#"{"encoding":"offset32"}"#).unwrap();
// }

#[test]
fn codec_vlen_arrow64() {
serde_json::from_str::<VlenCodecConfiguration>(r#"{"encoding":"arrow64"}"#).unwrap();
serde_json::from_str::<VlenCodecConfiguration>(r#"{"encoding":"offset64"}"#).unwrap();
}

// #[test]
Expand Down
32 changes: 16 additions & 16 deletions tests/cities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,25 +83,25 @@ fn cities() -> Result<(), Box<dyn Error>> {
assert_eq!(cities[47862], "Sariwŏn-si");
assert_eq!(cities[47867], "Charlotte Amalie");

print!("| encoding | None | Zstd 2 | Zstd 5 |\n");
print!("| --------------------------- | ------ | ------ | ------ |\n");
print!("| Interleaved32 | {} ", cities_impl(&cities, Encoding::Interleaved32, None, 1000, None)?);
print!("| {} ", cities_impl(&cities, Encoding::Interleaved32, Some(2), 1000, None)?);
print!("| {} |\n", cities_impl(&cities, Encoding::Interleaved32, Some(5), 1000, None)?);
print!("| Arrow32 | {} ", cities_impl(&cities, Encoding::Arrow32, None, 1000, None)?);
print!("| {} ", cities_impl(&cities, Encoding::Arrow32, Some(2), 1000, None)?);
print!("| {} |\n", cities_impl(&cities, Encoding::Arrow32, Some(5), 1000, None)?);
print!("| Arrow64 | {} ", cities_impl(&cities, Encoding::Arrow64, None, 1000, None)?);
print!("| {} ", cities_impl(&cities, Encoding::Arrow64, Some(2), 1000, None)?);
print!("| {} |\n", cities_impl(&cities, Encoding::Arrow64, Some(5), 1000, None)?);
print!("| encoding | None | Zstd 2 | Zstd 5 |\n");
print!("| -------- | ------ | ------ | ------ |\n");
print!("| length32 | {} ", cities_impl(&cities, Encoding::Length32, None, 1000, None)?);
print!("| {} ", cities_impl(&cities, Encoding::Length32, Some(2), 1000, None)?);
print!("| {} |\n", cities_impl(&cities, Encoding::Length32, Some(5), 1000, None)?);
print!("| offset32 | {} ", cities_impl(&cities, Encoding::Offset32, None, 1000, None)?);
print!("| {} ", cities_impl(&cities, Encoding::Offset32, Some(2), 1000, None)?);
print!("| {} |\n", cities_impl(&cities, Encoding::Offset32, Some(5), 1000, None)?);
print!("| offset64 | {} ", cities_impl(&cities, Encoding::Offset64, None, 1000, None)?);
print!("| {} ", cities_impl(&cities, Encoding::Offset64, Some(2), 1000, None)?);
print!("| {} |\n", cities_impl(&cities, Encoding::Offset64, Some(5), 1000, None)?);
println!();
// panic!();

// | encoding | None | Zstd 2 | Zstd 5 |
// | --------------------------- | ------ | ------ | ------ |
// | Interleaved32 | 642196 | 378905 | 362626 |
// | Arrow32 | 642388 | 466353 | 464095 |
// | Arrow64 | 834772 | 370769 | 373969 |
// | encoding | None | Zstd 2 | Zstd 5 |
// | -------- | ------ | ------ | ------ |
// | length32 | 642196 | 378905 | 362626 |
// | offset32 | 642388 | 466353 | 464095 |
// | offset64 | 834772 | 370769 | 373969 |

Ok(())
}

0 comments on commit db563a2

Please sign in to comment.