From 6b17775f37b939221d855514db4ffb3344deb1f4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 11 Apr 2023 14:33:19 +0100 Subject: [PATCH] Fix precision loss in Raw JSON decoder (#4049) (#4051) --- arrow-json/src/raw/mod.rs | 24 ++++++++++++++ arrow-json/src/raw/primitive_array.rs | 47 ++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index c195524766c0..38b4cce9bd9a 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -1375,4 +1375,28 @@ mod tests { Some("+00:00".into()), )); } + + #[test] + fn test_truncation() { + let buf = r#" + {"i64": 9223372036854775807, "u64": 18446744073709551615 } + {"i64": "9223372036854775807", "u64": "18446744073709551615" } + {"i64": -9223372036854775808, "u64": 0 } + {"i64": "-9223372036854775808", "u64": 0 } + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("i64", DataType::Int64, true), + Field::new("u64", DataType::UInt64, true), + ])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let i64 = batches[0].column(0).as_primitive::(); + assert_eq!(i64.values(), &[i64::MAX, i64::MAX, i64::MIN, i64::MIN]); + + let u64 = batches[0].column(1).as_primitive::(); + assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]); + } } diff --git a/arrow-json/src/raw/primitive_array.rs b/arrow-json/src/raw/primitive_array.rs index 72ce30203d01..6985821d65fe 100644 --- a/arrow-json/src/raw/primitive_array.rs +++ b/arrow-json/src/raw/primitive_array.rs @@ -27,6 +27,45 @@ use arrow_schema::{ArrowError, DataType}; use crate::raw::tape::{Tape, TapeElement}; use crate::raw::{tape_error, ArrayDecoder}; +/// A trait for JSON-specific primitive parsing logic +/// +/// According to the specification unquoted fields should be parsed as a double-precision +/// floating point numbers, including scientific representation such as `2e3` +/// +/// In practice, it is common to serialize numbers outside the range of an `f64` and expect +/// them to round-trip correctly. As such when parsing integers we first parse as the integer +/// and fallback to parsing as a floating point if this fails +trait ParseJsonNumber: Sized { + fn parse(s: &[u8]) -> Option; +} + +macro_rules! primitive_parse { + ($($t:ty),+) => { + $(impl ParseJsonNumber for $t { + fn parse(s: &[u8]) -> Option { + match lexical_core::parse::(s) { + Ok(f) => Some(f), + Err(_) => lexical_core::parse::(s).ok().and_then(NumCast::from), + } + } + })+ + }; +} + +primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64); + +impl ParseJsonNumber for f32 { + fn parse(s: &[u8]) -> Option { + lexical_core::parse::(s).ok() + } +} + +impl ParseJsonNumber for f64 { + fn parse(s: &[u8]) -> Option { + lexical_core::parse::(s).ok() + } +} + pub struct PrimitiveArrayDecoder { data_type: DataType, // Invariant and Send @@ -45,7 +84,7 @@ impl PrimitiveArrayDecoder

{ impl

ArrayDecoder for PrimitiveArrayDecoder

where P: ArrowPrimitiveType + Parser, - P::Native: NumCast, + P::Native: ParseJsonNumber, { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let mut builder = PrimitiveBuilder::

::with_capacity(pos.len()) @@ -67,10 +106,8 @@ where } TapeElement::Number(idx) => { let s = tape.get_string(idx); - let value = lexical_core::parse::(s.as_bytes()) - .ok() - .and_then(NumCast::from) - .ok_or_else(|| { + let value = + ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| { ArrowError::JsonError(format!( "failed to parse {s} as {}", self.data_type