Skip to content

Commit

Permalink
Fix precision loss in Raw JSON decoder (#4049) (#4051)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Apr 11, 2023
1 parent 768430f commit 6b17775
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 5 deletions.
24 changes: 24 additions & 0 deletions arrow-json/src/raw/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1375,4 +1375,28 @@ mod tests {
Some("+00:00".into()),
));
}

#[test]
fn test_truncation() {
let buf = r#"
{"i64": 9223372036854775807, "u64": 18446744073709551615 }
{"i64": "9223372036854775807", "u64": "18446744073709551615" }
{"i64": -9223372036854775808, "u64": 0 }
{"i64": "-9223372036854775808", "u64": 0 }
"#;

let schema = Arc::new(Schema::new(vec![
Field::new("i64", DataType::Int64, true),
Field::new("u64", DataType::UInt64, true),
]));

let batches = do_read(buf, 1024, true, schema);
assert_eq!(batches.len(), 1);

let i64 = batches[0].column(0).as_primitive::<Int64Type>();
assert_eq!(i64.values(), &[i64::MAX, i64::MAX, i64::MIN, i64::MIN]);

let u64 = batches[0].column(1).as_primitive::<UInt64Type>();
assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]);
}
}
47 changes: 42 additions & 5 deletions arrow-json/src/raw/primitive_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,45 @@ use arrow_schema::{ArrowError, DataType};
use crate::raw::tape::{Tape, TapeElement};
use crate::raw::{tape_error, ArrayDecoder};

/// A trait for JSON-specific primitive parsing logic
///
/// According to the specification unquoted fields should be parsed as a double-precision
/// floating point numbers, including scientific representation such as `2e3`
///
/// In practice, it is common to serialize numbers outside the range of an `f64` and expect
/// them to round-trip correctly. As such when parsing integers we first parse as the integer
/// and fallback to parsing as a floating point if this fails
trait ParseJsonNumber: Sized {
fn parse(s: &[u8]) -> Option<Self>;
}

macro_rules! primitive_parse {
($($t:ty),+) => {
$(impl ParseJsonNumber for $t {
fn parse(s: &[u8]) -> Option<Self> {
match lexical_core::parse::<Self>(s) {
Ok(f) => Some(f),
Err(_) => lexical_core::parse::<f64>(s).ok().and_then(NumCast::from),
}
}
})+
};
}

primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64);

impl ParseJsonNumber for f32 {
fn parse(s: &[u8]) -> Option<Self> {
lexical_core::parse::<Self>(s).ok()
}
}

impl ParseJsonNumber for f64 {
fn parse(s: &[u8]) -> Option<Self> {
lexical_core::parse::<Self>(s).ok()
}
}

pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
data_type: DataType,
// Invariant and Send
Expand All @@ -45,7 +84,7 @@ impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
impl<P> ArrayDecoder for PrimitiveArrayDecoder<P>
where
P: ArrowPrimitiveType + Parser,
P::Native: NumCast,
P::Native: ParseJsonNumber,
{
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
let mut builder = PrimitiveBuilder::<P>::with_capacity(pos.len())
Expand All @@ -67,10 +106,8 @@ where
}
TapeElement::Number(idx) => {
let s = tape.get_string(idx);
let value = lexical_core::parse::<f64>(s.as_bytes())
.ok()
.and_then(NumCast::from)
.ok_or_else(|| {
let value =
ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| {
ArrowError::JsonError(format!(
"failed to parse {s} as {}",
self.data_type
Expand Down

0 comments on commit 6b17775

Please sign in to comment.