# Fix generate_nested_dictionary_case integration test failure #1636
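This PR switches the IPC readers from tracking dictionaries in a `Vec<Option<ArrayRef>>` indexed by flattened field position to a `HashMap<i64, ArrayRef>` keyed by dictionary id, and threads the dictionary map through the JSON integration reader, so that nested dictionary fields can resolve their values.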
```diff
@@ -54,14 +54,15 @@ fn read_buffer(buf: &ipc::Buffer, a_data: &[u8]) -> Buffer {
 /// - cast the 64-bit array to the appropriate data type
 fn create_array(
     nodes: &[ipc::FieldNode],
-    data_type: &DataType,
+    field: &Field,
     data: &[u8],
     buffers: &[ipc::Buffer],
-    dictionaries: &[Option<ArrayRef>],
+    dictionaries: &HashMap<i64, ArrayRef>,
     mut node_index: usize,
     mut buffer_index: usize,
 ) -> Result<(ArrayRef, usize, usize)> {
     use DataType::*;
+    let data_type = field.data_type();
     let array = match data_type {
         Utf8 | Binary | LargeBinary | LargeUtf8 => {
             let array = create_primitive_array(
```

> **Reviewer** (on `dictionaries: &HashMap<i64, ArrayRef>`, with a suggested change): Maybe this would be more consistent with the names used in the rest of this PR.
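For intuition, here is a minimal, self-contained sketch (stub types, hypothetical ids; not arrow's real API) of why the key changes from a positional index to a dictionary id: a child field reached by recursion has no stable position in the top-level field list, but it always carries its `dict_id`.

```rust
use std::collections::HashMap;
use std::sync::Arc;

// Stub stand-ins for arrow's ArrayRef and Field, just for illustration.
type ArrayRef = Arc<Vec<i32>>;
struct Field {
    dict_id: Option<i64>,
}

// Old scheme: lookup by flattened field position, which is ambiguous once
// create_array recurses into list/struct children.
fn lookup_by_position(dicts: &[Option<ArrayRef>], index: usize) -> Option<ArrayRef> {
    dicts.get(index).cloned().flatten()
}

// New scheme: lookup by the dictionary id carried on the field itself,
// valid at any nesting depth.
fn lookup_by_id(dicts: &HashMap<i64, ArrayRef>, field: &Field) -> Option<ArrayRef> {
    field.dict_id.and_then(|id| dicts.get(&id).cloned())
}

fn main() {
    let values: ArrayRef = Arc::new(vec![1, 2, 3]);

    let by_position: Vec<Option<ArrayRef>> = vec![Some(values.clone())];
    assert!(lookup_by_position(&by_position, 0).is_some());

    let mut by_id: HashMap<i64, ArrayRef> = HashMap::new();
    by_id.insert(42, values);
    // A deeply nested field still resolves, because it knows its dict_id.
    let nested_field = Field { dict_id: Some(42) };
    assert!(lookup_by_id(&by_id, &nested_field).is_some());
}
```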
```diff
@@ -99,7 +100,7 @@ fn create_array(
             buffer_index += 2;
             let triple = create_array(
                 nodes,
-                list_field.data_type(),
+                list_field,
                 data,
                 buffers,
                 dictionaries,
```
```diff
@@ -121,7 +122,7 @@ fn create_array(
             buffer_index += 1;
             let triple = create_array(
                 nodes,
-                list_field.data_type(),
+                list_field,
                 data,
                 buffers,
                 dictionaries,
```
```diff
@@ -146,7 +147,7 @@ fn create_array(
             for struct_field in struct_fields {
                 let triple = create_array(
                     nodes,
-                    struct_field.data_type(),
+                    struct_field,
                     data,
                     buffers,
                     dictionaries,
```
```diff
@@ -173,7 +174,9 @@ fn create_array(
                 .iter()
                 .map(|buf| read_buffer(buf, data))
                 .collect();
-            let value_array = dictionaries[node_index].clone().unwrap();
+            let value_array =
+                dictionaries.get(&field.dict_id().unwrap()).unwrap().clone();
             node_index += 1;
             buffer_index += 2;

```

> **Reviewer:** We could maybe return an error here?
>
> **Author:** Yea, updated.
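As a follow-up to the review thread above, here is a sketch of how the `.unwrap()` could become a proper error (stub types so it compiles standalone; the error message is illustrative, not the PR's final wording):

```rust
use std::collections::HashMap;
use std::sync::Arc;

// Stubs; in arrow these are ArrayRef, Field, ArrowError and the crate's
// Result alias.
type ArrayRef = Arc<Vec<i32>>;

#[derive(Debug)]
enum ArrowError {
    InvalidArgumentError(String),
}

struct Field {
    name: String,
    dict_id: Option<i64>,
}

// Resolve a field's dictionary values, surfacing a descriptive error instead
// of panicking when the id is absent from the map.
fn dictionary_values(
    field: &Field,
    dictionaries: &HashMap<i64, ArrayRef>,
) -> Result<ArrayRef, ArrowError> {
    field
        .dict_id
        .and_then(|id| dictionaries.get(&id).cloned())
        .ok_or_else(|| {
            ArrowError::InvalidArgumentError(format!(
                "cannot find a dictionary for field {}",
                field.name
            ))
        })
}

fn main() {
    let field = Field { name: "tags".to_string(), dict_id: Some(7) };
    // No dictionary batch with id 7 was read: we get an error, not a panic.
    let err = dictionary_values(&field, &HashMap::new()).unwrap_err();
    println!("{err:?}");
}
```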
```diff
@@ -209,7 +212,7 @@ fn create_array(
             for field in fields {
                 let triple = create_array(
                     nodes,
-                    field.data_type(),
+                    field,
                     data,
                     buffers,
                     dictionaries,
```
```diff
@@ -457,7 +460,7 @@ pub fn read_record_batch(
     buf: &[u8],
     batch: ipc::RecordBatch,
     schema: SchemaRef,
-    dictionaries: &[Option<ArrayRef>],
+    dictionaries: &HashMap<i64, ArrayRef>,
     projection: Option<&[usize]>,
 ) -> Result<RecordBatch> {
     let buffers = batch.buffers().ok_or_else(|| {
```

> **Reviewer:** (suggested change on this line)
```diff
@@ -477,7 +480,7 @@ pub fn read_record_batch(
         let field = &fields[index];
         let triple = create_array(
             field_nodes,
-            field.data_type(),
+            field,
             buf,
             buffers,
             dictionaries,
```
```diff
@@ -495,7 +498,7 @@ pub fn read_record_batch(
         for field in schema.fields() {
             let triple = create_array(
                 field_nodes,
-                field.data_type(),
+                field,
                 buf,
                 buffers,
                 dictionaries,
```
```diff
@@ -516,7 +519,7 @@ pub fn read_dictionary(
     buf: &[u8],
     batch: ipc::DictionaryBatch,
     schema: &Schema,
-    dictionaries_by_field: &mut [Option<ArrayRef>],
+    dictionaries_by_field: &mut HashMap<i64, ArrayRef>,
 ) -> Result<()> {
     if batch.isDelta() {
         return Err(ArrowError::IoError(
```
```diff
@@ -556,16 +559,10 @@ pub fn read_dictionary(
         ArrowError::InvalidArgumentError("dictionary id not found in schema".to_string())
     })?;

-    // for all fields with this dictionary id, update the dictionaries vector
-    // in the reader. Note that a dictionary batch may be shared between many fields.
-    // We don't currently record the isOrdered field. This could be general
-    // attributes of arrays.
-    for (i, field) in schema.all_fields().iter().enumerate() {
-        if field.dict_id() == Some(id) {
-            // Add (possibly multiple) array refs to the dictionaries array.
-            dictionaries_by_field[i] = Some(dictionary_values.clone());
-        }
-    }
+    // We don't currently record the isOrdered field. This could be general
+    // attributes of arrays.
+    // Add (possibly multiple) array refs to the dictionaries array.
+    dictionaries_by_field.insert(id, dictionary_values.clone());

     Ok(())
 }
```

> **Author:** Main difference here. In […]
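A toy illustration of that difference (stub types, made-up id): the old loop wrote the same values into one slot per matching field, while the map needs a single insert that any number of fields sharing the id can resolve.

```rust
use std::collections::HashMap;
use std::sync::Arc;

type ArrayRef = Arc<Vec<&'static str>>;

fn main() {
    let dictionary_values: ArrayRef = Arc::new(vec!["a", "b", "c"]);

    // One insert per dictionary batch, keyed by its id.
    let mut dictionaries_by_field: HashMap<i64, ArrayRef> = HashMap::new();
    dictionaries_by_field.insert(1, dictionary_values.clone());

    // Any field declaring dict_id == 1, at any nesting depth, sees the same
    // underlying values array.
    for field_dict_id in [1_i64, 1, 1] {
        let resolved = dictionaries_by_field.get(&field_dict_id).unwrap();
        assert!(Arc::ptr_eq(resolved, &dictionary_values));
    }
    println!("all fields share one dictionary entry");
}
```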
```diff
@@ -592,7 +589,7 @@ pub struct FileReader<R: Read + Seek> {
     /// Optional dictionaries for each schema field.
     ///
     /// Dictionaries may be appended to in the streaming format.
-    dictionaries_by_field: Vec<Option<ArrayRef>>,
+    dictionaries_by_field: HashMap<i64, ArrayRef>,

     /// Metadata version
     metadata_version: ipc::MetadataVersion,
```
```diff
@@ -650,7 +647,7 @@ impl<R: Read + Seek> FileReader<R> {
         let schema = ipc::convert::fb_to_schema(ipc_schema);

         // Create an array of optional dictionary value arrays, one per field.
-        let mut dictionaries_by_field = vec![None; schema.all_fields().len()];
+        let mut dictionaries_by_field = HashMap::new();
         if let Some(dictionaries) = footer.dictionaries() {
             for block in dictionaries {
                 // read length from end of offset
```
```diff
@@ -840,7 +837,7 @@ pub struct StreamReader<R: Read> {
     /// Optional dictionaries for each schema field.
     ///
     /// Dictionaries may be appended to in the streaming format.
-    dictionaries_by_field: Vec<Option<ArrayRef>>,
+    dictionaries_by_field: HashMap<i64, ArrayRef>,

     /// An indicator of whether the stream is complete.
     ///
```
```diff
@@ -884,7 +881,7 @@ impl<R: Read> StreamReader<R> {
         let schema = ipc::convert::fb_to_schema(ipc_schema);

         // Create an array of optional dictionary value arrays, one per field.
-        let dictionaries_by_field = vec![None; schema.all_fields().len()];
+        let dictionaries_by_field = HashMap::new();

         let projection = match projection {
             Some(projection_indices) => {
```
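Putting the reader-side pieces together, a rough sketch of the flow with stand-in types (none of these are arrow's real signatures): `read_dictionary` populates the shared map up front, and `read_record_batch` later resolves columns by id.

```rust
use std::collections::HashMap;
use std::sync::Arc;

// Stand-ins for the real ipc structures, trimmed to the dictionary flow.
type ArrayRef = Arc<Vec<i32>>;
struct DictionaryBlock { id: i64, values: Vec<i32> }
struct RecordBlock { dict_ids: Vec<i64> }

// Populate the shared map, keyed by dictionary id.
fn read_dictionary(block: &DictionaryBlock, dicts: &mut HashMap<i64, ArrayRef>) {
    dicts.insert(block.id, Arc::new(block.values.clone()));
}

// Resolve every dictionary-encoded column of a batch by id.
fn read_record_batch(
    block: &RecordBlock,
    dicts: &HashMap<i64, ArrayRef>,
) -> Result<Vec<ArrayRef>, String> {
    block
        .dict_ids
        .iter()
        .map(|id| {
            dicts
                .get(id)
                .cloned()
                .ok_or_else(|| format!("missing dictionary {id}"))
        })
        .collect()
}

fn main() -> Result<(), String> {
    let mut dictionaries_by_field: HashMap<i64, ArrayRef> = HashMap::new();

    // The file reader loads the dictionary blocks listed in the footer...
    read_dictionary(
        &DictionaryBlock { id: 1, values: vec![10, 20] },
        &mut dictionaries_by_field,
    );
    // ...and record batches resolve their dictionaries by id afterwards.
    let columns = read_record_batch(
        &RecordBlock { dict_ids: vec![1] },
        &dictionaries_by_field,
    )?;
    println!("resolved {} dictionary column(s)", columns.len());
    Ok(())
}
```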
---

(The remaining hunks are in the JSON integration reader.)
```diff
@@ -578,7 +578,12 @@ fn array_from_json(
             .get(&dict_id);
         match dictionary {
             Some(dictionary) => dictionary_array_from_json(
-                field, json_col, key_type, value_type, dictionary,
+                field,
+                json_col,
+                key_type,
+                value_type,
+                dictionary,
+                dictionaries,
             ),
             None => Err(ArrowError::JsonError(format!(
                 "Unable to find dictionary for field {:?}",
```
```diff
@@ -640,6 +645,7 @@ fn dictionary_array_from_json(
     dict_key: &DataType,
     dict_value: &DataType,
     dictionary: &ArrowJsonDictionaryBatch,
+    dictionaries: Option<&HashMap<i64, ArrowJsonDictionaryBatch>>,
 ) -> Result<ArrayRef> {
     match dict_key {
         DataType::Int8
```

> **Author:** Passing in the map of dictionaries so nested dictionaries can be used.
>
> **Author:** This is where the following error comes out when running […]. After fixing this, there is an index error fixed by #1636 (review).
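The reason the whole map is threaded through (rather than the single matching batch) is that a dictionary's values column may itself be dictionary-encoded, so resolving one dictionary can require resolving another. A self-contained sketch of that recursion, with a stub in place of `ArrowJsonDictionaryBatch`:

```rust
use std::collections::HashMap;

// Stub for ArrowJsonDictionaryBatch: either plain values, or a reference to
// another dictionary (the nested case this PR is about).
struct JsonDictionaryBatch {
    values: Vec<String>,
    nested_dict_id: Option<i64>,
}

// Resolving a dictionary may recurse into the map for an inner dictionary,
// which is why dictionary_array_from_json now receives `dictionaries`.
fn resolve(
    id: i64,
    dictionaries: &HashMap<i64, JsonDictionaryBatch>,
) -> Result<Vec<String>, String> {
    let batch = dictionaries
        .get(&id)
        .ok_or_else(|| format!("Unable to find dictionary {id}"))?;
    match batch.nested_dict_id {
        Some(inner) => resolve(inner, dictionaries),
        None => Ok(batch.values.clone()),
    }
}

fn main() {
    let mut dicts = HashMap::new();
    // Outer dictionary (id 1) whose values are encoded by dictionary id 2.
    dicts.insert(1, JsonDictionaryBatch { values: vec![], nested_dict_id: Some(2) });
    dicts.insert(2, JsonDictionaryBatch {
        values: vec!["x".to_string(), "y".to_string()],
        nested_dict_id: None,
    });
    assert_eq!(resolve(1, &dicts).unwrap(), vec!["x", "y"]);
}
```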
```diff
@@ -667,9 +673,11 @@ fn dictionary_array_from_json(
             let keys = array_from_json(&key_field, json_col, None)?;
             // note: not enough info on nullability of dictionary
             let value_field = Field::new("value", dict_value.clone(), true);
-            println!("dictionary value type: {:?}", dict_value);
-            let values =
-                array_from_json(&value_field, dictionary.data.columns[0].clone(), None)?;
+            let values = array_from_json(
+                &value_field,
+                dictionary.data.columns[0].clone(),
+                dictionaries,
+            )?;

             // convert key and value to dictionary data
             let dict_data = ArrayData::builder(field.data_type().clone())
```
> **Reviewer:** I think this should be renamed to `dictionaries_by_id`, here and in all the other places?
>
> **Author:** Sounds good. I will update them.