From fc6936aff50b0b1c8bea984bba2d57dac68a98b3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 16 Dec 2024 19:29:17 +0100 Subject: [PATCH] Add deprecation warnings for everything related to `dict_id` (#6873) * Add deprecation warnings for everything related to `dict_id` * Add allow deprecations where necessary --- arrow-flight/src/encode.rs | 9 ++++ arrow-flight/src/lib.rs | 1 + arrow-flight/src/utils.rs | 1 + arrow-integration-test/src/field.rs | 27 +++++++----- arrow-integration-test/src/lib.rs | 3 ++ arrow-integration-test/src/schema.rs | 1 + .../integration_test.rs | 1 + .../integration_test.rs | 1 + arrow-ipc/src/convert.rs | 5 +++ arrow-ipc/src/reader.rs | 14 ++++++ arrow-ipc/src/reader/stream.rs | 2 + arrow-ipc/src/writer.rs | 44 +++++++++++++++++++ arrow-schema/src/ffi.rs | 1 + arrow-schema/src/field.rs | 36 ++++++++++++++- arrow-schema/src/schema.rs | 19 ++++++-- parquet/src/arrow/arrow_writer/mod.rs | 3 ++ parquet/src/arrow/schema/complex.rs | 5 ++- parquet/src/arrow/schema/mod.rs | 2 + 18 files changed, 158 insertions(+), 17 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 3ca192d0bb93..315b7b3cb6e5 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -535,8 +535,10 @@ fn prepare_field_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -583,7 +585,9 @@ fn prepare_schema_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -650,10 +654,12 @@ struct FlightIpcEncoder { impl FlightIpcEncoder { fn new(options: IpcWriteOptions, error_on_replacement: bool) -> Self { + #[allow(deprecated)] let preserve_dict_id = options.preserve_dict_id(); Self { options, data_gen: IpcDataGenerator::default(), + #[allow(deprecated)] dictionary_tracker: DictionaryTracker::new_with_preserve_dict_id( error_on_replacement, preserve_dict_id, @@ -1541,6 +1547,7 @@ mod tests { async fn verify_flight_round_trip(mut batches: Vec) { let expected_schema = batches.first().unwrap().schema(); + #[allow(deprecated)] let encoder = FlightDataEncoderBuilder::default() .with_options(IpcWriteOptions::default().with_preserve_dict_id(false)) .with_dictionary_handling(DictionaryHandling::Resend) @@ -1568,6 +1575,7 @@ mod tests { HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), ); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let got = prepare_schema_for_flight(&schema, &mut dictionary_tracker, false); @@ -1598,6 +1606,7 @@ mod tests { options: &IpcWriteOptions, ) -> (Vec, FlightData) { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 81fa99faeb93..1dd2700794f3 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -143,6 +143,7 @@ pub struct IpcMessage(pub Bytes); fn flight_schema_as_encoded_data(arrow_schema: &Schema, options: &IpcWriteOptions) -> EncodedData { let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); data_gen.schema_to_bytes_with_dictionary_tracker(arrow_schema, &mut dict_tracker, options) diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 3242f511ae61..428dde73ca6c 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -90,6 +90,7 @@ pub fn batches_to_flight_data( let mut flight_data = vec![]; let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 32edc4165938..4b896ed391be 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -252,6 +252,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { _ => data_type, }; + #[allow(deprecated)] let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) @@ -274,17 +275,21 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { }; match field.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": field.name(), - "nullable": field.is_nullable(), - "type": data_type_to_json(value_type), - "children": children, - "dictionary": { - "id": field.dict_id().unwrap(), - "indexType": data_type_to_json(index_type), - "isOrdered": field.dict_is_ordered().unwrap(), - } - }), + DataType::Dictionary(ref index_type, ref value_type) => { + #[allow(deprecated)] + let dict_id = field.dict_id().unwrap(); + serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": dict_id, + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }) + } _ => serde_json::json!({ "name": field.name(), "nullable": field.is_nullable(), diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index a25f07ded1d9..f025009c22de 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -787,6 +787,7 @@ pub fn array_from_json( Ok(Arc::new(array)) } DataType::Dictionary(key_type, value_type) => { + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) })?; @@ -930,10 +931,12 @@ pub fn dictionary_array_from_json( let null_buf = create_null_buf(&json_col); // build the key data into a buffer, then construct values separately + #[allow(deprecated)] let key_field = Field::new_dict( "key", dict_key.clone(), field.is_nullable(), + #[allow(deprecated)] field .dict_id() .expect("Dictionary fields must have a dict_id value"), diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index fb91aba00df3..512f0aed8e54 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -189,6 +189,7 @@ mod tests { Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c33", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 62cdca975b0e..406419028d00 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -72,6 +72,7 @@ async fn upload_data( let (mut upload_tx, upload_rx) = mpsc::channel(10); let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 277217500813..92989a20393e 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -119,6 +119,7 @@ impl FlightService for FlightServiceImpl { .ok_or_else(|| Status::not_found(format!("Could not find flight. {key}")))?; let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 0aa07a6a47d2..37c5a19439c1 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -165,6 +165,7 @@ pub fn schema_to_fb_offset<'a>( impl From> for Field { fn from(field: crate::Field) -> Field { let arrow_field = if let Some(dictionary) = field.dictionary() { + #[allow(deprecated)] Field::new_dict( field.name().unwrap(), get_data_type(field, true), @@ -519,6 +520,7 @@ pub(crate) fn build_field<'a>( match dictionary_tracker { Some(tracker) => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] tracker.set_dict_id(field), field .dict_is_ordered() @@ -527,6 +529,7 @@ pub(crate) fn build_field<'a>( )), None => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] field .dict_id() .expect("Dictionary type must have a dictionary id"), @@ -1143,6 +1146,7 @@ mod tests { ), true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -1150,6 +1154,7 @@ mod tests { 123, true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 481fefbd5fbc..9ff4da30ed8c 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -196,6 +196,7 @@ fn create_array( let index_node = reader.next_node(field)?; let index_buffers = [reader.next_buffer()?, reader.next_buffer()?]; + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::ParseError(format!("Field {field} does not have dict id")) })?; @@ -617,6 +618,7 @@ fn read_dictionary_impl( } let id = batch.id(); + #[allow(deprecated)] let fields_using_this_dictionary = schema.fields_with_dict_id(id); let first_field = fields_using_this_dictionary.first().ok_or_else(|| { ArrowError::InvalidArgumentError(format!("dictionary id {id} not found in schema")) @@ -1725,6 +1727,7 @@ mod tests { let mut writer = crate::writer::FileWriter::try_new_with_options( &mut buf, batch.schema_ref(), + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .unwrap(); @@ -1869,6 +1872,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, values); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1876,6 +1880,7 @@ mod tests { 1, false, )); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1956,6 +1961,7 @@ mod tests { #[test] fn test_roundtrip_stream_dict_of_list_of_dict() { // list + #[allow(deprecated)] let list_data_type = DataType::List(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1967,6 +1973,7 @@ mod tests { test_roundtrip_stream_dict_of_list_of_dict_impl::(list_data_type, offsets); // large list + #[allow(deprecated)] let list_data_type = DataType::LargeList(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1985,6 +1992,7 @@ mod tests { let dict_array = DictionaryArray::new(keys, Arc::new(values)); let dict_data = dict_array.into_data(); + #[allow(deprecated)] let list_data_type = DataType::FixedSizeList( Arc::new(Field::new_dict( "item", @@ -2075,6 +2083,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone()); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)), @@ -2085,6 +2094,7 @@ mod tests { let value_dict_keys = Int8Array::from_iter_values([0, 3, 0, 1, 2, 0, 1]); let value_dict_array = DictionaryArray::new(value_dict_keys, bin_view_array); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::BinaryView)), @@ -2150,6 +2160,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2187,6 +2198,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2326,6 +2338,7 @@ mod tests { ["a", "b"] .iter() .map(|name| { + #[allow(deprecated)] Field::new_dict( name.to_string(), DataType::Dictionary( @@ -2360,6 +2373,7 @@ mod tests { let mut writer = crate::writer::StreamWriter::try_new_with_options( &mut buf, batch.schema().as_ref(), + #[allow(deprecated)] crate::writer::IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/reader/stream.rs b/arrow-ipc/src/reader/stream.rs index de5f5bdd629f..9b0eea9b6198 100644 --- a/arrow-ipc/src/reader/stream.rs +++ b/arrow-ipc/src/reader/stream.rs @@ -324,6 +324,7 @@ mod tests { "test1", DataType::RunEndEncoded( Arc::new(Field::new("run_ends".to_string(), DataType::Int32, false)), + #[allow(deprecated)] Arc::new(Field::new_dict( "values".to_string(), DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -353,6 +354,7 @@ mod tests { let mut writer = StreamWriter::try_new_with_options( &mut buffer, &schema, + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 32dab762447d..ee5b9a54cc90 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -65,6 +65,10 @@ pub struct IpcWriteOptions { /// schema or generate unique dictionary IDs internally during encoding. /// /// Defaults to `false` + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] preserve_dict_id: bool, } @@ -108,6 +112,7 @@ impl IpcWriteOptions { | crate::MetadataVersion::V3 => Err(ArrowError::InvalidArgumentError( "Writing IPC metadata version 3 and lower not supported".to_string(), )), + #[allow(deprecated)] crate::MetadataVersion::V4 => Ok(Self { alignment, write_legacy_ipc_format, @@ -121,6 +126,7 @@ impl IpcWriteOptions { "Legacy IPC format only supported on metadata version 4".to_string(), )) } else { + #[allow(deprecated)] Ok(Self { alignment, write_legacy_ipc_format, @@ -138,7 +144,12 @@ impl IpcWriteOptions { /// Return whether the writer is configured to preserve the dictionary IDs /// defined in the schema + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn preserve_dict_id(&self) -> bool { + #[allow(deprecated)] self.preserve_dict_id } @@ -149,6 +160,11 @@ impl IpcWriteOptions { /// to the dictionary batches in order to encode them correctly /// /// The default will change to `false` in future releases + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] + #[allow(deprecated)] pub fn with_preserve_dict_id(mut self, preserve_dict_id: bool) -> Self { self.preserve_dict_id = preserve_dict_id; self @@ -157,6 +173,7 @@ impl IpcWriteOptions { impl Default for IpcWriteOptions { fn default() -> Self { + #[allow(deprecated)] Self { alignment: 64, write_legacy_ipc_format: false, @@ -420,6 +437,7 @@ impl IpcDataGenerator { // It's importnat to only take the dict_id at this point, because the dict ID // sequence is assigned depth-first, so we need to first encode children and have // them take their assigned dict IDs before we take the dict ID for this field. + #[allow(deprecated)] let dict_id = dict_id_seq .next() .or_else(|| field.dict_id()) @@ -767,6 +785,10 @@ pub struct DictionaryTracker { written: HashMap, dict_ids: Vec, error_on_replacement: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] preserve_dict_id: bool, } @@ -782,6 +804,7 @@ impl DictionaryTracker { /// the last seen dictionary ID (or using `0` if no other dictionary IDs have been /// seen) pub fn new(error_on_replacement: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -795,7 +818,12 @@ impl DictionaryTracker { /// If `error_on_replacement` /// is true, an error will be generated if an update to an /// existing dictionary is attempted. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn new_with_preserve_dict_id(error_on_replacement: bool, preserve_dict_id: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -811,8 +839,14 @@ impl DictionaryTracker { /// /// If `preserve_dict_id` is false, this will return the value of the last `dict_id` assigned incremented by 1 /// or 0 in the case where no dictionary IDs have yet been assigned + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn set_dict_id(&mut self, field: &Field) -> i64 { + #[allow(deprecated)] let next = if self.preserve_dict_id { + #[allow(deprecated)] field.dict_id().expect("no dict_id in field") } else { self.dict_ids @@ -936,7 +970,9 @@ impl FileWriter { writer.write_all(&super::ARROW_MAGIC)?; writer.write_all(&PADDING[..pad_len])?; // write the schema, set the written bytes to the schema + header + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let encoded_message = data_gen.schema_to_bytes_with_dictionary_tracker( @@ -1013,7 +1049,9 @@ impl FileWriter { let mut fbb = FlatBufferBuilder::new(); let dictionaries = fbb.create_vector(&self.dictionary_blocks); let record_batches = fbb.create_vector(&self.record_blocks); + #[allow(deprecated)] let preserve_dict_id = self.write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let schema = IpcSchemaEncoder::new() @@ -1144,7 +1182,9 @@ impl StreamWriter { write_options: IpcWriteOptions, ) -> Result { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, preserve_dict_id); @@ -2032,6 +2072,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Field::new_dict("dict", array.data_type().clone(), false, 2, false); let union_fields = [(0, Arc::new(dctfield))].into_iter().collect(); @@ -2049,6 +2090,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); @@ -2065,6 +2107,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Arc::new(Field::new_dict( "dict", array.data_type().clone(), @@ -2085,6 +2128,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![struct_array]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 08b1e6d0b186..96c80974982c 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -923,6 +923,7 @@ mod tests { #[test] fn test_dictionary_ordered() { + #[allow(deprecated)] let schema = Schema::new(vec![Field::new_dict( "dict", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 131e23d68b9c..7d47c0ae1dea 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -38,6 +38,10 @@ pub struct Field { name: String, data_type: DataType, nullable: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. @@ -122,6 +126,7 @@ impl Field { /// Creates a new field with the given name, type, and nullability pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -151,6 +156,10 @@ impl Field { } /// Creates a new field that has additional dictionary information + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter." + )] pub fn new_dict( name: impl Into, data_type: DataType, @@ -158,6 +167,7 @@ impl Field { dict_id: i64, dict_is_ordered: bool, ) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -386,19 +396,30 @@ impl Field { /// Returns a vector containing all (potentially nested) `Field` instances selected by the /// dictionary ID they use #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> { self.fields() .into_iter() .filter(|&field| { - matches!(field.data_type(), DataType::Dictionary(_, _)) && field.dict_id == id + #[allow(deprecated)] + let matching_dict_id = field.dict_id == id; + matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id }) .collect() } /// Returns the dictionary ID, if this is a dictionary type. #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub const fn dict_id(&self) -> Option { match self.data_type { + #[allow(deprecated)] DataType::Dictionary(_, _) => Some(self.dict_id), _ => None, } @@ -428,6 +449,7 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { + #[allow(deprecated)] if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError(format!( "Fail to merge schema field '{}' because from dict_id = {} does not match {}", @@ -570,9 +592,11 @@ impl Field { /// * self.metadata is a superset of other.metadata /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { + #[allow(deprecated)] + let matching_dict_id = self.dict_id == other.dict_id; self.name == other.name && self.data_type.contains(&other.data_type) - && self.dict_id == other.dict_id + && matching_dict_id && self.dict_is_ordered == other.dict_is_ordered // self need to be nullable or both of them are not nullable && (self.nullable || !other.nullable) @@ -621,6 +645,7 @@ mod test { fn test_new_dict_with_string() { // Fields should allow owned Strings to support reuse let s = "c1"; + #[allow(deprecated)] Field::new_dict(s, DataType::Int64, false, 4, false); } @@ -738,6 +763,7 @@ mod test { #[test] fn test_fields_with_dict_id() { + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -745,6 +771,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict2", DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()), @@ -781,9 +808,11 @@ mod test { false, ); + #[allow(deprecated)] for field in field.fields_with_dict_id(10) { assert_eq!(dict1, *field); } + #[allow(deprecated)] for field in field.fields_with_dict_id(20) { assert_eq!(dict2, *field); } @@ -798,6 +827,7 @@ mod test { #[test] fn test_field_comparison_case() { // dictionary-encoding properties not used for field comparison + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -805,6 +835,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -816,6 +847,7 @@ mod test { assert_eq!(dict1, dict2); assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2)); + #[allow(deprecated)] let dict1 = Field::new_dict( "dict0", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 47c22e2a9318..6c79da53f981 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -389,7 +389,12 @@ impl Schema { /// Returns a vector of immutable references to all [`Field`] instances selected by /// the dictionary ID they use. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { + #[allow(deprecated)] self.fields .iter() .flat_map(|f| f.fields_with_dict_id(dict_id)) @@ -638,7 +643,9 @@ mod tests { assert_eq!(first_name.name(), "first_name"); assert_eq!(first_name.data_type(), &DataType::Utf8); assert!(!first_name.is_nullable()); - assert_eq!(first_name.dict_id(), None); + #[allow(deprecated)] + let dict_id = first_name.dict_id(); + assert_eq!(dict_id, None); assert_eq!(first_name.dict_is_ordered(), None); let metadata = first_name.metadata(); @@ -655,7 +662,9 @@ mod tests { interests.data_type(), &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) ); - assert_eq!(interests.dict_id(), Some(123)); + #[allow(deprecated)] + let dict_id = interests.dict_id(); + assert_eq!(dict_id, Some(123)); assert_eq!(interests.dict_is_ordered(), Some(true)); } @@ -691,6 +700,7 @@ mod tests { fn schema_field_with_dict_id() { let schema = person_schema(); + #[allow(deprecated)] let fields_dict_123: Vec<_> = schema .fields_with_dict_id(123) .iter() @@ -698,7 +708,9 @@ mod tests { .collect(); assert_eq!(fields_dict_123, vec!["interests"]); - assert!(schema.fields_with_dict_id(456).is_empty()); + #[allow(deprecated)] + let is_empty = schema.fields_with_dict_id(456).is_empty(); + assert!(is_empty); } fn person_schema() -> Schema { @@ -718,6 +730,7 @@ mod tests { ])), false, ), + #[allow(deprecated)] Field::new_dict( "interests", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index bb6ebf75ec8e..59cfac047194 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2587,6 +2587,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -2608,6 +2609,7 @@ mod tests { #[test] fn arrow_writer_primitive_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), @@ -2630,6 +2632,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary_unsigned_index() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index d2f5a0bd8d63..16d46bd852dc 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -552,8 +552,11 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<& match arrow_hint { Some(hint) => { // If the inferred type is a dictionary, preserve dictionary metadata + #[allow(deprecated)] let field = match (&data_type, hint.dict_id(), hint.dict_is_ordered()) { - (DataType::Dictionary(_, _), Some(id), Some(ordered)) => { + (DataType::Dictionary(_, _), Some(id), Some(ordered)) => + { + #[allow(deprecated)] Field::new_dict(name, data_type, nullable, id, ordered) } _ => Field::new(name, data_type, nullable), diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 0fbcb4856e46..26b0d8b7b885 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -178,6 +178,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { /// Encodes the Arrow schema into the IPC format, and base64 encodes it fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(true, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); @@ -1823,6 +1824,7 @@ mod tests { // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c31", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),