/Users/andrewlamb/Software/datafusion/datafusion/common/src/config.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Runtime configuration, via [`ConfigOptions`]

use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::fmt::{self, Display};
use std::str::FromStr;

use crate::error::_config_err;
use crate::parsers::CompressionTypeVariant;
use crate::{DataFusionError, Result};

/// A macro that wraps a configuration struct and automatically derives
/// [`Default`] and [`ConfigField`] for it, allowing it to be used
/// in the [`ConfigOptions`] configuration tree
///
/// For example,
///
/// ```ignore
/// config_namespace! {
///     /// Amazing config
///     pub struct MyConfig {
///         /// Field 1 doc
///         field1: String, default = "".to_string()
///
///         /// Field 2 doc
///         field2: usize, default = 232
///
///         /// Field 3 doc
///         field3: Option<usize>, default = None
///     }
/// }
/// ```
///
/// Will generate
///
/// ```ignore
/// /// Amazing config
/// #[derive(Debug, Clone)]
/// #[non_exhaustive]
/// pub struct MyConfig {
///     /// Field 1 doc
///     field1: String,
///     /// Field 2 doc
///     field2: usize,
///     /// Field 3 doc
///     field3: Option<usize>,
/// }
/// impl ConfigField for MyConfig {
///     fn set(&mut self, key: &str, value: &str) -> Result<()> {
///         let (key, rem) = key.split_once('.').unwrap_or((key, ""));
///         match key {
///             "field1" => self.field1.set(rem, value),
///             "field2" => self.field2.set(rem, value),
///             "field3" => self.field3.set(rem, value),
///             _ => _internal_err!(
///                 "Config value \"{}\" not found on MyConfig",
///                 key
///             ),
///         }
///     }
///
///     fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
///         let key = format!("{}.field1", key_prefix);
///         let desc = "Field 1 doc";
///         self.field1.visit(v, key.as_str(), desc);
///         let key = format!("{}.field2", key_prefix);
///         let desc = "Field 2 doc";
///         self.field2.visit(v, key.as_str(), desc);
///         let key = format!("{}.field3", key_prefix);
///         let desc = "Field 3 doc";
///         self.field3.visit(v, key.as_str(), desc);
///     }
/// }
///
/// impl Default for MyConfig {
///     fn default() -> Self {
///         Self {
///             field1: "".to_string(),
///             field2: 232,
///             field3: None,
///         }
///     }
/// }
/// ```
///
/// NB: Misplaced commas may result in nonsensical errors
#[macro_export]
macro_rules! config_namespace {
    (
        $(#[doc = $struct_d:tt])*
        $vis:vis struct $struct_name:ident {
            $(
                $(#[doc = $d:tt])*
                $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
            )*$(,)*
        }
    ) => {

        $(#[doc = $struct_d])*
        #[derive(Debug, Clone, PartialEq)]
        $vis struct $struct_name{
            $(
                $(#[doc = $d])*
                $field_vis $field_name : $field_type,
            )*
        }

        impl ConfigField for $struct_name {
            fn set(&mut self, key: &str, value: &str) -> Result<()> {
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
                match key {
                    $(
                        stringify!($field_name) => self.$field_name.set(rem, value),
                    )*
                    _ => return _config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
                    )
                }
            }

            fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                $(
                    let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                    let desc = concat!($($d),*).trim();
                    self.$field_name.visit(v, key.as_str(), desc);
                )*
            }
        }

        impl Default for $struct_name {
            fn default() -> Self {
                Self {
                    $($field_name: $default),*
                }
            }
        }
    }
}

config_namespace! {
    /// Options related to catalog and directory scanning
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct CatalogOptions {
        /// Whether the default catalog and schema should be created automatically.
        pub create_default_catalog_and_schema: bool, default = true

        /// The default catalog name - this impacts what SQL queries use if not specified
        pub default_catalog: String, default = "datafusion".to_string()

        /// The default schema name - this impacts what SQL queries use if not specified
        pub default_schema: String, default = "public".to_string()

        /// Should DataFusion provide access to `information_schema`
        /// virtual tables for displaying schema information
        pub information_schema: bool, default = false

        /// Location scanned to load tables for `default` schema
        pub location: Option<String>, default = None

        /// Type of `TableProvider` to use when loading `default` schema
        pub format: Option<String>, default = None

        /// Default value for `format.has_header` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        pub has_header: bool, default = true

        /// Specifies whether newlines in (quoted) CSV values are supported.
        ///
        /// This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        ///
        /// Parsing newlines in quoted values may be affected by execution behaviour such as
        /// parallel file scanning. Setting this to `true` ensures that newlines in values are
        /// parsed successfully, which may reduce performance.
        pub newlines_in_values: bool, default = false
    }
}
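
// A minimal usage sketch (not part of the original source): keys generated by
// `config_namespace!` nest under the `datafusion` prefix handled by
// `ConfigOptions::set`, which is defined later in this file.
#[cfg(test)]
mod catalog_options_example {
    use super::*;

    #[test]
    fn set_catalog_option_by_key() {
        let mut options = ConfigOptions::new();
        options
            .set("datafusion.catalog.information_schema", "true")
            .unwrap();
        assert!(options.catalog.information_schema);
    }
}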

config_namespace! {
    /// Options related to SQL parser
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct SqlParserOptions {
        /// When set to true, SQL parser will parse float as decimal type
        pub parse_float_as_decimal: bool, default = false

        /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
        pub enable_ident_normalization: bool, default = true

        /// When set to true, SQL parser will normalize options value (convert value to lowercase)
        pub enable_options_value_normalization: bool, default = true

        /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
        /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
        pub dialect: String, default = "generic".to_string()

        /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
        /// ignore the length. If false, error if a `VARCHAR` with a length is
        /// specified. The Arrow type system does not have a notion of maximum
        /// string length and thus DataFusion can not enforce such limits.
        pub support_varchar_with_length: bool, default = true
    }
}
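
// Illustrative sketch (an assumption, not original code): the generated
// `ConfigField` impl also allows mutating a namespace struct directly through
// string keys, without going through `ConfigOptions`.
#[cfg(test)]
mod sql_parser_options_example {
    use super::*;

    #[test]
    fn set_dialect_directly() {
        let mut options = SqlParserOptions::default();
        options.set("dialect", "postgresql").unwrap();
        assert_eq!(options.dialect, "postgresql");
    }
}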

config_namespace! {
    /// Options related to query execution
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct ExecutionOptions {
        /// Default batch size while creating new batches. This is especially useful
        /// for buffered, in-memory batches, since creating tiny batches would result
        /// in too much metadata memory consumption
        pub batch_size: usize, default = 8192

        /// When set to true, record batches will be examined between each operator and
        /// small batches will be coalesced into larger batches. This is helpful when there
        /// are highly selective filters or joins that could produce tiny output batches. The
        /// target batch size is determined by the `batch_size` configuration setting
        pub coalesce_batches: bool, default = true

        /// Should DataFusion collect statistics after listing files
        pub collect_statistics: bool, default = false

        /// Number of partitions for query execution. Increasing partitions can increase
        /// concurrency.
        ///
        /// Defaults to the number of CPU cores on the system
        pub target_partitions: usize, default = num_cpus::get()

        /// The default time zone
        ///
        /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
        /// according to this time zone, and then extract the hour
        pub time_zone: Option<String>, default = Some("+00:00".into())

        /// Parquet options
        pub parquet: ParquetOptions, default = Default::default()

        /// Fan-out during initial physical planning.
        ///
        /// This is mostly used to plan `UNION` children in parallel.
        ///
        /// Defaults to the number of CPU cores on the system
        pub planning_concurrency: usize, default = num_cpus::get()

        /// Specifies the reserved memory for each spillable sort operation to
        /// facilitate an in-memory merge.
        ///
        /// When a sort operation spills to disk, the in-memory data must be
        /// sorted and merged before being written to a file. This setting reserves
        /// a specific amount of memory for that in-memory sort/merge process.
        ///
        /// Note: This setting is irrelevant if the sort operation cannot spill
        /// (i.e., if there's no `DiskManager` configured).
        pub sort_spill_reservation_bytes: usize, default = 10 * 1024 * 1024

        /// When sorting, below what size should data be concatenated
        /// and sorted in a single RecordBatch rather than sorted in
        /// batches and merged.
        pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024

        /// Number of files to read in parallel when inferring schema and statistics
        pub meta_fetch_concurrency: usize, default = 32

        /// Guarantees a minimum level of output files running in parallel.
        /// RecordBatches will be distributed in round robin fashion to each
        /// parallel writer. Each writer is closed and a new file opened once
        /// soft_max_rows_per_output_file is reached.
        pub minimum_parallel_output_files: usize, default = 4

        /// Target number of rows in output files when writing multiple.
        /// This is a soft max, so it can be exceeded slightly. There also
        /// will be one file smaller than the limit if the total
        /// number of rows written is not roughly divisible by the soft max
        pub soft_max_rows_per_output_file: usize, default = 50000000

        /// This is the maximum number of RecordBatches buffered
        /// for each output file being worked. Higher values can potentially
        /// give faster write performance at the cost of higher peak
        /// memory consumption
        pub max_buffered_batches_per_output_file: usize, default = 2

        /// Should sub directories be ignored when scanning directories for data
        /// files. Defaults to true (ignores subdirectories), consistent with
        /// Hive. Note that this setting does not affect reading partitioned
        /// tables (e.g. `/table/year=2021/month=01/data.parquet`).
        pub listing_table_ignore_subdirectory: bool, default = true

        /// Should DataFusion support recursive CTEs
        pub enable_recursive_ctes: bool, default = true

        /// Attempt to eliminate sorts by packing & sorting files with non-overlapping
        /// statistics into the same file groups.
        /// Currently experimental
        pub split_file_groups_by_statistics: bool, default = false

        /// Should DataFusion keep the columns used for partition_by in the output RecordBatches
        pub keep_partition_by_columns: bool, default = false

        /// Aggregation ratio (number of distinct groups / number of input rows)
        /// threshold for skipping partial aggregation. If the ratio exceeds this
        /// threshold, partial aggregation will skip aggregation for further input
        pub skip_partial_aggregation_probe_ratio_threshold: f64, default = 0.8

        /// Number of input rows partial aggregation partition should process, before
        /// aggregation ratio check and trying to switch to skipping aggregation mode
        pub skip_partial_aggregation_probe_rows_threshold: usize, default = 100_000

        /// Should DataFusion use row number estimates at the input to decide
        /// whether increasing parallelism is beneficial or not. By default,
        /// only exact row numbers (not estimates) are used for this decision.
        /// Setting this flag to `true` will likely produce better plans
        /// if the source of statistics is accurate.
        /// We plan to make this the default in the future.
        pub use_row_number_estimates_to_optimize_partitioning: bool, default = false
    }
}
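
// Hedged sketch (not original code): `Option` fields become `Some(..)` when
// set through the string API, via the `ConfigField for Option<F>` impl
// defined later in this file.
#[cfg(test)]
mod execution_options_example {
    use super::*;

    #[test]
    fn option_fields_become_some_when_set() {
        let mut options = ExecutionOptions::default();
        options.set("time_zone", "+08:00").unwrap();
        assert_eq!(options.time_zone, Some("+08:00".to_string()));
    }
}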

config_namespace! {
    /// Options for reading and writing parquet files
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct ParquetOptions {
        // The following options affect reading parquet files

        /// (reading) If true, reads the Parquet data page level metadata (the
        /// Page Index), if present, to reduce the I/O and number of
        /// rows decoded.
        pub enable_page_index: bool, default = true

        /// (reading) If true, the parquet reader attempts to skip entire row groups based
        /// on the predicate in the query and the metadata (min/max values) stored in
        /// the parquet file
        pub pruning: bool, default = true

        /// (reading) If true, the parquet reader skips the optional embedded metadata that may be in
        /// the file Schema. This setting can help avoid schema conflicts when querying
        /// multiple parquet files with schemas containing compatible types but different metadata
        pub skip_metadata: bool, default = true

        /// (reading) If specified, the parquet reader will try and fetch the last `size_hint`
        /// bytes of the parquet file optimistically. If not specified, two reads are required:
        /// one read to fetch the 8-byte parquet footer and
        /// another to fetch the metadata length encoded in the footer
        pub metadata_size_hint: Option<usize>, default = None

        /// (reading) If true, filter expressions are applied during the parquet decoding operation to
        /// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
        pub pushdown_filters: bool, default = false

        /// (reading) If true, filter expressions evaluated during the parquet decoding operation
        /// will be reordered heuristically to minimize the cost of evaluation. If false,
        /// the filters are applied in the same order as written in the query
        pub reorder_filters: bool, default = false

        /// (reading) If true, parquet reader will read columns of `Utf8/LargeUtf8` as `Utf8View`,
        /// and `Binary/LargeBinary` as `BinaryView`.
        pub schema_force_view_types: bool, default = false

        // The following options affect writing to parquet files
        // and map to parquet::file::properties::WriterProperties

        /// (writing) Sets best effort maximum size of data page in bytes
        pub data_pagesize_limit: usize, default = 1024 * 1024

        /// (writing) Sets write_batch_size in bytes
        pub write_batch_size: usize, default = 1024

        /// (writing) Sets parquet writer version.
        /// Valid values are "1.0" and "2.0"
        pub writer_version: String, default = "1.0".to_string()

        /// (writing) Sets default parquet compression codec.
        /// Valid values are: uncompressed, snappy, gzip(level),
        /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
        ///
        /// Note that this default setting is not the same as
        /// the default parquet writer setting.
        pub compression: Option<String>, default = Some("zstd(3)".into())

        /// (writing) Sets if dictionary encoding is enabled. If NULL, uses
        /// default parquet writer setting
        pub dictionary_enabled: Option<bool>, default = Some(true)

        /// (writing) Sets best effort maximum dictionary page size, in bytes
        pub dictionary_page_size_limit: usize, default = 1024 * 1024

        /// (writing) Sets if statistics are enabled for any column.
        /// Valid values are: "none", "chunk", and "page".
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
        pub statistics_enabled: Option<String>, default = Some("page".into())

        /// (writing) Sets max statistics size for any column. If NULL, uses
        /// default parquet writer setting
        pub max_statistics_size: Option<usize>, default = Some(4096)

        /// (writing) Target maximum number of rows in each row group (defaults to 1M
        /// rows). Writing larger row groups requires more memory to write, but
        /// can get better compression and be faster to read.
        pub max_row_group_size: usize, default = 1024 * 1024

        /// (writing) Sets "created by" property
        pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into()

        /// (writing) Sets column index truncate length
        pub column_index_truncate_length: Option<usize>, default = Some(64)

        /// (writing) Sets best effort maximum number of rows in data page
        pub data_page_row_count_limit: usize, default = 20_000

        /// (writing) Sets default encoding for any column.
        /// Valid values are: plain, plain_dictionary, rle,
        /// bit_packed, delta_binary_packed, delta_length_byte_array,
        /// delta_byte_array, rle_dictionary, and byte_stream_split.
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
        pub encoding: Option<String>, default = None

        /// (reading) Use any available bloom filters when reading parquet files
        pub bloom_filter_on_read: bool, default = true

        /// (writing) Write bloom filters for all columns when creating parquet files
        pub bloom_filter_on_write: bool, default = false

        /// (writing) Sets bloom filter false positive probability. If NULL, uses
        /// default parquet writer setting
        pub bloom_filter_fpp: Option<f64>, default = None

        /// (writing) Sets bloom filter number of distinct values. If NULL, uses
        /// default parquet writer setting
        pub bloom_filter_ndv: Option<u64>, default = None

        /// (writing) Controls whether DataFusion will attempt to speed up writing
        /// parquet files by serializing them in parallel. Each column
        /// in each row group in each output file is serialized in parallel,
        /// leveraging a maximum possible core count of n_files * n_row_groups * n_columns.
        pub allow_single_file_parallelism: bool, default = true

        /// (writing) By default parallel parquet writer is tuned for minimum
        /// memory usage in a streaming execution plan. You may see
        /// a performance benefit when writing large parquet files
        /// by increasing maximum_parallel_row_group_writers and
        /// maximum_buffered_record_batches_per_stream if your system
        /// has idle cores and can tolerate additional memory usage.
        /// Boosting these values is likely worthwhile when
        /// writing out already in-memory data, such as from a cached
        /// data frame.
        pub maximum_parallel_row_group_writers: usize, default = 1

        /// (writing) By default parallel parquet writer is tuned for minimum
        /// memory usage in a streaming execution plan. You may see
        /// a performance benefit when writing large parquet files
        /// by increasing maximum_parallel_row_group_writers and
        /// maximum_buffered_record_batches_per_stream if your system
        /// has idle cores and can tolerate additional memory usage.
        /// Boosting these values is likely worthwhile when
        /// writing out already in-memory data, such as from a cached
        /// data frame.
        pub maximum_buffered_record_batches_per_stream: usize, default = 2
    }
}
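
// Sketch (an assumption, not original code): `ParquetOptions` is itself a
// `ConfigField`, so its keys nest one level deeper under
// `datafusion.execution.parquet`.
#[cfg(test)]
mod parquet_options_example {
    use super::*;

    #[test]
    fn nested_parquet_keys() {
        let mut options = ConfigOptions::new();
        options
            .set("datafusion.execution.parquet.compression", "snappy")
            .unwrap();
        assert_eq!(
            options.execution.parquet.compression,
            Some("snappy".to_string())
        );
    }
}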

config_namespace! {
    /// Options related to query optimization
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct OptimizerOptions {
        /// When set to true, the optimizer will push a limit operation into
        /// grouped aggregations which have no aggregate expressions, as a soft limit,
        /// emitting groups once the limit is reached, before all rows in the group are read.
        pub enable_distinct_aggregation_soft_limit: bool, default = true

        /// When set to true, the physical plan optimizer will try to add round robin
        /// repartitioning to increase parallelism to leverage more CPU cores
        pub enable_round_robin_repartition: bool, default = true

        /// When set to true, the optimizer will attempt to perform limit operations
        /// during aggregations, if possible
        pub enable_topk_aggregation: bool, default = true

        /// When set to true, the optimizer will insert filters before a join between
        /// a nullable and non-nullable column to filter out nulls on the nullable side. This
        /// filter can add additional overhead when the file format does not fully support
        /// predicate push down.
        pub filter_null_join_keys: bool, default = false

        /// Should DataFusion repartition data using the aggregate keys to execute aggregates
        /// in parallel using the provided `target_partitions` level
        pub repartition_aggregations: bool, default = true

        /// Minimum total files size in bytes to perform file scan repartitioning.
        pub repartition_file_min_size: usize, default = 10 * 1024 * 1024

        /// Should DataFusion repartition data using the join keys to execute joins in parallel
        /// using the provided `target_partitions` level
        pub repartition_joins: bool, default = true

        /// Should DataFusion allow symmetric hash joins for unbounded data sources even when
        /// its inputs do not have any ordering or filtering. If the flag is not enabled,
        /// the SymmetricHashJoin operator will be unable to prune its internal buffers,
        /// resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right,
        /// RightAnti, and RightSemi - being produced only at the end of the execution.
        /// This is not typical in stream processing. Additionally, without proper design for
        /// long-running execution, all types of joins may encounter out-of-memory errors.
        pub allow_symmetric_joins_without_pruning: bool, default = true

        /// When set to `true`, file groups will be repartitioned to achieve maximum parallelism.
        /// Currently Parquet and CSV formats are supported.
        ///
        /// If set to `true`, all files will be repartitioned evenly (i.e., a single large file
        /// might be partitioned into smaller chunks) for parallel scanning.
        /// If set to `false`, different files will be read in parallel, but repartitioning won't
        /// happen within a single file.
        pub repartition_file_scans: bool, default = true

        /// Should DataFusion repartition data using the partitions keys to execute window
        /// functions in parallel using the provided `target_partitions` level
        pub repartition_windows: bool, default = true

        /// Should DataFusion execute sorts in a per-partition fashion and merge
        /// afterwards instead of coalescing first and sorting globally.
        /// When this flag is enabled, plans in the form below
        ///
        /// ```text
        /// "SortExec: [a@0 ASC]",
        /// "  CoalescePartitionsExec",
        /// "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
        /// ```
        /// would turn into the plan below, which performs better in multithreaded environments
        ///
        /// ```text
        /// "SortPreservingMergeExec: [a@0 ASC]",
        /// "  SortExec: [a@0 ASC]",
        /// "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
        /// ```
        pub repartition_sorts: bool, default = true

        /// When true, DataFusion will opportunistically remove sorts when the data is already sorted
        /// (i.e. setting `preserve_order` to true on `RepartitionExec` and
        /// using `SortPreservingMergeExec`)
        ///
        /// When false, DataFusion will maximize plan parallelism using
        /// `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`.
        pub prefer_existing_sort: bool, default = false

        /// When set to true, the logical plan optimizer will produce warning
        /// messages if any optimization rules produce errors and then proceed to the next
        /// rule. When set to false, any rules that produce errors will cause the query to fail
        pub skip_failed_rules: bool, default = false

        /// Number of times that the optimizer will attempt to optimize the plan
        pub max_passes: usize, default = 3

        /// When set to true, the physical plan optimizer will run a top down
        /// process to reorder the join keys
        pub top_down_join_key_reordering: bool, default = true

        /// When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin.
        /// HashJoin can work more efficiently than SortMergeJoin but consumes more memory
        pub prefer_hash_join: bool, default = true

        /// The maximum estimated size in bytes for one input side of a HashJoin
        /// for that side to be collected into a single partition
        pub hash_join_single_partition_threshold: usize, default = 1024 * 1024

        /// The maximum estimated size in rows for one input side of a HashJoin
        /// for that side to be collected into a single partition
        pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128

        /// The default filter selectivity used by Filter Statistics
        /// when an exact selectivity cannot be determined. Valid values are
        /// between 0 (no selectivity) and 100 (all rows are selected).
        pub default_filter_selectivity: u8, default = 20

        /// When set to true, the optimizer will not attempt to convert Union to Interleave
        pub prefer_existing_union: bool, default = false

        /// When set to true, if the returned type is a view type
        /// then the output will be coerced to a non-view.
        /// Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.
        pub expand_views_at_output: bool, default = false
    }
}

config_namespace! {
    /// Options controlling explain output
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct ExplainOptions {
        /// When set to true, the explain statement will only print logical plans
        pub logical_plan_only: bool, default = false

        /// When set to true, the explain statement will only print physical plans
        pub physical_plan_only: bool, default = false

        /// When set to true, the explain statement will print operator statistics
        /// for physical plans
        pub show_statistics: bool, default = false

        /// When set to true, the explain statement will print the partition sizes
        pub show_sizes: bool, default = true

        /// When set to true, the explain statement will print schema information
        pub show_schema: bool, default = false
    }
}

/// A key value pair, with a corresponding description
#[derive(Debug)]
pub struct ConfigEntry {
    /// A unique string to identify this config value
    pub key: String,

    /// The value if any
    pub value: Option<String>,

    /// A description of this configuration entry
    pub description: &'static str,
}

/// Configuration options struct, able to store both built-in configuration and custom options
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct ConfigOptions {
    /// Catalog options
    pub catalog: CatalogOptions,
    /// Execution options
    pub execution: ExecutionOptions,
    /// Optimizer options
    pub optimizer: OptimizerOptions,
    /// SQL parser options
    pub sql_parser: SqlParserOptions,
    /// Explain options
    pub explain: ExplainOptions,
    /// Optional extensions registered using [`Extensions::insert`]
    pub extensions: Extensions,
}

impl ConfigField for ConfigOptions {
    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        // Extensions are handled in the public `ConfigOptions::set`
        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
        match key {
            "catalog" => self.catalog.set(rem, value),
            "execution" => self.execution.set(rem, value),
            "optimizer" => self.optimizer.set(rem, value),
            "explain" => self.explain.set(rem, value),
            "sql_parser" => self.sql_parser.set(rem, value),
            _ => _config_err!("Config value \"{key}\" not found on ConfigOptions"),
        }
    }

    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
        self.catalog.visit(v, "datafusion.catalog", "");
        self.execution.visit(v, "datafusion.execution", "");
        self.optimizer.visit(v, "datafusion.optimizer", "");
        self.explain.visit(v, "datafusion.explain", "");
        self.sql_parser.visit(v, "datafusion.sql_parser", "");
    }
}

impl ConfigOptions {
    /// Creates a new [`ConfigOptions`] with default values
    pub fn new() -> Self {
        Self::default()
    }

    /// Set extensions to provided value
    pub fn with_extensions(mut self, extensions: Extensions) -> Self {
        self.extensions = extensions;
        self
    }

    /// Set a configuration option
    pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
        let Some((prefix, key)) = key.split_once('.') else {
            return _config_err!("could not find config namespace for key \"{key}\"");
        };

        if prefix == "datafusion" {
            return ConfigField::set(self, key, value);
        }

        let Some(e) = self.extensions.0.get_mut(prefix) else {
            return _config_err!("Could not find config namespace \"{prefix}\"");
        };
        e.0.set(key, value)
    }

    /// Create new ConfigOptions struct, taking values from
    /// environment variables where possible.
    ///
    /// For example, setting `DATAFUSION_EXECUTION_BATCH_SIZE` will
    /// control `datafusion.execution.batch_size`.
    pub fn from_env() -> Result<Self> {
        struct Visitor(Vec<String>);

        impl Visit for Visitor {
            fn some<V: Display>(&mut self, key: &str, _: V, _: &'static str) {
                self.0.push(key.to_string())
            }

            fn none(&mut self, key: &str, _: &'static str) {
                self.0.push(key.to_string())
            }
        }

        // Extract the names of all fields and then look up the corresponding
        // environment variables. This isn't hugely efficient but avoids
        // ambiguity between `a.b` and `a_b` which would both correspond
        // to an environment variable of `A_B`

        let mut keys = Visitor(vec![]);
        let mut ret = Self::default();
        ret.visit(&mut keys, "datafusion", "");

        for key in keys.0 {
            let env = key.to_uppercase().replace('.', "_");
            if let Some(var) = std::env::var_os(env) {
                ret.set(&key, var.to_string_lossy().as_ref())?;
            }
        }

        Ok(ret)
    }

    /// Create new ConfigOptions struct, taking values from a string hash map.
    ///
    /// Only the built-in configurations will be extracted from the hash map
    /// and other key value pairs will be ignored.
    pub fn from_string_hash_map(settings: &HashMap<String, String>) -> Result<Self> {
        struct Visitor(Vec<String>);

        impl Visit for Visitor {
            fn some<V: Display>(&mut self, key: &str, _: V, _: &'static str) {
                self.0.push(key.to_string())
            }

            fn none(&mut self, key: &str, _: &'static str) {
                self.0.push(key.to_string())
            }
        }

        let mut keys = Visitor(vec![]);
        let mut ret = Self::default();
        ret.visit(&mut keys, "datafusion", "");

        for key in keys.0 {
            if let Some(var) = settings.get(&key) {
                ret.set(&key, var)?;
            }
        }

        Ok(ret)
    }

    /// Returns the [`ConfigEntry`] stored within this [`ConfigOptions`]
    pub fn entries(&self) -> Vec<ConfigEntry> {
        struct Visitor(Vec<ConfigEntry>);

        impl Visit for Visitor {
            fn some<V: Display>(
                &mut self,
                key: &str,
                value: V,
                description: &'static str,
            ) {
                self.0.push(ConfigEntry {
                    key: key.to_string(),
                    value: Some(value.to_string()),
                    description,
                })
            }

            fn none(&mut self, key: &str, description: &'static str) {
                self.0.push(ConfigEntry {
                    key: key.to_string(),
                    value: None,
                    description,
                })
            }
        }

        let mut v = Visitor(vec![]);
        self.visit(&mut v, "datafusion", "");

        v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries()));
        v.0
    }

    /// Generate documentation that can be included in the user guide
    pub fn generate_config_markdown() -> String {
        use std::fmt::Write as _;

        let mut s = Self::default();

        // Normalize for display
        s.execution.target_partitions = 0;
        s.execution.planning_concurrency = 0;

        let mut docs = "| key | default | description |\n".to_string();
        docs += "|-----|---------|-------------|\n";
        let mut entries = s.entries();
        entries.sort_unstable_by(|a, b| a.key.cmp(&b.key));

        for entry in entries {
            let _ = writeln!(
                &mut docs,
                "| {} | {} | {} |",
                entry.key,
                entry.value.as_deref().unwrap_or("NULL"),
                entry.description
            );
        }
        docs
    }
}
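
// A brief usage sketch (not original code): bulk-loading built-in options
// from string settings; unknown keys are ignored, per the doc above.
#[cfg(test)]
mod config_options_example {
    use super::*;

    #[test]
    fn from_string_hash_map_ignores_unknown_keys() {
        let mut settings = HashMap::new();
        settings.insert(
            "datafusion.execution.batch_size".to_string(),
            "1024".to_string(),
        );
        // Not a built-in key, so it is silently ignored.
        settings.insert("some.other.key".to_string(), "x".to_string());

        let options = ConfigOptions::from_string_hash_map(&settings).unwrap();
        assert_eq!(options.execution.batch_size, 1024);
    }
}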

/// [`ConfigExtension`] provides a mechanism to store third-party configuration within DataFusion
///
/// Unfortunately associated constants are not currently object-safe, and so this
/// extends the object-safe [`ExtensionOptions`]
pub trait ConfigExtension: ExtensionOptions {
    /// Configuration namespace prefix to use
    ///
    /// All values under this will be prefixed with `$PREFIX + "."`
    const PREFIX: &'static str;
}

/// An object-safe API for storing arbitrary configuration
pub trait ExtensionOptions: Send + Sync + std::fmt::Debug + 'static {
    /// Return `self` as [`Any`]
    ///
    /// This is needed until trait upcasting is stabilised
    fn as_any(&self) -> &dyn Any;

    /// Return `self` as [`Any`]
    ///
    /// This is needed until trait upcasting is stabilised
    fn as_any_mut(&mut self) -> &mut dyn Any;

    /// Return a deep clone of this [`ExtensionOptions`]
    ///
    /// It is important this does not share mutable state to avoid consistency issues
    /// with configuration changing whilst queries are executing
    fn cloned(&self) -> Box<dyn ExtensionOptions>;

    /// Set the given `key`, `value` pair
    fn set(&mut self, key: &str, value: &str) -> Result<()>;

    /// Returns the [`ConfigEntry`] stored in this [`ExtensionOptions`]
    fn entries(&self) -> Vec<ConfigEntry>;
}

/// A type-safe container for [`ConfigExtension`]
#[derive(Debug, Default, Clone)]
pub struct Extensions(BTreeMap<&'static str, ExtensionBox>);

impl Extensions {
    /// Create a new, empty [`Extensions`]
    pub fn new() -> Self {
        Self(BTreeMap::new())
    }

    /// Registers a [`ConfigExtension`] with this [`Extensions`]
    pub fn insert<T: ConfigExtension>(&mut self, extension: T) {
        assert_ne!(T::PREFIX, "datafusion");
        let e = ExtensionBox(Box::new(extension));
        self.0.insert(T::PREFIX, e);
    }

    /// Retrieves the extension of the given type if any
    pub fn get<T: ConfigExtension>(&self) -> Option<&T> {
        self.0.get(T::PREFIX)?.0.as_any().downcast_ref()
    }

    /// Retrieves the extension of the given type if any
    pub fn get_mut<T: ConfigExtension>(&mut self) -> Option<&mut T> {
        let e = self.0.get_mut(T::PREFIX)?;
        e.0.as_any_mut().downcast_mut()
    }
}

#[derive(Debug)]
struct ExtensionBox(Box<dyn ExtensionOptions>);

impl Clone for ExtensionBox {
    fn clone(&self) -> Self {
        Self(self.0.cloned())
    }
}

/// A trait implemented by `config_namespace` and for field types that provides
/// the ability to walk and mutate the configuration tree
pub trait ConfigField {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str);

    fn set(&mut self, key: &str, value: &str) -> Result<()>;
}

impl<F: ConfigField + Default> ConfigField for Option<F> {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
        match self {
            Some(s) => s.visit(v, key, description),
            None => v.none(key, description),
        }
    }

    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        self.get_or_insert_with(Default::default).set(key, value)
    }
}

#[macro_export]
macro_rules! config_field {
    ($t:ty) => {
        impl ConfigField for $t {
            fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
                v.some(key, self, description)
            }

            fn set(&mut self, _: &str, value: &str) -> Result<()> {
                *self = value.parse().map_err(|e| {
                    DataFusionError::Context(
                        format!(concat!("Error parsing {} as ", stringify!($t)), value),
                        Box::new(DataFusionError::External(Box::new(e))),
                    )
                })?;
                Ok(())
            }
        }
    };
}

config_field!(String);
config_field!(bool);
config_field!(usize);
config_field!(f64);
config_field!(u64);

impl ConfigField for u8 {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
        v.some(key, self, description)
    }

    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        if value.is_empty() {
            return Err(DataFusionError::Configuration(format!(
                "Input string for {} key is empty",
                key
            )));
        }
        // Check if the string is a valid number
        if let Ok(num) = value.parse::<u8>() {
            // TODO: Let's decide how we treat the numerical strings.
            *self = num;
        } else {
            let bytes = value.as_bytes();
            // Check if the first character is ASCII (single byte)
            if bytes.len() > 1 || !value.chars().next().unwrap().is_ascii() {
                return Err(DataFusionError::Configuration(format!(
                    "Error parsing {} as u8. Non-ASCII string provided",
                    value
                )));
            }
            *self = bytes[0];
        }
        Ok(())
    }
}
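
// Illustrative sketch (not original code) of the dual parsing behaviour
// above: a `u8` option accepts either a number or a single ASCII character
// (e.g. a CSV delimiter).
#[cfg(test)]
mod u8_config_field_example {
    use super::*;

    #[test]
    fn u8_accepts_numbers_and_single_ascii_chars() {
        let mut v: u8 = 0;
        v.set("ignored.key", "44").unwrap();
        assert_eq!(v, 44);
        v.set("ignored.key", ";").unwrap();
        assert_eq!(v, b';');
    }
}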

impl ConfigField for CompressionTypeVariant {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
        v.some(key, self, description)
    }

    fn set(&mut self, _: &str, value: &str) -> Result<()> {
        *self = CompressionTypeVariant::from_str(value)?;
        Ok(())
    }
}

/// An implementation trait used to recursively walk configuration
pub trait Visit {
    fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str);

    fn none(&mut self, key: &str, description: &'static str);
}

/// Convenience macro to create [`ExtensionOptions`].
///
/// The created structure implements the following traits:
///
/// - [`Clone`]
/// - [`Debug`]
/// - [`Default`]
/// - [`ExtensionOptions`]
///
/// # Usage
/// The syntax is:
///
/// ```text
/// extensions_options! {
///     /// Struct docs (optional).
///     [<vis>] struct <StructName> {
///         /// Field docs (optional)
///         [<vis>] <field_name>: <field_type>, default = <default_value>
///
///         ... more fields
///     }
/// }
/// ```
///
/// The placeholders are:
/// - `[<vis>]`: Optional visibility modifier like `pub` or `pub(crate)`.
/// - `<StructName>`: Struct name like `MyStruct`.
/// - `<field_name>`: Field name like `my_field`.
/// - `<field_type>`: Field type like `u8`.
/// - `<default_value>`: Default value matching the field type like `42`.
///
/// # Example
/// ```
/// use datafusion_common::extensions_options;
///
/// extensions_options! {
///     /// My own config options.
///     pub struct MyConfig {
///         /// Should "foo" be replaced by "bar"?
///         pub foo_to_bar: bool, default = true
///
///         /// How many "baz" should be created?
///         pub baz_count: usize, default = 1337
///     }
/// }
/// ```
///
/// [`Debug`]: std::fmt::Debug
/// [`ExtensionOptions`]: crate::config::ExtensionOptions
#[macro_export]
macro_rules! extensions_options {
    (
        $(#[doc = $struct_d:tt])*
        $vis:vis struct $struct_name:ident {
            $(
                $(#[doc = $d:tt])*
                $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
            )*$(,)*
        }
    ) => {
        $(#[doc = $struct_d])*
        #[derive(Debug, Clone)]
        #[non_exhaustive]
        $vis struct $struct_name{
            $(
                $(#[doc = $d])*
                $field_vis $field_name : $field_type,
            )*
        }

        impl Default for $struct_name {
            fn default() -> Self {
                Self {
                    $($field_name: $default),*
                }
            }
        }

        impl $crate::config::ExtensionOptions for $struct_name {
            fn as_any(&self) -> &dyn ::std::any::Any {
                self
            }

            fn as_any_mut(&mut self) -> &mut dyn ::std::any::Any {
                self
            }

            fn cloned(&self) -> Box<dyn $crate::config::ExtensionOptions> {
                Box::new(self.clone())
            }

            fn set(&mut self, key: &str, value: &str) -> $crate::Result<()> {
                match key {
                    $(
                        stringify!($field_name) => {
                            self.$field_name = value.parse().map_err(|e| {
                                $crate::DataFusionError::Context(
                                    format!(concat!("Error parsing {} as ", stringify!($field_type)), value),
                                    Box::new($crate::DataFusionError::External(Box::new(e))),
                                )
                            })?;
                            Ok(())
                        }
                    )*
                    _ => Err($crate::DataFusionError::Configuration(
                        format!(concat!("Config value \"{}\" not found on ", stringify!($struct_name)), key)
                    ))
                }
            }

            fn entries(&self) -> Vec<$crate::config::ConfigEntry> {
                vec![
                    $(
                        $crate::config::ConfigEntry {
                            key: stringify!($field_name).to_owned(),
                            value: (self.$field_name != $default).then(|| self.$field_name.to_string()),
                            description: concat!($($d),*).trim(),
                        },
                    )*
                ]
            }
        }
    }
}
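
// A hedged end-to-end sketch (not original code): defining an extension with
// `extensions_options!`, giving it a prefix via `ConfigExtension`, and routing
// keys through `ConfigOptions::set`. The `ExampleExt` name and `example`
// prefix are illustrative assumptions.
#[cfg(test)]
mod extension_registration_example {
    use super::*;

    extensions_options! {
        /// Example extension config (illustrative only).
        pub struct ExampleExt {
            /// Whether the example behaviour is enabled.
            pub enabled: bool, default = false
        }
    }

    impl ConfigExtension for ExampleExt {
        const PREFIX: &'static str = "example";
    }

    #[test]
    fn register_and_set_extension() {
        let mut options = ConfigOptions::new();
        options.extensions.insert(ExampleExt::default());
        // Keys outside the `datafusion` prefix route to the matching extension.
        options.set("example.enabled", "true").unwrap();
        let ext = options.extensions.get::<ExampleExt>().unwrap();
        assert!(ext.enabled);
    }
}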

/// These file types have special built in behavior for configuration.
/// Use `TableOptions::extensions` for configuring other file types.
#[derive(Debug, Clone)]
pub enum ConfigFileType {
    CSV,
    #[cfg(feature = "parquet")]
    PARQUET,
    JSON,
}

/// Represents the configuration options available for handling different table formats within a data processing application.
/// This struct encompasses options for various file formats including CSV, Parquet, and JSON, allowing for flexible configuration
/// of parsing and writing behaviors specific to each format. Additionally, it supports extending functionality through custom extensions.
#[derive(Debug, Clone, Default)]
pub struct TableOptions {
    /// Configuration options for CSV file handling. This includes settings like the delimiter,
    /// quote character, and whether the first row is considered as headers.
    pub csv: CsvOptions,

    /// Configuration options for Parquet file handling. This includes settings for compression,
    /// encoding, and other Parquet-specific file characteristics.
    pub parquet: TableParquetOptions,

    /// Configuration options for JSON file handling.
    pub json: JsonOptions,

    /// The current file format that the table operations should assume. This option allows
    /// for dynamic switching between the supported file types (e.g., CSV, Parquet, JSON).
    pub current_format: Option<ConfigFileType>,

    /// Optional extensions that can be used to extend or customize the behavior of the table
    /// options. Extensions can be registered using `Extensions::insert` and might include
    /// custom file handling logic, additional configuration parameters, or other enhancements.
    pub extensions: Extensions,
}

impl ConfigField for TableOptions {
    /// Visits configuration settings for the current file format, or all formats if none is selected.
    ///
    /// This method adapts the behavior based on whether a file format is currently selected in `current_format`.
    /// If a format is selected, it visits only the settings relevant to that format. Otherwise,
    /// it visits all available format settings.
    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
        if let Some(file_type) = &self.current_format {
            match file_type {
                #[cfg(feature = "parquet")]
                ConfigFileType::PARQUET => self.parquet.visit(v, "format", ""),
                ConfigFileType::CSV => self.csv.visit(v, "format", ""),
                ConfigFileType::JSON => self.json.visit(v, "format", ""),
            }
        } else {
            self.csv.visit(v, "csv", "");
            self.parquet.visit(v, "parquet", "");
            self.json.visit(v, "json", "");
        }
    }

    /// Sets a configuration value for a specific key within `TableOptions`.
    ///
    /// This method delegates setting configuration values to the specific file format configurations,
    /// based on the current format selected. If no format is selected, it returns an error.
    ///
    /// # Parameters
    ///
    /// * `key`: The configuration key specifying which setting to adjust, prefixed with the format (e.g., "format.delimiter")
    ///   for CSV format.
    /// * `value`: The value to set for the specified configuration key.
    ///
    /// # Returns
    ///
    /// A result indicating success or an error if the key is not recognized, if a format is not specified,
    /// or if setting the configuration value fails for the specific format.
    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        // Extensions are handled in the public `ConfigOptions::set`
        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
        let Some(format) = &self.current_format else {
            return _config_err!("Specify a format for TableOptions");
        };
        match key {
            "format" => match format {
                #[cfg(feature = "parquet")]
                ConfigFileType::PARQUET => self.parquet.set(rem, value),
                ConfigFileType::CSV => self.csv.set(rem, value),
                ConfigFileType::JSON => self.json.set(rem, value),
            },
            _ => _config_err!("Config value \"{key}\" not found on TableOptions"),
        }
    }
}
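
// A hedged sketch (not original code): `TableOptions` requires selecting a
// format before `format.*` keys can be routed; `new` and `set_config_format`
// are defined in the `impl TableOptions` block below.
#[cfg(test)]
mod table_options_example {
    use super::*;

    #[test]
    fn format_must_be_selected_before_set() {
        let mut options = TableOptions::new();
        // No current format yet: `format.*` keys cannot be routed.
        assert!(options.set("format.has_header", "true").is_err());

        options.set_config_format(ConfigFileType::CSV);
        options.set("format.has_header", "true").unwrap();
    }
}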
1239 | | |
1240 | | impl TableOptions { |
1241 | | /// Constructs a new instance of `TableOptions` with default settings. |
1242 | | /// |
1243 | | /// # Returns |
1244 | | /// |
1245 | | /// A new `TableOptions` instance with default configuration values. |
1246 | 0 | pub fn new() -> Self { |
1247 | 0 | Self::default() |
1248 | 0 | } |
1249 | | |
1250 | | /// Creates a new `TableOptions` instance initialized with settings from a given session config. |
1251 | | /// |
1252 | | /// # Parameters |
1253 | | /// |
1254 | | /// * `config`: A reference to the session `ConfigOptions` from which to derive initial settings. |
1255 | | /// |
1256 | | /// # Returns |
1257 | | /// |
1258 | | /// A new `TableOptions` instance with settings applied from the session config. |
1259 | 0 | pub fn default_from_session_config(config: &ConfigOptions) -> Self { |
1260 | 0 | let initial = TableOptions::default(); |
1261 | 0 | initial.combine_with_session_config(config); |
1262 | 0 | initial |
1263 | 0 | } |
1264 | | |
1265 | | /// Updates the current `TableOptions` with settings from a given session config. |
1266 | | /// |
1267 | | /// # Parameters |
1268 | | /// |
1269 | | /// * `config`: A reference to the session `ConfigOptions` whose settings are to be applied. |
1270 | | /// |
1271 | | /// # Returns |
1272 | | /// |
1273 | | /// A new `TableOptions` instance with updated settings from the session config. |
1274 | 0 | pub fn combine_with_session_config(&self, config: &ConfigOptions) -> Self { |
1275 | 0 | let mut clone = self.clone(); |
1276 | 0 | clone.parquet.global = config.execution.parquet.clone(); |
1277 | 0 | clone |
1278 | 0 | } |
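
Taken together, `default_from_session_config` and `combine_with_session_config` let table-level options inherit the session's Parquet settings via `parquet.global`. A minimal sketch, assuming the public `datafusion_common::config` paths and the `parquet` feature enabled (the config key below is the standard session-level key):

```rust
use datafusion_common::config::{ConfigOptions, TableOptions};

fn seed_from_session() -> datafusion_common::Result<()> {
    let mut session = ConfigOptions::new();
    // Change a session-level parquet setting...
    session.set("datafusion.execution.parquet.pushdown_filters", "true")?;

    // ...and the derived table options pick it up via `parquet.global`
    let table_options = TableOptions::default_from_session_config(&session);
    assert!(table_options.parquet.global.pushdown_filters);
    Ok(())
}
```
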
1279 | | |
1280 | | /// Sets the file format for the table. |
1281 | | /// |
1282 | | /// # Parameters |
1283 | | /// |
1284 | | /// * `format`: The file format to use (e.g., CSV, Parquet). |
1285 | 0 | pub fn set_config_format(&mut self, format: ConfigFileType) { |
1286 | 0 | self.current_format = Some(format); |
1287 | 0 | } |
1288 | | |
1289 | | /// Sets the extensions for this `TableOptions` instance. |
1290 | | /// |
1291 | | /// # Parameters |
1292 | | /// |
1293 | | /// * `extensions`: The `Extensions` instance to set. |
1294 | | /// |
1295 | | /// # Returns |
1296 | | /// |
1297 | | /// A new `TableOptions` instance with the specified extensions applied. |
1298 | 0 | pub fn with_extensions(mut self, extensions: Extensions) -> Self { |
1299 | 0 | self.extensions = extensions; |
1300 | 0 | self |
1301 | 0 | } |
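
Extensions are how external crates claim their own key prefix on a `TableOptions` (see `TestExtensionConfig` in the tests at the bottom of this file). A condensed sketch, generic over any `ConfigExtension`:

```rust
use datafusion_common::config::{ConfigExtension, Extensions, TableOptions};

// Build a `TableOptions` that routes keys under `T::PREFIX` to `T`
fn with_extension<T: ConfigExtension + Default>() -> TableOptions {
    let mut extensions = Extensions::new();
    extensions.insert(T::default());
    TableOptions::new().with_extensions(extensions)
}
```
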
1302 | | |
1303 | | /// Sets a specific configuration option. |
1304 | | /// |
1305 | | /// # Parameters |
1306 | | /// |
1307 | | /// * `key`: The configuration key (e.g., "format.delimiter"). |
1308 | | /// * `value`: The value to set for the specified key. |
1309 | | /// |
1310 | | /// # Returns |
1311 | | /// |
1312 | | /// A result indicating success or failure in setting the configuration option. |
1313 | 0 | pub fn set(&mut self, key: &str, value: &str) -> Result<()> { |
1314 | 0 | let Some((prefix, _)) = key.split_once('.') else { |
1315 | 0 | return _config_err!("could not find config namespace for key \"{key}\""); |
1316 | | }; |
1317 | | |
1318 | 0 | if prefix == "format" { |
1319 | 0 | return ConfigField::set(self, key, value); |
1320 | 0 | } |
1321 | 0 |
1322 | 0 | if prefix == "execution" { |
1323 | 0 | return Ok(()); |
1324 | 0 | } |
1325 | | |
1326 | 0 | let Some(e) = self.extensions.0.get_mut(prefix) else { |
1327 | 0 | return _config_err!("Could not find config namespace \"{prefix}\""); |
1328 | | }; |
1329 | 0 | e.0.set(key, value) |
1330 | 0 | } |
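
The prefix before the first `.` decides where a key goes: `format` delegates to the currently selected file format, `execution` is accepted but ignored here (those keys belong to the session config), and any other prefix must name a registered extension. A sketch of the observable behavior, mirroring the tests below:

```rust
use datafusion_common::config::{ConfigFileType, TableOptions};

fn key_routing() -> datafusion_common::Result<()> {
    let mut opts = TableOptions::new();
    opts.set_config_format(ConfigFileType::CSV);

    // "format." keys are applied to the selected format (CSV here)
    opts.set("format.delimiter", ";")?;
    assert_eq!(opts.csv.delimiter, b';');

    // "execution." keys are tolerated but have no effect on TableOptions
    opts.set("execution.batch_size", "1024")?;

    // other prefixes fail unless a matching extension was registered
    assert!(opts.set("unknown.key", "value").is_err());
    Ok(())
}
```
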
1331 | | |
1332 | | /// Initializes a new `TableOptions` from a hash map of string settings. |
1333 | | /// |
1334 | | /// # Parameters |
1335 | | /// |
1336 | | /// * `settings`: A hash map where each key-value pair represents a configuration setting. |
1337 | | /// |
1338 | | /// # Returns |
1339 | | /// |
1340 | | /// A result containing the new `TableOptions` instance or an error if any setting could not be applied. |
1341 | 0 | pub fn from_string_hash_map(settings: &HashMap<String, String>) -> Result<Self> { |
1342 | 0 | let mut ret = Self::default(); |
1343 | 0 | for (k, v) in settings { |
1344 | 0 | ret.set(k, v)?; |
1345 | | } |
1346 | | |
1347 | 0 | Ok(ret) |
1348 | 0 | } |
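
One pitfall worth noting: `Self::default()` has no format selected, so a map containing `format.`-prefixed keys fails here. Selecting a format first and then applying the same map with `alter_with_string_hash_map` (defined just below) succeeds. A minimal sketch:

```rust
use std::collections::HashMap;
use datafusion_common::config::{ConfigFileType, TableOptions};

fn build_and_alter() -> datafusion_common::Result<()> {
    let settings =
        HashMap::from([("format.delimiter".to_string(), ";".to_string())]);

    // Fails: the default TableOptions cannot route "format."-prefixed keys
    assert!(TableOptions::from_string_hash_map(&settings).is_err());

    // Selecting a format first lets the same settings apply cleanly
    let mut opts = TableOptions::new();
    opts.set_config_format(ConfigFileType::CSV);
    opts.alter_with_string_hash_map(&settings)?;
    assert_eq!(opts.csv.delimiter, b';');
    Ok(())
}
```
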
1349 | | |
1350 | | /// Modifies the current `TableOptions` instance with settings from a hash map. |
1351 | | /// |
1352 | | /// # Parameters |
1353 | | /// |
1354 | | /// * `settings`: A hash map where each key-value pair represents a configuration setting. |
1355 | | /// |
1356 | | /// # Returns |
1357 | | /// |
1358 | | /// A result indicating success or failure in applying the settings. |
1359 | 0 | pub fn alter_with_string_hash_map( |
1360 | 0 | &mut self, |
1361 | 0 | settings: &HashMap<String, String>, |
1362 | 0 | ) -> Result<()> { |
1363 | 0 | for (k, v) in settings { |
1364 | 0 | self.set(k, v)?; |
1365 | | } |
1366 | 0 | Ok(()) |
1367 | 0 | } |
1368 | | |
1369 | | /// Retrieves all configuration entries from this `TableOptions`. |
1370 | | /// |
1371 | | /// # Returns |
1372 | | /// |
1373 | | /// A vector of `ConfigEntry` instances, representing all the configuration options within this `TableOptions`. |
1374 | 0 | pub fn entries(&self) -> Vec<ConfigEntry> { |
1375 | | struct Visitor(Vec<ConfigEntry>); |
1376 | | |
1377 | | impl Visit for Visitor { |
1378 | 0 | fn some<V: Display>( |
1379 | 0 | &mut self, |
1380 | 0 | key: &str, |
1381 | 0 | value: V, |
1382 | 0 | description: &'static str, |
1383 | 0 | ) { |
1384 | 0 | self.0.push(ConfigEntry { |
1385 | 0 | key: key.to_string(), |
1386 | 0 | value: Some(value.to_string()), |
1387 | 0 | description, |
1388 | 0 | }) |
1389 | 0 | } |
1390 | | |
1391 | 0 | fn none(&mut self, key: &str, description: &'static str) { |
1392 | 0 | self.0.push(ConfigEntry { |
1393 | 0 | key: key.to_string(), |
1394 | 0 | value: None, |
1395 | 0 | description, |
1396 | 0 | }) |
1397 | 0 | } |
1398 | | } |
1399 | | |
1400 | 0 | let mut v = Visitor(vec![]); |
1401 | 0 | self.visit(&mut v, "format", ""); |
1402 | 0 |
1403 | 0 | v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries())); |
1404 | 0 | v.0 |
1405 | 0 | } |
1406 | | } |
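
The visitor above flattens the selected format's options, plus any extension entries, into one list. A sketch of dumping the entries that have a value set:

```rust
use datafusion_common::config::{ConfigFileType, TableOptions};

fn dump_entries() {
    let mut opts = TableOptions::new();
    opts.set_config_format(ConfigFileType::CSV);
    for entry in opts.entries() {
        // `value` is None for unset optional fields
        if let Some(value) = &entry.value {
            println!("{} = {}", entry.key, value);
        }
    }
}
```
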
1407 | | |
1408 | | /// Options that control how Parquet files are read, including global options |
1409 | | /// that apply to all columns and optional column-specific overrides |
1410 | | /// |
1411 | | /// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions). |
1412 | | /// Properties not included in [`TableParquetOptions`] may not be configurable via the
1413 | | /// external API (e.g. sorting_columns).
1414 | | #[derive(Clone, Default, Debug, PartialEq)] |
1415 | | pub struct TableParquetOptions { |
1416 | | /// Global Parquet options that propagate to all columns.
1417 | | pub global: ParquetOptions, |
1418 | | /// Column-specific options. Keys follow the pattern `parquet.<option>::<column path>`.
1419 | | pub column_specific_options: HashMap<String, ParquetColumnOptions>, |
1420 | | /// Additional file-level metadata to include. Inserted into the key_value_metadata |
1421 | | /// for the written [`FileMetaData`](https://docs.rs/parquet/latest/parquet/file/metadata/struct.FileMetaData.html). |
1422 | | /// |
1423 | | /// Multiple entries are permitted |
1424 | | /// ```sql |
1425 | | /// OPTIONS ( |
1426 | | /// 'format.metadata::key1' '', |
1427 | | /// 'format.metadata::key2' 'value', |
1428 | | /// 'format.metadata::key3' 'value has spaces', |
1429 | | /// 'format.metadata::key4' 'value has special chars :: :', |
1430 | | /// 'format.metadata::key_dupe' 'original will be overwritten', |
1431 | | /// 'format.metadata::key_dupe' 'final' |
1432 | | /// ) |
1433 | | /// ``` |
1434 | | pub key_value_metadata: HashMap<String, Option<String>>, |
1435 | | } |
1436 | | |
1437 | | impl TableParquetOptions { |
1438 | | /// Return new default TableParquetOptions |
1439 | 0 | pub fn new() -> Self { |
1440 | 0 | Self::default() |
1441 | 0 | } |
1442 | | } |
1443 | | |
1444 | | impl ConfigField for TableParquetOptions { |
1445 | 0 | fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, description: &'static str) { |
1446 | 0 | self.global.visit(v, key_prefix, description); |
1447 | 0 | self.column_specific_options |
1448 | 0 | .visit(v, key_prefix, description) |
1449 | 0 | } |
1450 | | |
1451 | 0 | fn set(&mut self, key: &str, value: &str) -> Result<()> { |
1452 | 0 | // Determine if the key is a global, metadata, or column-specific setting |
1453 | 0 | if key.starts_with("metadata::") { |
1454 | 0 | let k = match key.split("::").collect::<Vec<_>>()[..] { |
1455 | 0 | [_meta] | [_meta, ""] => { |
1456 | 0 | return _config_err!( |
1457 | 0 | "Invalid metadata key provided, missing key in metadata::<key>" |
1458 | 0 | ) |
1459 | | } |
1460 | 0 | [_meta, k] => k.into(), |
1461 | | _ => { |
1462 | 0 | return _config_err!( |
1463 | 0 | "Invalid metadata key provided, found too many '::' in \"{key}\"" |
1464 | 0 | ) |
1465 | | } |
1466 | | }; |
1467 | 0 | self.key_value_metadata.insert(k, Some(value.into())); |
1468 | 0 | Ok(()) |
1469 | 0 | } else if key.contains("::") { |
1470 | 0 | self.column_specific_options.set(key, value) |
1471 | | } else { |
1472 | 0 | self.global.set(key, value) |
1473 | | } |
1474 | 0 | } |
1475 | | } |
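
`TableParquetOptions::set` distinguishes three key shapes: `metadata::<key>` for file-level key/value metadata, `<option>::<column>` for column-specific overrides, and a bare `<option>` for global settings. A sketch mirroring the parquet tests below (the metadata key name is illustrative; `bloom_filter_enabled` and `pushdown_filters` are real option fields):

```rust
use datafusion_common::config::{ConfigField, TableParquetOptions};

fn parquet_key_shapes() -> datafusion_common::Result<()> {
    let mut opts = TableParquetOptions::new();

    // file-level key/value metadata
    opts.set("metadata::my_key", "my value")?;

    // column-specific override for column "col1"
    opts.set("bloom_filter_enabled::col1", "true")?;

    // global option on ParquetOptions
    opts.set("pushdown_filters", "true")?;
    Ok(())
}
```
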
1476 | | |
1477 | | macro_rules! config_namespace_with_hashmap { |
1478 | | ( |
1479 | | $(#[doc = $struct_d:tt])* |
1480 | | $vis:vis struct $struct_name:ident { |
1481 | | $( |
1482 | | $(#[doc = $d:tt])* |
1483 | | $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr |
1484 | | )*$(,)* |
1485 | | } |
1486 | | ) => { |
1487 | | |
1488 | | $(#[doc = $struct_d])* |
1489 | | #[derive(Debug, Clone, PartialEq)] |
1490 | | $vis struct $struct_name{ |
1491 | | $( |
1492 | | $(#[doc = $d])* |
1493 | | $field_vis $field_name : $field_type, |
1494 | | )* |
1495 | | } |
1496 | | |
1497 | | impl ConfigField for $struct_name { |
1498 | 0 | fn set(&mut self, key: &str, value: &str) -> Result<()> { |
1499 | 0 | let (key, rem) = key.split_once('.').unwrap_or((key, "")); |
1500 | 0 | match key { |
1501 | 0 | $( |
1502 | 0 | stringify!($field_name) => self.$field_name.set(rem, value), |
1503 | | )* |
1504 | 0 | _ => _config_err!( |
1505 | 0 | "Config value \"{}\" not found on {}", key, stringify!($struct_name) |
1506 | 0 | ) |
1507 | | } |
1508 | 0 | } |
1509 | | |
1510 | 0 | fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) { |
1511 | 0 | $( |
1512 | 0 | let key = format!(concat!("{}.", stringify!($field_name)), key_prefix); |
1513 | 0 | let desc = concat!($($d),*).trim(); |
1514 | 0 | self.$field_name.visit(v, key.as_str(), desc); |
1515 | 0 | )* |
1516 | 0 | } |
1517 | | } |
1518 | | |
1519 | | impl Default for $struct_name { |
1520 | 0 | fn default() -> Self { |
1521 | 0 | Self { |
1522 | 0 | $($field_name: $default),* |
1523 | 0 | } |
1524 | 0 | } |
1525 | | } |
1526 | | |
1527 | | impl ConfigField for HashMap<String,$struct_name> { |
1528 | 0 | fn set(&mut self, key: &str, value: &str) -> Result<()> { |
1529 | 0 | let parts: Vec<&str> = key.splitn(2, "::").collect(); |
1530 | 0 | match parts.as_slice() { |
1531 | 0 | [inner_key, hashmap_key] => { |
1532 | 0 | // Get or create the inner options struct for the specified hashmap key
1533 | 0 | let inner_value = self |
1534 | 0 | .entry((*hashmap_key).to_owned()) |
1535 | 0 | .or_insert_with($struct_name::default); |
1536 | 0 |
1537 | 0 | inner_value.set(inner_key, value) |
1538 | | } |
1539 | 0 | _ => _config_err!("Unrecognized key '{key}'."), |
1540 | | } |
1541 | 0 | } |
1542 | | |
1543 | 0 | fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) { |
1544 | 0 | for (column_name, col_options) in self { |
1545 | 0 | $( |
1546 | 0 | let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name)); |
1547 | 0 | let desc = concat!($($d),*).trim(); |
1548 | 0 | col_options.$field_name.visit(v, key.as_str(), desc); |
1549 | 0 | )* |
1550 | 0 | } |
1551 | 0 | } |
1552 | | } |
1553 | | } |
1554 | | } |
1555 | | |
1556 | | config_namespace_with_hashmap! { |
1557 | | /// Options controlling parquet format for individual columns. |
1558 | | /// |
1559 | | /// See [`ParquetOptions`] for more details |
1560 | | pub struct ParquetColumnOptions { |
1561 | | /// Sets if bloom filter is enabled for the column path. |
1562 | | pub bloom_filter_enabled: Option<bool>, default = None |
1563 | | |
1564 | | /// Sets encoding for the column path. |
1565 | | /// Valid values are: plain, plain_dictionary, rle, |
1566 | | /// bit_packed, delta_binary_packed, delta_length_byte_array, |
1567 | | /// delta_byte_array, rle_dictionary, and byte_stream_split. |
1568 | | /// These values are not case-sensitive. If NULL, uses |
1569 | | /// default parquet options |
1570 | | pub encoding: Option<String>, default = None |
1571 | | |
1572 | | /// Sets whether dictionary encoding is enabled for the column path. If NULL, uses
1573 | | /// default parquet options |
1574 | | pub dictionary_enabled: Option<bool>, default = None |
1575 | | |
1576 | | /// Sets default parquet compression codec for the column path. |
1577 | | /// Valid values are: uncompressed, snappy, gzip(level), |
1578 | | /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. |
1579 | | /// These values are not case-sensitive. If NULL, uses |
1580 | | /// default parquet options |
1581 | | pub compression: Option<String>, default = None |
1582 | | |
1583 | | /// Sets whether statistics are enabled for the column.
1584 | | /// Valid values are: "none", "chunk", and "page".
1585 | | /// These values are not case-sensitive. If NULL, uses
1586 | | /// default parquet options |
1587 | | pub statistics_enabled: Option<String>, default = None |
1588 | | |
1589 | | /// Sets bloom filter false positive probability for the column path. If NULL, uses |
1590 | | /// default parquet options |
1591 | | pub bloom_filter_fpp: Option<f64>, default = None |
1592 | | |
1593 | | /// Sets bloom filter number of distinct values. If NULL, uses |
1594 | | /// default parquet options |
1595 | | pub bloom_filter_ndv: Option<u64>, default = None |
1596 | | |
1597 | | /// Sets max statistics size for the column path. If NULL, uses |
1598 | | /// default parquet options |
1599 | | pub max_statistics_size: Option<usize>, default = None |
1600 | | } |
1601 | | } |
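
The generated `ConfigField` impl for `HashMap<String, ParquetColumnOptions>` splits each key on the first `::`: the left side names the inner field and the right side names the column, inserting a default entry for the column on first use. A minimal sketch:

```rust
use std::collections::HashMap;
use datafusion_common::config::{ConfigField, ParquetColumnOptions};

fn column_override() -> datafusion_common::Result<()> {
    let mut per_column: HashMap<String, ParquetColumnOptions> = HashMap::new();

    // "encoding::col_a" -> field "encoding" on the entry for column "col_a"
    per_column.set("encoding::col_a", "plain")?;

    assert_eq!(per_column["col_a"].encoding.as_deref(), Some("plain"));
    Ok(())
}
```
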
1602 | | |
1603 | | config_namespace! { |
1604 | | /// Options controlling CSV format |
1605 | | pub struct CsvOptions { |
1606 | | /// Specifies whether there is a CSV header (i.e. the first line |
1607 | | /// consists of column names). The value `None` indicates that
1608 | | /// the configuration should be consulted. |
1609 | | pub has_header: Option<bool>, default = None |
1610 | | pub delimiter: u8, default = b',' |
1611 | | pub quote: u8, default = b'"' |
1612 | | pub terminator: Option<u8>, default = None |
1613 | | pub escape: Option<u8>, default = None |
1614 | | pub double_quote: Option<bool>, default = None |
1615 | | /// Specifies whether newlines in (quoted) values are supported. |
1616 | | /// |
1617 | | /// Parsing newlines in quoted values may be affected by execution behaviour such as |
1618 | | /// parallel file scanning. Setting this to `true` ensures that newlines in values are |
1619 | | /// parsed successfully, which may reduce performance. |
1620 | | /// |
1621 | | /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting. |
1622 | | pub newlines_in_values: Option<bool>, default = None |
1623 | | pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED |
1624 | | pub schema_infer_max_rec: usize, default = 100 |
1625 | | pub date_format: Option<String>, default = None |
1626 | | pub datetime_format: Option<String>, default = None |
1627 | | pub timestamp_format: Option<String>, default = None |
1628 | | pub timestamp_tz_format: Option<String>, default = None |
1629 | | pub time_format: Option<String>, default = None |
1630 | | pub null_value: Option<String>, default = None |
1631 | | pub comment: Option<u8>, default = None |
1632 | | } |
1633 | | } |
1634 | | |
1635 | | impl CsvOptions { |
1636 | | /// Set the file compression type
1637 | | /// - defaults to `CompressionTypeVariant::UNCOMPRESSED`
1638 | 0 | pub fn with_compression( |
1639 | 0 | mut self, |
1640 | 0 | compression_type_variant: CompressionTypeVariant, |
1641 | 0 | ) -> Self { |
1642 | 0 | self.compression = compression_type_variant; |
1643 | 0 | self |
1644 | 0 | } |
1645 | | |
1646 | | /// Set the maximum number of records to scan when inferring the schema
1647 | | /// - defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
1648 | 0 | pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self { |
1649 | 0 | self.schema_infer_max_rec = max_rec; |
1650 | 0 | self |
1651 | 0 | } |
1652 | | |
1653 | | /// Set to true to indicate that the first line is a header.
1654 | | /// - defaults to true
1655 | 0 | pub fn with_has_header(mut self, has_header: bool) -> Self { |
1656 | 0 | self.has_header = Some(has_header); |
1657 | 0 | self |
1658 | 0 | } |
1659 | | |
1660 | | /// Returns true if the first line is a header. If the format options do not
1661 | | /// specify whether there is a header, returns `None` (indicating that the |
1662 | | /// configuration should be consulted). |
1663 | 0 | pub fn has_header(&self) -> Option<bool> { |
1664 | 0 | self.has_header |
1665 | 0 | } |
1666 | | |
1667 | | /// The character separating values within a row. |
1668 | | /// - defaults to ','
1669 | 0 | pub fn with_delimiter(mut self, delimiter: u8) -> Self { |
1670 | 0 | self.delimiter = delimiter; |
1671 | 0 | self |
1672 | 0 | } |
1673 | | |
1674 | | /// The quote character in a row. |
1675 | | /// - defaults to '"'
1676 | 0 | pub fn with_quote(mut self, quote: u8) -> Self { |
1677 | 0 | self.quote = quote; |
1678 | 0 | self |
1679 | 0 | } |
1680 | | |
1681 | | /// The character that terminates a row. |
1682 | | /// - defaults to None (CRLF)
1683 | 0 | pub fn with_terminator(mut self, terminator: Option<u8>) -> Self { |
1684 | 0 | self.terminator = terminator; |
1685 | 0 | self |
1686 | 0 | } |
1687 | | |
1688 | | /// The escape character in a row. |
1689 | | /// - default is None |
1690 | 0 | pub fn with_escape(mut self, escape: Option<u8>) -> Self { |
1691 | 0 | self.escape = escape; |
1692 | 0 | self |
1693 | 0 | } |
1694 | | |
1695 | | /// Set to true to indicate that quotes within quoted CSV values are escaped by doubling them.
1696 | | /// - defaults to true
1697 | 0 | pub fn with_double_quote(mut self, double_quote: bool) -> Self { |
1698 | 0 | self.double_quote = Some(double_quote); |
1699 | 0 | self |
1700 | 0 | } |
1701 | | |
1702 | | /// Specifies whether newlines in (quoted) values are supported. |
1703 | | /// |
1704 | | /// Parsing newlines in quoted values may be affected by execution behaviour such as |
1705 | | /// parallel file scanning. Setting this to `true` ensures that newlines in values are |
1706 | | /// parsed successfully, which may reduce performance. |
1707 | | /// |
1708 | | /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting. |
1709 | 0 | pub fn with_newlines_in_values(mut self, newlines_in_values: bool) -> Self { |
1710 | 0 | self.newlines_in_values = Some(newlines_in_values); |
1711 | 0 | self |
1712 | 0 | } |
1713 | | |
1714 | | /// Set the `CompressionTypeVariant` for the CSV file
1715 | | /// - defaults to `CompressionTypeVariant::UNCOMPRESSED` |
1716 | 0 | pub fn with_file_compression_type( |
1717 | 0 | mut self, |
1718 | 0 | compression: CompressionTypeVariant, |
1719 | 0 | ) -> Self { |
1720 | 0 | self.compression = compression; |
1721 | 0 | self |
1722 | 0 | } |
1723 | | |
1724 | | /// The delimiter character. |
1725 | 0 | pub fn delimiter(&self) -> u8 { |
1726 | 0 | self.delimiter |
1727 | 0 | } |
1728 | | |
1729 | | /// The quote character. |
1730 | 0 | pub fn quote(&self) -> u8 { |
1731 | 0 | self.quote |
1732 | 0 | } |
1733 | | |
1734 | | /// The terminator character. |
1735 | 0 | pub fn terminator(&self) -> Option<u8> { |
1736 | 0 | self.terminator |
1737 | 0 | } |
1738 | | |
1739 | | /// The escape character. |
1740 | 0 | pub fn escape(&self) -> Option<u8> { |
1741 | 0 | self.escape |
1742 | 0 | } |
1743 | | } |
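
The builders above compose in the usual chained style. A small sketch configuring a pipe-delimited file with a header row:

```rust
use datafusion_common::config::CsvOptions;

fn pipe_delimited() -> CsvOptions {
    CsvOptions::default()
        .with_has_header(true)
        .with_delimiter(b'|')
        .with_quote(b'\'')
        .with_newlines_in_values(true)
}
```
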
1744 | | |
1745 | | config_namespace! { |
1746 | | /// Options controlling JSON format |
1747 | | pub struct JsonOptions { |
1748 | | pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED |
1749 | | pub schema_infer_max_rec: usize, default = 100 |
1750 | | } |
1751 | | } |
1752 | | |
1753 | | pub trait FormatOptionsExt: Display {} |
1754 | | |
1755 | | #[derive(Debug, Clone, PartialEq)] |
1756 | | #[allow(clippy::large_enum_variant)] |
1757 | | pub enum FormatOptions { |
1758 | | CSV(CsvOptions), |
1759 | | JSON(JsonOptions), |
1760 | | #[cfg(feature = "parquet")] |
1761 | | PARQUET(TableParquetOptions), |
1762 | | AVRO, |
1763 | | ARROW, |
1764 | | } |
1765 | | |
1766 | | impl Display for FormatOptions { |
1767 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1768 | 0 | let out = match self { |
1769 | 0 | FormatOptions::CSV(_) => "csv", |
1770 | 0 | FormatOptions::JSON(_) => "json", |
1771 | | #[cfg(feature = "parquet")] |
1772 | | FormatOptions::PARQUET(_) => "parquet", |
1773 | 0 | FormatOptions::AVRO => "avro", |
1774 | 0 | FormatOptions::ARROW => "arrow", |
1775 | | }; |
1776 | 0 | write!(f, "{}", out) |
1777 | 0 | } |
1778 | | } |
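
The `Display` impl yields the lowercase format name, which is useful for error messages and logging. A one-line check:

```rust
use datafusion_common::config::{CsvOptions, FormatOptions};

fn format_name() {
    let format = FormatOptions::CSV(CsvOptions::default());
    assert_eq!(format.to_string(), "csv");
}
```
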
1779 | | |
1780 | | #[cfg(test)] |
1781 | | mod tests { |
1782 | | use std::any::Any; |
1783 | | use std::collections::HashMap; |
1784 | | |
1785 | | use crate::config::{ |
1786 | | ConfigEntry, ConfigExtension, ConfigFileType, ExtensionOptions, Extensions, |
1787 | | TableOptions, |
1788 | | }; |
1789 | | |
1790 | | #[derive(Default, Debug, Clone)] |
1791 | | pub struct TestExtensionConfig { |
1792 | | /// Arbitrary key/value pairs collected by this extension's `set`
1793 | | pub properties: HashMap<String, String>, |
1794 | | } |
1795 | | |
1796 | | impl ExtensionOptions for TestExtensionConfig { |
1797 | | fn as_any(&self) -> &dyn Any { |
1798 | | self |
1799 | | } |
1800 | | |
1801 | | fn as_any_mut(&mut self) -> &mut dyn Any { |
1802 | | self |
1803 | | } |
1804 | | |
1805 | | fn cloned(&self) -> Box<dyn ExtensionOptions> { |
1806 | | Box::new(self.clone()) |
1807 | | } |
1808 | | |
1809 | | fn set(&mut self, key: &str, value: &str) -> crate::Result<()> { |
1810 | | let (key, rem) = key.split_once('.').unwrap_or((key, "")); |
1811 | | assert_eq!(key, "test"); |
1812 | | self.properties.insert(rem.to_owned(), value.to_owned()); |
1813 | | Ok(()) |
1814 | | } |
1815 | | |
1816 | | fn entries(&self) -> Vec<ConfigEntry> { |
1817 | | self.properties |
1818 | | .iter() |
1819 | | .map(|(k, v)| ConfigEntry { |
1820 | | key: k.into(), |
1821 | | value: Some(v.into()), |
1822 | | description: "", |
1823 | | }) |
1824 | | .collect() |
1825 | | } |
1826 | | } |
1827 | | |
1828 | | impl ConfigExtension for TestExtensionConfig { |
1829 | | const PREFIX: &'static str = "test"; |
1830 | | } |
1831 | | |
1832 | | #[test] |
1833 | | fn create_table_config() { |
1834 | | let mut extension = Extensions::new(); |
1835 | | extension.insert(TestExtensionConfig::default()); |
1836 | | let table_config = TableOptions::new().with_extensions(extension); |
1837 | | let kafka_config = table_config.extensions.get::<TestExtensionConfig>(); |
1838 | | assert!(kafka_config.is_some()) |
1839 | | } |
1840 | | |
1841 | | #[test] |
1842 | | fn alter_test_extension_config() { |
1843 | | let mut extension = Extensions::new(); |
1844 | | extension.insert(TestExtensionConfig::default()); |
1845 | | let mut table_config = TableOptions::new().with_extensions(extension); |
1846 | | table_config.set_config_format(ConfigFileType::CSV); |
1847 | | table_config.set("format.delimiter", ";").unwrap(); |
1848 | | assert_eq!(table_config.csv.delimiter, b';'); |
1849 | | table_config.set("test.bootstrap.servers", "asd").unwrap(); |
1850 | | let kafka_config = table_config |
1851 | | .extensions |
1852 | | .get::<TestExtensionConfig>() |
1853 | | .unwrap(); |
1854 | | assert_eq!( |
1855 | | kafka_config.properties.get("bootstrap.servers").unwrap(), |
1856 | | "asd" |
1857 | | ); |
1858 | | } |
1859 | | |
1860 | | #[test] |
1861 | | fn csv_u8_table_options() { |
1862 | | let mut table_config = TableOptions::new(); |
1863 | | table_config.set_config_format(ConfigFileType::CSV); |
1864 | | table_config.set("format.delimiter", ";").unwrap(); |
1865 | | assert_eq!(table_config.csv.delimiter as char, ';'); |
1866 | | table_config.set("format.escape", "\"").unwrap(); |
1867 | | assert_eq!(table_config.csv.escape.unwrap() as char, '"'); |
1868 | | table_config.set("format.escape", "\'").unwrap(); |
1869 | | assert_eq!(table_config.csv.escape.unwrap() as char, '\''); |
1870 | | } |
1871 | | |
1872 | | #[cfg(feature = "parquet")] |
1873 | | #[test] |
1874 | | fn parquet_table_options() { |
1875 | | let mut table_config = TableOptions::new(); |
1876 | | table_config.set_config_format(ConfigFileType::PARQUET); |
1877 | | table_config |
1878 | | .set("format.bloom_filter_enabled::col1", "true") |
1879 | | .unwrap(); |
1880 | | assert_eq!( |
1881 | | table_config.parquet.column_specific_options["col1"].bloom_filter_enabled, |
1882 | | Some(true) |
1883 | | ); |
1884 | | } |
1885 | | |
1886 | | #[cfg(feature = "parquet")] |
1887 | | #[test] |
1888 | | fn parquet_table_options_config_entry() { |
1889 | | let mut table_config = TableOptions::new(); |
1890 | | table_config.set_config_format(ConfigFileType::PARQUET); |
1891 | | table_config |
1892 | | .set("format.bloom_filter_enabled::col1", "true") |
1893 | | .unwrap(); |
1894 | | let entries = table_config.entries(); |
1895 | | assert!(entries |
1896 | | .iter() |
1897 | | .any(|item| item.key == "format.bloom_filter_enabled::col1")) |
1898 | | } |
1899 | | |
1900 | | #[cfg(feature = "parquet")] |
1901 | | #[test] |
1902 | | fn parquet_table_options_config_metadata_entry() { |
1903 | | let mut table_config = TableOptions::new(); |
1904 | | table_config.set_config_format(ConfigFileType::PARQUET); |
1905 | | table_config.set("format.metadata::key1", "").unwrap(); |
1906 | | table_config.set("format.metadata::key2", "value2").unwrap(); |
1907 | | table_config |
1908 | | .set("format.metadata::key3", "value with spaces ") |
1909 | | .unwrap(); |
1910 | | table_config |
1911 | | .set("format.metadata::key4", "value with special chars :: :") |
1912 | | .unwrap(); |
1913 | | |
1914 | | let parsed_metadata = table_config.parquet.key_value_metadata.clone(); |
1915 | | assert_eq!(parsed_metadata.get("should not exist1"), None); |
1916 | | assert_eq!(parsed_metadata.get("key1"), Some(&Some("".into()))); |
1917 | | assert_eq!(parsed_metadata.get("key2"), Some(&Some("value2".into()))); |
1918 | | assert_eq!( |
1919 | | parsed_metadata.get("key3"), |
1920 | | Some(&Some("value with spaces ".into())) |
1921 | | ); |
1922 | | assert_eq!( |
1923 | | parsed_metadata.get("key4"), |
1924 | | Some(&Some("value with special chars :: :".into())) |
1925 | | ); |
1926 | | |
1927 | | // duplicate keys are overwritten |
1928 | | table_config.set("format.metadata::key_dupe", "A").unwrap(); |
1929 | | table_config.set("format.metadata::key_dupe", "B").unwrap(); |
1930 | | let parsed_metadata = table_config.parquet.key_value_metadata; |
1931 | | assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into()))); |
1932 | | } |
1933 | | } |