Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/common/src/config.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Runtime configuration, via [`ConfigOptions`]

use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::fmt::{self, Display};
use std::str::FromStr;

use crate::error::_config_err;
use crate::parsers::CompressionTypeVariant;
use crate::{DataFusionError, Result};

/// A macro that wraps a configuration struct and automatically derives
/// [`Default`] and [`ConfigField`] for it, allowing it to be used
/// in the [`ConfigOptions`] configuration tree
///
/// For example,
///
/// ```ignore
/// config_namespace! {
///    /// Amazing config
///    pub struct MyConfig {
///        /// Field 1 doc
///        field1: String, default = "".to_string()
///
///        /// Field 2 doc
///        field2: usize, default = 232
///
///        /// Field 3 doc
///        field3: Option<usize>, default = None
///    }
/// }
/// ```
///
/// Will generate
///
/// ```ignore
/// /// Amazing config
/// #[derive(Debug, Clone)]
/// #[non_exhaustive]
/// pub struct MyConfig {
///     /// Field 1 doc
///     field1: String,
///     /// Field 2 doc
///     field2: usize,
///     /// Field 3 doc
///     field3: Option<usize>,
/// }
/// impl ConfigField for MyConfig {
///     fn set(&mut self, key: &str, value: &str) -> Result<()> {
///         let (key, rem) = key.split_once('.').unwrap_or((key, ""));
///         match key {
///             "field1" => self.field1.set(rem, value),
///             "field2" => self.field2.set(rem, value),
///             "field3" => self.field3.set(rem, value),
///             _ => _config_err!(
///                 "Config value \"{}\" not found on MyConfig",
///                 key
///             ),
///         }
///     }
///
///     fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
///         let key = format!("{}.field1", key_prefix);
///         let desc = "Field 1 doc";
///         self.field1.visit(v, key.as_str(), desc);
///         let key = format!("{}.field2", key_prefix);
///         let desc = "Field 2 doc";
///         self.field2.visit(v, key.as_str(), desc);
///         let key = format!("{}.field3", key_prefix);
///         let desc = "Field 3 doc";
///         self.field3.visit(v, key.as_str(), desc);
///     }
/// }
///
/// impl Default for MyConfig {
///     fn default() -> Self {
///         Self {
///             field1: "".to_string(),
///             field2: 232,
///             field3: None,
///         }
///     }
/// }
/// ```
///
/// NB: Misplaced commas may result in nonsensical errors
#[macro_export]
macro_rules! config_namespace {
    (
     $(#[doc = $struct_d:tt])*
     $vis:vis struct $struct_name:ident {
        $(
        $(#[doc = $d:tt])*
        $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
        )*$(,)*
    }
    ) => {

        $(#[doc = $struct_d])*
        #[derive(Debug, Clone, PartialEq)]
        $vis struct $struct_name{
            $(
            $(#[doc = $d])*
            $field_vis $field_name : $field_type,
            )*
        }

        impl ConfigField for $struct_name {
            fn set(&mut self, key: &str, value: &str) -> Result<()> {
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
                match key {
                    $(
                       stringify!($field_name) => self.$field_name.set(rem, value),
                    )*
                    _ => return _config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
                    )
                }
            }

            fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                $(
                let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                let desc = concat!($($d),*).trim();
                self.$field_name.visit(v, key.as_str(), desc);
                )*
            }
        }

        impl Default for $struct_name {
            fn default() -> Self {
                Self {
                    $($field_name: $default),*
                }
            }
        }
    }
}
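
The generated `set` splits the key at the first `.` and dispatches on the head segment, so namespaces nest arbitrarily deep. A minimal sketch of the dispatch, using the hypothetical `MyConfig` from the doc comment above (not a type defined in this file):

```rust
use datafusion_common::config::ConfigField;

// `MyConfig` is the hypothetical struct from the macro's doc example above.
let mut config = MyConfig::default();

// "field2" has no '.' suffix, so the generated `set` forwards ("", "42")
// to `usize::set`, which parses the string (see `config_field!` below).
config.set("field2", "42").unwrap();
assert_eq!(config.field2, 42);

// Unknown keys fall through to the `_config_err!` arm.
assert!(config.set("no_such_field", "1").is_err());
```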

config_namespace! {
    /// Options related to catalog and directory scanning
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct CatalogOptions {
        /// Whether the default catalog and schema should be created automatically.
        pub create_default_catalog_and_schema: bool, default = true

        /// The default catalog name - this impacts what SQL queries use if not specified
        pub default_catalog: String, default = "datafusion".to_string()

        /// The default schema name - this impacts what SQL queries use if not specified
        pub default_schema: String, default = "public".to_string()

        /// Should DataFusion provide access to `information_schema`
        /// virtual tables for displaying schema information
        pub information_schema: bool, default = false

        /// Location scanned to load tables for `default` schema
        pub location: Option<String>, default = None

        /// Type of `TableProvider` to use when loading `default` schema
        pub format: Option<String>, default = None

        /// Default value for `format.has_header` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        pub has_header: bool, default = true

        /// Specifies whether newlines in (quoted) CSV values are supported.
        ///
        /// This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        ///
        /// Parsing newlines in quoted values may be affected by execution behaviour such as
        /// parallel file scanning. Setting this to `true` ensures that newlines in values are
        /// parsed successfully, which may reduce performance.
        pub newlines_in_values: bool, default = false
    }
}
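
When visited from `ConfigOptions`, this namespace lives under the `datafusion.catalog` prefix, so the fields above are reachable through the string-keyed API. A sketch using the public `ConfigOptions::set` defined later in this file:

```rust
use datafusion_common::config::ConfigOptions;

let mut options = ConfigOptions::new();
options.set("datafusion.catalog.information_schema", "true").unwrap();
options.set("datafusion.catalog.default_schema", "my_schema").unwrap();

assert!(options.catalog.information_schema);
assert_eq!(options.catalog.default_schema, "my_schema");
```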

config_namespace! {
    /// Options related to SQL parser
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct SqlParserOptions {
        /// When set to true, SQL parser will parse float as decimal type
        pub parse_float_as_decimal: bool, default = false

        /// When set to true, SQL parser will normalize identifiers (convert them to lowercase when not quoted)
        pub enable_ident_normalization: bool, default = true

        /// When set to true, SQL parser will normalize options values (convert them to lowercase)
        pub enable_options_value_normalization: bool, default = true

        /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
        /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
        pub dialect: String, default = "generic".to_string()

        /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
        /// ignore the length. If false, error if a `VARCHAR` with a length is
        /// specified. The Arrow type system does not have a notion of maximum
        /// string length and thus DataFusion cannot enforce such limits.
        pub support_varchar_with_length: bool, default = true
    }
}
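
For instance, switching the parser dialect is just another key under this namespace. A sketch; note the dialect string is stored verbatim here and interpreted elsewhere by the SQL frontend:

```rust
use datafusion_common::config::ConfigOptions;

let mut options = ConfigOptions::new();
// Stored as-is; interpretation of the dialect name happens in the SQL planner.
options.set("datafusion.sql_parser.dialect", "PostgreSQL").unwrap();
assert_eq!(options.sql_parser.dialect, "PostgreSQL");
```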

config_namespace! {
    /// Options related to query execution
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct ExecutionOptions {
        /// Default batch size while creating new batches, it's especially useful for
        /// buffer-in-memory batches since creating tiny batches would result in too much
        /// metadata memory consumption
        pub batch_size: usize, default = 8192

        /// When set to true, record batches will be examined between each operator and
        /// small batches will be coalesced into larger batches. This is helpful when there
        /// are highly selective filters or joins that could produce tiny output batches. The
        /// target batch size is determined by the configuration setting
        pub coalesce_batches: bool, default = true

        /// Should DataFusion collect statistics after listing files
        pub collect_statistics: bool, default = false

        /// Number of partitions for query execution. Increasing partitions can increase
        /// concurrency.
        ///
        /// Defaults to the number of CPU cores on the system
        pub target_partitions: usize, default = num_cpus::get()

        /// The default time zone
        ///
        /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
        /// according to this time zone, and then extract the hour
        pub time_zone: Option<String>, default = Some("+00:00".into())

        /// Parquet options
        pub parquet: ParquetOptions, default = Default::default()

        /// Fan-out during initial physical planning.
        ///
        /// This is mostly used to plan `UNION` children in parallel.
        ///
        /// Defaults to the number of CPU cores on the system
        pub planning_concurrency: usize, default = num_cpus::get()

        /// Specifies the reserved memory for each spillable sort operation to
        /// facilitate an in-memory merge.
        ///
        /// When a sort operation spills to disk, the in-memory data must be
        /// sorted and merged before being written to a file. This setting reserves
        /// a specific amount of memory for that in-memory sort/merge process.
        ///
        /// Note: This setting is irrelevant if the sort operation cannot spill
        /// (i.e., if there's no `DiskManager` configured).
        pub sort_spill_reservation_bytes: usize, default = 10 * 1024 * 1024

        /// When sorting, below what size should data be concatenated
        /// and sorted in a single RecordBatch rather than sorted in
        /// batches and merged.
        pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024

        /// Number of files to read in parallel when inferring schema and statistics
        pub meta_fetch_concurrency: usize, default = 32

        /// Guarantees a minimum level of output files running in parallel.
        /// RecordBatches will be distributed in round robin fashion to each
        /// parallel writer. Each writer is closed and a new file opened once
        /// soft_max_rows_per_output_file is reached.
        pub minimum_parallel_output_files: usize, default = 4

        /// Target number of rows in output files when writing multiple.
        /// This is a soft max, so it can be exceeded slightly. There also
        /// will be one file smaller than the limit if the total
        /// number of rows written is not roughly divisible by the soft max
        pub soft_max_rows_per_output_file: usize, default = 50000000

        /// This is the maximum number of RecordBatches buffered
        /// for each output file being worked on. Higher values can potentially
        /// give faster write performance at the cost of higher peak
        /// memory consumption
        pub max_buffered_batches_per_output_file: usize, default = 2

        /// Should sub directories be ignored when scanning directories for data
        /// files. Defaults to true (ignores subdirectories), consistent with
        /// Hive. Note that this setting does not affect reading partitioned
        /// tables (e.g. `/table/year=2021/month=01/data.parquet`).
        pub listing_table_ignore_subdirectory: bool, default = true

        /// Should DataFusion support recursive CTEs
        pub enable_recursive_ctes: bool, default = true

        /// Attempt to eliminate sorts by packing & sorting files with non-overlapping
        /// statistics into the same file groups.
        /// Currently experimental
        pub split_file_groups_by_statistics: bool, default = false

        /// Should DataFusion keep the columns used for partition_by in the output RecordBatches
        pub keep_partition_by_columns: bool, default = false

        /// Aggregation ratio (number of distinct groups / number of input rows)
        /// threshold for skipping partial aggregation. If the ratio exceeds this
        /// threshold, partial aggregation will skip aggregation for further input
        pub skip_partial_aggregation_probe_ratio_threshold: f64, default = 0.8

        /// Number of input rows partial aggregation partition should process, before
        /// aggregation ratio check and trying to switch to skipping aggregation mode
        pub skip_partial_aggregation_probe_rows_threshold: usize, default = 100_000

        /// Should DataFusion use row number estimates at the input to decide
        /// whether increasing parallelism is beneficial or not. By default,
        /// only exact row numbers (not estimates) are used for this decision.
        /// Setting this flag to `true` will likely produce better plans
        /// if the source of statistics is accurate.
        /// We plan to make this the default in the future.
        pub use_row_number_estimates_to_optimize_partitioning: bool, default = false
    }
}
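
Because the generated fields are `pub`, execution options can be mutated either directly or through the string API; the two are equivalent, as in this sketch:

```rust
use datafusion_common::config::ConfigOptions;

let mut options = ConfigOptions::new();

// Through the typed field...
options.execution.batch_size = 16384;

// ...or through the string-keyed API, which parses "16384" via
// `usize::from_str` (see the `config_field!` impls later in this file).
options.set("datafusion.execution.batch_size", "16384").unwrap();
assert_eq!(options.execution.batch_size, 16384);
```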

config_namespace! {
    /// Options for reading and writing parquet files
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct ParquetOptions {
        // The following options affect reading parquet files

        /// (reading) If true, reads the Parquet data page level metadata (the
        /// Page Index), if present, to reduce the I/O and number of
        /// rows decoded.
        pub enable_page_index: bool, default = true

        /// (reading) If true, the parquet reader attempts to skip entire row groups based
        /// on the predicate in the query and the metadata (min/max values) stored in
        /// the parquet file
        pub pruning: bool, default = true

        /// (reading) If true, the parquet reader skips the optional embedded metadata that may be in
        /// the file Schema. This setting can help avoid schema conflicts when querying
        /// multiple parquet files with schemas containing compatible types but different metadata
        pub skip_metadata: bool, default = true

        /// (reading) If specified, the parquet reader will try and fetch the last `size_hint`
        /// bytes of the parquet file optimistically. If not specified, two reads are required:
        /// One read to fetch the 8-byte parquet footer and
        /// another to fetch the metadata length encoded in the footer
        pub metadata_size_hint: Option<usize>, default = None

        /// (reading) If true, filter expressions are applied during the parquet decoding operation to
        /// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
        pub pushdown_filters: bool, default = false

        /// (reading) If true, filter expressions evaluated during the parquet decoding operation
        /// will be reordered heuristically to minimize the cost of evaluation. If false,
        /// the filters are applied in the same order as written in the query
        pub reorder_filters: bool, default = false

        /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
        /// and `Binary/BinaryLarge` with `BinaryView`.
        pub schema_force_view_types: bool, default = false

        // The following options affect writing to parquet files
        // and map to parquet::file::properties::WriterProperties

        /// (writing) Sets best effort maximum size of data page in bytes
        pub data_pagesize_limit: usize, default = 1024 * 1024

        /// (writing) Sets write_batch_size in bytes
        pub write_batch_size: usize, default = 1024

        /// (writing) Sets parquet writer version
        /// valid values are "1.0" and "2.0"
        pub writer_version: String, default = "1.0".to_string()

        /// (writing) Sets default parquet compression codec.
        /// Valid values are: uncompressed, snappy, gzip(level),
        /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
        ///
        /// Note that this default setting is not the same as
        /// the default parquet writer setting.
        pub compression: Option<String>, default = Some("zstd(3)".into())

        /// (writing) Sets if dictionary encoding is enabled. If NULL, uses
        /// default parquet writer setting
        pub dictionary_enabled: Option<bool>, default = Some(true)

        /// (writing) Sets best effort maximum dictionary page size, in bytes
        pub dictionary_page_size_limit: usize, default = 1024 * 1024

        /// (writing) Sets if statistics are enabled for any column
        /// Valid values are: "none", "chunk", and "page"
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
        pub statistics_enabled: Option<String>, default = Some("page".into())

        /// (writing) Sets max statistics size for any column. If NULL, uses
        /// default parquet writer setting
        pub max_statistics_size: Option<usize>, default = Some(4096)

        /// (writing) Target maximum number of rows in each row group (defaults to 1M
        /// rows). Writing larger row groups requires more memory to write, but
        /// can get better compression and be faster to read.
        pub max_row_group_size: usize, default = 1024 * 1024

        /// (writing) Sets "created by" property
        pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into()

        /// (writing) Sets column index truncate length
        pub column_index_truncate_length: Option<usize>, default = Some(64)

        /// (writing) Sets best effort maximum number of rows in data page
        pub data_page_row_count_limit: usize, default = 20_000

        /// (writing) Sets default encoding for any column.
        /// Valid values are: plain, plain_dictionary, rle,
        /// bit_packed, delta_binary_packed, delta_length_byte_array,
        /// delta_byte_array, rle_dictionary, and byte_stream_split.
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
        pub encoding: Option<String>, default = None

        /// (reading) Use any available bloom filters when reading parquet files
        pub bloom_filter_on_read: bool, default = true

        /// (writing) Write bloom filters for all columns when creating parquet files
        pub bloom_filter_on_write: bool, default = false

        /// (writing) Sets bloom filter false positive probability. If NULL, uses
        /// default parquet writer setting
        pub bloom_filter_fpp: Option<f64>, default = None

        /// (writing) Sets bloom filter number of distinct values. If NULL, uses
        /// default parquet writer setting
        pub bloom_filter_ndv: Option<u64>, default = None

        /// (writing) Controls whether DataFusion will attempt to speed up writing
        /// parquet files by serializing them in parallel. Each column
        /// in each row group in each output file is serialized in parallel
        /// leveraging a maximum possible core count of n_files*n_row_groups*n_columns.
        pub allow_single_file_parallelism: bool, default = true

        /// (writing) By default parallel parquet writer is tuned for minimum
        /// memory usage in a streaming execution plan. You may see
        /// a performance benefit when writing large parquet files
        /// by increasing maximum_parallel_row_group_writers and
        /// maximum_buffered_record_batches_per_stream if your system
        /// has idle cores and can tolerate additional memory usage.
        /// Boosting these values is likely worthwhile when
        /// writing out already in-memory data, such as from a cached
        /// data frame.
        pub maximum_parallel_row_group_writers: usize, default = 1

        /// (writing) By default parallel parquet writer is tuned for minimum
        /// memory usage in a streaming execution plan. You may see
        /// a performance benefit when writing large parquet files
        /// by increasing maximum_parallel_row_group_writers and
        /// maximum_buffered_record_batches_per_stream if your system
        /// has idle cores and can tolerate additional memory usage.
        /// Boosting these values is likely worthwhile when
        /// writing out already in-memory data, such as from a cached
        /// data frame.
        pub maximum_buffered_record_batches_per_stream: usize, default = 2
    }
}
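
`ParquetOptions` is itself nested inside `ExecutionOptions` (the `parquet` field above), so its keys sit under the `datafusion.execution.parquet` prefix: the generated `set` peels off one path segment per namespace. A sketch:

```rust
use datafusion_common::config::ConfigOptions;

let mut options = ConfigOptions::new();
options.set("datafusion.execution.parquet.compression", "snappy").unwrap();
options.set("datafusion.execution.parquet.pushdown_filters", "true").unwrap();

assert_eq!(options.execution.parquet.compression.as_deref(), Some("snappy"));
assert!(options.execution.parquet.pushdown_filters);
```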

config_namespace! {
    /// Options related to query optimization
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct OptimizerOptions {
        /// When set to true, the optimizer will push a limit operation into
        /// grouped aggregations which have no aggregate expressions, as a soft limit,
        /// emitting groups once the limit is reached, before all rows in the group are read.
        pub enable_distinct_aggregation_soft_limit: bool, default = true

        /// When set to true, the physical plan optimizer will try to add round robin
        /// repartitioning to increase parallelism to leverage more CPU cores
        pub enable_round_robin_repartition: bool, default = true

        /// When set to true, the optimizer will attempt to perform limit operations
        /// during aggregations, if possible
        pub enable_topk_aggregation: bool, default = true

        /// When set to true, the optimizer will insert filters before a join between
        /// a nullable and non-nullable column to filter out nulls on the nullable side. This
        /// filter can add additional overhead when the file format does not fully support
        /// predicate push down.
        pub filter_null_join_keys: bool, default = false

        /// Should DataFusion repartition data using the aggregate keys to execute aggregates
        /// in parallel using the provided `target_partitions` level
        pub repartition_aggregations: bool, default = true

        /// Minimum total files size in bytes to perform file scan repartitioning.
        pub repartition_file_min_size: usize, default = 10 * 1024 * 1024

        /// Should DataFusion repartition data using the join keys to execute joins in parallel
        /// using the provided `target_partitions` level
        pub repartition_joins: bool, default = true

        /// Should DataFusion allow symmetric hash joins for unbounded data sources even when
        /// its inputs do not have any ordering or filtering. If the flag is not enabled,
        /// the SymmetricHashJoin operator will be unable to prune its internal buffers,
        /// resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right,
        /// RightAnti, and RightSemi - being produced only at the end of the execution.
        /// This is not typical in stream processing. Additionally, without proper design for
        /// long-running execution, all types of joins may encounter out-of-memory errors.
        pub allow_symmetric_joins_without_pruning: bool, default = true

        /// When set to `true`, file groups will be repartitioned to achieve maximum parallelism.
        /// Currently Parquet and CSV formats are supported.
        ///
        /// If set to `true`, all files will be repartitioned evenly (i.e., a single large file
        /// might be partitioned into smaller chunks) for parallel scanning.
        /// If set to `false`, different files will be read in parallel, but repartitioning won't
        /// happen within a single file.
        pub repartition_file_scans: bool, default = true

        /// Should DataFusion repartition data using the partitions keys to execute window
        /// functions in parallel using the provided `target_partitions` level
        pub repartition_windows: bool, default = true

        /// Should DataFusion execute sorts in a per-partition fashion and merge
        /// afterwards instead of coalescing first and sorting globally.
        /// When this flag is enabled, plans in the form below
        ///
        /// ```text
        ///      "SortExec: [a@0 ASC]",
        ///      "  CoalescePartitionsExec",
        ///      "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
        /// ```
        /// would turn into the plan below which performs better in multithreaded environments
        ///
        /// ```text
        ///      "SortPreservingMergeExec: [a@0 ASC]",
        ///      "  SortExec: [a@0 ASC]",
        ///      "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
        /// ```
        pub repartition_sorts: bool, default = true

        /// When true, DataFusion will opportunistically remove sorts when the data is already sorted,
        /// (i.e. setting `preserve_order` to true on `RepartitionExec` and
        /// using `SortPreservingMergeExec`)
        ///
        /// When false, DataFusion will maximize plan parallelism using
        /// `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`.
        pub prefer_existing_sort: bool, default = false

        /// When set to true, the logical plan optimizer will produce warning
        /// messages if any optimization rules produce errors and then proceed to the next
        /// rule. When set to false, any rules that produce errors will cause the query to fail
        pub skip_failed_rules: bool, default = false

        /// Number of times that the optimizer will attempt to optimize the plan
        pub max_passes: usize, default = 3

        /// When set to true, the physical plan optimizer will run a top down
        /// process to reorder the join keys
        pub top_down_join_key_reordering: bool, default = true

        /// When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin.
        /// HashJoin can work more efficiently than SortMergeJoin but consumes more memory
        pub prefer_hash_join: bool, default = true

        /// The maximum estimated size in bytes for one input side of a HashJoin
        /// will be collected into a single partition
        pub hash_join_single_partition_threshold: usize, default = 1024 * 1024

        /// The maximum estimated size in rows for one input side of a HashJoin
        /// will be collected into a single partition
        pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128

        /// The default filter selectivity used by Filter Statistics
        /// when an exact selectivity cannot be determined. Valid values are
        /// between 0 (no selectivity) and 100 (all rows are selected).
        pub default_filter_selectivity: u8, default = 20

        /// When set to true, the optimizer will not attempt to convert Union to Interleave
        pub prefer_existing_union: bool, default = false

        /// When set to true, if the returned type is a view type
        /// then the output will be coerced to a non-view.
        /// Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.
        pub expand_views_at_output: bool, default = false
    }
}
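
As a concrete reading of `default_filter_selectivity`: the value is a percentage, so the default of 20 makes the statistics code assume a filter with unknown selectivity keeps roughly 20% of its input rows. A sketch of adjusting it via the string API:

```rust
use datafusion_common::config::ConfigOptions;

let mut options = ConfigOptions::new();
// 50 => assume filters with unknown selectivity keep about half their input.
options.set("datafusion.optimizer.default_filter_selectivity", "50").unwrap();
assert_eq!(options.optimizer.default_filter_selectivity, 50);
```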

config_namespace! {
    /// Options controlling explain output
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct ExplainOptions {
        /// When set to true, the explain statement will only print logical plans
        pub logical_plan_only: bool, default = false

        /// When set to true, the explain statement will only print physical plans
        pub physical_plan_only: bool, default = false

        /// When set to true, the explain statement will print operator statistics
        /// for physical plans
        pub show_statistics: bool, default = false

        /// When set to true, the explain statement will print the partition sizes
        pub show_sizes: bool, default = true

        /// When set to true, the explain statement will print schema information
        pub show_schema: bool, default = false
    }
}

/// A key value pair, with a corresponding description
#[derive(Debug)]
pub struct ConfigEntry {
    /// A unique string to identify this config value
    pub key: String,

    /// The value if any
    pub value: Option<String>,

    /// A description of this configuration entry
    pub description: &'static str,
}

/// Configuration options struct, able to store both built-in configuration and custom options
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct ConfigOptions {
    /// Catalog options
    pub catalog: CatalogOptions,
    /// Execution options
    pub execution: ExecutionOptions,
    /// Optimizer options
    pub optimizer: OptimizerOptions,
    /// SQL parser options
    pub sql_parser: SqlParserOptions,
    /// Explain options
    pub explain: ExplainOptions,
    /// Optional extensions registered using [`Extensions::insert`]
    pub extensions: Extensions,
}

impl ConfigField for ConfigOptions {
    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        // Extensions are handled in the public `ConfigOptions::set`
        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
        match key {
            "catalog" => self.catalog.set(rem, value),
            "execution" => self.execution.set(rem, value),
            "optimizer" => self.optimizer.set(rem, value),
            "explain" => self.explain.set(rem, value),
            "sql_parser" => self.sql_parser.set(rem, value),
            _ => _config_err!("Config value \"{key}\" not found on ConfigOptions"),
        }
    }

    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
        self.catalog.visit(v, "datafusion.catalog", "");
        self.execution.visit(v, "datafusion.execution", "");
        self.optimizer.visit(v, "datafusion.optimizer", "");
        self.explain.visit(v, "datafusion.explain", "");
        self.sql_parser.visit(v, "datafusion.sql_parser", "");
    }
}

impl ConfigOptions {
    /// Creates a new [`ConfigOptions`] with default values
    pub fn new() -> Self {
        Self::default()
    }

    /// Set extensions to provided value
    pub fn with_extensions(mut self, extensions: Extensions) -> Self {
        self.extensions = extensions;
        self
    }

    /// Set a configuration option
    pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
        let Some((prefix, key)) = key.split_once('.') else {
            return _config_err!("could not find config namespace for key \"{key}\"");
        };

        if prefix == "datafusion" {
            return ConfigField::set(self, key, value);
        }

        let Some(e) = self.extensions.0.get_mut(prefix) else {
            return _config_err!("Could not find config namespace \"{prefix}\"");
        };
        e.0.set(key, value)
    }

    /// Create new ConfigOptions struct, taking values from
    /// environment variables where possible.
    ///
    /// For example, setting `DATAFUSION_EXECUTION_BATCH_SIZE` will
    /// control `datafusion.execution.batch_size`.
    pub fn from_env() -> Result<Self> {
        struct Visitor(Vec<String>);

        impl Visit for Visitor {
            fn some<V: Display>(&mut self, key: &str, _: V, _: &'static str) {
                self.0.push(key.to_string())
            }

            fn none(&mut self, key: &str, _: &'static str) {
                self.0.push(key.to_string())
            }
        }

        // Extract the names of all fields and then look up the corresponding
        // environment variables. This isn't hugely efficient but avoids
        // ambiguity between `a.b` and `a_b` which would both correspond
        // to an environment variable of `A_B`

        let mut keys = Visitor(vec![]);
        let mut ret = Self::default();
        ret.visit(&mut keys, "datafusion", "");

        for key in keys.0 {
            let env = key.to_uppercase().replace('.', "_");
            if let Some(var) = std::env::var_os(env) {
                ret.set(&key, var.to_string_lossy().as_ref())?;
            }
        }

        Ok(ret)
    }

    /// Create new ConfigOptions struct, taking values from a string hash map.
    ///
    /// Only the built-in configurations will be extracted from the hash map
    /// and other key value pairs will be ignored.
    pub fn from_string_hash_map(settings: &HashMap<String, String>) -> Result<Self> {
        struct Visitor(Vec<String>);

        impl Visit for Visitor {
            fn some<V: Display>(&mut self, key: &str, _: V, _: &'static str) {
                self.0.push(key.to_string())
            }

            fn none(&mut self, key: &str, _: &'static str) {
                self.0.push(key.to_string())
            }
        }

        let mut keys = Visitor(vec![]);
        let mut ret = Self::default();
        ret.visit(&mut keys, "datafusion", "");

        for key in keys.0 {
            if let Some(var) = settings.get(&key) {
                ret.set(&key, var)?;
            }
        }

        Ok(ret)
    }

    /// Returns the [`ConfigEntry`] stored within this [`ConfigOptions`]
    pub fn entries(&self) -> Vec<ConfigEntry> {
        struct Visitor(Vec<ConfigEntry>);

        impl Visit for Visitor {
            fn some<V: Display>(
                &mut self,
                key: &str,
                value: V,
                description: &'static str,
            ) {
                self.0.push(ConfigEntry {
                    key: key.to_string(),
                    value: Some(value.to_string()),
                    description,
                })
            }

            fn none(&mut self, key: &str, description: &'static str) {
                self.0.push(ConfigEntry {
                    key: key.to_string(),
                    value: None,
                    description,
                })
            }
        }

        let mut v = Visitor(vec![]);
        self.visit(&mut v, "datafusion", "");

        v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries()));
        v.0
    }

    /// Generate documentation that can be included in the user guide
    pub fn generate_config_markdown() -> String {
        use std::fmt::Write as _;

        let mut s = Self::default();

        // Normalize for display
        s.execution.target_partitions = 0;
        s.execution.planning_concurrency = 0;

        let mut docs = "| key | default | description |\n".to_string();
        docs += "|-----|---------|-------------|\n";
        let mut entries = s.entries();
        entries.sort_unstable_by(|a, b| a.key.cmp(&b.key));

        // Iterate the sorted entries (calling `s.entries()` again here
        // would discard the sort above).
        for entry in entries {
            let _ = writeln!(
                &mut docs,
                "| {} | {} | {} |",
                entry.key,
                entry.value.as_deref().unwrap_or("NULL"),
                entry.description
            );
        }
        docs
    }
}
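
Putting the constructors above together, a small sketch. The environment variable name follows the uppercase/underscore mapping described in `from_env`, and `set_var` is used here purely for illustration:

```rust
use std::collections::HashMap;
use datafusion_common::config::ConfigOptions;

// 1. From the environment: DATAFUSION_EXECUTION_BATCH_SIZE maps to
//    datafusion.execution.batch_size.
std::env::set_var("DATAFUSION_EXECUTION_BATCH_SIZE", "4096");
let from_env = ConfigOptions::from_env().unwrap();
assert_eq!(from_env.execution.batch_size, 4096);

// 2. From a string hash map; keys that don't name a built-in option
//    are ignored rather than rejected.
let mut settings = HashMap::new();
settings.insert(
    "datafusion.execution.batch_size".to_string(),
    "1024".to_string(),
);
let from_map = ConfigOptions::from_string_hash_map(&settings).unwrap();
assert_eq!(from_map.execution.batch_size, 1024);

// 3. `entries` flattens the whole tree into key/value/description triples.
let entry = from_map
    .entries()
    .into_iter()
    .find(|e| e.key == "datafusion.execution.batch_size")
    .unwrap();
assert_eq!(entry.value.as_deref(), Some("1024"));
```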

/// [`ConfigExtension`] provides a mechanism to store third-party configuration within DataFusion
///
/// Unfortunately associated constants are not currently object-safe, and so this
/// extends the object-safe [`ExtensionOptions`]
pub trait ConfigExtension: ExtensionOptions {
    /// Configuration namespace prefix to use
    ///
    /// All values under this will be prefixed with `$PREFIX + "."`
    const PREFIX: &'static str;
}

/// An object-safe API for storing arbitrary configuration
pub trait ExtensionOptions: Send + Sync + std::fmt::Debug + 'static {
    /// Return `self` as [`Any`]
    ///
    /// This is needed until trait upcasting is stabilised
    fn as_any(&self) -> &dyn Any;

    /// Return `self` as [`Any`]
    ///
    /// This is needed until trait upcasting is stabilised
    fn as_any_mut(&mut self) -> &mut dyn Any;

    /// Return a deep clone of this [`ExtensionOptions`]
    ///
    /// It is important this does not share mutable state to avoid consistency issues
    /// with configuration changing whilst queries are executing
    fn cloned(&self) -> Box<dyn ExtensionOptions>;

    /// Set the given `key`, `value` pair
    fn set(&mut self, key: &str, value: &str) -> Result<()>;

    /// Returns the [`ConfigEntry`] stored in this [`ExtensionOptions`]
    fn entries(&self) -> Vec<ConfigEntry>;
}

/// A type-safe container for [`ConfigExtension`]
#[derive(Debug, Default, Clone)]
pub struct Extensions(BTreeMap<&'static str, ExtensionBox>);

impl Extensions {
    /// Create a new, empty [`Extensions`]
    pub fn new() -> Self {
        Self(BTreeMap::new())
    }

    /// Registers a [`ConfigExtension`] with this [`Extensions`] container
    pub fn insert<T: ConfigExtension>(&mut self, extension: T) {
        assert_ne!(T::PREFIX, "datafusion");
        let e = ExtensionBox(Box::new(extension));
        self.0.insert(T::PREFIX, e);
    }

    /// Retrieves the extension of the given type if any
    pub fn get<T: ConfigExtension>(&self) -> Option<&T> {
        self.0.get(T::PREFIX)?.0.as_any().downcast_ref()
    }

    /// Retrieves a mutable reference to the extension of the given type if any
    pub fn get_mut<T: ConfigExtension>(&mut self) -> Option<&mut T> {
        let e = self.0.get_mut(T::PREFIX)?;
        e.0.as_any_mut().downcast_mut()
    }
}

#[derive(Debug)]
struct ExtensionBox(Box<dyn ExtensionOptions>);

impl Clone for ExtensionBox {
    fn clone(&self) -> Self {
        Self(self.0.cloned())
    }
}

/// A trait implemented by `config_namespace` and for field types that provides
/// the ability to walk and mutate the configuration tree
pub trait ConfigField {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str);

    fn set(&mut self, key: &str, value: &str) -> Result<()>;
}

impl<F: ConfigField + Default> ConfigField for Option<F> {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
        match self {
            Some(s) => s.visit(v, key, description),
            None => v.none(key, description),
        }
    }

    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        self.get_or_insert_with(Default::default).set(key, value)
    }
}

#[macro_export]
macro_rules! config_field {
    ($t:ty) => {
        impl ConfigField for $t {
            fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
                v.some(key, self, description)
            }

            fn set(&mut self, _: &str, value: &str) -> Result<()> {
                *self = value.parse().map_err(|e| {
                    DataFusionError::Context(
                        format!(concat!("Error parsing {} as ", stringify!($t)), value),
                        Box::new(DataFusionError::External(Box::new(e))),
                    )
                })?;
                Ok(())
            }
        }
    };
}

config_field!(String);
config_field!(bool);
config_field!(usize);
config_field!(f64);
config_field!(u64);

impl ConfigField for u8 {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
        v.some(key, self, description)
    }

    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        if value.is_empty() {
            return Err(DataFusionError::Configuration(format!(
                "Input string for {} key is empty",
                key
            )));
        }
        // Check if the string is a valid number
        if let Ok(num) = value.parse::<u8>() {
            // TODO: Let's decide how we treat the numerical strings.
            *self = num;
        } else {
            let bytes = value.as_bytes();
            // Check if the first character is ASCII (single byte)
            if bytes.len() > 1 || !value.chars().next().unwrap().is_ascii() {
                return Err(DataFusionError::Configuration(format!(
                    "Error parsing {} as u8. Non-ASCII string provided",
                    value
                )));
            }
            *self = bytes[0];
        }
        Ok(())
    }
}
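
This impl accepts either a numeric string or a single ASCII character, which is what lets a delimiter-style option be written as ";" rather than "59". A quick sketch of both paths, calling the trait method directly with an empty remaining key:

```rust
use datafusion_common::config::ConfigField;

let mut byte: u8 = b',';

// Numeric strings parse via `u8::from_str` (59 is the code point of ';')...
ConfigField::set(&mut byte, "", "59").unwrap();
assert_eq!(byte, b';');

// ...and a single ASCII character is taken as its byte value.
ConfigField::set(&mut byte, "", "|").unwrap();
assert_eq!(byte, b'|');
```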

impl ConfigField for CompressionTypeVariant {
    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
        v.some(key, self, description)
    }

    fn set(&mut self, _: &str, value: &str) -> Result<()> {
        *self = CompressionTypeVariant::from_str(value)?;
        Ok(())
    }
}

/// An implementation trait used to recursively walk configuration
pub trait Visit {
    fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str);

    fn none(&mut self, key: &str, description: &'static str);
}

/// Convenience macro to create [`ExtensionOptions`].
///
/// The created structure implements the following traits:
///
/// - [`Clone`]
/// - [`Debug`]
/// - [`Default`]
/// - [`ExtensionOptions`]
///
/// # Usage
/// The syntax is:
///
/// ```text
/// extensions_options! {
///      /// Struct docs (optional).
///     [<vis>] struct <StructName> {
///         /// Field docs (optional)
///         [<vis>] <field_name>: <field_type>, default = <default_value>
///
///         ... more fields
///     }
/// }
/// ```
///
/// The placeholders are:
/// - `[<vis>]`: Optional visibility modifier like `pub` or `pub(crate)`.
/// - `<StructName>`: Struct name like `MyStruct`.
/// - `<field_name>`: Field name like `my_field`.
/// - `<field_type>`: Field type like `u8`.
/// - `<default_value>`: Default value matching the field type like `42`.
///
/// # Example
/// ```
/// use datafusion_common::extensions_options;
///
/// extensions_options! {
///     /// My own config options.
///     pub struct MyConfig {
///         /// Should "foo" be replaced by "bar"?
///         pub foo_to_bar: bool, default = true
///
///         /// How many "baz" should be created?
///         pub baz_count: usize, default = 1337
///     }
/// }
/// ```
///
/// [`Debug`]: std::fmt::Debug
/// [`ExtensionOptions`]: crate::config::ExtensionOptions
#[macro_export]
macro_rules! extensions_options {
    (
     $(#[doc = $struct_d:tt])*
     $vis:vis struct $struct_name:ident {
        $(
        $(#[doc = $d:tt])*
        $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
        )*$(,)*
    }
    ) => {
        $(#[doc = $struct_d])*
        #[derive(Debug, Clone)]
        #[non_exhaustive]
        $vis struct $struct_name{
            $(
            $(#[doc = $d])*
            $field_vis $field_name : $field_type,
            )*
        }

        impl Default for $struct_name {
            fn default() -> Self {
                Self {
                    $($field_name: $default),*
                }
            }
        }

        impl $crate::config::ExtensionOptions for $struct_name {
            fn as_any(&self) -> &dyn ::std::any::Any {
                self
            }

            fn as_any_mut(&mut self) -> &mut dyn ::std::any::Any {
                self
            }

            fn cloned(&self) -> Box<dyn $crate::config::ExtensionOptions> {
                Box::new(self.clone())
            }

            fn set(&mut self, key: &str, value: &str) -> $crate::Result<()> {
                match key {
                    $(
                       stringify!($field_name) => {
                        self.$field_name = value.parse().map_err(|e| {
                            $crate::DataFusionError::Context(
                                format!(concat!("Error parsing {} as ", stringify!($field_type)), value),
                                Box::new($crate::DataFusionError::External(Box::new(e))),
                            )
                        })?;
                        Ok(())
                       }
                    )*
                    _ => Err($crate::DataFusionError::Configuration(
                        format!(concat!("Config value \"{}\" not found on ", stringify!($struct_name)), key)
                    ))
                }
            }

            fn entries(&self) -> Vec<$crate::config::ConfigEntry> {
                vec![
                    $(
                        $crate::config::ConfigEntry {
                            key: stringify!($field_name).to_owned(),
                            value: (self.$field_name != $default).then(|| self.$field_name.to_string()),
                            description: concat!($($d),*).trim(),
                        },
                    )*
                ]
            }
        }
    }
}
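
Note the macro implements `ExtensionOptions` but not `ConfigExtension`; to register the options with `Extensions` you supply the one-line `PREFIX` impl yourself. A sketch under that assumption, using a hypothetical `my_ext` namespace:

```rust
use datafusion_common::config::{ConfigExtension, ConfigOptions, Extensions};
use datafusion_common::extensions_options;

extensions_options! {
    /// Options for a hypothetical "my_ext" component.
    pub struct MyExtConfig {
        /// Number of retries before giving up.
        pub retries: usize, default = 3
    }
}

// `ConfigExtension` only adds the namespace prefix.
impl ConfigExtension for MyExtConfig {
    const PREFIX: &'static str = "my_ext";
}

let mut extensions = Extensions::new();
extensions.insert(MyExtConfig::default());

let mut options = ConfigOptions::new().with_extensions(extensions);
// Keys outside the "datafusion" prefix are routed to the matching extension.
options.set("my_ext.retries", "5").unwrap();
assert_eq!(options.extensions.get::<MyExtConfig>().unwrap().retries, 5);
```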

/// These file types have special built-in behavior for configuration.
/// Use TableOptions::Extensions for configuring other file types.
#[derive(Debug, Clone)]
pub enum ConfigFileType {
    CSV,
    #[cfg(feature = "parquet")]
    PARQUET,
    JSON,
}

/// Represents the configuration options available for handling different table formats within a data processing application.
/// This struct encompasses options for various file formats including CSV, Parquet, and JSON, allowing for flexible configuration
/// of parsing and writing behaviors specific to each format. Additionally, it supports extending functionality through custom extensions.
#[derive(Debug, Clone, Default)]
pub struct TableOptions {
    /// Configuration options for CSV file handling. This includes settings like the delimiter,
    /// quote character, and whether the first row is considered as headers.
    pub csv: CsvOptions,

    /// Configuration options for Parquet file handling. This includes settings for compression,
    /// encoding, and other Parquet-specific file characteristics.
    pub parquet: TableParquetOptions,

    /// Configuration options for JSON file handling.
    pub json: JsonOptions,

    /// The current file format that the table operations should assume. This option allows
    /// for dynamic switching between the supported file types (e.g., CSV, Parquet, JSON).
    pub current_format: Option<ConfigFileType>,

    /// Optional extensions that can be used to extend or customize the behavior of the table
    /// options. Extensions can be registered using `Extensions::insert` and might include
    /// custom file handling logic, additional configuration parameters, or other enhancements.
    pub extensions: Extensions,
}

impl ConfigField for TableOptions {
    /// Visits configuration settings for the current file format, or all formats if none is selected.
    ///
    /// This method adapts the behavior based on whether a file format is currently selected in `current_format`.
    /// If a format is selected, it visits only the settings relevant to that format. Otherwise,
    /// it visits all available format settings.
    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
        if let Some(file_type) = &self.current_format {
            match file_type {
                #[cfg(feature = "parquet")]
                ConfigFileType::PARQUET => self.parquet.visit(v, "format", ""),
                ConfigFileType::CSV => self.csv.visit(v, "format", ""),
                ConfigFileType::JSON => self.json.visit(v, "format", ""),
            }
        } else {
            self.csv.visit(v, "csv", "");
            self.parquet.visit(v, "parquet", "");
            self.json.visit(v, "json", "");
        }
    }

    /// Sets a configuration value for a specific key within `TableOptions`.
    ///
    /// This method delegates setting configuration values to the specific file format configurations,
    /// based on the current format selected. If no format is selected, it returns an error.
    ///
    /// # Parameters
    ///
    /// * `key`: The configuration key specifying which setting to adjust, prefixed with the format (e.g., "format.delimiter")
    ///   for CSV format.
    /// * `value`: The value to set for the specified configuration key.
    ///
    /// # Returns
    ///
    /// A result indicating success or an error if the key is not recognized, if a format is not specified,
    /// or if setting the configuration value fails for the specific format.
    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        // Extensions are handled in the public `ConfigOptions::set`
        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
        let Some(format) = &self.current_format else {
            return _config_err!("Specify a format for TableOptions");
        };
        match key {
            "format" => match format {
                #[cfg(feature = "parquet")]
                ConfigFileType::PARQUET => self.parquet.set(rem, value),
                ConfigFileType::CSV => self.csv.set(rem, value),
                ConfigFileType::JSON => self.json.set(rem, value),
            },
            _ => _config_err!("Config value \"{key}\" not found on TableOptions"),
        }
    }
}
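
Tying these pieces together, a sketch of configuring a CSV table with the public `TableOptions` API defined just below. The `delimiter` field is assumed here to live on `CsvOptions`, which is defined elsewhere in this crate:

```rust
use datafusion_common::config::{ConfigFileType, TableOptions};

let mut table_options = TableOptions::new();

// `set` refuses "format.*" keys until a format has been selected.
assert!(table_options.set("format.delimiter", ";").is_err());

table_options.set_config_format(ConfigFileType::CSV);
// Assuming `CsvOptions::delimiter` is a `u8` field, ";" is parsed by the
// single-ASCII-character `u8` impl shown earlier in this file.
table_options.set("format.delimiter", ";").unwrap();
assert_eq!(table_options.csv.delimiter, b';');
```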
1239
1240
impl TableOptions {
1241
    /// Constructs a new instance of `TableOptions` with default settings.
1242
    ///
1243
    /// # Returns
1244
    ///
1245
    /// A new `TableOptions` instance with default configuration values.
1246
0
    pub fn new() -> Self {
1247
0
        Self::default()
1248
0
    }
1249
1250
    /// Creates a new `TableOptions` instance initialized with settings from a given session config.
1251
    ///
1252
    /// # Parameters
1253
    ///
1254
    /// * `config`: A reference to the session `ConfigOptions` from which to derive initial settings.
1255
    ///
1256
    /// # Returns
1257
    ///
1258
    /// A new `TableOptions` instance with settings applied from the session config.
1259
0
    pub fn default_from_session_config(config: &ConfigOptions) -> Self {
1260
0
        let initial = TableOptions::default();
1261
0
        initial.combine_with_session_config(config);
1262
0
        initial
1263
0
    }

    /// Returns a new `TableOptions`, combining the current settings with those
    /// from a given session config; `self` is left unchanged.
    ///
    /// # Parameters
    ///
    /// * `config`: A reference to the session `ConfigOptions` whose settings are to be applied.
    ///
    /// # Returns
    ///
    /// A new `TableOptions` instance with updated settings from the session config.
    pub fn combine_with_session_config(&self, config: &ConfigOptions) -> Self {
        let mut clone = self.clone();
        clone.parquet.global = config.execution.parquet.clone();
        clone
    }

    /// Sets the file format for the table.
    ///
    /// # Parameters
    ///
    /// * `format`: The file format to use (e.g., CSV, Parquet).
    pub fn set_config_format(&mut self, format: ConfigFileType) {
        self.current_format = Some(format);
    }

    /// Sets the extensions for this `TableOptions` instance.
    ///
    /// # Parameters
    ///
    /// * `extensions`: The `Extensions` instance to set.
    ///
    /// # Returns
    ///
    /// A new `TableOptions` instance with the specified extensions applied.
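    ///
    /// # Example
    ///
    /// A minimal sketch (not compiled here); `MyExtensionConfig` stands in for
    /// any type implementing [`ConfigExtension`]:
    ///
    /// ```ignore
    /// let mut extensions = Extensions::new();
    /// extensions.insert(MyExtensionConfig::default());
    /// let opts = TableOptions::new().with_extensions(extensions);
    /// ```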
    pub fn with_extensions(mut self, extensions: Extensions) -> Self {
        self.extensions = extensions;
        self
    }

    /// Sets a specific configuration option.
    ///
    /// # Parameters
    ///
    /// * `key`: The configuration key (e.g., "format.delimiter").
    /// * `value`: The value to set for the specified key.
    ///
    /// # Returns
    ///
    /// A result indicating success or failure in setting the configuration option.
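    ///
    /// # Example
    ///
    /// A minimal sketch (not compiled here) of how the key prefix selects a
    /// namespace; the `test.` prefix assumes an extension registered under
    /// that name in `extensions`:
    ///
    /// ```ignore
    /// let mut opts = TableOptions::new().with_extensions(extensions);
    /// opts.set_config_format(ConfigFileType::CSV);
    /// opts.set("format.has_header", "true")?;           // format options
    /// opts.set("test.bootstrap.servers", "host:9092")?; // extension options
    /// ```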
    pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
        let Some((prefix, _)) = key.split_once('.') else {
            return _config_err!("could not find config namespace for key \"{key}\"");
        };

        if prefix == "format" {
            return ConfigField::set(self, key, value);
        }

        if prefix == "execution" {
            // "execution" keys belong to the session-level `ConfigOptions`;
            // accept and ignore them here
            return Ok(());
        }

        let Some(e) = self.extensions.0.get_mut(prefix) else {
            return _config_err!("Could not find config namespace \"{prefix}\"");
        };
        e.0.set(key, value)
    }

    /// Initializes a new `TableOptions` from a hash map of string settings.
    ///
    /// # Parameters
    ///
    /// * `settings`: A hash map where each key-value pair represents a configuration setting.
    ///
    /// # Returns
    ///
    /// A result containing the new `TableOptions` instance or an error if any setting could not be applied.
    pub fn from_string_hash_map(settings: &HashMap<String, String>) -> Result<Self> {
        let mut ret = Self::default();
        for (k, v) in settings {
            ret.set(k, v)?;
        }

        Ok(ret)
    }

    /// Modifies the current `TableOptions` instance with settings from a hash map.
    ///
    /// # Parameters
    ///
    /// * `settings`: A hash map where each key-value pair represents a configuration setting.
    ///
    /// # Returns
    ///
    /// A result indicating success or failure in applying the settings.
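    ///
    /// # Example
    ///
    /// A minimal sketch (not compiled here):
    ///
    /// ```ignore
    /// let mut opts = TableOptions::new();
    /// opts.set_config_format(ConfigFileType::CSV);
    /// let settings = HashMap::from([
    ///     ("format.delimiter".to_string(), ";".to_string()),
    /// ]);
    /// opts.alter_with_string_hash_map(&settings)?;
    /// ```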
    pub fn alter_with_string_hash_map(
        &mut self,
        settings: &HashMap<String, String>,
    ) -> Result<()> {
        for (k, v) in settings {
            self.set(k, v)?;
        }
        Ok(())
    }

    /// Retrieves all configuration entries from this `TableOptions`.
    ///
    /// # Returns
    ///
    /// A vector of `ConfigEntry` instances, representing all the configuration options within this `TableOptions`.
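    ///
    /// # Example
    ///
    /// A minimal sketch (not compiled here):
    ///
    /// ```ignore
    /// for entry in opts.entries() {
    ///     println!("{} = {:?}", entry.key, entry.value);
    /// }
    /// ```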
    pub fn entries(&self) -> Vec<ConfigEntry> {
        struct Visitor(Vec<ConfigEntry>);

        impl Visit for Visitor {
            fn some<V: Display>(
                &mut self,
                key: &str,
                value: V,
                description: &'static str,
            ) {
                self.0.push(ConfigEntry {
                    key: key.to_string(),
                    value: Some(value.to_string()),
                    description,
                })
            }

            fn none(&mut self, key: &str, description: &'static str) {
                self.0.push(ConfigEntry {
                    key: key.to_string(),
                    value: None,
                    description,
                })
            }
        }

        let mut v = Visitor(vec![]);
        self.visit(&mut v, "format", "");

        v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries()));
        v.0
    }
}

/// Options that control how Parquet files are read, including global options
/// that apply to all columns and optional column-specific overrides
///
/// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions).
/// Properties not included in [`TableParquetOptions`] may not be configurable at the external API
/// (e.g. sorting_columns).
#[derive(Clone, Default, Debug, PartialEq)]
pub struct TableParquetOptions {
    /// Global Parquet options that propagate to all columns.
    pub global: ParquetOptions,
    /// Column-specific options, keyed by column name. Default usage is parquet.XX::column.
    pub column_specific_options: HashMap<String, ParquetColumnOptions>,
    /// Additional file-level metadata to include. Inserted into the key_value_metadata
    /// for the written [`FileMetaData`](https://docs.rs/parquet/latest/parquet/file/metadata/struct.FileMetaData.html).
    ///
    /// Multiple entries are permitted
    /// ```sql
    /// OPTIONS (
    ///    'format.metadata::key1' '',
    ///    'format.metadata::key2' 'value',
    ///    'format.metadata::key3' 'value has spaces',
    ///    'format.metadata::key4' 'value has special chars :: :',
    ///    'format.metadata::key_dupe' 'original will be overwritten',
    ///    'format.metadata::key_dupe' 'final'
    /// )
    /// ```
    pub key_value_metadata: HashMap<String, Option<String>>,
}

impl TableParquetOptions {
    /// Return new default TableParquetOptions
    pub fn new() -> Self {
        Self::default()
    }
}

impl ConfigField for TableParquetOptions {
    fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, description: &'static str) {
        self.global.visit(v, key_prefix, description);
        self.column_specific_options
            .visit(v, key_prefix, description)
    }

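    /// Sets a parquet option. `metadata::<key>` keys set file-level metadata,
    /// other keys containing `::` set column-specific options, and all
    /// remaining keys set the global options.
    ///
    /// A minimal sketch of the routing (not compiled here; `opts` is a
    /// `TableParquetOptions`):
    ///
    /// ```ignore
    /// opts.set("metadata::owner", "analytics")?;       // key_value_metadata
    /// opts.set("bloom_filter_enabled::col1", "true")?; // column-specific
    /// opts.set("compression", "zstd(3)")?;             // global
    /// ```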
    fn set(&mut self, key: &str, value: &str) -> Result<()> {
        // Determine if the key is a global, metadata, or column-specific setting
        if key.starts_with("metadata::") {
            let k = match key.split("::").collect::<Vec<_>>()[..] {
                [_meta] | [_meta, ""] => {
                    return _config_err!(
                        "Invalid metadata key provided, missing key in metadata::<key>"
                    )
                }
                [_meta, k] => k.into(),
                _ => {
                    return _config_err!(
                        "Invalid metadata key provided, found too many '::' in \"{key}\""
                    )
                }
            };
            self.key_value_metadata.insert(k, Some(value.into()));
            Ok(())
        } else if key.contains("::") {
            self.column_specific_options.set(key, value)
        } else {
            self.global.set(key, value)
        }
    }
}

macro_rules! config_namespace_with_hashmap {
    (
     $(#[doc = $struct_d:tt])*
     $vis:vis struct $struct_name:ident {
        $(
        $(#[doc = $d:tt])*
        $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
        )*$(,)*
    }
    ) => {

        $(#[doc = $struct_d])*
        #[derive(Debug, Clone, PartialEq)]
        $vis struct $struct_name{
            $(
            $(#[doc = $d])*
            $field_vis $field_name : $field_type,
            )*
        }

        impl ConfigField for $struct_name {
            fn set(&mut self, key: &str, value: &str) -> Result<()> {
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
                match key {
                    $(
                       stringify!($field_name) => self.$field_name.set(rem, value),
                    )*
                    _ => _config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
                    )
                }
            }

            fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                $(
                let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                let desc = concat!($($d),*).trim();
                self.$field_name.visit(v, key.as_str(), desc);
                )*
            }
        }

        impl Default for $struct_name {
            fn default() -> Self {
                Self {
                    $($field_name: $default),*
                }
            }
        }

        impl ConfigField for HashMap<String,$struct_name> {
            fn set(&mut self, key: &str, value: &str) -> Result<()> {
                let parts: Vec<&str> = key.splitn(2, "::").collect();
                match parts.as_slice() {
                    [inner_key, hashmap_key] => {
                        // Get or create the ColumnOptions for the specified column
                        let inner_value = self
                            .entry((*hashmap_key).to_owned())
                            .or_insert_with($struct_name::default);

                        inner_value.set(inner_key, value)
                    }
                    _ => _config_err!("Unrecognized key '{key}'."),
                }
            }

            fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                for (column_name, col_options) in self {
                    $(
                    let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
                    let desc = concat!($($d),*).trim();
                    col_options.$field_name.visit(v, key.as_str(), desc);
                    )*
                }
            }
        }
    }
}
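
// `config_namespace_with_hashmap!` mirrors `config_namespace!` but also
// implements `ConfigField` for `HashMap<String, T>`, so per-entry settings can
// be addressed with a `<field>::<entry>` key. A minimal sketch of the key
// syntax this enables for the column options below (not compiled here):
//
//     column_specific_options.set("compression::col1", "snappy")?;
//     column_specific_options.set("bloom_filter_enabled::col1", "true")?;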
config_namespace_with_hashmap! {
    /// Options controlling parquet format for individual columns.
    ///
    /// See [`ParquetOptions`] for more details
    pub struct ParquetColumnOptions {
        /// Sets if bloom filter is enabled for the column path.
        pub bloom_filter_enabled: Option<bool>, default = None

        /// Sets encoding for the column path.
        /// Valid values are: plain, plain_dictionary, rle,
        /// bit_packed, delta_binary_packed, delta_length_byte_array,
        /// delta_byte_array, rle_dictionary, and byte_stream_split.
        /// These values are not case-sensitive. If NULL, uses
        /// default parquet options
        pub encoding: Option<String>, default = None

        /// Sets if dictionary encoding is enabled for the column path. If NULL, uses
        /// default parquet options
        pub dictionary_enabled: Option<bool>, default = None

        /// Sets default parquet compression codec for the column path.
        /// Valid values are: uncompressed, snappy, gzip(level),
        /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
        /// These values are not case-sensitive. If NULL, uses
        /// default parquet options
        pub compression: Option<String>, default = None

        /// Sets if statistics are enabled for the column
        /// Valid values are: "none", "chunk", and "page"
        /// These values are not case-sensitive. If NULL, uses
        /// default parquet options
        pub statistics_enabled: Option<String>, default = None

        /// Sets bloom filter false positive probability for the column path. If NULL, uses
        /// default parquet options
        pub bloom_filter_fpp: Option<f64>, default = None

        /// Sets bloom filter number of distinct values. If NULL, uses
        /// default parquet options
        pub bloom_filter_ndv: Option<u64>, default = None

        /// Sets max statistics size for the column path. If NULL, uses
        /// default parquet options
        pub max_statistics_size: Option<usize>, default = None
    }
}

config_namespace! {
    /// Options controlling CSV format
    pub struct CsvOptions {
        /// Specifies whether there is a CSV header (i.e. the first line
        /// consists of column names). The value `None` indicates that
        /// the configuration should be consulted.
        pub has_header: Option<bool>, default = None
        pub delimiter: u8, default = b','
        pub quote: u8, default = b'"'
        pub terminator: Option<u8>, default = None
        pub escape: Option<u8>, default = None
        pub double_quote: Option<bool>, default = None
        /// Specifies whether newlines in (quoted) values are supported.
        ///
        /// Parsing newlines in quoted values may be affected by execution behaviour such as
        /// parallel file scanning. Setting this to `true` ensures that newlines in values are
        /// parsed successfully, which may reduce performance.
        ///
        /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting.
        pub newlines_in_values: Option<bool>, default = None
        pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
        pub schema_infer_max_rec: usize, default = 100
        pub date_format: Option<String>, default = None
        pub datetime_format: Option<String>, default = None
        pub timestamp_format: Option<String>, default = None
        pub timestamp_tz_format: Option<String>, default = None
        pub time_format: Option<String>, default = None
        pub null_value: Option<String>, default = None
        pub comment: Option<u8>, default = None
    }
}

impl CsvOptions {
    /// Set the `CompressionTypeVariant` for the CSV file.
    /// - defaults to `CompressionTypeVariant::UNCOMPRESSED`
    pub fn with_compression(
        mut self,
        compression_type_variant: CompressionTypeVariant,
    ) -> Self {
        self.compression = compression_type_variant;
        self
    }

    /// Set a limit in terms of records to scan to infer the schema
    /// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
    pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self {
        self.schema_infer_max_rec = max_rec;
        self
    }

    /// Set true to indicate that the first line is a header.
    /// - default is `None` (consult the session configuration)
    pub fn with_has_header(mut self, has_header: bool) -> Self {
        self.has_header = Some(has_header);
        self
    }

    /// Returns true if the first line is a header. If the format options do not
    /// specify whether there is a header, returns `None` (indicating that the
    /// configuration should be consulted).
    pub fn has_header(&self) -> Option<bool> {
        self.has_header
    }

    /// The character separating values within a row.
    /// - default to ','
    pub fn with_delimiter(mut self, delimiter: u8) -> Self {
        self.delimiter = delimiter;
        self
    }

    /// The quote character in a row.
    /// - default to '"'
    pub fn with_quote(mut self, quote: u8) -> Self {
        self.quote = quote;
        self
    }

    /// The character that terminates a row.
    /// - default to None (CRLF)
    pub fn with_terminator(mut self, terminator: Option<u8>) -> Self {
        self.terminator = terminator;
        self
    }

    /// The escape character in a row.
    /// - default is None
    pub fn with_escape(mut self, escape: Option<u8>) -> Self {
        self.escape = escape;
        self
    }

    /// Set true to indicate that the CSV quotes should be doubled.
    /// - default is `None` (consult the session configuration)
    pub fn with_double_quote(mut self, double_quote: bool) -> Self {
        self.double_quote = Some(double_quote);
        self
    }

    /// Specifies whether newlines in (quoted) values are supported.
    ///
    /// Parsing newlines in quoted values may be affected by execution behaviour such as
    /// parallel file scanning. Setting this to `true` ensures that newlines in values are
    /// parsed successfully, which may reduce performance.
    ///
    /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting.
    pub fn with_newlines_in_values(mut self, newlines_in_values: bool) -> Self {
        self.newlines_in_values = Some(newlines_in_values);
        self
    }

    /// Set the `CompressionTypeVariant` of CSV
    /// - defaults to `CompressionTypeVariant::UNCOMPRESSED`
    pub fn with_file_compression_type(
        mut self,
        compression: CompressionTypeVariant,
    ) -> Self {
        self.compression = compression;
        self
    }

    /// The delimiter character.
    pub fn delimiter(&self) -> u8 {
        self.delimiter
    }

    /// The quote character.
    pub fn quote(&self) -> u8 {
        self.quote
    }

    /// The terminator character.
    pub fn terminator(&self) -> Option<u8> {
        self.terminator
    }

    /// The escape character.
    pub fn escape(&self) -> Option<u8> {
        self.escape
    }
}
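
// A minimal sketch of the builder-style API above (not compiled here):
//
//     let csv = CsvOptions::default()
//         .with_has_header(true)
//         .with_delimiter(b'|')
//         .with_file_compression_type(CompressionTypeVariant::GZIP);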

config_namespace! {
    /// Options controlling JSON format
    pub struct JsonOptions {
        pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
        pub schema_infer_max_rec: usize, default = 100
    }
}

pub trait FormatOptionsExt: Display {}

#[derive(Debug, Clone, PartialEq)]
#[allow(clippy::large_enum_variant)]
pub enum FormatOptions {
    CSV(CsvOptions),
    JSON(JsonOptions),
    #[cfg(feature = "parquet")]
    PARQUET(TableParquetOptions),
    AVRO,
    ARROW,
}

impl Display for FormatOptions {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let out = match self {
            FormatOptions::CSV(_) => "csv",
            FormatOptions::JSON(_) => "json",
            #[cfg(feature = "parquet")]
            FormatOptions::PARQUET(_) => "parquet",
            FormatOptions::AVRO => "avro",
            FormatOptions::ARROW => "arrow",
        };
        write!(f, "{}", out)
    }
}
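
// A minimal sketch of the `Display` mapping above (not compiled here):
//
//     assert_eq!(FormatOptions::AVRO.to_string(), "avro");
//     assert_eq!(FormatOptions::CSV(CsvOptions::default()).to_string(), "csv");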

#[cfg(test)]
mod tests {
    use std::any::Any;
    use std::collections::HashMap;

    use crate::config::{
        ConfigEntry, ConfigExtension, ConfigFileType, ExtensionOptions, Extensions,
        TableOptions,
    };

    #[derive(Default, Debug, Clone)]
    pub struct TestExtensionConfig {
        /// Key-value pairs collected by `set`
        pub properties: HashMap<String, String>,
    }

    impl ExtensionOptions for TestExtensionConfig {
        fn as_any(&self) -> &dyn Any {
            self
        }

        fn as_any_mut(&mut self) -> &mut dyn Any {
            self
        }

        fn cloned(&self) -> Box<dyn ExtensionOptions> {
            Box::new(self.clone())
        }

        fn set(&mut self, key: &str, value: &str) -> crate::Result<()> {
            let (key, rem) = key.split_once('.').unwrap_or((key, ""));
            assert_eq!(key, "test");
            self.properties.insert(rem.to_owned(), value.to_owned());
            Ok(())
        }

        fn entries(&self) -> Vec<ConfigEntry> {
            self.properties
                .iter()
                .map(|(k, v)| ConfigEntry {
                    key: k.into(),
                    value: Some(v.into()),
                    description: "",
                })
                .collect()
        }
    }

    impl ConfigExtension for TestExtensionConfig {
        const PREFIX: &'static str = "test";
    }

    #[test]
    fn create_table_config() {
        let mut extension = Extensions::new();
        extension.insert(TestExtensionConfig::default());
        let table_config = TableOptions::new().with_extensions(extension);
        let kafka_config = table_config.extensions.get::<TestExtensionConfig>();
        assert!(kafka_config.is_some())
    }

    #[test]
    fn alter_test_extension_config() {
        let mut extension = Extensions::new();
        extension.insert(TestExtensionConfig::default());
        let mut table_config = TableOptions::new().with_extensions(extension);
        table_config.set_config_format(ConfigFileType::CSV);
        table_config.set("format.delimiter", ";").unwrap();
        assert_eq!(table_config.csv.delimiter, b';');
        table_config.set("test.bootstrap.servers", "asd").unwrap();
        let kafka_config = table_config
            .extensions
            .get::<TestExtensionConfig>()
            .unwrap();
        assert_eq!(
            kafka_config.properties.get("bootstrap.servers").unwrap(),
            "asd"
        );
    }

    #[test]
    fn csv_u8_table_options() {
        let mut table_config = TableOptions::new();
        table_config.set_config_format(ConfigFileType::CSV);
        table_config.set("format.delimiter", ";").unwrap();
        assert_eq!(table_config.csv.delimiter as char, ';');
        table_config.set("format.escape", "\"").unwrap();
        assert_eq!(table_config.csv.escape.unwrap() as char, '"');
        table_config.set("format.escape", "\'").unwrap();
        assert_eq!(table_config.csv.escape.unwrap() as char, '\'');
    }

    #[cfg(feature = "parquet")]
    #[test]
    fn parquet_table_options() {
        let mut table_config = TableOptions::new();
        table_config.set_config_format(ConfigFileType::PARQUET);
        table_config
            .set("format.bloom_filter_enabled::col1", "true")
            .unwrap();
        assert_eq!(
            table_config.parquet.column_specific_options["col1"].bloom_filter_enabled,
            Some(true)
        );
    }

    #[cfg(feature = "parquet")]
    #[test]
    fn parquet_table_options_config_entry() {
        let mut table_config = TableOptions::new();
        table_config.set_config_format(ConfigFileType::PARQUET);
        table_config
            .set("format.bloom_filter_enabled::col1", "true")
            .unwrap();
        let entries = table_config.entries();
        assert!(entries
            .iter()
            .any(|item| item.key == "format.bloom_filter_enabled::col1"))
    }

    #[cfg(feature = "parquet")]
    #[test]
    fn parquet_table_options_config_metadata_entry() {
        let mut table_config = TableOptions::new();
        table_config.set_config_format(ConfigFileType::PARQUET);
        table_config.set("format.metadata::key1", "").unwrap();
        table_config.set("format.metadata::key2", "value2").unwrap();
        table_config
            .set("format.metadata::key3", "value with spaces ")
            .unwrap();
        table_config
            .set("format.metadata::key4", "value with special chars :: :")
            .unwrap();

        let parsed_metadata = table_config.parquet.key_value_metadata.clone();
        assert_eq!(parsed_metadata.get("should not exist1"), None);
        assert_eq!(parsed_metadata.get("key1"), Some(&Some("".into())));
        assert_eq!(parsed_metadata.get("key2"), Some(&Some("value2".into())));
        assert_eq!(
            parsed_metadata.get("key3"),
            Some(&Some("value with spaces ".into()))
        );
        assert_eq!(
            parsed_metadata.get("key4"),
            Some(&Some("value with special chars :: :".into()))
        );

        // duplicate keys are overwritten
        table_config.set("format.metadata::key_dupe", "A").unwrap();
        table_config.set("format.metadata::key_dupe", "B").unwrap();
        let parsed_metadata = table_config.parquet.key_value_metadata;
        assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into())));
    }
}