/Users/andrewlamb/Software/datafusion/datafusion/common/src/test_util.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Utility functions to make testing DataFusion based crates easier |
19 | | |
20 | | use std::{error::Error, path::PathBuf}; |
21 | | |
/// Compares the pretty-formatted output of a slice of record batches
/// with an expected list of lines. This is a macro so that assertion
/// failures are reported on the line of the calling test.
///
/// Designed so that failure output can be directly copy/pasted
/// into the test code as expected results.
///
/// Expects to be called about like this:
///
/// `assert_batches_eq!(expected_lines: &[&str], batches: &[RecordBatch])`
///
/// The macro expands to a single block expression of type `()`, so it
/// can be used both as a statement and in expression position (for
/// example as a `match` arm).
///
/// # Panics
///
/// Panics if the batches can not be formatted, or if the formatted
/// lines differ from `expected_lines`.
///
/// # Example
/// ```
/// # use std::sync::Arc;
/// # use arrow::record_batch::RecordBatch;
/// # use arrow_array::{ArrayRef, Int32Array};
/// # use datafusion_common::assert_batches_eq;
/// let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
/// let batch = RecordBatch::try_from_iter([("column", col)]).unwrap();
/// // Expected output is a vec of strings
/// let expected = vec![
///     "+--------+",
///     "| column |",
///     "+--------+",
///     "| 1      |",
///     "| 2      |",
///     "+--------+",
/// ];
/// // compare the formatted output of the record batch with the expected output
/// assert_batches_eq!(expected, &[batch]);
/// ```
#[macro_export]
macro_rules! assert_batches_eq {
    ($EXPECTED_LINES: expr, $CHUNKS: expr) => {{
        let expected_lines: Vec<String> =
            $EXPECTED_LINES.iter().map(|&s| s.into()).collect();

        let formatted = $crate::arrow::util::pretty::pretty_format_batches_with_options(
            $CHUNKS,
            &$crate::format::DEFAULT_FORMAT_OPTIONS,
        )
        .unwrap()
        .to_string();

        // Surrounding whitespace is trimmed before the output is split into lines.
        let actual_lines: Vec<&str> = formatted.trim().lines().collect();

        assert_eq!(
            expected_lines, actual_lines,
            "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
            expected_lines, actual_lines
        );
    }};
}
75 | | |
/// Compares the pretty-formatted output of a slice of record batches
/// with an expected list of lines in a way that row order does not
/// matter. This is a macro so that assertion failures are reported on
/// the line of the calling test.
///
/// See [`assert_batches_eq`] for more details and example.
///
/// Expects to be called about like this:
///
/// `assert_batches_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])`
///
/// When there are more than three lines, both the expected and the
/// actual lines are sorted before comparison, leaving the first two
/// lines (header) and the last line (footer) in place.
///
/// The macro expands to a single block expression of type `()`, so it
/// can be used both as a statement and in expression position.
///
/// # Panics
///
/// Panics if the batches can not be formatted, or if the sorted lines
/// differ.
#[macro_export]
macro_rules! assert_batches_sorted_eq {
    ($EXPECTED_LINES: expr, $CHUNKS: expr) => {{
        let mut expected_lines: Vec<String> =
            $EXPECTED_LINES.iter().map(|&s| s.into()).collect();

        // sort except for header + footer
        let num_lines = expected_lines.len();
        if num_lines > 3 {
            expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable()
        }

        let formatted = $crate::arrow::util::pretty::pretty_format_batches_with_options(
            $CHUNKS,
            &$crate::format::DEFAULT_FORMAT_OPTIONS,
        )
        .unwrap()
        .to_string();

        // `str::lines` splits on both `\n` and `\r\n`, so no separate
        // line-ending normalization is needed here.
        let mut actual_lines: Vec<&str> = formatted.trim().lines().collect();

        // sort except for header + footer
        let num_lines = actual_lines.len();
        if num_lines > 3 {
            actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable()
        }

        assert_eq!(
            expected_lines, actual_lines,
            "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
            expected_lines, actual_lines
        );
    }};
}
120 | | |
/// A macro to assert that one string is contained within another with
/// a nice error message if it is not.
///
/// Usage: `assert_contains!(actual, expected)`
///
/// Is a macro so test error
/// messages are on the same line as the failure;
///
/// Both arguments must be convertible into Strings ([`Into`]<[`String`]>)
///
/// The macro expands to a single block expression of type `()`, so it
/// can be used both as a statement and in expression position (for
/// example as a `match` arm).
///
/// # Panics
///
/// Panics if `expected` is not a substring of `actual`.
#[macro_export]
macro_rules! assert_contains {
    ($ACTUAL: expr, $EXPECTED: expr) => {{
        let actual_value: String = $ACTUAL.into();
        let expected_value: String = $EXPECTED.into();
        assert!(
            actual_value.contains(&expected_value),
            "Can not find expected in actual.\n\nExpected:\n{}\n\nActual:\n{}",
            expected_value,
            actual_value
        );
    }};
}
143 | | |
/// A macro to assert that one string is NOT contained within another with
/// a nice error message if it is.
///
/// Usage: `assert_not_contains!(actual, unexpected)`
///
/// Is a macro so test error
/// messages are on the same line as the failure;
///
/// Both arguments must be convertible into Strings ([`Into`]<[`String`]>)
///
/// The macro expands to a single block expression of type `()`, so it
/// can be used both as a statement and in expression position (for
/// example as a `match` arm).
///
/// # Panics
///
/// Panics if `unexpected` is a substring of `actual`.
#[macro_export]
macro_rules! assert_not_contains {
    ($ACTUAL: expr, $UNEXPECTED: expr) => {{
        let actual_value: String = $ACTUAL.into();
        let unexpected_value: String = $UNEXPECTED.into();
        assert!(
            !actual_value.contains(&unexpected_value),
            "Found unexpected in actual.\n\nUnexpected:\n{}\n\nActual:\n{}",
            unexpected_value,
            actual_value
        );
    }};
}
166 | | |
167 | | /// Returns the datafusion test data directory, which is by default rooted at `datafusion/core/tests/data`. |
168 | | /// |
169 | | /// The default can be overridden by the optional environment |
170 | | /// variable `DATAFUSION_TEST_DATA` |
171 | | /// |
172 | | /// panics when the directory can not be found. |
173 | | /// |
174 | | /// Example: |
175 | | /// ``` |
176 | | /// let testdata = datafusion_common::test_util::datafusion_test_data(); |
177 | | /// let csvdata = format!("{}/window_1.csv", testdata); |
178 | | /// assert!(std::path::PathBuf::from(csvdata).exists()); |
179 | | /// ``` |
180 | | pub fn datafusion_test_data() -> String { |
181 | | match get_data_dir("DATAFUSION_TEST_DATA", "../../datafusion/core/tests/data") { |
182 | | Ok(pb) => pb.display().to_string(), |
183 | | Err(err) => panic!("failed to get arrow data dir: {err}"), |
184 | | } |
185 | | } |
186 | | |
187 | | /// Returns the arrow test data directory, which is by default stored |
188 | | /// in a git submodule rooted at `testing/data`. |
189 | | /// |
190 | | /// The default can be overridden by the optional environment |
191 | | /// variable `ARROW_TEST_DATA` |
192 | | /// |
193 | | /// panics when the directory can not be found. |
194 | | /// |
195 | | /// Example: |
196 | | /// ``` |
197 | | /// let testdata = datafusion_common::test_util::arrow_test_data(); |
198 | | /// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata); |
199 | | /// assert!(std::path::PathBuf::from(csvdata).exists()); |
200 | | /// ``` |
201 | | pub fn arrow_test_data() -> String { |
202 | | match get_data_dir("ARROW_TEST_DATA", "../../testing/data") { |
203 | | Ok(pb) => pb.display().to_string(), |
204 | | Err(err) => panic!("failed to get arrow data dir: {err}"), |
205 | | } |
206 | | } |
207 | | |
/// Locates the parquet test data directory and returns it as a string.
///
/// By default the data lives in a git submodule rooted at
/// `parquet-testing/data`; set the optional environment variable
/// `PARQUET_TEST_DATA` to point somewhere else.
///
/// # Panics
///
/// Panics when the directory can not be found.
///
/// Example:
/// ```
/// let testdata = datafusion_common::test_util::parquet_test_data();
/// let filename = format!("{}/binary.parquet", testdata);
/// assert!(std::path::PathBuf::from(filename).exists());
/// ```
#[cfg(feature = "parquet")]
pub fn parquet_test_data() -> String {
    get_data_dir("PARQUET_TEST_DATA", "../../parquet-testing/data")
        .map(|data_dir| data_dir.display().to_string())
        .unwrap_or_else(|err| panic!("failed to get parquet data dir: {err}"))
}
230 | | |
/// Returns a directory path for finding test data.
///
/// * `udf_env`: name of a (user defined) environment variable that may
///   override the default location
/// * `submodule_data`: fallback path, relative to the
///   `CARGO_MANIFEST_DIR` this crate was compiled with
///
/// Returns either:
/// * The path referred to in `udf_env`, if that variable is set to a
///   non-blank value that refers to a directory
/// * The `submodule_data` directory relative to `CARGO_MANIFEST_DIR`,
///   if `udf_env` is unset or blank
///
/// # Errors
///
/// Returns an error when:
/// * `udf_env` is set to a non-blank value that is not a directory
///   (the fallback is *not* tried in this case)
/// * `udf_env` is unset or blank and the fallback directory does not
///   exist, or `CARGO_MANIFEST_DIR` was not available at compile time
pub fn get_data_dir(
    udf_env: &str,
    submodule_data: &str,
) -> Result<PathBuf, Box<dyn Error>> {
    // Try user defined env.
    if let Ok(dir) = std::env::var(udf_env) {
        let trimmed = dir.trim();
        if !trimmed.is_empty() {
            let pb = PathBuf::from(trimmed);
            if pb.is_dir() {
                return Ok(pb);
            }
            return Err(format!(
                "the data dir `{}` defined by env {} not found",
                pb.display(),
                udf_env
            )
            .into());
        }
    }

    // The env is undefined or its value is trimmed to empty, let's try default dir.

    // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package",
    // set by `cargo run` or `cargo test`, see:
    // https://doc.rust-lang.org/cargo/reference/environment-variables.html
    //
    // `option_env!` is used rather than `env!` so that builds which do not
    // go through cargo still compile; the env override above keeps working
    // there and only the fallback reports an error.
    let dir = match option_env!("CARGO_MANIFEST_DIR") {
        Some(dir) => dir,
        None => {
            return Err(format!(
                "env `{}` is undefined or has empty value, and `CARGO_MANIFEST_DIR` was not set \
                 at compile time, so the pre-defined data dir `{}` can not be located",
                udf_env, submodule_data,
            )
            .into());
        }
    };

    let pb = PathBuf::from(dir).join(submodule_data);
    if pb.is_dir() {
        Ok(pb)
    } else {
        Err(format!(
            "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\
             HINT: try running `git submodule update --init`",
            udf_env,
            pb.display(),
        ).into())
    }
}
281 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use std::env;

    /// Exercises `get_data_dir` with the override variable pointing at a
    /// missing directory, set to blank values, set to a valid directory,
    /// and finally removed.
    #[test]
    fn test_data_dir() {
        let udf_env = "get_data_dir";
        let cwd = env::current_dir().unwrap();

        let existing_pb = cwd.join("..");
        let existing = existing_pb.display().to_string();
        let missing = cwd.join("non-existing-dir").display().to_string();

        // An override that names a missing directory is an error, even
        // though the fallback directory exists.
        env::set_var(udf_env, &missing);
        assert!(get_data_dir(udf_env, &existing).is_err());

        // Empty, blank, and valid overrides all resolve to the existing
        // directory (the first two through the fallback).
        for value in ["", " ", existing.as_str()] {
            env::set_var(udf_env, value);
            let resolved = get_data_dir(udf_env, &existing);
            assert!(resolved.is_ok());
            assert_eq!(resolved.unwrap(), existing_pb);
        }

        // With the override removed only the fallback matters.
        env::remove_var(udf_env);
        assert!(get_data_dir(udf_env, &missing).is_err());

        let resolved = get_data_dir(udf_env, &existing);
        assert!(resolved.is_ok());
        assert_eq!(resolved.unwrap(), existing_pb);
    }

    /// Checks that the arrow and parquet data directories both resolve
    /// to existing directories.
    #[test]
    #[cfg(feature = "parquet")]
    fn test_happy() {
        let arrow_dir = PathBuf::from(arrow_test_data());
        assert!(arrow_dir.is_dir());

        let parquet_dir = PathBuf::from(parquet_test_data());
        assert!(parquet_dir.is_dir());
    }
}