From 4549168e7f34a42067d969f28c11c9e31baf7bcd Mon Sep 17 00:00:00 2001 From: Eason <30045503+Eason0729@users.noreply.github.com> Date: Mon, 23 Sep 2024 02:50:19 +0800 Subject: [PATCH] docs: :memo: Add expected answers to `DataFrame` method examples (#12564) * docs: :memo: Add expected answers to `DataFrame` method examples * test: :memo: use assert_batches_sorted_eq and simplify example_long.csv * Update datafusion/core/src/dataframe/mod.rs Co-authored-by: Oleks V --------- Co-authored-by: Andrew Lamb Co-authored-by: Oleks V --- datafusion/core/src/dataframe/mod.rs | 243 ++++++++++++++++++-- datafusion/core/tests/data/example_long.csv | 4 + 2 files changed, 224 insertions(+), 23 deletions(-) create mode 100644 datafusion/core/tests/data/example_long.csv diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 0faf53633020..663d8bf022f0 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -184,7 +184,7 @@ impl DataFrame { } /// Creates logical expression from a SQL query text. - /// The expression is created and processed againt the current schema. + /// The expression is created and processed against the current schema. 
/// /// # Example: Parsing SQL queries /// ``` @@ -221,11 +221,20 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; /// let df = df.select_columns(&["a", "b"])?; + /// let expected = vec![ + /// "+---+---+", + /// "| a | b |", + /// "+---+---+", + /// "| 1 | 2 |", + /// "+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -279,11 +288,20 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let df = df.select(vec![col("a") * col("b"), col("c")])?; + /// let df = df.select(vec![col("a"), col("b") * col("c")])?; + /// let expected = vec![ + /// "+---+-----------------------+", + /// "| a | ?table?.b * ?table?.c |", + /// "+---+-----------------------+", + /// "| 1 | 6 |", + /// "+---+-----------------------+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -307,11 +325,25 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// // +----+----+----+ + /// // | a | b | c | + /// // +----+----+----+ + /// // | 1 | 2 | 3 | + /// // +----+----+----+ /// let df = df.drop_columns(&["a"])?; + /// let expected = 
vec![ + /// "+---+---+", + /// "| b | c |", + /// "+---+---+", + /// "| 2 | 3 |", + /// "+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -369,11 +401,26 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_json("tests/data/unnest.json", NdJsonReadOptions::default()).await?; + /// // expand into multiple columns if it's json array, flatten field name if it's nested structure /// let df = df.unnest_columns(&["b","c","d"])?; + /// let expected = vec![ + /// "+---+------+-------+-----+-----+", + /// "| a | b | c | d.e | d.f |", + /// "+---+------+-------+-----+-----+", + /// "| 1 | 2.0 | false | 1 | 2 |", + /// "| 1 | 1.3 | true | 1 | 2 |", + /// "| 1 | -6.1 | | 1 | 2 |", + /// "| 2 | 3.0 | false | | |", + /// "| 2 | 2.3 | true | | |", + /// "| 2 | -7.1 | | | |", + /// "+---+------+-------+-----+-----+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -411,11 +458,23 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; /// let df = df.filter(col("a").lt_eq(col("b")))?; + /// // all rows where a <= b are returned + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "| 4 | 5 | 6 |", + /// "| 7 | 8 | 9 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, 
&df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -437,16 +496,34 @@ impl DataFrame { /// # use datafusion::prelude::*; /// # use datafusion::error::Result; /// # use datafusion::functions_aggregate::expr_fn::min; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; /// /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a" - /// let _ = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?; - /// + /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?; + /// let expected1 = vec![ + /// "+---+----------------+", + /// "| a | min(?table?.b) |", + /// "+---+----------------+", + /// "| 1 | 2 |", + /// "| 4 | 5 |", + /// "| 7 | 8 |", + /// "+---+----------------+" + /// ]; + /// assert_batches_sorted_eq!(expected1, &df1.collect().await?); /// // The following use is the equivalent of "SELECT MIN(b)" - /// let _ = df.aggregate(vec![], vec![min(col("b"))])?; + /// let df2 = df.aggregate(vec![], vec![min(col("b"))])?; + /// let expected2 = vec![ + /// "+----------------+", + /// "| min(?table?.b) |", + /// "+----------------+", + /// "| 2 |", + /// "+----------------+" + /// ]; + /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?); /// # Ok(()) /// # } /// ``` @@ -486,11 +563,21 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let df = df.limit(0, Some(100))?; + /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + 
/// let df = df.limit(1, Some(2))?; + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 4 | 5 | 6 |", + /// "| 7 | 8 | 9 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -512,12 +599,22 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? ; /// let d2 = df.clone(); /// let df = df.union(d2)?; + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "| 1 | 2 | 3 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -540,12 +637,22 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; /// let d2 = df.clone(); /// let df = df.union_distinct(d2)?; + /// // d2 is a duplicate of df, so the duplicated row appears only once + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -565,11 +672,20 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { 
/// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; /// let df = df.distinct()?; + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -589,12 +705,21 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? /// // Return a single row (a, b) for each distinct value of a /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; + /// let expected = vec![ + /// "+---+---+", + /// "| a | b |", + /// "+---+---+", + /// "| 1 | 2 |", + /// "+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -623,12 +748,26 @@ impl DataFrame { /// # use datafusion::prelude::*; /// # use datafusion::error::Result; /// # use arrow::util::pretty; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?; - /// df.describe().await.unwrap(); - /// + /// let stat = df.describe().await?; + /// # // some output column are ignored + /// let expected = vec![ + /// "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+", + /// "| describe | c_custkey | c_name | c_address | c_nationkey | c_phone | 
c_acctbal | c_mktsegment | c_comment |", + /// "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+", + /// "| count | 9.0 | 9 | 9 | 9.0 | 9 | 9.0 | 9 | 9 |", + /// "| max | 10.0 | Customer#000000010 | xKiAFTjUsCuxfeleNqefumTrjS | 20.0 | 30-114-968-4951 | 9561.95 | MACHINERY | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious |", + /// "| mean | 6.0 | null | null | 9.88888888888889 | null | 5153.2155555555555 | null | null |", + /// "| median | 6.0 | null | null | 8.0 | null | 6819.74 | null | null |", + /// "| min | 2.0 | Customer#000000002 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 1.0 | 11-719-748-3364 | 121.65 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov |", + /// "| null_count | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 |", + /// "| std | 2.7386127875258306 | null | null | 7.2188026092359046 | null | 3522.169804254585 | null | null |", + /// "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+"]; + /// assert_batches_sorted_eq!(expected, &stat.collect().await?); /// # Ok(()) /// # } /// ``` @@ -820,14 +959,25 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx.read_csv("tests/data/example_long.csv", 
CsvReadOptions::new()).await?; /// let df = df.sort(vec![ - /// col("a").sort(true, true), // a ASC, nulls first - /// col("b").sort(false, false), // b DESC, nulls last + /// col("a").sort(false, true), // a DESC, nulls first + /// col("b").sort(true, false), // b ASC, nulls last /// ])?; + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "| 4 | 5 | 6 |", + /// "| 7 | 8 | 9 |", + /// "+---+---+---+", + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -857,6 +1007,7 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); @@ -869,7 +1020,14 @@ impl DataFrame { /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)` /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`. 
/// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?; - /// let batches = join.collect().await?; + /// let expected = vec![ + /// "+---+---+---+----+----+----+", + /// "| a | b | c | a2 | b2 | c2 |", + /// "+---+---+---+----+----+----+", + /// "| 1 | 2 | 3 | 1 | 2 | 3 |", + /// "+---+---+---+----+----+----+" + /// ]; + /// assert_batches_sorted_eq!(expected, &join.collect().await?); /// # Ok(()) /// # } /// ``` @@ -906,6 +1064,7 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); @@ -929,7 +1088,13 @@ impl DataFrame { /// JoinType::Inner, /// [col("a").not_eq(col("a2")), col("b").not_eq(col("b2"))], /// )?; - /// let batches = join_on.collect().await?; + /// let expected = vec![ + /// "+---+---+---+----+----+----+", + /// "| a | b | c | a2 | b2 | c2 |", + /// "+---+---+---+----+----+----+", + /// "+---+---+---+----+----+----+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?); /// # Ok(()) /// # } /// ``` @@ -954,11 +1119,22 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?; + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "| 4 | 5 | 6 |", + /// "| 7 | 8 | 9 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df1.collect().await?); /// # Ok(()) /// # } /// ``` @@ -985,7 
+1161,8 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let count = df.count().await?; + /// let count = df.count().await?; // 1 + /// # assert_eq!(count, 1); /// # Ok(()) /// # } /// ``` @@ -1266,12 +1443,21 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = df.clone(); + /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; /// let df = df.intersect(d2)?; + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 1 | 2 | 3 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) /// # } /// ``` @@ -1290,12 +1476,23 @@ impl DataFrame { /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; + /// # use datafusion_common::assert_batches_sorted_eq; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = df.clone(); - /// let df = df.except(d2)?; + /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let result = df.except(d2)?; + /// // those rows are not in example.csv, but in example_long.csv + /// let expected = vec![ + /// "+---+---+---+", + /// "| a | b | c |", + /// "+---+---+---+", + /// "| 4 | 5 | 6 |", + /// "| 7 | 8 | 9 |", + /// "+---+---+---+" + /// ]; + /// # assert_batches_sorted_eq!(expected, 
&result.collect().await?); /// # Ok(()) /// # } /// ``` diff --git a/datafusion/core/tests/data/example_long.csv b/datafusion/core/tests/data/example_long.csv new file mode 100644 index 000000000000..83d4cdde1ce1 --- /dev/null +++ b/datafusion/core/tests/data/example_long.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5,6 +7,8,9 \ No newline at end of file