Skip to content
This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

[NSE-333] Arrow Data Source: CSV format support fix #336

Merged
merged 2 commits into from
May 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ object ArrowUtils {
options: ArrowOptions): Option[org.apache.arrow.dataset.file.FileFormat] = {
Option(options.originalFormat match {
case "parquet" => org.apache.arrow.dataset.file.FileFormat.PARQUET
case "csv" => org.apache.arrow.dataset.file.FileFormat.CSV
case _ => throw new IllegalArgumentException("Unrecognizable format")
})
}
Expand Down
35 changes: 35 additions & 0 deletions arrow-data-source/standard/src/test/resources/example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
id1,id2,id3,id4,id5,id6,v1,v2,v3
id016,id016,id0000042202,15,24,5971,5,11,37.211254
id039,id045,id0000029558,40,49,39457,5,4,48.951141
id047,id023,id0000071286,68,20,74463,2,14,60.469241
id043,id057,id0000015141,32,43,63743,1,15,7.692145
id054,id040,id0000011083,9,25,16920,2,9,22.863525
id029,id020,id0000017974,40,43,14435,3,13,87.521355
id047,id023,id0000084849,90,96,35790,2,9,93.348148
id091,id022,id0000031441,50,44,71525,3,11,81.013682
id090,id048,id0000067778,24,2,51862,4,9,30.718821
id070,id008,id0000091167,78,4,23333,5,15,70.95464
id039,id084,id0000013708,94,81,44406,1,3,54.368009
id023,id061,id0000011331,36,67,86498,5,2,13.847979
id070,id054,id0000099110,24,15,47054,4,2,92.057305
id022,id008,id0000038862,38,92,63088,3,10,33.517765
id020,id070,id0000028952,17,57,50831,4,15,48.060814
id078,id022,id0000082008,69,44,15891,1,4,95.75571
id024,id033,id0000074157,1,57,83341,2,1,72.118722
id053,id076,id0000061759,55,43,59469,5,10,10.574836
id058,id087,id0000094028,14,49,72962,4,4,37.914258
id095,id091,id0000066931,35,20,98979,3,3,16.733062
id054,id061,id0000004843,69,58,14096,4,13,53.746802
id019,id078,id0000047661,5,33,13347,5,5,95.013936
id086,id088,id0000039469,45,86,65332,3,11,65.71087
id021,id055,id0000035603,96,97,36475,4,9,90.835613
id004,id034,id0000008260,99,8,73046,3,11,69.540405
id053,id052,id0000008764,47,13,49231,1,15,32.039599
id014,id050,id0000066034,45,32,33268,2,3,93.752279
id099,id057,id0000062408,27,7,63984,5,6,77.454794
id013,id067,id0000046109,69,90,21214,4,6,83.899656
id042,id043,id0000025883,64,21,85711,4,14,84.141247
id024,id062,id0000026824,79,16,49757,2,10,15.822967
id058,id077,id0000016555,71,8,24728,3,9,92.085521
id053,id012,id0000005595,73,28,79781,2,10,6.053862
id100,id096,id0000073858,11,9,25962,1,10,87.268781
3 changes: 3 additions & 0 deletions arrow-data-source/standard/src/test/resources/people.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name,age,job
Jorge,30,Developer
Bob,32,Developer
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
super.afterAll()
}

test("reading parquet file") {
test("read parquet file") {
val path = ArrowDataSourceTest.locateResourcePath(parquetFile1)
verifyParquet(
spark.read
Expand Down Expand Up @@ -254,39 +254,52 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
assert(fdGrowth < 100)
}

// csv cases: not implemented
private val csvFile = "cars.csv"
private val csvFile1 = "people.csv"
private val csvFile2 = "example.csv"

ignore("reading csv file without specifying original format") {
verifyCsv(spark.read.format("arrow").load(csvFile))
ignore("read csv file without specifying original format") {
// not implemented
verifyFrame(spark.read.format("arrow")
.load(ArrowDataSourceTest.locateResourcePath(csvFile1)), 1, 2)
}

ignore("reading csv file") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile)
verifyCsv(
test("read csv file") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile1)
verifyFrame(
spark.read
.format("arrow")
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv")
.load(path))
.load(path), 2, 3)
}

ignore("read csv file - programmatic API ") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile)
verifyCsv(
test("read csv file 2") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile2)
verifyFrame(
spark.read
.format("arrow")
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv")
.load(path), 34, 9)
}

test("read csv file - programmatic API ") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile1)
verifyFrame(
spark.read
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv")
.arrow(path))
.arrow(path), 2, 3)
}

/** Asserts that `frame` has exactly `columnCount` columns and `rowCount` rows. */
def verifyFrame(frame: DataFrame, rowCount: Int, columnCount: Int): Unit = {
  val actualColumnCount = frame.schema.length
  assert(actualColumnCount === columnCount)
  val actualRowCount = frame.collect().length
  assert(actualRowCount === rowCount)
}

// Legacy placeholder verifier for CSV frames: intentionally asserts nothing.
// Superseded by verifyFrame, which checks row and column counts; kept only
// until remaining callers are migrated.
def verifyCsv(frame: DataFrame): Unit = {
// todo assert something
}

// Verifies the parquet test fixture: a single LongType column named "col"
// containing 5 rows.
def verifyParquet(frame: DataFrame): Unit = {
assert(
frame.schema ===
StructType(Seq(StructField("col", LongType))))
assert(frame.collect().length === 5)
// NOTE(review): the exact-schema and row-count asserts above already imply
// the 5-row / 1-column check performed by verifyFrame below — confirm whether
// this duplication is intentional or a leftover from a refactor.
verifyFrame(frame, 5, 1)
}

def delete(path: String): Unit = {
Expand Down
2 changes: 1 addition & 1 deletion native-sql-engine/cpp/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ set(SPARK_COLUMNAR_PLUGIN_SRCS
precompile/unsafe_array.cc
precompile/gandiva_projector.cc
third_party/gandiva/decimal_ops.cc
third_party/gandiva/time.cc
third_party/gandiva/time.cc
)

add_subdirectory(third_party/gandiva)
Expand Down