-
Notifications
You must be signed in to change notification settings - Fork 242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support nested types in ORC writer #3696
Changes from 2 commits
52da3f9
533aa89
4d97290
4d07e95
bd5c0f7
d0593b3
6ba8dd9
b1cac2c
e877b7e
225d487
b2c6b73
0b6e599
e4f7bdd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
package org.apache.spark.sql.rapids | ||
|
||
import ai.rapids.cudf._ | ||
import ai.rapids.cudf.ColumnWriterOptions._ | ||
import com.nvidia.spark.rapids._ | ||
import org.apache.hadoop.mapred.JobConf | ||
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} | ||
|
@@ -29,7 +30,7 @@ import org.apache.spark.sql.SparkSession | |
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap | ||
import org.apache.spark.sql.execution.datasources.FileFormat | ||
import org.apache.spark.sql.execution.datasources.orc.{OrcFileFormat, OrcOptions, OrcUtils} | ||
import org.apache.spark.sql.types.StructType | ||
import org.apache.spark.sql.types._ | ||
|
||
object GpuOrcFileFormat extends Logging { | ||
// The classname used when Spark is configured to use the Hive implementation for ORC. | ||
|
@@ -112,6 +113,59 @@ object GpuOrcFileFormat extends Logging { | |
None | ||
} | ||
} | ||
|
||
/**
 * Registers the writer column options for a single Spark field on `builder`.
 *
 * Decimal, timestamp, struct, array and map types get dedicated column options, and nested
 * types recurse into their children. Every other type is registered by name and nullability
 * only.
 *
 * @param builder  nested builder that receives the column; the value of the `match` below is
 *                 discarded, so the builder calls are relied on for their effect on `builder`
 * @param dataType Spark type of the field
 * @param name     column name
 * @param nullable whether the column itself (not its children) may contain nulls
 * @return `builder`, cast to its concrete builder type `T`
 */
def orcWriterOptionsFromField[T <: NestedBuilder[_, _], V <: ColumnWriterOptions](
    builder: ColumnWriterOptions.NestedBuilder[T, V],
    dataType: DataType,
    name: String,
    nullable: Boolean): T = {
  dataType match {
    case dt: DecimalType =>
      builder.withDecimalColumn(name, dt.precision, nullable)
    case TimestampType =>
      // NOTE(review): the literal `false` is presumably cuDF's "isInt96" flag -- confirm
      // against the ColumnWriterOptions API.
      builder.withTimestampColumn(name, false, nullable)
    case s: StructType =>
      builder.withStructColumn(
        orcWriterOptionsFromSchema(structBuilder(name, nullable), s).build()
      )
    case a: ArrayType =>
      // The list column carries the array's own nullability, while the element column must
      // use `containsNull`: a non-nullable array may still hold null elements.
      builder.withListColumn(
        orcWriterOptionsFromField(
          listBuilder(name, nullable),
          a.elementType,
          name,
          a.containsNull).build())
    case m: MapType =>
      // Key and value options are built through throw-away top-level builders and then
      // extracted as their first (and only) child column.
      builder.withMapColumn(
        mapColumn(name,
          orcWriterOptionsFromField(
            ORCWriterOptions.builder(),
            m.keyType,
            "key",
            // Spark map keys are never null.
            nullable = false).build().getChildColumnOptions()(0),
          orcWriterOptionsFromField(
            ORCWriterOptions.builder(),
            m.valueType,
            "value",
            // Value nullability is a property of the map type, not of the map column.
            m.valueContainsNull).build().getChildColumnOptions()(0)))
    case _ =>
      builder.withColumns(nullable, name)
  }
  builder.asInstanceOf[T]
}
|
||
/** | ||
* (We could try to merge this with `parquetWriterOptionsFromSchema` after fixing the issue | ||
* https://github.com/rapidsai/cudf/issues/7654) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This issue is circumvented by pruning masks ourselves and we are already calling the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the info. But I tried locally and am still getting the exception below when running the Parquet test
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is probably a bug in the plugin. Working on it. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed it by using |
||
*/ | ||
def orcWriterOptionsFromSchema[T <: NestedBuilder[_, _], V <: ColumnWriterOptions](
    builder: ColumnWriterOptions.NestedBuilder[T, V],
    schema: StructType): T = {
  // Register every top-level field of the schema, in declaration order, on the same builder;
  // nested children are handled inside orcWriterOptionsFromField.
  for (column <- schema.fields) {
    orcWriterOptionsFromField(builder, column.dataType, column.name, column.nullable)
  }
  // Hand the builder back typed as its concrete builder type so callers can keep chaining.
  builder.asInstanceOf[T]
}
} | ||
|
||
class GpuOrcFileFormat extends ColumnarFileFormat with Logging { | ||
|
@@ -161,18 +215,9 @@ class GpuOrcWriter(path: String, | |
extends ColumnarOutputWriter(path, context, dataSchema, "ORC") { | ||
|
||
// Chunked ORC table writer for this output file. Column options (including nested struct,
// list and map columns) are derived from `dataSchema`, and the compression codec is read
// from the ORC `COMPRESS` setting in `conf`.
override val tableWriter: TableWriter = {
  val builder = GpuOrcFileFormat
    .orcWriterOptionsFromSchema(ORCWriterOptions.builder(), dataSchema)
    .withCompressionType(CompressionType.valueOf(OrcConf.COMPRESS.getString(conf)))

  Table.writeORCChunked(builder.build(), this)
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mind expanding on this comment a bit, why "not all because of nesting"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good suggestion, I will update it in a following PR since this PR should be merged as soon as possible.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I removed this comment.