From 3f29808eee72a8e8ecf6567a4dc1200b60c72aa9 Mon Sep 17 00:00:00 2001 From: zhangyiqun Date: Tue, 29 Aug 2023 20:33:30 -0700 Subject: [PATCH] ORC-1489: Assign a writer id to CUDF ### What changes were proposed in this pull request? This PR is aimed at assigning a writer id to the CUDF. ### Why are the changes needed? This helps to locate the writer of a specific ORC file, and it also helps the reader to do some special reads for files created by different writers. ### How was this patch tested? Added unit tests. Closes #1594 from guiyanakuang/ORC-1489. Lead-authored-by: zhangyiqun Co-authored-by: Yiqun Zhang Signed-off-by: Dongjoon Hyun --- c++/include/orc/Common.hh | 1 + c++/src/Common.cc | 2 ++ c++/src/Reader.cc | 2 +- java/core/src/java/org/apache/orc/OrcFile.java | 4 ++++ java/core/src/java/org/apache/orc/OrcUtils.java | 3 +++ java/core/src/test/org/apache/orc/TestVectorOrcFile.java | 6 ++++++ proto/orc_proto.proto | 4 ++++ site/specification/ORCv1.md | 1 + site/specification/ORCv2.md | 1 + 9 files changed, 23 insertions(+), 1 deletion(-) diff --git a/c++/include/orc/Common.hh b/c++/include/orc/Common.hh index beae9dd6f31..9da67a3f199 100644 --- a/c++/include/orc/Common.hh +++ b/c++/include/orc/Common.hh @@ -70,6 +70,7 @@ namespace orc { PRESTO_WRITER = 2, SCRITCHLEY_GO = 3, TRINO_WRITER = 4, + CUDF_WRITER = 5, UNKNOWN_WRITER = INT32_MAX }; diff --git a/c++/src/Common.cc b/c++/src/Common.cc index e220e274def..cf2ff27ef14 100644 --- a/c++/src/Common.cc +++ b/c++/src/Common.cc @@ -82,6 +82,8 @@ namespace orc { return "Scritchley Go"; case TRINO_WRITER: return "Trino"; + case CUDF_WRITER: + return "CUDF"; default: { std::ostringstream buffer; buffer << "Unknown(" << id << ")"; diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index b52675abb32..386793f0b2f 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -622,7 +622,7 @@ namespace orc { WriterId ReaderImpl::getWriterId() const { if (footer->has_writer()) { uint32_t id = footer->writer(); - if (id > 
WriterId::TRINO_WRITER) { + if (id > WriterId::CUDF_WRITER) { return WriterId::UNKNOWN_WRITER; } else { return static_cast<WriterId>(id); diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index 6c2de467dd2..e109e8cd40b 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -132,6 +132,7 @@ public enum WriterImplementation { PRESTO(2), // Presto writer SCRITCHLEY_GO(3), // Go writer from https://github.com/scritchley/orc TRINO(4), // Trino writer + CUDF(5), // CUDF writer UNKNOWN(Integer.MAX_VALUE); private final int id; @@ -189,6 +190,9 @@ public enum WriterVersion { // Trino Writer TRINO_ORIGINAL(WriterImplementation.TRINO, 6), + // CUDF Writer + CUDF_ORIGINAL(WriterImplementation.CUDF, 6), + // Don't use any magic numbers here except for the below: FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from a future writer diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index c121537d3c5..7dde0bc0fd0 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -414,6 +414,9 @@ public static String getSoftwareVersion(int writer, case 4: base = "Trino"; break; + case 5: + base = "CUDF"; + break; default: base = String.format("Unknown(%d)", writer); break; diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 661b8f864f8..7b449c99632 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -3624,6 +3624,8 @@ public void testWriterVersion(Version fileFormat) throws Exception { OrcFile.WriterImplementation.from(2)); assertEquals(OrcFile.WriterImplementation.TRINO, OrcFile.WriterImplementation.from(4)); + assertEquals(OrcFile.WriterImplementation.CUDF, + 
OrcFile.WriterImplementation.from(5)); assertEquals(OrcFile.WriterImplementation.UNKNOWN, OrcFile.WriterImplementation.from(99)); @@ -3642,6 +3644,8 @@ public void testWriterVersion(Version fileFormat) throws Exception { OrcFile.WriterVersion.from(OrcFile.WriterImplementation.PRESTO, 6)); assertEquals(OrcFile.WriterVersion.TRINO_ORIGINAL, OrcFile.WriterVersion.from(OrcFile.WriterImplementation.TRINO, 6)); + assertEquals(OrcFile.WriterVersion.CUDF_ORIGINAL, + OrcFile.WriterVersion.from(OrcFile.WriterImplementation.CUDF, 6)); assertEquals(OrcFile.WriterVersion.FUTURE, OrcFile.WriterVersion.from(OrcFile.WriterImplementation.UNKNOWN, 0)); @@ -3660,6 +3664,8 @@ public void testWriterVersion(Version fileFormat) throws Exception { OrcFile.WriterVersion.PRESTO_ORIGINAL)); assertTrue(OrcFile.WriterVersion.HIVE_12055.includes( OrcFile.WriterVersion.TRINO_ORIGINAL)); + assertTrue(OrcFile.WriterVersion.HIVE_12055.includes( + OrcFile.WriterVersion.CUDF_ORIGINAL)); } @ParameterizedTest diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index ff05657a547..45d7d2a0546 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -367,6 +367,7 @@ message Footer { // 2 = Presto // 3 = Scritchley Go from https://github.com/scritchley/orc // 4 = Trino + // 5 = CUDF optional uint32 writer = 9; // information about the encryption in this file @@ -432,6 +433,9 @@ message PostScript { // Version of the Trino writer: // 6 = original // + // Version of the CUDF writer: + // 6 = original + // optional uint32 writerVersion = 6; // the number of bytes in the encrypted stripe statistics diff --git a/site/specification/ORCv1.md b/site/specification/ORCv1.md index 28347642ec5..c9c9311aab8 100644 --- a/site/specification/ORCv1.md +++ b/site/specification/ORCv1.md @@ -136,6 +136,7 @@ message Footer { // 2 = Presto // 3 = Scritchley Go from https://github.com/scritchley/orc // 4 = Trino + // 5 = CUDF optional uint32 writer = 9; // information about the encryption in this file optional 
Encryption encryption = 10; diff --git a/site/specification/ORCv2.md b/site/specification/ORCv2.md index 010de73c97c..62d640786da 100644 --- a/site/specification/ORCv2.md +++ b/site/specification/ORCv2.md @@ -156,6 +156,7 @@ message Footer { // 2 = Presto // 3 = Scritchley Go from https://github.com/scritchley/orc // 4 = Trino + // 5 = CUDF optional uint32 writer = 9; // information about the encryption in this file optional Encryption encryption = 10;