Skip to content

Commit

Permalink
ORC-1489: Assign a writer id to CUDF
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR assigns a writer ID to CUDF.

### Why are the changes needed?

This helps identify which writer produced a specific ORC file, and it allows readers to apply writer-specific handling when reading files created by different writers.

### How was this patch tested?

Added unit tests.

Closes apache#1594 from guiyanakuang/ORC-1489.

Lead-authored-by: zhangyiqun <[email protected]>
Co-authored-by: Yiqun Zhang <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
2 people authored and cxzl25 committed Jan 11, 2024
1 parent 70cb171 commit 3f29808
Show file tree
Hide file tree
Showing 9 changed files with 23 additions and 1 deletion.
1 change: 1 addition & 0 deletions c++/include/orc/Common.hh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ namespace orc {
PRESTO_WRITER = 2,
SCRITCHLEY_GO = 3,
TRINO_WRITER = 4,
CUDF_WRITER = 5,
UNKNOWN_WRITER = INT32_MAX
};

Expand Down
2 changes: 2 additions & 0 deletions c++/src/Common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ namespace orc {
return "Scritchley Go";
case TRINO_WRITER:
return "Trino";
case CUDF_WRITER:
return "CUDF";
default: {
std::ostringstream buffer;
buffer << "Unknown(" << id << ")";
Expand Down
2 changes: 1 addition & 1 deletion c++/src/Reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ namespace orc {
WriterId ReaderImpl::getWriterId() const {
if (footer->has_writer()) {
uint32_t id = footer->writer();
if (id > WriterId::TRINO_WRITER) {
if (id > WriterId::CUDF_WRITER) {
return WriterId::UNKNOWN_WRITER;
} else {
return static_cast<WriterId>(id);
Expand Down
4 changes: 4 additions & 0 deletions java/core/src/java/org/apache/orc/OrcFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ public enum WriterImplementation {
PRESTO(2), // Presto writer
SCRITCHLEY_GO(3), // Go writer from https://github.com/scritchley/orc
TRINO(4), // Trino writer
CUDF(5), // CUDF writer
UNKNOWN(Integer.MAX_VALUE);

private final int id;
Expand Down Expand Up @@ -189,6 +190,9 @@ public enum WriterVersion {
// Trino Writer
TRINO_ORIGINAL(WriterImplementation.TRINO, 6),

// CUDF Writer
CUDF_ORIGINAL(WriterImplementation.CUDF, 6),

// Don't use any magic numbers here except for the below:
FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from a future writer

Expand Down
3 changes: 3 additions & 0 deletions java/core/src/java/org/apache/orc/OrcUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,9 @@ public static String getSoftwareVersion(int writer,
case 4:
base = "Trino";
break;
case 5:
base = "CUDF";
break;
default:
base = String.format("Unknown(%d)", writer);
break;
Expand Down
6 changes: 6 additions & 0 deletions java/core/src/test/org/apache/orc/TestVectorOrcFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -3624,6 +3624,8 @@ public void testWriterVersion(Version fileFormat) throws Exception {
OrcFile.WriterImplementation.from(2));
assertEquals(OrcFile.WriterImplementation.TRINO,
OrcFile.WriterImplementation.from(4));
assertEquals(OrcFile.WriterImplementation.CUDF,
OrcFile.WriterImplementation.from(5));
assertEquals(OrcFile.WriterImplementation.UNKNOWN,
OrcFile.WriterImplementation.from(99));

Expand All @@ -3642,6 +3644,8 @@ public void testWriterVersion(Version fileFormat) throws Exception {
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.PRESTO, 6));
assertEquals(OrcFile.WriterVersion.TRINO_ORIGINAL,
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.TRINO, 6));
assertEquals(OrcFile.WriterVersion.CUDF_ORIGINAL,
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.CUDF, 6));
assertEquals(OrcFile.WriterVersion.FUTURE,
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.UNKNOWN, 0));

Expand All @@ -3660,6 +3664,8 @@ public void testWriterVersion(Version fileFormat) throws Exception {
OrcFile.WriterVersion.PRESTO_ORIGINAL));
assertTrue(OrcFile.WriterVersion.HIVE_12055.includes(
OrcFile.WriterVersion.TRINO_ORIGINAL));
assertTrue(OrcFile.WriterVersion.HIVE_12055.includes(
OrcFile.WriterVersion.CUDF_ORIGINAL));
}

@ParameterizedTest
Expand Down
4 changes: 4 additions & 0 deletions proto/orc_proto.proto
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
// 4 = Trino
// 5 = CUDF
optional uint32 writer = 9;

// information about the encryption in this file
Expand Down Expand Up @@ -432,6 +433,9 @@ message PostScript {
// Version of the Trino writer:
// 6 = original
//
// Version of the CUDF writer:
// 6 = original
//
optional uint32 writerVersion = 6;

// the number of bytes in the encrypted stripe statistics
Expand Down
1 change: 1 addition & 0 deletions site/specification/ORCv1.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
// 4 = Trino
// 5 = CUDF
optional uint32 writer = 9;
// information about the encryption in this file
optional Encryption encryption = 10;
Expand Down
1 change: 1 addition & 0 deletions site/specification/ORCv2.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
// 4 = Trino
// 5 = CUDF
optional uint32 writer = 9;
// information about the encryption in this file
optional Encryption encryption = 10;
Expand Down

0 comments on commit 3f29808

Please sign in to comment.