-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement JNI for chunked ORC reader (#15446)
This adds JNI implementation for chunked ORC reader, allowing to read ORC files by an iterative manner. Depends on: * #15094 Closes #12228. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: #15446
- Loading branch information
Showing
4 changed files
with
352 additions
and
14 deletions.
There are no files selected for viewing
169 changes: 169 additions & 0 deletions
169
java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
/* | ||
* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
|
||
package ai.rapids.cudf; | ||
|
||
/** | ||
* Provide an interface for reading an ORC file in an iterative manner. | ||
*/ | ||
public class ORCChunkedReader implements AutoCloseable { | ||
static { | ||
NativeDepsLoader.loadNativeDeps(); | ||
} | ||
|
||
/** | ||
* Construct the reader instance from read limits, output row granularity, | ||
* and a file already loaded in a memory buffer. | ||
* | ||
* @param chunkReadLimit Limit on total number of bytes to be returned per read, | ||
* or 0 if there is no limit. | ||
* @param passReadLimit Limit on the amount of memory used by the chunked reader, | ||
* or 0 if there is no limit. | ||
* @param opts The options for ORC reading. | ||
* @param buffer Raw ORC file content. | ||
* @param offset The starting offset into buffer. | ||
* @param len The number of bytes to parse the given buffer. | ||
*/ | ||
public ORCChunkedReader(long chunkReadLimit, long passReadLimit, | ||
ORCOptions opts, HostMemoryBuffer buffer, long offset, long len) { | ||
handle = createReader(chunkReadLimit, passReadLimit, | ||
opts.getIncludeColumnNames(), buffer.getAddress() + offset, len, | ||
opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), | ||
opts.getDecimal128Columns()); | ||
if (handle == 0) { | ||
throw new IllegalStateException("Cannot create native chunked ORC reader object."); | ||
} | ||
} | ||
|
||
/** | ||
* Construct a chunked ORC reader instance, similar to | ||
* {@link ORCChunkedReader#ORCChunkedReader(long, long, ORCOptions, HostMemoryBuffer, long, long)}, | ||
* with an additional parameter to control the granularity of the output table. | ||
* When reading a chunk table, with respect to the given size limits, a subset of stripes may | ||
* be loaded, decompressed and decoded into a large intermediate table. The reader will then | ||
* subdivide that table into smaller tables for final output using | ||
* {@code outputRowSizingGranularity} as the subdivision step. If the chunked reader is | ||
* constructed without this parameter, the default value of 10k rows will be used. | ||
* | ||
* @param outputRowSizingGranularity The change step in number of rows in the output table. | ||
* @see ORCChunkedReader#ORCChunkedReader(long, long, ORCOptions, HostMemoryBuffer, long, long) | ||
*/ | ||
public ORCChunkedReader(long chunkReadLimit, long passReadLimit, long outputRowSizingGranularity, | ||
ORCOptions opts, HostMemoryBuffer buffer, long offset, long len) { | ||
handle = createReaderWithOutputGranularity(chunkReadLimit, passReadLimit, outputRowSizingGranularity, | ||
opts.getIncludeColumnNames(), buffer.getAddress() + offset, len, | ||
opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), | ||
opts.getDecimal128Columns()); | ||
if (handle == 0) { | ||
throw new IllegalStateException("Cannot create native chunked ORC reader object."); | ||
} | ||
} | ||
|
||
/** | ||
* Check if the given file has anything left to read. | ||
* | ||
* @return A boolean value indicating if there is more data to read from file. | ||
*/ | ||
public boolean hasNext() { | ||
if (handle == 0) { | ||
throw new IllegalStateException("Native chunked ORC reader object may have been closed."); | ||
} | ||
|
||
if (firstCall) { | ||
// This function needs to return true at least once, so an empty table | ||
// (but having empty columns instead of no column) can be returned by readChunk() | ||
// if the input file has no row. | ||
firstCall = false; | ||
return true; | ||
} | ||
return hasNext(handle); | ||
} | ||
|
||
/** | ||
* Read a chunk of rows in the given ORC file such that the returning data has total size | ||
* does not exceed the given read limit. If the given file has no data, or all data has been read | ||
* before by previous calls to this function, a null Table will be returned. | ||
* | ||
* @return A table of new rows reading from the given file. | ||
*/ | ||
public Table readChunk() { | ||
if (handle == 0) { | ||
throw new IllegalStateException("Native chunked ORC reader object may have been closed."); | ||
} | ||
|
||
long[] columnPtrs = readChunk(handle); | ||
return columnPtrs != null ? new Table(columnPtrs) : null; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
if (handle != 0) { | ||
close(handle); | ||
handle = 0; | ||
} | ||
} | ||
|
||
|
||
/** | ||
* Auxiliary variable to help {@link #hasNext()} returning true at least once. | ||
*/ | ||
private boolean firstCall = true; | ||
|
||
/** | ||
* Handle for memory address of the native ORC chunked reader class. | ||
*/ | ||
private long handle; | ||
|
||
/** | ||
* Create a native chunked ORC reader object on heap and return its memory address. | ||
* | ||
* @param chunkReadLimit Limit on total number of bytes to be returned per read, | ||
* or 0 if there is no limit. | ||
* @param passReadLimit Limit on the amount of memory used by the chunked reader, | ||
* or 0 if there is no limit. | ||
* @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. | ||
* @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. | ||
* @param length The length of the buffer to read from. | ||
* @param usingNumPyTypes Whether the parser should implicitly promote TIMESTAMP | ||
* columns to TIMESTAMP_MILLISECONDS for compatibility with NumPy. | ||
* @param timeUnit return type of TimeStamp in units | ||
* @param decimal128Columns name of the columns which are read as Decimal128 rather than Decimal64 | ||
*/ | ||
private static native long createReader(long chunkReadLimit, long passReadLimit, | ||
String[] filterColumnNames, long bufferAddrs, long length, | ||
boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns); | ||
|
||
/** | ||
* Create a native chunked ORC reader object, similar to | ||
* {@link ORCChunkedReader#createReader(long, long, String[], long, long, boolean, int, String[])}, | ||
* with an additional parameter to control the granularity of the output table. | ||
* | ||
* @param outputRowSizingGranularity The change step in number of rows in the output table. | ||
* @see ORCChunkedReader#createReader(long, long, String[], long, long, boolean, int, String[]) | ||
*/ | ||
private static native long createReaderWithOutputGranularity( | ||
long chunkReadLimit, long passReadLimit, long outputRowSizingGranularity, | ||
String[] filterColumnNames, long bufferAddrs, long length, | ||
boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns); | ||
|
||
private static native boolean hasNext(long handle); | ||
|
||
private static native long[] readChunk(long handle); | ||
|
||
private static native void close(long handle); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.