-
Notifications
You must be signed in to change notification settings - Fork 917
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add JNI support for converting Arrow buffers to CUDF ColumnVectors (#…
…7222) This adds the JNI layer needed to build up Arrow column vectors, which are just references to off-heap Arrow buffers, and then convert those into CUDF ColumnVectors by directly copying the Arrow data to the GPU. The way this works is you create an ArrowColumnBuilder for each column you need. You call addBatch for each separate Arrow buffer you want to add into that column and then you call buildAndPutOnDevice() on the builder. That causes the Arrow pointers to be passed into CUDF, where an Arrow Table with 1 column is created; that Arrow Table is passed into cudf::from_arrow, which returns a CUDF Table, and we grab the 1 column from that and return it. Note this only supports primitive types and Strings for now. List, Struct, Dictionary, and Decimal are not supported yet. Signed-off-by: Thomas Graves <[email protected]> Authors: - Thomas Graves (@tgravescs) Approvers: - Robert (Bobby) Evans (@revans2) - Jason Lowe (@jlowe) URL: #7222
- Loading branch information
Showing
5 changed files
with
574 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
113 changes: 113 additions & 0 deletions
113
java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
/* | ||
* | ||
* Copyright (c) 2021, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
*/ | ||
|
||
package ai.rapids.cudf; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.util.ArrayList; | ||
|
||
/** | ||
* Column builder from Arrow data. This builder takes in byte buffers referencing | ||
* Arrow data and allows efficient building of CUDF ColumnVectors from that Arrow data. | ||
* The caller can add multiple batches where each batch corresponds to Arrow data | ||
* and those batches get concatenated together after being converted to CUDF | ||
* ColumnVectors. | ||
* This currently only supports primitive types and Strings, Decimals and nested types | ||
* such as list and struct are not supported. | ||
*/ | ||
public final class ArrowColumnBuilder implements AutoCloseable { | ||
private DType type; | ||
private final ArrayList<ByteBuffer> data = new ArrayList<>(); | ||
private final ArrayList<ByteBuffer> validity = new ArrayList<>(); | ||
private final ArrayList<ByteBuffer> offsets = new ArrayList<>(); | ||
private final ArrayList<Long> nullCount = new ArrayList<>(); | ||
private final ArrayList<Long> rows = new ArrayList<>(); | ||
|
||
public ArrowColumnBuilder(HostColumnVector.DataType type) { | ||
this.type = type.getType(); | ||
} | ||
|
||
/** | ||
* Add an Arrow buffer. This API allows you to add multiple if you want them | ||
* combined into a single ColumnVector. | ||
* Note, this takes all data, validity, and offsets buffers, but they may not all | ||
* be needed based on the data type. The buffer should be null if its not used | ||
* for that type. | ||
* This API only supports primitive types and Strings, Decimals and nested types | ||
* such as list and struct are not supported. | ||
* @param rows - number of rows in this Arrow buffer | ||
* @param nullCount - number of null values in this Arrow buffer | ||
* @param data - ByteBuffer of the Arrow data buffer | ||
* @param validity - ByteBuffer of the Arrow validity buffer | ||
* @param offsets - ByteBuffer of the Arrow offsets buffer | ||
*/ | ||
public void addBatch(long rows, long nullCount, ByteBuffer data, ByteBuffer validity, | ||
ByteBuffer offsets) { | ||
this.rows.add(rows); | ||
this.nullCount.add(nullCount); | ||
this.data.add(data); | ||
this.validity.add(validity); | ||
this.offsets.add(offsets); | ||
} | ||
|
||
/** | ||
* Create the immutable ColumnVector, copied to the device based on the Arrow data. | ||
* @return - new ColumnVector | ||
*/ | ||
public final ColumnVector buildAndPutOnDevice() { | ||
int numBatches = rows.size(); | ||
ArrayList<ColumnVector> allVecs = new ArrayList<>(numBatches); | ||
ColumnVector vecRet; | ||
try { | ||
for (int i = 0; i < numBatches; i++) { | ||
allVecs.add(ColumnVector.fromArrow(type, rows.get(i), nullCount.get(i), | ||
data.get(i), validity.get(i), offsets.get(i))); | ||
} | ||
if (numBatches == 1) { | ||
vecRet = allVecs.get(0); | ||
} else if (numBatches > 1) { | ||
vecRet = ColumnVector.concatenate(allVecs.toArray(new ColumnVector[0])); | ||
} else { | ||
throw new IllegalStateException("Can't build a ColumnVector when no Arrow batches specified"); | ||
} | ||
} finally { | ||
// close the vectors that were concatenated | ||
if (numBatches > 1) { | ||
allVecs.forEach(cv -> cv.close()); | ||
} | ||
} | ||
return vecRet; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
// memory buffers owned outside of this | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "ArrowColumnBuilder{" + | ||
"type=" + type + | ||
", data=" + data + | ||
", validity=" + validity + | ||
", offsets=" + offsets + | ||
", nullCount=" + nullCount + | ||
", rows=" + rows + | ||
'}'; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.