Skip to content

Commit

Permalink
Added method to remove null_masks if the column has no nulls (#9061)
Browse files Browse the repository at this point in the history
This PR adds a method that removes the validity vector from any column in a Table that contains no nulls but still carries a validity vector.

Authors:
  - Raza Jafri (https://github.com/razajafri)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - David Wendt (https://github.com/davidwendt)

URL: #9061
  • Loading branch information
razajafri authored Aug 20, 2021
1 parent 8c92812 commit 6cd0167
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 31 deletions.
56 changes: 55 additions & 1 deletion java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,39 @@ public ColumnView(DType type, long rows, Optional<Long> nullCount,
|| !nullCount.isPresent();
}

/**
 * Wraps buffers that already live on the device in a new column view. The buffers' ref counts
 * are not incremented and this view does not own any of them, so the view is only usable while
 * the buffers themselves stay valid. Closing the buffers before closing this ColumnView results
 * in undefined behavior.
 *
 * If ownership is needed, call {@link ColumnView#copyToColumnVector}
 *
 * This constructor is for non-nested types only; no child views are passed on.
 *
 * @param type the type of the vector, asserted to be non-nested
 * @param rows the number of rows in this vector.
 * @param nullCount the number of nulls in the dataset; when absent, UNKNOWN_NULL_COUNT is used.
 *                  A present value is asserted to be at most Integer.MAX_VALUE.
 * @param dataBuffer the buffer holding the column data. The ownership doesn't change on this
 *                   buffer
 * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0.
 *                       The ownership doesn't change on this buffer
 * @param offsetBuffer the offset buffer for columns that need an offset buffer
 */
public ColumnView(DType type, long rows, Optional<Long> nullCount,
                  BaseDeviceMemoryBuffer dataBuffer,
                  BaseDeviceMemoryBuffer validityBuffer,
                  BaseDeviceMemoryBuffer offsetBuffer) {
  this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(),
      dataBuffer, validityBuffer, offsetBuffer, null);
  assert !type.isNestedType();
  // Either no count was supplied, or the supplied count must fit in an int.
  assert !nullCount.isPresent() || nullCount.get() <= Integer.MAX_VALUE;
}

/**
* Internal constructor taking the null count as an int and an optional array of child views.
* The value produced by ColumnVector.initViewHandle is passed on to another constructor of
* this class.
*/
private ColumnView(DType type, long rows, int nullCount,
BaseDeviceMemoryBuffer dataBuffer, BaseDeviceMemoryBuffer validityBuffer,
BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) {
this(ColumnVector.initViewHandle(type, (int) rows, nullCount, dataBuffer, validityBuffer,
// NOTE(review): the next line looks like the pre-change form of this argument left over from
// the diff view (it streams over children without a null check) - confirm before relying on it.
offsetBuffer, Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray()));
// A null children array is mapped to an empty array of native child handles.
offsetBuffer, children == null ? new long[]{} :
Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray()));
}

/** Creates a ColumnVector from a column view handle
Expand Down Expand Up @@ -140,6 +168,32 @@ public final DType getType() {
return type;
}

/**
 * Collects a view of every child column of this view.
 * The caller is responsible for closing each of the returned views.
 * If obtaining any child view fails, the views created so far are closed before the
 * failure is rethrown.
 * @return one view per child column, or null when this view's type is not a nested type
 */
public final ColumnView[] getChildColumnViews() {
  final int childCount = getNumChildren();
  if (!getType().isNestedType()) {
    return null;
  }
  final ColumnView[] childViews = new ColumnView[childCount];
  try {
    int idx = 0;
    while (idx < childCount) {
      childViews[idx] = getChildColumnView(idx);
      idx++;
    }
  } catch (Throwable failure) {
    // Slots past the failing index are still null; close only what was actually created.
    for (int j = 0; j < childViews.length; j++) {
      if (childViews[j] != null) {
        childViews[j].close();
      }
    }
    throw failure;
  }
  return childViews;
}

/**
* Returns the child column view at a given index.
* Please note that it is the responsibility of the caller to close this view.
Expand Down
11 changes: 10 additions & 1 deletion java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,19 @@ public long getDeviceMemorySize() {
return total;
}

/**
 * Internal helper, exposed purely for testing purposes.
 * Passes the native handle of the given table to the native removeNullMasksIfNeeded call
 * and wraps the returned handles in a new Table.
 */
static Table removeNullMasksIfNeeded(Table table) {
  final long[] columnHandles = removeNullMasksIfNeeded(table.nativeHandle);
  return new Table(columnHandles);
}

/////////////////////////////////////////////////////////////////////////////
// NATIVE APIs
/////////////////////////////////////////////////////////////////////////////


/**
* Native counterpart of removeNullMasksIfNeeded(Table).
* @param tableView native handle of the table to process
* @return native handles from which the resulting Table is constructed
* @throws CudfException declared for failures reported by the native code
*/
private static native long[] removeNullMasksIfNeeded(long tableView) throws CudfException;

private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices);

private static native long[] partition(long inputTable, long partitionView,
Expand Down
63 changes: 61 additions & 2 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -929,13 +929,71 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results,
return combine_join_results(env, std::move(left_cols), std::move(right_cols));
}

/**
 * @brief Build a view of a column with redundant validity masks dropped.
 *
 * A null mask that is allocated although the column holds no nulls is omitted from the returned
 * view. Compound columns are handled recursively so the same rule is applied to every child.
 * Only views are constructed here; the returned view points at the same buffers as the input.
 *
 * @param column_view the column to inspect
 * @return a view over the same column, without a null mask wherever the null count is zero
 */
cudf::column_view remove_validity_from_col(cudf::column_view column_view) {
  // A mask is redundant only when one is allocated and it marks no row as null.
  bool const drop_mask = column_view.nullable() && column_view.null_count() == 0;

  if (!cudf::is_compound(column_view.type())) {
    if (!drop_mask) {
      return column_view;
    }
    // null_mask is allocated but no nulls present therefore we create a new column_view without
    // the null_mask to avoid things blowing up in reading the parquet file
    return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr,
                             0, column_view.offset());
  }

  // Compound column: rebuild the children first so the rule applies at every nesting level.
  std::vector<cudf::column_view> children;
  children.reserve(column_view.num_children());
  for (auto it = column_view.child_begin(); it != column_view.child_end(); ++it) {
    children.push_back(remove_validity_from_col(*it));
  }

  // Return by value: allocating the view on the heap and returning a copy of it would leak the
  // allocation on every call. The parent's own data pointer is forwarded unchanged.
  return cudf::column_view(column_view.type(), column_view.size(), column_view.head(),
                           drop_mask ? nullptr : column_view.null_mask(),
                           drop_mask ? 0 : column_view.null_count(), column_view.offset(),
                           children);
}

/**
 * @brief Apply remove_validity_from_col to every column of a table view.
 *
 * @param input_table_view the table view to process; it is dereferenced without a null check
 * @return a table view assembled from the per-column results, in the original column order
 */
cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) {
  auto const &source = *input_table_view;

  std::vector<cudf::column_view> stripped_columns;
  stripped_columns.reserve(source.num_columns());
  for (auto const &col : source) {
    stripped_columns.push_back(remove_validity_from_col(col));
  }

  return cudf::table_view(stripped_columns);
}

} // namespace

} // namespace jni
} // namespace cudf

extern "C" {

// This is a method purely added for testing remove_validity_if_needed method.
// It runs remove_validity_if_needed on the table view behind j_table_view, builds a cudf::table
// from the resulting view, and returns the released cudf::column pointers as a jlongArray.
// The caller takes ownership of the returned columns. Returns 0 if the handle is null or an
// exception is caught by CATCH_STD.
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv *env, jclass,
                                                                               jlong j_table_view) {
  JNI_NULL_CHECK(env, j_table_view, "table view handle is null", 0);
  try {
    // Select the device before doing any device work, as writeParquetChunk does.
    cudf::jni::auto_set_device(env);
    cudf::table_view *tview = reinterpret_cast<cudf::table_view *>(j_table_view);
    cudf::table_view result = cudf::jni::remove_validity_if_needed(tview);
    // Turn the view into a table that owns its columns so they can be handed over to Java.
    cudf::table m_tbl(result);
    std::vector<std::unique_ptr<cudf::column>> cols = m_tbl.release();
    auto results = cudf::jni::native_jlongArray(env, cols.size());
    int i = 0;
    for (auto &col : cols) {
      // release() gives up ownership; the raw pointer is now owned by the Java side.
      results[i++] = reinterpret_cast<jlong>(col.release());
    }
    return results.get_jArray();
  }
  CATCH_STD(env, 0);
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *env, jclass,
jlongArray j_cudf_columns) {
JNI_NULL_CHECK(env, j_cudf_columns, "columns are null", 0);
Expand Down Expand Up @@ -1357,7 +1415,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env,
JNI_NULL_CHECK(env, j_state, "null state", );

using namespace cudf::io;
cudf::table_view *tview = reinterpret_cast<cudf::table_view *>(j_table);
cudf::table_view *tview_with_empty_nullmask = reinterpret_cast<cudf::table_view *>(j_table);
cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_with_empty_nullmask);
cudf::jni::native_parquet_writer_handle *state =
reinterpret_cast<cudf::jni::native_parquet_writer_handle *>(j_state);

Expand All @@ -1367,7 +1426,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env,
}
try {
cudf::jni::auto_set_device(env);
state->writer->write(*tview);
state->writer->write(tview);
}
CATCH_STD(env, )
}
Expand Down
Loading

0 comments on commit 6cd0167

Please sign in to comment.