diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index c8f842fcc637..8efc4c6793ba 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -239,6 +239,9 @@ private static native long[] readJSON(String[] columnNames, String filePath, long address, long length, boolean dayFirst, boolean lines) throws CudfException; + private static native long readAndInferJSON(long address, long length, + boolean dayFirst, boolean lines) throws CudfException; + /** * Read in Parquet formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -918,6 +921,26 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon } } + /** + * Read JSON formatted data and infer the column names and schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. If not positive, all bytes after offset are parsed. + * @return the data parsed as a table on the GPU and the metadata for the table returned. + */ + public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, + long offset, long len) { + if (len <= 0) { + len = buffer.length - offset; + } + assert len > 0; + assert len <= buffer.length - offset; + assert offset >= 0 && offset < buffer.length; + return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, + opts.isDayFirst(), opts.isLines())); + } + /** * Read JSON formatted data. * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. diff --git a/java/src/main/java/ai/rapids/cudf/TableWithMeta.java b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java new file mode 100644 index 000000000000..9baa127d39da --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java @@ -0,0 +1,70 @@ +/* + * + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +package ai.rapids.cudf; + +/** + * A table along with some metadata about the table. This is typically returned when + * reading data from an input file where the metadata can be important. + */ +public class TableWithMeta implements AutoCloseable { + private long handle; + + TableWithMeta(long handle) { + this.handle = handle; + } + + /** + * Get the table out of this metadata. Note that this can only be called once. Later calls + * will return null. + */ + public Table releaseTable() { + long[] ptr = releaseTable(handle); + if (ptr == null) { + return null; + } else { + return new Table(ptr); + } + } + + /** + * Get the names of the top level columns. In the future new APIs can be added to get + * names of child columns. 
+ */ + public String[] getColumnNames() { + return getColumnNames(handle); + } + + /** + * Release the native resources behind this object. Calling it again afterwards is a no-op. + */ + @Override + public void close() { + if (handle != 0) { + close(handle); + handle = 0; + } + } + + private static native void close(long handle); + + private static native long[] releaseTable(long handle); + + private static native String[] getColumnNames(long handle); +} diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d511512431b8..ddf4e2138c8a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1314,6 +1314,77 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( CATCH_STD(env, NULL); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( + JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + + JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); + if (buffer_length <= 0) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0); + } + + try { + cudf::jni::auto_set_device(env); + + auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer), + static_cast<std::size_t>(buffer_length)}; + + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast<bool>(day_first)) + .lines(static_cast<bool>(lines)); + + auto result = + std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build())); + + return reinterpret_cast<jlong>(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv *env, jclass, jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", ); + + try { + cudf::jni::auto_set_device(env); + delete reinterpret_cast<cudf::io::table_with_metadata *>(handle); + } + CATCH_STD(env, ); +} + +JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_TableWithMeta_getColumnNames(JNIEnv *env, jclass, + jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", nullptr); + + try { + cudf::jni::auto_set_device(env); 
+ auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle); + auto length = ptr->metadata.column_names.size(); + auto ret = static_cast<jobjectArray>( + env->NewObjectArray(length, env->FindClass("java/lang/String"), nullptr)); + for (size_t i = 0; i < length; i++) { + env->SetObjectArrayElement(ret, i, env->NewStringUTF(ptr->metadata.column_names[i].c_str())); + } + + return ret; + } + CATCH_STD(env, nullptr); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv *env, jclass, + jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", nullptr); + + try { + cudf::jni::auto_set_device(env); + auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle); + if (ptr->tbl) { + return convert_table_for_return(env, ptr->tbl); + } else { + return nullptr; + } + } + CATCH_STD(env, nullptr); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {