[REVIEW] call cudaMemcpyAsync/cudaMemsetAsync in JNI [skip ci] #5913

Merged 3 commits on Aug 11, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -106,6 +106,7 @@
 - PR #5381 Change numpy usages to cupy in `10min.ipynb`
 - PR #5408 Update pyrrow and arrow-cpp to 0.17.1
 - PR #5366 Add benchmarks for cuIO writers
+- PR #5913 Call cudaMemcpyAsync/cudaMemsetAsync in JNI
 - PR #5405 Add Error message to `StringColumn.unary_operator`
 - PR #5424 Add python plumbing for `.str.character_tokenize`
 - PR #5420 Aligning signature of `Series.value_counts` to Pandas
36 changes: 30 additions & 6 deletions java/src/main/java/ai/rapids/cudf/Cuda.java
@@ -259,29 +259,53 @@ public void close() {
   static native void freePinned(long ptr) throws CudaException;
 
   /**
-   * Copies count bytes from the memory area pointed to by src to the memory area pointed to by
-   * dst.
-   * Calling cudaMemcpy() with dst and src pointers that do not
-   * match the direction of the copy results in an undefined behavior.
+   * Copies bytes between buffers using the default CUDA stream.
+   * The copy has completed when this returns, but the memory copy could overlap with
+   * operations occurring on other streams.
+   * Specifying pointers that do not match the copy direction results in undefined behavior.
    * @param dst - Destination memory address
    * @param src - Source memory address
    * @param count - Size in bytes to copy
    * @param kind - Type of transfer. {@link CudaMemcpyKind}
    */
   static void memcpy(long dst, long src, long count, CudaMemcpyKind kind) {
-    memcpy(dst, src, count, kind.getValue());
+    memcpy(dst, src, count, kind, DEFAULT_STREAM);
   }
 
-  private static native void memcpy(long dst, long src, long count, int kind) throws CudaException;
+  /**
+   * Copies bytes between buffers using the default CUDA stream.
+   * The copy has not necessarily completed when this returns, but the memory copy could
+   * overlap with operations occurring on other streams.
+   * Specifying pointers that do not match the copy direction results in undefined behavior.
+   * @param dst - Destination memory address
+   * @param src - Source memory address
+   * @param count - Size in bytes to copy
+   * @param kind - Type of transfer. {@link CudaMemcpyKind}
+   */
+  static void asyncMemcpy(long dst, long src, long count, CudaMemcpyKind kind) {
+    asyncMemcpy(dst, src, count, kind, DEFAULT_STREAM);
+  }
 
   /**
    * Sets count bytes starting at the memory area pointed to by dst, with value.
+   * The operation has completed when this returns, but it could overlap with operations occurring
+   * on other streams.
    * @param dst - Destination memory address
    * @param value - Byte value to set dst with
    * @param count - Size in bytes to set
    */
   public static native void memset(long dst, byte value, long count) throws CudaException;
 
+  /**
+   * Sets count bytes starting at the memory area pointed to by dst, with value.
+   * The operation has not necessarily completed when this returns, but it could overlap with
+   * operations occurring on other streams.
+   * @param dst - Destination memory address
+   * @param value - Byte value to set dst with
+   * @param count - Size in bytes to set
+   */
+  public static native void asyncMemset(long dst, byte value, long count) throws CudaException;
+
   /**
    * Get the id of the current device.
    * @return the id of the current device
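
The sync/async pairs above differ only in when the caller may assume the operation has completed. As a reference point, here is a minimal sketch (not cuDF code; helper names are illustrative) of the two contracts expressed directly against the CUDA runtime — per the Javadoc, `memcpy` behaves like `copy_sync` below, while `asyncMemcpy` behaves like `copy_async`, with the caller synchronizing the stream later:

```cpp
#include <cuda_runtime_api.h>

// Blocking variant: enqueue on `stream`, then wait for that stream only.
cudaError_t copy_sync(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
                      cudaStream_t stream) {
  cudaError_t err = cudaMemcpyAsync(dst, src, count, kind, stream);
  if (err != cudaSuccess) {
    return err;
  }
  return cudaStreamSynchronize(stream);
}

// Non-blocking variant: returns as soon as the copy is enqueued; dst must not be read
// (nor src reused) until the stream is synchronized or an event recorded on it completes.
cudaError_t copy_async(void *dst, const void *src, size_t count, cudaMemcpyKind kind,
                       cudaStream_t stream) {
  return cudaMemcpyAsync(dst, src, count, kind, stream);
}
```

The same split applies to `memset` versus `asyncMemset`, with cudaMemsetAsync in place of cudaMemcpyAsync.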
101 changes: 49 additions & 52 deletions java/src/main/native/src/CudaJni.cpp
@@ -35,7 +35,7 @@ void set_cudf_device(int device) {
  * If a cudf device has been specified then this ensures the calling thread
  * is using the same device.
  */
-void auto_set_device(JNIEnv* env) {
+void auto_set_device(JNIEnv *env) {
   if (Cudf_device != cudaInvalidDeviceId) {
     int device;
     cudaError_t cuda_status = cudaGetDevice(&device);
@@ -70,50 +70,48 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas
   }
 
   jobject info_obj = env->NewObject(info_class, ctor_id, (jlong)free, (jlong)total);
-  // No need to check for exceptions of null return value as we are just handing the object back to
-  // the JVM. which will handle throwing any exceptions that happened in the constructor.
+  // No need to check for exceptions of null return value as we are just handing the object back
+  // to the JVM which will handle throwing any exceptions that happened in the constructor.
   return info_obj;
-  } CATCH_STD(env, nullptr);
+  }
+  CATCH_STD(env, nullptr);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jclass, jlong size) {
   try {
     cudf::jni::auto_set_device(env);
-    void * ret = nullptr;
+    void *ret = nullptr;
     JNI_CUDA_TRY(env, 0, cudaMallocHost(&ret, size));
     return reinterpret_cast<jlong>(ret);
-  } CATCH_STD(env, 0);
+  }
+  CATCH_STD(env, 0);
 }
 
 JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) {
   try {
     cudf::jni::auto_set_device(env);
     JNI_CUDA_TRY(env, , cudaFreeHost(reinterpret_cast<void *>(ptr)));
-  } CATCH_STD(env, );
+  }
+  CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpy(JNIEnv *env, jclass, jlong jdst, jlong jsrc,
-                                                       jlong count, jint jkind) {
-  if (count == 0) {
-    return;
-  }
-  JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
-  JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlong dst, jbyte value,
+                                                       jlong count, jint kind) {
+  JNI_NULL_CHECK(env, dst, "dst memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    auto dst = reinterpret_cast<void*>(jdst);
-    auto src = reinterpret_cast<void*>(jsrc);
-    auto kind = static_cast<cudaMemcpyKind>(jkind);
-    JNI_CUDA_TRY(env, , cudaMemcpy(dst, src, count, kind));
-  } CATCH_STD(env, );
+    JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count));
+    JNI_CUDA_TRY(env, , cudaStreamSynchronize(0));
+  }
+  CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlong dst, jbyte value,
-                                                       jlong count, jint kind) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, jlong dst,
+                                                            jbyte value, jlong count, jint kind) {
   JNI_NULL_CHECK(env, dst, "dst memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    JNI_CUDA_TRY(env, , cudaMemset((void *)dst, value, count));
+    JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count));
   }
   CATCH_STD(env, );
 }
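
The rewritten `Java_ai_rapids_cudf_Cuda_memset` above shows the pattern the PR applies: enqueue the operation with the async API on the default stream, then synchronize that stream explicitly, instead of calling the implicitly synchronizing cudaMemset/cudaMemcpy. A minimal standalone sketch of the same pattern (illustrative helper, not cuDF code):

```cpp
#include <cuda_runtime_api.h>

cudaError_t blocking_memset(void *dst, int value, size_t count) {
  // cudaMemsetAsync's stream parameter defaults to stream 0 (the default stream) in C++.
  cudaError_t err = cudaMemsetAsync(dst, value, count);
  if (err != cudaSuccess) {
    return err;
  }
  // Block until the memset (and any earlier work on the default stream) has completed.
  return cudaStreamSynchronize(0);
}
```

Per the new Javadoc, this keeps the blocking contract of the old call while allowing the operation to overlap with work on other streams.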
@@ -142,7 +140,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, j
   try {
     if (Cudf_device != cudaInvalidDeviceId && dev != Cudf_device) {
       cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS,
-                                    "Cannot change device after RMM init");
+                                      "Cannot change device after RMM init");
     }
     JNI_CUDA_TRY(env, , cudaSetDevice(dev));
   }
@@ -156,7 +154,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_autoSetDevice(JNIEnv *env, jclas
   CATCH_STD(env, );
 }
 
-
 JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) {
   try {
     cudf::jni::auto_set_device(env);
@@ -165,8 +162,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) {
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv* env, jclass,
-                                                              jboolean isNonBlocking) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclass,
+                                                              jboolean isNonBlocking) {
   try {
     cudf::jni::auto_set_device(env);
     cudaStream_t stream = nullptr;
@@ -177,8 +174,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv* env, jclas
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv* env, jclass,
-                                                              jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclass, jlong jstream) {
   try {
     cudf::jni::auto_set_device(env);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -187,8 +183,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv* env, jclas
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv* env, jclass,
-                                                                jlong jstream, jlong jevent) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jclass, jlong jstream,
+                                                                jlong jevent) {
   try {
     cudf::jni::auto_set_device(env);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -198,8 +194,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv* env, jcl
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv* env, jclass,
-                                                                  jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, jclass,
+                                                                  jlong jstream) {
   try {
     cudf::jni::auto_set_device(env);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -208,8 +204,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv* env, j
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv* env, jclass,
-                                                             jboolean enableTiming, jboolean blockingSync) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass,
+                                                             jboolean enableTiming,
+                                                             jboolean blockingSync) {
   try {
     cudf::jni::auto_set_device(env);
     cudaEvent_t event = nullptr;
@@ -226,8 +223,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv* env, jclass
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv* env, jclass,
-                                                             jlong jevent) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass, jlong jevent) {
   try {
     cudf::jni::auto_set_device(env);
     auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -236,25 +232,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv* env, jclass
   CATCH_STD(env, );
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv* env, jclass,
-                                                               jlong jevent) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jclass, jlong jevent) {
   try {
     cudf::jni::auto_set_device(env);
     auto event = reinterpret_cast<cudaEvent_t>(jevent);
     auto result = cudaEventQuery(event);
     if (result == cudaSuccess) {
-        return true;
+      return true;
     } else if (result == cudaErrorNotReady) {
-        return false;
+      return false;
     } // else
     JNI_CUDA_TRY(env, false, result);
   }
   CATCH_STD(env, false);
   return false;
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv* env, jclass,
-                                                            jlong jevent, jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, jlong jevent,
+                                                            jlong jstream) {
   try {
     cudf::jni::auto_set_device(env);
     auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -264,8 +259,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv* env, jclass,
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv* env, jclass,
-                                                                 jlong jevent) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jclass,
+                                                                 jlong jevent) {
   try {
     cudf::jni::auto_set_device(env);
     auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -274,17 +269,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv* env, jc
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv* env, jclass,
-    jlong jdst, jlong jsrc, jlong count, jint jkind, jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jclass, jlong jdst,
+                                                               jlong jsrc, jlong count, jint jkind,
+                                                               jlong jstream) {
   if (count == 0) {
     return;
   }
   JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
   JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    auto dst = reinterpret_cast<void*>(jdst);
-    auto src = reinterpret_cast<void*>(jsrc);
+    auto dst = reinterpret_cast<void *>(jdst);
+    auto src = reinterpret_cast<void *>(jsrc);
     auto kind = static_cast<cudaMemcpyKind>(jkind);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
     JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream));
@@ -293,17 +289,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv* env, jcla
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv* env, jclass,
-    jlong jdst, jlong jsrc, jlong count, jint jkind, jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, jclass, jlong jdst,
+                                                                    jlong jsrc, jlong count,
+                                                                    jint jkind, jlong jstream) {
   if (count == 0) {
     return;
   }
   JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
   JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    auto dst = reinterpret_cast<void*>(jdst);
-    auto src = reinterpret_cast<void*>(jsrc);
+    auto dst = reinterpret_cast<void *>(jdst);
+    auto src = reinterpret_cast<void *>(jsrc);
     auto kind = static_cast<cudaMemcpyKind>(jkind);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
     JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream));
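
For context, the stream and event entry points touched above (createStream, streamWaitEvent, eventRecord, streamSynchronize) compose into stream-ordered pipelines. A hedged sketch of the equivalent composition in plain CUDA runtime calls (illustrative, not cuDF code):

```cpp
#include <cuda_runtime_api.h>

// Copy on one stream, then let a second stream wait on the copy via an event,
// without blocking the host. Error checks omitted for brevity.
void pipeline_sketch(void *dst, const void *src, size_t count) {
  cudaStream_t copy_stream, compute_stream;
  cudaEvent_t copied;
  cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&compute_stream, cudaStreamNonBlocking);
  cudaEventCreateWithFlags(&copied, cudaEventDisableTiming);

  cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, copy_stream);
  cudaEventRecord(copied, copy_stream);           // mark completion of the copy
  cudaStreamWaitEvent(compute_stream, copied, 0); // compute_stream waits; the host does not

  // ... enqueue work on compute_stream that consumes dst ...

  cudaStreamSynchronize(compute_stream); // block the host only when results are needed
  cudaEventDestroy(copied);
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(compute_stream);
}
```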