Merge pull request #5913 from rongou/jni-async
[REVIEW] call cudaMemcpyAsync/cudaMemsetAsync in JNI
jlowe authored Aug 11, 2020
2 parents b773c9e + c6bc78e commit ef1629c
Showing 3 changed files with 80 additions and 58 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -109,6 +109,7 @@
- PR #5381 Change numpy usages to cupy in `10min.ipynb`
- PR #5408 Update pyarrow and arrow-cpp to 0.17.1
- PR #5366 Add benchmarks for cuIO writers
- PR #5913 Call cudaMemcpyAsync/cudaMemsetAsync in JNI
- PR #5405 Add Error message to `StringColumn.unary_operator`
- PR #5424 Add python plumbing for `.str.character_tokenize`
- PR #5420 Aligning signature of `Series.value_counts` to Pandas
36 changes: 30 additions & 6 deletions java/src/main/java/ai/rapids/cudf/Cuda.java
@@ -259,29 +259,53 @@ public void close() {
static native void freePinned(long ptr) throws CudaException;

/**
* Copies count bytes from the memory area pointed to by src to the memory area pointed to by
* dst.
* Calling cudaMemcpy() with dst and src pointers that do not
* match the direction of the copy results in an undefined behavior.
* Copies bytes between buffers using the default CUDA stream.
* The copy has completed when this returns, but the memory copy could overlap with
* operations occurring on other streams.
* Specifying pointers that do not match the copy direction results in undefined behavior.
* @param dst - Destination memory address
* @param src - Source memory address
* @param count - Size in bytes to copy
* @param kind - Type of transfer. {@link CudaMemcpyKind}
*/
static void memcpy(long dst, long src, long count, CudaMemcpyKind kind) {
memcpy(dst, src, count, kind.getValue());
memcpy(dst, src, count, kind, DEFAULT_STREAM);
}

private static native void memcpy(long dst, long src, long count, int kind) throws CudaException;
/**
* Copies bytes between buffers using the default CUDA stream.
* The copy has not necessarily completed when this returns, but the memory copy could
* overlap with operations occurring on other streams.
* Specifying pointers that do not match the copy direction results in undefined behavior.
* @param dst - Destination memory address
* @param src - Source memory address
* @param count - Size in bytes to copy
* @param kind - Type of transfer. {@link CudaMemcpyKind}
*/
static void asyncMemcpy(long dst, long src, long count, CudaMemcpyKind kind) {
asyncMemcpy(dst, src, count, kind, DEFAULT_STREAM);
}
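
The pair of wrappers above is the heart of this change: memcpy keeps its blocking contract while asyncMemcpy may return before the copy lands. A minimal usage sketch, not part of the commit, assuming code living in the ai.rapids.cudf package (the wrappers are package-private), a HOST_TO_DEVICE constant on the linked CudaMemcpyKind enum, and placeholder addresses standing in for real allocations:

    package ai.rapids.cudf;

    /** Illustrative only: contrasts the synchronous and asynchronous copy wrappers. */
    class CopySketch {
      static void copy(long devDst, long hostSrc, long bytes) {
        // Blocking contract: the copy has completed when this returns, though
        // it may overlap with work on other streams.
        Cuda.memcpy(devDst, hostSrc, bytes, CudaMemcpyKind.HOST_TO_DEVICE);

        // Async contract: may return before the copy completes, so the caller
        // must synchronize the default stream before relying on devDst.
        Cuda.asyncMemcpy(devDst, hostSrc, bytes, CudaMemcpyKind.HOST_TO_DEVICE);
      }
    }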

/**
* Sets count bytes starting at the memory area pointed to by dst, with value.
* The operation has completed when this returns, but it could overlap with operations occurring
* on other streams.
* @param dst - Destination memory address
* @param value - Byte value to set dst with
* @param count - Size in bytes to set
*/
public static native void memset(long dst, byte value, long count) throws CudaException;

/**
* Sets count bytes starting at the memory area pointed to by dst, with value.
* The operation has not necessarily completed when this returns, but it could overlap with
* operations occurring on other streams.
* @param dst - Destination memory address
* @param value - Byte value to set dst with
* @param count - Size in bytes to set
*/
public static native void asyncMemset(long dst, byte value, long count) throws CudaException;
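
The memset pair follows the same contract split. A sketch under the same assumptions (package-private access, placeholder device address):

    package ai.rapids.cudf;

    /** Illustrative only: clears a device buffer with the two memset variants. */
    class MemsetSketch {
      static void zero(long devPtr, long bytes) {
        // Completed on return; the host may rely on the cleared bytes immediately.
        Cuda.memset(devPtr, (byte) 0, bytes);

        // Stream-ordered: later work on the default stream sees the cleared
        // bytes, but the host must synchronize before reading them back.
        Cuda.asyncMemset(devPtr, (byte) 0, bytes);
      }
    }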

/**
* Get the id of the current device.
* @return the id of the current device
101 changes: 49 additions & 52 deletions java/src/main/native/src/CudaJni.cpp
@@ -35,7 +35,7 @@ void set_cudf_device(int device) {
* If a cudf device has been specified then this ensures the calling thread
* is using the same device.
*/
void auto_set_device(JNIEnv* env) {
void auto_set_device(JNIEnv *env) {
if (Cudf_device != cudaInvalidDeviceId) {
int device;
cudaError_t cuda_status = cudaGetDevice(&device);
@@ -70,50 +70,48 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas
}

jobject info_obj = env->NewObject(info_class, ctor_id, (jlong)free, (jlong)total);
// No need to check for exceptions of null return value as we are just handing the object back to
// the JVM. which will handle throwing any exceptions that happened in the constructor.
// No need to check for exceptions of null return value as we are just handing the object back
// to the JVM which will handle throwing any exceptions that happened in the constructor.
return info_obj;
} CATCH_STD(env, nullptr);
}
CATCH_STD(env, nullptr);
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jclass, jlong size) {
try {
cudf::jni::auto_set_device(env);
void * ret = nullptr;
void *ret = nullptr;
JNI_CUDA_TRY(env, 0, cudaMallocHost(&ret, size));
return reinterpret_cast<jlong>(ret);
} CATCH_STD(env, 0);
}
CATCH_STD(env, 0);
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) {
try {
cudf::jni::auto_set_device(env);
JNI_CUDA_TRY(env, , cudaFreeHost(reinterpret_cast<void *>(ptr)));
} CATCH_STD(env, );
}
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpy(JNIEnv *env, jclass, jlong jdst, jlong jsrc,
jlong count, jint jkind) {
if (count == 0) {
return;
}
JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlong dst, jbyte value,
jlong count, jint kind) {
JNI_NULL_CHECK(env, dst, "dst memory pointer is null", );
try {
cudf::jni::auto_set_device(env);
auto dst = reinterpret_cast<void*>(jdst);
auto src = reinterpret_cast<void*>(jsrc);
auto kind = static_cast<cudaMemcpyKind>(jkind);
JNI_CUDA_TRY(env, , cudaMemcpy(dst, src, count, kind));
} CATCH_STD(env, );
JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count));
JNI_CUDA_TRY(env, , cudaStreamSynchronize(0));
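// The memset is enqueued on the default stream and then synchronized, keeping
// the Java-visible behavior blocking while letting the operation overlap with
// work on other (non-blocking) streams.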
}
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlong dst, jbyte value,
jlong count, jint kind) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, jlong dst,
jbyte value, jlong count, jint kind) {
JNI_NULL_CHECK(env, dst, "dst memory pointer is null", );
try {
cudf::jni::auto_set_device(env);
JNI_CUDA_TRY(env, , cudaMemset((void *)dst, value, count));
JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count));
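// No synchronize here: the async variant is fire-and-forget on the default
// stream, leaving completion ordering to the caller.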
}
CATCH_STD(env, );
}
@@ -142,7 +140,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, j
try {
if (Cudf_device != cudaInvalidDeviceId && dev != Cudf_device) {
cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS,
"Cannot change device after RMM init");
"Cannot change device after RMM init");
}
JNI_CUDA_TRY(env, , cudaSetDevice(dev));
}
@@ -156,7 +154,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_autoSetDevice(JNIEnv *env, jclas
CATCH_STD(env, );
}


JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) {
try {
cudf::jni::auto_set_device(env);
@@ -165,8 +162,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) {
CATCH_STD(env, );
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv* env, jclass,
jboolean isNonBlocking) {
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclass,
jboolean isNonBlocking) {
try {
cudf::jni::auto_set_device(env);
cudaStream_t stream = nullptr;
@@ -177,8 +174,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv* env, jclas
CATCH_STD(env, 0);
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv* env, jclass,
jlong jstream) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclass, jlong jstream) {
try {
cudf::jni::auto_set_device(env);
auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -187,8 +183,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv* env, jclas
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv* env, jclass,
jlong jstream, jlong jevent) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jclass, jlong jstream,
jlong jevent) {
try {
cudf::jni::auto_set_device(env);
auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -198,8 +194,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv* env, jcl
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv* env, jclass,
jlong jstream) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, jclass,
jlong jstream) {
try {
cudf::jni::auto_set_device(env);
auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -208,8 +204,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv* env, j
CATCH_STD(env, );
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv* env, jclass,
jboolean enableTiming, jboolean blockingSync) {
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass,
jboolean enableTiming,
jboolean blockingSync) {
try {
cudf::jni::auto_set_device(env);
cudaEvent_t event = nullptr;
@@ -226,8 +223,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv* env, jclass
CATCH_STD(env, 0);
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv* env, jclass,
jlong jevent) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass, jlong jevent) {
try {
cudf::jni::auto_set_device(env);
auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -236,25 +232,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv* env, jclass
CATCH_STD(env, );
}

JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv* env, jclass,
jlong jevent) {
JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jclass, jlong jevent) {
try {
cudf::jni::auto_set_device(env);
auto event = reinterpret_cast<cudaEvent_t>(jevent);
auto result = cudaEventQuery(event);
if (result == cudaSuccess) {
return true;
return true;
} else if (result == cudaErrorNotReady) {
return false;
return false;
} // else
JNI_CUDA_TRY(env, false, result);
}
CATCH_STD(env, false);
return false;
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv* env, jclass,
jlong jevent, jlong jstream) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, jlong jevent,
jlong jstream) {
try {
cudf::jni::auto_set_device(env);
auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -264,8 +259,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv* env, jclass,
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv* env, jclass,
jlong jevent) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jclass,
jlong jevent) {
try {
cudf::jni::auto_set_device(env);
auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -274,17 +269,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv* env, jc
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv* env, jclass,
jlong jdst, jlong jsrc, jlong count, jint jkind, jlong jstream) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jclass, jlong jdst,
jlong jsrc, jlong count, jint jkind,
jlong jstream) {
if (count == 0) {
return;
}
JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
try {
cudf::jni::auto_set_device(env);
auto dst = reinterpret_cast<void*>(jdst);
auto src = reinterpret_cast<void*>(jsrc);
auto dst = reinterpret_cast<void *>(jdst);
auto src = reinterpret_cast<void *>(jsrc);
auto kind = static_cast<cudaMemcpyKind>(jkind);
auto stream = reinterpret_cast<cudaStream_t>(jstream);
JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream));
@@ -293,17 +289,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv* env, jcla
CATCH_STD(env, );
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv* env, jclass,
jlong jdst, jlong jsrc, jlong count, jint jkind, jlong jstream) {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, jclass, jlong jdst,
jlong jsrc, jlong count,
jint jkind, jlong jstream) {
if (count == 0) {
return;
}
JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
try {
cudf::jni::auto_set_device(env);
auto dst = reinterpret_cast<void*>(jdst);
auto src = reinterpret_cast<void*>(jsrc);
auto dst = reinterpret_cast<void *>(jdst);
auto src = reinterpret_cast<void *>(jsrc);
auto kind = static_cast<cudaMemcpyKind>(jkind);
auto stream = reinterpret_cast<cudaStream_t>(jstream);
JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream));
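// Enqueue the copy on the caller-supplied stream and return without waiting;
// per the asyncMemcpy javadoc, completion ordering is left to the caller.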
