diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index 21adf99c2ccf78..a4e9ea83799b36 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -77,14 +77,65 @@ void testCudaTestVectorAdd01() { cudaFree(b_dev); cudaFree(c_dev); } -} // namespace jit -} // namespace torch -#else // USE_CUDA -namespace torch { -namespace jit { -void testCudaTestVectorAdd01() { } +static void testCudaTestVectorAdd02_impl(int N, int block_size) { + Buffer a_buf("a", kFloat32, {N}); + Buffer b_buf("b", kFloat32, {N}); + Tensor c = Compute( + "c", + { + {N, "N"}, + }, + [&](const Var& n) { return a_buf(n) + b_buf(n); }); + Schedule sch({c}); + const Var& n = c.arg(0); + Var n_outer; + Var n_inner; + c.SplitWithMask(n, block_size, true, &n_outer, &n_inner); + c.GPUExecConfig({n_outer}, {n_inner}); + Stmt stmt = sch.Lower(); + CudaCodeGen cuda_cg(stmt, c, a_buf, b_buf); + PaddedBuffer a_v(N); + PaddedBuffer b_v(N); + PaddedBuffer c_v(N); + PaddedBuffer c_ref(N); + + for (int i = 0; i < N; i++) { + a_v(i) = i; + b_v(i) = i * 3 + 7; + c_ref(i) = a_v(i) + b_v(i); + } + + // TODO: move gpu support into PaddedBuffer + float* a_dev = nullptr; + cudaMalloc(&a_dev, N * sizeof(float)); + float* b_dev = nullptr; + cudaMalloc(&b_dev, N * sizeof(float)); + float* c_dev = nullptr; + cudaMalloc(&c_dev, N * sizeof(float)); + cudaMemcpy(a_dev, a_v.data(), N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(b_dev, b_v.data(), N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(c_dev, c_v.data(), N * sizeof(float), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + cuda_cg(c_dev, a_dev, b_dev); + + cudaDeviceSynchronize(); + cudaMemcpy(c_v.data(), c_dev, N * sizeof(float), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + ExpectAllNear(c_v, c_ref, 1e-5); + + cudaFree(a_dev); + cudaFree(b_dev); + cudaFree(c_dev); } + +void testCudaTestVectorAdd02() { + testCudaTestVectorAdd02_impl(1024, 128); + testCudaTestVectorAdd02_impl(1030, 128); } +} // namespace jit +} // namespace torch #endif diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 76f55dae2ca51f..db57a2393118f7 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -73,7 +73,6 @@ namespace jit { _(LLVMBroadcastAdd) \ _(LLVMDynamicShapeAdd) \ _(LLVMBindDynamicShapeAdd) \ - _(CudaTestVectorAdd01) \ _(Cond01) \ _(ATen_cast_Float) \ _(ATennegInt) \ @@ -110,7 +109,9 @@ namespace jit { _(ATenleInt) \ _(ATenltInt) -#define TH_FORALL_TESTS_CUDA(_) +#define TH_FORALL_TESTS_CUDA(_) \ + _(CudaTestVectorAdd01) \ + _(CudaTestVectorAdd02) #define DECLARE_TENSOREXPR_TEST(name) void test##name(); TH_FORALL_TESTS(DECLARE_TENSOREXPR_TEST) diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 6831f1e5067537..6a6674eb23cc64 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -1,6 +1,6 @@ #include "torch/csrc/jit/tensorexpr/cuda_codegen.h" -#define DEBUG_PRINT 0 +#define DEBUG_PRINT 1 namespace torch { namespace jit {