Snmg coo2csr debug #410

Merged: 12 commits, Jul 24, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -22,6 +22,7 @@
- PR #378 cugraph was unable to import device_of_gpu_pointer
- PR #384 Fixed bug in snmg coo2csr causing error in dask-cugraph tests.
- PR #382 Disabled vertex id check to allow Azure deployment
+- PR #410 Fixed overflow error in SNMG COO2CSR
- PR #395 run omp_ge_num_threads in a parallel context


11 changes: 6 additions & 5 deletions cpp/src/snmg/COO2CSR/COO2CSR.cu
@@ -68,7 +68,7 @@ void serializeMessage(cugraph::SNMGinfo& env, std::string message){

template<typename idx_t, typename val_t>
__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-findStartRange(idx_t n, idx_t* result, idx_t edgeCount, val_t* scanned) {
+findStartRange(idx_t n, idx_t* result, val_t edgeCount, val_t* scanned) {
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
if (scanned[i] < edgeCount && scanned[i + 1] >= edgeCount)
*result = i + 1;
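For context, findStartRange scans the prefix sum of per-vertex edge counts for the first index whose cumulative count crosses edgeCount; that index is the start of the thread's vertex range. A minimal host-side C++ analogue (a sketch mirroring the kernel's condition, not PR code):

// Host-side sketch of what findStartRange computes: given a prefix-sum
// array `scanned` with n+1 entries, return the first i such that
// scanned[i-1] < edgeCount <= scanned[i].
template <typename idx_t, typename val_t>
idx_t findStartRangeHost(idx_t n, val_t edgeCount, const val_t* scanned) {
  for (idx_t i = 0; i < n; ++i)
    if (scanned[i] < edgeCount && scanned[i + 1] >= edgeCount)
      return i + 1;  // same predicate each CUDA thread evaluates
  return 0;          // edgeCount falls before all entries
}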
@@ -190,7 +190,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
// Each thread searches the global source node counts prefix sum to find the start of its vertex ID range
idx_t myStartVertex = 0;
if (i != 0) {
-idx_t edgeCount = (globalEdgeCount / p) * i;
+unsigned long long int edgeCount = (globalEdgeCount / p) * i;
idx_t* vertexRangeStart;
ALLOC_TRY(&vertexRangeStart, sizeof(idx_t), nullptr);
dim3 nthreads, nblocks;
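The widened type matters because the target (globalEdgeCount / p) * i can exceed a 32-bit idx_t on large graphs even when each partition's share fits. A minimal sketch of that overflow class (illustrative values, not from the PR):

#include <cstdint>
#include <iostream>

int main() {
  // ~3B edges split across p = 4 GPUs: thread i = 3 targets 2.25B edges,
  // which wraps a signed 32-bit idx_t but fits an unsigned long long.
  long long globalEdgeCount = 3000000000LL;
  int p = 4, i = 3;
  int32_t narrow = static_cast<int32_t>((globalEdgeCount / p) * i);  // wraps negative
  unsigned long long wide =
      (static_cast<unsigned long long>(globalEdgeCount) / p) * i;    // 2250000000
  std::cout << narrow << " vs " << wide << "\n";
}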
@@ -213,7 +213,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
cudaCheckError();
#pragma omp barrier

-// Each thread determines how many edges it will have in it's partition
+// Each thread determines how many edges it will have in its partition
idx_t myEndVertex = part_offsets[i + 1];
unsigned long long int startEdge;
unsigned long long int endEdge;
@@ -245,6 +245,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
auto zippy = thrust::make_zip_iterator(thrust::make_tuple(cooRowTemp, cooColTemp));
thrust::sort(rmm::exec_policy(nullptr)->on(nullptr), zippy, zippy + size);
}
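For context, the zip-iterator sort above orders the COO (row, col) pairs lexicographically while keeping the two arrays aligned. A standalone sketch of the same idiom (host-side Thrust for brevity; the PR sorts device memory through an RMM execution policy):

#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <vector>

int main() {
  std::vector<int> row{2, 0, 1, 0};
  std::vector<int> col{1, 3, 0, 1};
  auto zippy = thrust::make_zip_iterator(thrust::make_tuple(row.begin(), col.begin()));
  thrust::sort(zippy, zippy + row.size());  // tuples compare lexicographically
  // row is now {0, 0, 1, 2} and col is {1, 3, 0, 1}
  return 0;
}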
+cudaDeviceSynchronize();
cudaCheckError();

// Each thread determines the count of rows it needs to transfer to each other thread
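The cudaDeviceSynchronize() added above forces the asynchronous thrust::sort work to complete so that any failure is visible when cudaCheckError() inspects the error state. The general idiom, as a sketch (the check macro here is an assumed minimal stand-in; cuGraph's actual cudaCheckError is defined elsewhere):

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK_LAST_CUDA()                                               \
  do {                                                                  \
    cudaError_t e = cudaGetLastError();                                 \
    if (e != cudaSuccess)                                               \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));  \
  } while (0)

void checkAfterAsyncWork() {
  // Block until all pending device work finishes; without this, an error
  // raised inside a not-yet-executed kernel cannot be observed.
  cudaDeviceSynchronize();
  CHECK_LAST_CUDA();
}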
@@ -255,15 +256,15 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
ALLOC_TRY(&endPositions, sizeof(idx_t) * (p - 1), nullptr);
for (int j = 0; j < p - 1; j++) {
idx_t endVertexId = part_offsets[j + 1];
-if (endVertexId < localMinId) {
+if (endVertexId <= localMinId) {
// Write out zero for this position
writeSingleValue<<<1, 256>>>(endPositions + j, static_cast<idx_t>(0));
}
else if (endVertexId >= localMaxId) {
// Write out size for this position
writeSingleValue<<<1, 256>>>(endPositions + j, size);
}
-else if (endVertexId >= localMinId && endVertexId < localMaxId) {
+else if (endVertexId > localMinId && endVertexId < localMaxId) {
dim3 nthreads, nblocks;
nthreads.x = min(size, static_cast<idx_t>(CUDA_MAX_KERNEL_THREADS));
nthreads.y = 1;
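With the corrected comparisons, the three branches above cover the cases exactly once: a boundary at or below the local minimum row id transfers nothing, one at or above the local maximum transfers everything, and only a boundary strictly inside the local range requires a device-side search. A host-side analogue of the same case split (hypothetical helper, names mirror the diff; a sketch, not PR code):

#include <algorithm>
#include <cstddef>
#include <vector>

// sortedRowIds holds this GPU's row ids in ascending order;
// localMinId/localMaxId are its smallest and largest row ids.
std::size_t splitPosition(int endVertexId, int localMinId, int localMaxId,
                          const std::vector<int>& sortedRowIds) {
  if (endVertexId <= localMinId)      // at or before our range (the '<=' fix)
    return 0;
  if (endVertexId >= localMaxId)      // at or past our range
    return sortedRowIds.size();
  // strictly inside (the '>' fix): first row id >= endVertexId
  return static_cast<std::size_t>(
      std::lower_bound(sortedRowIds.begin(), sortedRowIds.end(), endVertexId) -
      sortedRowIds.begin());
}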
236 changes: 236 additions & 0 deletions cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu
@@ -276,6 +276,242 @@ INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csr,
MGcoo2csr_Usecase("test/datasets/web-Google.mtx"),
MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx")));

class Tests_MGcoo2csrTrans: public ::testing::TestWithParam<MGcoo2csr_Usecase> {
public:
Tests_MGcoo2csrTrans() {
}
static void SetUpTestCase() {
}
static void TearDownTestCase() {
}
virtual void SetUp() {
}
virtual void TearDown() {
}

static std::vector<double> mgspmv_time;

template<typename idx_t, typename val_t>
void run_current_test(const MGcoo2csr_Usecase& param) {
const ::testing::TestInfo* const test_info =
::testing::UnitTest::GetInstance()->current_test_info();
std::stringstream ss;
std::string test_id = std::string(test_info->test_case_name()) + std::string(".")
+ std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)
+ std::string("_") + ss.str().c_str();
std::cout << test_id << "\n";
int m, k, nnz, n_gpus;
MM_typecode mc;
gdf_error status;

double t;

FILE* fpin = fopen(param.matrix_file.c_str(), "r");

if (!fpin) {
std::cout << "Could not open file: " << param.matrix_file << "\n";
FAIL();
}

ASSERT_EQ(mm_properties<int>(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n";
ASSERT_TRUE(mm_is_matrix(mc));
ASSERT_TRUE(mm_is_coordinate(mc));
ASSERT_FALSE(mm_is_complex(mc));
ASSERT_FALSE(mm_is_skew(mc));

// Allocate memory on host
std::vector<idx_t> cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m + 1);
std::vector<idx_t> degree_h(m, 0), degree_ref(m, 0);
std::vector<val_t> csrVal(nnz, 0.0);

// Read
ASSERT_EQ((mm_to_coo<int, int>(fpin, 1, nnz, &cooColInd[0], &cooRowInd[0], NULL, NULL)), 0) << "could not read matrix data" << "\n";
ASSERT_EQ(fclose(fpin), 0);
//ASSERT_EQ((coo_to_csr<int, val_t>(m, m, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL, &csrRowPtr[0], NULL, NULL, NULL)), 0) << "could not convert COO to CSR" << "\n";
std::vector<idx_t> cooRowInd_tmp(cooRowInd);
std::vector<idx_t> cooColInd_tmp(cooColInd);
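// Host-side reference: the coo2csr test helper (a test utility defined
// elsewhere) builds csrRowPtr/csrColInd from the COO pairs so the
// multi-GPU result can be checked against it.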
coo2csr(cooRowInd_tmp, cooColInd_tmp, csrRowPtr, csrColInd);

CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus));
std::vector<size_t> v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1), part_offset_r(n_gpus + 1);
void* comm1;

if (nnz < 1200000000) {
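// Single-GPU pass (one OpenMP thread); the nnz guard presumably keeps the
// whole graph within a single device's memory.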
#pragma omp parallel num_threads(1)
{
//omp_set_num_threads(n_gpus);
auto i = omp_get_thread_num();
auto p = omp_get_num_threads();
CUDA_RT_CALL(cudaSetDevice(i));

#ifdef SNMG_VERBOSE
#pragma omp master
{
std::cout << "Number of GPUs : "<< n_gpus <<std::endl;
std::cout << "Number of threads : "<< p <<std::endl;
}
#endif

gdf_column *csr_off = new gdf_column;
gdf_column *csr_ind = new gdf_column;
gdf_column *csr_val = new gdf_column;
gdf_column *col_off = new gdf_column;
gdf_column *col_ind = new gdf_column;
gdf_column *col_val = new gdf_column;
gdf_column *coo_row = new gdf_column;
gdf_column *coo_col = new gdf_column;
gdf_column *coo_val = new gdf_column;

#pragma omp barrier

//load a chunk of the graph on each GPU
load_csr_loc(csrRowPtr, csrColInd, csrVal,
v_loc,
e_loc, part_offset,
col_off,
col_ind, col_val);

//load a chunk of the graph on each GPU COO
load_coo_loc(cooRowInd, cooColInd, csrVal, coo_row, coo_col, coo_val);

t = omp_get_wtime();
status = gdf_snmg_coo2csr(&part_offset_r[0],
false,
&comm1,
coo_row,
coo_col,
coo_val,
csr_off,
csr_ind,
csr_val);

if (status != 0) {
std::cout << "Call to gdf_snmg_coo2csr failed: " << gdf_error_get_name(status) << "\n";
}
EXPECT_EQ(status, 0);
#pragma omp master
{
std::cout << "GPU time: " << omp_get_wtime() - t << "\n";
}

// Compare the results with those generated on the host
if (status == 0) {
EXPECT_EQ(part_offset[0], part_offset_r[0]);
EXPECT_EQ(part_offset[1], part_offset_r[1]);
EXPECT_TRUE(gdf_csr_equal<idx_t>(csr_off, csr_ind, col_off, col_ind));
}

gdf_col_delete(col_off);
gdf_col_delete(col_ind);
gdf_col_delete(col_val);
gdf_col_delete(csr_off);
gdf_col_delete(csr_ind);
gdf_col_delete(csr_val);
gdf_col_delete(coo_row);
gdf_col_delete(coo_col);
gdf_col_delete(coo_val);
}
}
if (n_gpus > 1)
{
// Only using the 4 fully connected GPUs on DGX1
if (n_gpus == 8)
n_gpus = 4;

#pragma omp parallel num_threads(n_gpus)
{
//omp_set_num_threads(n_gpus);
auto i = omp_get_thread_num();
auto p = omp_get_num_threads();
CUDA_RT_CALL(cudaSetDevice(i));

#ifdef SNMG_VERBOSE
#pragma omp master
{
std::cout << "Number of GPUs : "<< n_gpus <<std::endl;
std::cout << "Number of threads : "<< p <<std::endl;
}
#endif

gdf_column *csr_off = new gdf_column;
gdf_column *csr_ind = new gdf_column;
gdf_column *csr_val = new gdf_column;
gdf_column *col_off = new gdf_column;
gdf_column *col_ind = new gdf_column;
gdf_column *col_val = new gdf_column;
gdf_column *coo_row = new gdf_column;
gdf_column *coo_col = new gdf_column;
gdf_column *coo_val = new gdf_column;
#pragma omp barrier

//load a chunk of the graph on each GPU
load_csr_loc(csrRowPtr, csrColInd, csrVal,
v_loc,
e_loc, part_offset,
col_off,
col_ind, col_val);

//load a chunk of the graph on each GPU COO
load_coo_loc(cooRowInd, cooColInd, csrVal, coo_row, coo_col, coo_val);

t = omp_get_wtime();
status = gdf_snmg_coo2csr(&part_offset_r[0],
false,
&comm1,
coo_row,
coo_col,
coo_val,
csr_off,
csr_ind,
csr_val);
if (status != 0) {
std::cout << "Call to gdf_snmg_coo2csr failed: " << gdf_error_get_name(status) << "\n";
}
EXPECT_EQ(status, 0);
#pragma omp master
{
std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n";
}

// Compare the results with those generated on the host
if (status == 0) {
for (int j = 0; j < n_gpus + 1; j++)
EXPECT_EQ(part_offset[j], part_offset_r[j]);
EXPECT_TRUE(gdf_csr_equal<idx_t>(csr_off, csr_ind, col_off, col_ind));
}

gdf_col_delete(col_off);
gdf_col_delete(col_ind);
gdf_col_delete(col_val);
gdf_col_delete(csr_off);
gdf_col_delete(csr_ind);
gdf_col_delete(csr_val);
gdf_col_delete(coo_row);
gdf_col_delete(coo_col);
gdf_col_delete(coo_val);
}
}
std::cout << std::endl;
}
};

TEST_P(Tests_MGcoo2csrTrans, CheckInt32_floatmtx) {
run_current_test<int, float>(GetParam());
}

TEST_P(Tests_MGcoo2csrTrans, CheckInt32_doublemtx) {
run_current_test<int, double>(GetParam());
}

INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csrTrans,
::testing::Values(MGcoo2csr_Usecase("test/datasets/karate.mtx"),
MGcoo2csr_Usecase("test/datasets/netscience.mtx"),
MGcoo2csr_Usecase("test/datasets/cit-Patents.mtx"),
MGcoo2csr_Usecase("test/datasets/webbase-1M.mtx"),
MGcoo2csr_Usecase("test/datasets/web-Google.mtx"),
MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx")));

class Tests_MGcoo2csr_hibench: public ::testing::TestWithParam<MGcoo2csr_Usecase> {
public:
Tests_MGcoo2csr_hibench() {