diff --git a/CHANGELOG.md b/CHANGELOG.md index ca839e7ea3f..d1057c15013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ - PR #378 cugraph was unable to import device_of_gpu_pointer - PR #384 Fixed bug in snmg coo2csr causing error in dask-cugraph tests. - PR #382 Disabled vertex id check to allow Azure deployment +- PR #410 Fixed overflow error in SNMG COO2CSR - PR #395 run omp_ge_num_threads in a parallel context diff --git a/cpp/src/snmg/COO2CSR/COO2CSR.cu b/cpp/src/snmg/COO2CSR/COO2CSR.cu index 7247eb55587..8808b3e1286 100644 --- a/cpp/src/snmg/COO2CSR/COO2CSR.cu +++ b/cpp/src/snmg/COO2CSR/COO2CSR.cu @@ -68,7 +68,7 @@ void serializeMessage(cugraph::SNMGinfo& env, std::string message){ template __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) -findStartRange(idx_t n, idx_t* result, idx_t edgeCount, val_t* scanned) { +findStartRange(idx_t n, idx_t* result, val_t edgeCount, val_t* scanned) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) if (scanned[i] < edgeCount && scanned[i + 1] >= edgeCount) *result = i + 1; @@ -190,7 +190,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets, // Each thread searches the global source node counts prefix sum to find the start of its vertex ID range idx_t myStartVertex = 0; if (i != 0) { - idx_t edgeCount = (globalEdgeCount / p) * i; + unsigned long long int edgeCount = (globalEdgeCount / p) * i; idx_t* vertexRangeStart; ALLOC_TRY(&vertexRangeStart, sizeof(idx_t), nullptr); dim3 nthreads, nblocks; @@ -213,7 +213,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets, cudaCheckError(); #pragma omp barrier - // Each thread determines how many edges it will have in it's partition + // Each thread determines how many edges it will have in its partition idx_t myEndVertex = part_offsets[i + 1]; unsigned long long int startEdge; unsigned long long int endEdge; @@ -245,6 +245,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets, auto zippy = thrust::make_zip_iterator(thrust::make_tuple(cooRowTemp, cooColTemp)); thrust::sort(rmm::exec_policy(nullptr)->on(nullptr), zippy, zippy + size); } + cudaDeviceSynchronize(); cudaCheckError(); // Each thread determines the count of rows it needs to transfer to each other thread @@ -255,7 +256,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets, ALLOC_TRY(&endPositions, sizeof(idx_t) * (p - 1), nullptr); for (int j = 0; j < p - 1; j++) { idx_t endVertexId = part_offsets[j + 1]; - if (endVertexId < localMinId) { + if (endVertexId <= localMinId) { // Write out zero for this position writeSingleValue<<<1, 256>>>(endPositions + j, static_cast(0)); } @@ -263,7 +264,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets, // Write out size for this position writeSingleValue<<<1, 256>>>(endPositions + j, size); } - else if (endVertexId >= localMinId && endVertexId < localMaxId) { + else if (endVertexId > localMinId && endVertexId < localMaxId) { dim3 nthreads, nblocks; nthreads.x = min(size, static_cast(CUDA_MAX_KERNEL_THREADS)); nthreads.y = 1; diff --git a/cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu b/cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu index 5b9ec83ad3e..300e88686de 100644 --- a/cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu +++ b/cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu @@ -276,6 +276,242 @@ INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csr, MGcoo2csr_Usecase("test/datasets/web-Google.mtx"), MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx"))); +class Tests_MGcoo2csrTrans: public ::testing::TestWithParam { +public: + Tests_MGcoo2csrTrans() { + } + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + } + virtual void TearDown() { + } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGcoo2csr_Usecase& param) { + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file) + + std::string("_") + ss.str().c_str(); + std::cout << test_id << "\n"; + int m, k, nnz, n_gpus; + MM_typecode mc; + gdf_error status; + + double t; + + FILE* fpin = fopen(param.matrix_file.c_str(), "r"); + + if (!fpin) { + std::cout << "Could not open file: " << param.matrix_file << "\n"; + FAIL(); + } + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz),0)<< "could not read Matrix Market file properties"<< "\n"; + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m + 1); + std::vector degree_h(m, 0.0), degree_ref(m, 0.0); + std::vector csrVal(nnz, 0.0); + + // Read + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooColInd[0], &cooRowInd[0], NULL, NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ(fclose(fpin), 0); + //ASSERT_EQ( (coo_to_csr (m, m, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL, &csrRowPtr[0], NULL, NULL, NULL)), 0) << "could not covert COO to CSR "<< "\n"; + std::vector cooRowInd_tmp(cooRowInd); + std::vector cooColInd_tmp(cooColInd); + coo2csr(cooRowInd_tmp, cooColInd_tmp, csrRowPtr, csrColInd); + + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1), part_offset_r(n_gpus + + 1); + void* comm1; + + if (nnz < 1200000000) { +#pragma omp parallel num_threads(1) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <(csr_off, csr_ind, col_off, col_ind)); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(csr_off); + gdf_col_delete(csr_ind); + gdf_col_delete(csr_val); + gdf_col_delete(coo_row); + gdf_col_delete(coo_col); + gdf_col_delete(coo_val); + } + } + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + +#pragma omp parallel num_threads(n_gpus) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <(csr_off, csr_ind, col_off, col_ind)); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(csr_off); + gdf_col_delete(csr_ind); + gdf_col_delete(csr_val); + gdf_col_delete(coo_row); + gdf_col_delete(coo_col); + gdf_col_delete(coo_val); + } + } + std::cout << std::endl; + } +}; + +TEST_P(Tests_MGcoo2csrTrans, CheckInt32_floatmtx) { + run_current_test(GetParam()); +} + +TEST_P(Tests_MGcoo2csrTrans, CheckInt32_doublemtx) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csrTrans, + ::testing::Values(MGcoo2csr_Usecase("test/datasets/karate.mtx"), + MGcoo2csr_Usecase("test/datasets/netscience.mtx"), + MGcoo2csr_Usecase("test/datasets/cit-Patents.mtx"), + MGcoo2csr_Usecase("test/datasets/webbase-1M.mtx"), + MGcoo2csr_Usecase("test/datasets/web-Google.mtx"), + MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx"))); + class Tests_MGcoo2csr_hibench: public ::testing::TestWithParam { public: Tests_MGcoo2csr_hibench() {