Merge pull request rapidsai#410 from jwyles/snmg_coo2csr_debug
Snmg coo2csr debug
afender authored Jul 24, 2019
2 parents e038d75 + 13c6582 commit bbda844
Showing 3 changed files with 243 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -22,6 +22,7 @@
- PR #378 cugraph was unable to import device_of_gpu_pointer
- PR #384 Fixed bug in snmg coo2csr causing error in dask-cugraph tests.
- PR #382 Disabled vertex id check to allow Azure deployment
+- PR #410 Fixed overflow error in SNMG COO2CSR
- PR #395 run omp_ge_num_threads in a parallel context


11 changes: 6 additions & 5 deletions cpp/src/snmg/COO2CSR/COO2CSR.cu
@@ -68,7 +68,7 @@ void serializeMessage(cugraph::SNMGinfo& env, std::string message){

template<typename idx_t, typename val_t>
__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-findStartRange(idx_t n, idx_t* result, idx_t edgeCount, val_t* scanned) {
+findStartRange(idx_t n, idx_t* result, val_t edgeCount, val_t* scanned) {
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
if (scanned[i] < edgeCount && scanned[i + 1] >= edgeCount)
*result = i + 1;
@@ -190,7 +190,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
// Each thread searches the global source node counts prefix sum to find the start of its vertex ID range
idx_t myStartVertex = 0;
if (i != 0) {
-idx_t edgeCount = (globalEdgeCount / p) * i;
+unsigned long long int edgeCount = (globalEdgeCount / p) * i;
idx_t* vertexRangeStart;
ALLOC_TRY(&vertexRangeStart, sizeof(idx_t), nullptr);
dim3 nthreads, nblocks;
@@ -213,7 +213,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
cudaCheckError();
#pragma omp barrier

-// Each thread determines how many edges it will have in it's partition
+// Each thread determines how many edges it will have in its partition
idx_t myEndVertex = part_offsets[i + 1];
unsigned long long int startEdge;
unsigned long long int endEdge;
@@ -245,6 +245,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
auto zippy = thrust::make_zip_iterator(thrust::make_tuple(cooRowTemp, cooColTemp));
thrust::sort(rmm::exec_policy(nullptr)->on(nullptr), zippy, zippy + size);
}
+cudaDeviceSynchronize();
cudaCheckError();

// Each thread determines the count of rows it needs to transfer to each other thread
@@ -255,15 +256,15 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
ALLOC_TRY(&endPositions, sizeof(idx_t) * (p - 1), nullptr);
for (int j = 0; j < p - 1; j++) {
idx_t endVertexId = part_offsets[j + 1];
-if (endVertexId < localMinId) {
+if (endVertexId <= localMinId) {
// Write out zero for this position
writeSingleValue<<<1, 256>>>(endPositions + j, static_cast<idx_t>(0));
}
else if (endVertexId >= localMaxId) {
// Write out size for this position
writeSingleValue<<<1, 256>>>(endPositions + j, size);
}
-else if (endVertexId >= localMinId && endVertexId < localMaxId) {
+else if (endVertexId > localMinId && endVertexId < localMaxId) {
dim3 nthreads, nblocks;
nthreads.x = min(size, static_cast<idx_t>(CUDA_MAX_KERNEL_THREADS));
nthreads.y = 1;
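
Editorial note (not part of the commit): the overflow addressed here occurs when the per-partition edge offset (globalEdgeCount / p) * i exceeds the range of the (typically 32-bit) index type it was previously stored in. The fix keeps the value in unsigned long long on the host and passes it to findStartRange as the same type as the scanned prefix-sum values it is compared against. The following is a minimal, self-contained sketch of the wrap-around, using hypothetical sizes that are not taken from the commit:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical sizes: ~3 billion global edges partitioned across 4 GPUs.
  unsigned long long globalEdgeCount = 3000000000ULL;
  int p = 4;  // number of partitions (one per GPU / OpenMP thread)
  int i = 3;  // rank of the last partition

  // Pre-fix behaviour: the offset is narrowed into a 32-bit index type and wraps.
  int32_t narrowed = static_cast<int32_t>((globalEdgeCount / p) * i);

  // Post-fix behaviour: the offset is kept in 64 bits and stays exact.
  unsigned long long wide = (globalEdgeCount / p) * i;

  std::cout << "32-bit edgeCount: " << narrowed << "\n";  // wrapped (negative) on typical platforms
  std::cout << "64-bit edgeCount: " << wide << "\n";      // 2250000000
  return 0;
}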
236 changes: 236 additions & 0 deletions cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu
@@ -276,6 +276,242 @@ INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csr,
MGcoo2csr_Usecase("test/datasets/web-Google.mtx"),
MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx")));

class Tests_MGcoo2csrTrans: public ::testing::TestWithParam<MGcoo2csr_Usecase> {
public:
Tests_MGcoo2csrTrans() {
}
static void SetupTestCase() {
}
static void TearDownTestCase() {
}
virtual void SetUp() {
}
virtual void TearDown() {
}

static std::vector<double> mgspmv_time;

template<typename idx_t, typename val_t>
void run_current_test(const MGcoo2csr_Usecase& param) {
const ::testing::TestInfo* const test_info =
::testing::UnitTest::GetInstance()->current_test_info();
std::stringstream ss;
std::string test_id = std::string(test_info->test_case_name()) + std::string(".")
+ std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)
+ std::string("_") + ss.str().c_str();
std::cout << test_id << "\n";
int m, k, nnz, n_gpus;
MM_typecode mc;
gdf_error status;

double t;

FILE* fpin = fopen(param.matrix_file.c_str(), "r");

if (!fpin) {
std::cout << "Could not open file: " << param.matrix_file << "\n";
FAIL();
}

ASSERT_EQ(mm_properties<int>(fpin, 1, &mc, &m, &k, &nnz),0)<< "could not read Matrix Market file properties"<< "\n";
ASSERT_TRUE(mm_is_matrix(mc));
ASSERT_TRUE(mm_is_coordinate(mc));
ASSERT_FALSE(mm_is_complex(mc));
ASSERT_FALSE(mm_is_skew(mc));

// Allocate memory on host
std::vector<idx_t> cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m + 1);
std::vector<idx_t> degree_h(m, 0.0), degree_ref(m, 0.0);
std::vector<val_t> csrVal(nnz, 0.0);

// Read
ASSERT_EQ( (mm_to_coo<int,int>(fpin, 1, nnz, &cooColInd[0], &cooRowInd[0], NULL, NULL)) , 0)<< "could not read matrix data"<< "\n";
ASSERT_EQ(fclose(fpin), 0);
//ASSERT_EQ( (coo_to_csr<int,val_t> (m, m, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL, &csrRowPtr[0], NULL, NULL, NULL)), 0) << "could not covert COO to CSR "<< "\n";
std::vector<idx_t> cooRowInd_tmp(cooRowInd);
std::vector<idx_t> cooColInd_tmp(cooColInd);
coo2csr(cooRowInd_tmp, cooColInd_tmp, csrRowPtr, csrColInd);

CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus));
std::vector<size_t> v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1), part_offset_r(n_gpus
+ 1);
void* comm1;

if (nnz < 1200000000) {
#pragma omp parallel num_threads(1)
{
//omp_set_num_threads(n_gpus);
auto i = omp_get_thread_num();
auto p = omp_get_num_threads();
CUDA_RT_CALL(cudaSetDevice(i));

#ifdef SNMG_VERBOSE
#pragma omp master
{
std::cout << "Number of GPUs : "<< n_gpus <<std::endl;
std::cout << "Number of threads : "<< p <<std::endl;
}
#endif

gdf_column *csr_off = new gdf_column;
gdf_column *csr_ind = new gdf_column;
gdf_column *csr_val = new gdf_column;
gdf_column *col_off = new gdf_column;
gdf_column *col_ind = new gdf_column;
gdf_column *col_val = new gdf_column;
gdf_column *coo_row = new gdf_column;
gdf_column *coo_col = new gdf_column;
gdf_column *coo_val = new gdf_column;

#pragma omp barrier

//load a chunk of the graph on each GPU
load_csr_loc(csrRowPtr, csrColInd, csrVal,
v_loc,
e_loc, part_offset,
col_off,
col_ind, col_val);

//load a chunk of the graph on each GPU COO
load_coo_loc(cooRowInd, cooColInd, csrVal, coo_row, coo_col, coo_val);

t = omp_get_wtime();
status = gdf_snmg_coo2csr(&part_offset_r[0],
false,
&comm1,
coo_row,
coo_col,
coo_val,
csr_off,
csr_ind,
csr_val);

if (status != 0) {
std::cout << "Call to gdf_snmg_coo2csr failed: " << gdf_error_get_name(status) << "\n";
}
EXPECT_EQ(status, 0);
#pragma omp master
{
std::cout << "GPU time: " << omp_get_wtime() - t << "\n";
}

// Compare the results with those generated on the host
if (status == 0) {
EXPECT_EQ(part_offset[0], part_offset_r[0]);
EXPECT_EQ(part_offset[1], part_offset_r[1]);
EXPECT_TRUE(gdf_csr_equal<idx_t>(csr_off, csr_ind, col_off, col_ind));
}

gdf_col_delete(col_off);
gdf_col_delete(col_ind);
gdf_col_delete(col_val);
gdf_col_delete(csr_off);
gdf_col_delete(csr_ind);
gdf_col_delete(csr_val);
gdf_col_delete(coo_row);
gdf_col_delete(coo_col);
gdf_col_delete(coo_val);
}
}
if (n_gpus > 1)
{
// Only using the 4 fully connected GPUs on DGX1
if (n_gpus == 8)
n_gpus = 4;

#pragma omp parallel num_threads(n_gpus)
{
//omp_set_num_threads(n_gpus);
auto i = omp_get_thread_num();
auto p = omp_get_num_threads();
CUDA_RT_CALL(cudaSetDevice(i));

#ifdef SNMG_VERBOSE
#pragma omp master
{
std::cout << "Number of GPUs : "<< n_gpus <<std::endl;
std::cout << "Number of threads : "<< p <<std::endl;
}
#endif

gdf_column *csr_off = new gdf_column;
gdf_column *csr_ind = new gdf_column;
gdf_column *csr_val = new gdf_column;
gdf_column *col_off = new gdf_column;
gdf_column *col_ind = new gdf_column;
gdf_column *col_val = new gdf_column;
gdf_column *coo_row = new gdf_column;
gdf_column *coo_col = new gdf_column;
gdf_column *coo_val = new gdf_column;
#pragma omp barrier

//load a chunk of the graph on each GPU
load_csr_loc(csrRowPtr, csrColInd, csrVal,
v_loc,
e_loc, part_offset,
col_off,
col_ind, col_val);

//load a chunk of the graph on each GPU COO
load_coo_loc(cooRowInd, cooColInd, csrVal, coo_row, coo_col, coo_val);

t = omp_get_wtime();
status = gdf_snmg_coo2csr(&part_offset_r[0],
false,
&comm1,
coo_row,
coo_col,
coo_val,
csr_off,
csr_ind,
csr_val);
if (status != 0) {
std::cout << "Call to gdf_snmg_coo2csr failed: " << gdf_error_get_name(status) << "\n";
}
EXPECT_EQ(status, 0);
#pragma omp master
{
std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n";
}

// Compare the results with those generated on the host
if (status == 0) {
for (int j = 0; j < n_gpus + 1; j++)
EXPECT_EQ(part_offset[j], part_offset_r[j]);
EXPECT_TRUE(gdf_csr_equal<idx_t>(csr_off, csr_ind, col_off, col_ind));
}

gdf_col_delete(col_off);
gdf_col_delete(col_ind);
gdf_col_delete(col_val);
gdf_col_delete(csr_off);
gdf_col_delete(csr_ind);
gdf_col_delete(csr_val);
gdf_col_delete(coo_row);
gdf_col_delete(coo_col);
gdf_col_delete(coo_val);
}
}
std::cout << std::endl;
}
};

TEST_P(Tests_MGcoo2csrTrans, CheckInt32_floatmtx) {
run_current_test<int, float>(GetParam());
}

TEST_P(Tests_MGcoo2csrTrans, CheckInt32_doublemtx) {
run_current_test<int, double>(GetParam());
}

INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csrTrans,
::testing::Values(MGcoo2csr_Usecase("test/datasets/karate.mtx"),
MGcoo2csr_Usecase("test/datasets/netscience.mtx"),
MGcoo2csr_Usecase("test/datasets/cit-Patents.mtx"),
MGcoo2csr_Usecase("test/datasets/webbase-1M.mtx"),
MGcoo2csr_Usecase("test/datasets/web-Google.mtx"),
MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx")));

class Tests_MGcoo2csr_hibench: public ::testing::TestWithParam<MGcoo2csr_Usecase> {
public:
Tests_MGcoo2csr_hibench() {
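
Editorial note (not part of the commit): the new test builds a host-side CSR reference with the project's coo2csr helper and compares it against the multi-GPU result via gdf_csr_equal. That helper is not shown in this diff; the sketch below is only a generic illustration of such a host COO-to-CSR conversion (a counting sort on row ids), under the assumptions that ids are zero-based and csrRowPtr is pre-sized to n_rows + 1. The project's actual helper may differ, for example by also sorting column indices within each row.

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative host COO -> CSR conversion (not the project's coo2csr).
template <typename idx_t>
void coo_to_csr_ref(const std::vector<idx_t>& cooRow,
                    const std::vector<idx_t>& cooCol,
                    std::vector<idx_t>& csrRowPtr,   // size n_rows + 1
                    std::vector<idx_t>& csrColInd) {
  const std::size_t nnz = cooRow.size();
  std::fill(csrRowPtr.begin(), csrRowPtr.end(), idx_t{0});

  // Count entries per row, then scan to obtain row offsets.
  for (std::size_t e = 0; e < nnz; ++e)
    ++csrRowPtr[cooRow[e] + 1];
  for (std::size_t r = 1; r < csrRowPtr.size(); ++r)
    csrRowPtr[r] += csrRowPtr[r - 1];

  // Scatter column indices into their row segments (input order preserved).
  std::vector<idx_t> next(csrRowPtr.begin(), csrRowPtr.end() - 1);
  csrColInd.resize(nnz);
  for (std::size_t e = 0; e < nnz; ++e)
    csrColInd[next[cooRow[e]]++] = cooCol[e];
}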
