Snmg coo2csr debug #410

Merged: 12 commits, Jul 24, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -22,6 +22,7 @@
- PR #378 cugraph was unable to import device_of_gpu_pointer
- PR #384 Fixed bug in snmg coo2csr causing error in dask-cugraph tests.
- PR #382 Disabled vertex id check to allow Azure deployment
+- PR #410 Fixed overflow error in SNMG COO2CSR
- PR #395 run omp_ge_num_threads in a parallel context


11 changes: 6 additions & 5 deletions cpp/src/snmg/COO2CSR/COO2CSR.cu
@@ -68,7 +68,7 @@ void serializeMessage(cugraph::SNMGinfo& env, std::string message){

template<typename idx_t, typename val_t>
__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-findStartRange(idx_t n, idx_t* result, idx_t edgeCount, val_t* scanned) {
+findStartRange(idx_t n, idx_t* result, val_t edgeCount, val_t* scanned) {
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
if (scanned[i] < edgeCount && scanned[i + 1] >= edgeCount)
*result = i + 1;
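For context, findStartRange scans the prefix sum of per-vertex edge counts for the first index whose cumulative count crosses edgeCount; that index is the start of the thread's vertex range. A minimal host-side C++ analogue (a sketch mirroring the kernel's condition, not PR code):

// Host-side sketch of what findStartRange computes: given a prefix-sum
// array `scanned` with n+1 entries, return the first i such that
// scanned[i-1] < edgeCount <= scanned[i].
template <typename idx_t, typename val_t>
idx_t findStartRangeHost(idx_t n, val_t edgeCount, const val_t* scanned) {
  for (idx_t i = 0; i < n; ++i)
    if (scanned[i] < edgeCount && scanned[i + 1] >= edgeCount)
      return i + 1;  // same predicate each CUDA thread evaluates
  return 0;          // edgeCount falls before all entries
}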
@@ -190,7 +190,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
// Each thread searches the global source node counts prefix sum to find the start of its vertex ID range
idx_t myStartVertex = 0;
if (i != 0) {
-idx_t edgeCount = (globalEdgeCount / p) * i;
+unsigned long long int edgeCount = (globalEdgeCount / p) * i;
idx_t* vertexRangeStart;
ALLOC_TRY(&vertexRangeStart, sizeof(idx_t), nullptr);
dim3 nthreads, nblocks;
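The widened type matters because the target (globalEdgeCount / p) * i can exceed a 32-bit idx_t on large graphs even when each partition's share fits. A minimal sketch of that overflow class (illustrative values, not from the PR):

#include <cstdint>
#include <iostream>

int main() {
  // ~3B edges split across p = 4 GPUs: thread i = 3 targets 2.25B edges,
  // which wraps a signed 32-bit idx_t but fits an unsigned long long.
  long long globalEdgeCount = 3000000000LL;
  int p = 4, i = 3;
  int32_t narrow = static_cast<int32_t>((globalEdgeCount / p) * i);  // wraps negative
  unsigned long long wide =
      (static_cast<unsigned long long>(globalEdgeCount) / p) * i;    // 2250000000
  std::cout << narrow << " vs " << wide << "\n";
}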
@@ -213,7 +213,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
cudaCheckError();
#pragma omp barrier

-// Each thread determines how many edges it will have in it's partition
+// Each thread determines how many edges it will have in its partition
idx_t myEndVertex = part_offsets[i + 1];
unsigned long long int startEdge;
unsigned long long int endEdge;
@@ -245,6 +245,7 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
auto zippy = thrust::make_zip_iterator(thrust::make_tuple(cooRowTemp, cooColTemp));
thrust::sort(rmm::exec_policy(nullptr)->on(nullptr), zippy, zippy + size);
}
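For context, the zip-iterator sort above orders the COO (row, col) pairs lexicographically while keeping the two arrays aligned. A standalone sketch of the same idiom (host-side Thrust for brevity; the PR sorts device memory through an RMM execution policy):

#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <vector>

int main() {
  std::vector<int> row{2, 0, 1, 0};
  std::vector<int> col{1, 3, 0, 1};
  auto zippy = thrust::make_zip_iterator(thrust::make_tuple(row.begin(), col.begin()));
  thrust::sort(zippy, zippy + row.size());  // tuples compare lexicographically
  // row is now {0, 0, 1, 2} and col is {1, 3, 0, 1}
  return 0;
}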
+cudaDeviceSynchronize();
cudaCheckError();

// Each thread determines the count of rows it needs to transfer to each other thread
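The cudaDeviceSynchronize() added above forces the asynchronous thrust::sort work to complete so that any failure is visible when cudaCheckError() inspects the error state. The general idiom, as a sketch (the check macro here is an assumed minimal stand-in; cuGraph's actual cudaCheckError is defined elsewhere):

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK_LAST_CUDA()                                               \
  do {                                                                  \
    cudaError_t e = cudaGetLastError();                                 \
    if (e != cudaSuccess)                                               \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));  \
  } while (0)

void checkAfterAsyncWork() {
  // Block until all pending device work finishes; without this, an error
  // raised inside a not-yet-executed kernel cannot be observed.
  cudaDeviceSynchronize();
  CHECK_LAST_CUDA();
}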
@@ -255,15 +256,15 @@ gdf_error snmg_coo2csr_impl(size_t* part_offsets,
ALLOC_TRY(&endPositions, sizeof(idx_t) * (p - 1), nullptr);
for (int j = 0; j < p - 1; j++) {
idx_t endVertexId = part_offsets[j + 1];
-if (endVertexId < localMinId) {
+if (endVertexId <= localMinId) {
// Write out zero for this position
writeSingleValue<<<1, 256>>>(endPositions + j, static_cast<idx_t>(0));
}
else if (endVertexId >= localMaxId) {
// Write out size for this position
writeSingleValue<<<1, 256>>>(endPositions + j, size);
}
-else if (endVertexId >= localMinId && endVertexId < localMaxId) {
+else if (endVertexId > localMinId && endVertexId < localMaxId) {
dim3 nthreads, nblocks;
nthreads.x = min(size, static_cast<idx_t>(CUDA_MAX_KERNEL_THREADS));
nthreads.y = 1;
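With the corrected comparisons, the three branches above cover the cases exactly once: a boundary at or below the local minimum row id transfers nothing, one at or above the local maximum transfers everything, and only a boundary strictly inside the local range requires a device-side search. A host-side analogue of the same case split (hypothetical helper, names mirror the diff; a sketch, not PR code):

#include <algorithm>
#include <cstddef>
#include <vector>

// sortedRowIds holds this GPU's row ids in ascending order;
// localMinId/localMaxId are its smallest and largest row ids.
std::size_t splitPosition(int endVertexId, int localMinId, int localMaxId,
                          const std::vector<int>& sortedRowIds) {
  if (endVertexId <= localMinId)      // at or before our range (the '<=' fix)
    return 0;
  if (endVertexId >= localMaxId)      // at or past our range
    return sortedRowIds.size();
  // strictly inside (the '>' fix): first row id >= endVertexId
  return static_cast<std::size_t>(
      std::lower_bound(sortedRowIds.begin(), sortedRowIds.end(), endVertexId) -
      sortedRowIds.begin());
}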
236 changes: 236 additions & 0 deletions cpp/src/tests/snmg_coo2csr/snmg_coo2csr_test.cu
@@ -276,6 +276,242 @@ INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csr,
MGcoo2csr_Usecase("test/datasets/web-Google.mtx"),
MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx")));

class Tests_MGcoo2csrTrans: public ::testing::TestWithParam<MGcoo2csr_Usecase> {
public:
Tests_MGcoo2csrTrans() {
}
static void SetUpTestCase() {
}
static void TearDownTestCase() {
}
virtual void SetUp() {
}
virtual void TearDown() {
}

static std::vector<double> mgspmv_time;

template<typename idx_t, typename val_t>
void run_current_test(const MGcoo2csr_Usecase& param) {
const ::testing::TestInfo* const test_info =
::testing::UnitTest::GetInstance()->current_test_info();
std::stringstream ss;
std::string test_id = std::string(test_info->test_case_name()) + std::string(".")
+ std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)
+ std::string("_") + ss.str().c_str();
std::cout << test_id << "\n";
int m, k, nnz, n_gpus;
MM_typecode mc;
gdf_error status;

double t;

FILE* fpin = fopen(param.matrix_file.c_str(), "r");

if (!fpin) {
std::cout << "Could not open file: " << param.matrix_file << "\n";
FAIL();
}

ASSERT_EQ(mm_properties<int>(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n";
ASSERT_TRUE(mm_is_matrix(mc));
ASSERT_TRUE(mm_is_coordinate(mc));
ASSERT_FALSE(mm_is_complex(mc));
ASSERT_FALSE(mm_is_skew(mc));

// Allocate memory on host
std::vector<idx_t> cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m + 1);
std::vector<idx_t> degree_h(m, 0), degree_ref(m, 0);
std::vector<val_t> csrVal(nnz, 0.0);

// Read
ASSERT_EQ((mm_to_coo<int, int>(fpin, 1, nnz, &cooColInd[0], &cooRowInd[0], NULL, NULL)), 0) << "could not read matrix data" << "\n";
ASSERT_EQ(fclose(fpin), 0);
//ASSERT_EQ((coo_to_csr<int, val_t>(m, m, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL, &csrRowPtr[0], NULL, NULL, NULL)), 0) << "could not convert COO to CSR" << "\n";
std::vector<idx_t> cooRowInd_tmp(cooRowInd);
std::vector<idx_t> cooColInd_tmp(cooColInd);
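// Host-side reference: the coo2csr test helper (a test utility defined
// elsewhere) builds csrRowPtr/csrColInd from the COO pairs so the
// multi-GPU result can be checked against it.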
coo2csr(cooRowInd_tmp, cooColInd_tmp, csrRowPtr, csrColInd);

CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus));
std::vector<size_t> v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1), part_offset_r(n_gpus + 1);
void* comm1;

if (nnz < 1200000000) {
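// Single-GPU pass (one OpenMP thread); the nnz guard presumably keeps the
// whole graph within a single device's memory.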
#pragma omp parallel num_threads(1)
{
//omp_set_num_threads(n_gpus);
auto i = omp_get_thread_num();
auto p = omp_get_num_threads();
CUDA_RT_CALL(cudaSetDevice(i));

#ifdef SNMG_VERBOSE
#pragma omp master
{
std::cout << "Number of GPUs : "<< n_gpus <<std::endl;
std::cout << "Number of threads : "<< p <<std::endl;
}
#endif

gdf_column *csr_off = new gdf_column;
gdf_column *csr_ind = new gdf_column;
gdf_column *csr_val = new gdf_column;
gdf_column *col_off = new gdf_column;
gdf_column *col_ind = new gdf_column;
gdf_column *col_val = new gdf_column;
gdf_column *coo_row = new gdf_column;
gdf_column *coo_col = new gdf_column;
gdf_column *coo_val = new gdf_column;

#pragma omp barrier

//load a chunk of the graph on each GPU
load_csr_loc(csrRowPtr, csrColInd, csrVal,
v_loc,
e_loc, part_offset,
col_off,
col_ind, col_val);

//load a chunk of the graph on each GPU COO
load_coo_loc(cooRowInd, cooColInd, csrVal, coo_row, coo_col, coo_val);

t = omp_get_wtime();
status = gdf_snmg_coo2csr(&part_offset_r[0],
false,
&comm1,
coo_row,
coo_col,
coo_val,
csr_off,
csr_ind,
csr_val);

if (status != 0) {
std::cout << "Call to gdf_snmg_coo2csr failed: " << gdf_error_get_name(status) << "\n";
}
EXPECT_EQ(status, 0);
#pragma omp master
{
std::cout << "GPU time: " << omp_get_wtime() - t << "\n";
}

// Compare the results with those generated on the host
if (status == 0) {
EXPECT_EQ(part_offset[0], part_offset_r[0]);
EXPECT_EQ(part_offset[1], part_offset_r[1]);
EXPECT_TRUE(gdf_csr_equal<idx_t>(csr_off, csr_ind, col_off, col_ind));
}

gdf_col_delete(col_off);
gdf_col_delete(col_ind);
gdf_col_delete(col_val);
gdf_col_delete(csr_off);
gdf_col_delete(csr_ind);
gdf_col_delete(csr_val);
gdf_col_delete(coo_row);
gdf_col_delete(coo_col);
gdf_col_delete(coo_val);
}
}
if (n_gpus > 1)
{
// Only using the 4 fully connected GPUs on DGX1
if (n_gpus == 8)
n_gpus = 4;

#pragma omp parallel num_threads(n_gpus)
{
//omp_set_num_threads(n_gpus);
auto i = omp_get_thread_num();
auto p = omp_get_num_threads();
CUDA_RT_CALL(cudaSetDevice(i));

#ifdef SNMG_VERBOSE
#pragma omp master
{
std::cout << "Number of GPUs : "<< n_gpus <<std::endl;
std::cout << "Number of threads : "<< p <<std::endl;
}
#endif

gdf_column *csr_off = new gdf_column;
gdf_column *csr_ind = new gdf_column;
gdf_column *csr_val = new gdf_column;
gdf_column *col_off = new gdf_column;
gdf_column *col_ind = new gdf_column;
gdf_column *col_val = new gdf_column;
gdf_column *coo_row = new gdf_column;
gdf_column *coo_col = new gdf_column;
gdf_column *coo_val = new gdf_column;
#pragma omp barrier

//load a chunk of the graph on each GPU
load_csr_loc(csrRowPtr, csrColInd, csrVal,
v_loc,
e_loc, part_offset,
col_off,
col_ind, col_val);

//load a chunk of the graph on each GPU COO
load_coo_loc(cooRowInd, cooColInd, csrVal, coo_row, coo_col, coo_val);

t = omp_get_wtime();
status = gdf_snmg_coo2csr(&part_offset_r[0],
false,
&comm1,
coo_row,
coo_col,
coo_val,
csr_off,
csr_ind,
csr_val);
if (status != 0) {
std::cout << "Call to gdf_snmg_coo2csr failed: " << gdf_error_get_name(status) << "\n";
}
EXPECT_EQ(status, 0);
#pragma omp master
{
std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n";
}

// Compare the results with those generated on the host
if (status == 0) {
for (int j = 0; j < n_gpus + 1; j++)
EXPECT_EQ(part_offset[j], part_offset_r[j]);
EXPECT_TRUE(gdf_csr_equal<idx_t>(csr_off, csr_ind, col_off, col_ind));
}

gdf_col_delete(col_off);
gdf_col_delete(col_ind);
gdf_col_delete(col_val);
gdf_col_delete(csr_off);
gdf_col_delete(csr_ind);
gdf_col_delete(csr_val);
gdf_col_delete(coo_row);
gdf_col_delete(coo_col);
gdf_col_delete(coo_val);
}
}
std::cout << std::endl;
}
};

TEST_P(Tests_MGcoo2csrTrans, CheckInt32_floatmtx) {
run_current_test<int, float>(GetParam());
}

TEST_P(Tests_MGcoo2csrTrans, CheckInt32_doublemtx) {
run_current_test<int, double>(GetParam());
}

INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGcoo2csrTrans,
::testing::Values(MGcoo2csr_Usecase("test/datasets/karate.mtx"),
MGcoo2csr_Usecase("test/datasets/netscience.mtx"),
MGcoo2csr_Usecase("test/datasets/cit-Patents.mtx"),
MGcoo2csr_Usecase("test/datasets/webbase-1M.mtx"),
MGcoo2csr_Usecase("test/datasets/web-Google.mtx"),
MGcoo2csr_Usecase("test/datasets/wiki-Talk.mtx")));

class Tests_MGcoo2csr_hibench: public ::testing::TestWithParam<MGcoo2csr_Usecase> {
public:
Tests_MGcoo2csr_hibench() {