-
Notifications
You must be signed in to change notification settings - Fork 0
Optimisation Guide
kaanolgu edited this page Aug 3, 2023
·
1 revision
This page accompanies the paper (DOI to follow) and showcases the code optimisations that were made and their effect on the performance metrics. All test results were obtained with RMAT-19-32 using 4 compute units.
// Old code: both values are loaded through the same pointer, at idx and idx + 1.
unsigned int nodes_start = usm_nodes_start[idx];
unsigned int nodes_end = usm_nodes_start[idx +1 ];
// New code with split device pointers: the two values are loaded through two
// separate device_ptr wrappers. The second wrapper is based one element further
// on, so both loads use the same index idx.
// NOTE(review): this variant also adds `offset` to the base pointer, which the
// old code does not do - confirm that both snippets address the same elements.
device_ptr<unsigned int> DevicePtr_start(usm_nodes_start+offset);
device_ptr<unsigned int> DevicePtr_end(usm_nodes_start + 1+offset);
unsigned int nodes_start = DevicePtr_start[idx];
unsigned int nodes_end = DevicePtr_end[idx];
Vector version - execution time: 0.10s. Single version - execution time: 0.08s.
// Vector code: the three events e1, e2 and e3 are first gathered in a
// std::vector, which is then handed to single_task as its first argument.
std::vector<event> e_vec;
e_vec.push_back(e1);
e_vec.push_back(e2);
e_vec.push_back(e3);
auto e =q.single_task<class Task>( e_vec, [=](){
...
// Separate Events
auto e =q.single_task<class Task>( {e1,e2,e3}, [=]() {
...
Throughput: parallel_for > single_task > NDRange
// single_task variant: a single kernel invocation walks the whole mask in one
// loop (tid = 0 .. no_of_nodes - 1) and resets every non-zero entry to 0.
q.single_task<class SingleTask>( [=]() [[intel::kernel_args_restrict]] {
// Request an unroll factor of 16 for the loop below.
#pragma unroll 16
for(int tid =0; tid < no_of_nodes; tid++){
// Read the entry first and write back only when it is set.
unsigned int condition = usm_updating_mask[tid];
if(condition){
usm_updating_mask[tid]=0;
}
}
});
// parallel_for variant: one work-item per node over the 1-D range
// [0, no_of_nodes). The kernel lambda receives the work-item index as `tid`;
// without that parameter `tid` is undeclared and the kernel does not compile.
q.parallel_for<class ParallelFor>(no_of_nodes, [=](id<1> tid) [[intel::kernel_args_restrict]] {
// Read the entry first and write back only when it is set.
unsigned int condition = usm_updating_mask[tid];
if(condition){
usm_updating_mask[tid]=0;
}
});
// NDRange variant: work-groups of BLOCK_SIZE work-items.
int BLOCK_SIZE = 512;
// Round the global size up to the next multiple of BLOCK_SIZE so that it is
// evenly divisible by the work-group size.
int global_work_size = (no_of_nodes + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE;
range<1> gws (global_work_size);
range<1> lws (BLOCK_SIZE);
q.parallel_for<class NDRange>(nd_range<1>(gws, lws), [=] (nd_item<1> item) [[intel::kernel_args_restrict]] {
int gid = item.get_global_id();
// The rounded-up global size can exceed no_of_nodes; the surplus work-items
// must not touch the mask.
if(gid<no_of_nodes){
// Index with gid, the work-item index obtained above (the previous `tid`
// was never declared in this kernel).
unsigned int condition = usm_updating_mask[gid];
if(condition){
usm_updating_mask[gid]=0;
}
}
});