From 184b9a7c180af9a0b5e2cfa04960c7afdcf79371 Mon Sep 17 00:00:00 2001 From: Abhishek Bhaumick Date: Thu, 24 Mar 2022 20:00:28 -0400 Subject: [PATCH 1/3] Added full coalescing to L1 atomics - best case coalescing of atomic operations - full CAM based search - integrated with DPRINTF with ATOMICS Flag --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 3 +- src/abstract_hardware_model.cc | 76 ++++++++++++++++++- src/abstract_hardware_model.h | 2 + src/gpgpu-sim/gpu-sim.cc | 5 ++ src/trace.h | 8 ++ src/trace_streams.tup | 1 + 6 files changed, 92 insertions(+), 3 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 425bc1690..90a635363 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -94,6 +94,7 @@ -gpgpu_shmem_num_banks 32 -gpgpu_shmem_limited_broadcast 0 -gpgpu_shmem_warp_parts 1 +-gpgpu_shmem_atomic_warp_parts 2 -gpgpu_coalesce_arch 70 # Volta has four schedulers per core @@ -204,5 +205,5 @@ # tracing functionality #-trace_enabled 1 -#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_components ATOMICS #-trace_sampling_core 0 \ No newline at end of file diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 30aee60c9..13a832dac 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -587,7 +587,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, // see the CUDA manual where it discusses coalescing rules before reading this unsigned segment_size = 0; - unsigned warp_parts = m_config->mem_warp_parts; + // unsigned warp_parts = m_config->mem_warp_parts; + // unsigned warp_parts = m_config->mem_atomic_warp_parts; // Use atomic_mem_warp_parts + unsigned warp_parts = m_config->mem_atomic_warp_parts; // Use atomic_mem_warp_parts + bool sector_segment_size = false; if (m_config->gpgpu_coalesce_arch >= 20 && @@ -603,6 +606,10 @@ void 
warp_inst_t::memory_coalescing_arch_atomic(bool is_write, sector_segment_size = true; } + // Segment size = full line if NOT segmented $ + // = segment size (16B) if segmented $ + // Sector size changed to 16 Bytes + // since we are coalescing threads @ 16 B granularity switch (data_size) { case 1: segment_size = 32; @@ -618,6 +625,9 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, } unsigned subwarp_size = m_config->warp_size / warp_parts; + static int debugCount; + std::stringstream ss; + for (unsigned subwarp = 0; subwarp < warp_parts; subwarp++) { std::map > subwarp_transactions; // each block addr maps to a list of transactions @@ -638,6 +648,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, assert(block_address == line_size_based_tag_func(addr + data_size - 1, segment_size)); + if (debugCount < 10) { + DPRINTF_RAW(ATOMICS, " @ %ld \n", addr); + } + + // Commented out for testing Atomic Coalescing Issue + // /* */ // Find a transaction that does not conflict with this thread's accesses bool new_transaction = true; std::list::iterator it; @@ -665,6 +681,20 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, assert(!info->bytes.test(idx + i)); info->bytes.set(idx + i); } + /* */ + } + + if (debugCount < 10) { + DPRINTF_RAW(ATOMICS, "--Start--\n"); + ss << "\t" << "Subwarp Txns : \n"; + for (auto item : subwarp_transactions) { + ss << "\t\t" << item.first << "-> \n"; + for (auto txn : item.second) { + ss << "\t\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; + } + } + DPRINTF_RAW(ATOMICS, ss.str().c_str()); + ss.str(""); } // step 2: reduce each transaction size, if possible @@ -675,15 +705,57 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, new_addr_type addr = t_list->first; const std::list &transaction_list = t_list->second; + if (debugCount < 10) { + ss << "\t" << "Txn List : " << addr << "\n"; + for (auto txn : transaction_list) { + ss << "\t\t" << txn.chunks << " " << 
txn.bytes << " " << txn.active << " \n"; + } + DPRINTF_RAW(ATOMICS, ss.str().c_str()); + ss.str(""); + } + std::list::const_iterator t; - for (t = transaction_list.begin(); t != transaction_list.end(); t++) { + std::list reduced_transaction_list; + // Special Coalescing for Atomics Conflict + for (t = transaction_list.begin(); t != transaction_list.end(); t++) { + const transaction_info &info = *t; + + // Find a coalescable transaction + bool newTxn = true; + for (auto& cTxn : reduced_transaction_list) { + if ((cTxn.chunks == info.chunks) + && (cTxn.bytes == info.bytes) + && ((cTxn.active & info.active) == 0)) { + // Squish the two transactions + cTxn.active = cTxn.active | info.active; + newTxn = false; + break; + } + } + if (newTxn) { + reduced_transaction_list.push_back(info); + } + } + + if (debugCount < 10) { + ss << "\t" << "RedTxn List : " << addr << "\n"; + for (auto txn : reduced_transaction_list) { + ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; + } + DPRINTF_RAW(ATOMICS, ss.str().c_str()); + ss.str(""); + } + + for (t = reduced_transaction_list.begin(); t != reduced_transaction_list.end(); t++) { // For each transaction const transaction_info &info = *t; memory_coalescing_arch_reduce_and_send(is_write, access_type, info, addr, segment_size); } + DPRINTF_RAW(ATOMICS, "--End--\n"); } } + ++ debugCount; } void warp_inst_t::memory_coalescing_arch_reduce_and_send( diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 35e28ca57..485af9b49 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -372,6 +372,7 @@ class core_config { return ((addr / WORD_SIZE) % num_shmem_bank); } unsigned mem_warp_parts; + unsigned mem_atomic_warp_parts; mutable unsigned gpgpu_shmem_size; char *gpgpu_shmem_option; std::vector shmem_opt_list; @@ -1070,6 +1071,7 @@ class warp_inst_t : public inst_t { mem_access_byte_mask_t bytes; active_mask_t active; // threads in this transaction + // Returns 
true if any of the bits between start_bit and end_bit are true bool test_bytes(unsigned start_bit, unsigned end_bit) { for (unsigned i = start_bit; i <= end_bit; i++) if (bytes.test(i)) return true; diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 56ede056c..7fcd08e6d 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -359,6 +359,11 @@ void shader_core_config::reg_options(class OptionParser *opp) { "Number of portions a warp is divided into for shared " "memory bank conflict check ", "2"); + option_parser_register(opp, "-gpgpu_shmem_atomic_warp_parts", OPT_INT32, + &mem_atomic_warp_parts, + "Number of portions an atomic warp is divided into for shared " + "memory bank conflict check ", + "2"); option_parser_register( opp, "-gpgpu_mem_unit_ports", OPT_INT32, &mem_unit_ports, "The number of memory transactions allowed per core cycle", "1"); diff --git a/src/trace.h b/src/trace.h index 8d7415177..d83af41e0 100644 --- a/src/trace.h +++ b/src/trace.h @@ -77,6 +77,14 @@ void init(); } \ } while (0) +#define DPRINTF_RAW(x, ...) 
\ + do { \ + if (DTRACE(x)) { \ + printf("%s : ", Trace::trace_streams_str[Trace::x]); \ + printf(__VA_ARGS__); \ + } \ + } while (0) + #else #define DTRACE(x) (false) diff --git a/src/trace_streams.tup b/src/trace_streams.tup index 074c7c880..aed7401ef 100644 --- a/src/trace_streams.tup +++ b/src/trace_streams.tup @@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type ) TS_TUP( MEMORY_SUBPARTITION_UNIT ), TS_TUP( INTERCONNECT ), TS_TUP( LIVENESS ), + TS_TUP( ATOMICS ), TS_TUP( NUM_TRACE_STREAMS ) TS_TUP_END( trace_streams_type ) From f5088238ae07a3743231d26758c25810f85cc7c4 Mon Sep 17 00:00:00 2001 From: Abhishek Bhaumick Date: Thu, 24 Mar 2022 20:17:52 -0400 Subject: [PATCH 2/3] Modified Atomics coalescing to match Volta V100 - replaced full CAM coalescing with common case coalescing - correlated with QV100 GPU --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 4 +- src/abstract_hardware_model.cc | 53 +++++++++++++++++-- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 90a635363..4ad607b80 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -204,6 +204,6 @@ -power_simulation_enabled 0 # tracing functionality -#-trace_enabled 1 -#-trace_components ATOMICS +-trace_enabled 1 +-trace_components ATOMICS #-trace_sampling_core 0 \ No newline at end of file diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 13a832dac..b550b2075 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -699,6 +699,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, // step 2: reduce each transaction size, if possible std::map >::iterator t_list; + for (t_list = subwarp_transactions.begin(); t_list != subwarp_transactions.end(); t_list++) { // For each block addr @@ -715,14 +716,15 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, 
} std::list::const_iterator t; - std::list reduced_transaction_list; + std::list coalesced_transaction_list; // Special Coalescing for Atomics Conflict for (t = transaction_list.begin(); t != transaction_list.end(); t++) { const transaction_info &info = *t; + // Option 1 : Full CAM based coalescing of global atomics txns // Find a coalescable transaction bool newTxn = true; - for (auto& cTxn : reduced_transaction_list) { + for (auto& cTxn : coalesced_transaction_list) { if ((cTxn.chunks == info.chunks) && (cTxn.bytes == info.bytes) && ((cTxn.active & info.active) == 0)) { @@ -732,12 +734,53 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, break; } } + if (newTxn) { + coalesced_transaction_list.push_back(info); + } + } + + if (debugCount < 10) { + ss << "\t" << "CoalTxn List : " << addr << "\n"; + for (auto txn : coalesced_transaction_list) { + ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; + } + DPRINTF_RAW(ATOMICS, ss.str().c_str()); + ss.str(""); + } + + // Option 2 : V100 Hardware correlated - only squish if 16 threads conflict on same address + std::list reduced_transaction_list; + active_mask_t subwarp_full_upper_mask = ((((unsigned long long) 1) << subwarp_size) - 1) << subwarp_size; + active_mask_t subwarp_full_lower_mask = (((unsigned long long) 1) << subwarp_size) - 1; + + // Place non-coalesced txns into final Txn List + for (t = transaction_list.begin(); t != transaction_list.end(); t++) { + const transaction_info &info = *t; + + bool newTxn = true; + for (auto& cTxn : coalesced_transaction_list) { + if ((cTxn.active == subwarp_full_upper_mask || cTxn.active == subwarp_full_lower_mask) + && (cTxn.chunks == info.chunks) + && (cTxn.bytes == info.bytes)) { + // No need to create any split transactions + newTxn = false; + break; + } + } + if (newTxn) { reduced_transaction_list.push_back(info); } } + + for (auto& cTxn : coalesced_transaction_list) { + if (cTxn.active == subwarp_full_upper_mask || cTxn.active == 
subwarp_full_lower_mask) { + reduced_transaction_list.push_back(cTxn); + } + } if (debugCount < 10) { + ss << " " << subwarp_full_upper_mask << " " << subwarp_full_lower_mask << "\n"; ss << "\t" << "RedTxn List : " << addr << "\n"; for (auto txn : reduced_transaction_list) { ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; @@ -752,8 +795,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, memory_coalescing_arch_reduce_and_send(is_write, access_type, info, addr, segment_size); } - DPRINTF_RAW(ATOMICS, "--End--\n"); + if (debugCount < 10) { + DPRINTF_RAW(ATOMICS, "--End--\n"); + } } + + } ++ debugCount; } From e42de8565f93dabf16f70105c4931396c909f4f6 Mon Sep 17 00:00:00 2001 From: Abhishek Bhaumick Date: Fri, 25 Mar 2022 23:21:36 -0400 Subject: [PATCH 3/3] Cleaned up debug messages, disabled tracing in config - added ATOMICS_DETAIL trace flag - made ATOMICS prints concise - disabled tracing and restored default trace flags in QV100 tested-cfgs --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 4 +- src/abstract_hardware_model.cc | 39 ++++++++++++------- src/trace_streams.tup | 1 + 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 10d2500ec..6c36b82dd 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -233,6 +233,6 @@ -visualizer_enabled 0 # tracing functionality --trace_enabled 1 --trace_components ATOMICS +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 \ No newline at end of file diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 7d3049dc4..e10a6e00b 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -631,6 +631,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, unsigned subwarp_size = m_config->warp_size / warp_parts; static 
int debugCount; + const int debugThreshold = 10; std::stringstream ss; for (unsigned subwarp = 0; subwarp < warp_parts; subwarp++) { @@ -653,9 +654,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, assert(block_address == line_size_based_tag_func(addr + data_size - 1, segment_size)); - if (debugCount < 10) { - DPRINTF_RAW(ATOMICS, " @ %ld \n", addr); - } + ss << " @ " << addr << ", "; // Commented out for testing Atomic Coalescing Issue // /* */ @@ -689,8 +688,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, /* */ } - if (debugCount < 10) { - DPRINTF_RAW(ATOMICS, "--Start--\n"); + ss << " \n"; + DPRINTF_RAW(ATOMICS, ss.str().c_str()); + ss.str(""); + + if (debugCount < debugThreshold) { + DPRINTF_RAW(ATOMICS_DETAIL, "--Start--\n"); ss << "\t" << "Subwarp Txns : \n"; for (auto item : subwarp_transactions) { ss << "\t\t" << item.first << "-> \n"; @@ -698,7 +701,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, ss << "\t\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; } } - DPRINTF_RAW(ATOMICS, ss.str().c_str()); + DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str()); ss.str(""); } @@ -711,12 +714,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, new_addr_type addr = t_list->first; const std::list &transaction_list = t_list->second; - if (debugCount < 10) { + if (debugCount < debugThreshold) { ss << "\t" << "Txn List : " << addr << "\n"; for (auto txn : transaction_list) { ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; } - DPRINTF_RAW(ATOMICS, ss.str().c_str()); + DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str()); ss.str(""); } @@ -744,12 +747,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, } } - if (debugCount < 10) { + if (debugCount < debugThreshold) { ss << "\t" << "CoalTxn List : " << addr << "\n"; for (auto txn : coalesced_transaction_list) { ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; } - 
DPRINTF_RAW(ATOMICS, ss.str().c_str()); + DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str()); ss.str(""); } @@ -784,24 +787,32 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, } } - if (debugCount < 10) { + if (debugCount < debugThreshold) { ss << " " << subwarp_full_upper_mask << " " << subwarp_full_lower_mask << "\n"; ss << "\t" << "RedTxn List : " << addr << "\n"; for (auto txn : reduced_transaction_list) { ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n"; } - DPRINTF_RAW(ATOMICS, ss.str().c_str()); + DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str()); ss.str(""); } + ss << " "; + ss << "SubWarp Txns " << subwarp_transactions.size() << " -> "; + ss << "Transactions " << transaction_list.size() << " -> "; + ss << "Coalesced Txns " << coalesced_transaction_list.size() << " -> "; + ss << "Reduced Txns " << reduced_transaction_list.size() << "\n"; + DPRINTF_RAW(ATOMICS, ss.str().c_str()); + ss.str(""); + for (t = reduced_transaction_list.begin(); t != reduced_transaction_list.end(); t++) { // For each transaction const transaction_info &info = *t; memory_coalescing_arch_reduce_and_send(is_write, access_type, info, addr, segment_size); } - if (debugCount < 10) { - DPRINTF_RAW(ATOMICS, "--End--\n"); + if (debugCount < debugThreshold) { + DPRINTF_RAW(ATOMICS_DETAIL, "--End--\n"); } } diff --git a/src/trace_streams.tup b/src/trace_streams.tup index aed7401ef..bcf008894 100644 --- a/src/trace_streams.tup +++ b/src/trace_streams.tup @@ -33,5 +33,6 @@ TS_TUP_BEGIN( trace_streams_type ) TS_TUP( INTERCONNECT ), TS_TUP( LIVENESS ), TS_TUP( ATOMICS ), + TS_TUP( ATOMICS_DETAIL ), TS_TUP( NUM_TRACE_STREAMS ) TS_TUP_END( trace_streams_type )