From 184b9a7c180af9a0b5e2cfa04960c7afdcf79371 Mon Sep 17 00:00:00 2001
From: Abhishek Bhaumick <a.bhaumick@live.in>
Date: Thu, 24 Mar 2022 20:00:28 -0400
Subject: [PATCH 1/3] Added full coalescing to L1 atomics

- best case coalescing of atomic operations - full CAM based search
- integrated with DPRINTF with ATOMICS Flag
---
 configs/tested-cfgs/SM7_QV100/gpgpusim.config |  3 +-
 src/abstract_hardware_model.cc                | 76 ++++++++++++++++++-
 src/abstract_hardware_model.h                 |  2 +
 src/gpgpu-sim/gpu-sim.cc                      |  5 ++
 src/trace.h                                   |  8 ++
 src/trace_streams.tup                         |  1 +
 6 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index 425bc1690..90a635363 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -94,6 +94,7 @@
 -gpgpu_shmem_num_banks 32
 -gpgpu_shmem_limited_broadcast 0
 -gpgpu_shmem_warp_parts 1
+-gpgpu_shmem_atomic_warp_parts 2
 -gpgpu_coalesce_arch 70
 
 # Volta has four schedulers per core
@@ -204,5 +205,5 @@
 
 # tracing functionality
 #-trace_enabled 1
-#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_components ATOMICS
 #-trace_sampling_core 0
\ No newline at end of file
diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc
index 30aee60c9..13a832dac 100644
--- a/src/abstract_hardware_model.cc
+++ b/src/abstract_hardware_model.cc
@@ -587,7 +587,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
 
   // see the CUDA manual where it discusses coalescing rules before reading this
   unsigned segment_size = 0;
-  unsigned warp_parts = m_config->mem_warp_parts;
+  // unsigned warp_parts = m_config->mem_warp_parts;
+  // unsigned warp_parts = m_config->mem_atomic_warp_parts;   //  Use atomic_mem_warp_parts
+  unsigned warp_parts = m_config->mem_atomic_warp_parts;   //  Use atomic_mem_warp_parts
+
   bool sector_segment_size = false;
 
   if (m_config->gpgpu_coalesce_arch >= 20 &&
@@ -603,6 +606,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
     sector_segment_size = true;
   }
 
+  //  Segment size = full line if NOT segmented cache
+  //               = segment size (16B) if segmented cache
+  //  Sector size changed to 16 Bytes
+  //  since we are coalescing threads @ 16 B granularity
   switch (data_size) {
     case 1:
       segment_size = 32;
@@ -618,6 +625,9 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
   }
   unsigned subwarp_size = m_config->warp_size / warp_parts;
 
+  static int debugCount;
+  std::stringstream ss;
+
   for (unsigned subwarp = 0; subwarp < warp_parts; subwarp++) {
     std::map<new_addr_type, std::list<transaction_info> >
         subwarp_transactions;  // each block addr maps to a list of transactions
@@ -638,6 +648,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
       assert(block_address ==
              line_size_based_tag_func(addr + data_size - 1, segment_size));
 
+      if (debugCount < 10) {
+        DPRINTF_RAW(ATOMICS, " @ %ld \n", addr);
+      }
+
+      //  Commented out for testing Atomic Coalescing Issue 
+      // /* */
       // Find a transaction that does not conflict with this thread's accesses
       bool new_transaction = true;
       std::list<transaction_info>::iterator it;
@@ -665,6 +681,20 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
         assert(!info->bytes.test(idx + i));
         info->bytes.set(idx + i);
       }
+      /* */
+    }
+
+    if (debugCount < 10) {
+      DPRINTF_RAW(ATOMICS, "--Start--\n");
+      ss << "\t" << "Subwarp Txns : \n";
+      for (auto item : subwarp_transactions) {
+        ss << "\t\t" << item.first << "-> \n";
+        for (auto txn : item.second) {
+          ss << "\t\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
+        }
+      }
+      DPRINTF_RAW(ATOMICS, ss.str().c_str());
+      ss.str("");
     }
 
     // step 2: reduce each transaction size, if possible
@@ -675,15 +705,57 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
       new_addr_type addr = t_list->first;
       const std::list<transaction_info> &transaction_list = t_list->second;
 
+      if (debugCount < 10) {
+        ss << "\t" << "Txn List : " << addr << "\n";
+        for (auto txn : transaction_list) {
+          ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
+        }
+        DPRINTF_RAW(ATOMICS, ss.str().c_str());
+        ss.str("");
+      }
+
       std::list<transaction_info>::const_iterator t;
-      for (t = transaction_list.begin(); t != transaction_list.end(); t++) {
+      std::list<transaction_info> reduced_transaction_list;
+      //  Special Coalescing for Atomics Conflict
+      for (t = transaction_list.begin(); t !=  transaction_list.end(); t++) {
+        const transaction_info &info = *t;
+
+        // Find a coalescable transaction
+        bool newTxn = true;
+        for (auto& cTxn : reduced_transaction_list) {
+          if ((cTxn.chunks == info.chunks)
+              && (cTxn.bytes == info.bytes)
+              && ((cTxn.active & info.active) == 0)) {
+            // Squish the two transactions
+            cTxn.active = cTxn.active | info.active;
+            newTxn = false;
+            break;
+          }
+        }
+        if (newTxn) {
+          reduced_transaction_list.push_back(info);
+        }
+      }
+      
+      if (debugCount < 10) {
+        ss << "\t" << "RedTxn List : " << addr << "\n";
+        for (auto txn : reduced_transaction_list) {
+          ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
+        }
+        DPRINTF_RAW(ATOMICS, ss.str().c_str());
+        ss.str("");
+      }
+
+      for (t = reduced_transaction_list.begin(); t != reduced_transaction_list.end(); t++) {
         // For each transaction
         const transaction_info &info = *t;
         memory_coalescing_arch_reduce_and_send(is_write, access_type, info,
                                                addr, segment_size);
       }
+      DPRINTF_RAW(ATOMICS, "--End--\n");
     }
   }
+  ++ debugCount;
 }
 
 void warp_inst_t::memory_coalescing_arch_reduce_and_send(
diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h
index 35e28ca57..485af9b49 100644
--- a/src/abstract_hardware_model.h
+++ b/src/abstract_hardware_model.h
@@ -372,6 +372,7 @@ class core_config {
     return ((addr / WORD_SIZE) % num_shmem_bank);
   }
   unsigned mem_warp_parts;
+  unsigned mem_atomic_warp_parts;
   mutable unsigned gpgpu_shmem_size;
   char *gpgpu_shmem_option;
   std::vector<unsigned> shmem_opt_list;
@@ -1070,6 +1071,7 @@ class warp_inst_t : public inst_t {
     mem_access_byte_mask_t bytes;
     active_mask_t active;  // threads in this transaction
 
+    //  Returns true if any of the bits between start_bit and end_bit are true
     bool test_bytes(unsigned start_bit, unsigned end_bit) {
       for (unsigned i = start_bit; i <= end_bit; i++)
         if (bytes.test(i)) return true;
diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 56ede056c..7fcd08e6d 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -359,6 +359,11 @@ void shader_core_config::reg_options(class OptionParser *opp) {
                          "Number of portions a warp is divided into for shared "
                          "memory bank conflict check ",
                          "2");
+  option_parser_register(opp, "-gpgpu_shmem_atomic_warp_parts", OPT_INT32,
+                         &mem_atomic_warp_parts,
+                         "Number of portions an atomic warp is divided into for shared "
+                         "memory bank conflict check ",
+                         "2");
   option_parser_register(
       opp, "-gpgpu_mem_unit_ports", OPT_INT32, &mem_unit_ports,
       "The number of memory transactions allowed per core cycle", "1");
diff --git a/src/trace.h b/src/trace.h
index 8d7415177..d83af41e0 100644
--- a/src/trace.h
+++ b/src/trace.h
@@ -77,6 +77,14 @@ void init();
     }                                                          \
   } while (0)
 
+#define DPRINTF_RAW(x, ...) /* printf to stdout, only if DTRACE(x) */ \
+  do { /* do/while(0): macro expands to a single statement */  \
+    if (DTRACE(x)) {                                           \
+      printf("%s : ", Trace::trace_streams_str[Trace::x]); /* per-stream prefix */ \
+      printf(__VA_ARGS__); /* NOTE(review): first variadic arg is used as the format string */ \
+    }                                                          \
+  } while (0)
+
 #else
 
 #define DTRACE(x) (false)
diff --git a/src/trace_streams.tup b/src/trace_streams.tup
index 074c7c880..aed7401ef 100644
--- a/src/trace_streams.tup
+++ b/src/trace_streams.tup
@@ -32,5 +32,6 @@ TS_TUP_BEGIN( trace_streams_type )
     TS_TUP( MEMORY_SUBPARTITION_UNIT ),
     TS_TUP( INTERCONNECT ),
     TS_TUP( LIVENESS ),
+    TS_TUP( ATOMICS ),
     TS_TUP( NUM_TRACE_STREAMS )
 TS_TUP_END( trace_streams_type )

From f5088238ae07a3743231d26758c25810f85cc7c4 Mon Sep 17 00:00:00 2001
From: Abhishek Bhaumick <a.bhaumick@live.in>
Date: Thu, 24 Mar 2022 20:17:52 -0400
Subject: [PATCH 2/3] Modified Atomics coalescing to match Volta V100

- replaced full CAM coalescing with common case coalescing
- correlated with QV100 GPU
---
 configs/tested-cfgs/SM7_QV100/gpgpusim.config |  4 +-
 src/abstract_hardware_model.cc                | 53 +++++++++++++++++--
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index 90a635363..4ad607b80 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -204,6 +204,6 @@
 -power_simulation_enabled 0
 
 # tracing functionality
-#-trace_enabled 1
-#-trace_components ATOMICS
+-trace_enabled 1
+-trace_components ATOMICS
 #-trace_sampling_core 0
\ No newline at end of file
diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc
index 13a832dac..b550b2075 100644
--- a/src/abstract_hardware_model.cc
+++ b/src/abstract_hardware_model.cc
@@ -699,6 +699,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
 
     // step 2: reduce each transaction size, if possible
     std::map<new_addr_type, std::list<transaction_info> >::iterator t_list;
+
     for (t_list = subwarp_transactions.begin();
          t_list != subwarp_transactions.end(); t_list++) {
       // For each block addr
@@ -715,14 +716,15 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
       }
 
       std::list<transaction_info>::const_iterator t;
-      std::list<transaction_info> reduced_transaction_list;
+      std::list<transaction_info> coalesced_transaction_list;
       //  Special Coalescing for Atomics Conflict
       for (t = transaction_list.begin(); t !=  transaction_list.end(); t++) {
         const transaction_info &info = *t;
 
+        // Option 1 : Full CAM based coalescing of global atomics txns
         // Find a coalescable transaction
         bool newTxn = true;
-        for (auto& cTxn : reduced_transaction_list) {
+        for (auto& cTxn : coalesced_transaction_list) {
           if ((cTxn.chunks == info.chunks)
               && (cTxn.bytes == info.bytes)
               && ((cTxn.active & info.active) == 0)) {
@@ -732,12 +734,53 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
             break;
           }
         }
+        if (newTxn) {
+          coalesced_transaction_list.push_back(info);
+        }
+      }
+
+      if (debugCount < 10) {
+        ss << "\t" << "CoalTxn List : " << addr << "\n";
+        for (auto txn : coalesced_transaction_list) {
+          ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
+        }
+        DPRINTF_RAW(ATOMICS, ss.str().c_str());
+        ss.str("");
+      }
+
+      // Option 2 : V100 Hardware correlated - only squish if 16 threads conflict on same address
+      std::list<transaction_info> reduced_transaction_list;
+      active_mask_t subwarp_full_upper_mask = ((((unsigned long long) 1) << subwarp_size) - 1) << subwarp_size; 
+      active_mask_t subwarp_full_lower_mask = (((unsigned long long) 1) << subwarp_size) - 1; 
+
+      // Place non-coalesced txns into final Txn List
+      for (t = transaction_list.begin(); t != transaction_list.end(); t++) {
+        const transaction_info &info = *t;
+
+        bool newTxn = true;
+        for (auto& cTxn : coalesced_transaction_list) {
+          if ((cTxn.active == subwarp_full_upper_mask || cTxn.active == subwarp_full_lower_mask)
+              && (cTxn.chunks == info.chunks)
+              && (cTxn.bytes == info.bytes)) {
+            // No need to create any split transactions
+            newTxn = false;
+            break;
+          }
+        }
+
         if (newTxn) {
           reduced_transaction_list.push_back(info);
         }
       }
+
+      for (auto& cTxn : coalesced_transaction_list) {
+        if (cTxn.active == subwarp_full_upper_mask || cTxn.active == subwarp_full_lower_mask) {
+          reduced_transaction_list.push_back(cTxn);
+        }
+      }
       
       if (debugCount < 10) {
+        ss << "  " << subwarp_full_upper_mask << "  " << subwarp_full_lower_mask << "\n";
         ss << "\t" << "RedTxn List : " << addr << "\n";
         for (auto txn : reduced_transaction_list) {
           ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
@@ -752,8 +795,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
         memory_coalescing_arch_reduce_and_send(is_write, access_type, info,
                                                addr, segment_size);
       }
-      DPRINTF_RAW(ATOMICS, "--End--\n");
+      if (debugCount < 10) {
+        DPRINTF_RAW(ATOMICS, "--End--\n");
+      }
     }
+
+
   }
   ++ debugCount;
 }

From e42de8565f93dabf16f70105c4931396c909f4f6 Mon Sep 17 00:00:00 2001
From: Abhishek Bhaumick <a.bhaumick@live.in>
Date: Fri, 25 Mar 2022 23:21:36 -0400
Subject: [PATCH 3/3] Cleaned up debug messages, disabled tracing in config

- added ATOMICS_DETAIL trace flag
- made ATOMICS prints concise
- disabled tracing and restored default trace flags in QV100 tested-cfgs
---
 configs/tested-cfgs/SM7_QV100/gpgpusim.config |  4 +-
 src/abstract_hardware_model.cc                | 39 ++++++++++++-------
 src/trace_streams.tup                         |  1 +
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index 10d2500ec..6c36b82dd 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -233,6 +233,6 @@
 -visualizer_enabled 0
 
 # tracing functionality
--trace_enabled 1
--trace_components ATOMICS
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
 #-trace_sampling_core 0
\ No newline at end of file
diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc
index 7d3049dc4..e10a6e00b 100644
--- a/src/abstract_hardware_model.cc
+++ b/src/abstract_hardware_model.cc
@@ -631,6 +631,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
   unsigned subwarp_size = m_config->warp_size / warp_parts;
 
   static int debugCount;
+  const int debugThreshold = 10;
   std::stringstream ss;
 
   for (unsigned subwarp = 0; subwarp < warp_parts; subwarp++) {
@@ -653,9 +654,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
       assert(block_address ==
              line_size_based_tag_func(addr + data_size - 1, segment_size));
 
-      if (debugCount < 10) {
-        DPRINTF_RAW(ATOMICS, " @ %ld \n", addr);
-      }
+      ss << " @ " << addr << ", ";
 
       //  Commented out for testing Atomic Coalescing Issue 
       // /* */
@@ -689,8 +688,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
       /* */
     }
 
-    if (debugCount < 10) {
-      DPRINTF_RAW(ATOMICS, "--Start--\n");
+    ss << " \n";
+    DPRINTF_RAW(ATOMICS, ss.str().c_str());
+    ss.str("");
+
+    if (debugCount < debugThreshold) {
+      DPRINTF_RAW(ATOMICS_DETAIL, "--Start--\n");
       ss << "\t" << "Subwarp Txns : \n";
       for (auto item : subwarp_transactions) {
         ss << "\t\t" << item.first << "-> \n";
@@ -698,7 +701,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
           ss << "\t\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
         }
       }
-      DPRINTF_RAW(ATOMICS, ss.str().c_str());
+      DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
       ss.str("");
     }
 
@@ -711,12 +714,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
       new_addr_type addr = t_list->first;
       const std::list<transaction_info> &transaction_list = t_list->second;
 
-      if (debugCount < 10) {
+      if (debugCount < debugThreshold) {
         ss << "\t" << "Txn List : " << addr << "\n";
         for (auto txn : transaction_list) {
           ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
         }
-        DPRINTF_RAW(ATOMICS, ss.str().c_str());
+        DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
         ss.str("");
       }
 
@@ -744,12 +747,12 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
         }
       }
 
-      if (debugCount < 10) {
+      if (debugCount < debugThreshold) {
         ss << "\t" << "CoalTxn List : " << addr << "\n";
         for (auto txn : coalesced_transaction_list) {
           ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
         }
-        DPRINTF_RAW(ATOMICS, ss.str().c_str());
+        DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
         ss.str("");
       }
 
@@ -784,24 +787,32 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
         }
       }
       
-      if (debugCount < 10) {
+      if (debugCount < debugThreshold) {
         ss << "  " << subwarp_full_upper_mask << "  " << subwarp_full_lower_mask << "\n";
         ss << "\t" << "RedTxn List : " << addr << "\n";
         for (auto txn : reduced_transaction_list) {
           ss << "\t\t" << txn.chunks << "  " << txn.bytes << "  " << txn.active << " \n";
         }
-        DPRINTF_RAW(ATOMICS, ss.str().c_str());
+        DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
         ss.str("");
       }
 
+      ss << "  ";
+      ss << "SubWarp Txns " << subwarp_transactions.size() << " -> ";
+      ss << "Transactions " << transaction_list.size() << " -> ";
+      ss << "Coalesced Txns " << coalesced_transaction_list.size() << " -> ";
+      ss << "Reduced Txns " << reduced_transaction_list.size() << "\n";
+      DPRINTF_RAW(ATOMICS, ss.str().c_str());
+      ss.str("");
+
       for (t = reduced_transaction_list.begin(); t != reduced_transaction_list.end(); t++) {
         // For each transaction
         const transaction_info &info = *t;
         memory_coalescing_arch_reduce_and_send(is_write, access_type, info,
                                                addr, segment_size);
       }
-      if (debugCount < 10) {
-        DPRINTF_RAW(ATOMICS, "--End--\n");
+      if (debugCount < debugThreshold) {
+        DPRINTF_RAW(ATOMICS_DETAIL, "--End--\n");
       }
     }
 
diff --git a/src/trace_streams.tup b/src/trace_streams.tup
index aed7401ef..bcf008894 100644
--- a/src/trace_streams.tup
+++ b/src/trace_streams.tup
@@ -33,5 +33,6 @@ TS_TUP_BEGIN( trace_streams_type )
     TS_TUP( INTERCONNECT ),
     TS_TUP( LIVENESS ),
     TS_TUP( ATOMICS ),
+    TS_TUP( ATOMICS_DETAIL ),
     TS_TUP( NUM_TRACE_STREAMS )
 TS_TUP_END( trace_streams_type )