Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for atomic coalescing at L1, correlated with QV100 hardware #33

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/tested-cfgs/SM7_QV100/gpgpusim.config
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_shmem_atomic_warp_parts 2
-gpgpu_coalesce_arch 70

# Volta has four schedulers per core
Expand Down
132 changes: 131 additions & 1 deletion src/abstract_hardware_model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,

// see the CUDA manual where it discusses coalescing rules before reading this
unsigned segment_size = 0;
unsigned warp_parts = m_config->mem_warp_parts;
// unsigned warp_parts = m_config->mem_warp_parts;
// unsigned warp_parts = m_config->mem_atomic_warp_parts; // Use atomic_mem_warp_parts
unsigned warp_parts = m_config->mem_atomic_warp_parts; // Use atomic_mem_warp_parts

bool sector_segment_size = false;

if (m_config->gpgpu_coalesce_arch >= 20 &&
Expand All @@ -608,6 +611,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
sector_segment_size = true;
}

// Segment size = full line if NOT segmented $
// = segment size (16B) if segmented $
// Sector size changed to 16 Bytes
// since we are coalescing threads @ 16 B granularity
switch (data_size) {
case 1:
segment_size = 32;
Expand All @@ -623,6 +630,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
}
unsigned subwarp_size = m_config->warp_size / warp_parts;

static int debugCount;
const int debugThreshold = 10;
std::stringstream ss;

for (unsigned subwarp = 0; subwarp < warp_parts; subwarp++) {
std::map<new_addr_type, std::list<transaction_info> >
subwarp_transactions; // each block addr maps to a list of transactions
Expand All @@ -643,6 +654,10 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
assert(block_address ==
line_size_based_tag_func(addr + data_size - 1, segment_size));

ss << " @ " << addr << ", ";

// Commented out for testing Atomic Coalescing Issue
// /* */
// Find a transaction that does not conflict with this thread's accesses
bool new_transaction = true;
std::list<transaction_info>::iterator it;
Expand Down Expand Up @@ -670,25 +685,140 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write,
assert(!info->bytes.test(idx + i));
info->bytes.set(idx + i);
}
/* */
}

ss << " \n";
DPRINTF_RAW(ATOMICS, ss.str().c_str());
ss.str("");

if (debugCount < debugThreshold) {
DPRINTF_RAW(ATOMICS_DETAIL, "--Start--\n");
ss << "\t" << "Subwarp Txns : \n";
for (auto item : subwarp_transactions) {
ss << "\t\t" << item.first << "-> \n";
for (auto txn : item.second) {
ss << "\t\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n";
}
}
DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
ss.str("");
}

// step 2: reduce each transaction size, if possible
std::map<new_addr_type, std::list<transaction_info> >::iterator t_list;

for (t_list = subwarp_transactions.begin();
t_list != subwarp_transactions.end(); t_list++) {
// For each block addr
new_addr_type addr = t_list->first;
const std::list<transaction_info> &transaction_list = t_list->second;

if (debugCount < debugThreshold) {
ss << "\t" << "Txn List : " << addr << "\n";
for (auto txn : transaction_list) {
ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n";
}
DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
ss.str("");
}

std::list<transaction_info>::const_iterator t;
std::list<transaction_info> coalesced_transaction_list;
// Special Coalescing for Atomics Conflict
for (t = transaction_list.begin(); t != transaction_list.end(); t++) {
const transaction_info &info = *t;

// Option 1 : Full CAM based coalescing of global atomics txns
// Find a coalescable transaction
bool newTxn = true;
for (auto& cTxn : coalesced_transaction_list) {
if ((cTxn.chunks == info.chunks)
&& (cTxn.bytes == info.bytes)
&& ((cTxn.active & info.active) == 0)) {
// Squish the two transactions
cTxn.active = cTxn.active | info.active;
newTxn = false;
break;
}
}
if (newTxn) {
coalesced_transaction_list.push_back(info);
}
}

if (debugCount < debugThreshold) {
ss << "\t" << "CoalTxn List : " << addr << "\n";
for (auto txn : coalesced_transaction_list) {
ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n";
}
DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
ss.str("");
}

// Option 2 : V100 Hardware correlated - only squish if 16 threads conflict on same address
std::list<transaction_info> reduced_transaction_list;
active_mask_t subwarp_full_upper_mask = ((((unsigned long long) 1) << subwarp_size) - 1) << subwarp_size;
active_mask_t subwarp_full_lower_mask = (((unsigned long long) 1) << subwarp_size) - 1;

// Place non-coalesced txns into final Txn List
for (t = transaction_list.begin(); t != transaction_list.end(); t++) {
const transaction_info &info = *t;

bool newTxn = true;
for (auto& cTxn : coalesced_transaction_list) {
if ((cTxn.active == subwarp_full_upper_mask || cTxn.active == subwarp_full_lower_mask)
&& (cTxn.chunks == info.chunks)
&& (cTxn.bytes == info.bytes)) {
// No need to create any split transactions
newTxn = false;
break;
}
}

if (newTxn) {
reduced_transaction_list.push_back(info);
}
}

for (auto& cTxn : coalesced_transaction_list) {
if (cTxn.active == subwarp_full_upper_mask || cTxn.active == subwarp_full_lower_mask) {
reduced_transaction_list.push_back(cTxn);
}
}

if (debugCount < debugThreshold) {
ss << " " << subwarp_full_upper_mask << " " << subwarp_full_lower_mask << "\n";
ss << "\t" << "RedTxn List : " << addr << "\n";
for (auto txn : reduced_transaction_list) {
ss << "\t\t" << txn.chunks << " " << txn.bytes << " " << txn.active << " \n";
}
DPRINTF_RAW(ATOMICS_DETAIL, ss.str().c_str());
ss.str("");
}

ss << " ";
ss << "SubWarp Txns " << subwarp_transactions.size() << " -> ";
ss << "Transactions " << transaction_list.size() << " -> ";
ss << "Coalesced Txns " << coalesced_transaction_list.size() << " -> ";
ss << "Reduced Txns " << reduced_transaction_list.size() << "\n";
DPRINTF_RAW(ATOMICS, ss.str().c_str());
ss.str("");

for (t = reduced_transaction_list.begin(); t != reduced_transaction_list.end(); t++) {
// For each transaction
const transaction_info &info = *t;
memory_coalescing_arch_reduce_and_send(is_write, access_type, info,
addr, segment_size);
}
if (debugCount < debugThreshold) {
DPRINTF_RAW(ATOMICS_DETAIL, "--End--\n");
}
}


}
++ debugCount;
}

void warp_inst_t::memory_coalescing_arch_reduce_and_send(
Expand Down
2 changes: 2 additions & 0 deletions src/abstract_hardware_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ class core_config {
return ((addr / WORD_SIZE) % num_shmem_bank);
}
unsigned mem_warp_parts;
unsigned mem_atomic_warp_parts;
mutable unsigned gpgpu_shmem_size;
char *gpgpu_shmem_option;
std::vector<unsigned> shmem_opt_list;
Expand Down Expand Up @@ -1119,6 +1120,7 @@ class warp_inst_t : public inst_t {
mem_access_byte_mask_t bytes;
active_mask_t active; // threads in this transaction

// Returns true if any of the bits between start_bit and end_bit are true
bool test_bytes(unsigned start_bit, unsigned end_bit) {
for (unsigned i = start_bit; i <= end_bit; i++)
if (bytes.test(i)) return true;
Expand Down
5 changes: 5 additions & 0 deletions src/gpgpu-sim/gpu-sim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,11 @@ void shader_core_config::reg_options(class OptionParser *opp) {
"Number of portions a warp is divided into for shared "
"memory bank conflict check ",
"2");
option_parser_register(opp, "-gpgpu_shmem_atomic_warp_parts", OPT_INT32,
&mem_atomic_warp_parts,
"Number of portions an atomic warp is divided into for shared "
"memory bank conflict check ",
"2");
option_parser_register(
opp, "-gpgpu_mem_unit_ports", OPT_INT32, &mem_unit_ports,
"The number of memory transactions allowed per core cycle", "1");
Expand Down
8 changes: 8 additions & 0 deletions src/trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ void init();
} \
} while (0)

// DPRINTF_RAW(x, ...): emit a trace message for trace stream `x`.
// Nothing is printed unless DTRACE(x) evaluates to true. When it does, the
// output is the stream's name (looked up in Trace::trace_streams_str) followed
// by " : ", and then the variadic arguments forwarded unchanged to printf.
// NOTE: the first variadic argument is used as the printf format string, so a
// caller that passes pre-built text must make sure it contains no stray '%'
// conversions (or pass "%s" followed by the text instead).
// The do { ... } while (0) wrapper makes the expansion behave as one statement.
#define DPRINTF_RAW(x, ...) \
do { \
if (DTRACE(x)) { \
printf("%s : ", Trace::trace_streams_str[Trace::x]); \
printf(__VA_ARGS__); \
} \
} while (0)

#else

#define DTRACE(x) (false)
Expand Down
2 changes: 2 additions & 0 deletions src/trace_streams.tup
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,7 @@ TS_TUP_BEGIN( trace_streams_type )
TS_TUP( MEMORY_SUBPARTITION_UNIT ),
TS_TUP( INTERCONNECT ),
TS_TUP( LIVENESS ),
TS_TUP( ATOMICS ),
TS_TUP( ATOMICS_DETAIL ),
TS_TUP( NUM_TRACE_STREAMS )
TS_TUP_END( trace_streams_type )