From a600005493d32cd7049ddcc8816fd8885a4c45e9 Mon Sep 17 00:00:00 2001 From: Elias Stehle Date: Thu, 15 Apr 2021 18:15:08 +0200 Subject: [PATCH] remove nanosleep (#7962) See #7951 Authors: - Elias Stehle (https://github.com/elstehle) --- cpp/src/io/comp/debrotli.cu | 8 ++------ cpp/src/io/comp/gpuinflate.cu | 13 ++++--------- cpp/src/io/comp/unsnap.cu | 9 ++++----- cpp/src/io/utilities/block_utils.cuh | 10 ---------- 4 files changed, 10 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 953872ab7ed..541163eb086 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -357,8 +357,6 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes, first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block); if (first_free_block == ~0 || first_free_block >= ext_heap_size) { // Some other block is holding the heap or there are no free blocks: try again later - // Wait a bit in an attempt to make the spin less resource-hungry - nanosleep(100); continue; } if (first_free_block == 0) { @@ -408,8 +406,7 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes, } } while (blk_next != 0 && blk_next < ext_heap_size); first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block); - // Wait a while since reaching here means the heap is full - nanosleep(10000); + // Reaching here means the heap is full // Just in case we're trying to allocate more than the entire heap if (len > ext_heap_size - 4 * sizeof(uint32_t)) { break; } } @@ -429,8 +426,7 @@ static __device__ void ext_heap_free(void *ptr, for (;;) { first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block); if (first_free_block != ~0) { break; } - // Some other block is holding the heap: wait - nanosleep(50); + // Some other block is holding the heap } if (first_free_block >= ext_heap_size) { // Heap is currently empty diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index a31cf1717e7..eda1d37f78c 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -512,13 +512,10 @@ __device__ void decode_symbols(inflate_state_s *s) #if ENABLE_PREFETCH // Wait for prefetcher to fetch a worst-case of 48 bits per symbol while ((*(volatile int32_t *)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) || - (s->x.batch_len[batch] != 0)) + (s->x.batch_len[batch] != 0)) {} #else - while (s->x.batch_len[batch] != 0) + while (s->x.batch_len[batch] != 0) {} #endif - { - nanosleep(100); - } batch_len = 0; #if ENABLE_PREFETCH if (cur + (bitpos >> 3) >= end) { @@ -662,7 +659,7 @@ __device__ void decode_symbols(inflate_state_s *s) if (batch_len != 0) batch = (batch + 1) & (batch_count - 1); } while (sym != 256); - while (s->x.batch_len[batch] != 0) { nanosleep(150); } + while (s->x.batch_len[batch] != 0) {} s->x.batch_len[batch] = -1; s->bitbuf = bitbuf; s->bitpos = bitpos; @@ -779,7 +776,7 @@ __device__ void process_symbols(inflate_state_s *s, int t) uint32_t lit_mask; if (t == 0) { - while ((batch_len = s->x.batch_len[batch]) == 0) { nanosleep(100); } + while ((batch_len = s->x.batch_len[batch]) == 0) {} } else { batch_len = 0; } @@ -962,8 +959,6 @@ __device__ void prefetch_warp(volatile inflate_state_s *s, int t) s->pref.cur_p = cur_p; __threadfence_block(); } - } else if (t == 0) { - nanosleep(150); } } } diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 2b799b5e1bf..c58880c9ed8 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -99,7 +99,6 @@ __device__ void snappy_prefetch_bytestream(unsnap_state_s *s, int t) blen = 0; break; } - nanosleep(100); } } blen = shuffle(blen); @@ -281,7 +280,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t) if (t == 0) { s->q.prefetch_rdpos = cur; #pragma unroll(1) // We don't want unrolling here - while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { nanosleep(50); } + while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {} b = &s->q.batch[batch * batch_size]; } // Process small symbols in parallel: for data that does not get good compression, @@ -441,7 +440,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t) // Wait for prefetcher s->q.prefetch_rdpos = cur; #pragma unroll(1) // We don't want unrolling here - while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { nanosleep(50); } + while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {} dst_pos += blen; if (bytes_left < blen) break; bytes_left -= blen; @@ -457,7 +456,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t) } batch_len = shuffle(batch_len); if (t == 0) { - while (s->q.batch_len[batch] != 0) { nanosleep(100); } + while (s->q.batch_len[batch] != 0) {} } if (batch_len != batch_size) { break; } } @@ -490,7 +489,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s int32_t batch_len, blen_t, dist_t; if (t == 0) { - while ((batch_len = s->q.batch_len[batch]) == 0) { nanosleep(100); } + while ((batch_len = s->q.batch_len[batch]) == 0) {} } else { batch_len = 0; } diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 9046eebcb02..0d009af8295 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -36,16 +36,6 @@ inline __device__ void syncwarp(void) { __syncwarp(); } inline __device__ uint32_t ballot(int pred) { return __ballot_sync(~0, pred); } -template -inline __device__ void nanosleep(T d) -{ -#if (__CUDA_ARCH__ >= 700) - __nanosleep(d); -#else - clock(); -#endif -} - // Warp reduction helpers template inline __device__ T WarpReduceSum2(T acc)