From a600005493d32cd7049ddcc8816fd8885a4c45e9 Mon Sep 17 00:00:00 2001
From: Elias Stehle <elias.stehle@gmail.com>
Date: Thu, 15 Apr 2021 18:15:08 +0200
Subject: [PATCH] remove nanosleep (#7962)

See #7951

Authors:
- Elias Stehle (https://github.com/elstehle)
---
 cpp/src/io/comp/debrotli.cu          |  8 ++------
 cpp/src/io/comp/gpuinflate.cu        | 13 ++++---------
 cpp/src/io/comp/unsnap.cu            |  9 ++++-----
 cpp/src/io/utilities/block_utils.cuh | 10 ----------
 4 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 953872ab7ed..541163eb086 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -357,8 +357,6 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
     first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
     if (first_free_block == ~0 || first_free_block >= ext_heap_size) {
       // Some other block is holding the heap or there are no free blocks: try again later
-      // Wait a bit in an attempt to make the spin less resource-hungry
-      nanosleep(100);
       continue;
     }
     if (first_free_block == 0) {
@@ -408,8 +406,7 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
       }
     } while (blk_next != 0 && blk_next < ext_heap_size);
     first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
-    // Wait a while since reaching here means the heap is full
-    nanosleep(10000);
+    // Reaching here means the heap is full
     // Just in case we're trying to allocate more than the entire heap
     if (len > ext_heap_size - 4 * sizeof(uint32_t)) { break; }
   }
@@ -429,8 +426,7 @@ static __device__ void ext_heap_free(void *ptr,
   for (;;) {
     first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
     if (first_free_block != ~0) { break; }
-    // Some other block is holding the heap: wait
-    nanosleep(50);
+    // Some other block is holding the heap
   }
   if (first_free_block >= ext_heap_size) {
     // Heap is currently empty
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index a31cf1717e7..eda1d37f78c 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -512,13 +512,10 @@ __device__ void decode_symbols(inflate_state_s *s)
 #if ENABLE_PREFETCH
     // Wait for prefetcher to fetch a worst-case of 48 bits per symbol
     while ((*(volatile int32_t *)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) ||
-           (s->x.batch_len[batch] != 0))
+           (s->x.batch_len[batch] != 0)) {}
 #else
-    while (s->x.batch_len[batch] != 0)
+    while (s->x.batch_len[batch] != 0) {}
 #endif
-    {
-      nanosleep(100);
-    }
     batch_len = 0;
 #if ENABLE_PREFETCH
     if (cur + (bitpos >> 3) >= end) {
@@ -662,7 +659,7 @@ __device__ void decode_symbols(inflate_state_s *s)
     if (batch_len != 0) batch = (batch + 1) & (batch_count - 1);
   } while (sym != 256);
 
-  while (s->x.batch_len[batch] != 0) { nanosleep(150); }
+  while (s->x.batch_len[batch] != 0) {}
   s->x.batch_len[batch] = -1;
   s->bitbuf             = bitbuf;
   s->bitpos             = bitpos;
@@ -779,7 +776,7 @@ __device__ void process_symbols(inflate_state_s *s, int t)
     uint32_t lit_mask;
 
     if (t == 0) {
-      while ((batch_len = s->x.batch_len[batch]) == 0) { nanosleep(100); }
+      while ((batch_len = s->x.batch_len[batch]) == 0) {}
     } else {
       batch_len = 0;
     }
@@ -962,8 +959,6 @@ __device__ void prefetch_warp(volatile inflate_state_s *s, int t)
         s->pref.cur_p = cur_p;
         __threadfence_block();
       }
-    } else if (t == 0) {
-      nanosleep(150);
     }
   }
 }
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index 2b799b5e1bf..c58880c9ed8 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -99,7 +99,6 @@ __device__ void snappy_prefetch_bytestream(unsnap_state_s *s, int t)
           blen = 0;
           break;
         }
-        nanosleep(100);
       }
     }
     blen = shuffle(blen);
@@ -281,7 +280,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
     if (t == 0) {
       s->q.prefetch_rdpos = cur;
 #pragma unroll(1)  // We don't want unrolling here
-      while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { nanosleep(50); }
+      while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {}
       b = &s->q.batch[batch * batch_size];
     }
     // Process small symbols in parallel: for data that does not get good compression,
@@ -441,7 +440,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
           // Wait for prefetcher
           s->q.prefetch_rdpos = cur;
 #pragma unroll(1)  // We don't want unrolling here
-          while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { nanosleep(50); }
+          while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {}
           dst_pos += blen;
           if (bytes_left < blen) break;
           bytes_left -= blen;
@@ -457,7 +456,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
     }
     batch_len = shuffle(batch_len);
     if (t == 0) {
-      while (s->q.batch_len[batch] != 0) { nanosleep(100); }
+      while (s->q.batch_len[batch] != 0) {}
     }
     if (batch_len != batch_size) { break; }
   }
@@ -490,7 +489,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
     int32_t batch_len, blen_t, dist_t;
 
     if (t == 0) {
-      while ((batch_len = s->q.batch_len[batch]) == 0) { nanosleep(100); }
+      while ((batch_len = s->q.batch_len[batch]) == 0) {}
     } else {
       batch_len = 0;
     }
diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh
index 9046eebcb02..0d009af8295 100644
--- a/cpp/src/io/utilities/block_utils.cuh
+++ b/cpp/src/io/utilities/block_utils.cuh
@@ -36,16 +36,6 @@ inline __device__ void syncwarp(void) { __syncwarp(); }
 
 inline __device__ uint32_t ballot(int pred) { return __ballot_sync(~0, pred); }
 
-template <typename T>
-inline __device__ void nanosleep(T d)
-{
-#if (__CUDA_ARCH__ >= 700)
-  __nanosleep(d);
-#else
-  clock();
-#endif
-}
-
 // Warp reduction helpers
 template <typename T>
 inline __device__ T WarpReduceSum2(T acc)