Fixed an issue when switching streams on device_buffer and host_buffer #358

Merged · 5 commits · Apr 4, 2019
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -21,6 +21,7 @@
 - PR #422: Issue in the PCA tests was solved and CI can run with driver 418
 - PR #409: Add entry to gitmodules to ignore build artifacts
 - PR #412: Fix for svdQR function in ml-prims
+- PR #358: Fixed an issue when switching streams on MLCommon::device_buffer and MLCommon::host_buffer
 - PR #434: Fixing bug in CSR tests

 # cuML 0.6.0 (22 Mar 2019)
@@ -92,7 +93,6 @@
 - PR #380: Allow arbitrary data size on ingress for numba_utils.row_matrix
 - PR #385: Fix for long import cuml time in containers and fix for setup_pip

-
 # cuML 0.5.1 (05 Feb 2019)

 ## Bug Fixes
17 changes: 17 additions & 0 deletions ml-prims/src/common/buffer_base.hpp
@@ -88,6 +88,23 @@ class buffer_base
     size_type _size;
     size_type _capacity;
     value_type* _data;
+    void set_stream( cudaStream_t stream )
+    {
+        if ( _stream != stream )
+        {
+            cudaEvent_t event;
+            CUDA_CHECK( cudaEventCreateWithFlags( &event, cudaEventDisableTiming ) );
+            CUDA_CHECK( cudaEventRecord( event, _stream ) );
+            CUDA_CHECK( cudaStreamWaitEvent( stream, event, 0 ) );
+            _stream = stream;
+            CUDA_CHECK( cudaEventDestroy( event ) );
+        }
+    }
+    cudaStream_t get_stream() const
+    {
+        return _stream;
+    }
+private:
     cudaStream_t _stream;
 };

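The pattern set_stream() implements is a standard stream handoff: record an event on the old stream, make the new stream wait on that event, then adopt the new stream. Below is a minimal, self-contained sketch of the same handoff outside the class; the function and parameter names are illustrative and not part of the PR.

    #include <cuda_runtime.h>

    // Make `next` wait for all work already enqueued on `prev`, without
    // blocking the host. Error checking elided for brevity.
    void stream_handoff( cudaStream_t prev, cudaStream_t next )
    {
        cudaEvent_t event;
        // Timing is disabled because the event is used purely for ordering.
        cudaEventCreateWithFlags( &event, cudaEventDisableTiming );
        // Capture a point after everything currently enqueued on `prev`.
        cudaEventRecord( event, prev );
        // Device-side wait: `next` stalls until the event fires, while the
        // host thread returns immediately.
        cudaStreamWaitEvent( next, event, 0 );
        // Destroying the event right away is safe: CUDA defers the actual
        // cleanup until the event has been consumed.
        cudaEventDestroy( event );
    }

Because the wait is enqueued on the device, switching streams costs no host synchronization, which is what lets reserve() and release() accept a different stream cheaply.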
19 changes: 9 additions & 10 deletions ml-prims/src/common/device_buffer.hpp
@@ -64,30 +64,30 @@ class device_buffer : public buffer_base<T>
     {
         if ( _size > 0 )
         {
-            _data = static_cast<value_type*>(_allocator->allocate( _capacity*sizeof(value_type), _stream ));
-            CUDA_CHECK( cudaStreamSynchronize( _stream ) );
+            _data = static_cast<value_type*>(_allocator->allocate( _capacity*sizeof(value_type), get_stream() ));
+            CUDA_CHECK( cudaStreamSynchronize( get_stream() ) );
Review thread on the cudaStreamSynchronize call:

Member: @jirikraus Sorry, I realize that this question is not related to this PR! Do we need this synchronize step? I suppose that as long as the caller adheres to stream semantics, we should be fine without it, no?

Contributor (author): IIRC we already touched on this in the discussion on the PR that introduced this data structure. From my point of view, a user of host_buffer or device_buffer expects it to be usable right after construction, hence the synchronization, to avoid surprises. However, I agree that since we pass in a stream, surprising some users is something we can probably live with. Btw, if you want a fully asynchronous construction, you can already get one with:

    device_buffer<T> tmp( allocator, stream, 0 );
    tmp.resize( n, stream );

I am undecided which option is better. Any other thoughts on this?

Member: host_buffer can also potentially be accessed directly from CPU code, so it does make sense to put a sync call there to avoid surprises. However, after a device_buffer is constructed, the only way it can be touched is through a CUDA API call or a CUDA kernel, both of which adhere to stream semantics. So as long as the underlying allocator adheres to stream semantics, we could live without a sync here. What say?

Contributor (author): Yes, that is a good argument. My only concern is that it would be confusing if host_buffer and device_buffer had different semantics.

Member: Agreed that this can be confusing to some folks. Maybe we could document this difference somewhere so that our devs are aware?

Member: @teju85 that's not what I meant. As I wrote: "I like the guarantee that the memory is immediately valid and available to all streams. I think it probably leads to fewer bugs."

Member (@teju85, Apr 1, 2019): @harrism I was referring to your following statement about RMM: "In fact, even though RMM takes a stream in rmmAlloc(), the returned memory is immediately available for use on any stream!" I was wishing that, for any custom device allocators people might want to use with device_buffer, we could have this as a guarantee, at least for the specific stream in the ctor.

Member: You could have a parameter that you set with the allocator: call cudaStreamSynchronize() (or cudaDeviceSynchronize()) if the allocator is asynchronous, and don't call it if not. [A hypothetical sketch of this idea appears at the end of this page.]

Contributor (author): Thanks for all the inputs. I don't think we have come to a conclusion yet. @teju85, what do you think about moving the discussion of whether the constructor of device_buffer should synchronize to a separate issue, and moving on with this pull request (which, as you stated initially, is unrelated to the open discussion) as it is?

Member: Agreed. It doesn't make sense to block this PR on this discussion. I have filed issue #425; let's migrate the discussion over there.
         }
     }

     ~device_buffer()
     {
         if ( nullptr != _data )
         {
-            _allocator->deallocate( _data, _capacity*sizeof(value_type), _stream );
+            _allocator->deallocate( _data, _capacity*sizeof(value_type), get_stream() );
         }
     }

     void reserve( const size_type new_capacity, cudaStream_t stream )
     {
-        _stream = stream;
+        set_stream( stream );
         if ( new_capacity > _capacity )
         {
-            value_type* new_data = static_cast<value_type*>(_allocator->allocate( new_capacity*sizeof(value_type), _stream ));
+            value_type* new_data = static_cast<value_type*>(_allocator->allocate( new_capacity*sizeof(value_type), get_stream() ));
             if ( _size > 0 ) {
-                CUDA_CHECK( cudaMemcpyAsync( new_data, _data, _size*sizeof(value_type), cudaMemcpyDeviceToDevice, _stream ) );
+                CUDA_CHECK( cudaMemcpyAsync( new_data, _data, _size*sizeof(value_type), cudaMemcpyDeviceToDevice, get_stream() ) );
             }
             if ( nullptr != _data ) {
-                _allocator->deallocate( _data, _capacity*sizeof(value_type), _stream );
+                _allocator->deallocate( _data, _capacity*sizeof(value_type), get_stream() );
             }
             _data = new_data;
             _capacity = new_capacity;
@@ -102,9 +102,9 @@

     void release( cudaStream_t stream )
     {
-        _stream = stream;
+        set_stream( stream );
         if ( nullptr != _data ) {
-            _allocator->deallocate( _data, _capacity*sizeof(value_type), _stream );
+            _allocator->deallocate( _data, _capacity*sizeof(value_type), get_stream() );
         }
         _data = nullptr;
         _capacity = 0;
@@ -121,7 +121,6 @@
     using buffer_base<T>::_size;
     using buffer_base<T>::_capacity;
     using buffer_base<T>::_data;
-    using buffer_base<T>::_stream;
 };

} // end namespace ML
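To see what the change buys, here is the failure scenario the PR title refers to, as a hedged sketch: the (allocator, stream, size) constructor and resize() appear in the review thread above, while my_kernel, the launch configuration, and the data() accessor are illustrative stand-ins.

    // Work is enqueued on stream_a, then the buffer is grown on stream_b.
    MLCommon::device_buffer<float> buf( allocator, stream_a, n );
    my_kernel<<< blocks, threads, 0, stream_a >>>( buf.data(), n );

    // Before this PR, resize() simply overwrote _stream with stream_b, so
    // the old allocation could be freed while my_kernel was still running
    // on stream_a. With set_stream(), stream_b first waits on an event
    // recorded on stream_a, so the copy and the deallocation are ordered
    // after the kernel.
    buf.resize( 2*n, stream_b );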
19 changes: 9 additions & 10 deletions ml-prims/src/common/host_buffer.hpp
@@ -65,16 +65,16 @@ class host_buffer : public buffer_base<T>
     {
         if ( _capacity > 0 )
         {
-            _data = static_cast<value_type*>(_allocator->allocate( _capacity*sizeof(value_type), _stream ));
-            CUDA_CHECK( cudaStreamSynchronize( _stream ) );
+            _data = static_cast<value_type*>(_allocator->allocate( _capacity*sizeof(value_type), get_stream() ));
+            CUDA_CHECK( cudaStreamSynchronize( get_stream() ) );
         }
     }

     ~host_buffer()
     {
         if ( nullptr != _data )
         {
-            _allocator->deallocate( _data, _capacity*sizeof(value_type), _stream );
+            _allocator->deallocate( _data, _capacity*sizeof(value_type), get_stream() );
         }
     }

@@ -90,15 +90,15 @@

     void reserve( const size_type new_capacity, cudaStream_t stream )
     {
-        _stream = stream;
+        set_stream( stream );
         if ( new_capacity > _capacity )
         {
-            value_type* new_data = static_cast<value_type*>(_allocator->allocate( new_capacity*sizeof(value_type), _stream ));
+            value_type* new_data = static_cast<value_type*>(_allocator->allocate( new_capacity*sizeof(value_type), get_stream() ));
             if ( _size > 0 ) {
-                CUDA_CHECK( cudaMemcpyAsync( new_data, _data, _size*sizeof(value_type), cudaMemcpyHostToHost, _stream ) );
+                CUDA_CHECK( cudaMemcpyAsync( new_data, _data, _size*sizeof(value_type), cudaMemcpyHostToHost, get_stream() ) );
             }
             if ( nullptr != _data ) {
-                _allocator->deallocate( _data, _capacity*sizeof(value_type), _stream );
+                _allocator->deallocate( _data, _capacity*sizeof(value_type), get_stream() );
             }
             _data = new_data;
             _capacity = new_capacity;
@@ -113,9 +113,9 @@

     void release( cudaStream_t stream )
     {
-        _stream = stream;
+        set_stream( stream );
         if ( nullptr != _data ) {
-            _allocator->deallocate( _data, _capacity*sizeof(value_type), _stream );
+            _allocator->deallocate( _data, _capacity*sizeof(value_type), get_stream() );
         }
         _data = nullptr;
         _capacity = 0;
@@ -132,7 +132,6 @@
     using buffer_base<T>::_size;
     using buffer_base<T>::_capacity;
     using buffer_base<T>::_data;
-    using buffer_base<T>::_stream;
 };

} // end namespace ML
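Finally, one reviewer suggested making the constructor's synchronization conditional on the allocator. Nothing like this exists in the PR; the sketch below is a hypothetical reading of that suggestion, with the is_asynchronous() flag invented for illustration.

    #include <cstddef>
    #include <cuda_runtime.h>

    // Hypothetical allocator interface: allocators report whether their
    // allocations are stream-ordered, i.e. only valid once the given
    // stream reaches the allocation point.
    struct device_allocator_base {
        virtual void* allocate( std::size_t bytes, cudaStream_t stream ) = 0;
        virtual void deallocate( void* p, std::size_t bytes, cudaStream_t stream ) = 0;
        virtual bool is_asynchronous() const { return true; }  // invented flag
        virtual ~device_allocator_base() = default;
    };

    // The buffer constructor would then sync only when required:
    //     if ( _allocator->is_asynchronous() ) {
    //         CUDA_CHECK( cudaStreamSynchronize( get_stream() ) );
    //     }

This would preserve the guarantee some reviewers favored, that freshly constructed memory is immediately usable on any stream, while skipping the sync for allocators that already provide it.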