diff --git a/pluto/CMakeLists.txt b/pluto/CMakeLists.txt index 0bbb392ef..bdd38afa1 100644 --- a/pluto/CMakeLists.txt +++ b/pluto/CMakeLists.txt @@ -19,7 +19,7 @@ project( pluto VERSION ${atlas_VERSION} LANGUAGES CXX ) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(PLUTO_HAVE_PMR 0) +set(PLUTO_HAVE_PMR 1) ### Normally we should be auto-detecting std::pmr (c++17) for polymorphic memory resource # However, CUDA or HIP compiler seems to not find the right headers, #  so for now we don't rely on it diff --git a/pluto/examples/use_streams.cc b/pluto/examples/use_streams.cc index b16d80344..20474857f 100644 --- a/pluto/examples/use_streams.cc +++ b/pluto/examples/use_streams.cc @@ -108,8 +108,12 @@ int main(int argc, char* argv[]) { std::cerr << "device alloc" << std::endl; device_array array_d1(size); + std::cerr << "async loop start" << std::endl; auto start = std::chrono::steady_clock::now(); for(std::size_t jstream=0; jstream stream_tmp(stream_size); + auto* dtmp = stream_tmp.data(); + h1[stream_size-1] = 1.; h2[stream_size-1] = -1.; pluto::copy_host_to_device(d1, h1, stream_size, stream); plus_one_on_device(d1, stream_size, stream); pluto::copy_device_to_host(h2, d1, stream_size, stream); } + std::cerr << "async loop end" << std::endl; pluto::wait(); auto end = std::chrono::steady_clock::now(); std::cout << "execution without allocations took " << std::chrono::duration(end-start).count() << " s" << std::endl; diff --git a/pluto/src/pluto/device/allocator.h b/pluto/src/pluto/device/allocator.h index 23306517c..398aa2d1d 100644 --- a/pluto/src/pluto/device/allocator.h +++ b/pluto/src/pluto/device/allocator.h @@ -14,6 +14,7 @@ #include "pluto/memory_resource/memory_resource.h" #include "pluto/offload/wait.h" +#include "pluto/offload/Stream.h" namespace pluto::device { @@ -36,28 +37,38 @@ template class allocator { public: using value_type = T; + + allocator(memory_resource* mr, const Stream& stream) : + memory_resource_(mr), + stream_(stream) {} + allocator() : - memory_resource_(get_default_resource()) {} + allocator(get_default_resource(), get_default_stream()) {} allocator(const allocator& other) : - memory_resource_(other.memory_resource_) {} + allocator(other.memory_resource_, other.stream_) {} allocator(memory_resource* mr) : - memory_resource_(mr) {} + allocator(mr, get_default_stream()) {} + + allocator(const Stream& stream) : + allocator(get_default_resource(), stream) {} value_type* allocate(std::size_t size) { + DefaultStream scope{stream_}; return static_cast(memory_resource_->allocate(size * sizeof(value_type), 256)); } void deallocate(value_type* ptr, std::size_t size) { + DefaultStream scope{stream_}; memory_resource_->deallocate(ptr, size * sizeof(value_type), 256); } template void construct(U* p, Args&&... args) { #if HIC_COMPILER - new_on_device<<<1, 1>>>(p, std::forward(args)...); - pluto::wait(); + new_on_device<<<1, 1, 0, stream_.value()>>>(p, std::forward(args)...); + pluto::wait(stream_); #else new_on_device(p, args...); #endif @@ -66,14 +77,15 @@ class allocator { template void destroy(U* p) { #if HIC_COMPILER - delete_on_device<<<1, 1>>>(p); - pluto::wait(); + delete_on_device<<<1, 1, 0, stream_.value()>>>(p); + pluto::wait(stream_); #else delete_on_device(p); #endif } private: memory_resource* memory_resource_{nullptr}; + const Stream& stream_; }; // -------------------------------------------------------------------------------------------------------- diff --git a/pluto/src/pluto/memory_resource/DeviceMemoryResource.cc b/pluto/src/pluto/memory_resource/DeviceMemoryResource.cc index a64ef66aa..0379b78ea 100644 --- a/pluto/src/pluto/memory_resource/DeviceMemoryResource.cc +++ b/pluto/src/pluto/memory_resource/DeviceMemoryResource.cc @@ -37,7 +37,7 @@ memory_pool_resource* device_pool_resource() { void* DeviceMemoryResource::do_allocate(std::size_t bytes, alignment_t) { void* ptr; const auto& stream = get_default_stream(); - if (false) {//stream.value()) { + if (stream.value()) { if constexpr (PLUTO_HAVE_HIC) { HIC_CALL( hicMallocAsync(&ptr, bytes, stream.value() ) ); } @@ -64,7 +64,7 @@ void* DeviceMemoryResource::do_allocate(std::size_t bytes, alignment_t) { void DeviceMemoryResource::do_deallocate(void* ptr, std::size_t bytes, alignment_t) { const auto& stream = get_default_stream(); - if (false) {//if (stream.value()) { + if (stream.value()) { if constexpr (PLUTO_HAVE_HIC) { HIC_CALL( hicFreeAsync(ptr, stream.value()) ); }