Revert DML changes
yuslepukhin committed Oct 10, 2024
1 parent 25b1c38 commit 6bef50c
Showing 2 changed files with 92 additions and 23 deletions.
68 changes: 52 additions & 16 deletions onnxruntime/core/session/lora_adapters.cc
@@ -4,10 +4,9 @@
#include "core/session/lora_adapters.h"
#include "lora/adapter_format_utils.h"

#include <unordered_map>

#include "core/framework/data_transfer.h"
#include "core/framework/error_code_helper.h"
#include "core/framework/execution_provider.h"
#include "core/session/onnxruntime_c_api.h"
#include "core/session/allocator_adapters.h"
#include "core/session/ort_apis.h"
@@ -16,6 +15,15 @@
#include "core/providers/cuda/cuda_provider_factory.h"
#endif

#ifdef USE_DML
#include "core/session/abi_session_options_impl.h"
#include "core/providers/dml/dml_provider_factory_creator.h"
#include "core/providers/dml/dml_provider_factory.h"
#endif

#include <functional>
#include <unordered_map>

namespace onnxruntime {

#ifdef USE_CUDA
@@ -50,28 +58,55 @@ void LoraAdapter::MemoryMap(const std::filesystem::path& file_path) {
InitializeParamsValues();
}

static std::unique_ptr<IDataTransfer> GetDataTransfer(const OrtMemoryInfo& mem_info) {
namespace {
struct DataTransfer {
std::unique_ptr<IExecutionProvider> ep;
std::unique_ptr<IDataTransfer> data_transfer;

if (strcmp(mem_info.name, onnxruntime::CPU) == 0) {
return data_transfer;
Status CopyTensor(const Tensor& src, Tensor& dst) const {
return data_transfer->CopyTensor(src, dst);
}
Status Sync() const {
#ifdef USE_DML
return ep->Sync();
#else
return Status::OK();
#endif
}
};
} // namespace

static Status GetDataTransfer(const OrtMemoryInfo& mem_info, [[maybe_unused]] DataTransfer& dt) {
ORT_RETURN_IF(strcmp(mem_info.name, onnxruntime::CPU) == 0, "Expecting on device allocator for LoraAdapter");

if (strcmp(mem_info.name, onnxruntime::CUDA) == 0) {
#ifdef USE_CUDA
auto* cuda_provider_info = TryGetProviderInfo_CUDA();
if (cuda_provider_info != nullptr) {
data_transfer = cuda_provider_info->CreateGPUDataTransfer();
dt.data_transfer = cuda_provider_info->CreateGPUDataTransfer();
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider could not be loaded");
}
#else
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider is not enabled in this build");
#endif
} else if (strcmp(mem_info.name, onnxruntime::DML) == 0) {
#ifdef USE_DML
auto ep_factory = onnxruntime::DMLProviderFactoryCreator::Create(ConfigOptions{}, 0, false, false, false);
dt.ep = ep_factory->CreateProvider();
dt.data_transfer = dt.ep->GetDataTransfer();
#else
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DML provider is not enabled in this build");
#endif
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported device allocator");
}

return data_transfer;
return Status::OK();
}

static Status CreateOrtValueOnDevice(const OrtValue& ort_value_mapped,
const AllocatorPtr& device_allocator,
const IDataTransfer& data_transfer,
const DataTransfer& data_transfer,
OrtValue& out) {
OrtValue result;
const auto& src = ort_value_mapped.Get<Tensor>();
@@ -87,25 +122,22 @@ void LoraAdapter::InitializeParamsValues() {
ORT_THROW("Adapter is not loaded yet.");
}

std::unique_ptr<IDataTransfer> data_transfer;
DataTransfer data_transfer;
if (device_allocator_) {
data_transfer = GetDataTransfer(device_allocator_->Info());
if (data_transfer == nullptr) {
ORT_THROW("Data transfer is not available for the specified device allocator, it also must not be a CPU allocator");
}
ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), data_transfer));
}

const auto* params = adapter_->parameters();
ORT_ENFORCE(params != nullptr, "Params absent");
std::unordered_map<std::string, Param> params_values;
params_values.reserve(params->size());
// Re-work in two separate loops due to compiler issues
if (data_transfer) {
if (device_allocator_) {
for (const auto* param : *params) {
auto [name, ort_value] = adapters::utils::CreateOrtValueOverLoraParameter(*param);
OrtValue ort_value_ondevice;
ORT_THROW_IF_ERROR(CreateOrtValueOnDevice(ort_value, device_allocator_,
*data_transfer, ort_value_ondevice));
data_transfer, ort_value_ondevice));
Param lora_param(std::move(ort_value), std::move(ort_value_ondevice));
params_values.emplace(std::move(name), std::move(lora_param));
}
@@ -117,6 +149,10 @@ }
}
}

if (device_allocator_) {
ORT_THROW_IF_ERROR(data_transfer.Sync());
}

params_values_.swap(params_values);
}

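For context, a minimal sketch (not part of the commit) of the call sequence the new DataTransfer wrapper above is meant to support; src_cpu and dst_device are hypothetical tensors, and the real per-parameter loop lives in LoraAdapter::InitializeParamsValues():

  // Sketch only: pick the transfer for the allocator's device, copy, then sync.
  DataTransfer dt;
  ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), dt));  // CUDA or DML
  ORT_THROW_IF_ERROR(dt.CopyTensor(src_cpu, dst_device));              // per-parameter copy
  ORT_THROW_IF_ERROR(dt.Sync());  // returns Status::OK() except in DML builds,
                                  // where it forwards to the provider's Sync()

Holding the execution provider alive inside DataTransfer keeps the IDataTransfer it returned valid and makes the final Sync() call possible.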
47 changes: 40 additions & 7 deletions onnxruntime/test/lora/lora_test.cc
@@ -200,13 +200,11 @@ TEST(LoraAdapterTest, Load) {
}

#ifdef USE_CUDA
TEST(LoraAdapterTest, VerifyDeviceCopy) {
TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
auto cpu_ep = DefaultCpuExecutionProvider();
auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
auto cuda_ep = DefaultCudaExecutionProvider();
auto cuda_allocator = cuda_ep->CreatePreferredAllocators()[0];

auto gpu_transfer = cuda_ep->GetDataTransfer();
auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0];
auto cuda_transfer = DefaultCudaExecutionProvider()->GetDataTransfer();

auto test_params = GenerateTestParameters<float>()();
lora::LoraAdapter adapter(std::move(cuda_allocator));
@@ -222,9 +220,43 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());

Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
ASSERT_TRUE(gpu_transfer->CanCopy(tensor_device.Location().device,
ASSERT_TRUE(cuda_transfer->CanCopy(tensor_device.Location().device,
copy.Location().device));
ASSERT_STATUS_OK(cuda_transfer->CopyTensor(tensor_device, copy));

auto expected_span = tensor_cpu.DataAsSpan<float>();
auto copy_span = copy.DataAsSpan<float>();

ASSERT_EQ(expected_span, copy_span);
}
}
#endif

#ifdef USE_DML
TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
auto cpu_ep = DefaultCpuExecutionProvider();
auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];

auto dml_allocator = DefaultDmlExecutionProvider()->CreatePreferredAllocators()[0];
auto dml_transfer = DefaultDmlExecutionProvider()->GetDataTransfer();

auto test_params = GenerateTestParameters<float>()();
lora::LoraAdapter adapter(std::move(dml_allocator));
adapter.Load(std::move(test_params));

auto [begin, end] = adapter.GetParamIterators();
for (; begin != end; ++begin) {
const auto& [_, param] = *begin;
const auto& tensor_device = param.GetDeviceOrMapped().Get<Tensor>();
ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::DML));

const auto& tensor_cpu = param.GetMapped().Get<Tensor>();
ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());

Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
ASSERT_TRUE(dml_transfer->CanCopy(tensor_device.Location().device,
copy.Location().device));
ASSERT_STATUS_OK(gpu_transfer->CopyTensor(tensor_device, copy));
ASSERT_STATUS_OK(dml_transfer->CopyTensor(tensor_device, copy));

auto expected_span = tensor_cpu.DataAsSpan<float>();
auto copy_span = copy.DataAsSpan<float>();
@@ -233,5 +265,6 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
}
}
#endif

} // namespace test
} // namespace onnxruntime
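
For orientation, a rough sketch (not part of the commit, assuming ORT's internal headers and a device AllocatorPtr obtained from a CUDA or DML provider) of the file-based load path added in lora_adapters.cc; the tests above use the in-memory Load() overload instead, but the device-copy behavior is the same:

#include <filesystem>
#include <utility>

#include "core/session/lora_adapters.h"

void LoadAdapterOnDevice(onnxruntime::AllocatorPtr device_allocator,
                         const std::filesystem::path& adapter_path) {
  // The allocator passed to the constructor selects the target device.
  onnxruntime::lora::LoraAdapter adapter(std::move(device_allocator));

  // MemoryMap() maps the adapter file and runs InitializeParamsValues(), which
  // copies every parameter to the device and then calls DataTransfer::Sync()
  // (a no-op except in DML builds).
  adapter.MemoryMap(adapter_path);

  auto [begin, end] = adapter.GetParamIterators();
  for (; begin != end; ++begin) {
    const auto& [name, param] = *begin;
    // With a device allocator supplied, GetDeviceOrMapped() holds the on-device copy.
    const auto& tensor_device = param.GetDeviceOrMapped().Get<onnxruntime::Tensor>();
    (void)name;
    (void)tensor_device;
  }
}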
