diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc
index 466edce187a56..0f6c0e0ea346a 100644
--- a/onnxruntime/core/session/lora_adapters.cc
+++ b/onnxruntime/core/session/lora_adapters.cc
@@ -4,10 +4,9 @@
 #include "core/session/lora_adapters.h"
 #include "lora/adapter_format_utils.h"
 
-#include <memory>
-
 #include "core/framework/data_transfer.h"
 #include "core/framework/error_code_helper.h"
+#include "core/framework/execution_provider.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/allocator_adapters.h"
 #include "core/session/ort_apis.h"
@@ -16,6 +15,15 @@
 #include "core/providers/cuda/cuda_provider_factory.h"
 #endif
 
+#ifdef USE_DML
+#include "core/session/abi_session_options_impl.h"
+#include "core/providers/dml/dml_provider_factory_creator.h"
+#include "core/providers/dml/dml_provider_factory.h"
+#endif
+
+#include <memory>
+#include <unordered_map>
+
 namespace onnxruntime {
 
 #ifdef USE_CUDA
@@ -50,28 +58,55 @@ void LoraAdapter::MemoryMap(const std::filesystem::path& file_path) {
   InitializeParamsValues();
 }
 
-static std::unique_ptr<IDataTransfer> GetDataTransfer(const OrtMemoryInfo& mem_info) {
+namespace {
+struct DataTransfer {
+  std::unique_ptr<IExecutionProvider> ep;
   std::unique_ptr<IDataTransfer> data_transfer;
-
-  if (strcmp(mem_info.name, onnxruntime::CPU) == 0) {
-    return data_transfer;
+  Status CopyTensor(const Tensor& src, Tensor& dst) const {
+    return data_transfer->CopyTensor(src, dst);
   }
+  Status Sync() const {
+#if USE_DML
+    return ep->Sync();
+#else
+    return Status::OK();
+#endif
+  }
+};
+}  // namespace
+
+static Status GetDataTransfer(const OrtMemoryInfo& mem_info, [[maybe_unused]] DataTransfer& dt) {
+  ORT_RETURN_IF(strcmp(mem_info.name, onnxruntime::CPU) == 0, "Expecting on device allocator for LoraAdapter");
 
   if (strcmp(mem_info.name, onnxruntime::CUDA) == 0) {
 #ifdef USE_CUDA
     auto* cuda_provider_info = TryGetProviderInfo_CUDA();
     if (cuda_provider_info != nullptr) {
-      data_transfer = cuda_provider_info->CreateGPUDataTransfer();
+      dt.data_transfer = cuda_provider_info->CreateGPUDataTransfer();
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider could not be loaded");
     }
+#else
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider is not enabled in this build");
 #endif
+  } else if (strcmp(mem_info.name, onnxruntime::DML) == 0) {
+#ifdef USE_DML
+    auto ep_factory = onnxruntime::DMLProviderFactoryCreator::Create(ConfigOptions{}, 0, false, false, false);
+    dt.ep = ep_factory->CreateProvider();
+    dt.data_transfer = dt.ep->GetDataTransfer();
+#else
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DML provider is not enabled in this build");
+#endif
+  } else {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported device allocator");
   }
 
-  return data_transfer;
+  return Status::OK();
 }
 
 static Status CreateOrtValueOnDevice(const OrtValue& ort_value_mapped,
                                      const AllocatorPtr& device_allocator,
-                                     const IDataTransfer& data_transfer,
+                                     const DataTransfer& data_transfer,
                                      OrtValue& out) {
   OrtValue result;
   const auto& src = ort_value_mapped.Get<Tensor>();
@@ -87,12 +122,9 @@ void LoraAdapter::InitializeParamsValues() {
     ORT_THROW("Adapter is not loaded yet.");
   }
 
-  std::unique_ptr<IDataTransfer> data_transfer;
+  DataTransfer data_transfer;
   if (device_allocator_) {
-    data_transfer = GetDataTransfer(device_allocator_->Info());
-    if (data_transfer == nullptr) {
-      ORT_THROW("Data transfer is not available for the specified device allocator, it also must not be a CPU allocator");
-    }
+    ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), data_transfer));
   }
 
   const auto* params = adapter_->parameters();
@@ -100,12 +132,12 @@ void LoraAdapter::InitializeParamsValues() {
   std::unordered_map<std::string, Param> params_values;
   params_values.reserve(params->size());
   // Re-work in two separate loops due to compiler issues
-  if (data_transfer) {
+  if (device_allocator_) {
     for (const auto* param : *params) {
      auto [name, ort_value] = adapters::utils::CreateOrtValueOverLoraParameter(*param);
       OrtValue ort_value_ondevice;
       ORT_THROW_IF_ERROR(CreateOrtValueOnDevice(ort_value, device_allocator_,
-                                                *data_transfer, ort_value_ondevice));
+                                                data_transfer, ort_value_ondevice));
       Param lora_param(std::move(ort_value), std::move(ort_value_ondevice));
       params_values.emplace(std::move(name), std::move(lora_param));
     }
@@ -117,6 +149,10 @@ void LoraAdapter::InitializeParamsValues() {
     }
   }
 
+  if (device_allocator_) {
+    ORT_THROW_IF_ERROR(data_transfer.Sync());
+  }
+
   params_values_.swap(params_values);
 }
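Note on the lora_adapters.cc change: GetDataTransfer() previously returned a bare std::unique_ptr<IDataTransfer> and collapsed every failure mode into nullptr. It now reports a distinct Status per failure and fills a small DataTransfer wrapper that, in the DML case, also owns the execution provider it had to create, so the device copies it queues can be flushed once after all parameters are staged. A minimal sketch of the intended call pattern, using only names introduced in this diff:

    // Sketch (not part of the diff): how InitializeParamsValues() is expected
    // to drive the wrapper after this change.
    DataTransfer dt;
    ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), dt));  // CUDA or DML
    // ... stage every LoRA parameter on device via dt.CopyTensor(src, dst) ...
    ORT_THROW_IF_ERROR(dt.Sync());  // flushes DML's queued copies; Status::OK() in other builds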
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index e8291a36447ca..fde603858f9a9 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -200,13 +200,11 @@ TEST(LoraAdapterTest, Load) {
 }
 
 #ifdef USE_CUDA
-TEST(LoraAdapterTest, VerifyDeviceCopy) {
+TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
-  auto cuda_ep = DefaultCudaExecutionProvider();
-  auto cuda_allocator = cuda_ep->CreatePreferredAllocators()[0];
-
-  auto gpu_transfer = cuda_ep->GetDataTransfer();
+  auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0];
+  auto cuda_transfer = DefaultCudaExecutionProvider()->GetDataTransfer();
 
   auto test_params = GenerateTestParameters<float>()();
   lora::LoraAdapter adapter(std::move(cuda_allocator));
@@ -222,9 +220,43 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
     ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
 
     Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
-    ASSERT_TRUE(gpu_transfer->CanCopy(tensor_device.Location().device,
+    ASSERT_TRUE(cuda_transfer->CanCopy(tensor_device.Location().device,
+                                       copy.Location().device));
+    ASSERT_STATUS_OK(cuda_transfer->CopyTensor(tensor_device, copy));
+
+    auto expected_span = tensor_cpu.DataAsSpan<float>();
+    auto copy_span = copy.DataAsSpan<float>();
+
+    ASSERT_EQ(expected_span, copy_span);
+  }
+}
+#endif
+
+#ifdef USE_DML
+TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
+
+  auto dml_allocator = DefaultDmlExecutionProvider()->CreatePreferredAllocators()[0];
+  auto dml_transfer = DefaultDmlExecutionProvider()->GetDataTransfer();
+
+  auto test_params = GenerateTestParameters<float>()();
+  lora::LoraAdapter adapter(std::move(dml_allocator));
+  adapter.Load(std::move(test_params));
+
+  auto [begin, end] = adapter.GetParamIterators();
+  for (; begin != end; ++begin) {
+    const auto& [_, param] = *begin;
+    const auto& tensor_device = param.GetDeviceOrMapped().Get<Tensor>();
+    ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::DML));
+
+    const auto& tensor_cpu = param.GetMapped().Get<Tensor>();
+    ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
+
+    Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
+    ASSERT_TRUE(dml_transfer->CanCopy(tensor_device.Location().device,
                                       copy.Location().device));
-    ASSERT_STATUS_OK(gpu_transfer->CopyTensor(tensor_device, copy));
+    ASSERT_STATUS_OK(dml_transfer->CopyTensor(tensor_device, copy));
 
     auto expected_span = tensor_cpu.DataAsSpan<float>();
     auto copy_span = copy.DataAsSpan<float>();
@@ -233,5 +265,6 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
   }
 }
 #endif
+
 }  // namespace test
 }  // namespace onnxruntime
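The renamed VerifyCudaDeviceCopy and the new VerifyDmlDeviceCopy exercise the device round trip at the internal API level. For context, a hedged sketch of reaching the same device-load path through the public C++ API (onnxruntime_cxx_api.h, ORT 1.20+); the file names are placeholders, and the session is assumed to already be configured with the DML execution provider:

    // Sketch: load a LoRA adapter directly onto the device, then activate it.
    // Supplying a non-CPU OrtAllocator makes LoraAdapter copy its parameters
    // to that device at load time, which is the code path changed above.
    Ort::Env env;
    Ort::Session session(env, ORT_TSTR("model.onnx"), Ort::SessionOptions{});  // placeholder model
    Ort::MemoryInfo dml_info("DML", OrtDeviceAllocator, /*device_id=*/0, OrtMemTypeDefault);
    Ort::Allocator device_allocator(session, dml_info);  // assumes a DML-enabled session
    auto adapter = Ort::LoraAdapter::CreateLoraAdapter(ORT_TSTR("adapter.onnx_adapter"),  // placeholder file
                                                       device_allocator);
    Ort::RunOptions run_options;
    run_options.AddActiveLoraAdapter(adapter);
    // session.Run(run_options, ...) now runs with the adapted parameters.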