diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc
index 466edce187a56..0f6c0e0ea346a 100644
--- a/onnxruntime/core/session/lora_adapters.cc
+++ b/onnxruntime/core/session/lora_adapters.cc
@@ -4,10 +4,9 @@
 #include "core/session/lora_adapters.h"
 #include "lora/adapter_format_utils.h"
 
-#include <memory>
-
 #include "core/framework/data_transfer.h"
 #include "core/framework/error_code_helper.h"
+#include "core/framework/execution_provider.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/allocator_adapters.h"
 #include "core/session/ort_apis.h"
@@ -16,6 +15,15 @@
 #include "core/providers/cuda/cuda_provider_factory.h"
 #endif
 
+#ifdef USE_DML
+#include "core/session/abi_session_options_impl.h"
+#include "core/providers/dml/dml_provider_factory_creator.h"
+#include "core/providers/dml/dml_provider_factory.h"
+#endif
+
+#include <memory>
+#include <unordered_map>
+
 namespace onnxruntime {
 
 #ifdef USE_CUDA
@@ -50,28 +58,55 @@ void LoraAdapter::MemoryMap(const std::filesystem::path& file_path) {
   InitializeParamsValues();
 }
 
-static std::unique_ptr<IDataTransfer> GetDataTransfer(const OrtMemoryInfo& mem_info) {
+namespace {
+struct DataTransfer {
+  std::unique_ptr<IExecutionProvider> ep;
   std::unique_ptr<IDataTransfer> data_transfer;
-
-  if (strcmp(mem_info.name, onnxruntime::CPU) == 0) {
-    return data_transfer;
+  Status CopyTensor(const Tensor& src, Tensor& dst) const {
+    return data_transfer->CopyTensor(src, dst);
   }
+  Status Sync() const {
+#if USE_DML
+    return ep->Sync();
+#else
+    return Status::OK();
+#endif
+  }
+};
+}  // namespace
+
+static Status GetDataTransfer(const OrtMemoryInfo& mem_info, [[maybe_unused]] DataTransfer& dt) {
+  ORT_RETURN_IF(strcmp(mem_info.name, onnxruntime::CPU) == 0, "Expecting on device allocator for LoraAdapter");
 
   if (strcmp(mem_info.name, onnxruntime::CUDA) == 0) {
 #ifdef USE_CUDA
     auto* cuda_provider_info = TryGetProviderInfo_CUDA();
     if (cuda_provider_info != nullptr) {
-      data_transfer = cuda_provider_info->CreateGPUDataTransfer();
+      dt.data_transfer = cuda_provider_info->CreateGPUDataTransfer();
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider could not be loaded");
     }
+#else
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "CUDA provider is not enabled in this build");
 #endif
+  } else if (strcmp(mem_info.name, onnxruntime::DML) == 0) {
+#ifdef USE_DML
+    auto ep_factory = onnxruntime::DMLProviderFactoryCreator::Create(ConfigOptions{}, 0, false, false, false);
+    dt.ep = ep_factory->CreateProvider();
+    dt.data_transfer = dt.ep->GetDataTransfer();
+#else
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DML provider is not enabled in this build");
+#endif
+  } else {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported device allocator");
   }
 
-  return data_transfer;
+  return Status::OK();
 }
 
 static Status CreateOrtValueOnDevice(const OrtValue& ort_value_mapped,
                                      const AllocatorPtr& device_allocator,
-                                     const IDataTransfer& data_transfer,
+                                     const DataTransfer& data_transfer,
                                      OrtValue& out) {
   OrtValue result;
   const auto& src = ort_value_mapped.Get<Tensor>();
@@ -87,12 +122,9 @@ void LoraAdapter::InitializeParamsValues() {
     ORT_THROW("Adapter is not loaded yet.");
   }
 
-  std::unique_ptr<IDataTransfer> data_transfer;
+  DataTransfer data_transfer;
   if (device_allocator_) {
-    data_transfer = GetDataTransfer(device_allocator_->Info());
-    if (data_transfer == nullptr) {
-      ORT_THROW("Data transfer is not available for the specified device allocator, it also must not be a CPU allocator");
-    }
+    ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), data_transfer));
   }
 
   const auto* params = adapter_->parameters();
@@ -100,12 +132,12 @@ void LoraAdapter::InitializeParamsValues() {
   std::unordered_map<std::string, Param> params_values;
   params_values.reserve(params->size());
   // Re-work in two separate loops due to compiler issues
-  if (data_transfer) {
+  if (device_allocator_) {
     for (const auto* param : *params) {
      auto [name, ort_value] = adapters::utils::CreateOrtValueOverLoraParameter(*param);
       OrtValue ort_value_ondevice;
       ORT_THROW_IF_ERROR(CreateOrtValueOnDevice(ort_value, device_allocator_,
-                                                *data_transfer, ort_value_ondevice));
+                                                data_transfer, ort_value_ondevice));
       Param lora_param(std::move(ort_value), std::move(ort_value_ondevice));
       params_values.emplace(std::move(name), std::move(lora_param));
     }
@@ -117,6 +149,10 @@ void LoraAdapter::InitializeParamsValues() {
     }
   }
 
+  if (device_allocator_) {
+    ORT_THROW_IF_ERROR(data_transfer.Sync());
+  }
+
   params_values_.swap(params_values);
 }
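Note on the lora_adapters.cc change: GetDataTransfer() previously returned a bare std::unique_ptr<IDataTransfer> and collapsed every failure mode into nullptr. It now reports a distinct Status per failure and fills a small DataTransfer wrapper that, in the DML case, also owns the execution provider it had to create, so the device copies it queues can be flushed once after all parameters are staged. A minimal sketch of the intended call pattern, using only names introduced in this diff:

    // Sketch (not part of the diff): how InitializeParamsValues() is expected
    // to drive the wrapper after this change.
    DataTransfer dt;
    ORT_THROW_IF_ERROR(GetDataTransfer(device_allocator_->Info(), dt));  // CUDA or DML
    // ... stage every LoRA parameter on device via dt.CopyTensor(src, dst) ...
    ORT_THROW_IF_ERROR(dt.Sync());  // flushes DML's queued copies; Status::OK() in other builds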
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index e8291a36447ca..fde603858f9a9 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -200,13 +200,11 @@ TEST(LoraAdapterTest, Load) {
 }
 
 #ifdef USE_CUDA
-TEST(LoraAdapterTest, VerifyDeviceCopy) {
+TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
-  auto cuda_ep = DefaultCudaExecutionProvider();
-  auto cuda_allocator = cuda_ep->CreatePreferredAllocators()[0];
-
-  auto gpu_transfer = cuda_ep->GetDataTransfer();
+  auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0];
+  auto cuda_transfer = DefaultCudaExecutionProvider()->GetDataTransfer();
 
   auto test_params = GenerateTestParameters<float>()();
   lora::LoraAdapter adapter(std::move(cuda_allocator));
@@ -222,9 +220,43 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
     ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
 
     Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
-    ASSERT_TRUE(gpu_transfer->CanCopy(tensor_device.Location().device,
+    ASSERT_TRUE(cuda_transfer->CanCopy(tensor_device.Location().device,
+                                       copy.Location().device));
+    ASSERT_STATUS_OK(cuda_transfer->CopyTensor(tensor_device, copy));
+
+    auto expected_span = tensor_cpu.DataAsSpan<float>();
+    auto copy_span = copy.DataAsSpan<float>();
+
+    ASSERT_EQ(expected_span, copy_span);
+  }
+}
+#endif
+
+#ifdef USE_DML
+TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
+
+  auto dml_allocator = DefaultDmlExecutionProvider()->CreatePreferredAllocators()[0];
+  auto dml_transfer = DefaultDmlExecutionProvider()->GetDataTransfer();
+
+  auto test_params = GenerateTestParameters<float>()();
+  lora::LoraAdapter adapter(std::move(dml_allocator));
+  adapter.Load(std::move(test_params));
+
+  auto [begin, end] = adapter.GetParamIterators();
+  for (; begin != end; ++begin) {
+    const auto& [_, param] = *begin;
+    const auto& tensor_device = param.GetDeviceOrMapped().Get<Tensor>();
+    ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::DML));
+
+    const auto& tensor_cpu = param.GetMapped().Get<Tensor>();
+    ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
+
+    Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
+    ASSERT_TRUE(dml_transfer->CanCopy(tensor_device.Location().device,
                                       copy.Location().device));
-    ASSERT_STATUS_OK(gpu_transfer->CopyTensor(tensor_device, copy));
+    ASSERT_STATUS_OK(dml_transfer->CopyTensor(tensor_device, copy));
 
     auto expected_span = tensor_cpu.DataAsSpan<float>();
     auto copy_span = copy.DataAsSpan<float>();
@@ -233,5 +265,6 @@ TEST(LoraAdapterTest, VerifyDeviceCopy) {
   }
 }
 #endif
+
 }  // namespace test
 }  // namespace onnxruntime
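The renamed VerifyCudaDeviceCopy and the new VerifyDmlDeviceCopy exercise the device round trip at the internal API level. For context, a hedged sketch of reaching the same device-load path through the public C++ API (onnxruntime_cxx_api.h, ORT 1.20+); the file names are placeholders, and the session is assumed to already be configured with the DML execution provider:

    // Sketch: load a LoRA adapter directly onto the device, then activate it.
    // Supplying a non-CPU OrtAllocator makes LoraAdapter copy its parameters
    // to that device at load time, which is the code path changed above.
    Ort::Env env;
    Ort::Session session(env, ORT_TSTR("model.onnx"), Ort::SessionOptions{});  // placeholder model
    Ort::MemoryInfo dml_info("DML", OrtDeviceAllocator, /*device_id=*/0, OrtMemTypeDefault);
    Ort::Allocator device_allocator(session, dml_info);  // assumes a DML-enabled session
    auto adapter = Ort::LoraAdapter::CreateLoraAdapter(ORT_TSTR("adapter.onnx_adapter"),  // placeholder file
                                                       device_allocator);
    Ort::RunOptions run_options;
    run_options.AddActiveLoraAdapter(adapter);
    // session.Run(run_options, ...) now runs with the adapted parameters.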