diff --git a/NDTensors/Project.toml b/NDTensors/Project.toml
index 77f81d1deb..8a27aead81 100644
--- a/NDTensors/Project.toml
+++ b/NDTensors/Project.toml
@@ -37,12 +37,14 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 
 [extensions]
 NDTensorsCUDAExt = "CUDA"
 NDTensorsMetalExt = "Metal"
 NDTensorsOctavianExt = "Octavian"
 NDTensorsTBLISExt = "TBLIS"
+NDTensorsAMDGPUExt = "AMDGPU"
 
 [compat]
 Accessors = "0.1.33"
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/NDTensorsAMDGPUExt.jl b/NDTensors/ext/NDTensorsAMDGPUExt/NDTensorsAMDGPUExt.jl
new file mode 100644
index 0000000000..76fa1b065c
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/NDTensorsAMDGPUExt.jl
@@ -0,0 +1,11 @@
+module NDTensorsAMDGPUExt
+
+include("copyto.jl")
+include("set_types.jl")
+include("adapt.jl")
+include("indexing.jl")
+include("linearalgebra.jl")
+include("mul.jl")
+include("permutedims.jl")
+
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl b/NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl
new file mode 100644
index 0000000000..9e4a25c7a7
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl
@@ -0,0 +1,31 @@
+using NDTensors: NDTensors, EmptyStorage, adapt_storagetype, emptytype
+using NDTensors.AMDGPUExtensions: AMDGPUExtensions, ROCArrayAdaptor
+using NDTensors.GPUArraysCoreExtensions: storagemode
+using NDTensors.TypeParameterAccessors:
+  default_type_parameter,
+  set_type_parameter,
+  set_type_parameters,
+  type_parameter,
+  type_parameters
+using Adapt: Adapt, adapt
+using AMDGPU: AMDGPU, ROCArray, ROCVector
+using Functors: fmap
+
+function AMDGPUExtensions.roc(xs; storagemode=default_type_parameter(ROCArray, storagemode))
+  return fmap(x -> adapt(ROCArrayAdaptor{storagemode}(), x), xs)
+end
+
+function Adapt.adapt_storage(adaptor::ROCArrayAdaptor, xs::AbstractArray)
+  new_parameters = (type_parameters(xs, (eltype, ndims))..., storagemode(adaptor))
+  roctype = set_type_parameters(ROCArray, (eltype, ndims, storagemode), new_parameters)
+  return isbits(xs) ? xs : adapt(roctype, xs)
+end
+
+function NDTensors.adapt_storagetype(
+  adaptor::ROCArrayAdaptor, xs::Type{EmptyStorage{ElT,StoreT}}
+) where {ElT,StoreT}
+  roctype = set_type_parameters(
+    ROCVector, (eltype, storagemode), (ElT, storagemode(adaptor))
+  )
+  return emptytype(adapt_storagetype(roctype, StoreT))
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/copyto.jl b/NDTensors/ext/NDTensorsAMDGPUExt/copyto.jl
new file mode 100644
index 0000000000..cba61603a2
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/copyto.jl
@@ -0,0 +1,35 @@
+using NDTensors.Expose: Exposed, expose, parent, unexpose
+using LinearAlgebra: LinearAlgebra, Adjoint
+using AMDGPU: ROCArray
+
+# Same definition as `MtlArray`.
+function Base.copy(src::Exposed{<:ROCArray,<:Base.ReshapedArray})
+  return reshape(copy(parent(src)), size(unexpose(src)))
+end
+
+function Base.copy(
+  src::Exposed{
+    <:ROCArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
+  },
+)
+  return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
+end
+
+function Base.copyto!(dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:SubArray})
+  copyto!(dest, expose(copy(src)))
+  return unexpose(dest)
+end
+
+function Base.copyto!(
+  dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:Base.ReshapedArray}
+)
+  copyto!(dest, expose(parent(src)))
+  return unexpose(dest)
+end
+
+function Base.copyto!(
+  dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:LinearAlgebra.Transpose}
+)
+  copyto!(expose(transpose(dest)), expose(parent(src)))
+  return unexpose(dest)
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl b/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl
new file mode 100644
index 0000000000..c0b9fc4afd
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl
@@ -0,0 +1,23 @@
+using NDTensors.Expose: Exposed, expose, parent, unexpose
+using NDTensors.GPUArraysCoreExtensions: cpu
+using AMDGPU: AMDGPU, ROCArray
+using GPUArraysCore: @allowscalar
+
+function Base.getindex(E::Exposed{<:ROCArray})
+  return @allowscalar unexpose(E)[]
+end
+
+function Base.setindex!(E::Exposed{<:ROCArray}, x::Number)
+  @allowscalar unexpose(E)[] = x
+  return unexpose(E)
+end
+
+function Base.getindex(E::Exposed{<:ROCArray,<:Adjoint}, i, j)
+  return (expose(parent(E))[j, i])'
+end
+
+Base.any(f, E::Exposed{<:ROCArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))
+
+function Base.print_array(io::IO, E::Exposed{<:ROCArray})
+  return Base.print_array(io, expose(cpu(E)))
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/linearalgebra.jl b/NDTensors/ext/NDTensorsAMDGPUExt/linearalgebra.jl
new file mode 100644
index 0000000000..642d2e6da0
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/linearalgebra.jl
@@ -0,0 +1,22 @@
+using NDTensors.AMDGPUExtensions: roc
+using NDTensors.Expose: Expose, Exposed, expose, ql, ql_positive
+using NDTensors.GPUArraysCoreExtensions: cpu
+using NDTensors.TypeParameterAccessors: unwrap_array_type
+using LinearAlgebra: svd
+using Adapt: adapt
+using AMDGPU: ROCMatrix
+
+function LinearAlgebra.svd(A::Exposed{<:ROCMatrix}; kwargs...)
+  U, S, V = svd(cpu(A))
+  return roc.((U, S, V))
+end
+
+## TODO: AMDGPU.jl doesn't currently provide `ql`, so compute it on the CPU for now.
+function Expose.ql(A::Exposed{<:ROCMatrix})
+  Q, L = ql(expose(cpu(A)))
+  return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
+end
+function Expose.ql_positive(A::Exposed{<:ROCMatrix})
+  Q, L = ql_positive(expose(cpu(A)))
+  return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/mul.jl b/NDTensors/ext/NDTensorsAMDGPUExt/mul.jl
new file mode 100644
index 0000000000..8d332e8452
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/mul.jl
@@ -0,0 +1,45 @@
+using NDTensors.Expose: Exposed, expose, parent, unexpose
+using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!
+using AMDGPU: ROCArray
+
+# This was calling generic matrix multiplication.
+function LinearAlgebra.mul!(
+  CM::Exposed{<:ROCArray,<:LinearAlgebra.Transpose},
+  AM::Exposed{<:ROCArray},
+  BM::Exposed{<:ROCArray},
+  α,
+  β,
+)
+  mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
+  return unexpose(CM)
+end
+
+# This was calling generic matrix multiplication.
+function LinearAlgebra.mul!(
+  CM::Exposed{<:ROCArray,<:LinearAlgebra.Adjoint},
+  AM::Exposed{<:ROCArray},
+  BM::Exposed{<:ROCArray},
+  α,
+  β,
+)
+  mul!(CM', BM', AM', α, β)
+  return unexpose(CM)
+end
+
+# Fix an issue where AMDGPU.jl cannot recognize
+# Transpose{Reshape{Adjoint{ROCArray}}} as a ROCArray and falls back to generic matmul.
+function LinearAlgebra.mul!(
+  CM::Exposed{<:ROCArray},
+  AM::Exposed{<:ROCArray},
+  BM::Exposed{
+    <:ROCArray,
+    <:LinearAlgebra.Transpose{
+      <:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
+    },
+  },
+  α,
+  β,
+)
+  mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
+  return unexpose(CM)
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/permutedims.jl b/NDTensors/ext/NDTensorsAMDGPUExt/permutedims.jl
new file mode 100644
index 0000000000..cc284e6389
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/permutedims.jl
@@ -0,0 +1,23 @@
+using NDTensors.Expose: Exposed, expose, parent, unexpose
+using AMDGPU: ROCArray
+
+function Base.permutedims!(
+  Edest::Exposed{<:ROCArray,<:Base.ReshapedArray}, Esrc::Exposed{<:ROCArray}, perm
+)
+  Aperm = permutedims(Esrc, perm)
+  copyto!(expose(parent(Edest)), expose(Aperm))
+  return unexpose(Edest)
+end
+
+# There is an issue in AMDGPU.jl where `.=` can fail when `Edest` is a reshaped `Adjoint`,
+# so instead reshape `Esrc` into the shape of `parent(Edest)`.
+function Base.permutedims!(
+  Edest::Exposed{<:ROCArray,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}},
+  Esrc::Exposed{<:ROCArray},
+  perm,
+  f,
+)
+  Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest)))
+  parent(Edest) .= f.(parent(Edest), Aperm)
+  return unexpose(Edest)
+end
diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/set_types.jl b/NDTensors/ext/NDTensorsAMDGPUExt/set_types.jl
new file mode 100644
index 0000000000..59ed52b5d0
--- /dev/null
+++ b/NDTensors/ext/NDTensorsAMDGPUExt/set_types.jl
@@ -0,0 +1,11 @@
+# TypeParameterAccessors definitions
+using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
+using NDTensors.GPUArraysCoreExtensions: storagemode
+using AMDGPU: AMDGPU, ROCArray
+
+function TypeParameterAccessors.default_type_parameters(::Type{<:ROCArray})
+  return (Float64, 1, AMDGPU.Mem.HIPBuffer)
+end
+TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(eltype)) = Position(1)
+TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(ndims)) = Position(2)
+TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(storagemode)) = Position(3)
diff --git a/NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl b/NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl
index 4720df1e21..4781386ea2 100644
--- a/NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl
+++ b/NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl
@@ -41,3 +41,16 @@ function NDTensors.svd_catch_error(A::CuMatrix, ::CUDA.CUSOLVER.QRAlgorithm)
   end
   return USV
 end
+
+using NDTensors.GPUArraysCoreExtensions: cpu
+using NDTensors.Expose: Expose, expose, ql, ql_positive
+using NDTensors.TypeParameterAccessors: unwrap_array_type
+## TODO: CUDA.jl doesn't currently provide `ql`, so compute it on the CPU for now.
+function Expose.ql(A::Exposed{<:CuMatrix})
+  Q, L = ql(expose(cpu(A)))
+  return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
+end
+function Expose.ql_positive(A::Exposed{<:CuMatrix})
+  Q, L = ql_positive(expose(cpu(A)))
+  return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
+end
diff --git a/NDTensors/ext/NDTensorsCUDAExt/set_types.jl b/NDTensors/ext/NDTensorsCUDAExt/set_types.jl
index 9244e99d12..3b0d7a592a 100644
--- a/NDTensors/ext/NDTensorsCUDAExt/set_types.jl
+++ b/NDTensors/ext/NDTensorsCUDAExt/set_types.jl
@@ -1,7 +1,7 @@
 # TypeParameterAccessors definitions
 using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
 using NDTensors.GPUArraysCoreExtensions: storagemode
-## TODO remove TypeParameterAccessors when SetParameters is removed
+
 function TypeParameterAccessors.position(::Type{<:CuArray}, ::typeof(eltype))
   return Position(1)
 end
diff --git a/NDTensors/src/adapt.jl b/NDTensors/src/adapt.jl
index cffebe862a..df5770224f 100644
--- a/NDTensors/src/adapt.jl
+++ b/NDTensors/src/adapt.jl
@@ -2,7 +2,6 @@ using .GPUArraysCoreExtensions: GPUArraysCoreExtensions
 
 adapt_structure(to, x::TensorStorage) = setdata(x, adapt(to, data(x)))
 adapt_structure(to, x::Tensor) = setstorage(x, adapt(to, storage(x)))
-## use unwrap cpu here because Expose is included before NDTensors
 function GPUArraysCoreExtensions.cpu(eltype::Type{<:Number}, x)
   return fmap(x -> adapt(Array{eltype}, x), x)
 end
diff --git a/NDTensors/src/imports.jl b/NDTensors/src/imports.jl
index 4d1c789e41..21d3bcd5d5 100644
--- a/NDTensors/src/imports.jl
+++ b/NDTensors/src/imports.jl
@@ -29,6 +29,7 @@ for lib in [
   :UnspecifiedTypes,
   :TypeParameterAccessors,
   :GPUArraysCoreExtensions,
+  :AMDGPUExtensions,
   :CUDAExtensions,
   :MetalExtensions,
   :Expose,
@@ -58,9 +59,10 @@ using Base.Cartesian: @nexprs
 
 using Base.Threads: @spawn
 
+using .AMDGPUExtensions: roc
 using .CUDAExtensions: cu
-using .MetalExtensions: mtl
 using .GPUArraysCoreExtensions: cpu
+using .MetalExtensions: mtl
 
 import Base:
   # Types
diff --git a/NDTensors/src/lib/AMDGPUExtensions/.JuliaFormatter.toml b/NDTensors/src/lib/AMDGPUExtensions/.JuliaFormatter.toml
new file mode 100644
index 0000000000..08f664cdb9
--- /dev/null
+++ b/NDTensors/src/lib/AMDGPUExtensions/.JuliaFormatter.toml
@@ -0,0 +1,2 @@
+style = "blue"
+indent = 2
diff --git a/NDTensors/src/lib/AMDGPUExtensions/src/AMDGPUExtensions.jl b/NDTensors/src/lib/AMDGPUExtensions/src/AMDGPUExtensions.jl
new file mode 100644
index 0000000000..e9e77e6cc5
--- /dev/null
+++ b/NDTensors/src/lib/AMDGPUExtensions/src/AMDGPUExtensions.jl
@@ -0,0 +1,4 @@
+module AMDGPUExtensions
+include("roc.jl")
+
+end
diff --git a/NDTensors/src/lib/AMDGPUExtensions/src/roc.jl b/NDTensors/src/lib/AMDGPUExtensions/src/roc.jl
new file mode 100644
index 0000000000..2cd0aca64f
--- /dev/null
+++ b/NDTensors/src/lib/AMDGPUExtensions/src/roc.jl
@@ -0,0 +1,14 @@
+using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
+using NDTensors.GPUArraysCoreExtensions: storagemode
+# Implemented in NDTensorsAMDGPUExt
+function roc end
+
+## Here we need an ROCArrayAdaptor to prevent conversion of 64 bit numbers to 32 bit.
+## We cannot write `adapt(ROCVector, x)` because this
+## will not allow us to properly utilize the buffer preference without changing the value of
+## default_buffertype. Also `adapt(ROCVector{<:Any, <:Any, Buffertype})` fails to work properly.
+struct ROCArrayAdaptor{B} end
+
+function TypeParameterAccessors.position(::Type{<:ROCArrayAdaptor}, ::typeof(storagemode))
+  return Position(1)
+end
diff --git a/NDTensors/src/lib/AMDGPUExtensions/test/runtests.jl b/NDTensors/src/lib/AMDGPUExtensions/test/runtests.jl
new file mode 100644
index 0000000000..da274f21da
--- /dev/null
+++ b/NDTensors/src/lib/AMDGPUExtensions/test/runtests.jl
@@ -0,0 +1,9 @@
+@eval module $(gensym())
+using Test: @testset, @test
+using NDTensors.AMDGPUExtensions: roc, ROCArrayAdaptor
+using NDTensors.GPUArraysCoreExtensions: storagemode
+@testset "roc and ROCArrayAdaptor" begin
+  @test roc isa Function
+  @test storagemode(ROCArrayAdaptor{1}) == 1
+end
+end
diff --git a/NDTensors/src/lib/CUDAExtensions/src/cuda.jl b/NDTensors/src/lib/CUDAExtensions/src/cuda.jl
index 36b793f748..9fa41e2f5b 100644
--- a/NDTensors/src/lib/CUDAExtensions/src/cuda.jl
+++ b/NDTensors/src/lib/CUDAExtensions/src/cuda.jl
@@ -1,9 +1,9 @@
 using NDTensors.TypeParameterAccessors: TypeParameterAccessors
 using NDTensors.GPUArraysCoreExtensions: storagemode
-# Implemented in `ITensorGPU` and NDTensorCUDA
+# Implemented in `ITensorGPU` and NDTensorsCUDAExt
 function cu end
 
-## Here we need an NDTensorCuArrayAdaptor because the CuArrayAdaptor provided by CUDA
+## Here we need our own `CuArrayAdaptor` because the `CuArrayAdaptor` provided by CUDA.jl
 ## converts 64 bit numbers to 32 bit. We cannot write `adapt(CuVector, x)` because this
 ## Will not allow us to properly utilize the buffer preference without changing the value of
 ## default_buffertype. Also `adapt(CuVector{<:Any, <:Any, Buffertype})` fails to work properly
diff --git a/NDTensors/src/linearalgebra/linearalgebra.jl b/NDTensors/src/linearalgebra/linearalgebra.jl
index d880cb5cfa..bd1690d127 100644
--- a/NDTensors/src/linearalgebra/linearalgebra.jl
+++ b/NDTensors/src/linearalgebra/linearalgebra.jl
@@ -389,11 +389,6 @@ function ql_positive(M::AbstractMatrix)
   # TODO: Change to `isgpu`, or better yet rewrite
   # in terms of broadcasting and linear algebra
   # like `qr_positive`.
-  iscuda = iscu(M)
-  if iscuda
-    cutype = unwrap_array_type(M)
-    M = NDTensors.cpu(M)
-  end
   sparseQ, L = ql(M)
   Q = convert(typeof(L), sparseQ)
   nr, nc = size(L)
@@ -407,10 +402,6 @@ function ql_positive(M::AbstractMatrix)
       end
     end
   end
-  if iscuda
-    Q = adapt(cutype, Q)
-    L = adapt(cutype, L)
-  end
   return (Q, L)
 end
 
@@ -423,16 +414,7 @@ function ql(A::AbstractMatrix)
   T = eltype(A)
   AA = similar(A, LinearAlgebra._qreltype(T), size(A))
   copyto!(expose(AA), expose(A))
-  iscuda = iscu(AA)
-  if iscuda
-    cutype = unwrap_array_type(AA)
-    AA = NDTensors.cpu(AA)
-  end
   Q, L = ql!(AA)
-  if iscuda
-    Q = adapt(cutype, Q)
-    L = adapt(cutype, L)
-  end
   return (Q, L)
 end
 #
@@ -440,6 +422,8 @@ end
 # about unpacking Q and L from the A matrix.
 #
 function ql!(A::StridedMatrix{<:LAPACK.BlasFloat})
+  ## TODO: Is this check really necessary here? We could define an `Expose` method if
+  ## we need this function on CUDA/GPU.
   if iscu(A)
     throw("Error: ql is not implemented in CUDA.jl")
   end
diff --git a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl
index b1b63ab483..d431edb189 100644
--- a/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl
+++ b/NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl
@@ -5,6 +5,7 @@ module TestITensorDMRG
 using ITensors
 using NDTensors
 using NDTensors.CUDAExtensions: cu
+using NDTensors.AMDGPUExtensions: roc
 using Random
 
 reference_energies = Dict([
@@ -12,7 +13,10 @@ reference_energies = Dict([
 ])
 
 is_broken(dev, elt::Type, conserve_qns::Val) = false
+## Disable blocksparse GPU testing on the CUDA and ROCm backends while
+## we work on the blocksparse GPU backend. In the future these will work too.
 is_broken(dev::typeof(cu), elt::Type, conserve_qns::Val{true}) = true
+is_broken(dev::typeof(roc), elt::Type, conserve_qns::Val{true}) = true
 
 include("dmrg.jl")
 
diff --git a/NDTensors/test/NDTensorsTestUtils/device_list.jl b/NDTensors/test/NDTensorsTestUtils/device_list.jl
index b4b3f58d21..8e105f6fe5 100644
--- a/NDTensors/test/NDTensorsTestUtils/device_list.jl
+++ b/NDTensors/test/NDTensorsTestUtils/device_list.jl
@@ -2,6 +2,9 @@ using NDTensors: NDTensors
 if "cuda" in ARGS || "all" in ARGS
   using CUDA
 end
+if "rocm" in ARGS || "all" in ARGS
+  using AMDGPU
+end
 if "metal" in ARGS || "all" in ARGS
   using Metal
 end
@@ -22,6 +25,10 @@ function devices_list(test_args)
     end
   end
 
+  if "rocm" in test_args || "all" in test_args
+    push!(devs, NDTensors.AMDGPUExtensions.roc)
+  end
+
   if "metal" in test_args || "all" in test_args
     push!(devs, NDTensors.MetalExtensions.mtl)
   end
diff --git a/NDTensors/test/Project.toml b/NDTensors/test/Project.toml
index 7e4cbc9661..a460653466 100644
--- a/NDTensors/test/Project.toml
+++ b/NDTensors/test/Project.toml
@@ -23,4 +23,5 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
diff --git a/NDTensors/test/lib/runtests.jl b/NDTensors/test/lib/runtests.jl
index 65038f82a2..619bfa1c18 100644
--- a/NDTensors/test/lib/runtests.jl
+++ b/NDTensors/test/lib/runtests.jl
@@ -4,6 +4,7 @@ using Test: @testset
 @testset "Test NDTensors lib $lib" for lib in [
   "AlgorithmSelection",
   "AllocateData",
+  "AMDGPUExtensions",
   "BaseExtensions",
   "BlockSparseArrays",
   "BroadcastMapConversion",
diff --git a/NDTensors/test/runtests.jl b/NDTensors/test/runtests.jl
index 6d81096d07..66366562c1 100644
--- a/NDTensors/test/runtests.jl
+++ b/NDTensors/test/runtests.jl
@@ -2,6 +2,7 @@ using SafeTestsets: @safetestset
 
 @safetestset "NDTensors" begin
   using Test: @testset
+  using NDTensors: NDTensors
   @testset "$(@__DIR__)" begin
     filenames = filter(readdir(@__DIR__)) do f
       startswith("test_")(f) && endswith(".jl")(f)
@@ -15,11 +16,9 @@ using SafeTestsets: @safetestset
     end
   end
   if "cuda" in ARGS || "all" in ARGS
-    using NDTensors: NDTensors
     include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorCUDA.jl"))
   end
   if "metal" in ARGS || "all" in ARGS
-    using NDTensors: NDTensors
     include(joinpath(pkgdir(NDTensors), "ext", "examples", "NDTensorMetal.jl"))
   end
 end
diff --git a/NDTensors/test/test_linearalgebra.jl b/NDTensors/test/test_linearalgebra.jl
index 289b2c4625..96bde8efdf 100644
--- a/NDTensors/test/test_linearalgebra.jl
+++ b/NDTensors/test/test_linearalgebra.jl
@@ -39,6 +39,11 @@ end
   if !is_supported_eltype(dev, elt)
     continue
   end
+  ## It looks like AMDGPU.jl has an issue with QR when A is singular.
+  ## TODO: potentially add an `is_broken` function for this?
+  if dev == NDTensors.AMDGPUExtensions.roc && singular
+    continue
+  end
   eps = Base.eps(real(elt)) * 100 #this is set rather tight, so if you increase/change m,n you may have open up the tolerance on eps.
   n, m = 4, 8
   Id = Diagonal(fill(1.0, min(n, m)))
diff --git a/src/mps/dmrg.jl b/src/mps/dmrg.jl
index 6eb753eac9..80e8af50d8 100644
--- a/src/mps/dmrg.jl
+++ b/src/mps/dmrg.jl
@@ -251,6 +251,7 @@ function dmrg(
         ## Right now there is a conversion problem in CUDA.jl where `UnifiedMemory` Arrays are being converted
         ## into `DeviceMemory`. This conversion line is here temporarily to fix that problem when it arises
         ## Adapt is only called when using CUDA backend. CPU will work as implemented previously.
+        ## TODO: this might be the only place we really need `iscu`, if that conversion problem is not fixed.
         phi::ITensor = if NDTensors.iscu(phi) && NDTensors.iscu(vecs[1])
           adapt(set_eltype(unwrap_array_type(phi), eltype(vecs[1])), vecs[1])
         else