
[NDTensors] Add AMDGPU.jl (ROCm) based extension for NDTensors #1325

Merged Mar 21, 2024 · 41 commits · Changes from 31 commits

Commits
09ccc99
Add AMDGPU.jl based extension for NDTensors
wbernoudy Feb 1, 2024
d75e55b
Add AMD to devices for NDTensors unit tests
wbernoudy Feb 2, 2024
767bdb9
Add fixes for copyto! missing in AMDGPU.jl
wbernoudy Feb 2, 2024
962aadb
Formatting
wbernoudy Feb 2, 2024
01f6c71
Merge branch 'main' into add-rocm-ndtensors-ext
kmp5VT Feb 2, 2024
1b23c1c
Merge branch 'main' into add-rocm-ndtensors-ext
kmp5VT Feb 7, 2024
57804fd
Rename NDTensorsROCmExt to NDTensorsAMDGPUExt
wbernoudy Feb 9, 2024
7353185
Formatting
wbernoudy Feb 9, 2024
9c5a726
Merge branch 'main' into add-rocm-ndtensors-ext
kmp5VT Mar 8, 2024
66bee17
Use CPU-based SVD for AMDGPU arrays
wbernoudy Mar 12, 2024
5552ee8
Remove MPS/MPO tests in AMDGPU ext example
wbernoudy Mar 12, 2024
6517afe
Merge branch 'main' into add-rocm-ndtensors-ext
kmp5VT Mar 12, 2024
482f2b7
Merge branch 'main' into add-rocm-ndtensors-ext
kmp5VT Mar 18, 2024
f1e3ba0
Update the PR to TypeParameterAccessors format.
kmp5VT Mar 18, 2024
a97b2b3
format
kmp5VT Mar 18, 2024
bda36d0
Remove buffertype
kmp5VT Mar 18, 2024
4a0b34f
Move AMDGPU to extras
kmp5VT Mar 18, 2024
5f6634e
AMDGPU.functional is not a function
kmp5VT Mar 18, 2024
0acf1aa
Fix code to precompile
kmp5VT Mar 18, 2024
1ceda2e
Remove comment
kmp5VT Mar 18, 2024
c7e854b
Fixes to get AMDGPU extensions loaded
kmp5VT Mar 19, 2024
456ae3c
spelling error
kmp5VT Mar 19, 2024
119e835
remove isroc
kmp5VT Mar 19, 2024
3801e81
update to allow AMDGPU tests to pass
kmp5VT Mar 19, 2024
5787117
Small updates
kmp5VT Mar 19, 2024
bf5fbf8
remove unecessary imports
kmp5VT Mar 19, 2024
80379df
Remove file
kmp5VT Mar 19, 2024
6267e4c
Remove iscu from linearalgebra.jl and use expose ql function
kmp5VT Mar 19, 2024
ea4d554
Add comment about iscu
kmp5VT Mar 19, 2024
ebf59dc
format
kmp5VT Mar 19, 2024
0839db0
Updates per matts comments.
kmp5VT Mar 20, 2024
813bf7b
Missing updates
kmp5VT Mar 20, 2024
ff51603
format
kmp5VT Mar 20, 2024
1bc6500
Add default_type_parameter
kmp5VT Mar 20, 2024
a75b1fc
typo
kmp5VT Mar 20, 2024
8d95004
format
kmp5VT Mar 20, 2024
8d521e3
Merge branch 'main' into add-rocm-ndtensors-ext
kmp5VT Mar 20, 2024
2019930
buffer -> storagemode
kmp5VT Mar 21, 2024
d903553
Update NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl [no ci]
kmp5VT Mar 21, 2024
4b771fe
Remove example
kmp5VT Mar 21, 2024
b0356a2
Merge branch 'add-rocm-ndtensors-ext' of github.com:wbernoudy/ITensor…
kmp5VT Mar 21, 2024
2 changes: 2 additions & 0 deletions NDTensors/Project.toml
@@ -37,12 +37,14 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

[extensions]
NDTensorsCUDAExt = "CUDA"
NDTensorsMetalExt = "Metal"
NDTensorsOctavianExt = "Octavian"
NDTensorsTBLISExt = "TBLIS"
+NDTensorsAMDGPUExt = "AMDGPU"

[compat]
Accessors = "0.1.33"
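Note on the mechanism (not part of the diff): with AMDGPU registered under [extensions] and a matching dependency entry, Julia 1.9+ loads NDTensorsAMDGPUExt automatically once NDTensors and AMDGPU are both in the session. A minimal sketch, assuming AMDGPU.jl is installed:

# Hypothetical session: confirm the extension activates (requires Julia >= 1.9).
using NDTensors   # extension not yet loaded
using AMDGPU      # loading the trigger package activates NDTensorsAMDGPUExt
ext = Base.get_extension(NDTensors, :NDTensorsAMDGPUExt)
@assert ext !== nothing  # the extension module is now available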
12 changes: 12 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/NDTensorsAMDGPUExt.jl
@@ -0,0 +1,12 @@
module NDTensorsAMDGPUExt
using Functors

include("copyto.jl")
include("set_types.jl")
include("adapt.jl")
include("indexing.jl")
include("linearalgebra.jl")
include("mul.jl")
include("permutedims.jl")

end
29 changes: 29 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl
@@ -0,0 +1,29 @@
using NDTensors: NDTensors, EmptyStorage, adapt_storagetype, emptytype
using NDTensors.AMDGPUExtensions: AMDGPUExtensions, ROCArrayAdaptor
using NDTensors.GPUArraysCoreExtensions: storagemode
using NDTensors.TypeParameterAccessors:
set_type_parameter, set_type_parameters, type_parameter, type_parameters
using Adapt: Adapt, adapt
using AMDGPU: AMDGPU, ROCArray, ROCVector

function AMDGPUExtensions.roc(xs)
return fmap(x -> adapt(ROCArrayAdaptor{AMDGPU.Runtime.Mem.HIPBuffer}(), x), xs)
end

function Adapt.adapt_storage(adaptor::ROCArrayAdaptor, xs::AbstractArray)
roctype = set_type_parameters(
ROCArray, (eltype, ndims), type_parameters(xs, (eltype, ndims))
)
roctype = set_type_parameter(roctype, storagemode, storagemode(adaptor))

return isbits(xs) ? xs : adapt(roctype, xs)
end

function NDTensors.adapt_storagetype(
adaptor::ROCArrayAdaptor, xs::Type{EmptyStorage{ElT,StoreT}}
) where {ElT,StoreT}
roctype = set_type_parameters(
ROCVector, (eltype, storagemode), (ElT, storagemode(adaptor))
)
return emptytype(adapt_storagetype(roctype, StoreT))
end
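Usage sketch (not part of the diff; assumes a functional ROCm device): roc adapts the leaves of a container to ROCArray while preserving the element type, with the adaptor's storage mode becoming the third type parameter.

# Sketch: adapt a CPU array to the GPU without demoting Float64.
using NDTensors, AMDGPU
using NDTensors.AMDGPUExtensions: roc
x = randn(Float64, 4, 4)
xroc = roc(x)
@assert xroc isa ROCArray{Float64,2,AMDGPU.Runtime.Mem.HIPBuffer}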
35 changes: 35 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/copyto.jl
@@ -0,0 +1,35 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using LinearAlgebra: LinearAlgebra, Adjoint
using AMDGPU: ROCArray

# Same definition as `MtlArray`.
function Base.copy(src::Exposed{<:ROCArray,<:Base.ReshapedArray})
return reshape(copy(parent(src)), size(unexpose(src)))
end

function Base.copy(
src::Exposed{
<:ROCArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
},
)
return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
end

function Base.copyto!(dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:SubArray})
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

function Base.copyto!(
dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:Base.ReshapedArray}
)
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end

function Base.copyto!(
dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:LinearAlgebra.Transpose}
)
copyto!(expose(transpose(dest)), expose(parent(src)))
return unexpose(dest)
end
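These methods matter because Base's generic copy/copyto! on such wrapped GPU arrays can fall back to scalar indexing, which device arrays disallow. A hedged sketch of one path covered above, assuming a functional ROCm device:

# Copy a ReshapedArray-of-Adjoint view of a ROCArray without leaving the device.
using AMDGPU
using NDTensors.Expose: expose
a = AMDGPU.rand(Float64, 4, 4)
v = reshape(a', 16)   # a Base.ReshapedArray wrapping an Adjoint of a ROCArray
b = copy(expose(v))   # dispatches to the specialized `copy` above
@assert length(b) == 16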
23 changes: 23 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl
@@ -0,0 +1,23 @@
using NDTensors: NDTensors, data
using NDTensors.Expose: Exposed, expose, parent, unexpose
using NDTensors.GPUArraysCoreExtensions: cpu
using AMDGPU: AMDGPU, ROCArray
using GPUArraysCore: @allowscalar

function Base.getindex(E::Exposed{<:ROCArray})
return @allowscalar unexpose(E)[]
end

function Base.setindex!(E::Exposed{<:ROCArray}, x::Number)
AMDGPU.@allowscalar unexpose(E)[] = x
return unexpose(E)
end

function Base.getindex(E::Exposed{<:ROCArray,<:Adjoint}, i, j)
return (expose(parent(E))[j, i])'
end

Base.any(f, E::Exposed{<:ROCArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))

function Base.print_array(io::IO, E::Exposed{<:ROCArray})
return Base.print_array(io, expose(cpu(E)))
end
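For context: GPU arrays disallow scalar indexing by default, so the zero-dimensional getindex/setindex! methods above opt in explicitly. A small sketch of the behavior, assuming a functional ROCm device:

# Scalar reads require an explicit opt-in on device arrays.
using AMDGPU
using GPUArraysCore: @allowscalar
a = AMDGPU.rand(Float64, 4)
# a[1]                  # would error: scalar indexing is disallowed
x = @allowscalar a[1]   # explicit opt-in, the same pattern used above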
22 changes: 22 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/linearalgebra.jl
@@ -0,0 +1,22 @@
using NDTensors.AMDGPUExtensions: roc
using NDTensors.Expose: Expose, Exposed, expose, ql, ql_positive
using NDTensors.GPUArraysCoreExtensions: cpu
using NDTensors.TypeParameterAccessors: unwrap_array_type
using LinearAlgebra: svd
using Adapt: adapt
using AMDGPU: ROCMatrix

function LinearAlgebra.svd(A::Exposed{<:ROCMatrix}; kwargs...)
U, S, V = svd(cpu(A))
return roc.((U, S, V))
end

## TODO: AMDGPU.jl does not currently provide `ql`, so compute it on the CPU for now.
function Expose.ql(A::Exposed{<:ROCMatrix})
Q, L = ql(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
function Expose.ql_positive(A::Exposed{<:ROCMatrix})
Q, L = ql_positive(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
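The svd method above is a host round trip: copy to the CPU, factorize with LAPACK, and adapt the factors back to the device. A usage sketch (not part of the diff; assumes a functional ROCm device):

# CPU-based SVD of a GPU matrix, with factors returned to the device.
using AMDGPU, LinearAlgebra
using NDTensors.AMDGPUExtensions: roc
using NDTensors.GPUArraysCoreExtensions: cpu
A = AMDGPU.rand(Float64, 8, 8)
U, S, V = svd(cpu(A))          # factorize on the host
Ud, Sd, Vd = roc.((U, S, V))   # adapt the factors back to the device
@assert Ud isa ROCMatrix{Float64}  # eltype preserved by the ROCArrayAdaptor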
45 changes: 45 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/mul.jl
@@ -0,0 +1,45 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!
using AMDGPU: ROCArray

# This was calling generic matrix multiplication.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray,<:LinearAlgebra.Transpose},
AM::Exposed{<:ROCArray},
BM::Exposed{<:ROCArray},
α,
β,
)
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray,<:LinearAlgebra.Adjoint},
AM::Exposed{<:ROCArray},
BM::Exposed{<:ROCArray},
α,
β,
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end

# Fix an issue in AMDGPU.jl where it does not recognize
# Transpose{Reshape{Adjoint{ROCArray}}} as a ROCArray and falls back to generic matmul.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray},
AM::Exposed{<:ROCArray},
BM::Exposed{
<:ROCArray,
<:LinearAlgebra.Transpose{
<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
},
},
α,
β,
)
mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
return unexpose(CM)
end
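A hedged sketch of the transposed-destination case handled by the first method above (assumes a functional ROCm device): writing into a transposed destination is re-associated as C = Bᵀ·Aᵀ so a rocBLAS kernel, not the generic fallback, does the work.

# mul! into a transposed ROCArray destination.
using AMDGPU, LinearAlgebra
using NDTensors.Expose: expose
A = AMDGPU.rand(Float64, 4, 4)
B = AMDGPU.rand(Float64, 4, 4)
C = AMDGPU.zeros(Float64, 4, 4)
mul!(expose(transpose(C)), expose(A), expose(B), true, false)
@assert transpose(C) ≈ A * B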
23 changes: 23 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/permutedims.jl
@@ -0,0 +1,23 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using AMDGPU: ROCArray

function Base.permutedims!(
Edest::Exposed{<:ROCArray,<:Base.ReshapedArray}, Esrc::Exposed{<:ROCArray}, perm
)
Aperm = permutedims(Esrc, perm)
copyto!(expose(parent(Edest)), expose(Aperm))
return unexpose(Edest)
end

# There is an issue in AMDGPU.jl where `.=` can fail when Edest is a
# reshaped `Adjoint`, so instead force Esrc into the shape of parent(Edest).
function Base.permutedims!(
Edest::Exposed{<:ROCArray,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}},
Esrc::Exposed{<:ROCArray},
perm,
f,
)
Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest)))
parent(Edest) .= f.(parent(Edest), Aperm)
return unexpose(Edest)
end
11 changes: 11 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/set_types.jl
@@ -0,0 +1,11 @@
# TypeParameterAccessors definitions
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
using AMDGPU: AMDGPU, ROCArray

function TypeParameterAccessors.default_type_parameters(::Type{<:ROCArray})
return (Float64, 1, AMDGPU.Mem.HIPBuffer)
end
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(eltype)) = Position(1)
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(ndims)) = Position(2)
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(storagemode)) = Position(3)
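These position definitions let generic NDTensors code read and rewrite ROCArray's type parameters by accessor function rather than by index. A sketch using the internal TypeParameterAccessors API shown in this PR:

# Query and rewrite ROCArray type parameters by name.
using AMDGPU
using NDTensors.TypeParameterAccessors: type_parameter, set_type_parameter
using NDTensors.GPUArraysCoreExtensions: storagemode
T = ROCArray{Float32,2,AMDGPU.Runtime.Mem.HIPBuffer}
@assert type_parameter(T, eltype) === Float32
@assert type_parameter(T, storagemode) === AMDGPU.Runtime.Mem.HIPBuffer
T64 = set_type_parameter(T, eltype, Float64)  # -> ROCArray{Float64,2,HIPBuffer}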
13 changes: 13 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl
@@ -41,3 +41,16 @@ function NDTensors.svd_catch_error(A::CuMatrix, ::CUDA.CUSOLVER.QRAlgorithm)
end
return USV
end

using NDTensors.GPUArraysCoreExtensions: cpu
using NDTensors.Expose: Expose, expose, ql, ql_positive
using NDTensors.TypeParameterAccessors: unwrap_array_type
## TODO: CUDA.jl does not currently provide `ql`, so compute it on the CPU for now.
function Expose.ql(A::Exposed{<:CuMatrix})
Q, L = ql(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
function Expose.ql_positive(A::Exposed{<:CuMatrix})
Q, L = ql_positive(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
2 changes: 1 addition & 1 deletion NDTensors/ext/NDTensorsCUDAExt/set_types.jl
@@ -1,7 +1,7 @@
# TypeParameterAccessors definitions
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
-## TODO remove TypeParameterAccessors when SetParameters is removed

function TypeParameterAccessors.position(::Type{<:CuArray}, ::typeof(eltype))
return Position(1)
end
65 changes: 65 additions & 0 deletions NDTensors/ext/examples/NDTensorAMDGPU.jl
@@ -0,0 +1,65 @@
## Code adapted from NDTensors/ext/examples/NDTensorCUDA.jl

using AMDGPU
using NDTensors
using ITensors
using ITensors: Index, ITensor, orthogonalize, qr, siteinds, svd
using Test: @test

function main()
# using ITensorGPU
cpu = NDTensors.cpu
gpu = NDTensors.roc
# Here is an example of how to utilize NDTensors based tensors with AMDGPU datatypes
i = Index(2)
j = Index(5)
k = Index(3)
l = Index(6)

dim1 = (i, j, l)
dim2 = (j, k)

# Create 2 ITensors with AMDGPU backends
A = ITensor(randomTensor(ROCArray, dim1))
B = ITensor(randomTensor(ROCArray, dim2))

# Contract the two tensors
C = A * B
A = cpu(A)
B = cpu(B)
@test cpu(C) ≈ A * B
@test eltype(C) == Float64

# Create 2 ITensors on CPU with different eltypes
A = ITensor(Float32, dim1)
B = ITensor(Float64, dim2)

fill!(A, randn())
fill!(B, randn())

# Convert the ITensors to GPU
cA = gpu(A)
cB = gpu(B)

# Check that backend of contraction is GPU
@test A * A ≈ cpu(cA * cA)
@test B * B ≈ cpu(cB * cB)
@test A * B ≈ cpu(cA * cB)
@test B * A ≈ cpu(cB * cA)

dim3 = (l, k)
dim4 = (i,)
cC = ITensor(randomTensor(ROCArray{Float64,AMDGPU.Runtime.Mem.HIPBuffer}, dim3))
cD = ITensor(Tensor(ROCArray{Float32}, dim4))
fill!(cD, randn())

# It's possible to compute the QR of a GPU tensor
cq = qr(cA, (i,), (j, l))
@test A ≈ cpu(cq[1]) * cpu(cq[2])

res = ITensors.svd(A, (i,), (j, l))
@show res
end

## Run the example
main()
1 change: 0 additions & 1 deletion NDTensors/src/adapt.jl
@@ -2,7 +2,6 @@ using .GPUArraysCoreExtensions: GPUArraysCoreExtensions
adapt_structure(to, x::TensorStorage) = setdata(x, adapt(to, data(x)))
adapt_structure(to, x::Tensor) = setstorage(x, adapt(to, storage(x)))

-## use unwrap cpu here because Expose is included before NDTensors
function GPUArraysCoreExtensions.cpu(eltype::Type{<:Number}, x)
return fmap(x -> adapt(Array{eltype}, x), x)
end
4 changes: 3 additions & 1 deletion NDTensors/src/imports.jl
@@ -29,6 +29,7 @@ for lib in [
:UnspecifiedTypes,
:TypeParameterAccessors,
:GPUArraysCoreExtensions,
+:AMDGPUExtensions,
:CUDAExtensions,
:MetalExtensions,
:Expose,
@@ -58,9 +59,10 @@ using Base.Cartesian: @nexprs

using Base.Threads: @spawn

+using .AMDGPUExtensions: roc
using .CUDAExtensions: cu
-using .MetalExtensions: mtl
using .GPUArraysCoreExtensions: cpu
+using .MetalExtensions: mtl

import Base:
# Types
2 changes: 2 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/.JuliaFormatter.toml
@@ -0,0 +1,2 @@
style = "blue"
indent = 2
4 changes: 4 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/src/AMDGPUExtensions.jl
@@ -0,0 +1,4 @@
module AMDGPUExtensions
include("roc.jl")

end
14 changes: 14 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/src/roc.jl
@@ -0,0 +1,14 @@
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
# Implemented in NDTensorsAMDGPUExt
function roc end

## Here we need a ROCArrayAdaptor to prevent conversion of 64 bit numbers to 32 bit.
## We cannot write `adapt(ROCVector, x)` because this
## will not allow us to properly utilize the buffer preference without changing the value of
## default_buffertype. Also `adapt(ROCVector{<:Any, <:Any, Buffertype})` fails to work properly.
struct ROCArrayAdaptor{B} end

function TypeParameterAccessors.position(::Type{<:ROCArrayAdaptor}, ::typeof(storagemode))
return Position(1)
end
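Per the comment above, the adaptor exists to preserve 64-bit element types. A hedged sketch (requires NDTensors and AMDGPU to both be loaded so the extension's adapt_storage method is active, plus a functional ROCm device):

# Adapt to the GPU via ROCArrayAdaptor; the element type is preserved.
using NDTensors, AMDGPU
using Adapt: adapt
using NDTensors.AMDGPUExtensions: ROCArrayAdaptor
x = randn(Float64, 3)
y = adapt(ROCArrayAdaptor{AMDGPU.Runtime.Mem.HIPBuffer}(), x)
@assert eltype(y) === Float64  # not demoted to Float32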
9 changes: 9 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/test/runtests.jl
@@ -0,0 +1,9 @@
@eval module $(gensym())
using Test: @testset, @test
using NDTensors.AMDGPUExtensions: roc, ROCArrayAdaptor
using NDTensors.GPUArraysCoreExtensions: storagemode
@testset "roc and ROCArrayAdaptor" begin
@test roc isa Function
@test storagemode(ROCArrayAdaptor{1}) == 1
end
end
4 changes: 2 additions & 2 deletions NDTensors/src/lib/CUDAExtensions/src/cuda.jl
@@ -1,9 +1,9 @@
using NDTensors.TypeParameterAccessors: TypeParameterAccessors
using NDTensors.GPUArraysCoreExtensions: storagemode
-# Implemented in `ITensorGPU` and NDTensorCUDA
+# Implemented in `ITensorGPU` and NDTensorsCUDAExt
function cu end

-## Here we need an NDTensorCuArrayAdaptor because the CuArrayAdaptor provided by CUDA
+## Here we need an CuArrayAdaptor because the CuArrayAdaptor provided by CUDA
## converts 64 bit numbers to 32 bit. We cannot write `adapt(CuVector, x)` because this
## Will not allow us to properly utilize the buffer preference without changing the value of
## default_buffertype. Also `adapt(CuVector{<:Any, <:Any, Buffertype})` fails to work properly