From a9a3c8b7cc9a92a9421f1f6d7c31f0a75d7b140f Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Sun, 14 Apr 2024 11:03:04 -0400
Subject: [PATCH] Try making the tests deterministic

---
 .buildkite/pipeline.yml              |  3 ++-
 .github/workflows/Downgrade.yml      |  2 +-
 Project.toml                         | 22 +++++++++++-----------
 ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl |  3 +--
 ext/LuxLibcuDNNExt/batchnorm.jl      |  3 +--
 test/api/batchnorm_tests.jl          | 10 +++++-----
 test/api/groupnorm_tests.jl          | 27 +++++++++++++++------------
 test/api/instancenorm_tests.jl       | 15 +++++++++------
 test/api/layernorm_tests.jl          |  6 +++---
 test/shared_testsetup.jl             |  8 +++++++-
 10 files changed, 55 insertions(+), 44 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index dfdd6637..c3bbdb8a 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -18,6 +18,7 @@ steps:
           cuda: "*"
         env:
           GROUP: "CUDA"
+          RETESTITEMS_NWORKERS: 0 # Distributed is causing stalling issues with CUDA
         if: build.message !~ /\[skip tests\]/
         timeout_in_minutes: 60
         matrix:
@@ -160,6 +161,6 @@ steps:
               - "Boltz"

 env:
-  RETESTITEMS_NWORKERS: 2
+  RETESTITEMS_NWORKERS: 4
   RETESTITEMS_NWORKER_THREADS: 2
   SECRET_CODECOV_TOKEN: "wMpDLaAVEHe6EJAc+LZBl4jF3wADVN6F+15vr/ONJHOv/XXbtYovuc1PCQwhz0AzZjWpSO12IDTyKfwVgYvqaGYfQ9yGyplJtSu2MiL2k44B/IY+wEZhsfkBIhXlG89si5A/I+/f8T8QuwxBqBLh8fYq7oxC+gNzKhbj8vIT4n5hCusvYYGufgKRC2U9P4ij0Sf40egQ5B+StaTykqJNq1163UARjNBypHIVDbYE0HUHiF7WB4eI5LxBBzlcHmsUkuGp6ZlqAu/8C83k65lwDnyHDfjvBM24q9GQTDFA5r7RUfYKHElQEBPk3GhoJn7XGIfD2pC0VNcw5jYCwsX2mw==;U2FsdGVkX1+euKMib66zno5Kkw7OxXo6v4RnkAA/HElJM46qfX17VgZ9iVLg45jOOWRgghmyYuy2WQ8RcVbuOg=="
diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml
index 04cbe75e..c89327b2 100644
--- a/.github/workflows/Downgrade.yml
+++ b/.github/workflows/Downgrade.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        version: ['1.9']
+        version: ['1.10']
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
diff --git a/Project.toml b/Project.toml
index 1181f429..925e361c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LuxLib"
 uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
 authors = ["Avik Pal and contributors"]
-version = "0.3.11"
+version = "0.3.12"

 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -32,33 +32,33 @@ LuxLibTrackercuDNNExt = ["CUDA", "Tracker", "cuDNN"]
 LuxLibcuDNNExt = ["CUDA", "cuDNN"]

 [compat]
-AMDGPU = "0.8"
-Aqua = "0.8"
+AMDGPU = "0.8.4"
+Aqua = "0.8.7"
 CUDA = "5.2"
 ChainRulesCore = "1.20"
 ComponentArrays = "0.15.8"
 ExplicitImports = "1.4.1"
 FastClosures = "0.3.2"
 ForwardDiff = "0.10.36"
-KernelAbstractions = "0.9.2"
+KernelAbstractions = "0.9.15"
 LuxAMDGPU = "0.2.1"
 LuxCUDA = "0.3.1"
 LuxCore = "0.1.13"
 LuxTestUtils = "0.1.15"
-Markdown = "1.9"
-NNlib = "0.9.9"
+Markdown = "1.10"
+NNlib = "0.9.10"
 PrecompileTools = "1.2"
-Random = "1.9"
+Random = "1.10"
 ReTestItems = "1"
 Reexport = "1"
 ReverseDiff = "1.15"
 StableRNGs = "1"
-Statistics = "1.9"
-Test = "1.9"
-Tracker = "0.2.26"
+Statistics = "1.10"
+Test = "1.10"
+Tracker = "0.2.31"
 Zygote = "0.6.69"
 cuDNN = "1.3"
-julia = "1.9"
+julia = "1.10"

 [extras]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
diff --git a/ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl b/ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl
index 3727b3b5..044929ea 100644
--- a/ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl
+++ b/ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl
@@ -19,11 +19,10 @@ const CUDNN_BN_ARRAY_TYPE = Union{
     CuArray{<:Union{Float32, Float64}, 5}}
 const BNParamType = Union{Nothing, CuVector{<:Union{Float32, Float64}}}

-function batchnorm(x::CUDNN_BN_ARRAY_TYPE, scale::BNParamType, bias::BNParamType,
+function LuxLib.batchnorm(x::CUDNN_BN_ARRAY_TYPE, scale::BNParamType, bias::BNParamType,
         running_mean::BNParamType, running_var::BNParamType; momentum::Real,
         training::Val, epsilon::Real)
     rm, rv = LuxLib._get_batchnorm_statistics(x, running_mean, running_var, training)
-
     x_ = first(LuxLib.batchnorm_cudnn(rm, rv, scale, bias, x, momentum, epsilon, training))
     return x_, (; running_mean=rm, running_var=rv)
 end
diff --git a/ext/LuxLibcuDNNExt/batchnorm.jl b/ext/LuxLibcuDNNExt/batchnorm.jl
index e3787220..aea36e21 100644
--- a/ext/LuxLibcuDNNExt/batchnorm.jl
+++ b/ext/LuxLibcuDNNExt/batchnorm.jl
@@ -1,8 +1,7 @@
-# NOTE: This can be upstreamed to LuxCUDA once we drop support for v1.6
 # Difference from the NNlib version: We expose the mean and inv_variance computed in the
 # cudnn call, since they can be used at other places like forward mode AD
 @inline function _wsize(x::AbstractArray{T, N}) where {T, N}
-    return ntuple(i -> ifelse(i == N - 1, size(x, N - 1), 1), N)
+    return ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N)
 end

 function LuxLib.batchnorm_cudnn(γ::Nothing, β::Nothing, x::DenseCuArray, args...; kwargs...)
diff --git a/test/api/batchnorm_tests.jl b/test/api/batchnorm_tests.jl
index 5453ff9f..d533746e 100644
--- a/test/api/batchnorm_tests.jl
+++ b/test/api/batchnorm_tests.jl
@@ -2,13 +2,13 @@
     rng = get_stable_rng(12345)

     function _setup_batchnorm(aType, T, sz; affine::Bool=true, track_stats::Bool)
-        x = randn(T, sz) |> aType
-        scale = affine ? aType(randn(T, sz[end - 1])) : nothing
-        bias = affine ? aType(randn(T, sz[end - 1])) : nothing
+        x = __generate_fixed_array(T, sz) |> aType
+        scale = affine ? aType(__generate_fixed_array(T, sz[end - 1])) : nothing
+        bias = affine ? aType(__generate_fixed_array(T, sz[end - 1])) : nothing

         if track_stats
-            running_mean = randn(T, sz[end - 1]) |> aType
-            running_var = abs2.(randn(T, sz[end - 1])) |> aType
+            running_mean = __generate_fixed_array(T, sz[end - 1]) |> aType
+            running_var = abs2.(__generate_fixed_array(T, sz[end - 1])) |> aType
             return x, scale, bias, running_mean, running_var
         else
             return x, scale, bias, nothing, nothing
diff --git a/test/api/groupnorm_tests.jl b/test/api/groupnorm_tests.jl
index 3f4e03f4..26284846 100644
--- a/test/api/groupnorm_tests.jl
+++ b/test/api/groupnorm_tests.jl
@@ -1,10 +1,16 @@
 @testsetup module GroupNormSetup
 using LuxLib

+@inline __generate_fixed_array(::Type{T}, sz...) where {T} = __generate_fixed_array(T, sz)
+@inline function __generate_fixed_array(::Type{T}, sz) where {T}
+    return reshape(T.(collect(1:prod(sz)) ./ prod(sz)), sz...)
+end
+@inline __generate_fixed_array(::Type{T}, sz::Int) where {T} = T.(collect(1:sz) ./ sz)
+
 function _setup_groupnorm(aType, T, sz, groups)
-    x = randn(T, sz) |> aType
-    scale = randn(T, sz[end - 1]) |> aType
-    bias = randn(T, sz[end - 1]) |> aType
+    x = __generate_fixed_array(T, sz) |> aType
+    scale = __generate_fixed_array(T, sz[end - 1]) |> aType
+    bias = __generate_fixed_array(T, sz[end - 1]) |> aType
     return x, scale, bias
 end

@@ -27,8 +33,6 @@
             sz in ((16, 16, 6, 4), (32, 32, 6, 4), (64, 64, 12, 4)),
             groups in (2, 3)

-            T === Float16 && mode == "AMDGPU" && continue
-
             _f = (args...) -> groupnorm(args...; groups, epsilon)

             epsilon = T(1e-5)
@@ -40,8 +44,7 @@

             @inferred groupnorm(x, scale, bias; groups, epsilon)

-            # @jet _f(x, scale, bias) # test_call throws exception
-            LuxTestUtils.JET.@test_opt target_modules=(LuxLib,) _f(x, scale, bias)
+            @jet _f(x, scale, bias)

             @test y isa aType{T, length(sz)}
             @test size(y) == sz
@@ -55,14 +58,14 @@

             # The KA implementation reorders operations manually for maximal
             # performance. Hence equality cannot be guaranteed.
-            @test check_approx(y, y_; atol=1.0f-3, rtol=1.0f-3)
-            @test check_approx(gs_x, gs_x_; atol=1.0f-3, rtol=1.0f-3)
-            @test check_approx(gs_scale, gs_scale_; atol=1.0f-3, rtol=1.0f-3)
-            @test check_approx(gs_bias, gs_bias_; atol=1.0f-3, rtol=1.0f-3)
+            @test check_approx(y, y_; atol=1.0f-1, rtol=1.0f-1)
+            @test check_approx(gs_x, gs_x_; atol=1.0f-1, rtol=1.0f-1)
+            @test check_approx(gs_scale, gs_scale_; atol=1.0f-1, rtol=1.0f-1)
+            @test check_approx(gs_bias, gs_bias_; atol=1.0f-1, rtol=1.0f-1)

             fp16 = T == Float16
             __f = (args...) -> sum(groupnorm(x, args...; groups, epsilon))
-            @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-3 rtol=1.0f-3 soft_fail=$fp16
+            @eval @test_gradients $__f $scale $bias gpu_testing=$on_gpu atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16
         end
     end
 end
diff --git a/test/api/instancenorm_tests.jl b/test/api/instancenorm_tests.jl
index b601e227..26a2dba0 100644
--- a/test/api/instancenorm_tests.jl
+++ b/test/api/instancenorm_tests.jl
@@ -4,9 +4,9 @@
     rng = get_stable_rng(12345)

     function _setup_instancenorm(aType, T, sz; affine::Bool=true)
-        x = randn(T, sz) |> aType
-        scale = affine ? aType(ones(T, sz[end - 1])) : nothing
-        bias = affine ? aType(zeros(T, sz[end - 1])) : nothing
+        x = __generate_fixed_array(T, sz) |> aType
+        scale = affine ? aType(__generate_fixed_array(T, sz[end - 1])) : nothing
+        bias = affine ? aType(__generate_fixed_array(T, sz[end - 1])) : nothing
         return x, scale, bias
     end

@@ -30,9 +30,12 @@
             @test y isa aType{T, length(sz)}
             @test size(y) == sz

-            _target_std = ones(ntuple(_ -> 1, length(sz) - 2)..., size(x)[(end - 1):end]...)
-            @eval @test check_approx(std(Array($y); dims=1:($(length(sz) - 2))),
-                $_target_std; atol=0.2, rtol=0.2)
+            if !affine
+                _target_std = ones(
+                    ntuple(_ -> 1, length(sz) - 2)..., size(x)[(end - 1):end]...)
+                @test check_approx(
+                    std(Array(y); dims=1:(length(sz) - 2)), _target_std; atol=0.2, rtol=0.2)
+            end
             @test std(y; dims=1:(length(sz) - 2)) != std(x; dims=1:(length(sz) - 2))

             if __istraining(training) && affine
diff --git a/test/api/layernorm_tests.jl b/test/api/layernorm_tests.jl
index 4cd2d9d4..8aa39671 100644
--- a/test/api/layernorm_tests.jl
+++ b/test/api/layernorm_tests.jl
@@ -2,10 +2,10 @@
     using Statistics

     function _setup_layernorm(aType, T, x_size, affine_shape)
-        x = randn(T, x_size) |> aType
+        x = __generate_fixed_array(T, x_size) |> aType
         if affine_shape !== nothing
-            scale = randn(T, affine_shape..., 1) |> aType
-            bias = randn(T, affine_shape..., 1) |> aType
+            scale = __generate_fixed_array(T, (affine_shape..., 1)) |> aType
+            bias = __generate_fixed_array(T, (affine_shape..., 1)) |> aType
             return x, scale, bias
         else
             return x, nothing, nothing
diff --git a/test/shared_testsetup.jl b/test/shared_testsetup.jl
index 886b20d6..acff5d77 100644
--- a/test/shared_testsetup.jl
+++ b/test/shared_testsetup.jl
@@ -28,6 +28,12 @@ get_stable_rng(seed=12345) = StableRNG(seed)

 __istraining(::Val{training}) where {training} = training

+@inline __generate_fixed_array(::Type{T}, sz...) where {T} = __generate_fixed_array(T, sz)
+@inline function __generate_fixed_array(::Type{T}, sz) where {T}
+    return reshape(T.(collect(1:prod(sz)) ./ prod(sz)), sz...)
+end
+@inline __generate_fixed_array(::Type{T}, sz::Int) where {T} = T.(collect(1:sz) ./ sz)
+
 export cpu_testing, cuda_testing, amdgpu_testing, MODES, get_stable_rng, __istraining,
-    check_approx, @jet, @test_gradients
+    check_approx, @jet, @test_gradients, __generate_fixed_array
 end
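
The common thread in the test changes above: every _setup_* helper that previously drew inputs from randn now calls __generate_fixed_array, which fills an n-element array with the fixed values 1/n, 2/n, ..., n/n, so repeated test runs see identical inputs. Below is a standalone sketch of that helper, copied from the test/shared_testsetup.jl hunk and evaluated on small example sizes; the sizes are illustrative and not taken from the test suite.

    # Deterministic stand-in for randn: n evenly spaced values in (0, 1],
    # reshaped to the requested size (copied from test/shared_testsetup.jl).
    @inline __generate_fixed_array(::Type{T}, sz...) where {T} = __generate_fixed_array(T, sz)
    @inline function __generate_fixed_array(::Type{T}, sz) where {T}
        return reshape(T.(collect(1:prod(sz)) ./ prod(sz)), sz...)
    end
    @inline __generate_fixed_array(::Type{T}, sz::Int) where {T} = T.(collect(1:sz) ./ sz)

    # Illustrative sizes only (not from the patch):
    x = __generate_fixed_array(Float32, (2, 3))  # 2x3 Matrix{Float32}; columns [1/6 2/6], [3/6 4/6], [5/6 6/6]
    bias = __generate_fixed_array(Float32, 4)    # Float32[0.25, 0.5, 0.75, 1.0]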