From cb9fdbf11f884b0501d1c23a48af258ab4acb57f Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin"
Date: Mon, 11 Nov 2024 11:07:33 -0800
Subject: [PATCH 1/3] [MPS] Lift MSL version to 3.0+ and use relevant helpers (#8719)

Summary:
1. Remove the custom atomic add function and use the one provided by MSL 3.0+ instead.
2. Use `MetalShaderLibrary` class.
---
 torchvision/csrc/ops/mps/mps_kernels.h | 87 +++----------------
 .../csrc/ops/mps/ps_roi_pool_kernel.mm |  1 -
 2 files changed, 14 insertions(+), 74 deletions(-)

diff --git a/torchvision/csrc/ops/mps/mps_kernels.h b/torchvision/csrc/ops/mps/mps_kernels.h
index e720a1608f1..f85546a6c41 100644
--- a/torchvision/csrc/ops/mps/mps_kernels.h
+++ b/torchvision/csrc/ops/mps/mps_kernels.h
@@ -5,7 +5,7 @@ namespace ops {
 
 namespace mps {
 
-static const char* METAL_VISION = R"VISION_METAL(
+static at::native::mps::MetalShaderLibrary lib(R"VISION_METAL(
 
 #include <metal_atomic>
 #include <metal_stdlib>
@@ -26,46 +26,15 @@ inline T ceil_div(T n, T m) {
   return (n + m - 1) / m;
 }
 
-template <typename T>
-inline void atomic_add_float( device T* data_ptr, const T val)
+inline void atomic_add_float(device float* data_ptr, const float val)
 {
-#if __METAL_VERSION__ >= 300
-  // atomic_float is supported in Metal 3 (macOS Ventura) onward.
-  device atomic_fetch_add_explicit((device atomic_float*) data_ptr, val, memory_order_relaxed);
-#else
-  // Custom atomic addition implementation
-  // https://github.com/ShoYamanishi/AppleNumericalComputing/blob/053f06c1f5a831095c4bcc29aaf11366fce5231e/03_dot/metal/dot.metal#L447-L472
-  // https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639
-  // https://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf (See the last slide)
-
-  // Create an atomic uint pointer for atomic transaction.
-  device atomic_uint* atom_var = (device atomic_uint*)data_ptr;
-  // Create necessary storage.
-  uint fetched_uint, assigning_uint;
-  T fetched_float, assigning_float;
-
-  // Replace the value in atom_var with 0 and return the previous value in atom_var.
-  fetched_uint = atomic_exchange_explicit( atom_var, 0 /*desired*/, memory_order_relaxed);
-  // Read out the previous value as float.
-  fetched_float = *( (thread T*) &fetched_uint );
-
-  // Do addition and represent the addition result in uint for atomic transaction.
-  assigning_float = fetched_float + val;
-  assigning_uint = *((thread uint*) &assigning_float);
-
-  // atom_var should be 0 now, try to assign the addition result back to the atom_var (data_ptr).
-  while ((fetched_uint = atomic_exchange_explicit( atom_var, assigning_uint /*desired*/, memory_order_relaxed)) != 0) {
-    // If atom_var was not 0, i.e. fetched_uint != 0, it means that the data has been modified by other threads.
-    // Try to assign 0 and get the previously assigned addition result.
-    uint fetched_uint_again = atomic_exchange_explicit(atom_var, 0 /*desired*/, memory_order_relaxed);
-    T fetched_float_again = *( (thread T*) &fetched_uint_again );
-    // Re-add again
-    fetched_float = *((thread T*) &(fetched_uint));
-    // Previously assigned addition result + addition result from other threads.
-    assigning_float = fetched_float_again + fetched_float;
-    assigning_uint = *( (thread uint*) &assigning_float);
-  }
-#endif
+  atomic_fetch_add_explicit((device atomic_float*) data_ptr, val, memory_order_relaxed);
+}
+
+
+inline void atomic_add_float(device half* data_ptr, const half val)
+{
+  atomic_fetch_add_explicit((device atomic_float*) data_ptr, static_cast<float>(val), memory_order_relaxed);
 }
 
 template <typename T, typename integer_t>
@@ -1061,40 +1030,12 @@ REGISTER_PS_ROI_POOL_OP(half, int64_t);
 REGISTER_PS_ROI_POOL_BACKWARD_OP(float, int64_t);
 REGISTER_PS_ROI_POOL_BACKWARD_OP(half, int64_t);
 
-)VISION_METAL";
-
-static id<MTLLibrary> compileVisionOpsLibrary(id<MTLDevice> device) {
-  static id<MTLLibrary> visionLibrary = nil;
-  if (visionLibrary) {
-    return visionLibrary;
-  }
-
-  NSError* error = nil;
-  MTLCompileOptions* options = [[MTLCompileOptions new] autorelease];
-  [options setLanguageVersion:MTLLanguageVersion2_3];
-  visionLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_VISION encoding:NSASCIIStringEncoding]
-                                       options:options
-                                         error:&error];
-  TORCH_CHECK(visionLibrary, "Failed to create metal vision library, error: ", [[error description] UTF8String]);
-  return visionLibrary;
-}
-
-static id<MTLComputePipelineState> visionPipelineState(id<MTLDevice> device, const std::string& kernel) {
-  static std::unordered_map<std::string, id<MTLComputePipelineState>> psoCache;
-  id<MTLComputePipelineState> pso = psoCache[kernel];
-  if (pso) {
-    return pso;
-  }
-
-  NSError* error = nil;
-  id<MTLLibrary> visionLib = compileVisionOpsLibrary(device);
-  id<MTLFunction> visionFunc = [visionLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
-  TORCH_CHECK(visionFunc, "Failed to create function state object for: ", kernel);
-  pso = [device newComputePipelineStateWithFunction:visionFunc error:&error];
-  TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+)VISION_METAL");
 
-  psoCache[kernel] = pso;
-  return pso;
+static id<MTLComputePipelineState> visionPipelineState(
+    id<MTLDevice> device,
+    const std::string& kernel) {
+  return lib.getPipelineStateForFunc(kernel);
 }
 
 } // namespace mps
diff --git a/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm b/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm
index fc24f6990fa..75d0ff4845f 100644
--- a/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm
+++ b/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm
@@ -123,7 +123,6 @@
   float spatial_scale_f = static_cast<float>(spatial_scale);
 
-  auto num_rois = rois.size(0);
   auto grad_input =
       at::zeros({batch_size, channels, height, width}, grad.options());
 
   if (grad.numel() == 0) {

From 7d077f131217dc03813d97d7524ea3aeba7dd7e1 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 12 Nov 2024 13:15:11 +0000
Subject: [PATCH 2/3] Revert "Fix memory leak in decode_webp (#8712)" (#8723)

---
 torchvision/csrc/io/image/cpu/decode_webp.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/torchvision/csrc/io/image/cpu/decode_webp.cpp b/torchvision/csrc/io/image/cpu/decode_webp.cpp
index 0a9ff9ddbce..b202473c039 100644
--- a/torchvision/csrc/io/image/cpu/decode_webp.cpp
+++ b/torchvision/csrc/io/image/cpu/decode_webp.cpp
@@ -44,12 +44,10 @@ torch::Tensor decode_webp(
   auto decoded_data =
       decoding_func(encoded_data_p, encoded_data_size, &width, &height);
 
-  TORCH_CHECK(decoded_data != nullptr, "WebPDecodeRGB[A] failed.");
-
-  auto deleter = [decoded_data](void*) { WebPFree(decoded_data); };
   auto out = torch::from_blob(
-      decoded_data, {height, width, num_channels}, deleter, torch::kUInt8);
+      decoded_data, {height, width, num_channels}, torch::kUInt8);
 
   return out.permute({2, 0, 1});
 }

From 518ee93dbd1469524040e1607a345fff90fa7fcd Mon Sep 17 00:00:00 2001
From: Andrey Talman
Date: Thu, 14 Nov 2024 01:59:45 +0000
Subject: [PATCH 3/3] Migrate towards linux_job_v2.yml (#8725)

---
 .github/scripts/cmake.sh                        | 4 ++++
 .github/workflows/build-cmake.yml               | 3 +--
 .github/workflows/docs.yml                      | 6 +++---
 .github/workflows/lint.yml                      | 6 +++---
 .github/workflows/prototype-tests-linux-gpu.yml | 4 ++--
 .github/workflows/tests.yml                     | 6 +++---
 6 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh
index bc49c80a309..4217a9d24be 100755
--- a/.github/scripts/cmake.sh
+++ b/.github/scripts/cmake.sh
@@ -30,6 +30,10 @@ else
   JOBS=$(nproc)
 fi
 
+if [[ $OS_TYPE == linux ]]; then
+  export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
+fi
+
 TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))")
 if [[ $OS_TYPE == windows ]]; then
   PACKAGING_DIR="${PWD}/packaging"
diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml
index 561b419297f..9cee3bfc26d 100644
--- a/.github/workflows/build-cmake.yml
+++ b/.github/workflows/build-cmake.yml
@@ -20,7 +20,7 @@ jobs:
             gpu-arch-type: cuda
             gpu-arch-version: "11.8"
       fail-fast: false
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       runner: ${{ matrix.runner }}
@@ -33,7 +33,6 @@ jobs:
       export PYTHON_VERSION=3.9
       export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }}
       export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }}
-
       ./.github/scripts/cmake.sh
 
   macos:
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 4899b81c956..f6ec4201da3 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -14,7 +14,7 @@ on:
 
 jobs:
   build:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
      upload-artifact: docs
@@ -77,11 +77,11 @@ jobs:
 
   upload:
     needs: build
-    if: github.repository == 'pytorch/vision' && github.event_name == 'push' &&
+    if: github.repository == 'pytorch/vision' && github.event_name == 'push' &&
        ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag')
     permissions:
       contents: write
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       download-artifact: docs
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index eac1c009eec..7e9943668cd 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -11,7 +11,7 @@ on:
 
 jobs:
   python-source-and-configs:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
@@ -38,7 +38,7 @@ jobs:
        fi
 
   c-source:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
@@ -65,7 +65,7 @@
 
 
   python-types:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
diff --git a/.github/workflows/prototype-tests-linux-gpu.yml b/.github/workflows/prototype-tests-linux-gpu.yml
index 7fbe77ca146..e1d6498761b 100644
--- a/.github/workflows/prototype-tests-linux-gpu.yml
+++ b/.github/workflows/prototype-tests-linux-gpu.yml
@@ -23,7 +23,7 @@ jobs:
             gpu-arch-type: cuda
             gpu-arch-version: "11.8"
       fail-fast: false
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       runner: ${{ matrix.runner }}
@@ -37,7 +37,7 @@ jobs:
       export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }}
       export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }}
       ./.github/scripts/setup-env.sh
-
+
       # Prepare conda
       CONDA_PATH=$(which conda)
       eval "$(${CONDA_PATH} shell.bash hook)"
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6f03b0a59eb..b4a74733967 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -26,7 +26,7 @@ jobs:
             gpu-arch-type: cuda
             gpu-arch-version: "11.8"
       fail-fast: false
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       runner: ${{ matrix.runner }}
@@ -104,7 +104,7 @@ jobs:
       ./.github/scripts/unittest.sh
 
   onnx:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
@@ -135,7 +135,7 @@ jobs:
       echo '::endgroup::'
 
   unittests-extended:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     if: contains(github.event.pull_request.labels.*.name, 'run-extended')
     with:
       repository: pytorch/vision