From cb9fdbf11f884b0501d1c23a48af258ab4acb57f Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin"
Date: Mon, 11 Nov 2024 11:07:33 -0800
Subject: [PATCH 1/3] [MPS] Lift MSL version to 3.0+ and use relevant helpers (#8719)

Summary:
1. Remove the custom atomic add function and use the one provided by MSL 3.0+ instead.
2. Use `MetalShaderLibrary` class.
---
 torchvision/csrc/ops/mps/mps_kernels.h | 87 +++----------------
 .../csrc/ops/mps/ps_roi_pool_kernel.mm |  1 -
 2 files changed, 14 insertions(+), 74 deletions(-)

diff --git a/torchvision/csrc/ops/mps/mps_kernels.h b/torchvision/csrc/ops/mps/mps_kernels.h
index e720a1608f1..f85546a6c41 100644
--- a/torchvision/csrc/ops/mps/mps_kernels.h
+++ b/torchvision/csrc/ops/mps/mps_kernels.h
@@ -5,7 +5,7 @@ namespace ops {
 
 namespace mps {
 
-static const char* METAL_VISION = R"VISION_METAL(
+static at::native::mps::MetalShaderLibrary lib(R"VISION_METAL(
 
 #include <metal_atomic>
 #include <metal_stdlib>
@@ -26,46 +26,15 @@ inline T ceil_div(T n, T m) {
   return (n + m - 1) / m;
 }
 
-template <typename T>
-inline void atomic_add_float( device T* data_ptr, const T val)
+inline void atomic_add_float(device float* data_ptr, const float val)
 {
-#if __METAL_VERSION__ >= 300
-  // atomic_float is supported in Metal 3 (macOS Ventura) onward.
-  device atomic_fetch_add_explicit((device atomic_float*) data_ptr, val, memory_order_relaxed);
-#else
-  // Custom atomic addition implementation
-  // https://github.com/ShoYamanishi/AppleNumericalComputing/blob/053f06c1f5a831095c4bcc29aaf11366fce5231e/03_dot/metal/dot.metal#L447-L472
-  // https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639
-  // https://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf (See the last slide)
-
-  // Create an atomic uint pointer for atomic transaction.
-  device atomic_uint* atom_var = (device atomic_uint*)data_ptr;
-  // Create necessary storage.
-  uint fetched_uint, assigning_uint;
-  T fetched_float, assigning_float;
-
-  // Replace the value in atom_var with 0 and return the previous value in atom_var.
-  fetched_uint = atomic_exchange_explicit( atom_var, 0 /*desired*/, memory_order_relaxed);
-  // Read out the previous value as float.
-  fetched_float = *( (thread T*) &fetched_uint );
-
-  // Do addition and represent the addition result in uint for atomic transaction.
-  assigning_float = fetched_float + val;
-  assigning_uint = *((thread uint*) &assigning_float);
-
-  // atom_var should be 0 now, try to assign the addition result back to the atom_var (data_ptr).
-  while ((fetched_uint = atomic_exchange_explicit( atom_var, assigning_uint /*desired*/, memory_order_relaxed)) != 0) {
-    // If atom_var was not 0, i.e. fetched_uint != 0, it means that the data has been modified by other threads.
-    // Try to assign 0 and get the previously assigned addition result.
-    uint fetched_uint_again = atomic_exchange_explicit(atom_var, 0 /*desired*/, memory_order_relaxed);
-    T fetched_float_again = *( (thread T*) &fetched_uint_again );
-    // Re-add again
-    fetched_float = *((thread T*) &(fetched_uint));
-    // Previously assigned addition result + addition result from other threads.
-    assigning_float = fetched_float_again + fetched_float;
-    assigning_uint = *( (thread uint*) &assigning_float);
-  }
-#endif
+  atomic_fetch_add_explicit((device atomic_float*) data_ptr, val, memory_order_relaxed);
+}
+
+
+inline void atomic_add_float(device half* data_ptr, const half val)
+{
+  atomic_fetch_add_explicit((device atomic_float*) data_ptr, static_cast<float>(val), memory_order_relaxed);
 }
 
 template <typename T, typename integer_t>
@@ -1061,40 +1030,12 @@ REGISTER_PS_ROI_POOL_OP(half, int64_t);
 REGISTER_PS_ROI_POOL_BACKWARD_OP(float, int64_t);
 REGISTER_PS_ROI_POOL_BACKWARD_OP(half, int64_t);
 
-)VISION_METAL";
-
-static id<MTLLibrary> compileVisionOpsLibrary(id<MTLDevice> device) {
-  static id<MTLLibrary> visionLibrary = nil;
-  if (visionLibrary) {
-    return visionLibrary;
-  }
-
-  NSError* error = nil;
-  MTLCompileOptions* options = [[MTLCompileOptions new] autorelease];
-  [options setLanguageVersion:MTLLanguageVersion2_3];
-  visionLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_VISION encoding:NSASCIIStringEncoding]
-                                       options:options
-                                         error:&error];
-  TORCH_CHECK(visionLibrary, "Failed to create metal vision library, error: ", [[error description] UTF8String]);
-  return visionLibrary;
-}
-
-static id<MTLComputePipelineState> visionPipelineState(id<MTLDevice> device, const std::string& kernel) {
-  static std::unordered_map<std::string, id<MTLComputePipelineState>> psoCache;
-  id<MTLComputePipelineState> pso = psoCache[kernel];
-  if (pso) {
-    return pso;
-  }
-
-  NSError* error = nil;
-  id<MTLLibrary> visionLib = compileVisionOpsLibrary(device);
-  id<MTLFunction> visionFunc = [visionLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
-  TORCH_CHECK(visionFunc, "Failed to create function state object for: ", kernel);
-  pso = [device newComputePipelineStateWithFunction:visionFunc error:&error];
-  TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+)VISION_METAL");
 
-  psoCache[kernel] = pso;
-  return pso;
+static id<MTLComputePipelineState> visionPipelineState(
+    id<MTLDevice> device,
+    const std::string& kernel) {
+  return lib.getPipelineStateForFunc(kernel);
 }
 
 } // namespace mps
diff --git a/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm b/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm
index fc24f6990fa..75d0ff4845f 100644
--- a/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm
+++ b/torchvision/csrc/ops/mps/ps_roi_pool_kernel.mm
@@ -123,7 +123,6 @@
   float spatial_scale_f = static_cast<float>(spatial_scale);
 
-  auto num_rois = rois.size(0);
   auto grad_input =
       at::zeros({batch_size, channels, height, width}, grad.options());
 
   if (grad.numel() == 0) {

From 7d077f131217dc03813d97d7524ea3aeba7dd7e1 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 12 Nov 2024 13:15:11 +0000
Subject: [PATCH 2/3] Revert "Fix memory leak in decode_webp (#8712)" (#8723)

---
 torchvision/csrc/io/image/cpu/decode_webp.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/torchvision/csrc/io/image/cpu/decode_webp.cpp b/torchvision/csrc/io/image/cpu/decode_webp.cpp
index 0a9ff9ddbce..b202473c039 100644
--- a/torchvision/csrc/io/image/cpu/decode_webp.cpp
+++ b/torchvision/csrc/io/image/cpu/decode_webp.cpp
@@ -44,12 +44,10 @@ torch::Tensor decode_webp(
   auto decoded_data =
       decoding_func(encoded_data_p, encoded_data_size, &width, &height);
 
-  TORCH_CHECK(decoded_data != nullptr, "WebPDecodeRGB[A] failed.");
-
-  auto deleter = [decoded_data](void*) { WebPFree(decoded_data); };
   auto out = torch::from_blob(
-      decoded_data, {height, width, num_channels}, deleter, torch::kUInt8);
+      decoded_data, {height, width, num_channels}, torch::kUInt8);
 
   return out.permute({2, 0, 1});
 }

From 518ee93dbd1469524040e1607a345fff90fa7fcd Mon Sep 17 00:00:00 2001
From: Andrey Talman
Date: Thu, 14 Nov 2024 01:59:45 +0000
Subject: [PATCH 3/3] Migrate towards linux_job_v2.yml (#8725)

---
 .github/scripts/cmake.sh                        | 4 ++++
 .github/workflows/build-cmake.yml               | 3 +--
 .github/workflows/docs.yml                      | 6 +++---
 .github/workflows/lint.yml                      | 6 +++---
 .github/workflows/prototype-tests-linux-gpu.yml | 4 ++--
 .github/workflows/tests.yml                     | 6 +++---
 6 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh
index bc49c80a309..4217a9d24be 100755
--- a/.github/scripts/cmake.sh
+++ b/.github/scripts/cmake.sh
@@ -30,6 +30,10 @@ else
   JOBS=$(nproc)
 fi
 
+if [[ $OS_TYPE == linux ]]; then
+  export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
+fi
+
 TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))")
 if [[ $OS_TYPE == windows ]]; then
   PACKAGING_DIR="${PWD}/packaging"
diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml
index 561b419297f..9cee3bfc26d 100644
--- a/.github/workflows/build-cmake.yml
+++ b/.github/workflows/build-cmake.yml
@@ -20,7 +20,7 @@ jobs:
             gpu-arch-type: cuda
             gpu-arch-version: "11.8"
       fail-fast: false
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       runner: ${{ matrix.runner }}
@@ -33,7 +33,6 @@ jobs:
       export PYTHON_VERSION=3.9
       export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }}
       export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }}
-
       ./.github/scripts/cmake.sh
 
   macos:
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 4899b81c956..f6ec4201da3 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -14,7 +14,7 @@ on:
 
 jobs:
   build:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
      upload-artifact: docs
@@ -77,11 +77,11 @@ jobs:
 
   upload:
     needs: build
-    if: github.repository == 'pytorch/vision' && github.event_name == 'push' &&
+    if: github.repository == 'pytorch/vision' && github.event_name == 'push' &&
        ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag')
     permissions:
       contents: write
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       download-artifact: docs
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index eac1c009eec..7e9943668cd 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -11,7 +11,7 @@ on:
 
 jobs:
   python-source-and-configs:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
@@ -38,7 +38,7 @@ jobs:
        fi
 
   c-source:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
@@ -65,7 +65,7 @@
 
 
   python-types:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
diff --git a/.github/workflows/prototype-tests-linux-gpu.yml b/.github/workflows/prototype-tests-linux-gpu.yml
index 7fbe77ca146..e1d6498761b 100644
--- a/.github/workflows/prototype-tests-linux-gpu.yml
+++ b/.github/workflows/prototype-tests-linux-gpu.yml
@@ -23,7 +23,7 @@ jobs:
             gpu-arch-type: cuda
             gpu-arch-version: "11.8"
       fail-fast: false
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       runner: ${{ matrix.runner }}
@@ -37,7 +37,7 @@ jobs:
       export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }}
       export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }}
       ./.github/scripts/setup-env.sh
-
+
       # Prepare conda
       CONDA_PATH=$(which conda)
       eval "$(${CONDA_PATH} shell.bash hook)"
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6f03b0a59eb..b4a74733967 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -26,7 +26,7 @@ jobs:
             gpu-arch-type: cuda
             gpu-arch-version: "11.8"
       fail-fast: false
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       runner: ${{ matrix.runner }}
@@ -104,7 +104,7 @@ jobs:
       ./.github/scripts/unittest.sh
 
   onnx:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/vision
       test-infra-ref: main
@@ -135,7 +135,7 @@ jobs:
       echo '::endgroup::'
 
   unittests-extended:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     if: contains(github.event.pull_request.labels.*.name, 'run-extended')
     with:
       repository: pytorch/vision