From 6600003c2314af88befcec2cd6662957a662981d Mon Sep 17 00:00:00 2001
From: Luca Wehrstedt <lcw@fb.com>
Date: Fri, 5 Jan 2024 10:10:21 +0000
Subject: [PATCH] Don't install Triton nightly separately in CI

Instead, count on PyTorch nightlies pulling in a recent enough (and well-tested!) version of Triton.

We also disable Triton in some components on GPUs older than SM80 (e.g. V100), because we started seeing failures with recent Triton versions. This is what those failures looked like (the first 20 of them): P967812823.

ghstack-source-id: 612d5ea3406be3729f4be98b0a583759b3048ae3
Pull Request resolved: https://github.com/fairinternal/xformers/pull/985

__original_commit__ = fairinternal/xformers@d654838f63104a278af8c2472ead0708453b7413
---
 requirements-test.txt                        |  2 -
 tests/test_core_attention.py                 |  7 +---
 tests/test_triton_blocksparse.py             |  9 +----
 xformers/components/attention/blocksparse.py |  6 +--
 xformers/components/attention/core.py        |  4 +-
 xformers/triton/utils.py                     | 41 +++++++++-----------
 6 files changed, 28 insertions(+), 41 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 1588aa5f77..0f460733f7 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -29,5 +29,3 @@ scipy
 
 # Dependency for fused layers, optional
 cmake
---extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/
-triton-nightly<=2.1.0.post20231125000000
diff --git a/tests/test_core_attention.py b/tests/test_core_attention.py
index 0beace4427..bdd3a788f2 100644
--- a/tests/test_core_attention.py
+++ b/tests/test_core_attention.py
@@ -15,10 +15,10 @@
 from xformers.components.attention.core import scaled_dot_product_attention
 
 if _is_triton_available():
-    from xformers.triton.utils import gpu_capabilities_older_than_70
+    from xformers.triton.utils import gpu_capabilities_older_than_80
 
 _is_blocksparse_available = (
-    _is_triton_available() and not gpu_capabilities_older_than_70()
+    _is_triton_available() and not gpu_capabilities_older_than_80()
 )
 
 
@@ -166,9 +166,6 @@ def test_switch_blocksparse(device, data_type):
     # Mask with causal flag
     m_att_mask = AttentionMask.make_causal(s, s, device, dtype=a.dtype)
 
-    def kernel():
-        return scaled_dot_product_attention(a, a, a, m_att_mask)
-
     # Check that a switch to blocksparse is only triggered by causal flag
     with torch.cuda.amp.autocast():
         r_custom = scaled_dot_product_attention(a, a, a, m_custom)
diff --git a/tests/test_triton_blocksparse.py b/tests/test_triton_blocksparse.py
index e8e4a4dbea..e67b042561 100644
--- a/tests/test_triton_blocksparse.py
+++ b/tests/test_triton_blocksparse.py
@@ -12,7 +12,6 @@
 from xformers.components import MultiHeadDispatch
 from xformers.components.attention import build_attention
 from xformers.components.attention.attention_patterns import block_sparsify_tensor
-from xformers.triton.utils import get_current_cuda_device
 
 
 def catch_oor(fn):
@@ -45,9 +44,9 @@ def fn_and_catch_oor(*args, **kwargs):
         from triton.ops.blocksparse import softmax as blocksparse_softmax
 
         from xformers.components.attention import BlockSparseAttention
-        from xformers.triton.utils import gpu_capabilities_older_than_70
+        from xformers.triton.utils import gpu_capabilities_older_than_80
 
-        _triton_available = not gpu_capabilities_older_than_70()
+        _triton_available = not gpu_capabilities_older_than_80()
         _matmul_types = ["sdd", "dsd", "dds"]
     except (ImportError, ModuleNotFoundError) as e:
         import logging
@@ -64,10 +63,6 @@ def mask_tensor(x, mask, block, value=0):
 
 
 @pytest.mark.skipif(not _triton_available, reason="Triton requires a recent CUDA gpu")
-@pytest.mark.skipif(
-    not _triton_available or get_current_cuda_device() == "T4",
-    reason="FIXME - blocksparse matmuls are slightly off on T4s",
-)
 @pytest.mark.parametrize("MODE", _matmul_types)
 @pytest.mark.parametrize("TRANS_A", [False, True])
 @pytest.mark.parametrize("TRANS_B", [False, True])
diff --git a/xformers/components/attention/blocksparse.py b/xformers/components/attention/blocksparse.py
index f09962817d..89bdfbffbc 100644
--- a/xformers/components/attention/blocksparse.py
+++ b/xformers/components/attention/blocksparse.py
@@ -23,10 +23,10 @@
     from triton.ops.blocksparse import matmul as blocksparse_matmul  # type: ignore
     from triton.ops.blocksparse import softmax as blocksparse_softmax  # type: ignore
 
-    from xformers.triton.utils import gpu_capabilities_older_than_70
+    from xformers.triton.utils import gpu_capabilities_older_than_80
 
-    # Blocksparse requires Tensor cores
-    if gpu_capabilities_older_than_70():
+    # Blocksparse requires Tensor cores, but we also disable it on V100 because of Triton issues
+    if gpu_capabilities_older_than_80():
         logger.warning(
             "Blocksparse is not available: the current GPU does not expose Tensor cores"
         )
diff --git a/xformers/components/attention/core.py b/xformers/components/attention/core.py
index 0ab54cbdd6..22f7a576c4 100644
--- a/xformers/components/attention/core.py
+++ b/xformers/components/attention/core.py
@@ -20,10 +20,10 @@
 
 if _is_triton_available():
     from xformers.triton.softmax import softmax as triton_softmax
-    from xformers.triton.utils import gpu_capabilities_older_than_70
+    from xformers.triton.utils import gpu_capabilities_older_than_80
 
 _is_blocksparse_available = (
-    _is_triton_available() and not gpu_capabilities_older_than_70()
+    _is_triton_available() and not gpu_capabilities_older_than_80()
 )
 
 if _is_blocksparse_available:
diff --git a/xformers/triton/utils.py b/xformers/triton/utils.py
index 7c2fd70053..ab71b5824a 100644
--- a/xformers/triton/utils.py
+++ b/xformers/triton/utils.py
@@ -4,37 +4,34 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from typing import Optional
+from typing import Optional, Tuple
 
 import torch
 
 logger = logging.getLogger("xformers")
 
 
-_gpu_is_old: Optional[bool] = None
+_oldest_gpu: Optional[Tuple[int, int]] = None
 
 
-def gpu_capabilities_older_than_70() -> bool:
-    """Return True if the GPU's compute capability is older than SM70."""
-    global _gpu_is_old
-    if _gpu_is_old is None:
-        for i in range(torch.cuda.device_count()):
-            major, _ = torch.cuda.get_device_capability(f"cuda:{i}")
-            if major < 7:
-                _gpu_is_old = True
-        if _gpu_is_old is None:
-            _gpu_is_old = False
-    return _gpu_is_old
-
+def _get_oldest_gpu() -> Tuple[int, int]:
+    global _oldest_gpu
+    if _oldest_gpu is None:
+        _oldest_gpu = min(
+            (
+                torch.cuda.get_device_capability(f"cuda:{i}")
+                for i in range(torch.cuda.device_count())
+            ),
+            default=(0, 0),
+        )
+    return _oldest_gpu
 
-SUPPORTED_CUDA_DEVICES = ["V100", "A100", "T4"]
 
+def gpu_capabilities_older_than_70() -> bool:
+    """Return True if the GPU's compute capability is older than SM70."""
+    return _get_oldest_gpu() < (7, 0)
 
-def get_current_cuda_device():
-    current_device = str(torch.cuda.get_device_properties(torch.cuda.current_device()))
-    for device_str in SUPPORTED_CUDA_DEVICES:
-        if current_device.find(device_str) > 0:
-            return device_str
 
-    logger.warning("Unsupported device, Triton code generation may fail")
-    return "P100"  # default to an old GPU
+def gpu_capabilities_older_than_80() -> bool:
+    """Return True if the GPU's compute capability is older than SM80."""
+    return _get_oldest_gpu() < (8, 0)