INT4 XPU enabling #1577

Draft · wants to merge 1 commit into main
15 changes: 14 additions & 1 deletion test/dtypes/test_affine_quantized.py
@@ -8,7 +8,7 @@
run_tests,
)

-from torchao.dtypes import CutlassInt4PackedLayout, Int4CPULayout, SemiSparseLayout
+from torchao.dtypes import CutlassInt4PackedLayout, Int4XPULayout, Int4CPULayout, SemiSparseLayout
from torchao.quantization import (
float8_weight_only,
int4_weight_only,
@@ -20,6 +20,7 @@
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_5,
TORCH_VERSION_AT_LEAST_2_6,
TORCH_VERSION_AT_LEAST_2_7,
is_sm_at_least_89,
)

@@ -46,6 +47,18 @@ def get_quantization_functions(
zero_point_domain=ZeroPointDomain.INT,
)
)
elif device == "xpu" and TORCH_VERSION_AT_LEAST_2_6:
Contributor: 2_7 or 2_6?

Contributor (author): This depends on pytorch/pytorch#137566. It's really just a draft, so it's not ready for review yet. I will ping you when it's ready :)

base_functions.append(
int4_weight_only(group_size=32, layout=Int4XPULayout())
)
if int4_zp_int:
base_functions.append(
int4_weight_only(
group_size=32,
layout=Int4XPULayout(),
zero_point_domain=ZeroPointDomain.INT,
)
)
else:
base_functions.append(int4_weight_only(group_size=32))
if device == "cuda":
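For context, this is the configuration the new "xpu" branch exercises. A minimal usage sketch, assuming an XPU-enabled PyTorch build (2.6 or 2.7, pending the version question above) and the Int4XPULayout added by this PR; the toy model and shapes are illustrative only:

import torch

from torchao.dtypes import Int4XPULayout
from torchao.quantization import int4_weight_only, quantize_

# A toy bfloat16 model placed on the XPU device.
model = torch.nn.Sequential(torch.nn.Linear(256, 256, dtype=torch.bfloat16)).to("xpu")

# Weight-only INT4 quantization with group size 32 and the XPU packing
# layout, mirroring the arguments added to get_quantization_functions.
quantize_(model, int4_weight_only(group_size=32, layout=Int4XPULayout()))

x = torch.randn(1, 256, dtype=torch.bfloat16, device="xpu")
y = model(x)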
9 changes: 8 additions & 1 deletion test/integration/test_integration.py
@@ -18,7 +18,7 @@
from torch._inductor.utils import run_and_get_code

import torchao
-from torchao.dtypes import Int4CPULayout, TensorCoreTiledLayout
+from torchao.dtypes import Int4CPULayout, Int4XPULayout, TensorCoreTiledLayout
from torchao.dtypes.utils import is_device
from torchao.quantization import safe_int_mm
from torchao.quantization.autoquant import (
@@ -139,6 +139,11 @@ def _int4wo_api(mod):
mod, int4_weight_only(layout=Int4CPULayout()), set_inductor_config=False
)
unwrap_tensor_subclass(mod)
elif is_device(next(mod.parameters()).device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_7:
quantize_(
mod, int4_weight_only(layout=Int4XPULayout()), set_inductor_config=False
)
unwrap_tensor_subclass(mod)
elif TORCH_VERSION_AT_LEAST_2_4:
quantize_(mod, int4_weight_only(), set_inductor_config=False)
if not TORCH_VERSION_AT_LEAST_2_5:
@@ -1079,6 +1084,8 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
layout_list = []
if device == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
layout_list.append(Int4CPULayout())
elif device == "xpu" and TORCH_VERSION_AT_LEAST_2_6:
Contributor: here as well, 2_6 or 2_7?

layout_list.append(Int4XPULayout())
else:
for inner_k_tiles in [4, 2]:
layout_list.append(TensorCoreTiledLayout(inner_k_tiles=inner_k_tiles))
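Restated outside the test for clarity, the per-device selection above amounts to the following helper. This is an illustrative sketch, not code from the PR; the function name is made up, and only the branch logic comes from the diff:

from torchao.dtypes import Int4CPULayout, Int4XPULayout, TensorCoreTiledLayout
from torchao.utils import TORCH_VERSION_AT_LEAST_2_6


def pick_int4_layouts(device: str):
    # CPU gets its dedicated layout on torch >= 2.6.
    if device == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
        return [Int4CPULayout()]
    # XPU likewise; the review question above asks whether this should
    # require torch >= 2.7 instead.
    if device == "xpu" and TORCH_VERSION_AT_LEAST_2_6:
        return [Int4XPULayout()]
    # Default (CUDA): tensor-core tiled packing at two tile widths.
    return [TensorCoreTiledLayout(inner_k_tiles=k) for k in (4, 2)]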
9 changes: 6 additions & 3 deletions test/quantization/test_quant_primitives.py
@@ -33,6 +33,7 @@
TORCH_VERSION_AT_LEAST_2_4,
TORCH_VERSION_AT_LEAST_2_5,
TORCH_VERSION_AT_LEAST_2_6,
TORCH_VERSION_AT_LEAST_2_7,
is_fbcode,
)

@@ -130,7 +131,8 @@ def _groupwise_affine_quantize_tensor_from_qparams(
)

if TORCH_VERSION_AT_LEAST_2_5:
-if not (is_device(w.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6):
+if not (is_device(w.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6) and not (
+    is_device(w.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_7
+):
w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(torch.uint8)

return w_int4x8
@@ -739,8 +741,9 @@ def test_groupwise_affine_dequantize_tensor_from_qparams(self):
zeros = torch.randint(0, 15, (10, 2), dtype=torch.int32)
if TORCH_VERSION_AT_LEAST_2_5:
input_tmp = input
-if not (
-    is_device(input.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6
+if not (
+    is_device(input.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6
+) and not (
+    is_device(input.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_7
):
input_tmp = (input[::, ::2] << 4 | input[::, 1::2]).to(torch.uint8)
w_bf16 = groupwise_affine_dequantize_tensor_from_qparams(
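Both hunks gate the same operation: packing two int4 values into one uint8, which the tests skip on CPU (torch >= 2.6) and, with this PR, on XPU (torch >= 2.7). A self-contained illustration of the packing expression used in the tests, with a round-trip check (shapes are arbitrary):

import torch

# Simulated int4 values stored one-per-int32, as the test fixtures do.
w_int4x8 = torch.randint(0, 16, (4, 8), dtype=torch.int32)

# Same expression as in the hunks: even columns become the high nibble,
# odd columns the low nibble, two values per byte.
packed = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(torch.uint8)

# Round trip: split the nibbles apart and interleave them back.
high = (packed >> 4).to(torch.int32)
low = (packed & 0xF).to(torch.int32)
assert torch.equal(
    torch.stack([high, low], dim=-1).reshape(w_int4x8.shape), w_int4x8
)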
2 changes: 2 additions & 0 deletions torchao/dtypes/__init__.py
@@ -16,6 +16,7 @@
BlockSparseLayout,
CutlassInt4PackedLayout,
Int4CPULayout,
Int4XPULayout,
MarlinQQQLayout,
MarlinQQQTensor,
MarlinSparseLayout,
@@ -52,4 +53,5 @@
"MarlinQQQLayout",
"Int4CPULayout",
"CutlassInt4PackedLayout",
"Int4XPULayout",
]
4 changes: 4 additions & 0 deletions torchao/dtypes/uintx/__init__.py
@@ -7,6 +7,9 @@
from .int4_cpu_layout import (
Int4CPULayout,
)
from .int4_xpu_layout import (
Int4XPULayout,
)
from .marlin_qqq_tensor import (
MarlinQQQLayout,
MarlinQQQTensor,
@@ -36,4 +39,5 @@
"MarlinQQQTensor",
"to_marlinqqq_quantized_intx",
"CutlassInt4PackedLayout",
"Int4XPULayout"
]
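With both __init__.py changes applied, the new layout should be importable from the public torchao.dtypes namespace as well as from the uintx subpackage. A quick sanity check, assuming the PR is applied:

from torchao.dtypes import Int4XPULayout as Int4XPULayoutPublic
from torchao.dtypes.uintx import Int4XPULayout as Int4XPULayoutUintx

# The public name is re-exported from the uintx subpackage, so the two
# import paths should resolve to the same class object.
assert Int4XPULayoutPublic is Int4XPULayoutUintx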