From 31020f5ae5db6060f294a5b10578b896f758a9b1 Mon Sep 17 00:00:00 2001
From: Elton Zheng <eltonz@microsoft.com>
Date: Sat, 15 Oct 2022 15:42:42 -0700
Subject: [PATCH 1/2] Fix build issues on Windows

---
 build_win.bat                                 | 19 +++++++++++++++++++
 .../inference/includes/inference_context.h    |  2 +-
 setup.py                                      | 18 ++++++++++--------
 3 files changed, 30 insertions(+), 9 deletions(-)
 create mode 100644 build_win.bat

diff --git a/build_win.bat b/build_win.bat
new file mode 100644
index 000000000000..ec8c8a362a78
--- /dev/null
+++ b/build_win.bat
@@ -0,0 +1,19 @@
+@echo off
+rem Build the wheel on Windows. setlocal keeps the DS_BUILD_* switches set
+rem below from leaking into the calling console once this script exits.
+setlocal
+set DS_BUILD_AIO=0
+set DS_BUILD_SPARSE_ATTN=0
+
+echo Administrative permissions required. Detecting permissions...
+rem A zero exit code from "net session" is taken as proof of elevation.
+net session >nul 2>&1
+if %errorLevel% == 0 (
+    echo Success: Administrative permissions confirmed.
+) else (
+    echo Failure: Current permissions inadequate.
+    goto end
+)
+
+python setup.py bdist_wheel
+:end
diff --git a/csrc/transformer/inference/includes/inference_context.h b/csrc/transformer/inference/includes/inference_context.h
index 64e490ef47fc..330da050cced 100644
--- a/csrc/transformer/inference/includes/inference_context.h
+++ b/csrc/transformer/inference/includes/inference_context.h
@@ -157,7 +157,7 @@ class Context {
     void* GetWorkSpace() { return _workspace; }
     void* GetAttentionUnfusedWorkspace()
     {
-        return _workspace + _attention_unfused_workspace_offset;
+        return (char*)_workspace + _attention_unfused_workspace_offset;
     }
 
     inline unsigned new_token(unsigned layer_id)
diff --git a/setup.py b/setup.py
index 24e5ec62b7dd..099fa1b929e7 100755
--- a/setup.py
+++ b/setup.py
@@ -4,12 +4,13 @@
 DeepSpeed library
 
 To build wheel on Windows:
-    1. Install pytorch, such as pytorch 1.8 + cuda 11.1
+    1. Install pytorch, such as pytorch 1.12 + cuda 11.6
     2. Install visual cpp build tool
-    3. Launch cmd console with Administrator privilege for creating required symlink folders
+    3. Include cuda toolkit
+    4. Launch cmd console with Administrator privilege for creating required symlink folders
 
 Create a new wheel via the following command:
-    python setup.py bdist_wheel
+    build_win.bat
 
 The wheel will be located at: dist/*.whl
 """
@@ -228,11 +229,12 @@ def create_dir_symlink(src, dest):
 hip_version = "0.0"
 if torch_available and torch.version.cuda is not None:
     cuda_version = ".".join(torch.version.cuda.split('.')[:2])
-    if isinstance(torch.cuda.nccl.version(), int):
-        # This will break if minor version > 9
-        nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
-    else:
-        nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
+    if sys.platform != "win32":
+        if isinstance(torch.cuda.nccl.version(), int):
+            # This will break if minor version > 9
+            nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
+        else:
+            nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
     if hasattr(torch.cuda, 'is_bf16_supported') and torch.cuda.is_available():
         bf16_support = torch.cuda.is_bf16_supported()
 if torch_available and hasattr(torch.version, 'hip') and torch.version.hip is not None:

From e10c0f2a819be3c0862d4c46f86c978b9a05e322 Mon Sep 17 00:00:00 2001
From: Reza Yazdani <reyazda@microsoft.com>
Date: Tue, 18 Oct 2022 00:53:09 +0500
Subject: [PATCH 2/2] small fix to compile with new version of Microsoft C++
 Build Tools

---
 op_builder/builder.py    | 1 +
 op_builder/fused_adam.py | 9 +++++++--
 op_builder/fused_lamb.py | 9 +++++++--
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/op_builder/builder.py b/op_builder/builder.py
index 09b781fddd72..9fa94985b780 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -650,6 +650,7 @@ def nvcc_args(self):
         else:
             cuda_major, _ = installed_cuda_version()
             args += [
+                '-allow-unsupported-compiler' if sys.platform == "win32" else '',
                 '--use_fast_math',
                 '-std=c++17'
                 if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py
index 6ff264fbf1a1..2883d417ede9 100644
--- a/op_builder/fused_adam.py
+++ b/op_builder/fused_adam.py
@@ -3,6 +3,8 @@
 """
 from .builder import CUDAOpBuilder
 
+import sys
+
 
 class FusedAdamBuilder(CUDAOpBuilder):
     BUILD_VAR = "DS_BUILD_FUSED_ADAM"
@@ -27,6 +29,9 @@ def cxx_args(self):
     def nvcc_args(self):
+        """Return nvcc flags: '-O3', version macros and, off ROCm, the CUDA-only flags."""
         nvcc_flags = ['-O3'] + self.version_dependent_macros()
         if not self.is_rocm_pytorch():
-            nvcc_flags.extend(['-lineinfo',
-                               '--use_fast_math'] + self.compute_capability_args())
+            # The host-compiler override is Windows-only; add nothing (not '') elsewhere.
+            if sys.platform == "win32":
+                nvcc_flags.append('-allow-unsupported-compiler')
+            nvcc_flags.extend(['-lineinfo', '--use_fast_math'] + self.compute_capability_args())
         return nvcc_flags
diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py
index 106728f6f3fe..d5f88d0b1ad1 100644
--- a/op_builder/fused_lamb.py
+++ b/op_builder/fused_lamb.py
@@ -3,6 +3,8 @@
 """
 from .builder import CUDAOpBuilder
 
+import sys
+
 
 class FusedLambBuilder(CUDAOpBuilder):
     BUILD_VAR = 'DS_BUILD_FUSED_LAMB'
@@ -33,6 +35,9 @@ def nvcc_args(self):
                 '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
             ]
         else:
-            nvcc_flags.extend(['-lineinfo',
-                               '--use_fast_math'] + self.compute_capability_args())
+            # The host-compiler override is Windows-only; add nothing (not '') elsewhere.
+            if sys.platform == "win32":
+                nvcc_flags.append('-allow-unsupported-compiler')
+            nvcc_flags.extend(['-lineinfo', '--use_fast_math'] +
+                              self.compute_capability_args())
         return nvcc_flags