Skip to content

Commit

Permalink
Use ninja to speed up build (microsoft#5088)
Browse files Browse the repository at this point in the history
DeepSpeed has too many ops now, and it takes too much time to pre-build
all of them.
I noticed that DeepSpeed disabled `ninja` 4 years ago
(microsoft#298), and I think we should
consider enabling it now.
The issue mentioned in microsoft#298
can be solved by resolving the `include_dirs` entries to absolute paths.

---------

Co-authored-by: Logan Adams <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
Co-authored-by: Olatunji Ruwase <[email protected]>
Co-authored-by: Michael Wyatt <[email protected]>
  • Loading branch information
5 people authored and SNahir committed Mar 11, 2024
1 parent 60f0fdc commit b7822cd
Show file tree
Hide file tree
Showing 17 changed files with 34 additions and 21 deletions.
16 changes: 12 additions & 4 deletions op_builder/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,9 +453,10 @@ def deepspeed_src_path(self, code_path):

def builder(self):
from torch.utils.cpp_extension import CppExtension
include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
return CppExtension(name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
include_dirs=include_dirs,
extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

Expand Down Expand Up @@ -638,7 +639,7 @@ def builder(self):
from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
else:
from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder

include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
{'cxx': self.strip_empty_entries(self.cxx_args()), \
'nvcc': self.strip_empty_entries(self.nvcc_args())}
Expand All @@ -651,7 +652,7 @@ def builder(self):

cuda_ext = ExtensionBuilder(name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
include_dirs=include_dirs,
libraries=self.strip_empty_entries(self.libraries_args()),
extra_compile_args=compile_args,
extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
Expand Down Expand Up @@ -702,11 +703,18 @@ def nvcc_args(self):
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
]
else:
try:
nvcc_threads = int(os.getenv("DS_NVCC_THREADS", ""))
if nvcc_threads <= 0:
raise ValueError("")
except ValueError:
nvcc_threads = min(os.cpu_count(), 8)

cuda_major, _ = installed_cuda_version()
args += [
'-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math',
'-std=c++17' if cuda_major > 10 else '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__'
'-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', f'--threads={nvcc_threads}'
]
if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
args.append('--ptxas-options=-v')
Expand Down
6 changes: 4 additions & 2 deletions op_builder/cpu/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

# DeepSpeed Team

import os

try:
# is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
# if successful this also means we're doing a local install and not JIT compile path
Expand All @@ -16,12 +18,12 @@ class CPUOpBuilder(OpBuilder):

def builder(self):
from torch.utils.cpp_extension import CppExtension as ExtensionBuilder

include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())}

cpp_ext = ExtensionBuilder(name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
include_dirs=include_dirs,
libraries=self.strip_empty_entries(self.libraries_args()),
extra_compile_args=compile_args)

Expand Down
6 changes: 4 additions & 2 deletions op_builder/hpu/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

# DeepSpeed Team

import os

try:
# is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
# if successful this also means we're doing a local install and not JIT compile path
Expand All @@ -17,12 +19,12 @@ class CPUOpBuilder(OpBuilder):

def builder(self):
from torch.utils.cpp_extension import CppExtension as ExtensionBuilder

include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())}

cpp_ext = ExtensionBuilder(name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
include_dirs=include_dirs,
libraries=self.strip_empty_entries(self.libraries_args()),
extra_compile_args=compile_args)

Expand Down
8 changes: 4 additions & 4 deletions op_builder/inference_core_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ def sources(self):
sources = [
"inference/v2/kernels/core_ops/core_ops.cpp",
"inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp",
"inference/v2/kernels/core_ops/bias_activations/bias_activation.cu",
"inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu",
"inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cpp",
"inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu",
"inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu",
"inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cpp",
"inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu",
"inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu",
"inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cpp",
"inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu",
"inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu",
]

prefix = self.get_prefix()
Expand Down
12 changes: 6 additions & 6 deletions op_builder/ragged_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,18 @@ def sources(self):
"inference/v2/kernels/ragged_ops/atom_builder/atom_builder.cpp",
"inference/v2/kernels/ragged_ops/blocked_flash/blocked_flash.cpp",
"inference/v2/kernels/ragged_ops/embed/embed.cpp",
"inference/v2/kernels/ragged_ops/embed/embed.cu",
"inference/v2/kernels/ragged_ops/embed/embed_cuda.cu",
"inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp",
"inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu",
"inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu",
"inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cpp",
"inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu",
"inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu",
"inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp",
"inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu",
"inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu",
"inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp",
"inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu",
"inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu",
"inference/v2/kernels/ragged_ops/ragged_helpers/ragged_kernel_helpers.cpp",
"inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp",
"inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cu",
"inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu",
]

prefix = self.get_prefix()
Expand Down
4 changes: 2 additions & 2 deletions op_builder/xpu/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ def builder(self):
from intel_extension_for_pytorch.xpu.cpp_extension import DPCPPExtension
except ImportError:
from intel_extension_for_pytorch.xpu.utils import DPCPPExtension

include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
print("dpcpp sources = {}".format(self.sources()))
dpcpp_ext = DPCPPExtension(name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
include_dirs=include_dirs,
extra_compile_args={
'cxx': self.strip_empty_entries(self.cxx_args()),
},
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def get_env_if_set(key, default: typing.Any = ""):
# For any pre-installed ops force disable ninja.
if torch_available:
from accelerator import get_accelerator
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
use_ninja = not is_env_set("DS_DISABLE_NINJA")
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja)

if torch_available:
TORCH_MAJOR = torch.__version__.split('.')[0]
Expand Down

0 comments on commit b7822cd

Please sign in to comment.