Skip to content

Commit

Permalink
[aarch64] cherry-pick Xbyak crash fix and linter error fixes from main (
Browse files Browse the repository at this point in the history
#1649)

* cleanup mkldnn patching (#1630)

PyTorch has moved to oneDNN v3.3.2, and some of the
old patches are no longer applicable.

* Add `aarch64_linux` to the list of linted files

* Actually fix lint this time

* aarch64: patch mkl-dnn for xbyak crashes due to /sys not accessible

There are platforms where /sys is not mounted. Skip handling HW caps
for such platforms.

cherry-pick of: oneapi-src/oneDNN#1773
This fixes issue pytorch/pytorch#115482

---------

Co-authored-by: Nikita Shulga <[email protected]>
  • Loading branch information
snadampal and malfet authored Jan 3, 2024
1 parent b5527e4 commit 68a5236
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 45 deletions.
2 changes: 1 addition & 1 deletion .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ merge_base_with = "origin/main"

[[linter]]
code = 'RUFF'
include_patterns = ['test/smoke_test/*.py', 's3_management/*.py']
include_patterns = ['test/smoke_test/*.py', 's3_management/*.py', 'aarch64_linux/*.py']
command = [
'python3',
'tools/linter/adapters/ruff_linter.py',
Expand Down
28 changes: 15 additions & 13 deletions aarch64_linux/aarch64_wheel_ci_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# encoding: UTF-8

import os
import subprocess
from subprocess import check_output
from pygit2 import Repository
from typing import List

Expand All @@ -11,18 +11,20 @@ def list_dir(path: str) -> List[str]:
''''
Helper for getting paths for Python
'''
return subprocess.check_output(["ls", "-1", path]).decode().split("\n")
return check_output(["ls", "-1", path]).decode().split("\n")


def build_ArmComputeLibrary(git_clone_flags: str = "") -> None:
'''
Using ArmComputeLibrary for aarch64 PyTorch
'''
print('Building Arm Compute Library')
acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0",
"arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"])
os.system("cd / && mkdir /acl")
os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}")
os.system("cd ComputeLibrary; export acl_install_dir=/acl; "
"scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native build_dir=$acl_install_dir/build; "
f"scons Werror=1 -j8 {acl_build_flags} build_dir=$acl_install_dir/build; "
"cp -r arm_compute $acl_install_dir; "
"cp -r include $acl_install_dir; "
"cp -r utils $acl_install_dir; "
Expand Down Expand Up @@ -86,13 +88,12 @@ def parse_arguments():
if override_package_version is not None:
version = override_package_version
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
else:
if branch in ['nightly', 'master']:
build_date = subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '')
version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
if branch.startswith("v1.") or branch.startswith("v2."):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
elif branch in ['nightly', 'master']:
build_date = check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '')
version = check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
elif branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

if enable_mkldnn:
build_ArmComputeLibrary(git_clone_flags)
Expand All @@ -105,9 +106,10 @@ def parse_arguments():
else:
print("build pytorch without mkldnn backend")

# work around to fix Raspberry pie crash
print("Applying mkl-dnn patch to fix readdir crash")
os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-readdir-crash.patch")
# patch mkldnn to fix aarch64 mac and aws lambda crash
print("Applying mkl-dnn patch to fix crash due to /sys not accesible")
os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/fix-xbyak-failure.patch")

os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
pytorch_wheel_name = complete_wheel("pytorch")
print(f"Build Compelete. Created {pytorch_wheel_name}..")
33 changes: 18 additions & 15 deletions aarch64_linux/build_aarch64_wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

# This script is for building AARCH64 wheels using AWS EC2 instances.
# To generate binaries for the release follow these steps:
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this: "v1.11.0": ("0.11.0", "rc1"),
# 2. Run script with following arguments for each of the supported python versions and specify required RC tag for example: v1.11.0-rc3:
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
# "v1.11.0": ("0.11.0", "rc1"),
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3


import boto3
Expand Down Expand Up @@ -177,7 +178,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, socket.timeout):
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
Expand All @@ -203,7 +204,7 @@ def install_condaforge(host: RemoteHost,
if host.using_docker():
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
else:
host.run_cmd(['sed', '-i', '\'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH\'', '.bashrc'])
host.run_cmd(['sed', '-i', '\'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH\'', '.bashrc']) # noqa: E501


def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
Expand All @@ -221,12 +222,13 @@ def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
print('Building OpenBLAS')
host.run_cmd(f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.25 {git_clone_flags}")
make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS")
host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS") # noqa: E501


def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
print('Building Arm Compute Library')
acl_build_flags="debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native"
acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0",
"arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"])
host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}")
host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")

Expand Down Expand Up @@ -301,7 +303,7 @@ def build_torchvision(host: RemoteHost, *,
# Remove .so files to force static linking
host.run_cmd("rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so")
# And patch setup.py to include libz dependency for libpng
host.run_cmd(['sed -i -e \'s/image_link_flags\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'])
host.run_cmd(['sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py']) # noqa: E501

build_vars = ""
if branch == "nightly":
Expand Down Expand Up @@ -525,7 +527,7 @@ def start_build(host: RemoteHost, *,
if host.using_docker():
print("Move libgfortant.a into a standard location")
# HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501
# Workaround by copying gfortran library from the host
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
Expand All @@ -543,22 +545,23 @@ def start_build(host: RemoteHost, *,
# Breakpad build fails on aarch64
build_vars = "USE_BREAKPAD=0 "
if branch == 'nightly':
build_date = host.check_output("cd pytorch && git log --pretty=format:%s -1").strip().split()[0].replace("-", "")
build_date = host.check_output("cd pytorch && git log --pretty=format:%s -1").strip().split()[0].replace("-", "") # noqa: E501
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
if branch.startswith("v1.") or branch.startswith("v2."):
if branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
build_ArmComputeLibrary(host, git_clone_flags)
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
host.run_cmd(f"cd $HOME && git clone https://github.com/pytorch/builder.git")
host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}")
host.run_cmd("cd $HOME && git clone https://github.com/pytorch/builder.git && cd builder && git checkout release/2.2") # noqa: E501
host.run_cmd("cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < $HOME/builder/mkldnn_fix/fix-xbyak-failure.patch") # noqa: E501
host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") # noqa: E501
print('Repair the wheel')
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}")
host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}") # noqa: E501
print('replace the original wheel with the repaired one')
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
host.run_cmd(f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}")
Expand Down Expand Up @@ -706,7 +709,7 @@ def parse_arguments():
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
parser.add_argument("--os", type=str, choices=list(os_amis.keys()), default='ubuntu20_04')
parser.add_argument("--python-version", type=str, choices=['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'], default=None)
parser.add_argument("--python-version", type=str, choices=[f'3.{d}' for d in range(6, 12)], default=None)
parser.add_argument("--alloc-instance", action="store_true")
parser.add_argument("--list-instances", action="store_true")
parser.add_argument("--pytorch-only", action="store_true")
Expand Down
4 changes: 2 additions & 2 deletions aarch64_linux/embed_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


def replace_tag(filename):
with open(filename, 'r') as f:
with open(filename) as f:
lines = f.read().split("\\n")
for i,line in enumerate(lines):
if not line.startswith("Tag: "):
Expand Down Expand Up @@ -42,7 +42,7 @@ def embed_library(whl_path, lib_soname, update_tag=False):
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
ctx.out_wheel=tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, elf in elf_file_filter(ctx.iter_files()):
for filename, _ in elf_file_filter(ctx.iter_files()):
if not filename.startswith('torch/lib'):
continue
libtree = lddtree(filename)
Expand Down
14 changes: 0 additions & 14 deletions mkldnn_fix/aarch64-fix-readdir-crash.patch

This file was deleted.

96 changes: 96 additions & 0 deletions mkldnn_fix/fix-xbyak-failure.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
cpu: aarch64: fix xbyak functions for /sys access failures

There are platforms where /sys is not mounted. Skip handling HW caps
for such platforms.

This fixes issue pytorch/pytorch#115482
---
.../xbyak_aarch64/src/util_impl_linux.h | 24 ++++++++++++++-----
.../aarch64/xbyak_aarch64/src/util_impl_mac.h | 9 ++++---
2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
index 2c7b28e58b..860a05700f 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
@@ -144,8 +144,13 @@ private:
regex_t regexBuf;
regmatch_t match[1];

- if (regcomp(&regexBuf, regex, REG_EXTENDED) != 0)
- throw ERR_INTERNAL;
+ if (regcomp(&regexBuf, regex, REG_EXTENDED) != 0) {
+ /* There are platforms with /sys not mounted. return empty buffers
+ * in these scenarios
+ */
+ buf[0] = '\0';
+ return 0;
+ }

const int retVal = regexec(&regexBuf, path, 1, match, 0);
regfree(&regexBuf);
@@ -187,8 +192,12 @@ private:
regex_t regexBuf;
regmatch_t match[2];

- if (regcomp(&regexBuf, "index[0-9]*$", REG_EXTENDED) != 0)
- throw ERR_INTERNAL;
+ if (regcomp(&regexBuf, "index[0-9]*$", REG_EXTENDED) != 0) {
+ /* There are platforms with /sys not mounted. return gracefully
+ * in these scenarios
+ */
+ goto init_and_return_false;
+ }

if (regexec(&regexBuf, dp->d_name, 1, match, 0) == 0) { // Found index[1-9][0-9]. directory
char *dir_name = buf0;
@@ -438,12 +447,15 @@ private:

FILE *file = fopen(path_midr_el1, "r");
if (file == nullptr) {
- throw Error(ERR_INTERNAL);
+ /* There are platforms with /sys not mounted. return empty buffer
+ * in these scenarios
+ */
+ cacheInfo_.midr_el1 = 0xFE << 24;
return;
}

if (fread(buf, sizeof(char), 64, file) == 0) {
- throw Error(ERR_INTERNAL);
+ cacheInfo_.midr_el1 = 0xFE << 24;
return;
}

diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
index ebd6dba7c0..93bdae1d7a 100644
--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
@@ -102,18 +102,21 @@ private:
size_t val = 0;
size_t len = sizeof(val);

+ /* There are platforms with /sys not mounted. skip
+ * handling HW caps for such platforms.
+ */
if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0)
- throw Error(ERR_INTERNAL);
+ type_ = 0;
else
type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ATOMIC : 0;

if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0)
- throw Error(ERR_INTERNAL);
+ type_ = 0;
else
type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_FP : 0;

if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0)
- throw Error(ERR_INTERNAL);
+ type_ = 0;
else
type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ADVSIMD : 0;
}
--
2.34.1

0 comments on commit 68a5236

Please sign in to comment.