diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000000000..36fa23577f27d --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,19 @@ +self-hosted-runner: + labels: + - cn + - cuda + - OpenGL + - vulkan + - m1 + - driver470 + - driver510 + - benchmark + - release + - build + - sm70 + - sm86 + - amdgpu + - online + +# config-variables: +# - ENVIRONMENT_STAGE diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000000000..5da34e3b05b9b --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,395 @@ +name: Build Taichi +on: + workflow_call: + inputs: + build_id: + required: true + type: string + nightly: + required: true + type: boolean + python: + required: true + type: string + secrets: + BOT_MINIO_ACCESS_KEY: + required: true + BOT_MINIO_SECRET_KEY: + required: true + workflow_dispatch: + inputs: + build_id: + description: 'The build id. e.g.: 20230427-102544-abcdefab' + required: true + type: string + nightly: + description: 'Are we building nightly wheels?' + required: true + default: false + type: boolean + python: + description: 'JSON encoded python versions need building wheel. e.g.: ["3.7"]' + required: true + type: string + +concurrency: + group: build-${{ github.event.number || github.run_id }} + cancel-in-progress: true + +env: + TI_CI: "1" + TI_SKIP_VERSION_CHECK: 'ON' + CI_IMAGE_VERSION: '202304251731' + TI_USE_GIT_CACHE: ${{ vars.TI_USE_GIT_CACHE }} + NIGHTLY: ${{ inputs.nightly && 'nightly' || '' }} + +jobs: + show_environ: + name: Show Environment Variables + # Disable this workflow on forks + if: github.repository_owner == 'taichi-dev' + runs-on: [self-hosted, Linux] + steps: + - name: Environment Variables + run: env + - name: Github Object + run: | + cat <<'EOF' + ${{ toJson(github) }} + EOF + + build_cpu_mac: + name: Build macOS-x86 + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + include: + - _designated: '' + designated: designated + runs-on: + - self-hosted + - macos-10.15 + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=ON + -DTI_WITH_C_API:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: ./build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_linux: + name: Build Linux + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + cuda: ['', cuda] + llvm: ['', llvm] + gl: ['', gl] + vk: ['', vk] + python: ${{ fromJSON(inputs.python) }} + include: + - {cuda: cuda, llvm: llvm, gl: gl, vk: vk, designated: designated} + exclude: + - {llvm: '', cuda: cuda} + runs-on: [self-hosted, online, Linux, build] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=${{ matrix.cuda && 'ON' || 'OFF' }} + -DTI_WITH_LLVM:BOOL=${{ matrix.llvm && 'ON' || 'OFF' }} + -DTI_WITH_OPENGL:BOOL=${{ matrix.gl && 
'ON' || 'OFF' }} + -DTI_WITH_VULKAN:BOOL=${{ matrix.vk && 'ON' || 'OFF' }} + -DTI_WITH_METAL:BOOL=OFF + -DTI_WITH_BACKTRACE:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build + run: | + . .github/workflows/scripts/common-utils.sh + ci-docker-run-gpu \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-cuda:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + env: + SHOULD_TAG_CONFIG: ${{ !matrix.designated && 'yes' || '' }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_manylinux2014: + name: Build manylinux2014 + timeout-minutes: 30 + runs-on: [self-hosted, online, Linux, build] + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + include: + - _designated: '' + designated: designated + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=OFF + -DTI_BUILD_TESTS:BOOL=ON + + steps: + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build + run: | + . .github/workflows/scripts/common-utils.sh + + ci-docker-run-gpu \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-manylinux2014-cuda:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} ${SHOULD_TAG_CONFIG:+--tag-config} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_amdgpu_linux: + name: Build AMDGPU + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + designated: [''] + runs-on: [self-hosted, online, Linux, build] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=OFF + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_AMDGPU:BOOL=ON + + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + fetch-depth: '0' + + - name: Build & Install + run: | + . 
.github/workflows/scripts/common-utils.sh + + ci-docker-run \ + -v $(pwd):/home/dev/taichi \ + registry.botmaster.tgr/taichi-build-amdgpu:${{ env.CI_IMAGE_VERSION }} \ + /home/dev/taichi/build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} --tag-local=amd + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_windows: + name: Build Windows + strategy: + fail-fast: false + matrix: + cuda: ['', cuda] + llvm: ['', llvm] + gl: ['', gl] + vk: ['', vk] + python: ${{ fromJSON(inputs.python) }} + include: + - {cuda: cuda, llvm: llvm, gl: gl, vk: vk, lto: lto, pdb: pdb, designated: designated} + exclude: + - {llvm: '', cuda: cuda} + runs-on: [self-hosted, online, Windows, build] + timeout-minutes: 30 + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_CUDA:BOOL=${{ matrix.cuda && 'ON' || 'OFF' }} + -DTI_WITH_LLVM:BOOL=${{ matrix.llvm && 'ON' || 'OFF' }} + -DTI_WITH_OPENGL:BOOL=${{ matrix.gl && 'ON' || 'OFF' }} + -DTI_WITH_VULKAN:BOOL=${{ matrix.vk && 'ON' || 'OFF' }} + -DTI_WITH_METAL:BOOL=OFF + -DTI_WITH_BACKTRACE:BOOL=ON + -DTI_WITH_DX11:BOOL=ON + -DTI_WITH_DX12:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_C_API:BOOL=ON + -DTI_WITH_LTO:BOOL=${{ matrix.lto && 'ON' || 'OFF' }} + -DTI_GENERATE_PDB:BOOL=${{ matrix.pdb && 'ON' || 'OFF' }} + steps: + - name: Workaround checkout Needed single revision issue + shell: pwsh + run: | + $ErrorActionPreference = 'SilentlyContinue' + git config --system core.longpaths true + git submodule foreach --recursive 'git rev-parse HEAD || rm -rf $PWD' + $LASTEXITCODE = 0 + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - uses: actions/setup-python@v4 + with: + # force a 3.7 is ok, build.py will handle actual python env + python-version: 3.7 + + - name: Build + shell: pwsh + run: | + $nightlyFlag = $null + if ($env:NIGHTLY) { $nightlyFlag = "--nightly" } + $tagFlag = $null + if ($env:SHOULD_TAG_CONFIG) { $tagFlag = "--tag-config" } + python build.py $nightlyFlag --python=${{ matrix.python }} $tagFlag + env: + SHOULD_TAG_CONFIG: ${{ !matrix.designated && 'yes' || '' }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + - name: Cleanup Git Cache Configs + shell: pwsh + if: always() + run: | + . 
.github/workflows/scripts/common-utils.ps1 + UnsetGitCachingProxy + exit 0 + + build_m1: + name: Build M1 + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + python: ${{ fromJSON(inputs.python) }} + _designated: [''] + exclude: + - python: "3.7" + include: + - _designated: '' + designated: designated + defaults: + run: + shell: '/usr/bin/arch -arch arm64e /bin/bash --noprofile --norc -eo pipefail {0}' + runs-on: [self-hosted, online, m1] + env: + TAICHI_CMAKE_ARGS: >- + -DTI_WITH_OPENGL:BOOL=OFF + -DTI_WITH_CUDA:BOOL=OFF + -DTI_WITH_VULKAN:BOOL=ON + -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_C_API=ON + -DTI_WITH_STATIC_C_API=ON + PLATFORM: 'm1' + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: | + brew install molten-vk + ./build.py ${NIGHTLY:+--nightly} --python=${{ matrix.python }} + + - name: Upload Built Wheel + uses: shallwefootball/s3-upload-action@v1.3.3 + with: + aws_key_id: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + aws_secret_access_key: ${{ secrets.BOT_MINIO_SECRET_KEY }} + aws_bucket: built-wheels + source_dir: dist + destination_dir: built-wheels/${{ inputs.build_id }}/${{ !matrix.designated && 'matrix/' || '' }} + endpoint: http://botmaster.tgr:9000 + + build_ios_capi: + name: Build iOS C-API Static Library + timeout-minutes: 30 + runs-on: [self-hosted, online, m1] + steps: + - name: Workaround checkout Needed single revision issue + run: git submodule foreach 'git rev-parse HEAD > /dev/null 2>&1 || rm -rf $PWD' || true + + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + submodules: 'recursive' + + - name: Build + run: .github/workflows/scripts/build.py ios + + - name: Save Compiled Static Library + uses: actions/upload-artifact@v3 + with: + name: libtaichi_c_api.iOS.a + path: 'dist/C-API-iOS/*.a' + retention-days: 7 diff --git a/.github/workflows/initiator.yaml b/.github/workflows/initiator.yaml new file mode 100644 index 0000000000000..8cd5cfd4ddf8a --- /dev/null +++ b/.github/workflows/initiator.yaml @@ -0,0 +1,74 @@ +name: Taichi Workflow Initiator +on: + push: + branches: + - master + - rc-* + +concurrency: + group: ${{ github.event.number || github.run_id }} + cancel-in-progress: true + +env: + TI_CI: "1" + TI_SKIP_VERSION_CHECK: 'ON' + TI_LITE_TEST: ${{ github.event_name == 'pull_request' && ! contains(github.event.pull_request.labels.*.name, 'full-ci') && ! 
startsWith(github.base_ref, 'rc-') && '1' || '' }} + TI_TEST_OFFLINE_CACHE: ${{ github.event.schedule == '0 18 * * *' && '1' || '' }} + CI_IMAGE_VERSION: '202304251731' + TI_USE_GIT_CACHE: ${{ vars.TI_USE_GIT_CACHE }} + REDIS_HOST: 172.16.5.1 + +jobs: + show_environ: + name: Show Environment Variables + # Disable this workflow on forks + if: github.repository_owner == 'taichi-dev' + runs-on: [self-hosted, Linux] + steps: + - name: Environment Variables + run: env + - name: Github Object + run: | + cat <<'EOF' + ${{ toJson(github) }} + EOF + + preparation: + name: Preparation + runs-on: [self-hosted, Linux] + outputs: + date: ${{ steps.gather.outputs.date }} + short_sha: ${{ steps.gather.outputs.short_sha }} + build_id: ${{ steps.gather.outputs.build_id }} + steps: + - name: Gather Information + id: gather + run: | + DATE=$(date +'%Y%m%d-%H%M%S') + SHORT_SHA=$(echo $GITHUB_SHA | cut -c '1-10') + echo date=$DATE >> $GITHUB_OUTPUT + echo short_sha=$SHORT_SHA >> $GITHUB_OUTPUT + echo build_id=$DATE-$SHORT_SHA >> $GITHUB_OUTPUT + + build: + name: Build Artifacts + needs: [preparation] + uses: ./.github/workflows/build.yaml + with: + build_id: ${{ needs.preparation.outputs.build_id }} + nightly: false + python: '["3.9", "3.10", "3.11"]' + secrets: + BOT_MINIO_ACCESS_KEY: ${{ secrets.BOT_MINIO_ACCESS_KEY }} + BOT_MINIO_SECRET_KEY: ${{ secrets.BOT_MINIO_SECRET_KEY }} + + save_build_metadata: + name: Save Build Metadata + runs-on: [self-hosted, Linux] + needs: [preparation, build] + steps: + - name: Saving Metadata + id: gather + run: >- + redis-cli -h $REDIS_HOST --raw + set "latest-build-id:$GITHUB_REPOSITORY:$GITHUB_REF" ${{ needs.preparation.outputs.build_id }} diff --git a/.github/workflows/scripts/ti_build/cmake.py b/.github/workflows/scripts/ti_build/cmake.py index cf6d07ed5df14..2dc327710e40b 100644 --- a/.github/workflows/scripts/ti_build/cmake.py +++ b/.github/workflows/scripts/ti_build/cmake.py @@ -14,7 +14,7 @@ from .misc import banner # -- code -- -OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)') +OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)(?: *# wheel-tag: (.*))?') DEF_RE = re.compile(r"-D([A-Z0-9_]*)(?::BOOL)?=([^ ]+)(?: |$)") @@ -28,10 +28,7 @@ def __init__(self, environ_name): self.environ_name = environ_name self.definitions = {} self.option_definitions = { - "CMAKE_EXPORT_COMPILE_COMMANDS": ( - "Generate compile_commands.json", - False, - ), + "CMAKE_EXPORT_COMPILE_COMMANDS": ("Generate compile_commands.json", False, ""), } self.finalized = False @@ -39,9 +36,9 @@ def __init__(self, environ_name): def collect_options(self, *files: str) -> None: for fn in files: with open(fn, "r") as f: - for name, desc, default in OPTION_RE.findall(f.read()): + for name, desc, default, wheel_tag in OPTION_RE.findall(f.read()): default = self._VMAP.get(default, default) - self.option_definitions[name] = (desc, default) + self.option_definitions[name] = (desc, default, wheel_tag) def parse_initial_args(self) -> None: args = os.environ.get(self.environ_name, "") @@ -49,7 +46,7 @@ def parse_initial_args(self) -> None: self.set(name, value) def get_effective(self, name: str) -> Union[str, bool]: - _, default = self.option_definitions.get(name, ("", None)) + _, default, _ = self.option_definitions.get(name, ("", None, "")) return self.definitions.get(name, default) def set(self, name: str, value: Union[str, bool]) -> None: @@ -57,7 +54,7 @@ def set(self, name: str, value: Union[str, bool]) -> None: desc = "" value = self._VMAP.get(value, value) default = None 
- desc, default = self.option_definitions.get(name, ("", None)) + desc, default, wheel_tag = self.option_definitions.get(name, ("", None, "")) desc = desc and f" ({desc}) " is_bool = isinstance(default, bool) assert not is_bool or isinstance(value, bool), f"Option {name} must be bool" @@ -84,6 +81,7 @@ def set(self, name: str, value: Union[str, bool]) -> None: else: p(f"{B}:: CMAKE: Already disabled: {name}{desc}{N}") else: + assert not wheel_tag, "Set a non boolean value to an option with wheel-tag" if orig != value: if orig != default: p(f"{R}:: CMAKE- {name}={orig}{desc}{N}") @@ -99,7 +97,7 @@ def render(self) -> List[Tuple[str, str, str]]: else: v = f"-D{name}={value}" - desc, _ = self.option_definitions.get(name, ("", None)) + desc, _, _ = self.option_definitions.get(name, ("", None, "")) if desc: prefix = "DO NOT " if not value else "" desc = f" ({prefix}{desc})" @@ -108,6 +106,15 @@ def render(self) -> List[Tuple[str, str, str]]: return lst + def render_wheel_tag(self) -> str: + tags = [] + for name, (_, default, wheel_tag) in self.option_definitions.items(): + if not wheel_tag: + continue + if self.definitions.get(name, default): + tags.append(wheel_tag) + return ".".join(sorted(tags)) + @banner("{self.environ_name} Summary") def print_summary(self, rendered) -> None: p = lambda s: print(s, file=sys.stderr, flush=True) diff --git a/.github/workflows/scripts/ti_build/compiler.py b/.github/workflows/scripts/ti_build/compiler.py index 5854705bc3f55..4c3b66dc15f60 100644 --- a/.github/workflows/scripts/ti_build/compiler.py +++ b/.github/workflows/scripts/ti_build/compiler.py @@ -3,14 +3,18 @@ # -- stdlib -- from pathlib import Path import os +import json import platform import shutil +import tempfile +import sys # -- third party -- # -- own -- from .cmake import cmake_args from .dep import download_dep from .misc import banner, error, get_cache_home, warn +from .tinysh import powershell # -- code -- @@ -56,36 +60,90 @@ def setup_clang(as_compiler=True) -> None: cmake_args["CMAKE_CXX_COMPILER"] = clangpp +ENV_EXTRACT_SCRIPT = """ +param ([string]$DevShell, [string]$VsPath, [string]$OutFile) +$WarningPreference = 'SilentlyContinue' +Import-Module $DevShell +Enter-VsDevShell -VsInstallPath $VsPath -SkipAutomaticLocation -DevCmdArguments "-arch=x64" +Get-ChildItem env:* | ConvertTo-Json -Depth 1 | Out-File $OutFile +""" + + +def _vs_devshell(vs): + dll = vs / "Common7" / "Tools" / "Microsoft.VisualStudio.DevShell.dll" + + if not dll.exists(): + error("Could not find Visual Studio DevShell") + return + + with tempfile.TemporaryDirectory() as tmp: + tmp = Path(tmp) + script = tmp / "extract.ps1" + with script.open("w") as f: + f.write(ENV_EXTRACT_SCRIPT) + outfile = tmp / "env.json" + powershell( + "-ExecutionPolicy", + "Bypass", + "-File", + str(script), + "-DevShell", + str(dll), + "-VsPath", + str(vs), + "-OutFile", + str(outfile), + ) + with outfile.open(encoding="utf-16") as f: + envs = json.load(f) + + for v in envs: + os.environ[v["Key"]] = v["Value"] + + @banner("Setup MSVC") def setup_msvc() -> None: assert platform.system() == "Windows" - os.environ["TAICHI_USE_MSBUILD"] = "1" - base = Path(r"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools") - for edition in ("Enterprise", "Professional", "Community", "BuildTools"): - if (base / edition).exists(): - return + base = Path("C:\\Program Files (x86)\\Microsoft Visual Studio") + for ver in ("2022",): + for edition in ("Enterprise", "Professional", "Community", "BuildTools"): + vs = base / ver / edition + if not 
vs.exists(): + continue + + if os.environ.get("TI_CI") and not os.environ.get("TAICHI_USE_MSBUILD"): + # Use Ninja + MSVC in CI, for better caching + _vs_devshell(vs) + cmake_args["CMAKE_C_COMPILER"] = "cl.exe" + cmake_args["CMAKE_CXX_COMPILER"] = "cl.exe" + else: + os.environ["TAICHI_USE_MSBUILD"] = "1" - url = "https://aka.ms/vs/17/release/vs_BuildTools.exe" - out = base - download_dep( - url, - out, - elevate=True, - args=[ - "--passive", - "--wait", - "--norestart", - "--includeRecommended", - "--add", - "Microsoft.VisualStudio.Workload.VCTools", - # NOTE: We are using the custom built Clang++, - # so components below are not necessary anymore. - # '--add', - # 'Microsoft.VisualStudio.Component.VC.Llvm.Clang', - # '--add', - # 'Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Llvm.Clang', - # '--add', - # 'Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset', - ], - ) + return + else: + url = "https://aka.ms/vs/17/release/vs_BuildTools.exe" + out = base / "2022" / "BuildTools" + download_dep( + url, + out, + elevate=True, + args=[ + "--passive", + "--wait", + "--norestart", + "--includeRecommended", + "--add", + "Microsoft.VisualStudio.Workload.VCTools", + # NOTE: We are using the custom built Clang++, + # so components below are not necessary anymore. + # '--add', + # 'Microsoft.VisualStudio.Component.VC.Llvm.Clang', + # '--add', + # 'Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Llvm.Clang', + # '--add', + # 'Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset', + ], + ) + warn("Please restart build.py after Visual Studio Build Tools is installed.") + sys.exit(1) diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py index 41dd136be38e9..663de529f1463 100644 --- a/.github/workflows/scripts/ti_build/entry.py +++ b/.github/workflows/scripts/ti_build/entry.py @@ -2,6 +2,7 @@ # -- stdlib -- import argparse +import datetime import os import platform import subprocess @@ -20,7 +21,7 @@ from .ospkg import setup_os_pkgs from .python import get_desired_python_version, setup_python from .sccache import setup_sccache -from .tinysh import Command, CommandFailed, git +from .tinysh import Command, CommandFailed, git, nice from .vulkan import setup_vulkan @@ -30,15 +31,22 @@ def build_wheel(python: Command, pip: Command) -> None: """ Build the Taichi wheel """ + git.fetch("origin", "master", "--tags") - proj = os.environ.get("PROJECT_NAME", "taichi") proj_tags = [] extra = [] - if proj == "taichi-nightly": - proj_tags.extend(["egg_info", "--tag-date", "--tag-build=.post"]) - # Include C-API in nightly builds - cmake_args["TI_WITH_C_API"] = True + cmake_args.writeback() + wheel_tag = cmake_args.render_wheel_tag() + if misc.options.tag_local: + wheel_tag = misc.options.tag_local + + if misc.options.nightly: + os.environ["PROJECT_NAME"] = "taichi-nightly" + now = datetime.datetime.now().strftime("%Y%m%d") + proj_tags.extend(["egg_info", f"--tag-build=.post{now}+{wheel_tag}"]) + elif misc.options.tag_config or misc.options.tag_local: + proj_tags.extend(["egg_info", f"--tag-build=+{wheel_tag}"]) if platform.system() == "Linux": if is_manylinux2014(): @@ -46,11 +54,11 @@ def build_wheel(python: Command, pip: Command) -> None: else: extra.extend(["-p", "manylinux_2_27_x86_64"]) - cmake_args.writeback() python("setup.py", "clean") python("misc/make_changelog.py", "--ver", "origin/master", "--repo_dir", "./", "--save") - python("setup.py", *proj_tags, "bdist_wheel", *extra) + with nice(): + python("setup.py", *proj_tags, "bdist_wheel", *extra) 
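For reference, a minimal standalone sketch (not the actual `ti_build.entry` module) of how the new `--nightly` / `--tag-config` / `--tag-local` flags shown in `build_wheel()` above translate into `setup.py egg_info` arguments; `wheel_tag` stands in for whatever `cmake_args.render_wheel_tag()` would return, and the example value is only illustrative:

```python
# Hypothetical sketch of the tagging logic in build_wheel() above; the real code
# reads these values from misc.options and cmake_args instead of parameters.
import datetime
from typing import List, Optional


def egg_info_args(wheel_tag: str, nightly: bool, tag_config: bool,
                  tag_local: Optional[str]) -> List[str]:
    if tag_local:  # --tag-local overrides --tag-config
        wheel_tag = tag_local
    if nightly:  # taichi-nightly: date + config in the build tag
        now = datetime.datetime.now().strftime("%Y%m%d")
        return ["egg_info", f"--tag-build=.post{now}+{wheel_tag}"]
    if tag_config or tag_local:  # plain local-version tag
        return ["egg_info", f"--tag-build=+{wheel_tag}"]
    return []  # designated builds keep an untagged version


# e.g. ['egg_info', '--tag-build=.post20230427+cu.gl.llvm.vk']
print(egg_info_args("cu.gl.llvm.vk", nightly=True, tag_config=False, tag_local=None))
```

Note that the CI matrix jobs above pass `--tag-config` only for non-designated configurations (via `SHOULD_TAG_CONFIG`), so the designated build keeps a clean wheel version.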
@banner("Install Build Wheel Dependencies") @@ -59,7 +67,7 @@ def install_build_wheel_deps(python: Command, pip: Command) -> None: pip.install("-r", "requirements_dev.txt") -def setup_basic_build_env(force_vulkan=False): +def setup_basic_build_env(): u = platform.uname() if (u.system, u.machine) == ("Windows", "AMD64"): # Use MSVC on Windows @@ -70,8 +78,7 @@ def setup_basic_build_env(force_vulkan=False): setup_clang() setup_llvm() - if force_vulkan or cmake_args.get_effective("TI_WITH_VULKAN"): - setup_vulkan() + setup_vulkan() sccache = setup_sccache() @@ -89,18 +96,18 @@ def action_wheel(): handle_alternate_actions() build_wheel(python, pip) try: - sccache("--stop-server") + sccache("-s") except CommandFailed: pass def action_android(): - sccache, python, pip = setup_basic_build_env(force_vulkan=True) + sccache, python, pip = setup_basic_build_env() setup_android_ndk() handle_alternate_actions() build_android(python, pip) try: - sccache("--stop-server") + sccache("-s") except CommandFailed: pass @@ -150,6 +157,15 @@ def parse_args(): help = "Continue when encounters error." parser.add_argument("--permissive", action="store_true", default=False, help=help) + help = "Tag built wheel with TI_WITH_xxx config." + parser.add_argument("--tag-config", action="store_true", default=False, help=help) + + help = "Set a local version. Overrides --tag-config." + parser.add_argument("--tag-local", type=str, default=None, help=help) + + help = "Build nightly wheel." + parser.add_argument("--nightly", action="store_true", default=False, help=help) + options = parser.parse_args() return options diff --git a/.github/workflows/scripts/ti_build/tinysh.py b/.github/workflows/scripts/ti_build/tinysh.py index 4a88d88077b84..9a7e126095bad 100644 --- a/.github/workflows/scripts/ti_build/tinysh.py +++ b/.github/workflows/scripts/ti_build/tinysh.py @@ -218,6 +218,19 @@ def sudo(): return prefix("sudo") +def nice(): + """ + Wrap a command with sudo. + """ + if IS_WINDOWS: + from .misc import warn + + warn("nice is not yet implemented on Windows") + return with_options({}) + else: + return prefix("nice") + + sh = Command() git = sh.git # Use setup_python ! 
@@ -228,3 +241,5 @@ def sudo(): bash = sh.bash start = sh.start.bake("/wait") apt = sh.sudo.apt +powershell = Command("powershell.exe") +pwsh = Command("pwsh.exe") diff --git a/CMakeLists.txt b/CMakeLists.txt index d024821b1f5e0..b29755fbcad3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(USE_LLD "Use lld (from llvm) linker" OFF) option(USE_MOLD "Use mold (A Modern Linker)" OFF) -option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) +option(TI_WITH_BACKTRACE "Use backward-cpp to print out C++ stack trace upon failure" OFF) # wheel-tag: bt +option(TI_GENERATE_PDB "Generate Program Database (PDB) files (will make compilation uncacheable)" OFF) +option(TI_WITH_LTO "Enable Link Time Optimization (LTO) (affects Windows + MSVC for now)" OFF) # wheel-tag: lto if(LINUX OR APPLE) if (NOT IOS) @@ -205,8 +207,8 @@ endif() configure_file(taichi/common/version.h.in ${CMAKE_SOURCE_DIR}/taichi/common/version.h) configure_file(taichi/common/commit_hash.h.in ${CMAKE_SOURCE_DIR}/taichi/common/commit_hash.h) -option(TI_WITH_C_API "build taichi runtime c-api library" ON) -option(TI_WITH_STATIC_C_API "build static taichi runtime c-api library" OFF) +option(TI_WITH_C_API "build taichi runtime c-api library" ON) # wheel-tag: aot +option(TI_WITH_STATIC_C_API "build static taichi runtime c-api library" OFF) # wheel-tag: static_aot if(TI_WITH_STATIC_C_API) set(TI_WITH_C_API ${TI_WITH_STATIC_C_API}) @@ -231,7 +233,7 @@ if (TI_BUILD_RHI_EXAMPLES) endif() -option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) +option(TI_WITH_GRAPHVIZ "generate dependency graphs between targets" OFF) # wheel-tag: viz if (TI_WITH_GRAPHVIZ) set(GRAPHVIZ_GRAPH_NAME "ti_targets") add_custom_target(graphviz ALL diff --git a/c_api/src/taichi_llvm_impl.cpp b/c_api/src/taichi_llvm_impl.cpp index a8b2fb8d8c6d2..7e89211031a34 100644 --- a/c_api/src/taichi_llvm_impl.cpp +++ b/c_api/src/taichi_llvm_impl.cpp @@ -54,11 +54,11 @@ TiMemory LlvmRuntime::allocate_memory( const taichi::lang::Device::AllocParams ¶ms) { taichi::lang::LLVMRuntime *llvm_runtime = executor_->get_llvm_runtime(); taichi::lang::LlvmDevice *llvm_device = executor_->llvm_device(); - taichi::lang::DeviceAllocation devalloc = - llvm_device->allocate_memory_runtime({params, - executor_->get_runtime_jit_module(), - llvm_runtime, result_buffer}); + llvm_device->allocate_memory_runtime( + {params, executor_->get_runtime_jit_module(), llvm_runtime, + result_buffer, executor_->use_device_memory_pool()}); + return devalloc2devmem(*this, devalloc); } diff --git a/cmake/TaichiCAPITests.cmake b/cmake/TaichiCAPITests.cmake index de894cf131b54..ad743eec3cb52 100644 --- a/cmake/TaichiCAPITests.cmake +++ b/cmake/TaichiCAPITests.cmake @@ -22,7 +22,7 @@ if (WIN32) set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${C_API_TESTS_OUTPUT_DIR}) set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${C_API_TESTS_OUTPUT_DIR}) set_target_properties(${C_API_TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${C_API_TESTS_OUTPUT_DIR}) - if (MSVC) + if (MSVC AND TI_GENERATE_PDB) target_compile_options(${C_API_TESTS_NAME} PRIVATE "$<$:/Zi>") target_link_options(${C_API_TESTS_NAME} PRIVATE "$<$:/DEBUG>") target_link_options(${C_API_TESTS_NAME} PRIVATE "$<$:/OPT:REF>") diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index a03e41d9fc59e..58b9a5a3ded3d 100644 --- 
a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -25,10 +25,14 @@ if (WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -flto=thin") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -flto=thin") elseif (MSVC) - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /GL /Gy") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /GL /Gy") - set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS} /LTCG") - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /Gy") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /Gy") + if (TI_WITH_LTO) + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} /GL") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /GL") + set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /LTCG") + endif() endif() endif() @@ -39,8 +43,10 @@ if (WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus /Zc:inline /std:c++17") # Linker & object related flags set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /bigobj") - # Debugging (generate PBD files) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /Zf") + # Debugging (generate PDB files) + if (TI_GENERATE_PDB) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /Zf") + endif() # Performance and optimizations set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Oi") # C4244: conversion from 'type1' to 'type2', possible loss of data diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 8579d2d3bdacb..a3b44521f23d2 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -1,14 +1,14 @@ option(USE_STDCPP "Use -stdlib=libc++" OFF) -option(TI_WITH_LLVM "Build with LLVM backends" ON) -option(TI_WITH_METAL "Build with the Metal backend" ON) -option(TI_WITH_CUDA "Build with the CUDA backend" ON) -option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) -option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) -option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) -option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) -option(TI_WITH_DX11 "Build with the DX11 backend" OFF) -option(TI_WITH_DX12 "Build with the DX12 backend" OFF) -option(TI_WITH_GGUI "Build with GGUI" OFF) +option(TI_WITH_LLVM "Build with LLVM backends" ON) # wheel-tag: llvm +option(TI_WITH_METAL "Build with the Metal backend" ON) # wheel-tag: mtl +option(TI_WITH_CUDA "Build with the CUDA backend" ON) # wheel-tag: cu +option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) # wheel-tag: cutk +option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) # wheel-tag: amd +option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) # wheel-tag: gl +option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) # wheel-tag: vk +option(TI_WITH_DX11 "Build with the DX11 backend" OFF) # wheel-tag: dx11 +option(TI_WITH_DX12 "Build with the DX12 backend" OFF) # wheel-tag: dx12 +option(TI_WITH_GGUI "Build with GGUI" OFF) # wheel-tag: ggui # Force symbols to be 'hidden' by default so nothing is exported from the Taichi # library including the third-party dependencies. 
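The `# wheel-tag:` comments added to the `option(...)` lines above are what the extended `OPTION_RE` in `.github/workflows/scripts/ti_build/cmake.py` captures. A small illustrative sketch (standalone Python; the sample `option()` lines and the filtering by declared defaults are simplifications, since the real code also honors `-D` overrides from `TAICHI_CMAKE_ARGS`):

```python
# Illustrative only: shows how wheel-tag comments are captured and joined.
import re

# Same regex as the updated OPTION_RE in ti_build/cmake.py.
OPTION_RE = re.compile(r'option\(([A-Z0-9_]*) +"(.*?)" +(ON|OFF)\)(?: *# wheel-tag: (.*))?')

sample = '''
option(TI_WITH_CUDA "Build with the CUDA backend" ON) # wheel-tag: cu
option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) # wheel-tag: vk
option(TI_WITH_LLVM "Build with LLVM backends" ON) # wheel-tag: llvm
option(TI_WITH_GGUI "Build with GGUI" OFF) # wheel-tag: ggui
'''

# Keep tags whose option is effectively enabled (here: just the declared default).
tags = [tag for _name, _desc, default, tag in OPTION_RE.findall(sample)
        if tag and default == "ON"]
print(".".join(sorted(tags)))  # -> cu.llvm
```

`render_wheel_tag()` joins the enabled tags, sorted and dot-separated, and `build_wheel()` embeds the result in the wheel's local version via `--tag-build`.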
diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index c3d59577d15a9..63d96c3a60c11 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -56,11 +56,11 @@ if (WIN32) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${TESTS_OUTPUT_DIR}) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${TESTS_OUTPUT_DIR}) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${TESTS_OUTPUT_DIR}) - if (MSVC) - target_compile_options(${TESTS_NAME} PRIVATE "$<$:/Zi>") - target_link_options(${TESTS_NAME} PRIVATE "$<$:/DEBUG>") - target_link_options(${TESTS_NAME} PRIVATE "$<$:/OPT:REF>") - target_link_options(${TESTS_NAME} PRIVATE "$<$:/OPT:ICF>") + if (MSVC AND TI_GENERATE_PDB) + target_compile_options(${TESTS_NAME} PRIVATE "/Zi") + target_link_options(${TESTS_NAME} PRIVATE "/DEBUG") + target_link_options(${TESTS_NAME} PRIVATE "/OPT:REF") + target_link_options(${TESTS_NAME} PRIVATE "/OPT:ICF") endif() endif() target_link_libraries(${TESTS_NAME} PRIVATE taichi_core) diff --git a/misc/make_changelog.py b/misc/make_changelog.py index a7e23e593ece4..c373aa52bdf4a 100644 --- a/misc/make_changelog.py +++ b/misc/make_changelog.py @@ -112,4 +112,5 @@ def format(c): if args.save: with open("./python/taichi/CHANGELOG.md", "w", encoding="utf-8") as f: f.write(res) - print(res) + else: + print(res) diff --git a/python/taichi/aot/_export.py b/python/taichi/aot/_export.py index 0af32c5fcb938..37889a70fed91 100644 --- a/python/taichi/aot/_export.py +++ b/python/taichi/aot/_export.py @@ -23,4 +23,4 @@ def inner(f): def export(f): - export_as(f.__name__)(f) + return export_as(f.__name__)(f) diff --git a/taichi/common/one_or_more.h b/taichi/common/one_or_more.h index 472f0629b8ec8..bb7ad75a14d33 100644 --- a/taichi/common/one_or_more.h +++ b/taichi/common/one_or_more.h @@ -11,21 +11,27 @@ struct one_or_more { std::variant var; + // NOLINTNEXTLINE one_or_more(value_type const &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(value_type &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(value_type &&value) : var(std::move(value)) { } + // NOLINTNEXTLINE one_or_more(Container const &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(Container &value) : var(value) { } + // NOLINTNEXTLINE one_or_more(Container &&value) : var(std::move(value)) { } diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index ae597e7199ebd..641c9077a2576 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -140,6 +140,9 @@ FrontendWhileStmt::FrontendWhileStmt(const FrontendWhileStmt &o) void ArgLoadExpression::type_check(const CompileConfig *) { ret_type = dt; + if (is_ptr) { + ret_type = TypeFactory::get_instance().get_pointer_type(ret_type, false); + } if (!create_load) { ret_type = TypeFactory::get_instance().get_pointer_type(ret_type, false); } @@ -656,7 +659,7 @@ Stmt *make_ndarray_access(Expression::FlattenContext *ctx, auto var_stmt = flatten_lvalue(var, ctx); auto expr = var.cast(); auto external_ptr_stmt = std::make_unique( - var_stmt, index_stmts, expr->dt.get_shape(), expr->element_dim, + var_stmt, index_stmts, expr->dim, expr->dt.get_shape(), expr->element_dim, expr->is_grad); if (expr->dim == indices.size()) { // Indexing into an scalar element @@ -960,7 +963,7 @@ void AtomicOpExpression::type_check(const CompileConfig *config) { }; // Broadcast val to dest if neccessary - auto val_dtype = val->ret_type; + auto val_dtype = get_rvalue_dtype(val); auto dest_dtype = 
dest->ret_type.ptr_removed(); if (dest_dtype->is() && val_dtype->is()) { error(); @@ -973,20 +976,18 @@ void AtomicOpExpression::type_check(const CompileConfig *config) { } // Validate dtype - auto dtype = val->ret_type; - if (dtype->is()) { - dtype = dtype.get_element_type(); + if (val_dtype->is()) { + val_dtype = val_dtype.get_element_type(); } - if (!dtype->is()) { + if (!val_dtype->is()) { error(); } - if (is_quant(dest->ret_type)) { - ret_type = dest->ret_type->get_compute_type(); - } else if (dest->ret_type->is() || - dest->ret_type->is()) { - ret_type = dest->ret_type; + if (is_quant(dest_dtype)) { + ret_type = dest_dtype->get_compute_type(); + } else if (dest_dtype->is() || dest_dtype->is()) { + ret_type = dest_dtype; } else { error(); } @@ -1271,7 +1272,7 @@ void MeshIndexConversionExpression::flatten(FlattenContext *ctx) { } void ReferenceExpression::type_check(const CompileConfig *) { - ret_type = var->ret_type; + ret_type = TypeFactory::get_instance().get_pointer_type(var->ret_type); } void ReferenceExpression::flatten(FlattenContext *ctx) { @@ -1796,4 +1797,14 @@ Stmt *flatten_rvalue(Expr ptr, Expression::FlattenContext *ctx) { return ptr_stmt; } +DataType get_rvalue_dtype(Expr expr) { + if (auto argload = expr.cast()) { + if (argload->is_ptr) { + return argload->ret_type.ptr_removed(); + } + return argload->ret_type; + } + return expr->ret_type; +} + } // namespace taichi::lang diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index a873d42b8e5bd..1c4a6bb3448fa 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -1109,4 +1109,6 @@ Stmt *flatten_lvalue(Expr expr, Expression::FlattenContext *ctx); Stmt *flatten_rvalue(Expr expr, Expression::FlattenContext *ctx); +DataType get_rvalue_dtype(Expr expr); + } // namespace taichi::lang diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp index 29df5319cf31e..27a2f70d4b4aa 100644 --- a/taichi/ir/ir_builder.cpp +++ b/taichi/ir/ir_builder.cpp @@ -441,7 +441,7 @@ ExternalPtrStmt *IRBuilder::create_external_ptr( const std::vector &indices, bool is_grad) { return insert(Stmt::make_typed( - ptr, indices, std::vector(), 0, is_grad)); + ptr, indices, indices.size(), std::vector(), 0, is_grad)); } AdStackAllocaStmt *IRBuilder::create_ad_stack(const DataType &dt, diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp index 8234ddce9b868..1666a614d8e89 100644 --- a/taichi/ir/statements.cpp +++ b/taichi/ir/statements.cpp @@ -36,6 +36,7 @@ ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, bool is_grad) : base_ptr(base_ptr), indices(indices), is_grad(is_grad) { + ndim = indices.size(); TI_ASSERT(base_ptr != nullptr); TI_ASSERT(base_ptr->is()); TI_STMT_REG_FIELDS; @@ -43,12 +44,14 @@ ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, ExternalPtrStmt::ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, + int ndim, const std::vector &element_shape, int element_dim, bool is_grad) : ExternalPtrStmt(base_ptr, indices, is_grad) { this->element_shape = element_shape; this->element_dim = element_dim; + this->ndim = ndim; } GlobalPtrStmt::GlobalPtrStmt(SNode *snode, diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 04fe2175f1838..f3bfd718207e9 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -333,7 +333,13 @@ class AtomicOpStmt : public Stmt, class ExternalPtrStmt : public Stmt { public: Stmt *base_ptr; + std::vector indices; + + // Number of dimensions of external shape + int ndim; + + // Shape of element type std::vector element_shape; 
// AOS: element_dim < 0 // SOA: element_dim > 0 @@ -352,6 +358,7 @@ class ExternalPtrStmt : public Stmt { ExternalPtrStmt(Stmt *base_ptr, const std::vector &indices, + int ndim, const std::vector &element_shape, int element_dim, bool is_grad = false); diff --git a/taichi/rhi/CMakeLists.txt b/taichi/rhi/CMakeLists.txt index 0e26f5579f500..4b88b7d974c0d 100644 --- a/taichi/rhi/CMakeLists.txt +++ b/taichi/rhi/CMakeLists.txt @@ -113,3 +113,8 @@ target_link_libraries(${TAICHI_DEVICE_API} PUBLIC common_rhi) # Generate shared library add_library(ti_device_api_shared SHARED public_device.h) target_link_libraries(ti_device_api_shared PUBLIC ${TAICHI_DEVICE_API}) + +# When building targets on Windows using Ninja + MSVC, the linker requires at least 1 object file +# to work properly, else link.exe would complain about LNK4001 warning, and fail afterwards. +# Adding a dummy file to workaround this. +target_sources(ti_device_api_shared PRIVATE dummy.cpp) diff --git a/taichi/rhi/amdgpu/amdgpu_context.h b/taichi/rhi/amdgpu/amdgpu_context.h index affef71aa2ddd..6688eb91a5667 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.h +++ b/taichi/rhi/amdgpu/amdgpu_context.h @@ -94,7 +94,7 @@ class AMDGPUContext { void *new_ctx_; public: - ContextGuard(AMDGPUContext *new_ctx) + explicit ContextGuard(AMDGPUContext *new_ctx) : old_ctx_(nullptr), new_ctx_(new_ctx) { AMDGPUDriver::get_instance().context_get_current(&old_ctx_); if (old_ctx_ != new_ctx) diff --git a/taichi/rhi/amdgpu/amdgpu_device.cpp b/taichi/rhi/amdgpu/amdgpu_device.cpp index ee86b03665dc3..ac786a4cb0ffb 100644 --- a/taichi/rhi/amdgpu/amdgpu_device.cpp +++ b/taichi/rhi/amdgpu/amdgpu_device.cpp @@ -106,8 +106,8 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { false); } else if (!info.use_preallocated) { DeviceMemoryPool::get_instance().release(info.size, info.ptr); - info.ptr = nullptr; } + info.ptr = nullptr; } RhiResult AmdgpuDevice::map(DeviceAllocation alloc, void **mapped_ptr) { diff --git a/taichi/rhi/cuda/cuda_device.cpp b/taichi/rhi/cuda/cuda_device.cpp index fe5d296071672..25d24ac8e8e5a 100644 --- a/taichi/rhi/cuda/cuda_device.cpp +++ b/taichi/rhi/cuda/cuda_device.cpp @@ -53,17 +53,17 @@ DeviceAllocation CudaDevice::allocate_memory_runtime( info.size = taichi::iroundup(params.size, taichi_page_size); if (info.size == 0) { info.ptr = nullptr; + } else if (params.use_memory_pool) { + CUDADriver::get_instance().malloc_async((void **)&info.ptr, info.size, + nullptr); } else { info.ptr = DeviceMemoryPool::get_instance().allocate_with_cache(this, params); - - TI_ASSERT(info.ptr != nullptr); - - CUDADriver::get_instance().memset((void *)info.ptr, 0, info.size); } info.is_imported = false; info.use_cached = true; info.use_preallocated = true; + info.use_memory_pool = params.use_memory_pool; DeviceAllocation alloc; alloc.alloc_id = allocations_.size(); @@ -92,6 +92,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) { validate_device_alloc(handle); AllocInfo &info = allocations_[handle.alloc_id]; + if (info.size == 0) { return; } @@ -99,14 +100,16 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) { TI_ERROR("the DeviceAllocation is already deallocated"); } TI_ASSERT(!info.is_imported); - if (info.use_cached) { + if (info.use_memory_pool) { + CUDADriver::get_instance().mem_free_async(info.ptr, nullptr); + } else if (info.use_cached) { DeviceMemoryPool::get_instance().release(info.size, (uint64_t *)info.ptr, false); } else if (!info.use_preallocated) { auto &mem_pool = DeviceMemoryPool::get_instance(); 
mem_pool.release(info.size, info.ptr, true /*release_raw*/); - info.ptr = nullptr; } + info.ptr = nullptr; } RhiResult CudaDevice::upload_data(DevicePtr *device_ptr, diff --git a/taichi/rhi/cuda/cuda_device.h b/taichi/rhi/cuda/cuda_device.h index 0e06174552331..3a94209084a13 100644 --- a/taichi/rhi/cuda/cuda_device.h +++ b/taichi/rhi/cuda/cuda_device.h @@ -77,6 +77,7 @@ class CudaDevice : public LlvmDevice { * */ bool use_preallocated{true}; bool use_cached{false}; + bool use_memory_pool{false}; void *mapped{nullptr}; }; diff --git a/taichi/rhi/dummy.cpp b/taichi/rhi/dummy.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/taichi/rhi/llvm/device_memory_pool.h b/taichi/rhi/llvm/device_memory_pool.h index 0ccb5ae77f338..f5081defb2c57 100644 --- a/taichi/rhi/llvm/device_memory_pool.h +++ b/taichi/rhi/llvm/device_memory_pool.h @@ -24,7 +24,7 @@ class TI_DLL_EXPORT DeviceMemoryPool { void *allocate(std::size_t size, std::size_t alignment, bool managed = false); void release(std::size_t size, void *ptr, bool release_raw = false); void reset(); - DeviceMemoryPool(bool merge_upon_release); + explicit DeviceMemoryPool(bool merge_upon_release); ~DeviceMemoryPool(); protected: diff --git a/taichi/rhi/llvm/llvm_device.h b/taichi/rhi/llvm/llvm_device.h index cbefaaeb60327..4279ba1765a7a 100644 --- a/taichi/rhi/llvm/llvm_device.h +++ b/taichi/rhi/llvm/llvm_device.h @@ -13,6 +13,7 @@ class LlvmDevice : public Device { JITModule *runtime_jit{nullptr}; LLVMRuntime *runtime{nullptr}; uint64 *result_buffer{nullptr}; + bool use_memory_pool{false}; }; Arch arch() const override { diff --git a/taichi/rhi/opengl/opengl_device.cpp b/taichi/rhi/opengl/opengl_device.cpp index e2e65e25f2a28..cae2caeb4d0ea 100644 --- a/taichi/rhi/opengl/opengl_device.cpp +++ b/taichi/rhi/opengl/opengl_device.cpp @@ -644,9 +644,18 @@ RhiResult GLDevice::create_pipeline(Pipeline **out_pipeline, PipelineCache *cache) noexcept { try { *out_pipeline = new GLPipeline(src, name); - } catch (std::bad_alloc &) { + } catch (std::bad_alloc &e) { *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); return RhiResult::out_of_memory; + } catch (std::invalid_argument &e) { + *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); + return RhiResult::invalid_usage; + } catch (std::runtime_error &e) { + *out_pipeline = nullptr; + RHI_LOG_ERROR(e.what()); + return RhiResult::error; } return RhiResult::success; } diff --git a/taichi/rhi/opengl/opengl_device.h b/taichi/rhi/opengl/opengl_device.h index 514ec98b13426..3c5008cee2199 100644 --- a/taichi/rhi/opengl/opengl_device.h +++ b/taichi/rhi/opengl/opengl_device.h @@ -29,7 +29,7 @@ extern void *kGetOpenglProcAddr; class GLResourceSet : public ShaderResourceSet { public: GLResourceSet() = default; - explicit GLResourceSet(const GLResourceSet &other) = default; + GLResourceSet(const GLResourceSet &other) = default; ~GLResourceSet() override; diff --git a/taichi/rhi/vulkan/vulkan_device.h b/taichi/rhi/vulkan/vulkan_device.h index 1e7a4fd8027e1..71b33ecdf6b6e 100644 --- a/taichi/rhi/vulkan/vulkan_device.h +++ b/taichi/rhi/vulkan/vulkan_device.h @@ -239,7 +239,7 @@ class VulkanResourceSet : public ShaderResourceSet { class VulkanRasterResources : public RasterResources { public: - VulkanRasterResources(VulkanDevice *device) : device_(device) { + explicit VulkanRasterResources(VulkanDevice *device) : device_(device) { } struct BufferBinding { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 6c636f833f299..3adada586f5df 100644 --- 
a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -46,6 +46,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, config.arch = host_arch(); } else { // CUDA runtime created successfully + use_device_memory_pool_ = CUDAContext::get_instance().supports_mem_pool(); } #else TI_WARN("Taichi is not compiled with CUDA."); @@ -398,6 +399,20 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( const int tree_id = field_cache_data.tree_id; const int root_id = field_cache_data.root_id; + bool all_dense = config_.demote_dense_struct_fors; + for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i].type != SNodeType::dense && + snode_metas[i].type != SNodeType::place && + snode_metas[i].type != SNodeType::root) { + all_dense = false; + break; + } + } + + if (config_.arch == Arch::cuda && use_device_memory_pool() && !all_dense) { + preallocate_runtime_memory(); + } + TI_TRACE("Allocating data structure of size {} bytes", root_size); std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); @@ -424,16 +439,6 @@ void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes( snode_tree_allocs_[tree_id] = alloc; - bool all_dense = config_.demote_dense_struct_fors; - for (size_t i = 0; i < snode_metas.size(); i++) { - if (snode_metas[i].type != SNodeType::dense && - snode_metas[i].type != SNodeType::place && - snode_metas[i].type != SNodeType::root) { - all_dense = false; - break; - } - } - runtime_jit->call( "runtime_initialize_snodes", llvm_runtime_, root_size, root_id, (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense); @@ -471,16 +476,25 @@ LlvmDevice *LlvmRuntimeExecutor::llvm_device() { DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray( std::size_t alloc_size, uint64 *result_buffer) { - return llvm_device()->allocate_memory_runtime( + auto devalloc = llvm_device()->allocate_memory_runtime( {{alloc_size, /*host_write=*/false, /*host_read=*/false, /*export_sharing=*/false, AllocUsage::Storage}, get_runtime_jit_module(), get_llvm_runtime(), - result_buffer}); + result_buffer, + use_device_memory_pool()}); + + TI_ASSERT(allocated_runtime_memory_allocs_.find(devalloc.alloc_id) == + allocated_runtime_memory_allocs_.end()); + allocated_runtime_memory_allocs_[devalloc.alloc_id] = devalloc; + return devalloc; } void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) { + TI_ASSERT(allocated_runtime_memory_allocs_.find(handle.alloc_id) != + allocated_runtime_memory_allocs_.end()); llvm_device()->dealloc_memory(handle); + allocated_runtime_memory_allocs_.erase(handle.alloc_id); } void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc, @@ -534,13 +548,32 @@ uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr( void LlvmRuntimeExecutor::finalize() { profiler_ = nullptr; - for (auto &preallocated_device_buffer_alloc : - preallocated_device_buffer_allocs_) { - if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { - llvm_device()->dealloc_memory(preallocated_device_buffer_alloc); - llvm_device()->clear(); - DeviceMemoryPool::get_instance().reset(); + if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { + preallocated_runtime_objects_allocs_.reset(); + preallocated_runtime_memory_allocs_.reset(); + + // Reset runtime memory + auto allocated_runtime_memory_allocs_copy = + allocated_runtime_memory_allocs_; + for (auto &iter : allocated_runtime_memory_allocs_copy) { + // The runtime allocation may have already been freed upon 
explicit + // Ndarray/Field destruction Check if the allocation still alive + void *ptr = llvm_device()->get_memory_addr(iter.second); + if (ptr == nullptr) + continue; + + deallocate_memory_ndarray(iter.second); } + allocated_runtime_memory_allocs_.clear(); + + // Reset device + llvm_device()->clear(); + + // Reset memory pool + DeviceMemoryPool::get_instance().reset(); + + // Release unused memory from cuda memory pool + synchronize(); } finalized_ = true; } @@ -551,7 +584,9 @@ LlvmRuntimeExecutor::~LlvmRuntimeExecutor() { } } -void *LlvmRuntimeExecutor::preallocate_memory(std::size_t prealloc_size) { +void *LlvmRuntimeExecutor::preallocate_memory( + std::size_t prealloc_size, + DeviceAllocationUnique &devalloc) { DeviceAllocation preallocated_device_buffer_alloc; Device::AllocParams preallocated_device_buffer_alloc_params; @@ -563,11 +598,38 @@ void *LlvmRuntimeExecutor::preallocate_memory(std::size_t prealloc_size) { void *preallocated_device_buffer = llvm_device()->get_memory_addr(preallocated_device_buffer_alloc); - preallocated_device_buffer_allocs_.emplace_back( + devalloc = std::make_unique( std::move(preallocated_device_buffer_alloc)); return preallocated_device_buffer; } +void LlvmRuntimeExecutor::preallocate_runtime_memory() { + if (preallocated_runtime_memory_allocs_ != nullptr) + return; + + std::size_t total_prealloc_size = 0; + const auto total_mem = llvm_device()->get_total_memory(); + if (config_.device_memory_fraction == 0) { + TI_ASSERT(config_.device_memory_GB > 0); + total_prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); + } else { + total_prealloc_size = + std::size_t(config_.device_memory_fraction * total_mem); + } + TI_ASSERT(total_prealloc_size <= total_mem); + + void *runtime_memory_prealloc_buffer = preallocate_memory( + total_prealloc_size, preallocated_runtime_memory_allocs_); + + TI_TRACE("Allocating device memory {:.2f} MB", + 1.0 * total_prealloc_size / (1UL << 20)); + + auto *const runtime_jit = get_runtime_jit_module(); + runtime_jit->call( + "runtime_initialize_memory", llvm_runtime_, total_prealloc_size, + runtime_memory_prealloc_buffer); +} + void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, uint64 **result_buffer_ptr) { // The result buffer allocated here is only used for the launches of @@ -576,14 +638,14 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, // CUDA and AMDGPU backends. 
// | ==================preallocated device buffer ========================== | // |<- reserved for return ->|<---- usable for allocators on the device ---->| - std::size_t runtime_objects_prealloc_size = 0; void *runtime_objects_prealloc_buffer = nullptr; if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { #if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU) + runtime_objects_prealloc_size = 60 * (1UL << 20); // 50 MB - runtime_objects_prealloc_buffer = - preallocate_memory(runtime_objects_prealloc_size); + runtime_objects_prealloc_buffer = preallocate_memory( + runtime_objects_prealloc_size, preallocated_runtime_objects_allocs_); TI_TRACE("Allocating device memory {:.2f} MB", 1.0 * runtime_objects_prealloc_size / (1UL << 20)); @@ -639,31 +701,9 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler, // Preallocate for runtime memory and update to LLVMRuntime if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) { - std::size_t total_prealloc_size = 0; - const auto total_mem = llvm_device()->get_total_memory(); - if (config_.device_memory_fraction == 0) { - TI_ASSERT(config_.device_memory_GB > 0); - total_prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30)); - } else { - total_prealloc_size = - std::size_t(config_.device_memory_fraction * total_mem); + if (!use_device_memory_pool()) { + preallocate_runtime_memory(); } - TI_ASSERT(total_prealloc_size <= total_mem); - - auto runtime_memory_prealloc_size = - total_prealloc_size > runtime_objects_prealloc_size - ? total_prealloc_size - runtime_objects_prealloc_size - : 0; - - void *runtime_memory_prealloc_buffer = - preallocate_memory(runtime_memory_prealloc_size); - - TI_TRACE("Allocating device memory {:.2f} MB", - 1.0 * runtime_memory_prealloc_size / (1UL << 20)); - - runtime_jit->call( - "runtime_initialize_memory", llvm_runtime_, - runtime_memory_prealloc_size, runtime_memory_prealloc_buffer); } if (config_.arch == Arch::cuda) { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h index 6e487419a28a5..871e394ad8b50 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.h +++ b/taichi/runtime/llvm/llvm_runtime_executor.h @@ -75,6 +75,10 @@ class LlvmRuntimeExecutor { void synchronize(); + bool use_device_memory_pool() { + return use_device_memory_pool_; + } + private: /* ----------------------- */ /* ------ Allocation ----- */ @@ -96,7 +100,9 @@ class LlvmRuntimeExecutor { std::size_t size, uint32_t data); - void *preallocate_memory(std::size_t prealloc_size); + void *preallocate_memory(std::size_t prealloc_size, + DeviceAllocationUnique &devalloc); + void preallocate_runtime_memory(); /* ------------------------- */ /* ---- Runtime Helpers ---- */ @@ -144,12 +150,16 @@ class LlvmRuntimeExecutor { std::unique_ptr snode_tree_buffer_manager_{nullptr}; std::unordered_map snode_tree_allocs_; - std::vector preallocated_device_buffer_allocs_; + DeviceAllocationUnique preallocated_runtime_objects_allocs_ = nullptr; + DeviceAllocationUnique preallocated_runtime_memory_allocs_ = nullptr; + std::unordered_map + allocated_runtime_memory_allocs_; // good buddy friend LlvmProgramImpl; friend SNodeTreeBufferManager; + bool use_device_memory_pool_ = false; bool finalized_{false}; KernelProfilerBase *profiler_ = nullptr; }; diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp index 67a4b08eb29e3..d15c90fb1f7fa 100644 --- a/taichi/transforms/auto_diff.cpp +++ b/taichi/transforms/auto_diff.cpp @@ -1518,9 +1518,10 @@ class 
MakeAdjoint : public ADTransform { "Cannot automatically differentiate through a grad " "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); - auto adj_ptr = insert( - src->base_ptr, src->indices, src->element_shape, src->element_dim, - /*is_grad=*/true); + auto adj_ptr = + insert(src->base_ptr, src->indices, src->ndim, + src->element_shape, src->element_dim, + /*is_grad=*/true); adj_ptr->ret_type = src->ret_type; if (is_ptr_offset) { @@ -1592,9 +1593,10 @@ class MakeAdjoint : public ADTransform { "Cannot automatically differentiate through a grad " "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); - adjoint_ptr = insert( - dest->base_ptr, dest->indices, dest->element_shape, dest->element_dim, - /*is_grad=*/true); + adjoint_ptr = + insert(dest->base_ptr, dest->indices, dest->ndim, + dest->element_shape, dest->element_dim, + /*is_grad=*/true); adjoint_ptr->ret_type = dest->ret_type; if (is_ptr_offset) { @@ -1659,7 +1661,7 @@ class MakeAdjoint : public ADTransform { "tensor, if you really want to do that, pass the grad " "tensor into the kernel directly"); auto adjoint_ptr = - insert(dest->base_ptr, dest->indices, + insert(dest->base_ptr, dest->indices, dest->ndim, dest->element_shape, dest->element_dim, /*is_grad=*/true); adjoint_ptr->ret_type = dest->ret_type; diff --git a/taichi/transforms/cache_loop_invariant_global_vars.cpp b/taichi/transforms/cache_loop_invariant_global_vars.cpp index 3966bd4d9d181..3ba7cc886d3af 100644 --- a/taichi/transforms/cache_loop_invariant_global_vars.cpp +++ b/taichi/transforms/cache_loop_invariant_global_vars.cpp @@ -54,7 +54,20 @@ class CacheLoopInvariantGlobalVars : public LoopInvariantDetector { if (current_offloaded->task_type == OffloadedTaskType::serial) { return true; } - if (auto global_ptr = stmt->cast()) { + + // Handle GlobalPtrStmt + bool is_global_ptr_stmt = false; + GlobalPtrStmt *global_ptr = nullptr; + if (stmt->is()) { + is_global_ptr_stmt = true; + global_ptr = stmt->as(); + } else if (stmt->is() && + stmt->as()->origin->is()) { + is_global_ptr_stmt = true; + global_ptr = stmt->as()->origin->as(); + } + + if (global_ptr) { auto snode = global_ptr->snode; if (loop_unique_ptr_[snode] == nullptr || loop_unique_ptr_[snode]->indices.empty()) { @@ -69,8 +82,21 @@ class CacheLoopInvariantGlobalVars : public LoopInvariantDetector { return false; } return true; - } else if (stmt->is()) { - ExternalPtrStmt *dest_ptr = stmt->as(); + } + + // Handle ExternalPtrStmt + bool is_external_ptr_stmt = false; + ExternalPtrStmt *dest_ptr = nullptr; + if (stmt->is()) { + is_external_ptr_stmt = true; + dest_ptr = stmt->as(); + } else if (stmt->is() && + stmt->as()->origin->is()) { + is_external_ptr_stmt = true; + dest_ptr = stmt->as()->origin->as(); + } + + if (is_external_ptr_stmt) { if (dest_ptr->indices.empty()) { return false; } diff --git a/taichi/transforms/check_out_of_bound.cpp b/taichi/transforms/check_out_of_bound.cpp index 4bd5dd51e51af..79acfff34a754 100644 --- a/taichi/transforms/check_out_of_bound.cpp +++ b/taichi/transforms/check_out_of_bound.cpp @@ -59,17 +59,22 @@ class CheckOutOfBound : public BasicStmtVisitor { auto check_lower_bound = new_stmts.push_back( BinaryOpType::cmp_ge, stmt->indices[i], lower_bound); Stmt *upper_bound{nullptr}; - // TODO: Simplify logic here since SOA layout for ndarray is deprecated - if ((stmt->element_dim < 0 && i == (stmt->indices.size() - 1)) || - (stmt->element_dim > 0 && i == 0)) { - upper_bound = - 
-            new_stmts.push_back<ConstStmt>(TypedConstant(flattened_element));
-      } else {
-        auto axis = stmt->element_dim <= 0 ? i : (i - stmt->element_dim);
+
+      // SOA layout for ndarray is deprecated, assert it's AOS layout
+      TI_ASSERT(stmt->element_dim <= 0);
+      auto ndim = stmt->ndim;
+      if (i < ndim) {
+        // Check for External Shape
+        auto axis = i;
         upper_bound = new_stmts.push_back<ExternalTensorShapeAlongAxisStmt>(
             /*axis=*/axis,
             /*arg_id=*/stmt->base_ptr->as<ArgLoadStmt>()->arg_id);
+      } else {
+        // Check for Element Shape
+        upper_bound =
+            new_stmts.push_back<ConstStmt>(TypedConstant(flattened_element));
       }
+
       auto check_upper_bound = new_stmts.push_back<BinaryOpStmt>(
           BinaryOpType::cmp_lt, stmt->indices[i], upper_bound);
       auto check_i = new_stmts.push_back<BinaryOpStmt>(
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 14fb925dd8414..f162f5975274f 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -183,14 +183,6 @@ void offload_to_executable(IRNode *ir,
   print("Atomics demoted I");
   irpass::analysis::verify(ir);
 
-  if (config.real_matrix_scalarize) {
-    irpass::scalarize(ir);
-
-    // Remove redundant MatrixInitStmt inserted during scalarization
-    irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false});
-    print("Scalarized");
-  }
-
   if (config.cache_loop_invariant_global_vars) {
     irpass::cache_loop_invariant_global_vars(ir, config);
     print("Cache loop-invariant global vars");
@@ -218,6 +210,14 @@ void offload_to_executable(IRNode *ir,
     irpass::analysis::verify(ir);
   }
 
+  if (config.real_matrix_scalarize) {
+    irpass::scalarize(ir);
+
+    // Remove redundant MatrixInitStmt inserted during scalarization
+    irpass::full_simplify(ir, config, {false, /*autodiff_enabled*/ false});
+    print("Scalarized");
+  }
+
   if (make_thread_local) {
     irpass::make_thread_local(ir, config);
     print("Make thread local");
diff --git a/taichi/transforms/make_cpu_multithreaded_range_for.cpp b/taichi/transforms/make_cpu_multithreaded_range_for.cpp
index 8ada2a9c70c88..8528b0ba1685a 100644
--- a/taichi/transforms/make_cpu_multithreaded_range_for.cpp
+++ b/taichi/transforms/make_cpu_multithreaded_range_for.cpp
@@ -48,7 +48,8 @@ using TaskType = OffloadedStmt::TaskType;
 
 class MakeCPUMultithreadedRangeFor : public BasicStmtVisitor {
  public:
-  MakeCPUMultithreadedRangeFor(const CompileConfig &config) : config(config) {
+  explicit MakeCPUMultithreadedRangeFor(const CompileConfig &config)
+      : config(config) {
   }
 
   void visit(Block *block) override {
diff --git a/taichi/transforms/scalarize.cpp b/taichi/transforms/scalarize.cpp
index 69073000b5329..ba426be5baa2c 100644
--- a/taichi/transforms/scalarize.cpp
+++ b/taichi/transforms/scalarize.cpp
@@ -608,10 +608,14 @@ class Scalarize : public BasicStmtVisitor {
   }
 
   void visit(ArgLoadStmt *stmt) override {
+    if (!stmt->ret_type.is_pointer()) {
+      return;
+    }
     if (stmt->ret_type.ptr_removed()->is()) {
       return;
     }
 
     auto ret_type = stmt->ret_type.ptr_removed().get_element_type();
+    ret_type = TypeFactory::get_instance().get_pointer_type(ret_type);
     auto arg_load = std::make_unique<ArgLoadStmt>(
         stmt->arg_id, ret_type, stmt->is_ptr, stmt->create_load);
@@ -1119,9 +1123,9 @@ class MergeExternalAndMatrixPtr : public BasicStmtVisitor {
           std::accumulate(begin(origin->element_shape),
                           end(origin->element_shape), 1,
                           std::multiplies<>())};
-      auto fused = std::make_unique<ExternalPtrStmt>(origin->base_ptr, indices,
-                                                     element_shape, element_dim,
-                                                     origin->is_grad);
+      auto fused = std::make_unique<ExternalPtrStmt>(
+          origin->base_ptr, indices, origin->ndim, element_shape, element_dim,
+          origin->is_grad);
       fused->ret_type = stmt->ret_type;
       // Note: Update base_ptr's ret_type so that it matches the ExternalPtrStmt
       // with flattened indices. Main goal is to keep all the hacks in a single
diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp
index 8d59faf8cc30e..e979f11d3b5e5 100644
--- a/taichi/transforms/type_check.cpp
+++ b/taichi/transforms/type_check.cpp
@@ -431,12 +431,7 @@ class TypeCheck : public IRVisitor {
   }
 
   void visit(ArgLoadStmt *stmt) override {
-    // TODO: Maybe have a type_inference() pass, which takes in the args/rets
-    // defined by the kernel. After that, type_check() pass will purely do
-    // verification, without modifying any types.
-    if (stmt->is_ptr) {
-      stmt->ret_type.set_is_pointer(true);
-    }
+    // Do nothing
   }
 
   void visit(ReturnStmt *stmt) override {
diff --git a/taichi/transforms/vectorize_half2.cpp b/taichi/transforms/vectorize_half2.cpp
index 3a9358937e1a6..d159092357b66 100644
--- a/taichi/transforms/vectorize_half2.cpp
+++ b/taichi/transforms/vectorize_half2.cpp
@@ -355,7 +355,8 @@ class Half2Vectorize : public BasicStmtVisitor {
       std::vector<int> element_shape = {2};
       int element_dim = -1;
       auto new_extern_stmt = std::make_unique<ExternalPtrStmt>(
-          self_ptr, new_indices, element_shape, element_dim);
+          self_ptr, new_indices, self_extern_stmt->ndim, element_shape,
+          element_dim);
       new_extern_stmt->overrided_dtype = true;
       new_extern_stmt->ret_type = tensor_type;
       new_extern_stmt->ret_type.set_is_pointer(true);
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index 476b9df320222..8f04ed2ad6ba8 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -6,6 +6,7 @@
 from taichi.lang.exception import TaichiIndexError, TaichiTypeError
 from taichi.lang.misc import get_host_arch_list
 from taichi.lang.util import has_pytorch
+from taichi.math import vec3, ivec3
 
 import taichi as ti
 from tests import test_utils
@@ -779,10 +780,23 @@ def test_matrix_ndarray_oob():
     def access_arr(input: ti.types.ndarray(), p: ti.i32, q: ti.i32, x: ti.i32,
                    y: ti.i32) -> ti.f32:
         return input[p, q][x, y]
 
+    @ti.kernel
+    def valid_access(indices: ti.types.ndarray(dtype=ivec3, ndim=1),
+                     dummy: ti.types.ndarray(dtype=ivec3, ndim=1)):
+        for i in indices:
+            index_vec = ti.Vector([0, 0, 0])
+            for j in ti.static(range(3)):
+                index = indices[i][j]
+                index_vec[j] = index
+            dummy[i] = index_vec
+
     input = ti.ndarray(dtype=ti.math.mat2, shape=(4, 5))
+    indices = ti.ndarray(dtype=ivec3, shape=(10))
+    dummy = ti.ndarray(dtype=ivec3, shape=(10))
+
     # Works
     access_arr(input, 2, 3, 0, 1)
+    valid_access(indices, dummy)
 
     # element_shape
     with pytest.raises(AssertionError, match=r"Out of bound access"):