diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml index 1c4c1717..34ff6e86 100755 --- a/.azure-pipelines/azure-pipelines-linux.yml +++ b/.azure-pipelines/azure-pipelines-linux.yml @@ -11,28 +11,27 @@ jobs: linux_64_cuda_compiler_version10.0: CONFIG: linux_64_cuda_compiler_version10.0 UPLOAD_PACKAGES: 'True' - DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.0 + DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.0 linux_64_cuda_compiler_version10.1: CONFIG: linux_64_cuda_compiler_version10.1 UPLOAD_PACKAGES: 'True' - DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.1 + DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.1 linux_64_cuda_compiler_version10.2: CONFIG: linux_64_cuda_compiler_version10.2 UPLOAD_PACKAGES: 'True' - DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.2 + DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.2 linux_64_cuda_compiler_version11.0: CONFIG: linux_64_cuda_compiler_version11.0 UPLOAD_PACKAGES: 'True' - DOCKER_IMAGE: condaforge/linux-anvil-cuda:11.0 + DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:11.0 linux_64_cuda_compiler_version9.2: CONFIG: linux_64_cuda_compiler_version9.2 UPLOAD_PACKAGES: 'True' - DOCKER_IMAGE: condaforge/linux-anvil-cuda:9.2 + DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:9.2 linux_64_cuda_compiler_versionNone: CONFIG: linux_64_cuda_compiler_versionNone UPLOAD_PACKAGES: 'True' - DOCKER_IMAGE: condaforge/linux-anvil-comp7 - maxParallel: 8 + DOCKER_IMAGE: quay.io/condaforge/linux-anvil-comp7 timeoutInMinutes: 360 steps: diff --git a/.azure-pipelines/azure-pipelines-osx.yml b/.azure-pipelines/azure-pipelines-osx.yml index 568ec85f..0c892401 100755 --- a/.azure-pipelines/azure-pipelines-osx.yml +++ b/.azure-pipelines/azure-pipelines-osx.yml @@ -11,7 +11,6 @@ jobs: osx_64_: CONFIG: osx_64_ UPLOAD_PACKAGES: 'True' - maxParallel: 8 timeoutInMinutes: 360 steps: diff --git a/.azure-pipelines/azure-pipelines-win.yml b/.azure-pipelines/azure-pipelines-win.yml new file mode 100755 index 00000000..1fd8ea12 --- /dev/null +++ b/.azure-pipelines/azure-pipelines-win.yml @@ -0,0 +1,110 @@ +# This file was generated automatically from conda-smithy. To update this configuration, +# update the conda-forge.yml and/or the recipe/meta.yaml. +# -*- mode: yaml -*- + +jobs: +- job: win + pool: + vmImage: vs2017-win2016 + strategy: + matrix: + win_64_: + CONFIG: win_64_ + UPLOAD_PACKAGES: 'True' + timeoutInMinutes: 360 + variables: + CONDA_BLD_PATH: D:\\bld\\ + + steps: + - script: | + choco install vcpython27 -fdv -y --debug + condition: contains(variables['CONFIG'], 'vs2008') + displayName: Install vcpython27.msi (if needed) + + # Cygwin's git breaks conda-build. (See https://github.com/conda-forge/conda-smithy-feedstock/pull/2.) + # - script: rmdir C:\cygwin /s /q + # continueOnError: true + + - powershell: | + Set-PSDebug -Trace 1 + + $batchcontent = @" + ECHO ON + SET vcpython=C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0 + + DIR "%vcpython%" + + CALL "%vcpython%\vcvarsall.bat" %* + "@ + + $batchDir = "C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0\VC" + $batchPath = "$batchDir" + "\vcvarsall.bat" + New-Item -Path $batchPath -ItemType "file" -Force + + Set-Content -Value $batchcontent -Path $batchPath + + Get-ChildItem -Path $batchDir + + Get-ChildItem -Path ($batchDir + '\..') + + condition: contains(variables['CONFIG'], 'vs2008') + displayName: Patch vs2008 (if needed) + + - task: CondaEnvironment@1 + inputs: + packageSpecs: 'python=3.6 conda-build conda "conda-forge-ci-setup=3" pip' # Optional + installOptions: "-c conda-forge" + updateConda: true + displayName: Install conda-build and activate environment + + - script: set PYTHONUNBUFFERED=1 + displayName: Set PYTHONUNBUFFERED + + # Configure the VM + - script: | + call activate base + setup_conda_rc .\ ".\recipe" .\.ci_support\%CONFIG%.yaml + displayName: conda-forge CI setup + + # Configure the VM. + - script: | + set "CI=azure" + call activate base + run_conda_forge_build_setup + displayName: conda-forge build setup + + + # Special cased version setting some more things! + - script: | + call activate base + conda.exe build "recipe" -m .ci_support\%CONFIG%.yaml + displayName: Build recipe (vs2008) + env: + VS90COMNTOOLS: "C:\\Program Files (x86)\\Common Files\\Microsoft\\Visual C++ for Python\\9.0\\VC\\bin" + PYTHONUNBUFFERED: 1 + condition: contains(variables['CONFIG'], 'vs2008') + + - script: | + call activate base + conda.exe build "recipe" -m .ci_support\%CONFIG%.yaml + displayName: Build recipe + env: + PYTHONUNBUFFERED: 1 + condition: not(contains(variables['CONFIG'], 'vs2008')) + - script: | + set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%" + call activate base + validate_recipe_outputs "%FEEDSTOCK_NAME%" + displayName: Validate Recipe Outputs + + - script: | + set "GIT_BRANCH=%BUILD_SOURCEBRANCHNAME%" + set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%" + call activate base + upload_package --validate --feedstock-name="%FEEDSTOCK_NAME%" .\ ".\recipe" .ci_support\%CONFIG%.yaml + displayName: Upload package + env: + BINSTAR_TOKEN: $(BINSTAR_TOKEN) + FEEDSTOCK_TOKEN: $(FEEDSTOCK_TOKEN) + STAGING_BINSTAR_TOKEN: $(STAGING_BINSTAR_TOKEN) + condition: and(succeeded(), not(eq(variables['UPLOAD_PACKAGES'], 'False'))) \ No newline at end of file diff --git a/.ci_support/linux_64_cuda_compiler_version10.0.yaml b/.ci_support/linux_64_cuda_compiler_version10.0.yaml index 507da21a..a2227db8 100644 --- a/.ci_support/linux_64_cuda_compiler_version10.0.yaml +++ b/.ci_support/linux_64_cuda_compiler_version10.0.yaml @@ -1,3 +1,5 @@ +cdt_name: +- cos6 channel_sources: - conda-forge,defaults channel_targets: @@ -11,11 +13,16 @@ cxx_compiler: cxx_compiler_version: - '7' docker_image: -- condaforge/linux-anvil-cuda:10.0 +- quay.io/condaforge/linux-anvil-cuda:10.0 libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,8 +31,11 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - linux-64 zip_keys: - - cuda_compiler_version - docker_image +- - python + - numpy diff --git a/.ci_support/linux_64_cuda_compiler_version10.1.yaml b/.ci_support/linux_64_cuda_compiler_version10.1.yaml index d9dfec45..cfb515bb 100644 --- a/.ci_support/linux_64_cuda_compiler_version10.1.yaml +++ b/.ci_support/linux_64_cuda_compiler_version10.1.yaml @@ -1,3 +1,5 @@ +cdt_name: +- cos6 channel_sources: - conda-forge,defaults channel_targets: @@ -11,11 +13,16 @@ cxx_compiler: cxx_compiler_version: - '7' docker_image: -- condaforge/linux-anvil-cuda:10.1 +- quay.io/condaforge/linux-anvil-cuda:10.1 libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,8 +31,11 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - linux-64 zip_keys: - - cuda_compiler_version - docker_image +- - python + - numpy diff --git a/.ci_support/linux_64_cuda_compiler_version10.2.yaml b/.ci_support/linux_64_cuda_compiler_version10.2.yaml index 2d051101..59928208 100644 --- a/.ci_support/linux_64_cuda_compiler_version10.2.yaml +++ b/.ci_support/linux_64_cuda_compiler_version10.2.yaml @@ -1,3 +1,5 @@ +cdt_name: +- cos6 channel_sources: - conda-forge,defaults channel_targets: @@ -11,11 +13,16 @@ cxx_compiler: cxx_compiler_version: - '7' docker_image: -- condaforge/linux-anvil-cuda:10.2 +- quay.io/condaforge/linux-anvil-cuda:10.2 libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,8 +31,11 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - linux-64 zip_keys: - - cuda_compiler_version - docker_image +- - python + - numpy diff --git a/.ci_support/linux_64_cuda_compiler_version11.0.yaml b/.ci_support/linux_64_cuda_compiler_version11.0.yaml index 14414976..681c7001 100644 --- a/.ci_support/linux_64_cuda_compiler_version11.0.yaml +++ b/.ci_support/linux_64_cuda_compiler_version11.0.yaml @@ -1,3 +1,5 @@ +cdt_name: +- cos6 channel_sources: - conda-forge,defaults channel_targets: @@ -11,11 +13,16 @@ cxx_compiler: cxx_compiler_version: - '7' docker_image: -- condaforge/linux-anvil-cuda:11.0 +- quay.io/condaforge/linux-anvil-cuda:11.0 libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,8 +31,11 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - linux-64 zip_keys: - - cuda_compiler_version - docker_image +- - python + - numpy diff --git a/.ci_support/linux_64_cuda_compiler_version9.2.yaml b/.ci_support/linux_64_cuda_compiler_version9.2.yaml index 7e2594ef..10b31134 100644 --- a/.ci_support/linux_64_cuda_compiler_version9.2.yaml +++ b/.ci_support/linux_64_cuda_compiler_version9.2.yaml @@ -1,3 +1,5 @@ +cdt_name: +- cos6 channel_sources: - conda-forge,defaults channel_targets: @@ -11,11 +13,16 @@ cxx_compiler: cxx_compiler_version: - '7' docker_image: -- condaforge/linux-anvil-cuda:9.2 +- quay.io/condaforge/linux-anvil-cuda:9.2 libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,8 +31,11 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - linux-64 zip_keys: - - cuda_compiler_version - docker_image +- - python + - numpy diff --git a/.ci_support/linux_64_cuda_compiler_versionNone.yaml b/.ci_support/linux_64_cuda_compiler_versionNone.yaml index 5c699f1c..9cc1d4cc 100644 --- a/.ci_support/linux_64_cuda_compiler_versionNone.yaml +++ b/.ci_support/linux_64_cuda_compiler_versionNone.yaml @@ -1,3 +1,5 @@ +cdt_name: +- cos6 channel_sources: - conda-forge,defaults channel_targets: @@ -11,11 +13,16 @@ cxx_compiler: cxx_compiler_version: - '7' docker_image: -- condaforge/linux-anvil-comp7 +- quay.io/condaforge/linux-anvil-comp7 libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,8 +31,11 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - linux-64 zip_keys: - - cuda_compiler_version - docker_image +- - python + - numpy diff --git a/.ci_support/migrations/python39.yaml b/.ci_support/migrations/python39.yaml new file mode 100644 index 00000000..1c915c15 --- /dev/null +++ b/.ci_support/migrations/python39.yaml @@ -0,0 +1,27 @@ +migrator_ts: 1602104489 +__migrator: + migration_number: 2 + operation: key_add + primary_key: python + ordering: + python: + - 3.6.* *_cpython + - 3.7.* *_cpython + - 3.8.* *_cpython + - 3.9.* *_cpython # new entry + - 3.6.* *_73_pypy + paused: false + longterm: True + pr_limit: 50 + exclude: + # this shouldn't attempt to modify the python feedstocks + - python + - pypy3.6 + - pypy-meta +python: + - 3.9.* *_cpython +# additional entries to add for zip_keys +numpy: + - 1.19 +python_impl: + - cpython diff --git a/.ci_support/osx_64_.yaml b/.ci_support/osx_64_.yaml index adcbfc34..6768d9d9 100644 --- a/.ci_support/osx_64_.yaml +++ b/.ci_support/osx_64_.yaml @@ -13,9 +13,14 @@ cxx_compiler_version: libblas: - 3.8 *netlib liblapack: -- 3.8.0 *netlib +- 3.8 *netlib macos_machine: - x86_64-apple-darwin13.4.0 +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' pin_run_as_build: python: min_pin: x.x @@ -24,5 +29,9 @@ python: - 3.6.* *_cpython - 3.7.* *_cpython - 3.8.* *_cpython +- 3.9.* *_cpython target_platform: - osx-64 +zip_keys: +- - python + - numpy diff --git a/.ci_support/win_64_.yaml b/.ci_support/win_64_.yaml new file mode 100644 index 00000000..642d5c24 --- /dev/null +++ b/.ci_support/win_64_.yaml @@ -0,0 +1,31 @@ +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cuda_compiler_version: +- None +cxx_compiler: +- vs2017 +libblas: +- 3.8 *netlib +liblapack: +- 3.8 *netlib +numpy: +- '1.16' +- '1.16' +- '1.16' +- '1.19' +pin_run_as_build: + python: + min_pin: x.x + max_pin: x.x +python: +- 3.6.* *_cpython +- 3.7.* *_cpython +- 3.8.* *_cpython +- 3.9.* *_cpython +target_platform: +- win-64 +zip_keys: +- - python + - numpy diff --git a/README.md b/README.md index 511432d1..093aa804 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,13 @@ Current build status variant + + win_64 + + + variant + + diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 33a441c1..6b346f50 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,4 +4,5 @@ jobs: - template: ./.azure-pipelines/azure-pipelines-linux.yml + - template: ./.azure-pipelines/azure-pipelines-win.yml - template: ./.azure-pipelines/azure-pipelines-osx.yml \ No newline at end of file diff --git a/build-locally.py b/build-locally.py index d92f4edc..3453cfe6 100755 --- a/build-locally.py +++ b/build-locally.py @@ -61,7 +61,7 @@ def main(args=None): help="Setup debug environment using `conda debug`", ) p.add_argument( - "--output-id", help="If running debug, specifiy the output to setup." + "--output-id", help="If running debug, specify the output to setup." ) ns = p.parse_args(args=args) diff --git a/recipe/build-lib.bat b/recipe/build-lib.bat index 640e266b..9b97b2e6 100644 --- a/recipe/build-lib.bat +++ b/recipe/build-lib.bat @@ -1,9 +1,19 @@ -@echo off +:: Build faiss.dll +cmake -B _build ^ + -DBUILD_SHARED_LIBS=ON ^ + -DBUILD_TESTING=OFF ^ + -DFAISS_ENABLE_GPU=OFF ^ + -DFAISS_ENABLE_PYTHON=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ + -DCMAKE_INSTALL_BINDIR="%LIBRARY_BIN%" ^ + -DCMAKE_INSTALL_LIBDIR="%LIBRARY_LIB%" ^ + -DCMAKE_INSTALL_INCLUDEDIR="%LIBRARY_INC%" ^ + . +if %ERRORLEVEL% neq 0 exit 1 -:: there's a symbolic link from faiss/ to ./ in the upstream repo that does not work with windows; -:: delete symlink & copy entire source recursively (= "/S") to folder faiss to work around it -rmdir faiss -robocopy . faiss /S +cmake --build _build --config Release -j %CPU_COUNT% +if %ERRORLEVEL% neq 0 exit 1 -call %BUILD_PREFIX%\Library\bin\run_autotools_clang_conda_build.bat build-lib.sh +cmake --install _build --config Release --prefix %PREFIX% if %ERRORLEVEL% neq 0 exit 1 diff --git a/recipe/build-lib.sh b/recipe/build-lib.sh index be79f863..0fb8bc2d 100644 --- a/recipe/build-lib.sh +++ b/recipe/build-lib.sh @@ -1,7 +1,9 @@ # function for facilitate version comparison; cf. https://stackoverflow.com/a/37939589 function version2int { echo "$@" | awk -F. '{ printf("%d%02d\n", $1, $2); }'; } -CUDA_CONFIG_ARG="" +set -e + +declare -a CUDA_CONFIG_ARGS if [ ${cuda_compiler_version} != "None" ]; then # for documentation see e.g. # docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#building-for-maximum-compatibility @@ -22,27 +24,44 @@ if [ ${cuda_compiler_version} != "None" ]; then LATEST_ARCH=80 fi for arch in "${ARCHES[@]}"; do - CUDA_ARCH="${CUDA_ARCH} -gencode=arch=compute_${arch},code=sm_${arch}"; + CMAKE_CUDA_ARCHS="${CMAKE_CUDA_ARCHS+${CMAKE_CUDA_ARCHS};}${arch}-virtual" done # to support PTX JIT compilation; see first link above or cf. # devblogs.nvidia.com/cuda-pro-tip-understand-fat-binaries-jit-caching - CUDA_ARCH="${CUDA_ARCH} -gencode=arch=compute_${LATEST_ARCH},code=compute_${LATEST_ARCH}" + # see also cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html + CMAKE_CUDA_ARCHS="${CMAKE_CUDA_ARCHS+${CMAKE_CUDA_ARCHS};}${LATEST_ARCH}-real" + + FAISS_ENABLE_GPU="ON" + CUDA_CONFIG_ARGS+=( + -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHS}" + ) + # cmake does not generate output for the call below; echo some info + echo "Set up extra cmake-args: CUDA_CONFIG_ARGS=${CUDA_CONFIG_ARGS+"${CUDA_CONFIG_ARGS[@]}"}" - CUDA_CONFIG_ARG="--with-cuda=${CUDA_HOME}" + # Acc. to https://cmake.org/cmake/help/v3.19/module/FindCUDAToolkit.html#search-behavior + # CUDA toolkit is search relative to `nvcc` first before considering + # "-DCUDAToolkit_ROOT=${CUDA_HOME}". We have multiple workarounds: + # - Add symlinks from ${CUDA_HOME} to ${BUILD_PREFIX} + # - Add ${CUDA_HOME}/bin to ${PATH} + # - Remove `nvcc` wrapper in ${BUILD_PREFIX} so that `nvcc` from ${CUDA_HOME} gets found. + # TODO: Fix this in nvcc-feedstock or cmake-feedstock. + # NOTE: It's okay for us to not use the wrapper since CMake adds -ccbin itself. + rm "${BUILD_PREFIX}/bin/nvcc" else - CUDA_CONFIG_ARG="--without-cuda" + FAISS_ENABLE_GPU="OFF" fi -# need to regenerate ./configure for patch to configure.ac to take effect -autoreconf -i -f - # Build vanilla version (no avx) -./configure --prefix=${PREFIX} --exec-prefix=${PREFIX} \ - --with-blas=-lblas --with-lapack=-llapack \ - ${CUDA_CONFIG_ARG} --with-cuda-arch="${CUDA_ARCH}" || exit 1 - -# make sets SHAREDEXT correctly for linux/osx -make install +cmake -B _build_generic \ + -DBUILD_SHARED_LIBS=ON \ + -DBUILD_TESTING=ON \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DFAISS_ENABLE_GPU=${FAISS_ENABLE_GPU} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_LIBDIR=lib \ + ${CUDA_CONFIG_ARGS+"${CUDA_CONFIG_ARGS[@]}"} \ + --verbose \ + . -# make builds libfaiss.a & libfaiss.so; we only want the latter -rm ${PREFIX}/lib/libfaiss.a +cmake --build _build_generic -j $CPU_COUNT +cmake --install _build_generic --prefix $PREFIX diff --git a/recipe/build-pkg.bat b/recipe/build-pkg.bat index 5608d2e8..0af669a0 100644 --- a/recipe/build-pkg.bat +++ b/recipe/build-pkg.bat @@ -1,9 +1,19 @@ -@echo off +:: Build vanilla version (no avx2). +:: Do not use the Python3_* variants for cmake +cmake -B _build_python ^ + -DFAISS_ENABLE_GPU=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DPython_EXECUTABLE="%PYTHON%" ^ + faiss/python +if %ERRORLEVEL% neq 0 exit 1 -:: there's a symbolic link from faiss/ to ./ in the upstream repo that does not work with windows; -:: delete symlink & copy entire source recursively (= "/S") to folder faiss to work around it -rmdir faiss -robocopy . faiss /S +cmake --build _build_python --config Release -j %CPU_COUNT% +if %ERRORLEVEL% neq 0 exit 1 -call %BUILD_PREFIX%\Library\bin\run_autotools_clang_conda_build.bat build-pkg.sh +:: Build actual python module. +pushd _build_python +%PYTHON% setup.py install --single-version-externally-managed --record=record.txt --prefix=%PREFIX% if %ERRORLEVEL% neq 0 exit 1 +popd +:: clean up cmake-cache between builds +rd /S /Q _build_python diff --git a/recipe/build-pkg.sh b/recipe/build-pkg.sh index 84dd2183..eb44cc51 100644 --- a/recipe/build-pkg.sh +++ b/recipe/build-pkg.sh @@ -1,15 +1,35 @@ -CUDA_CONFIG_ARG="" +set -e + +FAISS_ENABLE_GPU="" if [ ${cuda_compiler_version} != "None" ]; then - CUDA_CONFIG_ARG="--with-cuda=${CUDA_HOME}" + FAISS_ENABLE_GPU="ON" + # Acc. to https://cmake.org/cmake/help/v3.19/module/FindCUDAToolkit.html#search-behavior + # CUDA toolkit is search relative to `nvcc` first before considering + # "-DCUDAToolkit_ROOT=${CUDA_HOME}". We have multiple workarounds: + # - Add symlinks from ${CUDA_HOME} to ${BUILD_PREFIX} + # - Add ${CUDA_HOME}/bin to ${PATH} + # - Remove `nvcc` wrapper in ${BUILD_PREFIX} so that `nvcc` from ${CUDA_HOME} gets found. + # TODO: Fix this in nvcc-feedstock or cmake-feedstock. + # NOTE: It's okay for us to not use the wrapper since CMake adds -ccbin itself. + rm "${BUILD_PREFIX}/bin/nvcc" else - CUDA_CONFIG_ARG="--without-cuda" + FAISS_ENABLE_GPU="OFF" fi -# Build vanilla version (no avx) -./configure --with-blas=-lblas --with-lapack=-llapack ${CUDA_CONFIG_ARG} - -make -C python build +# Build vanilla version (no avx2), see build-lib.sh +# Do not use the Python3_* variants for cmake +cmake -B _build_python \ + -Dfaiss_ROOT=${PREFIX}\ + -DFAISS_ENABLE_GPU=${FAISS_ENABLE_GPU} \ + -DCMAKE_BUILD_TYPE=Release \ + -DPython_EXECUTABLE="${PYTHON}" \ + faiss/python -cd python +cmake --build _build_python -j $CPU_COUNT -$PYTHON -m pip install . -vv +# Build actual python module. +pushd _build_python +$PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX +popd +# clean up cmake-cache between builds +rm -r _build_python diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml index 5700830c..274fe175 100644 --- a/recipe/conda_build_config.yaml +++ b/recipe/conda_build_config.yaml @@ -1,2 +1,9 @@ cuda_compiler_version: - None # [not linux] + +cxx_compiler_version: # [unix] + # need to downgrade on osx due to a bug that breaks the test suite + - 10 # [osx] + # need to downgrade on linux due to nvcc 9.2 not being able to deal with gcc>7, + # and conda-build not being able to zip this with cuda_compiler_version + - 7 # [linux] diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 0f6a0d74..153a6abe 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,24 +1,28 @@ -{% set version = "1.6.3" %} -{% set number = 3 %} +{% set version = "1.6.4" %} +{% set number = 0 %} # see github.com/conda-forge/conda-forge.github.io/issues/1059 for naming discussion {% set faiss_proc_type = "cuda" if cuda_compiler_version != "None" else "cpu" %} -# headers for upstream-folders './*.h', 'impl/*.h', 'utils/*.h' -# gpu adds headers in 'gpu/*.h', 'gpu/impl/*.(cu)?h', 'gpu/utils/*.(cu)?h' +# headers for upstream-folders 'faiss/*.h', 'faiss/{impl,utils}/*.h', +# see https://github.com/facebookresearch/faiss/blob/v1.6.4/faiss/CMakeLists.txt +# gpu adds headers in 'faiss/gpu/*.h', 'faiss/gpu/{impl,utils}/*.(cu)?h' {% set headers = [ - 'AutoTune.h', 'clone_index.h', 'Clustering.h', 'DirectMap.h', 'Index.h', 'Index2Layer.h', - 'IndexBinary.h', 'IndexBinaryFlat.h', 'IndexBinaryFromFloat.h', 'IndexBinaryHash.h', - 'IndexBinaryHNSW.h', 'IndexBinaryIVF.h', 'IndexFlat.h', 'IndexHNSW.h', 'IndexIVF.h', - 'IndexIVFFlat.h', 'IndexIVFPQ.h', 'IndexIVFPQR.h', 'IndexIVFSpectralHash.h', 'IndexLattice.h', - 'IndexLSH.h', 'IndexPQ.h', 'IndexPreTransform.h', 'IndexReplicas.h', 'IndexScalarQuantizer.h', - 'IndexShards.h', 'index_factory.h', 'index_io.h', 'InvertedLists.h', 'IVFlib.h', - 'MatrixStats.h', 'MetaIndexes.h', 'MetricType.h', 'OnDiskInvertedLists.h', 'VectorTransform.h', + 'AutoTune.h', 'Clustering.h', 'DirectMap.h', 'IVFlib.h', 'Index.h', 'Index2Layer.h', + 'IndexBinary.h', 'IndexBinaryFlat.h', 'IndexBinaryFromFloat.h', 'IndexBinaryHNSW.h', + 'IndexBinaryHash.h', 'IndexBinaryIVF.h', 'IndexFlat.h', 'IndexHNSW.h', 'IndexIVF.h', + 'IndexIVFFlat.h', 'IndexIVFPQ.h', 'IndexIVFPQR.h', 'IndexIVFSpectralHash.h', 'IndexLSH.h', + 'IndexLattice.h', 'IndexPQ.h', 'IndexPreTransform.h', 'IndexReplicas.h', + 'IndexScalarQuantizer.h', 'IndexShards.h', 'InvertedLists.h', 'MatrixStats.h', + 'MetaIndexes.h', 'MetricType.h', 'VectorTransform.h', 'clone_index.h', 'index_factory.h', + 'index_io.h', 'impl/AuxIndexStructures.h', 'impl/FaissAssert.h', 'impl/FaissException.h', 'impl/HNSW.h', - 'impl/io.h', 'impl/lattice_Zn.h', 'impl/PolysemousTraining.h', 'impl/ProductQuantizer-inl.h', - 'impl/ProductQuantizer.h', 'impl/ScalarQuantizer.h', 'impl/ThreadedIndex-inl.h', - 'impl/ThreadedIndex.h', - 'utils/distances.h', 'utils/extra_distances.h', 'utils/hamming-inl.h', 'utils/hamming.h', - 'utils/Heap.h', 'utils/random.h', 'utils/utils.h', 'utils/WorkerThread.h' + 'impl/PolysemousTraining.h', 'impl/ProductQuantizer-inl.h', 'impl/ProductQuantizer.h', + 'impl/ScalarQuantizer.h', 'impl/ThreadedIndex-inl.h', 'impl/ThreadedIndex.h', 'impl/io.h', + 'impl/io_macros.h', 'impl/lattice_Zn.h', 'impl/platform_macros.h', + 'utils/Heap.h', 'utils/WorkerThread.h', 'utils/distances.h', 'utils/extra_distances.h', + 'utils/hamming-inl.h', 'utils/hamming.h', 'utils/random.h', 'utils/utils.h' +] + (not win) * [ + 'OnDiskInvertedLists.h' ] + (cuda_compiler_version != "None") * [ 'gpu/GpuAutoTune.h', 'gpu/GpuCloner.h', 'gpu/GpuClonerOptions.h', 'gpu/GpuDistance.h', 'gpu/GpuFaissAssert.h', 'gpu/GpuIndex.h', 'gpu/GpuIndexBinaryFlat.h', 'gpu/GpuIndexFlat.h', @@ -30,12 +34,12 @@ 'gpu/impl/FlatIndex.cuh', 'gpu/impl/GeneralDistance.cuh', 'gpu/impl/GpuScalarQuantizer.cuh', 'gpu/impl/IVFAppend.cuh', 'gpu/impl/IVFBase.cuh', 'gpu/impl/IVFFlat.cuh', 'gpu/impl/IVFFlatScan.cuh', 'gpu/impl/IVFPQ.cuh', 'gpu/impl/IVFUtils.cuh', - 'gpu/impl/L2Norm.cuh', 'gpu/impl/L2Select.cuh', 'gpu/impl/Metrics.cuh', + 'gpu/impl/L2Norm.cuh', 'gpu/impl/L2Select.cuh', 'gpu/impl/PQCodeDistances-inl.cuh', 'gpu/impl/PQCodeDistances.cuh', 'gpu/impl/PQCodeLoad.cuh', 'gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh', 'gpu/impl/PQScanMultiPassNoPrecomputed.cuh', 'gpu/impl/PQScanMultiPassPrecomputed.cuh', 'gpu/impl/VectorResidual.cuh', - 'gpu/utils/DeviceMemory.h', 'gpu/utils/DeviceUtils.h', 'gpu/utils/MemorySpace.h', - 'gpu/utils/StackDeviceMemory.h', 'gpu/utils/StaticUtils.h', 'gpu/utils/Timer.h', + 'gpu/utils/DeviceUtils.h', 'gpu/utils/StackDeviceMemory.h', 'gpu/utils/StaticUtils.h', + 'gpu/utils/Timer.h', 'gpu/utils/BlockSelectKernel.cuh', 'gpu/utils/Comparators.cuh', 'gpu/utils/ConversionOperators.cuh', 'gpu/utils/CopyUtils.cuh', 'gpu/utils/DeviceDefs.cuh', 'gpu/utils/DeviceTensor-inl.cuh', 'gpu/utils/DeviceTensor.cuh', 'gpu/utils/DeviceVector.cuh', @@ -55,26 +59,14 @@ package: source: url: https://github.com/facebookresearch/faiss/archive/v{{ version }}.tar.gz - sha256: e1a41c159f0b896975fbb133e0240a233af5c9286c09a28fde6aefff5336e542 + sha256: 6ed6311415ccb70ea1afe82bafc24e9f3aa60c06c9d030bfc23ee4173d59cc99 patches: - - patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch - - patches/0002-remove-fPIC-flag-for-msvc-compat.patch # [win] - # add upstream commits (64dd9884, e05f773f, ba061ff82) at the recommendation - # of the maintainers. These can be removed upon the next release. - - patches/0003-remove-deleted-files-from-template-change.patch - - patches/0004-CUDA-8-fixes.patch - - patches/0005-CUDA-10-fixes.patch - # backport of facebookresearch/faiss#1380 to be able to build for Ampere - - patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch - # use c++14 (in autoconf macro AX_CXX_COMPILE_STDCXX) - - patches/0007-use-c-14.patch - # backport of facebookresearch/faiss#1388, see conda-forge/faiss-split-feedstock#14 - - patches/0008-backport-facebookresearch-faiss-1388.patch + # backport of facebookresearch/faiss#1541, can be dropped for ver>1.6.5 + - patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch + - patches/0002-use-c-14.patch build: number: {{ number }} - # TODO: figure out compiler errors on windows - skip: true # [win] # GPU version only for linux skip: true # [(win or osx) and cuda_compiler_version != "None"] @@ -108,41 +100,32 @@ outputs: build: - {{ compiler('cxx') }} - {{ compiler('cuda') }} # [cuda_compiler_version != "None"] - - autoconf # [not win] - - automake # [not win] - - libtool # [not win] - cmake - libgomp # [linux] - - llvm-openmp # [osx or win] - - autotools_clang_conda # [win] + - llvm-openmp # [osx] host: - libblas - liblapack - run: - - libblas - - liblapack run_constrained: - faiss-proc * {{ faiss_proc_type }} test: commands: # shared - - test -f $PREFIX/lib/libfaiss.so # [linux] - - test -f $PREFIX/lib/libfaiss.dylib # [osx] - - if not exist %PREFIX%\\Library\\bin\\libfaiss.dll exit 1 # [win] + - test -f $PREFIX/lib/libfaiss.so # [linux] + - test -f $PREFIX/lib/libfaiss.dylib # [osx] + - if not exist %LIBRARY_BIN%\faiss.dll exit 1 # [win] + # On windows, faiss.lib is an "import library"; + # Deleting it breaks the faiss-builds + - if not exist %LIBRARY_LIB%\faiss.lib exit 1 # [win] # absence of static libraries - - test ! -f $PREFIX/lib/libfaiss.a # [not win] - - if exist %PREFIX%\\Library\\lib\\libfaiss.lib exit 1 # [win] - - # conda tools - - conda inspect linkages -p $PREFIX $PKG_NAME # [not win] - - conda inspect objects -p $PREFIX $PKG_NAME # [osx] + - test ! -f $PREFIX/lib/libfaiss.a # [not win] # headers {% for each_header in headers %} - - test -f $PREFIX/include/faiss/{{ each_header }} # [unix] - - if not exist %LIBRARY_INC%\\faiss\\{{ "\\".join(each_header.split("/")) }} exit 1 # [win] + - test -f $PREFIX/include/faiss/{{ each_header }} || (echo "{{ each_header }} not found" && exit 1) # [unix] + - if not exist %LIBRARY_INC%\faiss\{{ "\\".join(each_header.split("/")) }} exit 1 # [win] {% endfor %} - name: faiss @@ -156,17 +139,17 @@ outputs: - {{ compiler('cuda') }} # [cuda_compiler_version != "None"] - swig - cmake - - autotools_clang_conda # [win] + - libgomp # [linux] + - llvm-openmp # [osx] host: - python - pip - - numpy =1.14.* + - numpy - libfaiss ={{ version }}=*_{{ faiss_proc_type }} - libblas - liblapack - - libgomp # [linux] - - llvm-openmp # [osx or win] run: + - python - libfaiss ={{ version }}=*_{{ faiss_proc_type }} - {{ pin_compatible('numpy') }} run_constrained: @@ -182,42 +165,33 @@ outputs: - scipy source_files: - tests/ - - conda/faiss/run_test.py - - conda/faiss-gpu/run_test.py imports: - faiss commands: - - python conda/faiss/run_test.py - # running the following test requires an actual GPU device, which is not available in CI - # - python conda/faiss-gpu/run_test.py # [cuda_compiler_version != "None"] - python -m unittest discover tests + # running the following test requires an actual GPU device, which is not available in CI + # - python -m unittest discover faiss/gpu/test/ # for compatibility with (& ease of migration from) existing packages in the pytorch channel - name: faiss-cpu build: - string: "py{{ CONDA_PY }}_{{ number }}" skip: true # [cuda_compiler_version != "None"] requirements: - host: - - python run: - faiss ={{ version }}=*_cpu test: - commands: - - exit 0 + imports: + - faiss - name: faiss-gpu build: - string: "py{{ CONDA_PY }}_{{ number }}" skip: true # [cuda_compiler_version == "None"] requirements: - host: - - python run: - faiss ={{ version }}=*_cuda test: - commands: - - exit 0 + imports: + - faiss about: home: https://github.com/facebookresearch/faiss diff --git a/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch b/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch new file mode 100644 index 00000000..c01d2211 --- /dev/null +++ b/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch @@ -0,0 +1,76 @@ +From e80893144a71dd48d829f611bf2708d9372f71bf Mon Sep 17 00:00:00 2001 +From: Mo Zhou +Date: Tue, 24 Nov 2020 05:44:37 +0000 +Subject: [PATCH 1/2] CMake: use GNUInstallDirs instead of hardcoded paths. + +--- + CMakeLists.txt | 1 + + faiss/CMakeLists.txt | 14 +++++++------- + faiss/gpu/CMakeLists.txt | 2 +- + 3 files changed, 9 insertions(+), 8 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 29b73d8..158712e 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -11,6 +11,7 @@ project(faiss + DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." + HOMEPAGE_URL "https://github.com/facebookresearch/faiss" + LANGUAGES CXX) ++include(GNUInstallDirs) + + set(CMAKE_CXX_STANDARD 11) + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 51680fd..3469538 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -160,15 +160,15 @@ endif() + + install(TARGETS faiss + EXPORT faiss-targets +- RUNTIME DESTINATION bin +- ARCHIVE DESTINATION lib +- LIBRARY DESTINATION lib +- INCLUDES DESTINATION include ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ++ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ++ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ++ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + foreach(header ${FAISS_HEADERS}) + get_filename_component(dir ${header} DIRECTORY ) + install(FILES ${header} +- DESTINATION include/faiss/${dir} ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${dir} + ) + endforeach() + +@@ -185,9 +185,9 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/faiss-config.cmake.in + ) + install(FILES ${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake + ${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake +- DESTINATION share/faiss ++ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss + ) + + install(EXPORT faiss-targets +- DESTINATION share/faiss ++ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss + ) +diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt +index 5b2a957..8041bac 100644 +--- a/faiss/gpu/CMakeLists.txt ++++ b/faiss/gpu/CMakeLists.txt +@@ -173,7 +173,7 @@ set(FAISS_GPU_HEADERS + foreach(header ${FAISS_GPU_HEADERS}) + get_filename_component(dir ${header} DIRECTORY ) + install(FILES ${header} +- DESTINATION include/faiss/gpu/${dir} ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/gpu/${dir} + ) + endforeach() + +-- +2.29.2.windows.3 + diff --git a/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch b/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch deleted file mode 100644 index 7b378223..00000000 --- a/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 5f990ac215a73ba6c3bea8b4033f5b0f5d026237 Mon Sep 17 00:00:00 2001 -From: "H. Vetinari" -Date: Thu, 7 May 2020 11:05:05 +0200 -Subject: [PATCH 1/8] change python/Makefile to point to libfaiss.$(SHLIB_EXT) - instead of .a - -also give definite names to makefile-rules, as otherwise make may -struggle to find make _swigfaiss.so otherwise. ---- - python/Makefile | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/python/Makefile b/python/Makefile -index 2836568..eaa32a4 100644 ---- a/python/Makefile -+++ b/python/Makefile -@@ -12,18 +12,18 @@ endif - all: build - - # Also silently generates swigfaiss.py. --swigfaiss.cpp: swigfaiss.swig ../libfaiss.a -+swigfaiss.cpp: swigfaiss.swig $(PREFIX)/lib/libfaiss$(SHLIB_EXT) - $(SWIG) -python -c++ -Doverride= -I../ $(SWIGFLAGS) -o $@ $< - - swigfaiss_avx2.cpp: swigfaiss.swig ../libfaiss.a - $(SWIG) -python -c++ -Doverride= -module swigfaiss_avx2 -I../ $(SWIGFLAGS) -o $@ $< - --%.o: %.cpp -+swigfaiss.o: swigfaiss.cpp - $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) $(PYTHONCFLAGS) \ - -I../ -c $< -o $@ - - # Extension is .so even on OSX. --_%.so: %.o ../libfaiss.a -+_swigfaiss.so: swigfaiss.o $(PREFIX)/lib/libfaiss$(SHLIB_EXT) - $(CXX) $(SHAREDFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS) - - build: _swigfaiss.so faiss.py --- -2.26.2.windows.1 - diff --git a/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch b/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch deleted file mode 100644 index e8774ed9..00000000 --- a/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch +++ /dev/null @@ -1,25 +0,0 @@ -From c8dc93d045be8485a4863d20b36e12d1aa2c6c8a Mon Sep 17 00:00:00 2001 -From: "H. Vetinari" -Date: Fri, 8 May 2020 20:12:30 +0200 -Subject: [PATCH 2/8] remove -fPIC flag for msvc-compat - ---- - makefile.inc.in | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/makefile.inc.in b/makefile.inc.in -index 19d8511..1c4e976 100644 ---- a/makefile.inc.in -+++ b/makefile.inc.in -@@ -6,7 +6,7 @@ - CXX = @CXX@ - CXXCPP = @CXXCPP@ - CPPFLAGS = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@ --CXXFLAGS = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@ -+CXXFLAGS = @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@ - CPUFLAGS = @ARCH_CPUFLAGS@ - LDFLAGS = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@ - LIBS = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@ --- -2.26.2.windows.1 - diff --git a/recipe/patches/0002-use-c-14.patch b/recipe/patches/0002-use-c-14.patch new file mode 100644 index 00000000..b1cc3690 --- /dev/null +++ b/recipe/patches/0002-use-c-14.patch @@ -0,0 +1,39 @@ +From 65f07198267d0b777c03b21ae81c0b27577176fc Mon Sep 17 00:00:00 2001 +From: "H. Vetinari" +Date: Mon, 26 Oct 2020 22:44:44 +0100 +Subject: [PATCH 2/2] use c++14 + +--- + CMakeLists.txt | 2 +- + faiss/python/CMakeLists.txt | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 158712e..1b2038f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -13,7 +13,7 @@ project(faiss + LANGUAGES CXX) + include(GNUInstallDirs) + +-set(CMAKE_CXX_STANDARD 11) ++set(CMAKE_CXX_STANDARD 14) + + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + +diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt +index 0b4366d..1ef5dea 100644 +--- a/faiss/python/CMakeLists.txt ++++ b/faiss/python/CMakeLists.txt +@@ -11,7 +11,7 @@ project(pyfaiss + HOMEPAGE_URL "https://github.com/facebookresearch/faiss" + LANGUAGES CXX) + +-set(CMAKE_CXX_STANDARD 11) ++set(CMAKE_CXX_STANDARD 14) + + find_package(SWIG REQUIRED COMPONENTS python) + include(${SWIG_USE_FILE}) +-- +2.29.2.windows.3 + diff --git a/recipe/patches/0003-remove-deleted-files-from-template-change.patch b/recipe/patches/0003-remove-deleted-files-from-template-change.patch deleted file mode 100644 index 9217a390..00000000 --- a/recipe/patches/0003-remove-deleted-files-from-template-change.patch +++ /dev/null @@ -1,1191 +0,0 @@ -From 94bd01d4b4f457f40994df7817d8b789a99d605b Mon Sep 17 00:00:00 2001 -From: Jeff Johnson -Date: Wed, 25 Mar 2020 10:57:57 -0700 -Subject: [PATCH 3/8] remove deleted files from template change - ---- - gpu/impl/PQCodeDistances.cu | 567 --------------------- - gpu/impl/PQScanMultiPassNoPrecomputed.cu | 597 ----------------------- - 2 files changed, 1164 deletions(-) - delete mode 100644 gpu/impl/PQCodeDistances.cu - delete mode 100644 gpu/impl/PQScanMultiPassNoPrecomputed.cu - -diff --git a/gpu/impl/PQCodeDistances.cu b/gpu/impl/PQCodeDistances.cu -deleted file mode 100644 -index 817990b..0000000 ---- a/gpu/impl/PQCodeDistances.cu -+++ /dev/null -@@ -1,567 +0,0 @@ --/** -- * Copyright (c) Facebook, Inc. and its affiliates. -- * -- * This source code is licensed under the MIT license found in the -- * LICENSE file in the root directory of this source tree. -- */ -- -- --#include -- --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include -- --namespace faiss { namespace gpu { -- --template --struct Converter { --}; -- --template <> --struct Converter { -- inline static __device__ half to(float v) { return __float2half(v); } --}; -- --template <> --struct Converter { -- inline static __device__ float to(float v) { return v; } --}; -- --// Kernel responsible for calculating distance from residual vector to --// each product quantizer code centroid --template --__global__ void --__launch_bounds__(288, 4) --pqCodeDistances(Tensor queries, -- int queriesPerBlock, -- Tensor coarseCentroids, -- Tensor pqCentroids, -- Tensor topQueryToCentroid, -- // (query id)(coarse)(subquantizer)(code) -> dist -- Tensor outCodeDistances) { -- const auto numSubQuantizers = pqCentroids.getSize(0); -- const auto dimsPerSubQuantizer = pqCentroids.getSize(1); -- assert(DimsPerSubQuantizer == dimsPerSubQuantizer); -- const auto codesPerSubQuantizer = pqCentroids.getSize(2); -- -- bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer; -- int loadingThreadId = threadIdx.x - codesPerSubQuantizer; -- -- extern __shared__ float smem[]; -- -- // Each thread calculates a single code -- float subQuantizerData[DimsPerSubQuantizer]; -- -- auto code = threadIdx.x; -- auto subQuantizer = blockIdx.y; -- -- // Each thread will load the pq centroid data for the code that it -- // is processing --#pragma unroll -- for (int i = 0; i < DimsPerSubQuantizer; ++i) { -- subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg(); -- } -- -- // Where we store our query vector -- float* smemQuery = smem; -- -- // Where we store our residual vector; this is double buffered so we -- // can be loading the next one while processing the current one -- float* smemResidual1 = &smemQuery[DimsPerSubQuantizer]; -- float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer]; -- -- // Where we pre-load the coarse centroid IDs -- int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer]; -- -- // Each thread is calculating the distance for a single code, -- // performing the reductions locally -- -- // Handle multiple queries per block -- auto startQueryId = blockIdx.x * queriesPerBlock; -- auto numQueries = queries.getSize(0) - startQueryId; -- if (numQueries > queriesPerBlock) { -- numQueries = queriesPerBlock; -- } -- -- for (int query = 0; query < numQueries; ++query) { -- auto queryId = startQueryId + query; -- -- auto querySubQuantizer = -- queries[queryId][subQuantizer * DimsPerSubQuantizer].data(); -- -- // Load current query vector -- for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) { -- smemQuery[i] = querySubQuantizer[i]; -- } -- -- // Load list of coarse centroids found -- for (int i = threadIdx.x; -- i < topQueryToCentroid.getSize(1); i += blockDim.x) { -- coarseIds[i] = topQueryToCentroid[queryId][i]; -- } -- -- // We need coarseIds below -- // FIXME: investigate loading separately, so we don't need this -- __syncthreads(); -- -- // Preload first buffer of residual data -- if (isLoadingThread) { -- for (int i = loadingThreadId; -- i < DimsPerSubQuantizer; -- i += blockDim.x - codesPerSubQuantizer) { -- auto coarseId = coarseIds[0]; -- // In case NaNs were in the original query data -- coarseId = coarseId == -1 ? 0 : coarseId; -- auto coarseCentroidSubQuantizer = -- coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); -- -- if (L2Distance) { -- smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; -- } else { -- smemResidual1[i] = coarseCentroidSubQuantizer[i]; -- } -- } -- } -- -- // The block walks the list for a single query -- for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) { -- // Wait for smemResidual1 to be loaded -- __syncthreads(); -- -- if (isLoadingThread) { -- // Preload second buffer of residual data -- for (int i = loadingThreadId; -- i < DimsPerSubQuantizer; -- i += blockDim.x - codesPerSubQuantizer) { -- // FIXME: try always making this centroid id 0 so we can -- // terminate -- if (coarse != (topQueryToCentroid.getSize(1) - 1)) { -- auto coarseId = coarseIds[coarse + 1]; -- // In case NaNs were in the original query data -- coarseId = coarseId == -1 ? 0 : coarseId; -- -- auto coarseCentroidSubQuantizer = -- coarseCentroids[coarseId] -- [subQuantizer * dimsPerSubQuantizer].data(); -- -- if (L2Distance) { -- smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; -- } else { -- smemResidual2[i] = coarseCentroidSubQuantizer[i]; -- } -- } -- } -- } else { -- // These are the processing threads -- float dist = 0.0f; -- -- constexpr int kUnroll = 4; -- constexpr int kRemainder = DimsPerSubQuantizer % kUnroll; -- constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder; -- float vals[kUnroll]; -- -- // Calculate residual - pqCentroid for each dim that we're -- // processing -- -- // Unrolled loop -- if (L2Distance) { --#pragma unroll -- for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] = smemResidual1[i * kUnroll + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] -= subQuantizerData[i * kUnroll + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] *= vals[j]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- dist += vals[j]; -- } -- } -- } else { -- // Inner product: query slice against the reconstructed sub-quantizer -- // for this coarse cell (query o (centroid + subQCentroid)) --#pragma unroll -- for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] = smemResidual1[i * kUnroll + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] += subQuantizerData[i * kUnroll + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] *= smemQuery[i * kUnroll + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- dist += vals[j]; -- } -- } -- } -- -- // Remainder loop -- if (L2Distance) { --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- vals[j] = smemResidual1[kRemainderBase + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- vals[j] -= subQuantizerData[kRemainderBase + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- vals[j] *= vals[j]; -- } -- } else { -- // Inner product -- // Inner product: query slice against the reconstructed sub-quantizer -- // for this coarse cell (query o (centroid + subQCentroid)) --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- vals[j] = smemResidual1[kRemainderBase + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- vals[j] += subQuantizerData[kRemainderBase + j]; -- } -- --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- vals[j] *= smemQuery[kRemainderBase + j]; -- } -- } -- --#pragma unroll -- for (int j = 0; j < kRemainder; ++j) { -- dist += vals[j]; -- } -- -- // We have the distance for our code; write it out -- outCodeDistances[queryId][coarse][subQuantizer][code] = -- Converter::to(dist); -- } // !isLoadingThread -- -- // Swap residual buffers -- float* tmp = smemResidual1; -- smemResidual1 = smemResidual2; -- smemResidual2 = tmp; -- } -- } --} -- --__global__ void --residualVector(Tensor queries, -- Tensor coarseCentroids, -- Tensor topQueryToCentroid, -- int numSubDim, -- // output is transposed: -- // (sub q)(query id)(centroid id)(sub dim) -- Tensor residual) { -- // block x is query id -- // block y is centroid id -- // thread x is dim -- auto queryId = blockIdx.x; -- auto centroidId = blockIdx.y; -- -- int realCentroidId = topQueryToCentroid[queryId][centroidId]; -- -- for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) { -- float q = queries[queryId][dim]; -- float c = coarseCentroids[realCentroidId][dim]; -- -- residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = -- q - c; -- } --} -- --void --runResidualVector(Tensor& pqCentroids, -- Tensor& queries, -- Tensor& coarseCentroids, -- Tensor& topQueryToCentroid, -- Tensor& residual, -- cudaStream_t stream) { -- auto grid = -- dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1)); -- auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); -- -- residualVector<<>>( -- queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1), -- residual); -- -- CUDA_TEST_ERROR(); --} -- --void --runPQCodeDistancesMM(Tensor& pqCentroids, -- Tensor& queries, -- Tensor& coarseCentroids, -- Tensor& topQueryToCentroid, -- NoTypeTensor<4, true>& outCodeDistances, -- bool useFloat16Lookup, -- DeviceMemory& mem, -- cublasHandle_t handle, -- cudaStream_t stream) { -- // Calculate (q - c) residual vector -- // (sub q)(query id)(centroid id)(sub dim) -- DeviceTensor residual( -- mem, -- {pqCentroids.getSize(0), -- topQueryToCentroid.getSize(0), -- topQueryToCentroid.getSize(1), -- pqCentroids.getSize(1)}, -- stream); -- -- runResidualVector(pqCentroids, queries, -- coarseCentroids, topQueryToCentroid, -- residual, stream); -- -- // Calculate ||q - c||^2 -- DeviceTensor residualNorms( -- mem, -- {pqCentroids.getSize(0) * -- topQueryToCentroid.getSize(0) * -- topQueryToCentroid.getSize(1)}, -- stream); -- -- auto residualView2 = residual.view<2>( -- {pqCentroids.getSize(0) * -- topQueryToCentroid.getSize(0) * -- topQueryToCentroid.getSize(1), -- pqCentroids.getSize(1)}); -- -- runL2Norm(residualView2, true, residualNorms, true, stream); -- -- // Perform a batch MM: -- // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} => -- // (sub q) x {(q * c)(code)} -- auto residualView3 = residual.view<3>( -- {pqCentroids.getSize(0), -- topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), -- pqCentroids.getSize(1)}); -- -- DeviceTensor residualDistance( -- mem, -- {pqCentroids.getSize(0), -- topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), -- pqCentroids.getSize(2)}, -- stream); -- -- runIteratedMatrixMult(residualDistance, false, -- residualView3, false, -- pqCentroids, false, -- -2.0f, 0.0f, -- handle, -- stream); -- -- // Sum ||q - c||^2 along rows -- auto residualDistanceView2 = residualDistance.view<2>( -- {pqCentroids.getSize(0) * -- topQueryToCentroid.getSize(0) * -- topQueryToCentroid.getSize(1), -- pqCentroids.getSize(2)}); -- -- runSumAlongRows(residualNorms, residualDistanceView2, false, stream); -- -- Tensor outCodeDistancesF; -- DeviceTensor outCodeDistancesFloatMem; -- -- if (useFloat16Lookup) { -- outCodeDistancesFloatMem = DeviceTensor( -- mem, {outCodeDistances.getSize(0), -- outCodeDistances.getSize(1), -- outCodeDistances.getSize(2), -- outCodeDistances.getSize(3)}, -- stream); -- -- outCodeDistancesF = outCodeDistancesFloatMem; -- } else { -- outCodeDistancesF = outCodeDistances.toTensor(); -- } -- -- // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which -- // is where we build our output distances) -- auto outCodeDistancesView = outCodeDistancesF.view<3>( -- {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), -- outCodeDistances.getSize(2), -- outCodeDistances.getSize(3)}); -- -- runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream); -- -- // Calculate code norms per each sub-dim -- // (sub q)(sub dim)(code) is pqCentroids -- // transpose to (sub q)(code)(sub dim) -- DeviceTensor pqCentroidsTranspose( -- mem, -- {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)}, -- stream); -- -- runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream); -- -- auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>( -- {pqCentroids.getSize(0) * pqCentroids.getSize(2), -- pqCentroids.getSize(1)}); -- -- DeviceTensor pqCentroidsNorm( -- mem, -- {pqCentroids.getSize(0) * pqCentroids.getSize(2)}, -- stream); -- -- runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream); -- -- // View output as (q * c)(sub q * code), and add centroid norm to -- // each row -- auto outDistancesCodeViewCols = outCodeDistancesView.view<2>( -- {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), -- outCodeDistances.getSize(2) * outCodeDistances.getSize(3)}); -- -- runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); -- -- if (useFloat16Lookup) { -- // Need to convert back -- auto outCodeDistancesH = outCodeDistances.toTensor(); -- convertTensor(stream, -- outCodeDistancesF, -- outCodeDistancesH); -- } --} -- --void --runPQCodeDistances(Tensor& pqCentroids, -- Tensor& queries, -- Tensor& coarseCentroids, -- Tensor& topQueryToCentroid, -- NoTypeTensor<4, true>& outCodeDistances, -- bool l2Distance, -- bool useFloat16Lookup, -- cudaStream_t stream) { -- const auto numSubQuantizers = pqCentroids.getSize(0); -- const auto dimsPerSubQuantizer = pqCentroids.getSize(1); -- const auto codesPerSubQuantizer = pqCentroids.getSize(2); -- -- // FIXME: tune -- // Reuse of pq centroid data is based on both # of queries * nprobe, -- // and we should really be tiling in both dimensions -- constexpr int kQueriesPerBlock = 8; -- -- auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock), -- numSubQuantizers); -- -- // Reserve one block of threads for double buffering -- // FIXME: probably impractical for large # of dims? -- auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize); -- auto block = dim3(codesPerSubQuantizer + loadingThreads); -- -- auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) -- + topQueryToCentroid.getSize(1) * sizeof(int); -- --#define RUN_CODE(DIMS, L2) \ -- do { \ -- if (useFloat16Lookup) { \ -- auto outCodeDistancesT = outCodeDistances.toTensor(); \ -- \ -- pqCodeDistances<<>>( \ -- queries, kQueriesPerBlock, \ -- coarseCentroids, pqCentroids, \ -- topQueryToCentroid, outCodeDistancesT); \ -- } else { \ -- auto outCodeDistancesT = outCodeDistances.toTensor(); \ -- \ -- pqCodeDistances<<>>( \ -- queries, kQueriesPerBlock, \ -- coarseCentroids, pqCentroids, \ -- topQueryToCentroid, outCodeDistancesT); \ -- } \ -- } while (0) -- --#define CODE_L2(DIMS) \ -- do { \ -- if (l2Distance) { \ -- RUN_CODE(DIMS, true); \ -- } else { \ -- RUN_CODE(DIMS, false); \ -- } \ -- } while (0) -- -- switch (dimsPerSubQuantizer) { -- case 1: -- CODE_L2(1); -- break; -- case 2: -- CODE_L2(2); -- break; -- case 3: -- CODE_L2(3); -- break; -- case 4: -- CODE_L2(4); -- break; -- case 6: -- CODE_L2(6); -- break; -- case 8: -- CODE_L2(8); -- break; -- case 10: -- CODE_L2(10); -- break; -- case 12: -- CODE_L2(12); -- break; -- case 16: -- CODE_L2(16); -- break; -- case 20: -- CODE_L2(20); -- break; -- case 24: -- CODE_L2(24); -- break; -- case 28: -- CODE_L2(28); -- break; -- case 32: -- CODE_L2(32); -- break; -- // FIXME: larger sizes require too many registers - we need the -- // MM implementation working -- default: -- FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer " -- "not currently supported"); -- } -- --#undef RUN_CODE --#undef CODE_L2 -- -- CUDA_TEST_ERROR(); --} -- --} } // namespace -diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/gpu/impl/PQScanMultiPassNoPrecomputed.cu -deleted file mode 100644 -index a514694..0000000 ---- a/gpu/impl/PQScanMultiPassNoPrecomputed.cu -+++ /dev/null -@@ -1,597 +0,0 @@ --/** -- * Copyright (c) Facebook, Inc. and its affiliates. -- * -- * This source code is licensed under the MIT license found in the -- * LICENSE file in the root directory of this source tree. -- */ -- -- --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include -- --#include -- --namespace faiss { namespace gpu { -- --// This must be kept in sync with PQCodeDistances.cu --bool isSupportedNoPrecomputedSubDimSize(int dims) { -- switch (dims) { -- case 1: -- case 2: -- case 3: -- case 4: -- case 6: -- case 8: -- case 10: -- case 12: -- case 16: -- case 20: -- case 24: -- case 28: -- case 32: -- return true; -- default: -- // FIXME: larger sizes require too many registers - we need the -- // MM implementation working -- return false; -- } --} -- --template --struct LoadCodeDistances { -- static inline __device__ void load(LookupT* smem, -- LookupT* codes, -- int numCodes) { -- constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); -- -- // We can only use the vector type if the data is guaranteed to be -- // aligned. The codes are innermost, so if it is evenly divisible, -- // then any slice will be aligned. -- if (numCodes % kWordSize == 0) { -- // Load the data by float4 for efficiency, and then handle any remainder -- // limitVec is the number of whole vec words we can load, in terms -- // of whole blocks performing the load -- constexpr int kUnroll = 2; -- int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); -- limitVec *= kUnroll * blockDim.x; -- -- LookupVecT* smemV = (LookupVecT*) smem; -- LookupVecT* codesV = (LookupVecT*) codes; -- -- for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { -- LookupVecT vals[kUnroll]; -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] = -- LoadStore::load(&codesV[i + j * blockDim.x]); -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); -- } -- } -- -- // This is where we start loading the remainder that does not evenly -- // fit into kUnroll x blockDim.x -- int remainder = limitVec * kWordSize; -- -- for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { -- smem[i] = codes[i]; -- } -- } else { -- // Potential unaligned load -- constexpr int kUnroll = 4; -- -- int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); -- -- int i = threadIdx.x; -- for (; i < limit; i += kUnroll * blockDim.x) { -- LookupT vals[kUnroll]; -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- vals[j] = codes[i + j * blockDim.x]; -- } -- --#pragma unroll -- for (int j = 0; j < kUnroll; ++j) { -- smem[i + j * blockDim.x] = vals[j]; -- } -- } -- -- for (; i < numCodes; i += blockDim.x) { -- smem[i] = codes[i]; -- } -- } -- } --}; -- --template --__global__ void --pqScanNoPrecomputedMultiPass(Tensor queries, -- Tensor pqCentroids, -- Tensor topQueryToCentroid, -- Tensor codeDistances, -- void** listCodes, -- int* listLengths, -- Tensor prefixSumOffsets, -- Tensor distance) { -- const auto codesPerSubQuantizer = pqCentroids.getSize(2); -- -- // Where the pq code -> residual distance is stored -- extern __shared__ char smemCodeDistances[]; -- LookupT* codeDist = (LookupT*) smemCodeDistances; -- -- // Each block handles a single query -- auto queryId = blockIdx.y; -- auto probeId = blockIdx.x; -- -- // This is where we start writing out data -- // We ensure that before the array (at offset -1), there is a 0 value -- int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); -- float* distanceOut = distance[outBase].data(); -- -- auto listId = topQueryToCentroid[queryId][probeId]; -- // Safety guard in case NaNs in input cause no list ID to be generated -- if (listId == -1) { -- return; -- } -- -- unsigned char* codeList = (unsigned char*) listCodes[listId]; -- int limit = listLengths[listId]; -- -- constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 : -- (NumSubQuantizers / 4); -- unsigned int code32[kNumCode32]; -- unsigned int nextCode32[kNumCode32]; -- -- // We double-buffer the code loading, which improves memory utilization -- if (threadIdx.x < limit) { -- LoadCode32::load(code32, codeList, threadIdx.x); -- } -- -- LoadCodeDistances::load( -- codeDist, -- codeDistances[queryId][probeId].data(), -- codeDistances.getSize(2) * codeDistances.getSize(3)); -- -- // Prevent WAR dependencies -- __syncthreads(); -- -- // Each thread handles one code element in the list, with a -- // block-wide stride -- for (int codeIndex = threadIdx.x; -- codeIndex < limit; -- codeIndex += blockDim.x) { -- // Prefetch next codes -- if (codeIndex + blockDim.x < limit) { -- LoadCode32::load( -- nextCode32, codeList, codeIndex + blockDim.x); -- } -- -- float dist = 0.0f; -- --#pragma unroll -- for (int word = 0; word < kNumCode32; ++word) { -- constexpr int kBytesPerCode32 = -- NumSubQuantizers < 4 ? NumSubQuantizers : 4; -- -- if (kBytesPerCode32 == 1) { -- auto code = code32[0]; -- dist = ConvertTo::to(codeDist[code]); -- -- } else { --#pragma unroll -- for (int byte = 0; byte < kBytesPerCode32; ++byte) { -- auto code = getByte(code32[word], byte * 8, 8); -- -- auto offset = -- codesPerSubQuantizer * (word * kBytesPerCode32 + byte); -- -- dist += ConvertTo::to(codeDist[offset + code]); -- } -- } -- } -- -- // Write out intermediate distance result -- // We do not maintain indices here, in order to reduce global -- // memory traffic. Those are recovered in the final selection step. -- distanceOut[codeIndex] = dist; -- -- // Rotate buffers --#pragma unroll -- for (int word = 0; word < kNumCode32; ++word) { -- code32[word] = nextCode32[word]; -- } -- } --} -- --void --runMultiPassTile(Tensor& queries, -- Tensor& centroids, -- Tensor& pqCentroidsInnermostCode, -- NoTypeTensor<4, true>& codeDistances, -- Tensor& topQueryToCentroid, -- bool useFloat16Lookup, -- int bytesPerCode, -- int numSubQuantizers, -- int numSubQuantizerCodes, -- thrust::device_vector& listCodes, -- thrust::device_vector& listIndices, -- IndicesOptions indicesOptions, -- thrust::device_vector& listLengths, -- Tensor& thrustMem, -- Tensor& prefixSumOffsets, -- Tensor& allDistances, -- Tensor& heapDistances, -- Tensor& heapIndices, -- int k, -- faiss::MetricType metric, -- Tensor& outDistances, -- Tensor& outIndices, -- cudaStream_t stream) { -- // We only support two metrics at the moment -- FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT || -- metric == MetricType::METRIC_L2); -- -- bool l2Distance = metric == MetricType::METRIC_L2; -- -- // Calculate offset lengths, so we know where to write out -- // intermediate results -- runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, -- thrustMem, stream); -- -- // Calculate residual code distances, since this is without -- // precomputed codes -- runPQCodeDistances(pqCentroidsInnermostCode, -- queries, -- centroids, -- topQueryToCentroid, -- codeDistances, -- l2Distance, -- useFloat16Lookup, -- stream); -- -- // Convert all codes to a distance, and write out (distance, -- // index) values for all intermediate results -- { -- auto kThreadsPerBlock = 256; -- -- auto grid = dim3(topQueryToCentroid.getSize(1), -- topQueryToCentroid.getSize(0)); -- auto block = dim3(kThreadsPerBlock); -- -- // pq centroid distances -- auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float); -- -- smem *= numSubQuantizers * numSubQuantizerCodes; -- FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); -- --#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ -- do { \ -- auto codeDistancesT = codeDistances.toTensor(); \ -- \ -- pqScanNoPrecomputedMultiPass \ -- <<>>( \ -- queries, \ -- pqCentroidsInnermostCode, \ -- topQueryToCentroid, \ -- codeDistancesT, \ -- listCodes.data().get(), \ -- listLengths.data().get(), \ -- prefixSumOffsets, \ -- allDistances); \ -- } while (0) -- --#define RUN_PQ(NUM_SUB_Q) \ -- do { \ -- if (useFloat16Lookup) { \ -- RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ -- } else { \ -- RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ -- } \ -- } while (0) -- -- switch (bytesPerCode) { -- case 1: -- RUN_PQ(1); -- break; -- case 2: -- RUN_PQ(2); -- break; -- case 3: -- RUN_PQ(3); -- break; -- case 4: -- RUN_PQ(4); -- break; -- case 8: -- RUN_PQ(8); -- break; -- case 12: -- RUN_PQ(12); -- break; -- case 16: -- RUN_PQ(16); -- break; -- case 20: -- RUN_PQ(20); -- break; -- case 24: -- RUN_PQ(24); -- break; -- case 28: -- RUN_PQ(28); -- break; -- case 32: -- RUN_PQ(32); -- break; -- case 40: -- RUN_PQ(40); -- break; -- case 48: -- RUN_PQ(48); -- break; -- case 56: -- RUN_PQ(56); -- break; -- case 64: -- RUN_PQ(64); -- break; -- case 96: -- RUN_PQ(96); -- break; -- default: -- FAISS_ASSERT(false); -- break; -- } -- --#undef RUN_PQ --#undef RUN_PQ_OPT -- } -- -- CUDA_TEST_ERROR(); -- -- // k-select the output in chunks, to increase parallelism -- runPass1SelectLists(prefixSumOffsets, -- allDistances, -- topQueryToCentroid.getSize(1), -- k, -- !l2Distance, // L2 distance chooses smallest -- heapDistances, -- heapIndices, -- stream); -- -- // k-select final output -- auto flatHeapDistances = heapDistances.downcastInner<2>(); -- auto flatHeapIndices = heapIndices.downcastInner<2>(); -- -- runPass2SelectLists(flatHeapDistances, -- flatHeapIndices, -- listIndices, -- indicesOptions, -- prefixSumOffsets, -- topQueryToCentroid, -- k, -- !l2Distance, // L2 distance chooses smallest -- outDistances, -- outIndices, -- stream); --} -- --void runPQScanMultiPassNoPrecomputed(Tensor& queries, -- Tensor& centroids, -- Tensor& pqCentroidsInnermostCode, -- Tensor& topQueryToCentroid, -- bool useFloat16Lookup, -- int bytesPerCode, -- int numSubQuantizers, -- int numSubQuantizerCodes, -- thrust::device_vector& listCodes, -- thrust::device_vector& listIndices, -- IndicesOptions indicesOptions, -- thrust::device_vector& listLengths, -- int maxListLength, -- int k, -- faiss::MetricType metric, -- // output -- Tensor& outDistances, -- // output -- Tensor& outIndices, -- GpuResources* res) { -- constexpr int kMinQueryTileSize = 8; -- constexpr int kMaxQueryTileSize = 128; -- constexpr int kThrustMemSize = 16384; -- -- int nprobe = topQueryToCentroid.getSize(1); -- -- auto& mem = res->getMemoryManagerCurrentDevice(); -- auto stream = res->getDefaultStreamCurrentDevice(); -- -- // Make a reservation for Thrust to do its dirty work (global memory -- // cross-block reduction space); hopefully this is large enough. -- DeviceTensor thrustMem1( -- mem, {kThrustMemSize}, stream); -- DeviceTensor thrustMem2( -- mem, {kThrustMemSize}, stream); -- DeviceTensor* thrustMem[2] = -- {&thrustMem1, &thrustMem2}; -- -- // How much temporary storage is available? -- // If possible, we'd like to fit within the space available. -- size_t sizeAvailable = mem.getSizeAvailable(); -- -- // We run two passes of heap selection -- // This is the size of the first-level heap passes -- constexpr int kNProbeSplit = 8; -- int pass2Chunks = std::min(nprobe, kNProbeSplit); -- -- size_t sizeForFirstSelectPass = -- pass2Chunks * k * (sizeof(float) + sizeof(int)); -- -- // How much temporary storage we need per each query -- size_t sizePerQuery = -- 2 * // streams -- ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets -- nprobe * maxListLength * sizeof(float) + // allDistances -- // residual distances -- nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + -- sizeForFirstSelectPass); -- -- int queryTileSize = (int) (sizeAvailable / sizePerQuery); -- -- if (queryTileSize < kMinQueryTileSize) { -- queryTileSize = kMinQueryTileSize; -- } else if (queryTileSize > kMaxQueryTileSize) { -- queryTileSize = kMaxQueryTileSize; -- } -- -- // FIXME: we should adjust queryTileSize to deal with this, since -- // indexing is in int32 -- FAISS_ASSERT(queryTileSize * nprobe * maxListLength < -- std::numeric_limits::max()); -- -- // Temporary memory buffers -- // Make sure there is space prior to the start which will be 0, and -- // will handle the boundary condition without branches -- DeviceTensor prefixSumOffsetSpace1( -- mem, {queryTileSize * nprobe + 1}, stream); -- DeviceTensor prefixSumOffsetSpace2( -- mem, {queryTileSize * nprobe + 1}, stream); -- -- DeviceTensor prefixSumOffsets1( -- prefixSumOffsetSpace1[1].data(), -- {queryTileSize, nprobe}); -- DeviceTensor prefixSumOffsets2( -- prefixSumOffsetSpace2[1].data(), -- {queryTileSize, nprobe}); -- DeviceTensor* prefixSumOffsets[2] = -- {&prefixSumOffsets1, &prefixSumOffsets2}; -- -- // Make sure the element before prefixSumOffsets is 0, since we -- // depend upon simple, boundary-less indexing to get proper results -- CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), -- 0, -- sizeof(int), -- stream)); -- CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), -- 0, -- sizeof(int), -- stream)); -- -- int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float); -- -- int totalCodeDistancesSize = -- queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * -- codeDistanceTypeSize; -- -- DeviceTensor codeDistances1Mem( -- mem, {totalCodeDistancesSize}, stream); -- NoTypeTensor<4, true> codeDistances1( -- codeDistances1Mem.data(), -- codeDistanceTypeSize, -- {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); -- -- DeviceTensor codeDistances2Mem( -- mem, {totalCodeDistancesSize}, stream); -- NoTypeTensor<4, true> codeDistances2( -- codeDistances2Mem.data(), -- codeDistanceTypeSize, -- {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); -- -- NoTypeTensor<4, true>* codeDistances[2] = -- {&codeDistances1, &codeDistances2}; -- -- DeviceTensor allDistances1( -- mem, {queryTileSize * nprobe * maxListLength}, stream); -- DeviceTensor allDistances2( -- mem, {queryTileSize * nprobe * maxListLength}, stream); -- DeviceTensor* allDistances[2] = -- {&allDistances1, &allDistances2}; -- -- DeviceTensor heapDistances1( -- mem, {queryTileSize, pass2Chunks, k}, stream); -- DeviceTensor heapDistances2( -- mem, {queryTileSize, pass2Chunks, k}, stream); -- DeviceTensor* heapDistances[2] = -- {&heapDistances1, &heapDistances2}; -- -- DeviceTensor heapIndices1( -- mem, {queryTileSize, pass2Chunks, k}, stream); -- DeviceTensor heapIndices2( -- mem, {queryTileSize, pass2Chunks, k}, stream); -- DeviceTensor* heapIndices[2] = -- {&heapIndices1, &heapIndices2}; -- -- auto streams = res->getAlternateStreamsCurrentDevice(); -- streamWait(streams, {stream}); -- -- int curStream = 0; -- -- for (int query = 0; query < queries.getSize(0); query += queryTileSize) { -- int numQueriesInTile = -- std::min(queryTileSize, queries.getSize(0) - query); -- -- auto prefixSumOffsetsView = -- prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); -- -- auto codeDistancesView = -- codeDistances[curStream]->narrowOutermost(0, numQueriesInTile); -- auto coarseIndicesView = -- topQueryToCentroid.narrowOutermost(query, numQueriesInTile); -- auto queryView = -- queries.narrowOutermost(query, numQueriesInTile); -- -- auto heapDistancesView = -- heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); -- auto heapIndicesView = -- heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); -- -- auto outDistanceView = -- outDistances.narrowOutermost(query, numQueriesInTile); -- auto outIndicesView = -- outIndices.narrowOutermost(query, numQueriesInTile); -- -- runMultiPassTile(queryView, -- centroids, -- pqCentroidsInnermostCode, -- codeDistancesView, -- coarseIndicesView, -- useFloat16Lookup, -- bytesPerCode, -- numSubQuantizers, -- numSubQuantizerCodes, -- listCodes, -- listIndices, -- indicesOptions, -- listLengths, -- *thrustMem[curStream], -- prefixSumOffsetsView, -- *allDistances[curStream], -- heapDistancesView, -- heapIndicesView, -- k, -- metric, -- outDistanceView, -- outIndicesView, -- streams[curStream]); -- -- curStream = (curStream + 1) % 2; -- } -- -- streamWait({stream}, streams); --} -- --} } // namespace --- -2.26.2.windows.1 - diff --git a/recipe/patches/0004-CUDA-8-fixes.patch b/recipe/patches/0004-CUDA-8-fixes.patch deleted file mode 100644 index 430f063e..00000000 --- a/recipe/patches/0004-CUDA-8-fixes.patch +++ /dev/null @@ -1,117 +0,0 @@ -From 3a69eeeb6067ebd2e71e510e11535b59a4a7768b Mon Sep 17 00:00:00 2001 -From: Jeff Johnson -Date: Sun, 29 Mar 2020 20:07:01 -0700 -Subject: [PATCH 4/8] CUDA 8 fixes - ---- - gpu/impl/Distance.cu | 2 +- - gpu/impl/GeneralDistance.cuh | 15 ++++++++------- - gpu/test/TestGpuIndexFlat.cpp | 2 +- - gpu/utils/MathOperators.cuh | 2 +- - 4 files changed, 11 insertions(+), 10 deletions(-) - -diff --git a/gpu/impl/Distance.cu b/gpu/impl/Distance.cu -index 63ed60b..3a46c37 100644 ---- a/gpu/impl/Distance.cu -+++ b/gpu/impl/Distance.cu -@@ -64,7 +64,7 @@ void runDistance(bool computeL2, - if (centroids.numElements() == 0) { - thrust::fill(thrust::cuda::par.on(defaultStream), - outDistances.data(), outDistances.end(), -- Limits::getMax()); -+ Limits::getMax()); - - thrust::fill(thrust::cuda::par.on(defaultStream), - outIndices.data(), outIndices.end(), -diff --git a/gpu/impl/GeneralDistance.cuh b/gpu/impl/GeneralDistance.cuh -index 0ecdfa5..a2a447f 100644 ---- a/gpu/impl/GeneralDistance.cuh -+++ b/gpu/impl/GeneralDistance.cuh -@@ -9,6 +9,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -144,11 +145,11 @@ generalDistance(Tensor query, // m x k - - queryTileBase[threadIdx.x + i * kWarpSize] = - kInBounds ? -- queryBase[k] : (T) 0; //DistanceOp::kIdentityData; -+ queryBase[k] : ConvertTo::to(0); - - vecTileBase[threadIdx.x + i * kWarpSize] = - kInBounds ? -- vecBase[k] : (T) 0; // DistanceOp::kIdentityData; -+ vecBase[k] : ConvertTo::to(0); - } - - __syncthreads(); -@@ -179,11 +180,11 @@ generalDistance(Tensor query, // m x k - // Load query tile - queryTileBase[threadIdx.x] = - queryThreadInBounds ? -- queryBase[k] : (T) 0; // DistanceOp::kIdentityData; -+ queryBase[k] : ConvertTo::to(0); - - vecTileBase[threadIdx.x] = - vecThreadInBoundsLoad ? -- vecBase[k] : (T) 0; // DistanceOp::kIdentityData; -+ vecBase[k] : ConvertTo::to(0); - - __syncthreads(); - -@@ -205,11 +206,11 @@ generalDistance(Tensor query, // m x k - // Load query tile - queryTileBase[threadIdx.x] = - queryThreadInBounds && kInBounds ? -- queryBase[k] : (T) 0; // DistanceOp::kIdentityData; -+ queryBase[k] : ConvertTo::to(0); - - vecTileBase[threadIdx.x] = - vecThreadInBoundsLoad && kInBounds ? -- vecBase[k] : (T) 0; // DistanceOp::kIdentityData; -+ vecBase[k] : ConvertTo::to(0); - - __syncthreads(); - -@@ -278,7 +279,7 @@ void runGeneralDistance(GpuResources* resources, - if (centroids.numElements() == 0) { - thrust::fill(thrust::cuda::par.on(defaultStream), - outDistances.data(), outDistances.end(), -- Limits::getMax()); -+ Limits::getMax()); - - thrust::fill(thrust::cuda::par.on(defaultStream), - outIndices.data(), outIndices.end(), -diff --git a/gpu/test/TestGpuIndexFlat.cpp b/gpu/test/TestGpuIndexFlat.cpp -index 73cfe20..cabd7aa 100644 ---- a/gpu/test/TestGpuIndexFlat.cpp -+++ b/gpu/test/TestGpuIndexFlat.cpp -@@ -277,7 +277,7 @@ TEST(TestGpuIndexFlat, CopyFrom) { - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::GpuIndexFlatConfig config; -- config.device = 0; -+ config.device = device; - config.useFloat16 = false; - config.storeTransposed = false; - -diff --git a/gpu/utils/MathOperators.cuh b/gpu/utils/MathOperators.cuh -index 68ccbd5..020d220 100644 ---- a/gpu/utils/MathOperators.cuh -+++ b/gpu/utils/MathOperators.cuh -@@ -537,7 +537,7 @@ struct Math { - return h; - } - -- static inline __device__ half reduceAdd(Half8 v) { -+ static inline __device__ float reduceAdd(Half8 v) { - float x = Math::reduceAdd(v.a); - float y = Math::reduceAdd(v.b); - return x + y; --- -2.26.2.windows.1 - diff --git a/recipe/patches/0005-CUDA-10-fixes.patch b/recipe/patches/0005-CUDA-10-fixes.patch deleted file mode 100644 index 46b3d0dc..00000000 --- a/recipe/patches/0005-CUDA-10-fixes.patch +++ /dev/null @@ -1,69 +0,0 @@ -From d41dcff71a519f17d87a9648c75f8f0b7fe36399 Mon Sep 17 00:00:00 2001 -From: Jeff Johnson -Date: Mon, 30 Mar 2020 03:01:56 +0000 -Subject: [PATCH 5/8] CUDA 10 fixes - ---- - gpu/impl/PQCodeDistances-inl.cuh | 2 +- - gpu/utils/DeviceDefs.cuh | 2 ++ - gpu/utils/DeviceUtils.cu | 14 +++++++++++++- - 3 files changed, 16 insertions(+), 2 deletions(-) - -diff --git a/gpu/impl/PQCodeDistances-inl.cuh b/gpu/impl/PQCodeDistances-inl.cuh -index c3ef87f..fcb9a02 100644 ---- a/gpu/impl/PQCodeDistances-inl.cuh -+++ b/gpu/impl/PQCodeDistances-inl.cuh -@@ -27,7 +27,7 @@ template - __global__ void --__launch_bounds__(288, 4) -+__launch_bounds__(288, 3) - pqCodeDistances(Tensor queries, - int queriesPerBlock, - Tensor coarseCentroids, -diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh -index 4e859ec..89d3dda 100644 ---- a/gpu/utils/DeviceDefs.cuh -+++ b/gpu/utils/DeviceDefs.cuh -@@ -8,6 +8,8 @@ - - #pragma once - -+#include -+ - namespace faiss { namespace gpu { - - #ifdef __CUDA_ARCH__ -diff --git a/gpu/utils/DeviceUtils.cu b/gpu/utils/DeviceUtils.cu -index a8195c9..7bcc230 100644 ---- a/gpu/utils/DeviceUtils.cu -+++ b/gpu/utils/DeviceUtils.cu -@@ -111,11 +111,23 @@ int getDeviceForAddress(const void* p) { - FAISS_ASSERT_FMT(err == cudaErrorInvalidValue, - "unknown error %d", (int) err); - return -1; -- } else if (att.memoryType == cudaMemoryTypeHost) { -+ } -+ -+ // memoryType is deprecated for CUDA 10.0+ -+#if CUDA_VERSION < 10000 -+ if (att.memoryType == cudaMemoryTypeHost) { - return -1; - } else { - return att.device; - } -+#else -+ // FIXME: what to use for managed memory? -+ if (att.type == cudaMemoryTypeDevice) { -+ return att.device; -+ } else { -+ return -1; -+ } -+#endif - } - - bool getFullUnifiedMemSupport(int device) { --- -2.26.2.windows.1 - diff --git a/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch b/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch deleted file mode 100644 index 69fa6a8c..00000000 --- a/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch +++ /dev/null @@ -1,26 +0,0 @@ -From c33f254e219c318e25497a8e399ccbe43bd28687 Mon Sep 17 00:00:00 2001 -From: "H. Vetinari" -Date: Thu, 8 Oct 2020 12:45:09 +0200 -Subject: [PATCH 6/8] update util-guard for ampere; backport of - facebookresearch/faiss#1380 - ---- - gpu/utils/DeviceDefs.cuh | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh -index 89d3dda..bc0f9b5 100644 ---- a/gpu/utils/DeviceDefs.cuh -+++ b/gpu/utils/DeviceDefs.cuh -@@ -13,7 +13,7 @@ - namespace faiss { namespace gpu { - - #ifdef __CUDA_ARCH__ --#if __CUDA_ARCH__ <= 750 -+#if __CUDA_ARCH__ <= 800 - constexpr int kWarpSize = 32; - #else - #error Unknown __CUDA_ARCH__; please define parameters for compute capability --- -2.26.2.windows.1 - diff --git a/recipe/patches/0007-use-c-14.patch b/recipe/patches/0007-use-c-14.patch deleted file mode 100644 index 6d8baf31..00000000 --- a/recipe/patches/0007-use-c-14.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 5e576fb95233970295084e7d05b6818d7667af6a Mon Sep 17 00:00:00 2001 -From: "H. Vetinari" -Date: Thu, 8 Oct 2020 18:06:20 +0200 -Subject: [PATCH 7/8] use c++14 - ---- - configure.ac | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/configure.ac b/configure.ac -index 31b587b..5755fb6 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -16,7 +16,7 @@ AC_CONFIG_MACRO_DIR([acinclude]) - # Checks for programs. - AC_LANG(C++) - AC_PROG_CXX --AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory]) -+AX_CXX_COMPILE_STDCXX([14], [noext], [mandatory]) - AC_PROG_CPP - AC_PROG_MAKE_SET - AC_PROG_MKDIR_P --- -2.26.2.windows.1 - diff --git a/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch b/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch deleted file mode 100644 index e08c2726..00000000 --- a/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch +++ /dev/null @@ -1,33 +0,0 @@ -From fb6103d0d1f044c8b27450d93f1d1d6f82c2c1b4 Mon Sep 17 00:00:00 2001 -From: "H. Vetinari" -Date: Fri, 9 Oct 2020 18:24:57 +0200 -Subject: [PATCH 8/8] backport facebookresearch/faiss#1388 - ---- - gpu/StandardGpuResources.cpp | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/gpu/StandardGpuResources.cpp b/gpu/StandardGpuResources.cpp -index e564f8e..257fc2a 100644 ---- a/gpu/StandardGpuResources.cpp -+++ b/gpu/StandardGpuResources.cpp -@@ -249,11 +249,15 @@ StandardGpuResources::initializeForDevice(int device) { - blasHandles_[device] = blasHandle; - - // Enable tensor core support if available --#if CUDA_VERSION >= 9000 -+#if CUDA_VERSION >= 9000 && CUDA_VERSION < 11000 -+ // This flag was deprecated in CUDA 11 - if (getTensorCoreSupport(device)) { - cublasSetMathMode(blasHandle, CUBLAS_TENSOR_OP_MATH); - } - #endif -+#if CUDA_VERSION >= 11000 -+ cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); -+#endif - - FAISS_ASSERT(memory_.count(device) == 0); - --- -2.26.2.windows.1 -