diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml
index 1c4c1717..34ff6e86 100755
--- a/.azure-pipelines/azure-pipelines-linux.yml
+++ b/.azure-pipelines/azure-pipelines-linux.yml
@@ -11,28 +11,27 @@ jobs:
linux_64_cuda_compiler_version10.0:
CONFIG: linux_64_cuda_compiler_version10.0
UPLOAD_PACKAGES: 'True'
- DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.0
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.0
linux_64_cuda_compiler_version10.1:
CONFIG: linux_64_cuda_compiler_version10.1
UPLOAD_PACKAGES: 'True'
- DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.1
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.1
linux_64_cuda_compiler_version10.2:
CONFIG: linux_64_cuda_compiler_version10.2
UPLOAD_PACKAGES: 'True'
- DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.2
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.2
linux_64_cuda_compiler_version11.0:
CONFIG: linux_64_cuda_compiler_version11.0
UPLOAD_PACKAGES: 'True'
- DOCKER_IMAGE: condaforge/linux-anvil-cuda:11.0
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:11.0
linux_64_cuda_compiler_version9.2:
CONFIG: linux_64_cuda_compiler_version9.2
UPLOAD_PACKAGES: 'True'
- DOCKER_IMAGE: condaforge/linux-anvil-cuda:9.2
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:9.2
linux_64_cuda_compiler_versionNone:
CONFIG: linux_64_cuda_compiler_versionNone
UPLOAD_PACKAGES: 'True'
- DOCKER_IMAGE: condaforge/linux-anvil-comp7
- maxParallel: 8
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-comp7
timeoutInMinutes: 360
steps:
diff --git a/.azure-pipelines/azure-pipelines-osx.yml b/.azure-pipelines/azure-pipelines-osx.yml
index 568ec85f..0c892401 100755
--- a/.azure-pipelines/azure-pipelines-osx.yml
+++ b/.azure-pipelines/azure-pipelines-osx.yml
@@ -11,7 +11,6 @@ jobs:
osx_64_:
CONFIG: osx_64_
UPLOAD_PACKAGES: 'True'
- maxParallel: 8
timeoutInMinutes: 360
steps:
diff --git a/.azure-pipelines/azure-pipelines-win.yml b/.azure-pipelines/azure-pipelines-win.yml
new file mode 100755
index 00000000..1fd8ea12
--- /dev/null
+++ b/.azure-pipelines/azure-pipelines-win.yml
@@ -0,0 +1,110 @@
+# This file was generated automatically from conda-smithy. To update this configuration,
+# update the conda-forge.yml and/or the recipe/meta.yaml.
+# -*- mode: yaml -*-
+
+jobs:
+- job: win
+ pool:
+ vmImage: vs2017-win2016
+ strategy:
+ matrix:
+ win_64_:
+ CONFIG: win_64_
+ UPLOAD_PACKAGES: 'True'
+ timeoutInMinutes: 360
+ variables:
+ CONDA_BLD_PATH: D:\\bld\\
+
+ steps:
+ - script: |
+ choco install vcpython27 -fdv -y --debug
+ condition: contains(variables['CONFIG'], 'vs2008')
+ displayName: Install vcpython27.msi (if needed)
+
+ # Cygwin's git breaks conda-build. (See https://github.com/conda-forge/conda-smithy-feedstock/pull/2.)
+ # - script: rmdir C:\cygwin /s /q
+ # continueOnError: true
+
+ - powershell: |
+ Set-PSDebug -Trace 1
+
+ $batchcontent = @"
+ ECHO ON
+ SET vcpython=C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0
+
+ DIR "%vcpython%"
+
+ CALL "%vcpython%\vcvarsall.bat" %*
+ "@
+
+ $batchDir = "C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0\VC"
+ $batchPath = "$batchDir" + "\vcvarsall.bat"
+ New-Item -Path $batchPath -ItemType "file" -Force
+
+ Set-Content -Value $batchcontent -Path $batchPath
+
+ Get-ChildItem -Path $batchDir
+
+ Get-ChildItem -Path ($batchDir + '\..')
+
+ condition: contains(variables['CONFIG'], 'vs2008')
+ displayName: Patch vs2008 (if needed)
+
+ - task: CondaEnvironment@1
+ inputs:
+ packageSpecs: 'python=3.6 conda-build conda "conda-forge-ci-setup=3" pip' # Optional
+ installOptions: "-c conda-forge"
+ updateConda: true
+ displayName: Install conda-build and activate environment
+
+ - script: set PYTHONUNBUFFERED=1
+ displayName: Set PYTHONUNBUFFERED
+
+ # Configure the VM
+ - script: |
+ call activate base
+ setup_conda_rc .\ ".\recipe" .\.ci_support\%CONFIG%.yaml
+ displayName: conda-forge CI setup
+
+ # Configure the VM.
+ - script: |
+ set "CI=azure"
+ call activate base
+ run_conda_forge_build_setup
+ displayName: conda-forge build setup
+
+
+ # Special cased version setting some more things!
+ - script: |
+ call activate base
+ conda.exe build "recipe" -m .ci_support\%CONFIG%.yaml
+ displayName: Build recipe (vs2008)
+ env:
+ VS90COMNTOOLS: "C:\\Program Files (x86)\\Common Files\\Microsoft\\Visual C++ for Python\\9.0\\VC\\bin"
+ PYTHONUNBUFFERED: 1
+ condition: contains(variables['CONFIG'], 'vs2008')
+
+ - script: |
+ call activate base
+ conda.exe build "recipe" -m .ci_support\%CONFIG%.yaml
+ displayName: Build recipe
+ env:
+ PYTHONUNBUFFERED: 1
+ condition: not(contains(variables['CONFIG'], 'vs2008'))
+ - script: |
+ set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%"
+ call activate base
+ validate_recipe_outputs "%FEEDSTOCK_NAME%"
+ displayName: Validate Recipe Outputs
+
+ - script: |
+ set "GIT_BRANCH=%BUILD_SOURCEBRANCHNAME%"
+ set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%"
+ call activate base
+ upload_package --validate --feedstock-name="%FEEDSTOCK_NAME%" .\ ".\recipe" .ci_support\%CONFIG%.yaml
+ displayName: Upload package
+ env:
+ BINSTAR_TOKEN: $(BINSTAR_TOKEN)
+ FEEDSTOCK_TOKEN: $(FEEDSTOCK_TOKEN)
+ STAGING_BINSTAR_TOKEN: $(STAGING_BINSTAR_TOKEN)
+ condition: and(succeeded(), not(eq(variables['UPLOAD_PACKAGES'], 'False')))
\ No newline at end of file
diff --git a/.ci_support/linux_64_cuda_compiler_version10.0.yaml b/.ci_support/linux_64_cuda_compiler_version10.0.yaml
index 507da21a..a2227db8 100644
--- a/.ci_support/linux_64_cuda_compiler_version10.0.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version10.0.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
channel_sources:
- conda-forge,defaults
channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
cxx_compiler_version:
- '7'
docker_image:
-- condaforge/linux-anvil-cuda:10.0
+- quay.io/condaforge/linux-anvil-cuda:10.0
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,8 +31,11 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- linux-64
zip_keys:
- - cuda_compiler_version
- docker_image
+- - python
+ - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version10.1.yaml b/.ci_support/linux_64_cuda_compiler_version10.1.yaml
index d9dfec45..cfb515bb 100644
--- a/.ci_support/linux_64_cuda_compiler_version10.1.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version10.1.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
channel_sources:
- conda-forge,defaults
channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
cxx_compiler_version:
- '7'
docker_image:
-- condaforge/linux-anvil-cuda:10.1
+- quay.io/condaforge/linux-anvil-cuda:10.1
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,8 +31,11 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- linux-64
zip_keys:
- - cuda_compiler_version
- docker_image
+- - python
+ - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version10.2.yaml b/.ci_support/linux_64_cuda_compiler_version10.2.yaml
index 2d051101..59928208 100644
--- a/.ci_support/linux_64_cuda_compiler_version10.2.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version10.2.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
channel_sources:
- conda-forge,defaults
channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
cxx_compiler_version:
- '7'
docker_image:
-- condaforge/linux-anvil-cuda:10.2
+- quay.io/condaforge/linux-anvil-cuda:10.2
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,8 +31,11 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- linux-64
zip_keys:
- - cuda_compiler_version
- docker_image
+- - python
+ - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version11.0.yaml b/.ci_support/linux_64_cuda_compiler_version11.0.yaml
index 14414976..681c7001 100644
--- a/.ci_support/linux_64_cuda_compiler_version11.0.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version11.0.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
channel_sources:
- conda-forge,defaults
channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
cxx_compiler_version:
- '7'
docker_image:
-- condaforge/linux-anvil-cuda:11.0
+- quay.io/condaforge/linux-anvil-cuda:11.0
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,8 +31,11 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- linux-64
zip_keys:
- - cuda_compiler_version
- docker_image
+- - python
+ - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version9.2.yaml b/.ci_support/linux_64_cuda_compiler_version9.2.yaml
index 7e2594ef..10b31134 100644
--- a/.ci_support/linux_64_cuda_compiler_version9.2.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version9.2.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
channel_sources:
- conda-forge,defaults
channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
cxx_compiler_version:
- '7'
docker_image:
-- condaforge/linux-anvil-cuda:9.2
+- quay.io/condaforge/linux-anvil-cuda:9.2
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,8 +31,11 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- linux-64
zip_keys:
- - cuda_compiler_version
- docker_image
+- - python
+ - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_versionNone.yaml b/.ci_support/linux_64_cuda_compiler_versionNone.yaml
index 5c699f1c..9cc1d4cc 100644
--- a/.ci_support/linux_64_cuda_compiler_versionNone.yaml
+++ b/.ci_support/linux_64_cuda_compiler_versionNone.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
channel_sources:
- conda-forge,defaults
channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
cxx_compiler_version:
- '7'
docker_image:
-- condaforge/linux-anvil-comp7
+- quay.io/condaforge/linux-anvil-comp7
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,8 +31,11 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- linux-64
zip_keys:
- - cuda_compiler_version
- docker_image
+- - python
+ - numpy
diff --git a/.ci_support/migrations/python39.yaml b/.ci_support/migrations/python39.yaml
new file mode 100644
index 00000000..1c915c15
--- /dev/null
+++ b/.ci_support/migrations/python39.yaml
@@ -0,0 +1,27 @@
+migrator_ts: 1602104489
+__migrator:
+ migration_number: 2
+ operation: key_add
+ primary_key: python
+ ordering:
+ python:
+ - 3.6.* *_cpython
+ - 3.7.* *_cpython
+ - 3.8.* *_cpython
+ - 3.9.* *_cpython # new entry
+ - 3.6.* *_73_pypy
+ paused: false
+ longterm: True
+ pr_limit: 50
+ exclude:
+ # this shouldn't attempt to modify the python feedstocks
+ - python
+ - pypy3.6
+ - pypy-meta
+python:
+ - 3.9.* *_cpython
+# additional entries to add for zip_keys
+numpy:
+ - 1.19
+python_impl:
+ - cpython
diff --git a/.ci_support/osx_64_.yaml b/.ci_support/osx_64_.yaml
index adcbfc34..6768d9d9 100644
--- a/.ci_support/osx_64_.yaml
+++ b/.ci_support/osx_64_.yaml
@@ -13,9 +13,14 @@ cxx_compiler_version:
libblas:
- 3.8 *netlib
liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
macos_machine:
- x86_64-apple-darwin13.4.0
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
pin_run_as_build:
python:
min_pin: x.x
@@ -24,5 +29,9 @@ python:
- 3.6.* *_cpython
- 3.7.* *_cpython
- 3.8.* *_cpython
+- 3.9.* *_cpython
target_platform:
- osx-64
+zip_keys:
+- - python
+ - numpy
diff --git a/.ci_support/win_64_.yaml b/.ci_support/win_64_.yaml
new file mode 100644
index 00000000..642d5c24
--- /dev/null
+++ b/.ci_support/win_64_.yaml
@@ -0,0 +1,31 @@
+channel_sources:
+- conda-forge,defaults
+channel_targets:
+- conda-forge main
+cuda_compiler_version:
+- None
+cxx_compiler:
+- vs2017
+libblas:
+- 3.8 *netlib
+liblapack:
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
+pin_run_as_build:
+ python:
+ min_pin: x.x
+ max_pin: x.x
+python:
+- 3.6.* *_cpython
+- 3.7.* *_cpython
+- 3.8.* *_cpython
+- 3.9.* *_cpython
+target_platform:
+- win-64
+zip_keys:
+- - python
+ - numpy
diff --git a/README.md b/README.md
index 511432d1..093aa804 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,13 @@ Current build status
+
+ win_64 |
+
+
+
+
+ |
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 33a441c1..6b346f50 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -4,4 +4,5 @@
jobs:
- template: ./.azure-pipelines/azure-pipelines-linux.yml
+ - template: ./.azure-pipelines/azure-pipelines-win.yml
- template: ./.azure-pipelines/azure-pipelines-osx.yml
\ No newline at end of file
diff --git a/build-locally.py b/build-locally.py
index d92f4edc..3453cfe6 100755
--- a/build-locally.py
+++ b/build-locally.py
@@ -61,7 +61,7 @@ def main(args=None):
help="Setup debug environment using `conda debug`",
)
p.add_argument(
- "--output-id", help="If running debug, specifiy the output to setup."
+ "--output-id", help="If running debug, specify the output to setup."
)
ns = p.parse_args(args=args)
diff --git a/recipe/build-lib.bat b/recipe/build-lib.bat
index 640e266b..9b97b2e6 100644
--- a/recipe/build-lib.bat
+++ b/recipe/build-lib.bat
@@ -1,9 +1,19 @@
-@echo off
+:: Build faiss.dll
+cmake -B _build ^
+ -DBUILD_SHARED_LIBS=ON ^
+ -DBUILD_TESTING=OFF ^
+ -DFAISS_ENABLE_GPU=OFF ^
+ -DFAISS_ENABLE_PYTHON=OFF ^
+ -DCMAKE_BUILD_TYPE=Release ^
+ -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^
+ -DCMAKE_INSTALL_BINDIR="%LIBRARY_BIN%" ^
+ -DCMAKE_INSTALL_LIBDIR="%LIBRARY_LIB%" ^
+ -DCMAKE_INSTALL_INCLUDEDIR="%LIBRARY_INC%" ^
+ .
+if %ERRORLEVEL% neq 0 exit 1
-:: there's a symbolic link from faiss/ to ./ in the upstream repo that does not work with windows;
-:: delete symlink & copy entire source recursively (= "/S") to folder faiss to work around it
-rmdir faiss
-robocopy . faiss /S
+cmake --build _build --config Release -j %CPU_COUNT%
+if %ERRORLEVEL% neq 0 exit 1
-call %BUILD_PREFIX%\Library\bin\run_autotools_clang_conda_build.bat build-lib.sh
+cmake --install _build --config Release --prefix %PREFIX%
if %ERRORLEVEL% neq 0 exit 1
diff --git a/recipe/build-lib.sh b/recipe/build-lib.sh
index be79f863..0fb8bc2d 100644
--- a/recipe/build-lib.sh
+++ b/recipe/build-lib.sh
@@ -1,7 +1,9 @@
# function for facilitate version comparison; cf. https://stackoverflow.com/a/37939589
function version2int { echo "$@" | awk -F. '{ printf("%d%02d\n", $1, $2); }'; }
-CUDA_CONFIG_ARG=""
+set -e
+
+declare -a CUDA_CONFIG_ARGS
if [ ${cuda_compiler_version} != "None" ]; then
# for documentation see e.g.
# docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#building-for-maximum-compatibility
@@ -22,27 +24,44 @@ if [ ${cuda_compiler_version} != "None" ]; then
LATEST_ARCH=80
fi
for arch in "${ARCHES[@]}"; do
- CUDA_ARCH="${CUDA_ARCH} -gencode=arch=compute_${arch},code=sm_${arch}";
+ CMAKE_CUDA_ARCHS="${CMAKE_CUDA_ARCHS+${CMAKE_CUDA_ARCHS};}${arch}-virtual"
done
# to support PTX JIT compilation; see first link above or cf.
# devblogs.nvidia.com/cuda-pro-tip-understand-fat-binaries-jit-caching
- CUDA_ARCH="${CUDA_ARCH} -gencode=arch=compute_${LATEST_ARCH},code=compute_${LATEST_ARCH}"
+ # see also cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html
+ CMAKE_CUDA_ARCHS="${CMAKE_CUDA_ARCHS+${CMAKE_CUDA_ARCHS};}${LATEST_ARCH}-real"
+
+ FAISS_ENABLE_GPU="ON"
+ CUDA_CONFIG_ARGS+=(
+ -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHS}"
+ )
+ # cmake does not generate output for the call below; echo some info
+ echo "Set up extra cmake-args: CUDA_CONFIG_ARGS=${CUDA_CONFIG_ARGS+"${CUDA_CONFIG_ARGS[@]}"}"
- CUDA_CONFIG_ARG="--with-cuda=${CUDA_HOME}"
+ # Acc. to https://cmake.org/cmake/help/v3.19/module/FindCUDAToolkit.html#search-behavior
+  # CUDA toolkit is searched for relative to `nvcc` first before considering
+ # "-DCUDAToolkit_ROOT=${CUDA_HOME}". We have multiple workarounds:
+ # - Add symlinks from ${CUDA_HOME} to ${BUILD_PREFIX}
+ # - Add ${CUDA_HOME}/bin to ${PATH}
+ # - Remove `nvcc` wrapper in ${BUILD_PREFIX} so that `nvcc` from ${CUDA_HOME} gets found.
+ # TODO: Fix this in nvcc-feedstock or cmake-feedstock.
+ # NOTE: It's okay for us to not use the wrapper since CMake adds -ccbin itself.
+ rm "${BUILD_PREFIX}/bin/nvcc"
else
- CUDA_CONFIG_ARG="--without-cuda"
+ FAISS_ENABLE_GPU="OFF"
fi
-# need to regenerate ./configure for patch to configure.ac to take effect
-autoreconf -i -f
-
# Build vanilla version (no avx)
-./configure --prefix=${PREFIX} --exec-prefix=${PREFIX} \
- --with-blas=-lblas --with-lapack=-llapack \
- ${CUDA_CONFIG_ARG} --with-cuda-arch="${CUDA_ARCH}" || exit 1
-
-# make sets SHAREDEXT correctly for linux/osx
-make install
+cmake -B _build_generic \
+ -DBUILD_SHARED_LIBS=ON \
+ -DBUILD_TESTING=ON \
+ -DFAISS_ENABLE_PYTHON=OFF \
+ -DFAISS_ENABLE_GPU=${FAISS_ENABLE_GPU} \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ ${CUDA_CONFIG_ARGS+"${CUDA_CONFIG_ARGS[@]}"} \
+ --verbose \
+ .
-# make builds libfaiss.a & libfaiss.so; we only want the latter
-rm ${PREFIX}/lib/libfaiss.a
+cmake --build _build_generic -j $CPU_COUNT
+cmake --install _build_generic --prefix $PREFIX
diff --git a/recipe/build-pkg.bat b/recipe/build-pkg.bat
index 5608d2e8..0af669a0 100644
--- a/recipe/build-pkg.bat
+++ b/recipe/build-pkg.bat
@@ -1,9 +1,19 @@
-@echo off
+:: Build vanilla version (no avx2).
+:: Do not use the Python3_* variants for cmake
+cmake -B _build_python ^
+ -DFAISS_ENABLE_GPU=OFF ^
+ -DCMAKE_BUILD_TYPE=Release ^
+ -DPython_EXECUTABLE="%PYTHON%" ^
+ faiss/python
+if %ERRORLEVEL% neq 0 exit 1
-:: there's a symbolic link from faiss/ to ./ in the upstream repo that does not work with windows;
-:: delete symlink & copy entire source recursively (= "/S") to folder faiss to work around it
-rmdir faiss
-robocopy . faiss /S
+cmake --build _build_python --config Release -j %CPU_COUNT%
+if %ERRORLEVEL% neq 0 exit 1
-call %BUILD_PREFIX%\Library\bin\run_autotools_clang_conda_build.bat build-pkg.sh
+:: Build actual python module.
+pushd _build_python
+%PYTHON% setup.py install --single-version-externally-managed --record=record.txt --prefix=%PREFIX%
if %ERRORLEVEL% neq 0 exit 1
+popd
+:: clean up cmake-cache between builds
+rd /S /Q _build_python
diff --git a/recipe/build-pkg.sh b/recipe/build-pkg.sh
index 84dd2183..eb44cc51 100644
--- a/recipe/build-pkg.sh
+++ b/recipe/build-pkg.sh
@@ -1,15 +1,35 @@
-CUDA_CONFIG_ARG=""
+set -e
+
+FAISS_ENABLE_GPU=""
if [ ${cuda_compiler_version} != "None" ]; then
- CUDA_CONFIG_ARG="--with-cuda=${CUDA_HOME}"
+ FAISS_ENABLE_GPU="ON"
+ # Acc. to https://cmake.org/cmake/help/v3.19/module/FindCUDAToolkit.html#search-behavior
+  # CUDA toolkit is searched for relative to `nvcc` first before considering
+ # "-DCUDAToolkit_ROOT=${CUDA_HOME}". We have multiple workarounds:
+ # - Add symlinks from ${CUDA_HOME} to ${BUILD_PREFIX}
+ # - Add ${CUDA_HOME}/bin to ${PATH}
+ # - Remove `nvcc` wrapper in ${BUILD_PREFIX} so that `nvcc` from ${CUDA_HOME} gets found.
+ # TODO: Fix this in nvcc-feedstock or cmake-feedstock.
+ # NOTE: It's okay for us to not use the wrapper since CMake adds -ccbin itself.
+ rm "${BUILD_PREFIX}/bin/nvcc"
else
- CUDA_CONFIG_ARG="--without-cuda"
+ FAISS_ENABLE_GPU="OFF"
fi
-# Build vanilla version (no avx)
-./configure --with-blas=-lblas --with-lapack=-llapack ${CUDA_CONFIG_ARG}
-
-make -C python build
+# Build vanilla version (no avx2), see build-lib.sh
+# Do not use the Python3_* variants for cmake
+cmake -B _build_python \
+  -Dfaiss_ROOT=${PREFIX} \
+ -DFAISS_ENABLE_GPU=${FAISS_ENABLE_GPU} \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DPython_EXECUTABLE="${PYTHON}" \
+ faiss/python
-cd python
+cmake --build _build_python -j $CPU_COUNT
-$PYTHON -m pip install . -vv
+# Build actual python module.
+pushd _build_python
+$PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX
+popd
+# clean up cmake-cache between builds
+rm -r _build_python
diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
index 5700830c..274fe175 100644
--- a/recipe/conda_build_config.yaml
+++ b/recipe/conda_build_config.yaml
@@ -1,2 +1,9 @@
cuda_compiler_version:
- None # [not linux]
+
+cxx_compiler_version: # [unix]
+ # need to downgrade on osx due to a bug that breaks the test suite
+ - 10 # [osx]
+ # need to downgrade on linux due to nvcc 9.2 not being able to deal with gcc>7,
+ # and conda-build not being able to zip this with cuda_compiler_version
+ - 7 # [linux]
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 0f6a0d74..153a6abe 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,24 +1,28 @@
-{% set version = "1.6.3" %}
-{% set number = 3 %}
+{% set version = "1.6.4" %}
+{% set number = 0 %}
# see github.com/conda-forge/conda-forge.github.io/issues/1059 for naming discussion
{% set faiss_proc_type = "cuda" if cuda_compiler_version != "None" else "cpu" %}
-# headers for upstream-folders './*.h', 'impl/*.h', 'utils/*.h'
-# gpu adds headers in 'gpu/*.h', 'gpu/impl/*.(cu)?h', 'gpu/utils/*.(cu)?h'
+# headers for upstream-folders 'faiss/*.h', 'faiss/{impl,utils}/*.h',
+# see https://github.com/facebookresearch/faiss/blob/v1.6.4/faiss/CMakeLists.txt
+# gpu adds headers in 'faiss/gpu/*.h', 'faiss/gpu/{impl,utils}/*.(cu)?h'
{% set headers = [
- 'AutoTune.h', 'clone_index.h', 'Clustering.h', 'DirectMap.h', 'Index.h', 'Index2Layer.h',
- 'IndexBinary.h', 'IndexBinaryFlat.h', 'IndexBinaryFromFloat.h', 'IndexBinaryHash.h',
- 'IndexBinaryHNSW.h', 'IndexBinaryIVF.h', 'IndexFlat.h', 'IndexHNSW.h', 'IndexIVF.h',
- 'IndexIVFFlat.h', 'IndexIVFPQ.h', 'IndexIVFPQR.h', 'IndexIVFSpectralHash.h', 'IndexLattice.h',
- 'IndexLSH.h', 'IndexPQ.h', 'IndexPreTransform.h', 'IndexReplicas.h', 'IndexScalarQuantizer.h',
- 'IndexShards.h', 'index_factory.h', 'index_io.h', 'InvertedLists.h', 'IVFlib.h',
- 'MatrixStats.h', 'MetaIndexes.h', 'MetricType.h', 'OnDiskInvertedLists.h', 'VectorTransform.h',
+ 'AutoTune.h', 'Clustering.h', 'DirectMap.h', 'IVFlib.h', 'Index.h', 'Index2Layer.h',
+ 'IndexBinary.h', 'IndexBinaryFlat.h', 'IndexBinaryFromFloat.h', 'IndexBinaryHNSW.h',
+ 'IndexBinaryHash.h', 'IndexBinaryIVF.h', 'IndexFlat.h', 'IndexHNSW.h', 'IndexIVF.h',
+ 'IndexIVFFlat.h', 'IndexIVFPQ.h', 'IndexIVFPQR.h', 'IndexIVFSpectralHash.h', 'IndexLSH.h',
+ 'IndexLattice.h', 'IndexPQ.h', 'IndexPreTransform.h', 'IndexReplicas.h',
+ 'IndexScalarQuantizer.h', 'IndexShards.h', 'InvertedLists.h', 'MatrixStats.h',
+ 'MetaIndexes.h', 'MetricType.h', 'VectorTransform.h', 'clone_index.h', 'index_factory.h',
+ 'index_io.h',
'impl/AuxIndexStructures.h', 'impl/FaissAssert.h', 'impl/FaissException.h', 'impl/HNSW.h',
- 'impl/io.h', 'impl/lattice_Zn.h', 'impl/PolysemousTraining.h', 'impl/ProductQuantizer-inl.h',
- 'impl/ProductQuantizer.h', 'impl/ScalarQuantizer.h', 'impl/ThreadedIndex-inl.h',
- 'impl/ThreadedIndex.h',
- 'utils/distances.h', 'utils/extra_distances.h', 'utils/hamming-inl.h', 'utils/hamming.h',
- 'utils/Heap.h', 'utils/random.h', 'utils/utils.h', 'utils/WorkerThread.h'
+ 'impl/PolysemousTraining.h', 'impl/ProductQuantizer-inl.h', 'impl/ProductQuantizer.h',
+ 'impl/ScalarQuantizer.h', 'impl/ThreadedIndex-inl.h', 'impl/ThreadedIndex.h', 'impl/io.h',
+ 'impl/io_macros.h', 'impl/lattice_Zn.h', 'impl/platform_macros.h',
+ 'utils/Heap.h', 'utils/WorkerThread.h', 'utils/distances.h', 'utils/extra_distances.h',
+ 'utils/hamming-inl.h', 'utils/hamming.h', 'utils/random.h', 'utils/utils.h'
+] + (not win) * [
+ 'OnDiskInvertedLists.h'
] + (cuda_compiler_version != "None") * [
'gpu/GpuAutoTune.h', 'gpu/GpuCloner.h', 'gpu/GpuClonerOptions.h', 'gpu/GpuDistance.h',
'gpu/GpuFaissAssert.h', 'gpu/GpuIndex.h', 'gpu/GpuIndexBinaryFlat.h', 'gpu/GpuIndexFlat.h',
@@ -30,12 +34,12 @@
'gpu/impl/FlatIndex.cuh', 'gpu/impl/GeneralDistance.cuh', 'gpu/impl/GpuScalarQuantizer.cuh',
'gpu/impl/IVFAppend.cuh', 'gpu/impl/IVFBase.cuh', 'gpu/impl/IVFFlat.cuh',
'gpu/impl/IVFFlatScan.cuh', 'gpu/impl/IVFPQ.cuh', 'gpu/impl/IVFUtils.cuh',
- 'gpu/impl/L2Norm.cuh', 'gpu/impl/L2Select.cuh', 'gpu/impl/Metrics.cuh',
+ 'gpu/impl/L2Norm.cuh', 'gpu/impl/L2Select.cuh',
'gpu/impl/PQCodeDistances-inl.cuh', 'gpu/impl/PQCodeDistances.cuh', 'gpu/impl/PQCodeLoad.cuh',
'gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh', 'gpu/impl/PQScanMultiPassNoPrecomputed.cuh',
'gpu/impl/PQScanMultiPassPrecomputed.cuh', 'gpu/impl/VectorResidual.cuh',
- 'gpu/utils/DeviceMemory.h', 'gpu/utils/DeviceUtils.h', 'gpu/utils/MemorySpace.h',
- 'gpu/utils/StackDeviceMemory.h', 'gpu/utils/StaticUtils.h', 'gpu/utils/Timer.h',
+ 'gpu/utils/DeviceUtils.h', 'gpu/utils/StackDeviceMemory.h', 'gpu/utils/StaticUtils.h',
+ 'gpu/utils/Timer.h',
'gpu/utils/BlockSelectKernel.cuh', 'gpu/utils/Comparators.cuh',
'gpu/utils/ConversionOperators.cuh', 'gpu/utils/CopyUtils.cuh', 'gpu/utils/DeviceDefs.cuh',
'gpu/utils/DeviceTensor-inl.cuh', 'gpu/utils/DeviceTensor.cuh', 'gpu/utils/DeviceVector.cuh',
@@ -55,26 +59,14 @@ package:
source:
url: https://github.com/facebookresearch/faiss/archive/v{{ version }}.tar.gz
- sha256: e1a41c159f0b896975fbb133e0240a233af5c9286c09a28fde6aefff5336e542
+ sha256: 6ed6311415ccb70ea1afe82bafc24e9f3aa60c06c9d030bfc23ee4173d59cc99
patches:
- - patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch
- - patches/0002-remove-fPIC-flag-for-msvc-compat.patch # [win]
- # add upstream commits (64dd9884, e05f773f, ba061ff82) at the recommendation
- # of the maintainers. These can be removed upon the next release.
- - patches/0003-remove-deleted-files-from-template-change.patch
- - patches/0004-CUDA-8-fixes.patch
- - patches/0005-CUDA-10-fixes.patch
- # backport of facebookresearch/faiss#1380 to be able to build for Ampere
- - patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch
- # use c++14 (in autoconf macro AX_CXX_COMPILE_STDCXX)
- - patches/0007-use-c-14.patch
- # backport of facebookresearch/faiss#1388, see conda-forge/faiss-split-feedstock#14
- - patches/0008-backport-facebookresearch-faiss-1388.patch
+ # backport of facebookresearch/faiss#1541, can be dropped for ver>1.6.5
+ - patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch
+ - patches/0002-use-c-14.patch
build:
number: {{ number }}
- # TODO: figure out compiler errors on windows
- skip: true # [win]
# GPU version only for linux
skip: true # [(win or osx) and cuda_compiler_version != "None"]
@@ -108,41 +100,32 @@ outputs:
build:
- {{ compiler('cxx') }}
- {{ compiler('cuda') }} # [cuda_compiler_version != "None"]
- - autoconf # [not win]
- - automake # [not win]
- - libtool # [not win]
- cmake
- libgomp # [linux]
- - llvm-openmp # [osx or win]
- - autotools_clang_conda # [win]
+ - llvm-openmp # [osx]
host:
- libblas
- liblapack
- run:
- - libblas
- - liblapack
run_constrained:
- faiss-proc * {{ faiss_proc_type }}
test:
commands:
# shared
- - test -f $PREFIX/lib/libfaiss.so # [linux]
- - test -f $PREFIX/lib/libfaiss.dylib # [osx]
- - if not exist %PREFIX%\\Library\\bin\\libfaiss.dll exit 1 # [win]
+ - test -f $PREFIX/lib/libfaiss.so # [linux]
+ - test -f $PREFIX/lib/libfaiss.dylib # [osx]
+ - if not exist %LIBRARY_BIN%\faiss.dll exit 1 # [win]
+ # On windows, faiss.lib is an "import library";
+ # Deleting it breaks the faiss-builds
+ - if not exist %LIBRARY_LIB%\faiss.lib exit 1 # [win]
# absence of static libraries
- - test ! -f $PREFIX/lib/libfaiss.a # [not win]
- - if exist %PREFIX%\\Library\\lib\\libfaiss.lib exit 1 # [win]
-
- # conda tools
- - conda inspect linkages -p $PREFIX $PKG_NAME # [not win]
- - conda inspect objects -p $PREFIX $PKG_NAME # [osx]
+ - test ! -f $PREFIX/lib/libfaiss.a # [not win]
# headers
{% for each_header in headers %}
- - test -f $PREFIX/include/faiss/{{ each_header }} # [unix]
- - if not exist %LIBRARY_INC%\\faiss\\{{ "\\".join(each_header.split("/")) }} exit 1 # [win]
+ - test -f $PREFIX/include/faiss/{{ each_header }} || (echo "{{ each_header }} not found" && exit 1) # [unix]
+ - if not exist %LIBRARY_INC%\faiss\{{ "\\".join(each_header.split("/")) }} exit 1 # [win]
{% endfor %}
- name: faiss
@@ -156,17 +139,17 @@ outputs:
- {{ compiler('cuda') }} # [cuda_compiler_version != "None"]
- swig
- cmake
- - autotools_clang_conda # [win]
+ - libgomp # [linux]
+ - llvm-openmp # [osx]
host:
- python
- pip
- - numpy =1.14.*
+ - numpy
- libfaiss ={{ version }}=*_{{ faiss_proc_type }}
- libblas
- liblapack
- - libgomp # [linux]
- - llvm-openmp # [osx or win]
run:
+ - python
- libfaiss ={{ version }}=*_{{ faiss_proc_type }}
- {{ pin_compatible('numpy') }}
run_constrained:
@@ -182,42 +165,33 @@ outputs:
- scipy
source_files:
- tests/
- - conda/faiss/run_test.py
- - conda/faiss-gpu/run_test.py
imports:
- faiss
commands:
- - python conda/faiss/run_test.py
- # running the following test requires an actual GPU device, which is not available in CI
- # - python conda/faiss-gpu/run_test.py # [cuda_compiler_version != "None"]
- python -m unittest discover tests
+ # running the following test requires an actual GPU device, which is not available in CI
+ # - python -m unittest discover faiss/gpu/test/
# for compatibility with (& ease of migration from) existing packages in the pytorch channel
- name: faiss-cpu
build:
- string: "py{{ CONDA_PY }}_{{ number }}"
skip: true # [cuda_compiler_version != "None"]
requirements:
- host:
- - python
run:
- faiss ={{ version }}=*_cpu
test:
- commands:
- - exit 0
+ imports:
+ - faiss
- name: faiss-gpu
build:
- string: "py{{ CONDA_PY }}_{{ number }}"
skip: true # [cuda_compiler_version == "None"]
requirements:
- host:
- - python
run:
- faiss ={{ version }}=*_cuda
test:
- commands:
- - exit 0
+ imports:
+ - faiss
about:
home: https://github.com/facebookresearch/faiss
diff --git a/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch b/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch
new file mode 100644
index 00000000..c01d2211
--- /dev/null
+++ b/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch
@@ -0,0 +1,76 @@
+From e80893144a71dd48d829f611bf2708d9372f71bf Mon Sep 17 00:00:00 2001
+From: Mo Zhou
+Date: Tue, 24 Nov 2020 05:44:37 +0000
+Subject: [PATCH 1/2] CMake: use GNUInstallDirs instead of hardcoded paths.
+
+---
+ CMakeLists.txt | 1 +
+ faiss/CMakeLists.txt | 14 +++++++-------
+ faiss/gpu/CMakeLists.txt | 2 +-
+ 3 files changed, 9 insertions(+), 8 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 29b73d8..158712e 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -11,6 +11,7 @@ project(faiss
+ DESCRIPTION "A library for efficient similarity search and clustering of dense vectors."
+ HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
+ LANGUAGES CXX)
++include(GNUInstallDirs)
+
+ set(CMAKE_CXX_STANDARD 11)
+
+diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
+index 51680fd..3469538 100644
+--- a/faiss/CMakeLists.txt
++++ b/faiss/CMakeLists.txt
+@@ -160,15 +160,15 @@ endif()
+
+ install(TARGETS faiss
+ EXPORT faiss-targets
+- RUNTIME DESTINATION bin
+- ARCHIVE DESTINATION lib
+- LIBRARY DESTINATION lib
+- INCLUDES DESTINATION include
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
++ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
++ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
++ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+ )
+ foreach(header ${FAISS_HEADERS})
+ get_filename_component(dir ${header} DIRECTORY )
+ install(FILES ${header}
+- DESTINATION include/faiss/${dir}
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${dir}
+ )
+ endforeach()
+
+@@ -185,9 +185,9 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/faiss-config.cmake.in
+ )
+ install(FILES ${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake
+ ${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake
+- DESTINATION share/faiss
++ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss
+ )
+
+ install(EXPORT faiss-targets
+- DESTINATION share/faiss
++ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss
+ )
+diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
+index 5b2a957..8041bac 100644
+--- a/faiss/gpu/CMakeLists.txt
++++ b/faiss/gpu/CMakeLists.txt
+@@ -173,7 +173,7 @@ set(FAISS_GPU_HEADERS
+ foreach(header ${FAISS_GPU_HEADERS})
+ get_filename_component(dir ${header} DIRECTORY )
+ install(FILES ${header}
+- DESTINATION include/faiss/gpu/${dir}
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/gpu/${dir}
+ )
+ endforeach()
+
+--
+2.29.2.windows.3
+
diff --git a/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch b/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch
deleted file mode 100644
index 7b378223..00000000
--- a/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch
+++ /dev/null
@@ -1,41 +0,0 @@
-From 5f990ac215a73ba6c3bea8b4033f5b0f5d026237 Mon Sep 17 00:00:00 2001
-From: "H. Vetinari"
-Date: Thu, 7 May 2020 11:05:05 +0200
-Subject: [PATCH 1/8] change python/Makefile to point to libfaiss.$(SHLIB_EXT)
- instead of .a
-
-also give definite names to makefile-rules, as otherwise make may
-struggle to find make _swigfaiss.so otherwise.
----
- python/Makefile | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/python/Makefile b/python/Makefile
-index 2836568..eaa32a4 100644
---- a/python/Makefile
-+++ b/python/Makefile
-@@ -12,18 +12,18 @@ endif
- all: build
-
- # Also silently generates swigfaiss.py.
--swigfaiss.cpp: swigfaiss.swig ../libfaiss.a
-+swigfaiss.cpp: swigfaiss.swig $(PREFIX)/lib/libfaiss$(SHLIB_EXT)
- $(SWIG) -python -c++ -Doverride= -I../ $(SWIGFLAGS) -o $@ $<
-
- swigfaiss_avx2.cpp: swigfaiss.swig ../libfaiss.a
- $(SWIG) -python -c++ -Doverride= -module swigfaiss_avx2 -I../ $(SWIGFLAGS) -o $@ $<
-
--%.o: %.cpp
-+swigfaiss.o: swigfaiss.cpp
- $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) $(PYTHONCFLAGS) \
- -I../ -c $< -o $@
-
- # Extension is .so even on OSX.
--_%.so: %.o ../libfaiss.a
-+_swigfaiss.so: swigfaiss.o $(PREFIX)/lib/libfaiss$(SHLIB_EXT)
- $(CXX) $(SHAREDFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
-
- build: _swigfaiss.so faiss.py
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch b/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch
deleted file mode 100644
index e8774ed9..00000000
--- a/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From c8dc93d045be8485a4863d20b36e12d1aa2c6c8a Mon Sep 17 00:00:00 2001
-From: "H. Vetinari"
-Date: Fri, 8 May 2020 20:12:30 +0200
-Subject: [PATCH 2/8] remove -fPIC flag for msvc-compat
-
----
- makefile.inc.in | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/makefile.inc.in b/makefile.inc.in
-index 19d8511..1c4e976 100644
---- a/makefile.inc.in
-+++ b/makefile.inc.in
-@@ -6,7 +6,7 @@
- CXX = @CXX@
- CXXCPP = @CXXCPP@
- CPPFLAGS = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@
--CXXFLAGS = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
-+CXXFLAGS = @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
- CPUFLAGS = @ARCH_CPUFLAGS@
- LDFLAGS = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@
- LIBS = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0002-use-c-14.patch b/recipe/patches/0002-use-c-14.patch
new file mode 100644
index 00000000..b1cc3690
--- /dev/null
+++ b/recipe/patches/0002-use-c-14.patch
@@ -0,0 +1,39 @@
+From 65f07198267d0b777c03b21ae81c0b27577176fc Mon Sep 17 00:00:00 2001
+From: "H. Vetinari"
+Date: Mon, 26 Oct 2020 22:44:44 +0100
+Subject: [PATCH 2/2] use c++14
+
+---
+ CMakeLists.txt | 2 +-
+ faiss/python/CMakeLists.txt | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 158712e..1b2038f 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -13,7 +13,7 @@ project(faiss
+ LANGUAGES CXX)
+ include(GNUInstallDirs)
+
+-set(CMAKE_CXX_STANDARD 11)
++set(CMAKE_CXX_STANDARD 14)
+
+ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+
+diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
+index 0b4366d..1ef5dea 100644
+--- a/faiss/python/CMakeLists.txt
++++ b/faiss/python/CMakeLists.txt
+@@ -11,7 +11,7 @@ project(pyfaiss
+ HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
+ LANGUAGES CXX)
+
+-set(CMAKE_CXX_STANDARD 11)
++set(CMAKE_CXX_STANDARD 14)
+
+ find_package(SWIG REQUIRED COMPONENTS python)
+ include(${SWIG_USE_FILE})
+--
+2.29.2.windows.3
+
diff --git a/recipe/patches/0003-remove-deleted-files-from-template-change.patch b/recipe/patches/0003-remove-deleted-files-from-template-change.patch
deleted file mode 100644
index 9217a390..00000000
--- a/recipe/patches/0003-remove-deleted-files-from-template-change.patch
+++ /dev/null
@@ -1,1191 +0,0 @@
-From 94bd01d4b4f457f40994df7817d8b789a99d605b Mon Sep 17 00:00:00 2001
-From: Jeff Johnson
-Date: Wed, 25 Mar 2020 10:57:57 -0700
-Subject: [PATCH 3/8] remove deleted files from template change
-
----
- gpu/impl/PQCodeDistances.cu | 567 ---------------------
- gpu/impl/PQScanMultiPassNoPrecomputed.cu | 597 -----------------------
- 2 files changed, 1164 deletions(-)
- delete mode 100644 gpu/impl/PQCodeDistances.cu
- delete mode 100644 gpu/impl/PQScanMultiPassNoPrecomputed.cu
-
-diff --git a/gpu/impl/PQCodeDistances.cu b/gpu/impl/PQCodeDistances.cu
-deleted file mode 100644
-index 817990b..0000000
---- a/gpu/impl/PQCodeDistances.cu
-+++ /dev/null
-@@ -1,567 +0,0 @@
--/**
-- * Copyright (c) Facebook, Inc. and its affiliates.
-- *
-- * This source code is licensed under the MIT license found in the
-- * LICENSE file in the root directory of this source tree.
-- */
--
--
--#include
--
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--
--namespace faiss { namespace gpu {
--
--template
--struct Converter {
--};
--
--template <>
--struct Converter {
-- inline static __device__ half to(float v) { return __float2half(v); }
--};
--
--template <>
--struct Converter {
-- inline static __device__ float to(float v) { return v; }
--};
--
--// Kernel responsible for calculating distance from residual vector to
--// each product quantizer code centroid
--template
--__global__ void
--__launch_bounds__(288, 4)
--pqCodeDistances(Tensor queries,
-- int queriesPerBlock,
-- Tensor coarseCentroids,
-- Tensor pqCentroids,
-- Tensor topQueryToCentroid,
-- // (query id)(coarse)(subquantizer)(code) -> dist
-- Tensor outCodeDistances) {
-- const auto numSubQuantizers = pqCentroids.getSize(0);
-- const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
-- assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
-- const auto codesPerSubQuantizer = pqCentroids.getSize(2);
--
-- bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
-- int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
--
-- extern __shared__ float smem[];
--
-- // Each thread calculates a single code
-- float subQuantizerData[DimsPerSubQuantizer];
--
-- auto code = threadIdx.x;
-- auto subQuantizer = blockIdx.y;
--
-- // Each thread will load the pq centroid data for the code that it
-- // is processing
--#pragma unroll
-- for (int i = 0; i < DimsPerSubQuantizer; ++i) {
-- subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
-- }
--
-- // Where we store our query vector
-- float* smemQuery = smem;
--
-- // Where we store our residual vector; this is double buffered so we
-- // can be loading the next one while processing the current one
-- float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
-- float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
--
-- // Where we pre-load the coarse centroid IDs
-- int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
--
-- // Each thread is calculating the distance for a single code,
-- // performing the reductions locally
--
-- // Handle multiple queries per block
-- auto startQueryId = blockIdx.x * queriesPerBlock;
-- auto numQueries = queries.getSize(0) - startQueryId;
-- if (numQueries > queriesPerBlock) {
-- numQueries = queriesPerBlock;
-- }
--
-- for (int query = 0; query < numQueries; ++query) {
-- auto queryId = startQueryId + query;
--
-- auto querySubQuantizer =
-- queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
--
-- // Load current query vector
-- for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
-- smemQuery[i] = querySubQuantizer[i];
-- }
--
-- // Load list of coarse centroids found
-- for (int i = threadIdx.x;
-- i < topQueryToCentroid.getSize(1); i += blockDim.x) {
-- coarseIds[i] = topQueryToCentroid[queryId][i];
-- }
--
-- // We need coarseIds below
-- // FIXME: investigate loading separately, so we don't need this
-- __syncthreads();
--
-- // Preload first buffer of residual data
-- if (isLoadingThread) {
-- for (int i = loadingThreadId;
-- i < DimsPerSubQuantizer;
-- i += blockDim.x - codesPerSubQuantizer) {
-- auto coarseId = coarseIds[0];
-- // In case NaNs were in the original query data
-- coarseId = coarseId == -1 ? 0 : coarseId;
-- auto coarseCentroidSubQuantizer =
-- coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
--
-- if (L2Distance) {
-- smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
-- } else {
-- smemResidual1[i] = coarseCentroidSubQuantizer[i];
-- }
-- }
-- }
--
-- // The block walks the list for a single query
-- for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
-- // Wait for smemResidual1 to be loaded
-- __syncthreads();
--
-- if (isLoadingThread) {
-- // Preload second buffer of residual data
-- for (int i = loadingThreadId;
-- i < DimsPerSubQuantizer;
-- i += blockDim.x - codesPerSubQuantizer) {
-- // FIXME: try always making this centroid id 0 so we can
-- // terminate
-- if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
-- auto coarseId = coarseIds[coarse + 1];
-- // In case NaNs were in the original query data
-- coarseId = coarseId == -1 ? 0 : coarseId;
--
-- auto coarseCentroidSubQuantizer =
-- coarseCentroids[coarseId]
-- [subQuantizer * dimsPerSubQuantizer].data();
--
-- if (L2Distance) {
-- smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
-- } else {
-- smemResidual2[i] = coarseCentroidSubQuantizer[i];
-- }
-- }
-- }
-- } else {
-- // These are the processing threads
-- float dist = 0.0f;
--
-- constexpr int kUnroll = 4;
-- constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
-- constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
-- float vals[kUnroll];
--
-- // Calculate residual - pqCentroid for each dim that we're
-- // processing
--
-- // Unrolled loop
-- if (L2Distance) {
--#pragma unroll
-- for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] = smemResidual1[i * kUnroll + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] -= subQuantizerData[i * kUnroll + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] *= vals[j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- dist += vals[j];
-- }
-- }
-- } else {
-- // Inner product: query slice against the reconstructed sub-quantizer
-- // for this coarse cell (query o (centroid + subQCentroid))
--#pragma unroll
-- for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] = smemResidual1[i * kUnroll + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] += subQuantizerData[i * kUnroll + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] *= smemQuery[i * kUnroll + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- dist += vals[j];
-- }
-- }
-- }
--
-- // Remainder loop
-- if (L2Distance) {
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- vals[j] = smemResidual1[kRemainderBase + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- vals[j] -= subQuantizerData[kRemainderBase + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- vals[j] *= vals[j];
-- }
-- } else {
-- // Inner product
-- // Inner product: query slice against the reconstructed sub-quantizer
-- // for this coarse cell (query o (centroid + subQCentroid))
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- vals[j] = smemResidual1[kRemainderBase + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- vals[j] += subQuantizerData[kRemainderBase + j];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- vals[j] *= smemQuery[kRemainderBase + j];
-- }
-- }
--
--#pragma unroll
-- for (int j = 0; j < kRemainder; ++j) {
-- dist += vals[j];
-- }
--
-- // We have the distance for our code; write it out
-- outCodeDistances[queryId][coarse][subQuantizer][code] =
-- Converter::to(dist);
-- } // !isLoadingThread
--
-- // Swap residual buffers
-- float* tmp = smemResidual1;
-- smemResidual1 = smemResidual2;
-- smemResidual2 = tmp;
-- }
-- }
--}
--
--__global__ void
--residualVector(Tensor queries,
-- Tensor coarseCentroids,
-- Tensor topQueryToCentroid,
-- int numSubDim,
-- // output is transposed:
-- // (sub q)(query id)(centroid id)(sub dim)
-- Tensor residual) {
-- // block x is query id
-- // block y is centroid id
-- // thread x is dim
-- auto queryId = blockIdx.x;
-- auto centroidId = blockIdx.y;
--
-- int realCentroidId = topQueryToCentroid[queryId][centroidId];
--
-- for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
-- float q = queries[queryId][dim];
-- float c = coarseCentroids[realCentroidId][dim];
--
-- residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] =
-- q - c;
-- }
--}
--
--void
--runResidualVector(Tensor& pqCentroids,
-- Tensor& queries,
-- Tensor& coarseCentroids,
-- Tensor& topQueryToCentroid,
-- Tensor& residual,
-- cudaStream_t stream) {
-- auto grid =
-- dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
-- auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
--
-- residualVector<<>>(
-- queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
-- residual);
--
-- CUDA_TEST_ERROR();
--}
--
--void
--runPQCodeDistancesMM(Tensor& pqCentroids,
-- Tensor& queries,
-- Tensor& coarseCentroids,
-- Tensor& topQueryToCentroid,
-- NoTypeTensor<4, true>& outCodeDistances,
-- bool useFloat16Lookup,
-- DeviceMemory& mem,
-- cublasHandle_t handle,
-- cudaStream_t stream) {
-- // Calculate (q - c) residual vector
-- // (sub q)(query id)(centroid id)(sub dim)
-- DeviceTensor residual(
-- mem,
-- {pqCentroids.getSize(0),
-- topQueryToCentroid.getSize(0),
-- topQueryToCentroid.getSize(1),
-- pqCentroids.getSize(1)},
-- stream);
--
-- runResidualVector(pqCentroids, queries,
-- coarseCentroids, topQueryToCentroid,
-- residual, stream);
--
-- // Calculate ||q - c||^2
-- DeviceTensor residualNorms(
-- mem,
-- {pqCentroids.getSize(0) *
-- topQueryToCentroid.getSize(0) *
-- topQueryToCentroid.getSize(1)},
-- stream);
--
-- auto residualView2 = residual.view<2>(
-- {pqCentroids.getSize(0) *
-- topQueryToCentroid.getSize(0) *
-- topQueryToCentroid.getSize(1),
-- pqCentroids.getSize(1)});
--
-- runL2Norm(residualView2, true, residualNorms, true, stream);
--
-- // Perform a batch MM:
-- // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
-- // (sub q) x {(q * c)(code)}
-- auto residualView3 = residual.view<3>(
-- {pqCentroids.getSize(0),
-- topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
-- pqCentroids.getSize(1)});
--
-- DeviceTensor residualDistance(
-- mem,
-- {pqCentroids.getSize(0),
-- topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
-- pqCentroids.getSize(2)},
-- stream);
--
-- runIteratedMatrixMult(residualDistance, false,
-- residualView3, false,
-- pqCentroids, false,
-- -2.0f, 0.0f,
-- handle,
-- stream);
--
-- // Sum ||q - c||^2 along rows
-- auto residualDistanceView2 = residualDistance.view<2>(
-- {pqCentroids.getSize(0) *
-- topQueryToCentroid.getSize(0) *
-- topQueryToCentroid.getSize(1),
-- pqCentroids.getSize(2)});
--
-- runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
--
-- Tensor outCodeDistancesF;
-- DeviceTensor outCodeDistancesFloatMem;
--
-- if (useFloat16Lookup) {
-- outCodeDistancesFloatMem = DeviceTensor(
-- mem, {outCodeDistances.getSize(0),
-- outCodeDistances.getSize(1),
-- outCodeDistances.getSize(2),
-- outCodeDistances.getSize(3)},
-- stream);
--
-- outCodeDistancesF = outCodeDistancesFloatMem;
-- } else {
-- outCodeDistancesF = outCodeDistances.toTensor();
-- }
--
-- // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
-- // is where we build our output distances)
-- auto outCodeDistancesView = outCodeDistancesF.view<3>(
-- {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
-- outCodeDistances.getSize(2),
-- outCodeDistances.getSize(3)});
--
-- runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
--
-- // Calculate code norms per each sub-dim
-- // (sub q)(sub dim)(code) is pqCentroids
-- // transpose to (sub q)(code)(sub dim)
-- DeviceTensor pqCentroidsTranspose(
-- mem,
-- {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
-- stream);
--
-- runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
--
-- auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
-- {pqCentroids.getSize(0) * pqCentroids.getSize(2),
-- pqCentroids.getSize(1)});
--
-- DeviceTensor pqCentroidsNorm(
-- mem,
-- {pqCentroids.getSize(0) * pqCentroids.getSize(2)},
-- stream);
--
-- runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
--
-- // View output as (q * c)(sub q * code), and add centroid norm to
-- // each row
-- auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
-- {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
-- outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
--
-- runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
--
-- if (useFloat16Lookup) {
-- // Need to convert back
-- auto outCodeDistancesH = outCodeDistances.toTensor();
-- convertTensor(stream,
-- outCodeDistancesF,
-- outCodeDistancesH);
-- }
--}
--
--void
--runPQCodeDistances(Tensor& pqCentroids,
-- Tensor& queries,
-- Tensor& coarseCentroids,
-- Tensor& topQueryToCentroid,
-- NoTypeTensor<4, true>& outCodeDistances,
-- bool l2Distance,
-- bool useFloat16Lookup,
-- cudaStream_t stream) {
-- const auto numSubQuantizers = pqCentroids.getSize(0);
-- const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
-- const auto codesPerSubQuantizer = pqCentroids.getSize(2);
--
-- // FIXME: tune
-- // Reuse of pq centroid data is based on both # of queries * nprobe,
-- // and we should really be tiling in both dimensions
-- constexpr int kQueriesPerBlock = 8;
--
-- auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
-- numSubQuantizers);
--
-- // Reserve one block of threads for double buffering
-- // FIXME: probably impractical for large # of dims?
-- auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
-- auto block = dim3(codesPerSubQuantizer + loadingThreads);
--
-- auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
-- + topQueryToCentroid.getSize(1) * sizeof(int);
--
--#define RUN_CODE(DIMS, L2) \
-- do { \
-- if (useFloat16Lookup) { \
-- auto outCodeDistancesT = outCodeDistances.toTensor(); \
-- \
-- pqCodeDistances<<>>( \
-- queries, kQueriesPerBlock, \
-- coarseCentroids, pqCentroids, \
-- topQueryToCentroid, outCodeDistancesT); \
-- } else { \
-- auto outCodeDistancesT = outCodeDistances.toTensor(); \
-- \
-- pqCodeDistances<<>>( \
-- queries, kQueriesPerBlock, \
-- coarseCentroids, pqCentroids, \
-- topQueryToCentroid, outCodeDistancesT); \
-- } \
-- } while (0)
--
--#define CODE_L2(DIMS) \
-- do { \
-- if (l2Distance) { \
-- RUN_CODE(DIMS, true); \
-- } else { \
-- RUN_CODE(DIMS, false); \
-- } \
-- } while (0)
--
-- switch (dimsPerSubQuantizer) {
-- case 1:
-- CODE_L2(1);
-- break;
-- case 2:
-- CODE_L2(2);
-- break;
-- case 3:
-- CODE_L2(3);
-- break;
-- case 4:
-- CODE_L2(4);
-- break;
-- case 6:
-- CODE_L2(6);
-- break;
-- case 8:
-- CODE_L2(8);
-- break;
-- case 10:
-- CODE_L2(10);
-- break;
-- case 12:
-- CODE_L2(12);
-- break;
-- case 16:
-- CODE_L2(16);
-- break;
-- case 20:
-- CODE_L2(20);
-- break;
-- case 24:
-- CODE_L2(24);
-- break;
-- case 28:
-- CODE_L2(28);
-- break;
-- case 32:
-- CODE_L2(32);
-- break;
-- // FIXME: larger sizes require too many registers - we need the
-- // MM implementation working
-- default:
-- FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
-- "not currently supported");
-- }
--
--#undef RUN_CODE
--#undef CODE_L2
--
-- CUDA_TEST_ERROR();
--}
--
--} } // namespace
-diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/gpu/impl/PQScanMultiPassNoPrecomputed.cu
-deleted file mode 100644
-index a514694..0000000
---- a/gpu/impl/PQScanMultiPassNoPrecomputed.cu
-+++ /dev/null
-@@ -1,597 +0,0 @@
--/**
-- * Copyright (c) Facebook, Inc. and its affiliates.
-- *
-- * This source code is licensed under the MIT license found in the
-- * LICENSE file in the root directory of this source tree.
-- */
--
--
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--#include
--
--#include
--
--namespace faiss { namespace gpu {
--
--// This must be kept in sync with PQCodeDistances.cu
--bool isSupportedNoPrecomputedSubDimSize(int dims) {
-- switch (dims) {
-- case 1:
-- case 2:
-- case 3:
-- case 4:
-- case 6:
-- case 8:
-- case 10:
-- case 12:
-- case 16:
-- case 20:
-- case 24:
-- case 28:
-- case 32:
-- return true;
-- default:
-- // FIXME: larger sizes require too many registers - we need the
-- // MM implementation working
-- return false;
-- }
--}
--
--template
--struct LoadCodeDistances {
-- static inline __device__ void load(LookupT* smem,
-- LookupT* codes,
-- int numCodes) {
-- constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
--
-- // We can only use the vector type if the data is guaranteed to be
-- // aligned. The codes are innermost, so if it is evenly divisible,
-- // then any slice will be aligned.
-- if (numCodes % kWordSize == 0) {
-- // Load the data by float4 for efficiency, and then handle any remainder
-- // limitVec is the number of whole vec words we can load, in terms
-- // of whole blocks performing the load
-- constexpr int kUnroll = 2;
-- int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
-- limitVec *= kUnroll * blockDim.x;
--
-- LookupVecT* smemV = (LookupVecT*) smem;
-- LookupVecT* codesV = (LookupVecT*) codes;
--
-- for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
-- LookupVecT vals[kUnroll];
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] =
-- LoadStore::load(&codesV[i + j * blockDim.x]);
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- LoadStore::store(&smemV[i + j * blockDim.x], vals[j]);
-- }
-- }
--
-- // This is where we start loading the remainder that does not evenly
-- // fit into kUnroll x blockDim.x
-- int remainder = limitVec * kWordSize;
--
-- for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
-- smem[i] = codes[i];
-- }
-- } else {
-- // Potential unaligned load
-- constexpr int kUnroll = 4;
--
-- int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
--
-- int i = threadIdx.x;
-- for (; i < limit; i += kUnroll * blockDim.x) {
-- LookupT vals[kUnroll];
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- vals[j] = codes[i + j * blockDim.x];
-- }
--
--#pragma unroll
-- for (int j = 0; j < kUnroll; ++j) {
-- smem[i + j * blockDim.x] = vals[j];
-- }
-- }
--
-- for (; i < numCodes; i += blockDim.x) {
-- smem[i] = codes[i];
-- }
-- }
-- }
--};
--
--template
--__global__ void
--pqScanNoPrecomputedMultiPass(Tensor queries,
-- Tensor pqCentroids,
-- Tensor topQueryToCentroid,
-- Tensor codeDistances,
-- void** listCodes,
-- int* listLengths,
-- Tensor prefixSumOffsets,
-- Tensor distance) {
-- const auto codesPerSubQuantizer = pqCentroids.getSize(2);
--
-- // Where the pq code -> residual distance is stored
-- extern __shared__ char smemCodeDistances[];
-- LookupT* codeDist = (LookupT*) smemCodeDistances;
--
-- // Each block handles a single query
-- auto queryId = blockIdx.y;
-- auto probeId = blockIdx.x;
--
-- // This is where we start writing out data
-- // We ensure that before the array (at offset -1), there is a 0 value
-- int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
-- float* distanceOut = distance[outBase].data();
--
-- auto listId = topQueryToCentroid[queryId][probeId];
-- // Safety guard in case NaNs in input cause no list ID to be generated
-- if (listId == -1) {
-- return;
-- }
--
-- unsigned char* codeList = (unsigned char*) listCodes[listId];
-- int limit = listLengths[listId];
--
-- constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
-- (NumSubQuantizers / 4);
-- unsigned int code32[kNumCode32];
-- unsigned int nextCode32[kNumCode32];
--
-- // We double-buffer the code loading, which improves memory utilization
-- if (threadIdx.x < limit) {
-- LoadCode32::load(code32, codeList, threadIdx.x);
-- }
--
-- LoadCodeDistances::load(
-- codeDist,
-- codeDistances[queryId][probeId].data(),
-- codeDistances.getSize(2) * codeDistances.getSize(3));
--
-- // Prevent WAR dependencies
-- __syncthreads();
--
-- // Each thread handles one code element in the list, with a
-- // block-wide stride
-- for (int codeIndex = threadIdx.x;
-- codeIndex < limit;
-- codeIndex += blockDim.x) {
-- // Prefetch next codes
-- if (codeIndex + blockDim.x < limit) {
-- LoadCode32::load(
-- nextCode32, codeList, codeIndex + blockDim.x);
-- }
--
-- float dist = 0.0f;
--
--#pragma unroll
-- for (int word = 0; word < kNumCode32; ++word) {
-- constexpr int kBytesPerCode32 =
-- NumSubQuantizers < 4 ? NumSubQuantizers : 4;
--
-- if (kBytesPerCode32 == 1) {
-- auto code = code32[0];
-- dist = ConvertTo::to(codeDist[code]);
--
-- } else {
--#pragma unroll
-- for (int byte = 0; byte < kBytesPerCode32; ++byte) {
-- auto code = getByte(code32[word], byte * 8, 8);
--
-- auto offset =
-- codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
--
-- dist += ConvertTo::to(codeDist[offset + code]);
-- }
-- }
-- }
--
-- // Write out intermediate distance result
-- // We do not maintain indices here, in order to reduce global
-- // memory traffic. Those are recovered in the final selection step.
-- distanceOut[codeIndex] = dist;
--
-- // Rotate buffers
--#pragma unroll
-- for (int word = 0; word < kNumCode32; ++word) {
-- code32[word] = nextCode32[word];
-- }
-- }
--}
--
--void
--runMultiPassTile(Tensor& queries,
-- Tensor& centroids,
-- Tensor& pqCentroidsInnermostCode,
-- NoTypeTensor<4, true>& codeDistances,
-- Tensor& topQueryToCentroid,
-- bool useFloat16Lookup,
-- int bytesPerCode,
-- int numSubQuantizers,
-- int numSubQuantizerCodes,
-- thrust::device_vector& listCodes,
-- thrust::device_vector& listIndices,
-- IndicesOptions indicesOptions,
-- thrust::device_vector& listLengths,
-- Tensor& thrustMem,
-- Tensor& prefixSumOffsets,
-- Tensor& allDistances,
-- Tensor& heapDistances,
-- Tensor& heapIndices,
-- int k,
-- faiss::MetricType metric,
-- Tensor& outDistances,
-- Tensor& outIndices,
-- cudaStream_t stream) {
-- // We only support two metrics at the moment
-- FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
-- metric == MetricType::METRIC_L2);
--
-- bool l2Distance = metric == MetricType::METRIC_L2;
--
-- // Calculate offset lengths, so we know where to write out
-- // intermediate results
-- runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
-- thrustMem, stream);
--
-- // Calculate residual code distances, since this is without
-- // precomputed codes
-- runPQCodeDistances(pqCentroidsInnermostCode,
-- queries,
-- centroids,
-- topQueryToCentroid,
-- codeDistances,
-- l2Distance,
-- useFloat16Lookup,
-- stream);
--
-- // Convert all codes to a distance, and write out (distance,
-- // index) values for all intermediate results
-- {
-- auto kThreadsPerBlock = 256;
--
-- auto grid = dim3(topQueryToCentroid.getSize(1),
-- topQueryToCentroid.getSize(0));
-- auto block = dim3(kThreadsPerBlock);
--
-- // pq centroid distances
-- auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
--
-- smem *= numSubQuantizers * numSubQuantizerCodes;
-- FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
--
--#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \
-- do { \
-- auto codeDistancesT = codeDistances.toTensor(); \
-- \
-- pqScanNoPrecomputedMultiPass \
-- <<>>( \
-- queries, \
-- pqCentroidsInnermostCode, \
-- topQueryToCentroid, \
-- codeDistancesT, \
-- listCodes.data().get(), \
-- listLengths.data().get(), \
-- prefixSumOffsets, \
-- allDistances); \
-- } while (0)
--
--#define RUN_PQ(NUM_SUB_Q) \
-- do { \
-- if (useFloat16Lookup) { \
-- RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \
-- } else { \
-- RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
-- } \
-- } while (0)
--
-- switch (bytesPerCode) {
-- case 1:
-- RUN_PQ(1);
-- break;
-- case 2:
-- RUN_PQ(2);
-- break;
-- case 3:
-- RUN_PQ(3);
-- break;
-- case 4:
-- RUN_PQ(4);
-- break;
-- case 8:
-- RUN_PQ(8);
-- break;
-- case 12:
-- RUN_PQ(12);
-- break;
-- case 16:
-- RUN_PQ(16);
-- break;
-- case 20:
-- RUN_PQ(20);
-- break;
-- case 24:
-- RUN_PQ(24);
-- break;
-- case 28:
-- RUN_PQ(28);
-- break;
-- case 32:
-- RUN_PQ(32);
-- break;
-- case 40:
-- RUN_PQ(40);
-- break;
-- case 48:
-- RUN_PQ(48);
-- break;
-- case 56:
-- RUN_PQ(56);
-- break;
-- case 64:
-- RUN_PQ(64);
-- break;
-- case 96:
-- RUN_PQ(96);
-- break;
-- default:
-- FAISS_ASSERT(false);
-- break;
-- }
--
--#undef RUN_PQ
--#undef RUN_PQ_OPT
-- }
--
-- CUDA_TEST_ERROR();
--
-- // k-select the output in chunks, to increase parallelism
-- runPass1SelectLists(prefixSumOffsets,
-- allDistances,
-- topQueryToCentroid.getSize(1),
-- k,
-- !l2Distance, // L2 distance chooses smallest
-- heapDistances,
-- heapIndices,
-- stream);
--
-- // k-select final output
-- auto flatHeapDistances = heapDistances.downcastInner<2>();
-- auto flatHeapIndices = heapIndices.downcastInner<2>();
--
-- runPass2SelectLists(flatHeapDistances,
-- flatHeapIndices,
-- listIndices,
-- indicesOptions,
-- prefixSumOffsets,
-- topQueryToCentroid,
-- k,
-- !l2Distance, // L2 distance chooses smallest
-- outDistances,
-- outIndices,
-- stream);
--}
--
--void runPQScanMultiPassNoPrecomputed(Tensor& queries,
-- Tensor& centroids,
-- Tensor& pqCentroidsInnermostCode,
-- Tensor& topQueryToCentroid,
-- bool useFloat16Lookup,
-- int bytesPerCode,
-- int numSubQuantizers,
-- int numSubQuantizerCodes,
-- thrust::device_vector& listCodes,
-- thrust::device_vector& listIndices,
-- IndicesOptions indicesOptions,
-- thrust::device_vector& listLengths,
-- int maxListLength,
-- int k,
-- faiss::MetricType metric,
-- // output
-- Tensor& outDistances,
-- // output
-- Tensor& outIndices,
-- GpuResources* res) {
-- constexpr int kMinQueryTileSize = 8;
-- constexpr int kMaxQueryTileSize = 128;
-- constexpr int kThrustMemSize = 16384;
--
-- int nprobe = topQueryToCentroid.getSize(1);
--
-- auto& mem = res->getMemoryManagerCurrentDevice();
-- auto stream = res->getDefaultStreamCurrentDevice();
--
-- // Make a reservation for Thrust to do its dirty work (global memory
-- // cross-block reduction space); hopefully this is large enough.
-- DeviceTensor thrustMem1(
-- mem, {kThrustMemSize}, stream);
-- DeviceTensor thrustMem2(
-- mem, {kThrustMemSize}, stream);
-- DeviceTensor* thrustMem[2] =
-- {&thrustMem1, &thrustMem2};
--
-- // How much temporary storage is available?
-- // If possible, we'd like to fit within the space available.
-- size_t sizeAvailable = mem.getSizeAvailable();
--
-- // We run two passes of heap selection
-- // This is the size of the first-level heap passes
-- constexpr int kNProbeSplit = 8;
-- int pass2Chunks = std::min(nprobe, kNProbeSplit);
--
-- size_t sizeForFirstSelectPass =
-- pass2Chunks * k * (sizeof(float) + sizeof(int));
--
-- // How much temporary storage we need per each query
-- size_t sizePerQuery =
-- 2 * // streams
-- ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
-- nprobe * maxListLength * sizeof(float) + // allDistances
-- // residual distances
-- nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
-- sizeForFirstSelectPass);
--
-- int queryTileSize = (int) (sizeAvailable / sizePerQuery);
--
-- if (queryTileSize < kMinQueryTileSize) {
-- queryTileSize = kMinQueryTileSize;
-- } else if (queryTileSize > kMaxQueryTileSize) {
-- queryTileSize = kMaxQueryTileSize;
-- }
--
-- // FIXME: we should adjust queryTileSize to deal with this, since
-- // indexing is in int32
-- FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
-- std::numeric_limits::max());
--
-- // Temporary memory buffers
-- // Make sure there is space prior to the start which will be 0, and
-- // will handle the boundary condition without branches
-- DeviceTensor prefixSumOffsetSpace1(
-- mem, {queryTileSize * nprobe + 1}, stream);
-- DeviceTensor prefixSumOffsetSpace2(
-- mem, {queryTileSize * nprobe + 1}, stream);
--
-- DeviceTensor prefixSumOffsets1(
-- prefixSumOffsetSpace1[1].data(),
-- {queryTileSize, nprobe});
-- DeviceTensor prefixSumOffsets2(
-- prefixSumOffsetSpace2[1].data(),
-- {queryTileSize, nprobe});
-- DeviceTensor* prefixSumOffsets[2] =
-- {&prefixSumOffsets1, &prefixSumOffsets2};
--
-- // Make sure the element before prefixSumOffsets is 0, since we
-- // depend upon simple, boundary-less indexing to get proper results
-- CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
-- 0,
-- sizeof(int),
-- stream));
-- CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
-- 0,
-- sizeof(int),
-- stream));
--
-- int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
--
-- int totalCodeDistancesSize =
-- queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
-- codeDistanceTypeSize;
--
-- DeviceTensor codeDistances1Mem(
-- mem, {totalCodeDistancesSize}, stream);
-- NoTypeTensor<4, true> codeDistances1(
-- codeDistances1Mem.data(),
-- codeDistanceTypeSize,
-- {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
--
-- DeviceTensor codeDistances2Mem(
-- mem, {totalCodeDistancesSize}, stream);
-- NoTypeTensor<4, true> codeDistances2(
-- codeDistances2Mem.data(),
-- codeDistanceTypeSize,
-- {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
--
-- NoTypeTensor<4, true>* codeDistances[2] =
-- {&codeDistances1, &codeDistances2};
--
-- DeviceTensor allDistances1(
-- mem, {queryTileSize * nprobe * maxListLength}, stream);
-- DeviceTensor allDistances2(
-- mem, {queryTileSize * nprobe * maxListLength}, stream);
-- DeviceTensor* allDistances[2] =
-- {&allDistances1, &allDistances2};
--
-- DeviceTensor heapDistances1(
-- mem, {queryTileSize, pass2Chunks, k}, stream);
-- DeviceTensor heapDistances2(
-- mem, {queryTileSize, pass2Chunks, k}, stream);
-- DeviceTensor* heapDistances[2] =
-- {&heapDistances1, &heapDistances2};
--
-- DeviceTensor heapIndices1(
-- mem, {queryTileSize, pass2Chunks, k}, stream);
-- DeviceTensor heapIndices2(
-- mem, {queryTileSize, pass2Chunks, k}, stream);
-- DeviceTensor* heapIndices[2] =
-- {&heapIndices1, &heapIndices2};
--
-- auto streams = res->getAlternateStreamsCurrentDevice();
-- streamWait(streams, {stream});
--
-- int curStream = 0;
--
-- for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
-- int numQueriesInTile =
-- std::min(queryTileSize, queries.getSize(0) - query);
--
-- auto prefixSumOffsetsView =
-- prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
--
-- auto codeDistancesView =
-- codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
-- auto coarseIndicesView =
-- topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
-- auto queryView =
-- queries.narrowOutermost(query, numQueriesInTile);
--
-- auto heapDistancesView =
-- heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
-- auto heapIndicesView =
-- heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
--
-- auto outDistanceView =
-- outDistances.narrowOutermost(query, numQueriesInTile);
-- auto outIndicesView =
-- outIndices.narrowOutermost(query, numQueriesInTile);
--
-- runMultiPassTile(queryView,
-- centroids,
-- pqCentroidsInnermostCode,
-- codeDistancesView,
-- coarseIndicesView,
-- useFloat16Lookup,
-- bytesPerCode,
-- numSubQuantizers,
-- numSubQuantizerCodes,
-- listCodes,
-- listIndices,
-- indicesOptions,
-- listLengths,
-- *thrustMem[curStream],
-- prefixSumOffsetsView,
-- *allDistances[curStream],
-- heapDistancesView,
-- heapIndicesView,
-- k,
-- metric,
-- outDistanceView,
-- outIndicesView,
-- streams[curStream]);
--
-- curStream = (curStream + 1) % 2;
-- }
--
-- streamWait({stream}, streams);
--}
--
--} } // namespace
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0004-CUDA-8-fixes.patch b/recipe/patches/0004-CUDA-8-fixes.patch
deleted file mode 100644
index 430f063e..00000000
--- a/recipe/patches/0004-CUDA-8-fixes.patch
+++ /dev/null
@@ -1,117 +0,0 @@
-From 3a69eeeb6067ebd2e71e510e11535b59a4a7768b Mon Sep 17 00:00:00 2001
-From: Jeff Johnson
-Date: Sun, 29 Mar 2020 20:07:01 -0700
-Subject: [PATCH 4/8] CUDA 8 fixes
-
----
- gpu/impl/Distance.cu | 2 +-
- gpu/impl/GeneralDistance.cuh | 15 ++++++++-------
- gpu/test/TestGpuIndexFlat.cpp | 2 +-
- gpu/utils/MathOperators.cuh | 2 +-
- 4 files changed, 11 insertions(+), 10 deletions(-)
-
-diff --git a/gpu/impl/Distance.cu b/gpu/impl/Distance.cu
-index 63ed60b..3a46c37 100644
---- a/gpu/impl/Distance.cu
-+++ b/gpu/impl/Distance.cu
-@@ -64,7 +64,7 @@ void runDistance(bool computeL2,
- if (centroids.numElements() == 0) {
- thrust::fill(thrust::cuda::par.on(defaultStream),
- outDistances.data(), outDistances.end(),
-- Limits::getMax());
-+ Limits::getMax());
-
- thrust::fill(thrust::cuda::par.on(defaultStream),
- outIndices.data(), outIndices.end(),
-diff --git a/gpu/impl/GeneralDistance.cuh b/gpu/impl/GeneralDistance.cuh
-index 0ecdfa5..a2a447f 100644
---- a/gpu/impl/GeneralDistance.cuh
-+++ b/gpu/impl/GeneralDistance.cuh
-@@ -9,6 +9,7 @@
- #include
- #include
- #include
-+#include
- #include
- #include
- #include
-@@ -144,11 +145,11 @@ generalDistance(Tensor query, // m x k
-
- queryTileBase[threadIdx.x + i * kWarpSize] =
- kInBounds ?
-- queryBase[k] : (T) 0; //DistanceOp::kIdentityData;
-+ queryBase[k] : ConvertTo::to(0);
-
- vecTileBase[threadIdx.x + i * kWarpSize] =
- kInBounds ?
-- vecBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+ vecBase[k] : ConvertTo::to(0);
- }
-
- __syncthreads();
-@@ -179,11 +180,11 @@ generalDistance(Tensor query, // m x k
- // Load query tile
- queryTileBase[threadIdx.x] =
- queryThreadInBounds ?
-- queryBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+ queryBase[k] : ConvertTo::to(0);
-
- vecTileBase[threadIdx.x] =
- vecThreadInBoundsLoad ?
-- vecBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+ vecBase[k] : ConvertTo::to(0);
-
- __syncthreads();
-
-@@ -205,11 +206,11 @@ generalDistance(Tensor query, // m x k
- // Load query tile
- queryTileBase[threadIdx.x] =
- queryThreadInBounds && kInBounds ?
-- queryBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+ queryBase[k] : ConvertTo::to(0);
-
- vecTileBase[threadIdx.x] =
- vecThreadInBoundsLoad && kInBounds ?
-- vecBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+ vecBase[k] : ConvertTo::to(0);
-
- __syncthreads();
-
-@@ -278,7 +279,7 @@ void runGeneralDistance(GpuResources* resources,
- if (centroids.numElements() == 0) {
- thrust::fill(thrust::cuda::par.on(defaultStream),
- outDistances.data(), outDistances.end(),
-- Limits::getMax());
-+ Limits::getMax());
-
- thrust::fill(thrust::cuda::par.on(defaultStream),
- outIndices.data(), outIndices.end(),
-diff --git a/gpu/test/TestGpuIndexFlat.cpp b/gpu/test/TestGpuIndexFlat.cpp
-index 73cfe20..cabd7aa 100644
---- a/gpu/test/TestGpuIndexFlat.cpp
-+++ b/gpu/test/TestGpuIndexFlat.cpp
-@@ -277,7 +277,7 @@ TEST(TestGpuIndexFlat, CopyFrom) {
- int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
-
- faiss::gpu::GpuIndexFlatConfig config;
-- config.device = 0;
-+ config.device = device;
- config.useFloat16 = false;
- config.storeTransposed = false;
-
-diff --git a/gpu/utils/MathOperators.cuh b/gpu/utils/MathOperators.cuh
-index 68ccbd5..020d220 100644
---- a/gpu/utils/MathOperators.cuh
-+++ b/gpu/utils/MathOperators.cuh
-@@ -537,7 +537,7 @@ struct Math {
- return h;
- }
-
-- static inline __device__ half reduceAdd(Half8 v) {
-+ static inline __device__ float reduceAdd(Half8 v) {
- float x = Math::reduceAdd(v.a);
- float y = Math::reduceAdd(v.b);
- return x + y;
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0005-CUDA-10-fixes.patch b/recipe/patches/0005-CUDA-10-fixes.patch
deleted file mode 100644
index 46b3d0dc..00000000
--- a/recipe/patches/0005-CUDA-10-fixes.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From d41dcff71a519f17d87a9648c75f8f0b7fe36399 Mon Sep 17 00:00:00 2001
-From: Jeff Johnson
-Date: Mon, 30 Mar 2020 03:01:56 +0000
-Subject: [PATCH 5/8] CUDA 10 fixes
-
----
- gpu/impl/PQCodeDistances-inl.cuh | 2 +-
- gpu/utils/DeviceDefs.cuh | 2 ++
- gpu/utils/DeviceUtils.cu | 14 +++++++++++++-
- 3 files changed, 16 insertions(+), 2 deletions(-)
-
-diff --git a/gpu/impl/PQCodeDistances-inl.cuh b/gpu/impl/PQCodeDistances-inl.cuh
-index c3ef87f..fcb9a02 100644
---- a/gpu/impl/PQCodeDistances-inl.cuh
-+++ b/gpu/impl/PQCodeDistances-inl.cuh
-@@ -27,7 +27,7 @@ template
- __global__ void
--__launch_bounds__(288, 4)
-+__launch_bounds__(288, 3)
- pqCodeDistances(Tensor queries,
- int queriesPerBlock,
- Tensor coarseCentroids,
-diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh
-index 4e859ec..89d3dda 100644
---- a/gpu/utils/DeviceDefs.cuh
-+++ b/gpu/utils/DeviceDefs.cuh
-@@ -8,6 +8,8 @@
-
- #pragma once
-
-+#include
-+
- namespace faiss { namespace gpu {
-
- #ifdef __CUDA_ARCH__
-diff --git a/gpu/utils/DeviceUtils.cu b/gpu/utils/DeviceUtils.cu
-index a8195c9..7bcc230 100644
---- a/gpu/utils/DeviceUtils.cu
-+++ b/gpu/utils/DeviceUtils.cu
-@@ -111,11 +111,23 @@ int getDeviceForAddress(const void* p) {
- FAISS_ASSERT_FMT(err == cudaErrorInvalidValue,
- "unknown error %d", (int) err);
- return -1;
-- } else if (att.memoryType == cudaMemoryTypeHost) {
-+ }
-+
-+ // memoryType is deprecated for CUDA 10.0+
-+#if CUDA_VERSION < 10000
-+ if (att.memoryType == cudaMemoryTypeHost) {
- return -1;
- } else {
- return att.device;
- }
-+#else
-+ // FIXME: what to use for managed memory?
-+ if (att.type == cudaMemoryTypeDevice) {
-+ return att.device;
-+ } else {
-+ return -1;
-+ }
-+#endif
- }
-
- bool getFullUnifiedMemSupport(int device) {
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch b/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch
deleted file mode 100644
index 69fa6a8c..00000000
--- a/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch
+++ /dev/null
@@ -1,26 +0,0 @@
-From c33f254e219c318e25497a8e399ccbe43bd28687 Mon Sep 17 00:00:00 2001
-From: "H. Vetinari"
-Date: Thu, 8 Oct 2020 12:45:09 +0200
-Subject: [PATCH 6/8] update util-guard for ampere; backport of
- facebookresearch/faiss#1380
-
----
- gpu/utils/DeviceDefs.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh
-index 89d3dda..bc0f9b5 100644
---- a/gpu/utils/DeviceDefs.cuh
-+++ b/gpu/utils/DeviceDefs.cuh
-@@ -13,7 +13,7 @@
- namespace faiss { namespace gpu {
-
- #ifdef __CUDA_ARCH__
--#if __CUDA_ARCH__ <= 750
-+#if __CUDA_ARCH__ <= 800
- constexpr int kWarpSize = 32;
- #else
- #error Unknown __CUDA_ARCH__; please define parameters for compute capability
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0007-use-c-14.patch b/recipe/patches/0007-use-c-14.patch
deleted file mode 100644
index 6d8baf31..00000000
--- a/recipe/patches/0007-use-c-14.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 5e576fb95233970295084e7d05b6818d7667af6a Mon Sep 17 00:00:00 2001
-From: "H. Vetinari"
-Date: Thu, 8 Oct 2020 18:06:20 +0200
-Subject: [PATCH 7/8] use c++14
-
----
- configure.ac | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/configure.ac b/configure.ac
-index 31b587b..5755fb6 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -16,7 +16,7 @@ AC_CONFIG_MACRO_DIR([acinclude])
- # Checks for programs.
- AC_LANG(C++)
- AC_PROG_CXX
--AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory])
-+AX_CXX_COMPILE_STDCXX([14], [noext], [mandatory])
- AC_PROG_CPP
- AC_PROG_MAKE_SET
- AC_PROG_MKDIR_P
---
-2.26.2.windows.1
-
diff --git a/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch b/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch
deleted file mode 100644
index e08c2726..00000000
--- a/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch
+++ /dev/null
@@ -1,33 +0,0 @@
-From fb6103d0d1f044c8b27450d93f1d1d6f82c2c1b4 Mon Sep 17 00:00:00 2001
-From: "H. Vetinari"
-Date: Fri, 9 Oct 2020 18:24:57 +0200
-Subject: [PATCH 8/8] backport facebookresearch/faiss#1388
-
----
- gpu/StandardGpuResources.cpp | 6 +++++-
- 1 file changed, 5 insertions(+), 1 deletion(-)
-
-diff --git a/gpu/StandardGpuResources.cpp b/gpu/StandardGpuResources.cpp
-index e564f8e..257fc2a 100644
---- a/gpu/StandardGpuResources.cpp
-+++ b/gpu/StandardGpuResources.cpp
-@@ -249,11 +249,15 @@ StandardGpuResources::initializeForDevice(int device) {
- blasHandles_[device] = blasHandle;
-
- // Enable tensor core support if available
--#if CUDA_VERSION >= 9000
-+#if CUDA_VERSION >= 9000 && CUDA_VERSION < 11000
-+ // This flag was deprecated in CUDA 11
- if (getTensorCoreSupport(device)) {
- cublasSetMathMode(blasHandle, CUBLAS_TENSOR_OP_MATH);
- }
- #endif
-+#if CUDA_VERSION >= 11000
-+ cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
-+#endif
-
- FAISS_ASSERT(memory_.count(device) == 0);
-
---
-2.26.2.windows.1
-