diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml
index 1c4c1717..34ff6e86 100755
--- a/.azure-pipelines/azure-pipelines-linux.yml
+++ b/.azure-pipelines/azure-pipelines-linux.yml
@@ -11,28 +11,27 @@ jobs:
       linux_64_cuda_compiler_version10.0:
         CONFIG: linux_64_cuda_compiler_version10.0
         UPLOAD_PACKAGES: 'True'
-        DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.0
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.0
       linux_64_cuda_compiler_version10.1:
         CONFIG: linux_64_cuda_compiler_version10.1
         UPLOAD_PACKAGES: 'True'
-        DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.1
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.1
       linux_64_cuda_compiler_version10.2:
         CONFIG: linux_64_cuda_compiler_version10.2
         UPLOAD_PACKAGES: 'True'
-        DOCKER_IMAGE: condaforge/linux-anvil-cuda:10.2
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:10.2
       linux_64_cuda_compiler_version11.0:
         CONFIG: linux_64_cuda_compiler_version11.0
         UPLOAD_PACKAGES: 'True'
-        DOCKER_IMAGE: condaforge/linux-anvil-cuda:11.0
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:11.0
       linux_64_cuda_compiler_version9.2:
         CONFIG: linux_64_cuda_compiler_version9.2
         UPLOAD_PACKAGES: 'True'
-        DOCKER_IMAGE: condaforge/linux-anvil-cuda:9.2
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cuda:9.2
       linux_64_cuda_compiler_versionNone:
         CONFIG: linux_64_cuda_compiler_versionNone
         UPLOAD_PACKAGES: 'True'
-        DOCKER_IMAGE: condaforge/linux-anvil-comp7
-    maxParallel: 8
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-comp7
   timeoutInMinutes: 360
 
   steps:
diff --git a/.azure-pipelines/azure-pipelines-osx.yml b/.azure-pipelines/azure-pipelines-osx.yml
index 568ec85f..0c892401 100755
--- a/.azure-pipelines/azure-pipelines-osx.yml
+++ b/.azure-pipelines/azure-pipelines-osx.yml
@@ -11,7 +11,6 @@ jobs:
       osx_64_:
         CONFIG: osx_64_
         UPLOAD_PACKAGES: 'True'
-    maxParallel: 8
   timeoutInMinutes: 360
 
   steps:
diff --git a/.azure-pipelines/azure-pipelines-win.yml b/.azure-pipelines/azure-pipelines-win.yml
new file mode 100755
index 00000000..1fd8ea12
--- /dev/null
+++ b/.azure-pipelines/azure-pipelines-win.yml
@@ -0,0 +1,110 @@
+# This file was generated automatically from conda-smithy. To update this configuration,
+# update the conda-forge.yml and/or the recipe/meta.yaml.
+# -*- mode: yaml -*-
+
+jobs:
+- job: win
+  pool:
+    vmImage: vs2017-win2016
+  strategy:
+    matrix:
+      win_64_:
+        CONFIG: win_64_
+        UPLOAD_PACKAGES: 'True'
+  timeoutInMinutes: 360
+  variables:
+    CONDA_BLD_PATH: D:\\bld\\
+
+  steps:
+    - script: |
+        choco install vcpython27 -fdv -y --debug
+      condition: contains(variables['CONFIG'], 'vs2008')
+      displayName: Install vcpython27.msi (if needed)
+
+    # Cygwin's git breaks conda-build. (See https://github.com/conda-forge/conda-smithy-feedstock/pull/2.)
+    # - script: rmdir C:\cygwin /s /q
+    #   continueOnError: true
+
+    - powershell: |
+        Set-PSDebug -Trace 1
+
+        $batchcontent = @"
+        ECHO ON
+        SET vcpython=C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0
+
+        DIR "%vcpython%"
+
+        CALL "%vcpython%\vcvarsall.bat" %*
+        "@
+
+        $batchDir = "C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0\VC"
+        $batchPath = "$batchDir" + "\vcvarsall.bat"
+        New-Item -Path $batchPath -ItemType "file" -Force
+
+        Set-Content -Value $batchcontent -Path $batchPath
+
+        Get-ChildItem -Path $batchDir
+
+        Get-ChildItem -Path ($batchDir + '\..')
+
+      condition: contains(variables['CONFIG'], 'vs2008')
+      displayName: Patch vs2008 (if needed)
+
+    - task: CondaEnvironment@1
+      inputs:
+        packageSpecs: 'python=3.6 conda-build conda "conda-forge-ci-setup=3" pip' # Optional
+        installOptions: "-c conda-forge"
+        updateConda: true
+      displayName: Install conda-build and activate environment
+
+    - script: set PYTHONUNBUFFERED=1
+      displayName: Set PYTHONUNBUFFERED
+
+    # Configure the VM
+    - script: |
+        call activate base
+        setup_conda_rc .\ ".\recipe" .\.ci_support\%CONFIG%.yaml
+      displayName: conda-forge CI setup
+
+    # Configure the VM.
+    - script: |
+        set "CI=azure"
+        call activate base
+        run_conda_forge_build_setup
+      displayName: conda-forge build setup
+    
+
+    # Special cased version setting some more things!
+    - script: |
+        call activate base
+        conda.exe build "recipe" -m .ci_support\%CONFIG%.yaml
+      displayName: Build recipe (vs2008)
+      env:
+        VS90COMNTOOLS: "C:\\Program Files (x86)\\Common Files\\Microsoft\\Visual C++ for Python\\9.0\\VC\\bin"
+        PYTHONUNBUFFERED: 1
+      condition: contains(variables['CONFIG'], 'vs2008')
+
+    - script: |
+        call activate base
+        conda.exe build "recipe" -m .ci_support\%CONFIG%.yaml
+      displayName: Build recipe
+      env:
+        PYTHONUNBUFFERED: 1
+      condition: not(contains(variables['CONFIG'], 'vs2008'))
+    - script: |
+        set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%"
+        call activate base
+        validate_recipe_outputs "%FEEDSTOCK_NAME%"
+      displayName: Validate Recipe Outputs
+
+    - script: |
+        set "GIT_BRANCH=%BUILD_SOURCEBRANCHNAME%"
+        set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%"
+        call activate base
+        upload_package --validate --feedstock-name="%FEEDSTOCK_NAME%" .\ ".\recipe" .ci_support\%CONFIG%.yaml
+      displayName: Upload package
+      env:
+        BINSTAR_TOKEN: $(BINSTAR_TOKEN)
+        FEEDSTOCK_TOKEN: $(FEEDSTOCK_TOKEN)
+        STAGING_BINSTAR_TOKEN: $(STAGING_BINSTAR_TOKEN)
+      condition: and(succeeded(), not(eq(variables['UPLOAD_PACKAGES'], 'False')))
\ No newline at end of file
diff --git a/.ci_support/linux_64_cuda_compiler_version10.0.yaml b/.ci_support/linux_64_cuda_compiler_version10.0.yaml
index 507da21a..a2227db8 100644
--- a/.ci_support/linux_64_cuda_compiler_version10.0.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version10.0.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
 channel_sources:
 - conda-forge,defaults
 channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
 cxx_compiler_version:
 - '7'
 docker_image:
-- condaforge/linux-anvil-cuda:10.0
+- quay.io/condaforge/linux-anvil-cuda:10.0
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,8 +31,11 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - linux-64
 zip_keys:
 - - cuda_compiler_version
   - docker_image
+- - python
+  - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version10.1.yaml b/.ci_support/linux_64_cuda_compiler_version10.1.yaml
index d9dfec45..cfb515bb 100644
--- a/.ci_support/linux_64_cuda_compiler_version10.1.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version10.1.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
 channel_sources:
 - conda-forge,defaults
 channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
 cxx_compiler_version:
 - '7'
 docker_image:
-- condaforge/linux-anvil-cuda:10.1
+- quay.io/condaforge/linux-anvil-cuda:10.1
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,8 +31,11 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - linux-64
 zip_keys:
 - - cuda_compiler_version
   - docker_image
+- - python
+  - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version10.2.yaml b/.ci_support/linux_64_cuda_compiler_version10.2.yaml
index 2d051101..59928208 100644
--- a/.ci_support/linux_64_cuda_compiler_version10.2.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version10.2.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
 channel_sources:
 - conda-forge,defaults
 channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
 cxx_compiler_version:
 - '7'
 docker_image:
-- condaforge/linux-anvil-cuda:10.2
+- quay.io/condaforge/linux-anvil-cuda:10.2
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,8 +31,11 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - linux-64
 zip_keys:
 - - cuda_compiler_version
   - docker_image
+- - python
+  - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version11.0.yaml b/.ci_support/linux_64_cuda_compiler_version11.0.yaml
index 14414976..681c7001 100644
--- a/.ci_support/linux_64_cuda_compiler_version11.0.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version11.0.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
 channel_sources:
 - conda-forge,defaults
 channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
 cxx_compiler_version:
 - '7'
 docker_image:
-- condaforge/linux-anvil-cuda:11.0
+- quay.io/condaforge/linux-anvil-cuda:11.0
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,8 +31,11 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - linux-64
 zip_keys:
 - - cuda_compiler_version
   - docker_image
+- - python
+  - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_version9.2.yaml b/.ci_support/linux_64_cuda_compiler_version9.2.yaml
index 7e2594ef..10b31134 100644
--- a/.ci_support/linux_64_cuda_compiler_version9.2.yaml
+++ b/.ci_support/linux_64_cuda_compiler_version9.2.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
 channel_sources:
 - conda-forge,defaults
 channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
 cxx_compiler_version:
 - '7'
 docker_image:
-- condaforge/linux-anvil-cuda:9.2
+- quay.io/condaforge/linux-anvil-cuda:9.2
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,8 +31,11 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - linux-64
 zip_keys:
 - - cuda_compiler_version
   - docker_image
+- - python
+  - numpy
diff --git a/.ci_support/linux_64_cuda_compiler_versionNone.yaml b/.ci_support/linux_64_cuda_compiler_versionNone.yaml
index 5c699f1c..9cc1d4cc 100644
--- a/.ci_support/linux_64_cuda_compiler_versionNone.yaml
+++ b/.ci_support/linux_64_cuda_compiler_versionNone.yaml
@@ -1,3 +1,5 @@
+cdt_name:
+- cos6
 channel_sources:
 - conda-forge,defaults
 channel_targets:
@@ -11,11 +13,16 @@ cxx_compiler:
 cxx_compiler_version:
 - '7'
 docker_image:
-- condaforge/linux-anvil-comp7
+- quay.io/condaforge/linux-anvil-comp7
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,8 +31,11 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - linux-64
 zip_keys:
 - - cuda_compiler_version
   - docker_image
+- - python
+  - numpy
diff --git a/.ci_support/migrations/python39.yaml b/.ci_support/migrations/python39.yaml
new file mode 100644
index 00000000..1c915c15
--- /dev/null
+++ b/.ci_support/migrations/python39.yaml
@@ -0,0 +1,27 @@
+migrator_ts: 1602104489
+__migrator:
+    migration_number: 2
+    operation: key_add
+    primary_key: python
+    ordering:
+        python:
+            - 3.6.* *_cpython
+            - 3.7.* *_cpython
+            - 3.8.* *_cpython
+            - 3.9.* *_cpython   # new entry
+            - 3.6.* *_73_pypy
+    paused: false
+    longterm: True
+    pr_limit: 50
+    exclude:
+      # this shouldn't attempt to modify the python feedstocks
+      - python
+      - pypy3.6
+      - pypy-meta
+python:
+  - 3.9.* *_cpython
+# additional entries to add for zip_keys
+numpy:
+  - 1.19
+python_impl:
+  - cpython
diff --git a/.ci_support/osx_64_.yaml b/.ci_support/osx_64_.yaml
index adcbfc34..6768d9d9 100644
--- a/.ci_support/osx_64_.yaml
+++ b/.ci_support/osx_64_.yaml
@@ -13,9 +13,14 @@ cxx_compiler_version:
 libblas:
 - 3.8 *netlib
 liblapack:
-- 3.8.0 *netlib
+- 3.8 *netlib
 macos_machine:
 - x86_64-apple-darwin13.4.0
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
 pin_run_as_build:
   python:
     min_pin: x.x
@@ -24,5 +29,9 @@ python:
 - 3.6.* *_cpython
 - 3.7.* *_cpython
 - 3.8.* *_cpython
+- 3.9.* *_cpython
 target_platform:
 - osx-64
+zip_keys:
+- - python
+  - numpy
diff --git a/.ci_support/win_64_.yaml b/.ci_support/win_64_.yaml
new file mode 100644
index 00000000..642d5c24
--- /dev/null
+++ b/.ci_support/win_64_.yaml
@@ -0,0 +1,31 @@
+channel_sources:
+- conda-forge,defaults
+channel_targets:
+- conda-forge main
+cuda_compiler_version:
+- None
+cxx_compiler:
+- vs2017
+libblas:
+- 3.8 *netlib
+liblapack:
+- 3.8 *netlib
+numpy:
+- '1.16'
+- '1.16'
+- '1.16'
+- '1.19'
+pin_run_as_build:
+  python:
+    min_pin: x.x
+    max_pin: x.x
+python:
+- 3.6.* *_cpython
+- 3.7.* *_cpython
+- 3.8.* *_cpython
+- 3.9.* *_cpython
+target_platform:
+- win-64
+zip_keys:
+- - python
+  - numpy
diff --git a/README.md b/README.md
index 511432d1..093aa804 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,13 @@ Current build status
                   <img src="https://dev.azure.com/conda-forge/feedstock-builds/_apis/build/status/faiss-split-feedstock?branchName=master&jobName=osx&configuration=osx_64_" alt="variant">
                 </a>
               </td>
+            </tr><tr>
+              <td>win_64</td>
+              <td>
+                <a href="https://dev.azure.com/conda-forge/feedstock-builds/_build/latest?definitionId=9713&branchName=master">
+                  <img src="https://dev.azure.com/conda-forge/feedstock-builds/_apis/build/status/faiss-split-feedstock?branchName=master&jobName=win&configuration=win_64_" alt="variant">
+                </a>
+              </td>
             </tr>
           </tbody>
         </table>
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 33a441c1..6b346f50 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -4,4 +4,5 @@
 
 jobs:
   - template: ./.azure-pipelines/azure-pipelines-linux.yml
+  - template: ./.azure-pipelines/azure-pipelines-win.yml
   - template: ./.azure-pipelines/azure-pipelines-osx.yml
\ No newline at end of file
diff --git a/build-locally.py b/build-locally.py
index d92f4edc..3453cfe6 100755
--- a/build-locally.py
+++ b/build-locally.py
@@ -61,7 +61,7 @@ def main(args=None):
         help="Setup debug environment using `conda debug`",
     )
     p.add_argument(
-        "--output-id", help="If running debug, specifiy the output to setup."
+        "--output-id", help="If running debug, specify the output to setup."
     )
 
     ns = p.parse_args(args=args)
diff --git a/recipe/build-lib.bat b/recipe/build-lib.bat
index 640e266b..9b97b2e6 100644
--- a/recipe/build-lib.bat
+++ b/recipe/build-lib.bat
@@ -1,9 +1,19 @@
-@echo off
+:: Build faiss.dll
+cmake -B _build ^
+    -DBUILD_SHARED_LIBS=ON ^
+    -DBUILD_TESTING=OFF ^
+    -DFAISS_ENABLE_GPU=OFF ^
+    -DFAISS_ENABLE_PYTHON=OFF ^
+    -DCMAKE_BUILD_TYPE=Release ^
+    -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^
+    -DCMAKE_INSTALL_BINDIR="%LIBRARY_BIN%" ^
+    -DCMAKE_INSTALL_LIBDIR="%LIBRARY_LIB%" ^
+    -DCMAKE_INSTALL_INCLUDEDIR="%LIBRARY_INC%" ^
+    .
+if %ERRORLEVEL% neq 0 exit 1
 
-:: there's a symbolic link from faiss/ to ./ in the upstream repo that does not work with windows;
-:: delete symlink & copy entire source recursively (= "/S") to folder faiss to work around it
-rmdir faiss
-robocopy . faiss /S
+cmake --build _build --config Release -j %CPU_COUNT%
+if %ERRORLEVEL% neq 0 exit 1
 
-call %BUILD_PREFIX%\Library\bin\run_autotools_clang_conda_build.bat build-lib.sh
+cmake --install _build --config Release --prefix %PREFIX%
 if %ERRORLEVEL% neq 0 exit 1
diff --git a/recipe/build-lib.sh b/recipe/build-lib.sh
index be79f863..0fb8bc2d 100644
--- a/recipe/build-lib.sh
+++ b/recipe/build-lib.sh
@@ -1,7 +1,9 @@
 # function for facilitate version comparison; cf. https://stackoverflow.com/a/37939589
 function version2int { echo "$@" | awk -F. '{ printf("%d%02d\n", $1, $2); }'; }
 
-CUDA_CONFIG_ARG=""
+set -e
+
+declare -a CUDA_CONFIG_ARGS
 if [ ${cuda_compiler_version} != "None" ]; then
     # for documentation see e.g.
     # docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#building-for-maximum-compatibility
@@ -22,27 +24,44 @@ if [ ${cuda_compiler_version} != "None" ]; then
         LATEST_ARCH=80
     fi
     for arch in "${ARCHES[@]}"; do
-        CUDA_ARCH="${CUDA_ARCH} -gencode=arch=compute_${arch},code=sm_${arch}";
+        CMAKE_CUDA_ARCHS="${CMAKE_CUDA_ARCHS+${CMAKE_CUDA_ARCHS};}${arch}-real"
     done
     # to support PTX JIT compilation; see first link above or cf.
     # devblogs.nvidia.com/cuda-pro-tip-understand-fat-binaries-jit-caching
-    CUDA_ARCH="${CUDA_ARCH} -gencode=arch=compute_${LATEST_ARCH},code=compute_${LATEST_ARCH}"
+    # "-real" = SASS (code=sm_XX), "-virtual" = PTX (code=compute_XX); see cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html
+    CMAKE_CUDA_ARCHS="${CMAKE_CUDA_ARCHS+${CMAKE_CUDA_ARCHS};}${LATEST_ARCH}-virtual"
+
+    FAISS_ENABLE_GPU="ON"
+    CUDA_CONFIG_ARGS+=(
+        -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHS}"
+    )
+    # cmake does not generate output for the call below; echo some info
+    echo "Set up extra cmake-args: CUDA_CONFIG_ARGS=${CUDA_CONFIG_ARGS+"${CUDA_CONFIG_ARGS[@]}"}"
 
-    CUDA_CONFIG_ARG="--with-cuda=${CUDA_HOME}"
+    # Acc. to https://cmake.org/cmake/help/v3.19/module/FindCUDAToolkit.html#search-behavior
+    # CUDA toolkit is searched relative to `nvcc` first before considering
+    # "-DCUDAToolkit_ROOT=${CUDA_HOME}". We have multiple workarounds:
+    #   - Add symlinks from ${CUDA_HOME} to ${BUILD_PREFIX}
+    #   - Add ${CUDA_HOME}/bin to ${PATH}
+    #   - Remove `nvcc` wrapper in ${BUILD_PREFIX} so that `nvcc` from ${CUDA_HOME} gets found.
+    # TODO: Fix this in nvcc-feedstock or cmake-feedstock.
+    # NOTE: It's okay for us to not use the wrapper since CMake adds -ccbin itself.
+    rm "${BUILD_PREFIX}/bin/nvcc"
 else
-    CUDA_CONFIG_ARG="--without-cuda"
+    FAISS_ENABLE_GPU="OFF"
 fi
 
-# need to regenerate ./configure for patch to configure.ac to take effect
-autoreconf -i -f
-
 # Build vanilla version (no avx)
-./configure --prefix=${PREFIX} --exec-prefix=${PREFIX} \
-  --with-blas=-lblas --with-lapack=-llapack \
-  ${CUDA_CONFIG_ARG} --with-cuda-arch="${CUDA_ARCH}" || exit 1
-
-# make sets SHAREDEXT correctly for linux/osx
-make install
+cmake -B _build_generic \
+      -DBUILD_SHARED_LIBS=ON \
+      -DBUILD_TESTING=ON \
+      -DFAISS_ENABLE_PYTHON=OFF \
+      -DFAISS_ENABLE_GPU=${FAISS_ENABLE_GPU} \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_INSTALL_LIBDIR=lib \
+      ${CUDA_CONFIG_ARGS+"${CUDA_CONFIG_ARGS[@]}"} \
+      --verbose \
+      .
 
-# make builds libfaiss.a & libfaiss.so; we only want the latter
-rm ${PREFIX}/lib/libfaiss.a
+cmake --build _build_generic -j $CPU_COUNT
+cmake --install _build_generic --prefix $PREFIX
diff --git a/recipe/build-pkg.bat b/recipe/build-pkg.bat
index 5608d2e8..0af669a0 100644
--- a/recipe/build-pkg.bat
+++ b/recipe/build-pkg.bat
@@ -1,9 +1,19 @@
-@echo off
+:: Build vanilla version (no avx2).
+:: Do not use the Python3_* variants for cmake
+cmake -B _build_python ^
+    -DFAISS_ENABLE_GPU=OFF ^
+    -DCMAKE_BUILD_TYPE=Release ^
+    -DPython_EXECUTABLE="%PYTHON%" ^
+    faiss/python
+if %ERRORLEVEL% neq 0 exit 1
 
-:: there's a symbolic link from faiss/ to ./ in the upstream repo that does not work with windows;
-:: delete symlink & copy entire source recursively (= "/S") to folder faiss to work around it
-rmdir faiss
-robocopy . faiss /S
+cmake --build _build_python --config Release -j %CPU_COUNT%
+if %ERRORLEVEL% neq 0 exit 1
 
-call %BUILD_PREFIX%\Library\bin\run_autotools_clang_conda_build.bat build-pkg.sh
+:: Build actual python module.
+pushd _build_python
+%PYTHON% setup.py install --single-version-externally-managed --record=record.txt --prefix=%PREFIX%
 if %ERRORLEVEL% neq 0 exit 1
+popd
+:: clean up cmake-cache between builds
+rd /S /Q _build_python
diff --git a/recipe/build-pkg.sh b/recipe/build-pkg.sh
index 84dd2183..eb44cc51 100644
--- a/recipe/build-pkg.sh
+++ b/recipe/build-pkg.sh
@@ -1,15 +1,35 @@
-CUDA_CONFIG_ARG=""
+set -e
+
+FAISS_ENABLE_GPU=""
 if [ ${cuda_compiler_version} != "None" ]; then
-    CUDA_CONFIG_ARG="--with-cuda=${CUDA_HOME}"
+    FAISS_ENABLE_GPU="ON"
+    # Acc. to https://cmake.org/cmake/help/v3.19/module/FindCUDAToolkit.html#search-behavior
+    # CUDA toolkit is searched relative to `nvcc` first before considering
+    # "-DCUDAToolkit_ROOT=${CUDA_HOME}". We have multiple workarounds:
+    #   - Add symlinks from ${CUDA_HOME} to ${BUILD_PREFIX}
+    #   - Add ${CUDA_HOME}/bin to ${PATH}
+    #   - Remove `nvcc` wrapper in ${BUILD_PREFIX} so that `nvcc` from ${CUDA_HOME} gets found.
+    # TODO: Fix this in nvcc-feedstock or cmake-feedstock.
+    # NOTE: It's okay for us to not use the wrapper since CMake adds -ccbin itself.
+    rm "${BUILD_PREFIX}/bin/nvcc"
 else
-    CUDA_CONFIG_ARG="--without-cuda"
+    FAISS_ENABLE_GPU="OFF"
 fi
 
-# Build vanilla version (no avx)
-./configure --with-blas=-lblas --with-lapack=-llapack ${CUDA_CONFIG_ARG}
-
-make -C python build
+# Build vanilla version (no avx2), see build-lib.sh
+# Do not use the Python3_* variants for cmake
+cmake -B _build_python \
+      -Dfaiss_ROOT=${PREFIX}\
+      -DFAISS_ENABLE_GPU=${FAISS_ENABLE_GPU} \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DPython_EXECUTABLE="${PYTHON}" \
+      faiss/python
 
-cd python
+cmake --build _build_python -j $CPU_COUNT
 
-$PYTHON -m pip install . -vv
+# Build actual python module.
+pushd _build_python
+$PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX
+popd
+# clean up cmake-cache between builds
+rm -r _build_python
diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
index 5700830c..274fe175 100644
--- a/recipe/conda_build_config.yaml
+++ b/recipe/conda_build_config.yaml
@@ -1,2 +1,9 @@
 cuda_compiler_version:
 - None                         # [not linux]
+
+cxx_compiler_version:          # [unix]
+  # need to downgrade on osx due to a bug that breaks the test suite
+  - 10                         # [osx]
+  # need to downgrade on linux due to nvcc 9.2 not being able to deal with gcc>7,
+  # and conda-build not being able to zip this with cuda_compiler_version
+  - 7                          # [linux]
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 0f6a0d74..153a6abe 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,24 +1,28 @@
-{% set version = "1.6.3" %}
-{% set number = 3 %}
+{% set version = "1.6.4" %}
+{% set number = 0 %}
 # see github.com/conda-forge/conda-forge.github.io/issues/1059 for naming discussion
 {% set faiss_proc_type = "cuda" if cuda_compiler_version != "None" else "cpu" %}
 
-# headers for upstream-folders './*.h', 'impl/*.h', 'utils/*.h'
-# gpu adds headers in 'gpu/*.h', 'gpu/impl/*.(cu)?h', 'gpu/utils/*.(cu)?h'
+# headers for upstream-folders 'faiss/*.h', 'faiss/{impl,utils}/*.h',
+# see https://github.com/facebookresearch/faiss/blob/v1.6.4/faiss/CMakeLists.txt
+# gpu adds headers in 'faiss/gpu/*.h', 'faiss/gpu/{impl,utils}/*.(cu)?h'
 {% set headers = [
-    'AutoTune.h', 'clone_index.h', 'Clustering.h', 'DirectMap.h', 'Index.h', 'Index2Layer.h',
-    'IndexBinary.h', 'IndexBinaryFlat.h', 'IndexBinaryFromFloat.h', 'IndexBinaryHash.h',
-    'IndexBinaryHNSW.h', 'IndexBinaryIVF.h', 'IndexFlat.h', 'IndexHNSW.h', 'IndexIVF.h',
-    'IndexIVFFlat.h', 'IndexIVFPQ.h', 'IndexIVFPQR.h', 'IndexIVFSpectralHash.h', 'IndexLattice.h',
-    'IndexLSH.h', 'IndexPQ.h', 'IndexPreTransform.h', 'IndexReplicas.h', 'IndexScalarQuantizer.h',
-    'IndexShards.h', 'index_factory.h', 'index_io.h', 'InvertedLists.h', 'IVFlib.h',
-    'MatrixStats.h', 'MetaIndexes.h', 'MetricType.h', 'OnDiskInvertedLists.h', 'VectorTransform.h',
+    'AutoTune.h', 'Clustering.h', 'DirectMap.h', 'IVFlib.h', 'Index.h', 'Index2Layer.h',
+    'IndexBinary.h', 'IndexBinaryFlat.h', 'IndexBinaryFromFloat.h', 'IndexBinaryHNSW.h',
+    'IndexBinaryHash.h', 'IndexBinaryIVF.h', 'IndexFlat.h', 'IndexHNSW.h', 'IndexIVF.h',
+    'IndexIVFFlat.h', 'IndexIVFPQ.h', 'IndexIVFPQR.h', 'IndexIVFSpectralHash.h', 'IndexLSH.h',
+    'IndexLattice.h', 'IndexPQ.h', 'IndexPreTransform.h', 'IndexReplicas.h',
+    'IndexScalarQuantizer.h', 'IndexShards.h', 'InvertedLists.h', 'MatrixStats.h',
+    'MetaIndexes.h', 'MetricType.h', 'VectorTransform.h', 'clone_index.h', 'index_factory.h',
+    'index_io.h',
     'impl/AuxIndexStructures.h', 'impl/FaissAssert.h', 'impl/FaissException.h', 'impl/HNSW.h',
-    'impl/io.h', 'impl/lattice_Zn.h', 'impl/PolysemousTraining.h', 'impl/ProductQuantizer-inl.h',
-    'impl/ProductQuantizer.h', 'impl/ScalarQuantizer.h', 'impl/ThreadedIndex-inl.h',
-    'impl/ThreadedIndex.h',
-    'utils/distances.h', 'utils/extra_distances.h', 'utils/hamming-inl.h', 'utils/hamming.h',
-    'utils/Heap.h', 'utils/random.h', 'utils/utils.h', 'utils/WorkerThread.h'
+    'impl/PolysemousTraining.h', 'impl/ProductQuantizer-inl.h', 'impl/ProductQuantizer.h',
+    'impl/ScalarQuantizer.h', 'impl/ThreadedIndex-inl.h', 'impl/ThreadedIndex.h', 'impl/io.h',
+    'impl/io_macros.h', 'impl/lattice_Zn.h', 'impl/platform_macros.h',
+    'utils/Heap.h', 'utils/WorkerThread.h', 'utils/distances.h', 'utils/extra_distances.h',
+    'utils/hamming-inl.h', 'utils/hamming.h', 'utils/random.h', 'utils/utils.h'
+] + (not win) * [
+    'OnDiskInvertedLists.h'
 ] + (cuda_compiler_version != "None") * [
     'gpu/GpuAutoTune.h', 'gpu/GpuCloner.h', 'gpu/GpuClonerOptions.h', 'gpu/GpuDistance.h',
     'gpu/GpuFaissAssert.h', 'gpu/GpuIndex.h', 'gpu/GpuIndexBinaryFlat.h', 'gpu/GpuIndexFlat.h',
@@ -30,12 +34,12 @@
     'gpu/impl/FlatIndex.cuh', 'gpu/impl/GeneralDistance.cuh', 'gpu/impl/GpuScalarQuantizer.cuh',
     'gpu/impl/IVFAppend.cuh', 'gpu/impl/IVFBase.cuh', 'gpu/impl/IVFFlat.cuh',
     'gpu/impl/IVFFlatScan.cuh', 'gpu/impl/IVFPQ.cuh', 'gpu/impl/IVFUtils.cuh',
-    'gpu/impl/L2Norm.cuh', 'gpu/impl/L2Select.cuh', 'gpu/impl/Metrics.cuh',
+    'gpu/impl/L2Norm.cuh', 'gpu/impl/L2Select.cuh',
     'gpu/impl/PQCodeDistances-inl.cuh', 'gpu/impl/PQCodeDistances.cuh', 'gpu/impl/PQCodeLoad.cuh',
     'gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh', 'gpu/impl/PQScanMultiPassNoPrecomputed.cuh',
     'gpu/impl/PQScanMultiPassPrecomputed.cuh', 'gpu/impl/VectorResidual.cuh',
-    'gpu/utils/DeviceMemory.h', 'gpu/utils/DeviceUtils.h', 'gpu/utils/MemorySpace.h',
-    'gpu/utils/StackDeviceMemory.h', 'gpu/utils/StaticUtils.h', 'gpu/utils/Timer.h',
+    'gpu/utils/DeviceUtils.h', 'gpu/utils/StackDeviceMemory.h', 'gpu/utils/StaticUtils.h',
+    'gpu/utils/Timer.h',
     'gpu/utils/BlockSelectKernel.cuh', 'gpu/utils/Comparators.cuh',
     'gpu/utils/ConversionOperators.cuh', 'gpu/utils/CopyUtils.cuh', 'gpu/utils/DeviceDefs.cuh',
     'gpu/utils/DeviceTensor-inl.cuh', 'gpu/utils/DeviceTensor.cuh', 'gpu/utils/DeviceVector.cuh',
@@ -55,26 +59,14 @@ package:
 
 source:
   url: https://github.com/facebookresearch/faiss/archive/v{{ version }}.tar.gz
-  sha256: e1a41c159f0b896975fbb133e0240a233af5c9286c09a28fde6aefff5336e542
+  sha256: 6ed6311415ccb70ea1afe82bafc24e9f3aa60c06c9d030bfc23ee4173d59cc99
   patches:
-    - patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch
-    - patches/0002-remove-fPIC-flag-for-msvc-compat.patch  # [win]
-    # add upstream commits (64dd9884, e05f773f, ba061ff82) at the recommendation
-    # of the maintainers. These can be removed upon the next release.
-    - patches/0003-remove-deleted-files-from-template-change.patch
-    - patches/0004-CUDA-8-fixes.patch
-    - patches/0005-CUDA-10-fixes.patch
-    # backport of facebookresearch/faiss#1380 to be able to build for Ampere
-    - patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch
-    # use c++14 (in autoconf macro AX_CXX_COMPILE_STDCXX)
-    - patches/0007-use-c-14.patch
-    # backport of facebookresearch/faiss#1388, see conda-forge/faiss-split-feedstock#14
-    - patches/0008-backport-facebookresearch-faiss-1388.patch
+    # backport of facebookresearch/faiss#1541, can be dropped for ver>1.6.5
+    - patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch
+    - patches/0002-use-c-14.patch
 
 build:
   number: {{ number }}
-  # TODO: figure out compiler errors on windows
-  skip: true  # [win]
   # GPU version only for linux
   skip: true  # [(win or osx) and cuda_compiler_version != "None"]
 
@@ -108,41 +100,32 @@ outputs:
       build:
         - {{ compiler('cxx') }}
         - {{ compiler('cuda') }}  # [cuda_compiler_version != "None"]
-        - autoconf                # [not win]
-        - automake                # [not win]
-        - libtool                 # [not win]
         - cmake
         - libgomp                 # [linux]
-        - llvm-openmp             # [osx or win]
-        - autotools_clang_conda   # [win]
+        - llvm-openmp             # [osx]
       host:
         - libblas
         - liblapack
-      run:
-        - libblas
-        - liblapack
       run_constrained:
         - faiss-proc * {{ faiss_proc_type }}
 
     test:
       commands:
         # shared
-        - test -f $PREFIX/lib/libfaiss.so                           # [linux]
-        - test -f $PREFIX/lib/libfaiss.dylib                        # [osx]
-        - if not exist %PREFIX%\\Library\\bin\\libfaiss.dll exit 1  # [win]
+        - test -f $PREFIX/lib/libfaiss.so               # [linux]
+        - test -f $PREFIX/lib/libfaiss.dylib            # [osx]
+        - if not exist %LIBRARY_BIN%\faiss.dll exit 1   # [win]
+        # On windows, faiss.lib is an "import library";
+        # Deleting it breaks the faiss-builds
+        - if not exist %LIBRARY_LIB%\faiss.lib exit 1   # [win]
 
         # absence of static libraries
-        - test ! -f $PREFIX/lib/libfaiss.a                          # [not win]
-        - if exist %PREFIX%\\Library\\lib\\libfaiss.lib exit 1      # [win]
-
-        # conda tools
-        - conda inspect linkages -p $PREFIX $PKG_NAME               # [not win]
-        - conda inspect objects -p $PREFIX $PKG_NAME                # [osx]
+        - test ! -f $PREFIX/lib/libfaiss.a              # [not win]
 
         # headers
         {% for each_header in headers %}
-        - test -f $PREFIX/include/faiss/{{ each_header }}                                    # [unix]
-        - if not exist %LIBRARY_INC%\\faiss\\{{ "\\".join(each_header.split("/")) }} exit 1  # [win]
+        - test -f $PREFIX/include/faiss/{{ each_header }} || (echo "{{ each_header }} not found" && exit 1)  # [unix]
+        - if not exist %LIBRARY_INC%\faiss\{{ "\\".join(each_header.split("/")) }} exit 1                    # [win]
         {% endfor %}
 
   - name: faiss
@@ -156,17 +139,17 @@ outputs:
         - {{ compiler('cuda') }}  # [cuda_compiler_version != "None"]
         - swig
         - cmake
-        - autotools_clang_conda   # [win]
+        - libgomp                 # [linux]
+        - llvm-openmp             # [osx]
       host:
         - python
         - pip
-        - numpy =1.14.*
+        - numpy
         - libfaiss ={{ version }}=*_{{ faiss_proc_type }}
         - libblas
         - liblapack
-        - libgomp                 # [linux]
-        - llvm-openmp             # [osx or win]
       run:
+        - python
         - libfaiss ={{ version }}=*_{{ faiss_proc_type }}
         - {{ pin_compatible('numpy') }}
       run_constrained:
@@ -182,42 +165,33 @@ outputs:
         - scipy
       source_files:
         - tests/
-        - conda/faiss/run_test.py
-        - conda/faiss-gpu/run_test.py
       imports:
         - faiss
       commands:
-        - python conda/faiss/run_test.py
-        # running the following test requires an actual GPU device, which is not available in CI
-        # - python conda/faiss-gpu/run_test.py  # [cuda_compiler_version != "None"]
         - python -m unittest discover tests
+        # running the following test requires an actual GPU device, which is not available in CI
+        # - python -m unittest discover faiss/gpu/test/
 
   # for compatibility with (& ease of migration from) existing packages in the pytorch channel
   - name: faiss-cpu
     build:
-      string: "py{{ CONDA_PY }}_{{ number }}"
       skip: true  # [cuda_compiler_version != "None"]
     requirements:
-      host:
-        - python
       run:
         - faiss ={{ version }}=*_cpu
     test:
-      commands:
-        - exit 0
+      imports:
+        - faiss
 
   - name: faiss-gpu
     build:
-      string: "py{{ CONDA_PY }}_{{ number }}"
       skip: true  # [cuda_compiler_version == "None"]
     requirements:
-      host:
-        - python
       run:
         - faiss ={{ version }}=*_cuda
     test:
-      commands:
-        - exit 0
+      imports:
+        - faiss
 
 about:
   home: https://github.com/facebookresearch/faiss
diff --git a/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch b/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch
new file mode 100644
index 00000000..c01d2211
--- /dev/null
+++ b/recipe/patches/0001-CMake-use-GNUInstallDirs-instead-of-hardcoded-paths.patch
@@ -0,0 +1,76 @@
+From e80893144a71dd48d829f611bf2708d9372f71bf Mon Sep 17 00:00:00 2001
+From: Mo Zhou <cdluminate@gmail.com>
+Date: Tue, 24 Nov 2020 05:44:37 +0000
+Subject: [PATCH 1/2] CMake: use GNUInstallDirs instead of hardcoded paths.
+
+---
+ CMakeLists.txt           |  1 +
+ faiss/CMakeLists.txt     | 14 +++++++-------
+ faiss/gpu/CMakeLists.txt |  2 +-
+ 3 files changed, 9 insertions(+), 8 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 29b73d8..158712e 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -11,6 +11,7 @@ project(faiss
+   DESCRIPTION "A library for efficient similarity search and clustering of dense vectors."
+   HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
+   LANGUAGES CXX)
++include(GNUInstallDirs)
+ 
+ set(CMAKE_CXX_STANDARD 11)
+ 
+diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
+index 51680fd..3469538 100644
+--- a/faiss/CMakeLists.txt
++++ b/faiss/CMakeLists.txt
+@@ -160,15 +160,15 @@ endif()
+ 
+ install(TARGETS faiss
+   EXPORT faiss-targets
+-  RUNTIME DESTINATION bin
+-  ARCHIVE DESTINATION lib
+-  LIBRARY DESTINATION lib
+-  INCLUDES DESTINATION include
++  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
++  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
++  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
++  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+ )
+ foreach(header ${FAISS_HEADERS})
+   get_filename_component(dir ${header} DIRECTORY )
+   install(FILES ${header}
+-    DESTINATION include/faiss/${dir}
++    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${dir}
+   )
+ endforeach()
+ 
+@@ -185,9 +185,9 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/faiss-config.cmake.in
+ )
+ install(FILES ${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake
+   ${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake
+-  DESTINATION share/faiss
++  DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss
+ )
+ 
+ install(EXPORT faiss-targets
+-  DESTINATION share/faiss
++  DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss
+ )
+diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
+index 5b2a957..8041bac 100644
+--- a/faiss/gpu/CMakeLists.txt
++++ b/faiss/gpu/CMakeLists.txt
+@@ -173,7 +173,7 @@ set(FAISS_GPU_HEADERS
+ foreach(header ${FAISS_GPU_HEADERS})
+   get_filename_component(dir ${header} DIRECTORY )
+   install(FILES ${header}
+-    DESTINATION include/faiss/gpu/${dir}
++    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/gpu/${dir}
+   )
+ endforeach()
+ 
+-- 
+2.29.2.windows.3
+
diff --git a/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch b/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch
deleted file mode 100644
index 7b378223..00000000
--- a/recipe/patches/0001-change-python-Makefile-to-point-to-libfaiss.-SHLIB_E.patch
+++ /dev/null
@@ -1,41 +0,0 @@
-From 5f990ac215a73ba6c3bea8b4033f5b0f5d026237 Mon Sep 17 00:00:00 2001
-From: "H. Vetinari" <h.vetinari@gmx.com>
-Date: Thu, 7 May 2020 11:05:05 +0200
-Subject: [PATCH 1/8] change python/Makefile to point to libfaiss.$(SHLIB_EXT)
- instead of .a
-
-also give definite names to makefile-rules, as otherwise make may
-struggle to find make _swigfaiss.so otherwise.
----
- python/Makefile | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/python/Makefile b/python/Makefile
-index 2836568..eaa32a4 100644
---- a/python/Makefile
-+++ b/python/Makefile
-@@ -12,18 +12,18 @@ endif
- all: build
- 
- # Also silently generates swigfaiss.py.
--swigfaiss.cpp: swigfaiss.swig ../libfaiss.a
-+swigfaiss.cpp: swigfaiss.swig $(PREFIX)/lib/libfaiss$(SHLIB_EXT)
- 	$(SWIG) -python -c++ -Doverride= -I../ $(SWIGFLAGS) -o $@ $<
- 
- swigfaiss_avx2.cpp: swigfaiss.swig ../libfaiss.a
- 	$(SWIG) -python -c++ -Doverride= -module swigfaiss_avx2 -I../ $(SWIGFLAGS) -o $@ $<
- 
--%.o: %.cpp
-+swigfaiss.o: swigfaiss.cpp
- 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) $(PYTHONCFLAGS) \
-                -I../ -c $< -o $@
- 
- # Extension is .so even on OSX.
--_%.so: %.o ../libfaiss.a
-+_swigfaiss.so: swigfaiss.o $(PREFIX)/lib/libfaiss$(SHLIB_EXT)
- 	$(CXX) $(SHAREDFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
- 
- build: _swigfaiss.so faiss.py
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch b/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch
deleted file mode 100644
index e8774ed9..00000000
--- a/recipe/patches/0002-remove-fPIC-flag-for-msvc-compat.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From c8dc93d045be8485a4863d20b36e12d1aa2c6c8a Mon Sep 17 00:00:00 2001
-From: "H. Vetinari" <h.vetinari@gmx.com>
-Date: Fri, 8 May 2020 20:12:30 +0200
-Subject: [PATCH 2/8] remove -fPIC flag for msvc-compat
-
----
- makefile.inc.in | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/makefile.inc.in b/makefile.inc.in
-index 19d8511..1c4e976 100644
---- a/makefile.inc.in
-+++ b/makefile.inc.in
-@@ -6,7 +6,7 @@
- CXX          = @CXX@
- CXXCPP       = @CXXCPP@
- CPPFLAGS     = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@
--CXXFLAGS     = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
-+CXXFLAGS     = @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
- CPUFLAGS     = @ARCH_CPUFLAGS@
- LDFLAGS      = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@
- LIBS         = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0002-use-c-14.patch b/recipe/patches/0002-use-c-14.patch
new file mode 100644
index 00000000..b1cc3690
--- /dev/null
+++ b/recipe/patches/0002-use-c-14.patch
@@ -0,0 +1,39 @@
+From 65f07198267d0b777c03b21ae81c0b27577176fc Mon Sep 17 00:00:00 2001
+From: "H. Vetinari" <h.vetinari@gmx.com>
+Date: Mon, 26 Oct 2020 22:44:44 +0100
+Subject: [PATCH 2/2] use c++14
+
+---
+ CMakeLists.txt              | 2 +-
+ faiss/python/CMakeLists.txt | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 158712e..1b2038f 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -13,7 +13,7 @@ project(faiss
+   LANGUAGES CXX)
+ include(GNUInstallDirs)
+ 
+-set(CMAKE_CXX_STANDARD 11)
++set(CMAKE_CXX_STANDARD 14)
+ 
+ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+ 
+diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
+index 0b4366d..1ef5dea 100644
+--- a/faiss/python/CMakeLists.txt
++++ b/faiss/python/CMakeLists.txt
+@@ -11,7 +11,7 @@ project(pyfaiss
+   HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
+   LANGUAGES CXX)
+ 
+-set(CMAKE_CXX_STANDARD 11)
++set(CMAKE_CXX_STANDARD 14)
+ 
+ find_package(SWIG REQUIRED COMPONENTS python)
+ include(${SWIG_USE_FILE})
+-- 
+2.29.2.windows.3
+
diff --git a/recipe/patches/0003-remove-deleted-files-from-template-change.patch b/recipe/patches/0003-remove-deleted-files-from-template-change.patch
deleted file mode 100644
index 9217a390..00000000
--- a/recipe/patches/0003-remove-deleted-files-from-template-change.patch
+++ /dev/null
@@ -1,1191 +0,0 @@
-From 94bd01d4b4f457f40994df7817d8b789a99d605b Mon Sep 17 00:00:00 2001
-From: Jeff Johnson <jhj@fb.com>
-Date: Wed, 25 Mar 2020 10:57:57 -0700
-Subject: [PATCH 3/8] remove deleted files from template change
-
----
- gpu/impl/PQCodeDistances.cu              | 567 ---------------------
- gpu/impl/PQScanMultiPassNoPrecomputed.cu | 597 -----------------------
- 2 files changed, 1164 deletions(-)
- delete mode 100644 gpu/impl/PQCodeDistances.cu
- delete mode 100644 gpu/impl/PQScanMultiPassNoPrecomputed.cu
-
-diff --git a/gpu/impl/PQCodeDistances.cu b/gpu/impl/PQCodeDistances.cu
-deleted file mode 100644
-index 817990b..0000000
---- a/gpu/impl/PQCodeDistances.cu
-+++ /dev/null
-@@ -1,567 +0,0 @@
--/**
-- * Copyright (c) Facebook, Inc. and its affiliates.
-- *
-- * This source code is licensed under the MIT license found in the
-- * LICENSE file in the root directory of this source tree.
-- */
--
--
--#include <faiss/gpu/impl/PQCodeDistances.cuh>
--
--#include <faiss/gpu/impl/BroadcastSum.cuh>
--#include <faiss/gpu/impl/Distance.cuh>
--#include <faiss/gpu/impl/L2Norm.cuh>
--#include <faiss/gpu/utils/ConversionOperators.cuh>
--#include <faiss/gpu/utils/DeviceDefs.cuh>
--#include <faiss/gpu/utils/DeviceUtils.h>
--#include <faiss/gpu/utils/Float16.cuh>
--#include <faiss/gpu/utils/MatrixMult.cuh>
--#include <faiss/gpu/utils/PtxUtils.cuh>
--#include <faiss/gpu/utils/StaticUtils.h>
--#include <faiss/gpu/utils/Transpose.cuh>
--
--namespace faiss { namespace gpu {
--
--template <typename T>
--struct Converter {
--};
--
--template <>
--struct Converter<half> {
--  inline static __device__ half to(float v) { return __float2half(v); }
--};
--
--template <>
--struct Converter<float> {
--  inline static __device__ float to(float v) { return v; }
--};
--
--// Kernel responsible for calculating distance from residual vector to
--// each product quantizer code centroid
--template <typename OutCodeT, int DimsPerSubQuantizer, bool L2Distance>
--__global__ void
--__launch_bounds__(288, 4)
--pqCodeDistances(Tensor<float, 2, true> queries,
--                int queriesPerBlock,
--                Tensor<float, 2, true> coarseCentroids,
--                Tensor<float, 3, true> pqCentroids,
--                Tensor<int, 2, true> topQueryToCentroid,
--                // (query id)(coarse)(subquantizer)(code) -> dist
--                Tensor<OutCodeT, 4, true> outCodeDistances) {
--  const auto numSubQuantizers = pqCentroids.getSize(0);
--  const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
--  assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
--  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
--
--  bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
--  int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
--
--  extern __shared__ float smem[];
--
--  // Each thread calculates a single code
--  float subQuantizerData[DimsPerSubQuantizer];
--
--  auto code = threadIdx.x;
--  auto subQuantizer = blockIdx.y;
--
--  // Each thread will load the pq centroid data for the code that it
--  // is processing
--#pragma unroll
--  for (int i = 0; i < DimsPerSubQuantizer; ++i) {
--    subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
--  }
--
--  // Where we store our query vector
--  float* smemQuery = smem;
--
--  // Where we store our residual vector; this is double buffered so we
--  // can be loading the next one while processing the current one
--  float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
--  float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
--
--  // Where we pre-load the coarse centroid IDs
--  int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
--
--  // Each thread is calculating the distance for a single code,
--  // performing the reductions locally
--
--  // Handle multiple queries per block
--  auto startQueryId = blockIdx.x * queriesPerBlock;
--  auto numQueries = queries.getSize(0) - startQueryId;
--  if (numQueries > queriesPerBlock) {
--    numQueries = queriesPerBlock;
--  }
--
--  for (int query = 0; query < numQueries; ++query) {
--    auto queryId = startQueryId + query;
--
--    auto querySubQuantizer =
--      queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
--
--    // Load current query vector
--    for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
--      smemQuery[i] = querySubQuantizer[i];
--    }
--
--    // Load list of coarse centroids found
--    for (int i = threadIdx.x;
--         i < topQueryToCentroid.getSize(1); i += blockDim.x) {
--      coarseIds[i] = topQueryToCentroid[queryId][i];
--    }
--
--    // We need coarseIds below
--    // FIXME: investigate loading separately, so we don't need this
--    __syncthreads();
--
--    // Preload first buffer of residual data
--    if (isLoadingThread) {
--      for (int i = loadingThreadId;
--           i < DimsPerSubQuantizer;
--           i += blockDim.x - codesPerSubQuantizer) {
--        auto coarseId = coarseIds[0];
--        // In case NaNs were in the original query data
--        coarseId = coarseId == -1 ? 0 : coarseId;
--        auto coarseCentroidSubQuantizer =
--          coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
--
--        if (L2Distance) {
--          smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
--        } else {
--          smemResidual1[i] = coarseCentroidSubQuantizer[i];
--        }
--      }
--    }
--
--    // The block walks the list for a single query
--    for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
--      // Wait for smemResidual1 to be loaded
--      __syncthreads();
--
--      if (isLoadingThread) {
--        // Preload second buffer of residual data
--        for (int i = loadingThreadId;
--             i < DimsPerSubQuantizer;
--             i += blockDim.x - codesPerSubQuantizer) {
--          // FIXME: try always making this centroid id 0 so we can
--          // terminate
--          if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
--            auto coarseId = coarseIds[coarse + 1];
--            // In case NaNs were in the original query data
--            coarseId = coarseId == -1 ? 0 : coarseId;
--
--            auto coarseCentroidSubQuantizer =
--              coarseCentroids[coarseId]
--              [subQuantizer * dimsPerSubQuantizer].data();
--
--            if (L2Distance) {
--              smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
--            } else {
--              smemResidual2[i] = coarseCentroidSubQuantizer[i];
--            }
--          }
--        }
--      } else {
--        // These are the processing threads
--        float dist = 0.0f;
--
--        constexpr int kUnroll = 4;
--        constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
--        constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
--        float vals[kUnroll];
--
--        // Calculate residual - pqCentroid for each dim that we're
--        // processing
--
--        // Unrolled loop
--        if (L2Distance) {
--#pragma unroll
--          for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              vals[j] = smemResidual1[i * kUnroll + j];
--            }
--
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              vals[j] -= subQuantizerData[i * kUnroll + j];
--            }
--
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              vals[j] *= vals[j];
--            }
--
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              dist += vals[j];
--            }
--          }
--        } else {
--          // Inner product: query slice against the reconstructed sub-quantizer
--          // for this coarse cell (query o (centroid + subQCentroid))
--#pragma unroll
--          for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              vals[j] = smemResidual1[i * kUnroll + j];
--            }
--
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              vals[j] += subQuantizerData[i * kUnroll + j];
--            }
--
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              vals[j] *= smemQuery[i * kUnroll + j];
--            }
--
--#pragma unroll
--            for (int j = 0; j < kUnroll; ++j) {
--              dist += vals[j];
--            }
--          }
--        }
--
--        // Remainder loop
--        if (L2Distance) {
--#pragma unroll
--          for (int j = 0; j < kRemainder; ++j) {
--            vals[j] = smemResidual1[kRemainderBase + j];
--          }
--
--#pragma unroll
--          for (int j = 0; j < kRemainder; ++j) {
--            vals[j] -= subQuantizerData[kRemainderBase + j];
--          }
--
--#pragma unroll
--          for (int j = 0; j < kRemainder; ++j) {
--            vals[j] *= vals[j];
--          }
--        } else {
--          // Inner product
--          // Inner product: query slice against the reconstructed sub-quantizer
--          // for this coarse cell (query o (centroid + subQCentroid))
--#pragma unroll
--          for (int j = 0; j < kRemainder; ++j) {
--            vals[j] = smemResidual1[kRemainderBase + j];
--          }
--
--#pragma unroll
--          for (int j = 0; j < kRemainder; ++j) {
--            vals[j] += subQuantizerData[kRemainderBase + j];
--          }
--
--#pragma unroll
--          for (int j = 0; j < kRemainder; ++j) {
--            vals[j] *= smemQuery[kRemainderBase + j];
--          }
--        }
--
--#pragma unroll
--        for (int j = 0; j < kRemainder; ++j) {
--          dist += vals[j];
--        }
--
--        // We have the distance for our code; write it out
--        outCodeDistances[queryId][coarse][subQuantizer][code] =
--          Converter<OutCodeT>::to(dist);
--      } // !isLoadingThread
--
--      // Swap residual buffers
--      float* tmp = smemResidual1;
--      smemResidual1 = smemResidual2;
--      smemResidual2 = tmp;
--    }
--  }
--}
--
--__global__ void
--residualVector(Tensor<float, 2, true> queries,
--               Tensor<float, 2, true> coarseCentroids,
--               Tensor<int, 2, true> topQueryToCentroid,
--               int numSubDim,
--               // output is transposed:
--               // (sub q)(query id)(centroid id)(sub dim)
--               Tensor<float, 4, true> residual) {
--  // block x is query id
--  // block y is centroid id
--  // thread x is dim
--  auto queryId = blockIdx.x;
--  auto centroidId = blockIdx.y;
--
--  int realCentroidId = topQueryToCentroid[queryId][centroidId];
--
--  for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
--    float q = queries[queryId][dim];
--    float c = coarseCentroids[realCentroidId][dim];
--
--    residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] =
--      q - c;
--  }
--}
--
--void
--runResidualVector(Tensor<float, 3, true>& pqCentroids,
--                  Tensor<float, 2, true>& queries,
--                  Tensor<float, 2, true>& coarseCentroids,
--                  Tensor<int, 2, true>& topQueryToCentroid,
--                  Tensor<float, 4, true>& residual,
--                  cudaStream_t stream) {
--  auto grid =
--    dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
--  auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
--
--  residualVector<<<grid, block, 0, stream>>>(
--    queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
--    residual);
--
--  CUDA_TEST_ERROR();
--}
--
--void
--runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
--                     Tensor<float, 2, true>& queries,
--                     Tensor<float, 2, true>& coarseCentroids,
--                     Tensor<int, 2, true>& topQueryToCentroid,
--                     NoTypeTensor<4, true>& outCodeDistances,
--                     bool useFloat16Lookup,
--                     DeviceMemory& mem,
--                     cublasHandle_t handle,
--                     cudaStream_t stream) {
--  // Calculate (q - c) residual vector
--  // (sub q)(query id)(centroid id)(sub dim)
--  DeviceTensor<float, 4, true> residual(
--    mem,
--    {pqCentroids.getSize(0),
--        topQueryToCentroid.getSize(0),
--        topQueryToCentroid.getSize(1),
--        pqCentroids.getSize(1)},
--    stream);
--
--  runResidualVector(pqCentroids, queries,
--                    coarseCentroids, topQueryToCentroid,
--                    residual, stream);
--
--  // Calculate ||q - c||^2
--  DeviceTensor<float, 1, true> residualNorms(
--    mem,
--    {pqCentroids.getSize(0) *
--        topQueryToCentroid.getSize(0) *
--        topQueryToCentroid.getSize(1)},
--    stream);
--
--  auto residualView2 = residual.view<2>(
--    {pqCentroids.getSize(0) *
--        topQueryToCentroid.getSize(0) *
--        topQueryToCentroid.getSize(1),
--        pqCentroids.getSize(1)});
--
--  runL2Norm(residualView2, true, residualNorms, true, stream);
--
--  // Perform a batch MM:
--  // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
--  // (sub q) x {(q * c)(code)}
--  auto residualView3 = residual.view<3>(
--    {pqCentroids.getSize(0),
--        topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
--        pqCentroids.getSize(1)});
--
--  DeviceTensor<float, 3, true> residualDistance(
--    mem,
--    {pqCentroids.getSize(0),
--        topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
--        pqCentroids.getSize(2)},
--    stream);
--
--  runIteratedMatrixMult(residualDistance, false,
--                        residualView3, false,
--                        pqCentroids, false,
--                        -2.0f, 0.0f,
--                        handle,
--                        stream);
--
--  // Sum ||q - c||^2 along rows
--  auto residualDistanceView2 = residualDistance.view<2>(
--    {pqCentroids.getSize(0) *
--        topQueryToCentroid.getSize(0) *
--        topQueryToCentroid.getSize(1),
--        pqCentroids.getSize(2)});
--
--  runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
--
--  Tensor<float, 4, true> outCodeDistancesF;
--  DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
--
--  if (useFloat16Lookup) {
--    outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
--      mem, {outCodeDistances.getSize(0),
--          outCodeDistances.getSize(1),
--          outCodeDistances.getSize(2),
--          outCodeDistances.getSize(3)},
--      stream);
--
--    outCodeDistancesF = outCodeDistancesFloatMem;
--  } else {
--    outCodeDistancesF = outCodeDistances.toTensor<float>();
--  }
--
--  // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
--  // is where we build our output distances)
--  auto outCodeDistancesView = outCodeDistancesF.view<3>(
--    {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
--        outCodeDistances.getSize(2),
--        outCodeDistances.getSize(3)});
--
--  runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
--
--  // Calculate code norms per each sub-dim
--  // (sub q)(sub dim)(code) is pqCentroids
--  // transpose to (sub q)(code)(sub dim)
--  DeviceTensor<float, 3, true> pqCentroidsTranspose(
--    mem,
--    {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
--    stream);
--
--  runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
--
--  auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
--    {pqCentroids.getSize(0) * pqCentroids.getSize(2),
--        pqCentroids.getSize(1)});
--
--  DeviceTensor<float, 1, true> pqCentroidsNorm(
--    mem,
--    {pqCentroids.getSize(0) * pqCentroids.getSize(2)},
--    stream);
--
--  runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
--
--  // View output as (q * c)(sub q * code), and add centroid norm to
--  // each row
--  auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
--    {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
--        outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
--
--  runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
--
--  if (useFloat16Lookup) {
--    // Need to convert back
--    auto outCodeDistancesH = outCodeDistances.toTensor<half>();
--    convertTensor<float, half, 4>(stream,
--                                  outCodeDistancesF,
--                                  outCodeDistancesH);
--  }
--}
--
--void
--runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
--                   Tensor<float, 2, true>& queries,
--                   Tensor<float, 2, true>& coarseCentroids,
--                   Tensor<int, 2, true>& topQueryToCentroid,
--                   NoTypeTensor<4, true>& outCodeDistances,
--                   bool l2Distance,
--                   bool useFloat16Lookup,
--                   cudaStream_t stream) {
--  const auto numSubQuantizers = pqCentroids.getSize(0);
--  const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
--  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
--
--  // FIXME: tune
--  // Reuse of pq centroid data is based on both # of queries * nprobe,
--  // and we should really be tiling in both dimensions
--  constexpr int kQueriesPerBlock = 8;
--
--  auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
--                   numSubQuantizers);
--
--  // Reserve one block of threads for double buffering
--  // FIXME: probably impractical for large # of dims?
--  auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
--  auto block = dim3(codesPerSubQuantizer + loadingThreads);
--
--  auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
--    + topQueryToCentroid.getSize(1) * sizeof(int);
--
--#define RUN_CODE(DIMS, L2)                                              \
--  do {                                                                  \
--    if (useFloat16Lookup) {                                             \
--      auto outCodeDistancesT = outCodeDistances.toTensor<half>();       \
--                                                                        \
--      pqCodeDistances<half, DIMS, L2><<<grid, block, smem, stream>>>(   \
--        queries, kQueriesPerBlock,                                      \
--        coarseCentroids, pqCentroids,                                   \
--        topQueryToCentroid, outCodeDistancesT);                         \
--    } else {                                                            \
--      auto outCodeDistancesT = outCodeDistances.toTensor<float>();      \
--                                                                        \
--      pqCodeDistances<float, DIMS, L2><<<grid, block, smem, stream>>>(  \
--        queries, kQueriesPerBlock,                                      \
--        coarseCentroids, pqCentroids,                                   \
--        topQueryToCentroid, outCodeDistancesT);                         \
--    }                                                                   \
--  } while (0)
--
--#define CODE_L2(DIMS)                           \
--  do {                                          \
--    if (l2Distance) {                           \
--      RUN_CODE(DIMS, true);                     \
--    } else {                                    \
--      RUN_CODE(DIMS, false);                    \
--    }                                           \
--  } while (0)
--
--  switch (dimsPerSubQuantizer) {
--    case 1:
--      CODE_L2(1);
--      break;
--    case 2:
--      CODE_L2(2);
--      break;
--    case 3:
--      CODE_L2(3);
--      break;
--    case 4:
--      CODE_L2(4);
--      break;
--    case 6:
--      CODE_L2(6);
--      break;
--    case 8:
--      CODE_L2(8);
--      break;
--    case 10:
--      CODE_L2(10);
--      break;
--    case 12:
--      CODE_L2(12);
--      break;
--    case 16:
--      CODE_L2(16);
--      break;
--    case 20:
--      CODE_L2(20);
--      break;
--    case 24:
--      CODE_L2(24);
--      break;
--    case 28:
--      CODE_L2(28);
--      break;
--    case 32:
--      CODE_L2(32);
--      break;
--      // FIXME: larger sizes require too many registers - we need the
--      // MM implementation working
--    default:
--      FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
--                      "not currently supported");
--  }
--
--#undef RUN_CODE
--#undef CODE_L2
--
--  CUDA_TEST_ERROR();
--}
--
--} } // namespace
-diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/gpu/impl/PQScanMultiPassNoPrecomputed.cu
-deleted file mode 100644
-index a514694..0000000
---- a/gpu/impl/PQScanMultiPassNoPrecomputed.cu
-+++ /dev/null
-@@ -1,597 +0,0 @@
--/**
-- * Copyright (c) Facebook, Inc. and its affiliates.
-- *
-- * This source code is licensed under the MIT license found in the
-- * LICENSE file in the root directory of this source tree.
-- */
--
--
--#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh>
--#include <faiss/gpu/GpuResources.h>
--#include <faiss/gpu/impl/PQCodeDistances.cuh>
--#include <faiss/gpu/impl/PQCodeLoad.cuh>
--#include <faiss/gpu/impl/IVFUtils.cuh>
--#include <faiss/gpu/utils/ConversionOperators.cuh>
--#include <faiss/gpu/utils/DeviceTensor.cuh>
--#include <faiss/gpu/utils/DeviceUtils.h>
--#include <faiss/gpu/utils/Float16.cuh>
--#include <faiss/gpu/utils/LoadStoreOperators.cuh>
--#include <faiss/gpu/utils/NoTypeTensor.cuh>
--#include <faiss/gpu/utils/StaticUtils.h>
--
--#include <faiss/gpu/utils/HostTensor.cuh>
--
--namespace faiss { namespace gpu {
--
--// This must be kept in sync with PQCodeDistances.cu
--bool isSupportedNoPrecomputedSubDimSize(int dims) {
--  switch (dims) {
--    case 1:
--    case 2:
--    case 3:
--    case 4:
--    case 6:
--    case 8:
--    case 10:
--    case 12:
--    case 16:
--    case 20:
--    case 24:
--    case 28:
--    case 32:
--      return true;
--    default:
--      // FIXME: larger sizes require too many registers - we need the
--      // MM implementation working
--      return false;
--  }
--}
--
--template <typename LookupT, typename LookupVecT>
--struct LoadCodeDistances {
--  static inline __device__ void load(LookupT* smem,
--                                     LookupT* codes,
--                                     int numCodes) {
--    constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
--
--    // We can only use the vector type if the data is guaranteed to be
--    // aligned. The codes are innermost, so if it is evenly divisible,
--    // then any slice will be aligned.
--    if (numCodes % kWordSize == 0) {
--      // Load the data by float4 for efficiency, and then handle any remainder
--      // limitVec is the number of whole vec words we can load, in terms
--      // of whole blocks performing the load
--      constexpr int kUnroll = 2;
--      int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
--      limitVec *= kUnroll * blockDim.x;
--
--      LookupVecT* smemV = (LookupVecT*) smem;
--      LookupVecT* codesV = (LookupVecT*) codes;
--
--      for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
--        LookupVecT vals[kUnroll];
--
--#pragma unroll
--        for (int j = 0; j < kUnroll; ++j) {
--          vals[j] =
--            LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
--        }
--
--#pragma unroll
--        for (int j = 0; j < kUnroll; ++j) {
--          LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
--        }
--      }
--
--      // This is where we start loading the remainder that does not evenly
--      // fit into kUnroll x blockDim.x
--      int remainder = limitVec * kWordSize;
--
--      for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
--        smem[i] = codes[i];
--      }
--    } else {
--      // Potential unaligned load
--      constexpr int kUnroll = 4;
--
--      int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
--
--      int i = threadIdx.x;
--      for (; i < limit; i += kUnroll * blockDim.x) {
--        LookupT vals[kUnroll];
--
--#pragma unroll
--        for (int j = 0; j < kUnroll; ++j) {
--          vals[j] = codes[i + j * blockDim.x];
--        }
--
--#pragma unroll
--        for (int j = 0; j < kUnroll; ++j) {
--          smem[i + j * blockDim.x] = vals[j];
--        }
--      }
--
--      for (; i < numCodes; i += blockDim.x) {
--        smem[i] = codes[i];
--      }
--    }
--  }
--};
--
--template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
--__global__ void
--pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
--                             Tensor<float, 3, true> pqCentroids,
--                             Tensor<int, 2, true> topQueryToCentroid,
--                             Tensor<LookupT, 4, true> codeDistances,
--                             void** listCodes,
--                             int* listLengths,
--                             Tensor<int, 2, true> prefixSumOffsets,
--                             Tensor<float, 1, true> distance) {
--  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
--
--  // Where the pq code -> residual distance is stored
--  extern __shared__ char smemCodeDistances[];
--  LookupT* codeDist = (LookupT*) smemCodeDistances;
--
--  // Each block handles a single query
--  auto queryId = blockIdx.y;
--  auto probeId = blockIdx.x;
--
--  // This is where we start writing out data
--  // We ensure that before the array (at offset -1), there is a 0 value
--  int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
--  float* distanceOut = distance[outBase].data();
--
--  auto listId = topQueryToCentroid[queryId][probeId];
--  // Safety guard in case NaNs in input cause no list ID to be generated
--  if (listId == -1) {
--    return;
--  }
--
--  unsigned char* codeList = (unsigned char*) listCodes[listId];
--  int limit = listLengths[listId];
--
--  constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
--    (NumSubQuantizers / 4);
--  unsigned int code32[kNumCode32];
--  unsigned int nextCode32[kNumCode32];
--
--  // We double-buffer the code loading, which improves memory utilization
--  if (threadIdx.x < limit) {
--    LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
--  }
--
--  LoadCodeDistances<LookupT, LookupVecT>::load(
--    codeDist,
--    codeDistances[queryId][probeId].data(),
--    codeDistances.getSize(2) * codeDistances.getSize(3));
--
--  // Prevent WAR dependencies
--  __syncthreads();
--
--  // Each thread handles one code element in the list, with a
--  // block-wide stride
--  for (int codeIndex = threadIdx.x;
--       codeIndex < limit;
--       codeIndex += blockDim.x) {
--    // Prefetch next codes
--    if (codeIndex + blockDim.x < limit) {
--      LoadCode32<NumSubQuantizers>::load(
--        nextCode32, codeList, codeIndex + blockDim.x);
--    }
--
--    float dist = 0.0f;
--
--#pragma unroll
--    for (int word = 0; word < kNumCode32; ++word) {
--      constexpr int kBytesPerCode32 =
--        NumSubQuantizers < 4 ? NumSubQuantizers : 4;
--
--      if (kBytesPerCode32 == 1) {
--        auto code = code32[0];
--        dist = ConvertTo<float>::to(codeDist[code]);
--
--      } else {
--#pragma unroll
--        for (int byte = 0; byte < kBytesPerCode32; ++byte) {
--          auto code = getByte(code32[word], byte * 8, 8);
--
--          auto offset =
--            codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
--
--          dist += ConvertTo<float>::to(codeDist[offset + code]);
--        }
--      }
--    }
--
--    // Write out intermediate distance result
--    // We do not maintain indices here, in order to reduce global
--    // memory traffic. Those are recovered in the final selection step.
--    distanceOut[codeIndex] = dist;
--
--    // Rotate buffers
--#pragma unroll
--    for (int word = 0; word < kNumCode32; ++word) {
--      code32[word] = nextCode32[word];
--    }
--  }
--}
--
--void
--runMultiPassTile(Tensor<float, 2, true>& queries,
--                 Tensor<float, 2, true>& centroids,
--                 Tensor<float, 3, true>& pqCentroidsInnermostCode,
--                 NoTypeTensor<4, true>& codeDistances,
--                 Tensor<int, 2, true>& topQueryToCentroid,
--                 bool useFloat16Lookup,
--                 int bytesPerCode,
--                 int numSubQuantizers,
--                 int numSubQuantizerCodes,
--                 thrust::device_vector<void*>& listCodes,
--                 thrust::device_vector<void*>& listIndices,
--                 IndicesOptions indicesOptions,
--                 thrust::device_vector<int>& listLengths,
--                 Tensor<char, 1, true>& thrustMem,
--                 Tensor<int, 2, true>& prefixSumOffsets,
--                 Tensor<float, 1, true>& allDistances,
--                 Tensor<float, 3, true>& heapDistances,
--                 Tensor<int, 3, true>& heapIndices,
--                 int k,
--                 faiss::MetricType metric,
--                 Tensor<float, 2, true>& outDistances,
--                 Tensor<long, 2, true>& outIndices,
--                 cudaStream_t stream) {
--  // We only support two metrics at the moment
--  FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
--               metric == MetricType::METRIC_L2);
--
--  bool l2Distance = metric == MetricType::METRIC_L2;
--
--  // Calculate offset lengths, so we know where to write out
--  // intermediate results
--  runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
--                     thrustMem, stream);
--
--  // Calculate residual code distances, since this is without
--  // precomputed codes
--  runPQCodeDistances(pqCentroidsInnermostCode,
--                     queries,
--                     centroids,
--                     topQueryToCentroid,
--                     codeDistances,
--                     l2Distance,
--                     useFloat16Lookup,
--                     stream);
--
--  // Convert all codes to a distance, and write out (distance,
--  // index) values for all intermediate results
--  {
--    auto kThreadsPerBlock = 256;
--
--    auto grid = dim3(topQueryToCentroid.getSize(1),
--                     topQueryToCentroid.getSize(0));
--    auto block = dim3(kThreadsPerBlock);
--
--    // pq centroid distances
--    auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
--
--    smem *= numSubQuantizers * numSubQuantizerCodes;
--    FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
--
--#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T)                   \
--    do {                                                                \
--      auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>();         \
--                                                                        \
--      pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T>   \
--        <<<grid, block, smem, stream>>>(                                \
--          queries,                                                      \
--          pqCentroidsInnermostCode,                                     \
--          topQueryToCentroid,                                           \
--          codeDistancesT,                                               \
--          listCodes.data().get(),                                       \
--          listLengths.data().get(),                                     \
--          prefixSumOffsets,                                             \
--          allDistances);                                                \
--    } while (0)
--
--#define RUN_PQ(NUM_SUB_Q)                       \
--    do {                                        \
--      if (useFloat16Lookup) {                   \
--        RUN_PQ_OPT(NUM_SUB_Q, half, Half8);     \
--      } else {                                  \
--        RUN_PQ_OPT(NUM_SUB_Q, float, float4);   \
--      }                                         \
--    } while (0)
--
--    switch (bytesPerCode) {
--      case 1:
--        RUN_PQ(1);
--        break;
--      case 2:
--        RUN_PQ(2);
--        break;
--      case 3:
--        RUN_PQ(3);
--        break;
--      case 4:
--        RUN_PQ(4);
--        break;
--      case 8:
--        RUN_PQ(8);
--        break;
--      case 12:
--        RUN_PQ(12);
--        break;
--      case 16:
--        RUN_PQ(16);
--        break;
--      case 20:
--        RUN_PQ(20);
--        break;
--      case 24:
--        RUN_PQ(24);
--        break;
--      case 28:
--        RUN_PQ(28);
--        break;
--      case 32:
--        RUN_PQ(32);
--        break;
--      case 40:
--        RUN_PQ(40);
--        break;
--      case 48:
--        RUN_PQ(48);
--        break;
--      case 56:
--        RUN_PQ(56);
--        break;
--      case 64:
--        RUN_PQ(64);
--        break;
--      case 96:
--        RUN_PQ(96);
--        break;
--      default:
--        FAISS_ASSERT(false);
--        break;
--    }
--
--#undef RUN_PQ
--#undef RUN_PQ_OPT
--  }
--
--  CUDA_TEST_ERROR();
--
--  // k-select the output in chunks, to increase parallelism
--  runPass1SelectLists(prefixSumOffsets,
--                      allDistances,
--                      topQueryToCentroid.getSize(1),
--                      k,
--                      !l2Distance, // L2 distance chooses smallest
--                      heapDistances,
--                      heapIndices,
--                      stream);
--
--  // k-select final output
--  auto flatHeapDistances = heapDistances.downcastInner<2>();
--  auto flatHeapIndices = heapIndices.downcastInner<2>();
--
--  runPass2SelectLists(flatHeapDistances,
--                      flatHeapIndices,
--                      listIndices,
--                      indicesOptions,
--                      prefixSumOffsets,
--                      topQueryToCentroid,
--                      k,
--                      !l2Distance, // L2 distance chooses smallest
--                      outDistances,
--                      outIndices,
--                      stream);
--}
--
--void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
--                                     Tensor<float, 2, true>& centroids,
--                                     Tensor<float, 3, true>& pqCentroidsInnermostCode,
--                                     Tensor<int, 2, true>& topQueryToCentroid,
--                                     bool useFloat16Lookup,
--                                     int bytesPerCode,
--                                     int numSubQuantizers,
--                                     int numSubQuantizerCodes,
--                                     thrust::device_vector<void*>& listCodes,
--                                     thrust::device_vector<void*>& listIndices,
--                                     IndicesOptions indicesOptions,
--                                     thrust::device_vector<int>& listLengths,
--                                     int maxListLength,
--                                     int k,
--                                     faiss::MetricType metric,
--                                     // output
--                                     Tensor<float, 2, true>& outDistances,
--                                     // output
--                                     Tensor<long, 2, true>& outIndices,
--                                     GpuResources* res) {
--  constexpr int kMinQueryTileSize = 8;
--  constexpr int kMaxQueryTileSize = 128;
--  constexpr int kThrustMemSize = 16384;
--
--  int nprobe = topQueryToCentroid.getSize(1);
--
--  auto& mem = res->getMemoryManagerCurrentDevice();
--  auto stream = res->getDefaultStreamCurrentDevice();
--
--  // Make a reservation for Thrust to do its dirty work (global memory
--  // cross-block reduction space); hopefully this is large enough.
--  DeviceTensor<char, 1, true> thrustMem1(
--    mem, {kThrustMemSize}, stream);
--  DeviceTensor<char, 1, true> thrustMem2(
--    mem, {kThrustMemSize}, stream);
--  DeviceTensor<char, 1, true>* thrustMem[2] =
--    {&thrustMem1, &thrustMem2};
--
--  // How much temporary storage is available?
--  // If possible, we'd like to fit within the space available.
--  size_t sizeAvailable = mem.getSizeAvailable();
--
--  // We run two passes of heap selection
--  // This is the size of the first-level heap passes
--  constexpr int kNProbeSplit = 8;
--  int pass2Chunks = std::min(nprobe, kNProbeSplit);
--
--  size_t sizeForFirstSelectPass =
--    pass2Chunks * k * (sizeof(float) + sizeof(int));
--
--  // How much temporary storage we need per each query
--  size_t sizePerQuery =
--    2 * // streams
--    ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
--     nprobe * maxListLength * sizeof(float) + // allDistances
--     // residual distances
--     nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
--     sizeForFirstSelectPass);
--
--  int queryTileSize = (int) (sizeAvailable / sizePerQuery);
--
--  if (queryTileSize < kMinQueryTileSize) {
--    queryTileSize = kMinQueryTileSize;
--  } else if (queryTileSize > kMaxQueryTileSize) {
--    queryTileSize = kMaxQueryTileSize;
--  }
--
--  // FIXME: we should adjust queryTileSize to deal with this, since
--  // indexing is in int32
--  FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
--         std::numeric_limits<int>::max());
--
--  // Temporary memory buffers
--  // Make sure there is space prior to the start which will be 0, and
--  // will handle the boundary condition without branches
--  DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
--    mem, {queryTileSize * nprobe + 1}, stream);
--  DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
--    mem, {queryTileSize * nprobe + 1}, stream);
--
--  DeviceTensor<int, 2, true> prefixSumOffsets1(
--    prefixSumOffsetSpace1[1].data(),
--    {queryTileSize, nprobe});
--  DeviceTensor<int, 2, true> prefixSumOffsets2(
--    prefixSumOffsetSpace2[1].data(),
--    {queryTileSize, nprobe});
--  DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
--    {&prefixSumOffsets1, &prefixSumOffsets2};
--
--  // Make sure the element before prefixSumOffsets is 0, since we
--  // depend upon simple, boundary-less indexing to get proper results
--  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
--                              0,
--                              sizeof(int),
--                              stream));
--  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
--                              0,
--                              sizeof(int),
--                              stream));
--
--  int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
--
--  int totalCodeDistancesSize =
--    queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
--    codeDistanceTypeSize;
--
--  DeviceTensor<char, 1, true> codeDistances1Mem(
--    mem, {totalCodeDistancesSize}, stream);
--  NoTypeTensor<4, true> codeDistances1(
--    codeDistances1Mem.data(),
--    codeDistanceTypeSize,
--    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
--
--  DeviceTensor<char, 1, true> codeDistances2Mem(
--    mem, {totalCodeDistancesSize}, stream);
--  NoTypeTensor<4, true> codeDistances2(
--    codeDistances2Mem.data(),
--    codeDistanceTypeSize,
--    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
--
--  NoTypeTensor<4, true>* codeDistances[2] =
--    {&codeDistances1, &codeDistances2};
--
--  DeviceTensor<float, 1, true> allDistances1(
--    mem, {queryTileSize * nprobe * maxListLength}, stream);
--  DeviceTensor<float, 1, true> allDistances2(
--    mem, {queryTileSize * nprobe * maxListLength}, stream);
--  DeviceTensor<float, 1, true>* allDistances[2] =
--    {&allDistances1, &allDistances2};
--
--  DeviceTensor<float, 3, true> heapDistances1(
--    mem, {queryTileSize, pass2Chunks, k}, stream);
--  DeviceTensor<float, 3, true> heapDistances2(
--    mem, {queryTileSize, pass2Chunks, k}, stream);
--  DeviceTensor<float, 3, true>* heapDistances[2] =
--    {&heapDistances1, &heapDistances2};
--
--  DeviceTensor<int, 3, true> heapIndices1(
--    mem, {queryTileSize, pass2Chunks, k}, stream);
--  DeviceTensor<int, 3, true> heapIndices2(
--    mem, {queryTileSize, pass2Chunks, k}, stream);
--  DeviceTensor<int, 3, true>* heapIndices[2] =
--    {&heapIndices1, &heapIndices2};
--
--  auto streams = res->getAlternateStreamsCurrentDevice();
--  streamWait(streams, {stream});
--
--  int curStream = 0;
--
--  for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
--    int numQueriesInTile =
--      std::min(queryTileSize, queries.getSize(0) - query);
--
--    auto prefixSumOffsetsView =
--      prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
--
--    auto codeDistancesView =
--      codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
--    auto coarseIndicesView =
--      topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
--    auto queryView =
--      queries.narrowOutermost(query, numQueriesInTile);
--
--    auto heapDistancesView =
--      heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
--    auto heapIndicesView =
--      heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
--
--    auto outDistanceView =
--      outDistances.narrowOutermost(query, numQueriesInTile);
--    auto outIndicesView =
--      outIndices.narrowOutermost(query, numQueriesInTile);
--
--    runMultiPassTile(queryView,
--                     centroids,
--                     pqCentroidsInnermostCode,
--                     codeDistancesView,
--                     coarseIndicesView,
--                     useFloat16Lookup,
--                     bytesPerCode,
--                     numSubQuantizers,
--                     numSubQuantizerCodes,
--                     listCodes,
--                     listIndices,
--                     indicesOptions,
--                     listLengths,
--                     *thrustMem[curStream],
--                     prefixSumOffsetsView,
--                     *allDistances[curStream],
--                     heapDistancesView,
--                     heapIndicesView,
--                     k,
--                     metric,
--                     outDistanceView,
--                     outIndicesView,
--                     streams[curStream]);
--
--    curStream = (curStream + 1) % 2;
--  }
--
--  streamWait({stream}, streams);
--}
--
--} } // namespace
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0004-CUDA-8-fixes.patch b/recipe/patches/0004-CUDA-8-fixes.patch
deleted file mode 100644
index 430f063e..00000000
--- a/recipe/patches/0004-CUDA-8-fixes.patch
+++ /dev/null
@@ -1,117 +0,0 @@
-From 3a69eeeb6067ebd2e71e510e11535b59a4a7768b Mon Sep 17 00:00:00 2001
-From: Jeff Johnson <jhj@devfair032.maas>
-Date: Sun, 29 Mar 2020 20:07:01 -0700
-Subject: [PATCH 4/8] CUDA 8 fixes
-
----
- gpu/impl/Distance.cu          |  2 +-
- gpu/impl/GeneralDistance.cuh  | 15 ++++++++-------
- gpu/test/TestGpuIndexFlat.cpp |  2 +-
- gpu/utils/MathOperators.cuh   |  2 +-
- 4 files changed, 11 insertions(+), 10 deletions(-)
-
-diff --git a/gpu/impl/Distance.cu b/gpu/impl/Distance.cu
-index 63ed60b..3a46c37 100644
---- a/gpu/impl/Distance.cu
-+++ b/gpu/impl/Distance.cu
-@@ -64,7 +64,7 @@ void runDistance(bool computeL2,
-   if (centroids.numElements() == 0) {
-     thrust::fill(thrust::cuda::par.on(defaultStream),
-                  outDistances.data(), outDistances.end(),
--                 Limits<T>::getMax());
-+                 Limits<float>::getMax());
- 
-     thrust::fill(thrust::cuda::par.on(defaultStream),
-                  outIndices.data(), outIndices.end(),
-diff --git a/gpu/impl/GeneralDistance.cuh b/gpu/impl/GeneralDistance.cuh
-index 0ecdfa5..a2a447f 100644
---- a/gpu/impl/GeneralDistance.cuh
-+++ b/gpu/impl/GeneralDistance.cuh
-@@ -9,6 +9,7 @@
- #include <faiss/MetricType.h>
- #include <faiss/impl/AuxIndexStructures.h>
- #include <faiss/gpu/impl/DistanceUtils.cuh>
-+#include <faiss/gpu/utils/ConversionOperators.cuh>
- #include <faiss/gpu/utils/DeviceTensor.cuh>
- #include <faiss/gpu/utils/DeviceDefs.cuh>
- #include <faiss/gpu/utils/DeviceUtils.h>
-@@ -144,11 +145,11 @@ generalDistance(Tensor<T, 2, InnerContig> query, // m x k
- 
-         queryTileBase[threadIdx.x + i * kWarpSize] =
-                              kInBounds ?
--                             queryBase[k] : (T) 0; //DistanceOp::kIdentityData;
-+                             queryBase[k] : ConvertTo<T>::to(0);
- 
-         vecTileBase[threadIdx.x + i * kWarpSize] =
-                              kInBounds ?
--                             vecBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+                             vecBase[k] : ConvertTo<T>::to(0);
-       }
- 
-       __syncthreads();
-@@ -179,11 +180,11 @@ generalDistance(Tensor<T, 2, InnerContig> query, // m x k
-       // Load query tile
-       queryTileBase[threadIdx.x] =
-         queryThreadInBounds ?
--        queryBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+        queryBase[k] : ConvertTo<T>::to(0);
- 
-       vecTileBase[threadIdx.x] =
-         vecThreadInBoundsLoad ?
--        vecBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+        vecBase[k] : ConvertTo<T>::to(0);
- 
-       __syncthreads();
- 
-@@ -205,11 +206,11 @@ generalDistance(Tensor<T, 2, InnerContig> query, // m x k
-       // Load query tile
-       queryTileBase[threadIdx.x] =
-                            queryThreadInBounds && kInBounds ?
--                           queryBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+                           queryBase[k] : ConvertTo<T>::to(0);
- 
-       vecTileBase[threadIdx.x] =
-                            vecThreadInBoundsLoad && kInBounds ?
--                           vecBase[k] : (T) 0; // DistanceOp::kIdentityData;
-+                           vecBase[k] : ConvertTo<T>::to(0);
- 
-       __syncthreads();
- 
-@@ -278,7 +279,7 @@ void runGeneralDistance(GpuResources* resources,
-   if (centroids.numElements() == 0) {
-     thrust::fill(thrust::cuda::par.on(defaultStream),
-                  outDistances.data(), outDistances.end(),
--                 Limits<T>::getMax());
-+                 Limits<float>::getMax());
- 
-     thrust::fill(thrust::cuda::par.on(defaultStream),
-                  outIndices.data(), outIndices.end(),
-diff --git a/gpu/test/TestGpuIndexFlat.cpp b/gpu/test/TestGpuIndexFlat.cpp
-index 73cfe20..cabd7aa 100644
---- a/gpu/test/TestGpuIndexFlat.cpp
-+++ b/gpu/test/TestGpuIndexFlat.cpp
-@@ -277,7 +277,7 @@ TEST(TestGpuIndexFlat, CopyFrom) {
-   int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
- 
-   faiss::gpu::GpuIndexFlatConfig config;
--  config.device = 0;
-+  config.device = device;
-   config.useFloat16 = false;
-   config.storeTransposed = false;
- 
-diff --git a/gpu/utils/MathOperators.cuh b/gpu/utils/MathOperators.cuh
-index 68ccbd5..020d220 100644
---- a/gpu/utils/MathOperators.cuh
-+++ b/gpu/utils/MathOperators.cuh
-@@ -537,7 +537,7 @@ struct Math<Half8> {
-     return h;
-   }
- 
--  static inline __device__ half reduceAdd(Half8 v) {
-+  static inline __device__ float reduceAdd(Half8 v) {
-     float x = Math<Half4>::reduceAdd(v.a);
-     float y = Math<Half4>::reduceAdd(v.b);
-     return x + y;
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0005-CUDA-10-fixes.patch b/recipe/patches/0005-CUDA-10-fixes.patch
deleted file mode 100644
index 46b3d0dc..00000000
--- a/recipe/patches/0005-CUDA-10-fixes.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From d41dcff71a519f17d87a9648c75f8f0b7fe36399 Mon Sep 17 00:00:00 2001
-From: Jeff Johnson <jhj@fb.com>
-Date: Mon, 30 Mar 2020 03:01:56 +0000
-Subject: [PATCH 5/8] CUDA 10 fixes
-
----
- gpu/impl/PQCodeDistances-inl.cuh |  2 +-
- gpu/utils/DeviceDefs.cuh         |  2 ++
- gpu/utils/DeviceUtils.cu         | 14 +++++++++++++-
- 3 files changed, 16 insertions(+), 2 deletions(-)
-
-diff --git a/gpu/impl/PQCodeDistances-inl.cuh b/gpu/impl/PQCodeDistances-inl.cuh
-index c3ef87f..fcb9a02 100644
---- a/gpu/impl/PQCodeDistances-inl.cuh
-+++ b/gpu/impl/PQCodeDistances-inl.cuh
-@@ -27,7 +27,7 @@ template <typename OutCodeT,
-           int DimsPerSubQuantizer,
-           bool L2Distance>
- __global__ void
--__launch_bounds__(288, 4)
-+__launch_bounds__(288, 3)
- pqCodeDistances(Tensor<float, 2, true> queries,
-                 int queriesPerBlock,
-                 Tensor<CentroidT, 2, true> coarseCentroids,
-diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh
-index 4e859ec..89d3dda 100644
---- a/gpu/utils/DeviceDefs.cuh
-+++ b/gpu/utils/DeviceDefs.cuh
-@@ -8,6 +8,8 @@
- 
- #pragma once
- 
-+#include <cuda.h>
-+
- namespace faiss { namespace gpu {
- 
- #ifdef __CUDA_ARCH__
-diff --git a/gpu/utils/DeviceUtils.cu b/gpu/utils/DeviceUtils.cu
-index a8195c9..7bcc230 100644
---- a/gpu/utils/DeviceUtils.cu
-+++ b/gpu/utils/DeviceUtils.cu
-@@ -111,11 +111,23 @@ int getDeviceForAddress(const void* p) {
-     FAISS_ASSERT_FMT(err == cudaErrorInvalidValue,
-                      "unknown error %d", (int) err);
-     return -1;
--  } else if (att.memoryType == cudaMemoryTypeHost) {
-+  }
-+
-+  // memoryType is deprecated for CUDA 10.0+
-+#if CUDA_VERSION < 10000
-+  if (att.memoryType == cudaMemoryTypeHost) {
-     return -1;
-   } else {
-     return att.device;
-   }
-+#else
-+  // FIXME: what to use for managed memory?
-+  if (att.type == cudaMemoryTypeDevice) {
-+    return att.device;
-+  } else {
-+    return -1;
-+  }
-+#endif
- }
- 
- bool getFullUnifiedMemSupport(int device) {
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch b/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch
deleted file mode 100644
index 69fa6a8c..00000000
--- a/recipe/patches/0006-update-util-guard-for-ampere-backport-of-facebookres.patch
+++ /dev/null
@@ -1,26 +0,0 @@
-From c33f254e219c318e25497a8e399ccbe43bd28687 Mon Sep 17 00:00:00 2001
-From: "H. Vetinari" <h.vetinari@gmx.com>
-Date: Thu, 8 Oct 2020 12:45:09 +0200
-Subject: [PATCH 6/8] update util-guard for ampere; backport of
- facebookresearch/faiss#1380
-
----
- gpu/utils/DeviceDefs.cuh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh
-index 89d3dda..bc0f9b5 100644
---- a/gpu/utils/DeviceDefs.cuh
-+++ b/gpu/utils/DeviceDefs.cuh
-@@ -13,7 +13,7 @@
- namespace faiss { namespace gpu {
- 
- #ifdef __CUDA_ARCH__
--#if __CUDA_ARCH__ <= 750
-+#if __CUDA_ARCH__ <= 800
- constexpr int kWarpSize = 32;
- #else
- #error Unknown __CUDA_ARCH__; please define parameters for compute capability
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0007-use-c-14.patch b/recipe/patches/0007-use-c-14.patch
deleted file mode 100644
index 6d8baf31..00000000
--- a/recipe/patches/0007-use-c-14.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 5e576fb95233970295084e7d05b6818d7667af6a Mon Sep 17 00:00:00 2001
-From: "H. Vetinari" <h.vetinari@gmx.com>
-Date: Thu, 8 Oct 2020 18:06:20 +0200
-Subject: [PATCH 7/8] use c++14
-
----
- configure.ac | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/configure.ac b/configure.ac
-index 31b587b..5755fb6 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -16,7 +16,7 @@ AC_CONFIG_MACRO_DIR([acinclude])
- # Checks for programs.
- AC_LANG(C++)
- AC_PROG_CXX
--AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory])
-+AX_CXX_COMPILE_STDCXX([14], [noext], [mandatory])
- AC_PROG_CPP
- AC_PROG_MAKE_SET
- AC_PROG_MKDIR_P
--- 
-2.26.2.windows.1
-
diff --git a/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch b/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch
deleted file mode 100644
index e08c2726..00000000
--- a/recipe/patches/0008-backport-facebookresearch-faiss-1388.patch
+++ /dev/null
@@ -1,33 +0,0 @@
-From fb6103d0d1f044c8b27450d93f1d1d6f82c2c1b4 Mon Sep 17 00:00:00 2001
-From: "H. Vetinari" <h.vetinari@gmx.com>
-Date: Fri, 9 Oct 2020 18:24:57 +0200
-Subject: [PATCH 8/8] backport facebookresearch/faiss#1388
-
----
- gpu/StandardGpuResources.cpp | 6 +++++-
- 1 file changed, 5 insertions(+), 1 deletion(-)
-
-diff --git a/gpu/StandardGpuResources.cpp b/gpu/StandardGpuResources.cpp
-index e564f8e..257fc2a 100644
---- a/gpu/StandardGpuResources.cpp
-+++ b/gpu/StandardGpuResources.cpp
-@@ -249,11 +249,15 @@ StandardGpuResources::initializeForDevice(int device) {
-   blasHandles_[device] = blasHandle;
- 
-   // Enable tensor core support if available
--#if CUDA_VERSION >= 9000
-+#if CUDA_VERSION >= 9000 && CUDA_VERSION < 11000
-+  // This flag was deprecated in CUDA 11
-   if (getTensorCoreSupport(device)) {
-     cublasSetMathMode(blasHandle, CUBLAS_TENSOR_OP_MATH);
-   }
- #endif
-+#if CUDA_VERSION >= 11000
-+  cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
-+#endif
- 
-   FAISS_ASSERT(memory_.count(device) == 0);
- 
--- 
-2.26.2.windows.1
-