From b96b14e39c71fcd1b910b859e8351888954b678d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Bl=C3=A4ttermann?= Date: Tue, 17 Jan 2023 15:29:06 +0100 Subject: [PATCH 01/48] Fix brittleness during builds 91f4667a511e94e9ac5bd8cb317272cae2bd0517 introduced a hacky solution, that unfortunatly caused the build/dev server to fail, when `/website/src/fonts` doesn't exist. This removes the coupling of `fonts.sass` to other features while keeping it optional. --- website/gatsby-config.js | 7 +++++++ website/src/components/layout.js | 2 -- website/src/styles/base.sass | 3 +++ website/src/styles/fonts.sass | 1 - 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 905d4cc41..924872568 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -90,6 +90,13 @@ const plugins = [ `gatsby-plugin-offline`, ] +plugins.push({ + resolve: `gatsby-plugin-sass-resources`, + options: { + resources: ['./src/styles/base.sass'], + }, +}) + if (fs.existsSync('./src/fonts')) { plugins.push({ resolve: `gatsby-plugin-sass-resources`, diff --git a/website/src/components/layout.js b/website/src/components/layout.js index a5db53403..62dc8d775 100644 --- a/website/src/components/layout.js +++ b/website/src/components/layout.js @@ -2,8 +2,6 @@ import React from 'react' import SEO from './seo' -import '../styles/base.sass' - const Layout = ({ title, description, className, children }) => ( <> diff --git a/website/src/styles/base.sass b/website/src/styles/base.sass index 18f4794e8..182b769d4 100644 --- a/website/src/styles/base.sass +++ b/website/src/styles/base.sass @@ -1,3 +1,6 @@ + +@use 'sass:math' + \:root --font-primary: 'Calibre', 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol' --font-code: 'IBM Plex Mono', Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace diff --git a/website/src/styles/fonts.sass b/website/src/styles/fonts.sass index 0449e4396..e6591a57b 100644 --- a/website/src/styles/fonts.sass +++ b/website/src/styles/fonts.sass @@ -1,5 +1,4 @@ /* Fonts */ -@use 'sass:math' @font-face font-family: "Calibre" From ff0ca36a942874f9f246abfd35f3654e018ebcdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 23 Jan 2023 12:59:41 +0100 Subject: [PATCH 02/48] Update Dockerfile (#844) * Update Dockerfile for latest website changes * Update to Node 16. * Do not run as root, this also works better with Node privilege-dropping. * Update README with new run instructions. * Add .dockerignore to avoid sending large build contexts --- website/.dockerignore | 11 +++++++++++ website/Dockerfile | 18 ++++++++---------- website/README.md | 4 ++-- 3 files changed, 21 insertions(+), 12 deletions(-) create mode 100644 website/.dockerignore diff --git a/website/.dockerignore b/website/.dockerignore new file mode 100644 index 000000000..a12b116ce --- /dev/null +++ b/website/.dockerignore @@ -0,0 +1,11 @@ +# Avoid uploading large Docker contexts +.cache/ +public/ +node_modules +.npm +logs +*.log +npm-debug.log* +www/ +_deploy.sh +*.html diff --git a/website/Dockerfile b/website/Dockerfile index b1965b17a..64fb46bd7 100644 --- a/website/Dockerfile +++ b/website/Dockerfile @@ -1,16 +1,14 @@ -FROM node:11.15.0 +FROM node:16 -WORKDIR /thinc-ai - -RUN npm install -g gatsby-cli@2.7.4 - -COPY package.json . -COPY package-lock.json . 
- -RUN npm install +USER node # This is so the installed node_modules will be up one directory # from where a user mounts files, so that they don't accidentally mount # their own node_modules from a different build # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders -WORKDIR /thinc-ai/website/ +WORKDIR /home/node +COPY --chown=node package.json . +COPY --chown=node package-lock.json . +RUN npm install + +WORKDIR /home/node/website/ diff --git a/website/README.md b/website/README.md index 7b18ad0c2..aaedde806 100644 --- a/website/README.md +++ b/website/README.md @@ -28,10 +28,10 @@ Afterwards, the website can be built and run in the container: ```bash docker run --rm -it \ - -v $PWD:/thinc-ai/website \ + -v $PWD:/home/node/website \ -p 8000:8000 \ thinc-ai \ - gatsby develop -H 0.0.0.0 + npm run dev -- -H 0.0.0.0 ``` This is currently the only way to build the website on ARM64 Macs, since the From a6ce36bdbdd6b5a92adfbe8cb28c9deb427a05c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Bl=C3=A4ttermann?= Date: Fri, 27 Jan 2023 10:40:54 +0100 Subject: [PATCH 03/48] Update Explosion logo --- website/src/images/logos/explosion.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/src/images/logos/explosion.svg b/website/src/images/logos/explosion.svg index 119e6b86f..105d0ff0a 100644 --- a/website/src/images/logos/explosion.svg +++ b/website/src/images/logos/explosion.svg @@ -1,3 +1,3 @@ - - + + From c0b3828cbb386ab6b814fc61ca0e59f89f330943 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:26:25 +0100 Subject: [PATCH 04/48] Use black version constraints from requirements.txt for autoblack (#847) --- .github/workflows/autoblack.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 9a7f1e8f0..a483dadb1 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -16,7 +16,7 @@ jobs: with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v4 - - run: pip install black + - run: pip install black -c requirements.txt - name: Auto-format code if needed run: black thinc # We can't run black --check here because that returns a non-zero excit From 703f9fe98e886f36be48b531aff532472ed5fd10 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:33:54 +0100 Subject: [PATCH 05/48] Extend mypy to v1.0.x (#848) --- requirements.txt | 2 +- thinc/mypy.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1773ccfe4..e638e3da7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ pytest-cov>=2.7.0,<5.0.0 coverage>=5.0.0,<8.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -mypy>=0.990,<0.1000; python_version >= "3.7" +mypy>=1.0.0,<1.1.0; python_version >= "3.7" types-mock>=0.1.1 types-contextvars>=0.1.2; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7" diff --git a/thinc/mypy.py b/thinc/mypy.py index 99ce502c2..e3ae190ec 100644 --- a/thinc/mypy.py +++ b/thinc/mypy.py @@ -257,6 +257,7 @@ def __init__( tree: MypyFile, path: str, plugin: Plugin, + per_line_checking_time_ns: Dict[int, int], ): self._error_messages: List[str] = [] - super().__init__(errors, modules, options, tree, path, plugin) + super().__init__(errors, modules, options, tree, path, plugin, per_line_checking_time_ns) From 248c32b5aa317c5ea0e7f05253292f8007175d76 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" 
<41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 10 Feb 2023 15:27:39 +0100 Subject: [PATCH 06/48] Auto-format code with black (#849) Co-authored-by: explosion-bot --- thinc/mypy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/thinc/mypy.py b/thinc/mypy.py index e3ae190ec..e02f6d5be 100644 --- a/thinc/mypy.py +++ b/thinc/mypy.py @@ -260,4 +260,6 @@ def __init__( per_line_checking_time_ns: Dict[int, int], ): self._error_messages: List[str] = [] - super().__init__(errors, modules, options, tree, path, plugin, per_line_checking_time_ns) + super().__init__( + errors, modules, options, tree, path, plugin, per_line_checking_time_ns + ) From f81c6e46ecf165ac193c325b091e41cadf1cb692 Mon Sep 17 00:00:00 2001 From: Lucie Date: Wed, 15 Feb 2023 17:51:50 +0100 Subject: [PATCH 07/48] Convert Azure pipeline config to github action (#850) * Convert azure pipeline config to GHA * fix quotes * fix matrix + remove -e from install extras step * fix typo in python_version * fix typo in python_version * Change fail fast to false * Update .github/workflows/tests.yml Co-authored-by: Adriane Boyd * Update .github/workflows/tests.yml Co-authored-by: Adriane Boyd * update filter --------- Co-authored-by: Adriane Boyd --- .github/workflows/tests.yml | 123 ++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..190632bb5 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,123 @@ +name: tests + +on: + push: + paths-ignore: + - "website/**" + - "*.md" + pull_request: + types: [opened, synchronize, reopened, edited] + paths-ignore: + - "website/**" + - "*.md" + +jobs: + tests: + name: Test + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python_version: ["3.11"] + include: + - os: windows-2019 + python_version: "3.6" + - os: macos-latest + python_version: "3.7" + - os: ubuntu-latest + python_version: "3.8" + - os: windows-latest + python_version: "3.9" + - os: macos-latest + python_version: "3.10" + + runs-on: ${{ matrix.os }} + env: + NOTEBOOK_KERNEL: "thinc-notebook-tests" + + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + architecture: x64 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + pip install -r requirements.txt + + - name: Build sdist + run: | + python setup.py build_ext --inplace + python setup.py sdist --formats=gztar + + - name: Run mypy + run: python -m mypy thinc --no-implicit-reexport + if: matrix.python_version != '3.6' + + - name: Delete source directory + run: rm -rf thinc + shell: bash + + - name: Uninstall all packages + run: | + python -m pip freeze + pip freeze --exclude pywin32 > installed.txt + pip uninstall -y -r installed.txt + + - name: Install from sdist + run: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + PIP_CONSTRAINT="build-constraints.txt" pip install dist/$SDIST + shell: bash + + - name: Test import + run: python -c "import thinc" + + - name: Run tests without extras + run: | + pip install -r requirements.txt + pip install ipykernel pydot graphviz + python -m ipykernel install --name thinc-notebook-tests --user + python -m pytest --pyargs thinc --cov=thinc --cov-report=term + + # Notes on numpy requirements 
hacks: + # 1. torch does not have a direct numpy requirement but is compiled + # against a newer version than the oldest supported numpy for windows and + # python 3.10; this version of numpy would not work with + # tensorflow~=2.5.0 as specified above, but there is no release for + # python 3.10 anyway + # 2. restrict to numpy<1.24.0 due to mxnet incompatibility + # 3. keep restriction to torch<1.13.0 due to segfaults with numpy<1.24.0, + # which skips torch for python 3.11 + # Note: some of these pip install commands are known to fail for some platforms. + # To continue despite errors as in azure pipelines, remove -e from the default + # bash flags. + - name: Install extras for testing + run: | + pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" + pip install "mxnet; sys_platform != 'win32'" + pip install "torch<1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu + pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" + pip install "numpy<1.24.0" + pip install -r requirements.txt + pip uninstall -y mypy + shell: bash --noprofile --norc -o pipefail {0} + + - name: Run tests with extras + run: python -m pytest --pyargs thinc --cov=thinc --cov-report=term + + - name: Run tests for thinc-apple-ops + run: | + pip uninstall -y tensorflow + pip install thinc-apple-ops + python -m pytest --pyargs thinc_apple_ops + if: matrix.os == 'macos-latest' && matrix.python_version == '3.10' + + - name: Run tests with thinc-apple-ops + run: python -m pytest --pyargs thinc + if: matrix.os == 'macos-latest' && matrix.python_version == '3.10' From 997731a3e82b84396332993b3fd134b6cd09c8e7 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 23 Feb 2023 04:16:37 +0900 Subject: [PATCH 08/48] Make resizable layer work with textcat and transformers (#820) * Make resizable layer work with textcat and transformers * Restructure conditional This avoids setting nO if it doesn't need to be changed in the first place. 
* Add minimal tests for resizable layer * cleanup --------- Co-authored-by: svlandeg --- thinc/layers/resizable.py | 3 +++ thinc/tests/layers/test_resizable.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 thinc/tests/layers/test_resizable.py diff --git a/thinc/layers/resizable.py b/thinc/layers/resizable.py index 3454684d0..2dd4dde1a 100644 --- a/thinc/layers/resizable.py +++ b/thinc/layers/resizable.py @@ -59,6 +59,9 @@ def resize_linear_weighted( return layer elif new_nO == layer.get_dim("nO"): return layer + elif layer.has_dim("nI") is None: + layer.set_dim("nO", new_nO, force=True) + return layer dims = {name: layer.maybe_get_dim(name) for name in layer.dim_names} dims["nO"] = new_nO diff --git a/thinc/tests/layers/test_resizable.py b/thinc/tests/layers/test_resizable.py new file mode 100644 index 000000000..dfb6c67fd --- /dev/null +++ b/thinc/tests/layers/test_resizable.py @@ -0,0 +1,32 @@ +import pytest +from functools import partial +from thinc.api import resizable, Linear +from thinc.layers.resizable import resize_model, resize_linear_weighted + + +@pytest.fixture +def model(): + output_layer = Linear(nO=None, nI=None) + fill_defaults = {"b": 0, "W": 0} + model = resizable( + output_layer, + resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults), + ) + return model + + +def test_resizable_linear_default_name(model): + assert model.name == "resizable(linear)" + + +def test_resize_model(model): + """Test that resizing the model doesn't cause an exception.""" + resize_model(model, new_nO=10) + resize_model(model, new_nO=11) + + model.set_dim("nO", 0, force=True) + resize_model(model, new_nO=10) + + model.set_dim("nI", 10, force=True) + model.set_dim("nO", 0, force=True) + resize_model(model, new_nO=10) From 529fc82f3ddabea59106f3b65675004a09a7c1c8 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Tue, 28 Feb 2023 13:44:33 +0100 Subject: [PATCH 09/48] Premap: Mapping layer from ints to ints (#815) * layer to strictly map from ints to ints * layer to strictly map from ints to ints * mini rough speed test * premap imports * tests for remap_ids and premap_ids * import fix * Update api.py * test with Embed * test with Embed * add hashembed test * binding=True in preamble * change all to numpy assert_equal * turn functions to fixtures * np to numpy and remove binding decorator * remove preshmap as possible input type * add assert_equal * all tests with assert_equal * add context manager for timing * use context manager for timing * Update thinc/tests/layers/test_layers_api.py Co-authored-by: Adriane Boyd * remove time_context from util * black * revert changes to util * revert changes to util --------- Co-authored-by: Adriane Boyd --- examples/benchmarks/mappers.py | 143 ++++++++++++++++++++++++++ setup.py | 1 + thinc/api.py | 6 +- thinc/layers/__init__.py | 2 + thinc/layers/premap_ids.pyx | 78 ++++++++++++++ thinc/tests/layers/test_layers_api.py | 3 +- thinc/tests/layers/test_mappers.py | 61 +++++++++++ 7 files changed, 291 insertions(+), 3 deletions(-) create mode 100644 examples/benchmarks/mappers.py create mode 100644 thinc/layers/premap_ids.pyx create mode 100644 thinc/tests/layers/test_mappers.py diff --git a/examples/benchmarks/mappers.py b/examples/benchmarks/mappers.py new file mode 100644 index 000000000..dc3cf4b22 --- /dev/null +++ b/examples/benchmarks/mappers.py @@ -0,0 +1,143 @@ +from thinc.api import remap_ids_v2 +from thinc.api import premap_ids +from thinc.api import chain, Embed, HashEmbed +import time +import 
random +import numpy as np +import cupy as cp + + +N_symbols = 200000 +N_tokens = 50000 +N_batch = 500 +N_columns = 4 +N_dim = 300 +mapper = {} +numbers = list(range(N_symbols)) +random.shuffle(numbers) +for v, k in enumerate(numbers): + mapper[k] = v + + +class time_context: + """Register the running time of a context.""" + + def __enter__(self): + self.start = time.perf_counter() + return self + + def __exit__(self, type, value, traceback): + self.elapsed = time.perf_counter() - self.start + + +def speed_test_no_column(): + remap = remap_ids_v2(mapper) + premap = premap_ids(mapper) + keys = np.random.randint(0, N_symbols, N_tokens) + with time_context() as elapsed: + for i in range(100): + remap(keys, False) + remaptime = elapsed.elapsed + with time_context() as elapsed: + for i in range(100): + premap(keys, False) + premaptime = elapsed.elapsed + print("remap", remaptime) + print("premap", premaptime) + print("speedup", remaptime / premaptime) + + +def speed_test_column(): + remap = remap_ids_v2(mapper, column=3) + premap = premap_ids(mapper, column=3) + keys = np.random.randint(0, N_symbols, (N_tokens, N_columns)) + with time_context() as elapsed: + for i in range(100): + remap(keys, False) + remaptime = elapsed.elapsed + with time_context() as elapsed: + for i in range(100): + premap(keys, False) + premaptime = elapsed.elapsed + print("remap", remaptime) + print("premap", premaptime) + print("speedup", remaptime / premaptime) + + +def speed_test_cupy(): + remap = remap_ids_v2(mapper) + premap = premap_ids(mapper) + keys = cp.random.randint(0, N_symbols, N_tokens) + with time_context() as elapsed: + for i in range(100): + remap(keys, False) + remaptime = elapsed.elapsed + with time_context() as elapsed: + for i in range(100): + premap(keys, False) + premaptime = elapsed.elapsed + print("remap", remaptime) + print("premap", premaptime) + print("speedup", remaptime / premaptime) + + +def speed_test_with_embed(): + remap = chain(remap_ids_v2(mapper), Embed(N_dim, N_symbols)) + premap = chain(premap_ids(mapper), Embed(N_dim, N_symbols)) + remap.initialize() + premap.initialize() + keys = np.random.randint(0, N_symbols, N_tokens) + with time_context() as elapsed: + for i in range(100): + remap(keys, False) + remaptime = elapsed.elapsed + with time_context() as elapsed: + for i in range(100): + premap(keys, False) + premaptime = elapsed.elapsed + print("remap", remaptime) + print("premap", premaptime) + print("speedup", remaptime / premaptime) + + +def speed_test_cupy_with_embed(): + remap = chain(remap_ids_v2(mapper), Embed(N_dim, N_symbols)) + premap = chain(premap_ids(mapper), Embed(N_dim, N_symbols)) + remap.initialize() + premap.initialize() + keys = cp.random.randint(0, N_symbols, N_tokens) + with time_context() as elapsed: + for i in range(100): + remap(keys, False) + remaptime = elapsed.elapsed + with time_context() as elapsed: + for i in range(100): + premap(keys, False) + premaptime = elapsed.elapsed + print("remap", remaptime) + print("premap", premaptime) + print("speedup", remaptime / premaptime) + + +def speed_test_hashembed(): + embed = HashEmbed(N_dim, N_symbols) + embed.initialize() + keys = np.random.randint(0, N_symbols, N_tokens) + with time_context() as elapsed: + for i in range(100): + embed(keys, False) + print(elapsed.elapsed) + + +print("No columns") +speed_test_no_column() +print("Columns") +speed_test_column() +print("Cupy") +speed_test_cupy() +print("With Embed") +speed_test_with_embed() +print("Cupy With Embed") +speed_test_cupy_with_embed() +print("HashEmbed 
speed") +speed_test_hashembed() diff --git a/setup.py b/setup.py index 4c0085820..30962ed83 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ "thinc.backends.numpy_ops", "thinc.extra.search", "thinc.layers.sparselinear", + "thinc.layers.premap_ids" ] COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], diff --git a/thinc/api.py b/thinc/api.py index dbf9aac6f..203c501da 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -39,7 +39,8 @@ from .layers import with_array, with_array2d from .layers import with_padded, with_list, with_ragged, with_flatten from .layers import with_reshape, with_getitem, strings2arrays, list2array -from .layers import list2ragged, ragged2list, list2padded, padded2list, remap_ids +from .layers import list2ragged, ragged2list, list2padded, padded2list +from .layers import remap_ids, remap_ids_v2, premap_ids from .layers import array_getitem, with_cpu, with_debug, with_nvtx_range from .layers import with_signpost_interval from .layers import tuplify, with_flatten_v2 @@ -101,7 +102,8 @@ "with_array", "with_array2d", "with_padded", "with_list", "with_ragged", "with_flatten", "with_reshape", "with_getitem", "strings2arrays", "list2array", - "list2ragged", "ragged2list", "list2padded", "padded2list", "remap_ids", + "list2ragged", "ragged2list", "list2padded", "padded2list", + "remap_ids", "remap_ids_v2", "premap_ids", "array_getitem", "with_cpu", "with_debug", "with_nvtx_range", "with_signpost_interval", "tuplify", "with_flatten_v2", diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 2f5e299b5..4b73a2dce 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -61,6 +61,7 @@ from .ragged2list import ragged2list from .padded2list import padded2list from .remap_ids import remap_ids, remap_ids_v2 +from .premap_ids import premap_ids from .strings2arrays import strings2arrays from .with_array import with_array from .with_array2d import with_array2d @@ -141,6 +142,7 @@ "with_signpost_interval", "remap_ids", "remap_ids_v2", + "premap_ids", "softmax_activation", "Logistic", "Sigmoid", diff --git a/thinc/layers/premap_ids.pyx b/thinc/layers/premap_ids.pyx new file mode 100644 index 000000000..74bc8dc6a --- /dev/null +++ b/thinc/layers/premap_ids.pyx @@ -0,0 +1,78 @@ +# cython: binding=True, infer_types=True +import numpy +from preshed.maps cimport PreshMap +from typing import Dict, Union, Optional, cast, Callable, Tuple, Mapping +from ..types import Ints1d, Ints2d +from ..config import registry +from ..model import Model +from ..util import to_numpy + + +InT = Union[Ints1d, Ints2d] +OutT = Ints2d + + +cdef lookup(PreshMap mapping, long[:] keys, long default): + """ + Faster dict.get(keys, default) for the case when + the "dict" is a Dict[int, int] converted to PreshMap + and the "keys" is a numpy integer vector. 
+ """ + cdef int maxi = len(keys) + result = numpy.empty(maxi, dtype="int") + cdef long[:] result_view = result + for i in range(maxi): + v = mapping[keys[i]] + if v is None: + result_view[i] = default + else: + result_view[i] = v + return result + + +@registry.layers("premap_ids.v1") +def premap_ids( + mapping_table: Mapping[int, int], + default: int = 0, + *, + column: Optional[int] = None +): + """Remap integer inputs to integers a mapping table, usually as a + preprocess before embeddings.""" + mapper = PreshMap(initial_size=len(mapping_table)) + for k, v in mapping_table.items(): + if not (isinstance(k, int) and isinstance(v, int)): + raise ValueError( + "mapping_table has to be of type Mapping[int, int], " + f"but found {k}, {type(k)} and {v}, {type(v)}" + ) + mapper[k] = v + return Model( + "premap_ids", + forward, + attrs={ + "mapping_table": mapper, "default": default, "column": column + } + ) + + +def forward( + model: Model, inputs: InT, is_train: bool +) -> Tuple[OutT, Callable]: + table = model.attrs["mapping_table"] + default = model.attrs["default"] + column = model.attrs["column"] + # Have to convert to numpy anyways, because + # cupy ints don't work together with Python ints. + if column is None: + idx = to_numpy(inputs) + else: + idx = to_numpy(cast(Ints2d, inputs)[:, column]) + result = lookup(table, idx, default) + arr = model.ops.asarray2i(result) + output = model.ops.reshape2i(arr, -1, 1) + + def backprop(dY: OutT) -> InT: + return model.ops.xp.empty(dY.shape) # type: ignore + + return output, backprop diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 6ea64eb39..761cad880 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -131,7 +131,8 @@ def assert_data_match(Y, out_data): ("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("SparseLinear.v2", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint), - ("remap_ids.v2", {"mapping_table": {}, "column": 1}, numpy.array([[1, 2, 3], [4, 5, 6]]).T, array2dint) + ("remap_ids.v2", {"mapping_table": {}, "column": 1}, numpy.array([[1, 2, 3], [4, 5, 6]]).T, array2dint), + ("premap_ids.v1", {"mapping_table": {}, "column": 1}, numpy.array([[1, 2, 3], [4, 5, 6]]).T, array2dint), # fmt: on ] diff --git a/thinc/tests/layers/test_mappers.py b/thinc/tests/layers/test_mappers.py new file mode 100644 index 000000000..444de9a58 --- /dev/null +++ b/thinc/tests/layers/test_mappers.py @@ -0,0 +1,61 @@ +import pytest +import numpy +from thinc.layers import premap_ids, remap_ids, remap_ids_v2 + + +@pytest.fixture +def keys(): + return numpy.array([4, 2, 6, 1, 8, 7, 9, 3, 30]) + + +@pytest.fixture +def mapper(keys): + return {int(k): int(v) for v, k in enumerate(keys)} + + +def test_premap(keys, mapper): + premap = premap_ids(mapper, default=99) + values, _ = premap(keys, False) + numpy.testing.assert_equal( + values.squeeze(), numpy.asarray(range(len(keys))) + ) + + +def test_remap(keys, mapper): + remap = remap_ids(mapper, default=99) + values, _ = remap(keys, False) + numpy.testing.assert_equal( + values.squeeze(), numpy.asarray(range(len(keys))) + ) + + +def test_remap_v2(keys, mapper): + remap = remap_ids_v2(mapper, default=99) + values, _ = remap(keys, False) + numpy.testing.assert_equal( + values.squeeze(), numpy.asarray(range(len(keys))) + ) + + +def 
test_remap_premap_eq(keys, mapper): + remap = remap_ids(mapper, default=99) + remap_v2 = remap_ids_v2(mapper, default=99) + premap = premap_ids(mapper, default=99) + values1, _ = remap(keys, False) + values2, _ = remap_v2(keys, False) + values3, _ = premap(keys, False) + numpy.testing.assert_equal(values1, values2) + numpy.testing.assert_equal(values2, values3) + + +def test_column(keys, mapper): + idx = numpy.zeros((len(keys), 4), dtype="int") + idx[:, 3] = keys + remap_v2 = remap_ids_v2(mapper, column=3) + premap = premap_ids(mapper, column=3) + numpy.testing.assert_equal( + remap_v2(idx, False)[0].squeeze(), numpy.asarray(range(len(keys))) + ) + numpy.testing.assert_equal( + premap(idx, False)[0].squeeze(), numpy.asarray(range(len(keys))) + ) From 6d787c4ffdf718effc5c629b7efd5aafe855dde7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 1 Mar 2023 15:33:26 +0100 Subject: [PATCH 10/48] Set version to v8.1.8 (#855) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 22c03998d..f782b3757 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.7" +__version__ = "8.1.8" __release__ = True From 14cc42acb0abc0462b8ddf20156fb2da067d73be Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 1 Mar 2023 20:22:54 +0100 Subject: [PATCH 11/48] Skip test_lstm_forward_training_fuzz on aarch64 (#856) --- thinc/tests/backends/test_ops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index e46d14872..111c4c402 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -2,6 +2,7 @@ import pytest import numpy +import platform from hypothesis import given, settings from hypothesis.strategies import composite, integers from numpy.testing import assert_allclose @@ -1260,6 +1261,7 @@ def test_lstm_forward_training(ops, depth, dirs, nO, batch_size, nI): assert_allclose(Y, reference[0], atol=1e-4, rtol=1e-3) +@pytest.mark.skipif(platform.machine() == "aarch64", reason="Flaky, skip temporarily") @pytest.mark.parametrize("ops", XP_OPS) @settings(max_examples=MAX_EXAMPLES, deadline=None) @given(args=draw_lstm_args()) From 3d58cc87ad1ad2980e42e8323aeba24de85e9780 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 3 Mar 2023 11:41:20 +0100 Subject: [PATCH 12/48] Auto-format code with black (#857) Co-authored-by: explosion-bot --- thinc/tests/layers/test_mappers.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/thinc/tests/layers/test_mappers.py b/thinc/tests/layers/test_mappers.py index 444de9a58..e890dd086 100644 --- a/thinc/tests/layers/test_mappers.py +++ b/thinc/tests/layers/test_mappers.py @@ -16,25 +16,19 @@ def mapper(keys): def test_premap(keys, mapper): premap = premap_ids(mapper, default=99) values, _ = premap(keys, False) - numpy.testing.assert_equal( - values.squeeze(), numpy.asarray(range(len(keys))) - ) + numpy.testing.assert_equal(values.squeeze(), numpy.asarray(range(len(keys)))) def test_remap(keys, mapper): remap = remap_ids(mapper, default=99) values, _ = remap(keys, False) - numpy.testing.assert_equal( - values.squeeze(), numpy.asarray(range(len(keys))) - ) + numpy.testing.assert_equal(values.squeeze(), numpy.asarray(range(len(keys)))) def test_remap_v2(keys, mapper): remap = remap_ids_v2(mapper, default=99) values, _ = remap(keys, False) - numpy.testing.assert_equal( - values.squeeze(), 
numpy.asarray(range(len(keys))) - ) + numpy.testing.assert_equal(values.squeeze(), numpy.asarray(range(len(keys)))) def test_remap_premap_eq(keys, mapper): From 99530a356a8db3008d8fc46857c96f2311876d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 3 Mar 2023 16:59:59 +0100 Subject: [PATCH 13/48] Model.begin_update: fix type signature (#858) --- thinc/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/model.py b/thinc/model.py index 08366523e..e094d5294 100644 --- a/thinc/model.py +++ b/thinc/model.py @@ -299,7 +299,7 @@ def initialize(self, X: Optional[InT] = None, Y: Optional[OutT] = None) -> "Mode self.init(self, X=X, Y=Y) return self - def begin_update(self, X: InT) -> Tuple[OutT, Callable[[InT], OutT]]: + def begin_update(self, X: InT) -> Tuple[OutT, Callable[[OutT], InT]]: """Run the model over a batch of data, returning the output and a callback to complete the backward pass. A tuple (Y, finish_update), where Y is a batch of output data, and finish_update is a callback that From 18898f2d40a74766da4478231d6f519203f8514d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 7 Mar 2023 10:24:55 +0100 Subject: [PATCH 14/48] Set version to v8.1.9 (#859) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index f782b3757..4be69d41c 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.8" +__version__ = "8.1.9" __release__ = True From 388df1365f8d9b0005d9553ef48dcbf9855df6f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 16 Mar 2023 17:46:34 +0100 Subject: [PATCH 15/48] Avoid h2d - d2h roundtrip when using `unflatten` (#861) * Avoid h2d - d2h roundtrip when using `unflatten` `unflatten` converts its `lengths` argument to a NumPy array, because CuPy's `split` function requires lengths to be in CPU memory. However, in various places in Thinc, we copy the lengths array to GPU memory when CupyOps is used. This results in an unnecessary roundtrip of the lengths array (host to device -> device to host). One of these roundtrips (array `list2array`) showed up in profiles of the biaffine parser. This change fixes some length array allocations to avoid the round trip. 
* Add a comment to `with_ragged` to avoid confusion about memory allocation --- thinc/layers/concatenate.py | 7 ++++++- thinc/layers/list2array.py | 8 ++++++-- thinc/layers/with_array.py | 7 ++++++- thinc/layers/with_array2d.py | 6 +++++- thinc/layers/with_ragged.py | 18 +++++++++++++----- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/thinc/layers/concatenate.py b/thinc/layers/concatenate.py index 78e4c558b..4cce96954 100644 --- a/thinc/layers/concatenate.py +++ b/thinc/layers/concatenate.py @@ -1,5 +1,7 @@ from typing import Any, List, Tuple, Callable, Optional from typing import TypeVar, cast, Dict, Union, Sequence + +from ..backends import NumpyOps from ..model import Model from ..config import registry from ..types import Array2d, Ragged @@ -8,6 +10,9 @@ from ..types import XY_XY_OutT +NUMPY_OPS = NumpyOps() + + InT = TypeVar("InT", bound=Any) OutT = TypeVar("OutT", bound=Union[Array2d, Sequence[Array2d], Ragged]) @@ -120,7 +125,7 @@ def backprop(d_output: Sequence[Array2d]) -> InT: start += width return dX - lengths = model.ops.asarray1i([len(x) for x in X]) + lengths = NUMPY_OPS.asarray1i([len(x) for x in X]) Ys = [model.ops.xp.concatenate(Y, axis=0) for Y in Ys] widths = [Y.shape[1] for Y in Ys] out_array = model.ops.xp.hstack(Ys) diff --git a/thinc/layers/list2array.py b/thinc/layers/list2array.py index fff5befc0..a52d6e6c6 100644 --- a/thinc/layers/list2array.py +++ b/thinc/layers/list2array.py @@ -1,10 +1,14 @@ -from typing import Tuple, Callable, TypeVar, List, Union, cast +from typing import Tuple, Callable, TypeVar, List +from ..backends import NumpyOps from ..model import Model from ..config import registry from ..types import Array2d +NUMPY_OPS = NumpyOps() + + OutT = TypeVar("OutT", bound=Array2d) InT = List[OutT] @@ -19,7 +23,7 @@ def list2array() -> Model[InT, OutT]: def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]: - lengths = model.ops.asarray1i([len(x) for x in Xs]) + lengths = NUMPY_OPS.asarray1i([len(x) for x in Xs]) def backprop(dY: OutT) -> InT: return model.ops.unflatten(dY, lengths) diff --git a/thinc/layers/with_array.py b/thinc/layers/with_array.py index 3701fc8a3..2511b3c17 100644 --- a/thinc/layers/with_array.py +++ b/thinc/layers/with_array.py @@ -1,9 +1,14 @@ from typing import Tuple, Callable, Optional, TypeVar, Union, cast +from ..backends import NumpyOps from ..model import Model from ..config import registry from ..types import Padded, Ragged, ArrayXd, Array3d, ListXd + +NUMPY_OPS = NumpyOps() + + ArrayTXd = TypeVar("ArrayTXd", bound=ArrayXd) SeqT = TypeVar("SeqT", bound=Union[Padded, Ragged, ListXd, ArrayXd]) @@ -68,7 +73,7 @@ def _list_forward( ) -> Tuple[ListXd, Callable]: layer: Model[ArrayXd, ArrayXd] = model.layers[0] pad = model.attrs["pad"] - lengths = layer.ops.asarray1i([len(seq) for seq in Xs]) + lengths = NUMPY_OPS.asarray1i([len(seq) for seq in Xs]) Xf = layer.ops.flatten(Xs, pad=pad) Yf, get_dXf = layer(Xf, is_train) diff --git a/thinc/layers/with_array2d.py b/thinc/layers/with_array2d.py index 9f7de213c..740593a26 100644 --- a/thinc/layers/with_array2d.py +++ b/thinc/layers/with_array2d.py @@ -1,10 +1,14 @@ from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from ..backends import NumpyOps from ..model import Model from ..config import registry from ..types import Array2d, Floats2d, List2d, Padded, Ragged +NUMPY_OPS = NumpyOps() + + ValT = TypeVar("ValT", bound=Array2d) SeqT = TypeVar("SeqT", bound=Union[Padded, Ragged, List2d, Array2d]) @@ -71,7 +75,7 @@ def 
_list_forward( ) -> Tuple[List2d, Callable]: layer: Model[Array2d, Array2d] = model.layers[0] pad = model.attrs["pad"] - lengths = layer.ops.asarray1i([len(seq) for seq in Xs]) + lengths = NUMPY_OPS.asarray1i([len(seq) for seq in Xs]) Xf = layer.ops.flatten(Xs, pad=pad) Yf, get_dXf = layer(Xf, is_train) diff --git a/thinc/layers/with_ragged.py b/thinc/layers/with_ragged.py index 005c69048..cbff6f59d 100644 --- a/thinc/layers/with_ragged.py +++ b/thinc/layers/with_ragged.py @@ -1,9 +1,14 @@ from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from ..backends import NumpyOps from ..types import Padded, Ragged, Array2d, ListXd, List2d, Ints1d from ..model import Model from ..config import registry + +NUMPY_OPS = NumpyOps() + + RaggedData = Tuple[Array2d, Ints1d] SeqT = TypeVar("SeqT", bound=Union[Padded, Ragged, ListXd, RaggedData]) @@ -89,9 +94,12 @@ def _padded_forward( # sooner. Xs = padded2list(Xp) # Bit annoying here: padded is in a different order, so we need to make new - # lengths. - lengths = layer.ops.asarray1i([len(x) for x in Xs]) - Yr, get_dXr = layer(Ragged(flatten(Xs), lengths), is_train) + # lengths. The lengths are unconditionally allocated in CPU memory, because + # otherwire unflatten would move GPU allocations to the CPU again. For the + # ragged arrays we let the layer's ops determine how lengths should be + # stored to ensure that the array and lengths use the same type of memory. + lengths = NUMPY_OPS.asarray1i([len(x) for x in Xs]) + Yr, get_dXr = layer(Ragged(flatten(Xs), layer.ops.asarray1i(lengths)), is_train) def backprop(dYp: Padded): flattened = flatten(padded2list(dYp)) @@ -111,8 +119,8 @@ def _list_forward( flatten = layer.ops.flatten unflatten = layer.ops.unflatten - lengths = layer.ops.asarray1i([len(x) for x in Xs]) - Yr, get_dXr = layer(Ragged(flatten(Xs), lengths), is_train) + lengths = [len(x) for x in Xs] + Yr, get_dXr = layer(Ragged(flatten(Xs), layer.ops.asarray1i(lengths)), is_train) def backprop(dYs): flattened = flatten(dYs) From 38ac8285d37234496bb45f38eaaef96d0cc536b4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 21 Mar 2023 10:15:59 +0100 Subject: [PATCH 16/48] CI: Allow newest supported torch in extra tests (#866) --- .github/workflows/tests.yml | 5 ++--- azure-pipelines.yml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 190632bb5..b904c1d77 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -92,8 +92,7 @@ jobs: # tensorflow~=2.5.0 as specified above, but there is no release for # python 3.10 anyway # 2. restrict to numpy<1.24.0 due to mxnet incompatibility - # 3. keep restriction to torch<1.13.0 due to segfaults with numpy<1.24.0, - # which skips torch for python 3.11 + # 3. forbid torch!=1.13.0 due to segfaults with numpy<1.24.0 # Note: some of these pip install commands are known to fail for some platforms. # To continue despite errors as in azure pipelines, remove -e from the default # bash flags. 
@@ -101,7 +100,7 @@ jobs: run: | pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" pip install "mxnet; sys_platform != 'win32'" - pip install "torch<1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu + pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" pip install "numpy<1.24.0" pip install -r requirements.txt diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 17f3588d2..633916971 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -103,12 +103,11 @@ jobs: # tensorflow~=2.5.0 as specified above, but there is no release for # python 3.10 anyway # 2. restrict to numpy<1.24.0 due to mxnet incompatibility - # 3. keep restriction to torch<1.13.0 due to segfaults with numpy<1.24.0, - # which skips torch for python 3.11 + # 3. forbid torch!=1.13.0 due to segfaults with numpy<1.24.0 - script: | pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" pip install "mxnet; sys_platform != 'win32'" - pip install "torch<1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu + pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" pip install "numpy<1.24.0" pip install -r requirements.txt From 7cc8dcf40a5e740e95cf7c69c9c65f2939c713e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 24 Mar 2023 16:32:01 +0100 Subject: [PATCH 17/48] Improve exception when CuPy/PyTorch MPS is not installed (#863) * Improve exception when CuPy/PyTorch MPS is not installed Rather than raising a generic `No GPU devices can be detected` when CuPy or PyTorch with MPS isn't installed, but raise more specific errors. * Remove use of torch.has_mps() It's undocumented. 
--- thinc/compat.py | 8 +++----- thinc/util.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/thinc/compat.py b/thinc/compat.py index 11fcd85dc..54421e187 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -32,11 +32,8 @@ has_torch = True has_torch_cuda_gpu = torch.cuda.device_count() != 0 - has_torch_mps_gpu = ( - hasattr(torch, "has_mps") - and torch.has_mps # type: ignore[attr-defined] - and torch.backends.mps.is_available() # type: ignore[attr-defined] - ) + has_torch_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_built() + has_torch_mps_gpu = has_torch_mps and torch.backends.mps.is_available() has_torch_gpu = has_torch_cuda_gpu torch_version = Version(str(torch.__version__)) has_torch_amp = ( @@ -48,6 +45,7 @@ has_torch = False has_torch_cuda_gpu = False has_torch_gpu = False + has_torch_mps = False has_torch_mps_gpu = False has_torch_amp = False torch_version = Version("0.0.0") diff --git a/thinc/util.py b/thinc/util.py index b87ca4e5f..aabab9ecb 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,6 +1,7 @@ from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar from typing import List, Mapping, Tuple import numpy +import platform from packaging.version import Version import random import functools @@ -15,7 +16,7 @@ from dataclasses import dataclass from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu -from .compat import has_torch_mps_gpu +from .compat import has_torch_mps from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) @@ -191,7 +192,13 @@ def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover def require_gpu(gpu_id: int = 0) -> bool: # pragma: no cover from .backends import set_current_ops, CupyOps, MPSOps - if not has_gpu: + if platform.system() == "Darwin" and not has_torch_mps: + if has_torch: + raise ValueError("Cannot use GPU, installed PyTorch does not support MPS") + raise ValueError("Cannot use GPU, PyTorch is not installed") + elif platform.system() != "Darwin" and not has_cupy: + raise ValueError("Cannot use GPU, CuPy is not installed") + elif not has_gpu: raise ValueError("No GPU devices detected") if has_cupy_gpu: From 49a53465d92c94296070fc0b20f70fefcc5cbdf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 28 Mar 2023 20:27:12 +0200 Subject: [PATCH 18/48] Directly loading a TorchScript model onto an MPS device is not supported (#864) So instead, load on CPU first and then move to MPS. --- thinc/shims/torchscript.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/thinc/shims/torchscript.py b/thinc/shims/torchscript.py index 47db1fbf1..675718cd1 100644 --- a/thinc/shims/torchscript.py +++ b/thinc/shims/torchscript.py @@ -57,7 +57,10 @@ def from_bytes(self, bytes_data): self.cfg = msg["config"] filelike = BytesIO(msg["model"]) filelike.seek(0) - self._model = torch.jit.load(filelike, map_location=device) + # As of Torch 2.0.0, loading TorchScript models directly to + # an MPS device is not supported. 
+ map_location = torch.device("cpu") if device.type == "mps" else device + self._model = torch.jit.load(filelike, map_location=map_location) self._model.to(device) self._grad_scaler.to_(device) return self From 054bbdc32be61a98371aa207274602ee42a79318 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 5 Apr 2023 14:49:24 +0200 Subject: [PATCH 19/48] CI: Restrict to explosion (#871) --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b904c1d77..90ea34aa2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,6 +14,7 @@ on: jobs: tests: name: Test + if: github.repository_owner == 'explosion' strategy: fail-fast: false matrix: From 85cd22436c3ae7ea807d0f3652fbf21a2391cbc5 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 18 Apr 2023 16:20:45 +0200 Subject: [PATCH 20/48] Add `LazyKernel` for lazily-compiled custom `cupy` kernels (#870) * Add a wrapper around `cupy.RawKernel` that lazily compiles them on first invocation This prevents CuPy from allocating memory unnecessarily during module init. * Fix nullable-type in type hints * Expand/clarify docstring * Iniline murmur kernel path * Remove `_compiled` flag * Add test for compiling custom kernels --- thinc/backends/_custom_kernels.py | 150 ++++++++++++++++++------------ thinc/tests/backends/test_ops.py | 10 ++ 2 files changed, 99 insertions(+), 61 deletions(-) diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 859405495..0e8f1c641 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import Callable, Optional, Tuple import re from pathlib import Path from collections import defaultdict @@ -62,74 +62,102 @@ ) -def _get_kernel(name): - """A small wrapper around KERNELS.get_function that verifies first that - compiler kernels are available (cupy is installed).""" - if KERNELS is None: - return None - else: - return KERNELS.get_function(name) +class LazyKernel: + """Wraps around `cupy.RawModule` and `cupy.RawKernel` to verify CuPy availability + and lazily compile the latter on first invocation. 
+ + The default CuPy behaviour triggers the compilation as soon as the `cupy.RawKernel` object + is accessed.""" + + name: str + _kernel: Optional["cupy.RawKernel"] + _compile_callback: Optional[Callable[[], "cupy.RawKernel"]] + + __slots__ = ["name", "_kernel", "_compile_callback"] + + def __init__( + self, + name: str, + *, + compile_callback: Optional[Callable[[], "cupy.RawKernel"]] = None, + ) -> None: + self.name = name + self._kernel = None + self._compile_callback = compile_callback + + def __call__(self, *args, **kwargs): + self._compile_kernel() + self._kernel(*args, **kwargs) + + def _compile_kernel(self): + if self._kernel is not None: + return + + if self._compile_callback is not None: + self._kernel = self._compile_callback() + elif KERNELS is not None: + self._kernel = KERNELS.get_function(self.name) + + if self._kernel is None: + raise ValueError(f"couldn't compile Cupy kernel '{self.name}'") -def compile_mmh(src): +def compile_mmh(): if not has_cupy_gpu: return None - return cupy.RawKernel(src, "hash_data") - - -MMH_SRC = (PWD / "_murmur3.cu").read_text(encoding="utf8") - - -clipped_linear_kernel_float = _get_kernel("clipped_linear") -clipped_linear_kernel_double = _get_kernel("clipped_linear") -dish_kernel_float = _get_kernel("dish") -dish_kernel_double = _get_kernel("dish") -gather_add_kernel_float = _get_kernel("gather_add") -gather_add_kernel_double = _get_kernel("gather_add") -gelu_kernel_float = _get_kernel("gelu") -gelu_kernel_double = _get_kernel("gelu") -hash_data_kernel = compile_mmh(MMH_SRC) -maxout_kernel_float = _get_kernel("maxout") -maxout_kernel_double = _get_kernel("maxout") -mish_kernel_float = _get_kernel("mish") -mish_kernel_double = _get_kernel("mish") -reduce_max_kernel_float = _get_kernel("reduce_max") -reduce_max_kernel_double = _get_kernel("reduce_max") -reduce_sum_kernel_float = _get_kernel("reduce_sum") -reduce_sum_kernel_double = _get_kernel("reduce_sum") -seq2col_kernel_float = _get_kernel("seq2col") -seq2col_kernel_double = _get_kernel("seq2col") -swish_kernel_float = _get_kernel("swish") -swish_kernel_double = _get_kernel("swish") - -backprop_clipped_linear_kernel_double = _get_kernel("backprop_clipped_linear") -backprop_clipped_linear_kernel_float = _get_kernel("backprop_clipped_linear") -backprop_dish_kernel_double = _get_kernel("backprop_dish") -backprop_dish_kernel_float = _get_kernel("backprop_dish") -backprop_gelu_kernel_double = _get_kernel("backprop_gelu") -backprop_gelu_kernel_float = _get_kernel("backprop_gelu") -backprop_hard_swish_kernel_double = _get_kernel("backprop_hard_swish") -backprop_hard_swish_kernel_float = _get_kernel("backprop_hard_swish") -backprop_hard_swish_mobilenet_kernel_double = _get_kernel( + return cupy.RawKernel((PWD / "_murmur3.cu").read_text(encoding="utf8"), "hash_data") + + +clipped_linear_kernel_float = LazyKernel("clipped_linear") +clipped_linear_kernel_double = LazyKernel("clipped_linear") +dish_kernel_float = LazyKernel("dish") +dish_kernel_double = LazyKernel("dish") +gather_add_kernel_float = LazyKernel("gather_add") +gather_add_kernel_double = LazyKernel("gather_add") +gelu_kernel_float = LazyKernel("gelu") +gelu_kernel_double = LazyKernel("gelu") +hash_data_kernel = LazyKernel("hash_data", compile_callback=compile_mmh) +maxout_kernel_float = LazyKernel("maxout") +maxout_kernel_double = LazyKernel("maxout") +mish_kernel_float = LazyKernel("mish") +mish_kernel_double = LazyKernel("mish") +reduce_max_kernel_float = LazyKernel("reduce_max") +reduce_max_kernel_double = LazyKernel("reduce_max") 
+reduce_sum_kernel_float = LazyKernel("reduce_sum") +reduce_sum_kernel_double = LazyKernel("reduce_sum") +seq2col_kernel_float = LazyKernel("seq2col") +seq2col_kernel_double = LazyKernel("seq2col") +swish_kernel_float = LazyKernel("swish") +swish_kernel_double = LazyKernel("swish") + +backprop_clipped_linear_kernel_double = LazyKernel("backprop_clipped_linear") +backprop_clipped_linear_kernel_float = LazyKernel("backprop_clipped_linear") +backprop_dish_kernel_double = LazyKernel("backprop_dish") +backprop_dish_kernel_float = LazyKernel("backprop_dish") +backprop_gelu_kernel_double = LazyKernel("backprop_gelu") +backprop_gelu_kernel_float = LazyKernel("backprop_gelu") +backprop_hard_swish_kernel_double = LazyKernel("backprop_hard_swish") +backprop_hard_swish_kernel_float = LazyKernel("backprop_hard_swish") +backprop_hard_swish_mobilenet_kernel_double = LazyKernel( "backprop_hard_swish_mobilenet" ) -backprop_hard_swish_mobilenet_kernel_float = _get_kernel( +backprop_hard_swish_mobilenet_kernel_float = LazyKernel( "backprop_hard_swish_mobilenet" ) -backprop_maxout_kernel_double = _get_kernel("backprop_maxout") -backprop_maxout_kernel_float = _get_kernel("backprop_maxout") -backprop_mish_kernel_double = _get_kernel("backprop_mish") -backprop_mish_kernel_float = _get_kernel("backprop_mish") -backprop_reduce_max_kernel_double = _get_kernel("backprop_reduce_max") -backprop_reduce_max_kernel_float = _get_kernel("backprop_reduce_max") -backprop_reduce_mean_kernel_double = _get_kernel("backprop_reduce_mean") -backprop_reduce_mean_kernel_float = _get_kernel("backprop_reduce_mean") -backprop_reduce_sum_kernel_double = _get_kernel("backprop_reduce_sum") -backprop_reduce_sum_kernel_float = _get_kernel("backprop_reduce_sum") -backprop_seq2col_kernel_double = _get_kernel("backprop_seq2col") -backprop_seq2col_kernel_float = _get_kernel("backprop_seq2col") -backprop_swish_kernel_double = _get_kernel("backprop_swish") -backprop_swish_kernel_float = _get_kernel("backprop_swish") +backprop_maxout_kernel_double = LazyKernel("backprop_maxout") +backprop_maxout_kernel_float = LazyKernel("backprop_maxout") +backprop_mish_kernel_double = LazyKernel("backprop_mish") +backprop_mish_kernel_float = LazyKernel("backprop_mish") +backprop_reduce_max_kernel_double = LazyKernel("backprop_reduce_max") +backprop_reduce_max_kernel_float = LazyKernel("backprop_reduce_max") +backprop_reduce_mean_kernel_double = LazyKernel("backprop_reduce_mean") +backprop_reduce_mean_kernel_float = LazyKernel("backprop_reduce_mean") +backprop_reduce_sum_kernel_double = LazyKernel("backprop_reduce_sum") +backprop_reduce_sum_kernel_float = LazyKernel("backprop_reduce_sum") +backprop_seq2col_kernel_double = LazyKernel("backprop_seq2col") +backprop_seq2col_kernel_float = LazyKernel("backprop_seq2col") +backprop_swish_kernel_double = LazyKernel("backprop_swish") +backprop_swish_kernel_float = LazyKernel("backprop_swish") def _alloc(shape, dtype, *, zeros: bool = True): diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 111c4c402..0e1cac13b 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -14,6 +14,7 @@ from thinc.api import fix_random_seed from thinc.api import LSTM from thinc.types import Floats2d +from thinc.backends._custom_kernels import KERNELS_LIST, KERNELS, compile_mmh import inspect from .. 
import strategies @@ -1466,3 +1467,12 @@ def test_to_numpy_byteorder(ops, byte_order, x): assert y.dtype.newbyteorder("S").newbyteorder("S").byteorder == byte_order else: assert x.dtype.byteorder == y.dtype.byteorder + + +@pytest.mark.skipif(not has_cupy_gpu, reason="needs GPU/CuPy") +def test_custom_kernel_compilation(): + for kernel_name in KERNELS_LIST: + compiled_kernel = KERNELS.get_function(kernel_name) + assert compiled_kernel is not None + + assert compile_mmh() is not None From 12c03cc3d5437f97905a4537a21962b79eeb400a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 19 Apr 2023 18:40:33 +0200 Subject: [PATCH 21/48] Implement `pad` as a CUDA kernel (#860) * Implement `pad` as a CUDA kernel `Ops.pad` was a fairly slow operation on GPU. It iterates over all sequences and copies each sequence into the padded array. This results in a lot of kernel launches. In the biaffine parser, padding the inputs was more costly than applying the biaffine layers. This change optimizes the `pad` op using a custom CUDA kernel. The kernel get an array of pointers to the CuPy arrays that are provided as a list. The output array is then filled, parallelizing over the 'time steps'. This should provides the largest amount of parallelism, since we usually have n_steps * hidden_size to parallelize over. * Rename variables for clarification * Better validation of incorrect rounding * Simplify rounding using modular arithmetic, add test --- thinc/backends/_custom_kernels.cu | 19 +++++++ thinc/backends/_custom_kernels.py | 82 +++++++++++++++++++++++++++++++ thinc/backends/cupy_ops.py | 26 ++++++++++ thinc/backends/ops.py | 11 +++-- thinc/tests/backends/test_ops.py | 34 +++++++++++++ 5 files changed, 169 insertions(+), 3 deletions(-) diff --git a/thinc/backends/_custom_kernels.cu b/thinc/backends/_custom_kernels.cu index 9c9fece1e..b9cd0b4e1 100644 --- a/thinc/backends/_custom_kernels.cu +++ b/thinc/backends/_custom_kernels.cu @@ -121,6 +121,25 @@ __global__ void seq2col(T* output, const T* X, const int* lengths, } +template +__global__ void pad(T* out, T const **seqs, int const *lengths, int stride, int N, int L) +{ + int _loop_start = blockIdx.x * blockDim.x + threadIdx.x; + int _loop_stride = blockDim.x * gridDim.x; + + for (int i = _loop_start; i < L * stride; i += _loop_stride) { + for (int j = 0; j < N; ++j) { + T const *seq = seqs[j]; + if (i < lengths[j] * stride) { + out[j * L * stride + i] = seq[i]; + } else { + out[j * L * stride + i] = T(); + } + } + } +} + + template __global__ void maxout(T* best, int* which, const T* cands, int B, int O, int P) { diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 0e8f1c641..0b868e6d6 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -1,4 +1,7 @@ from typing import Callable, Optional, Tuple +from functools import reduce +import numpy +import operator import re from pathlib import Path from collections import defaultdict @@ -44,6 +47,10 @@ "maxout", "mish", "mish", + "pad", + "pad", + "pad", + "pad", "reduce_max", "reduce_max", "reduce_sum", @@ -121,6 +128,10 @@ def compile_mmh(): maxout_kernel_double = LazyKernel("maxout") mish_kernel_float = LazyKernel("mish") mish_kernel_double = LazyKernel("mish") +pad_kernel_float = LazyKernel("pad") +pad_kernel_double = LazyKernel("pad") +pad_kernel_int32 = LazyKernel("pad") +pad_kernel_int64 = LazyKernel("pad") reduce_max_kernel_float = LazyKernel("reduce_max") reduce_max_kernel_double = LazyKernel("reduce_max") 
reduce_sum_kernel_float = LazyKernel("reduce_sum") @@ -174,6 +185,65 @@ def _alloc_like(array, zeros: bool = True): return cupy.empty_like(array) +def pad(seqs, round_to=1, *, threads_per_block=128, num_blocks=128): + if round_to < 1: + raise ValueError(f"Rounding for padding must at least be 1, was: {round_to}") + for seq in seqs: + _is_float_or_int_array(seq) + + seq_lens = [len(seq) for seq in seqs] + max_seq_len = max(seq_lens) + # Round the length to nearest bucket -- helps on GPU, to make similar + # array sizes. + max_seq_len += -max_seq_len % round_to + seq_lens = cupy.array(seq_lens, dtype="int32") + final_shape = (len(seqs), max_seq_len) + seqs[0].shape[1:] + out = cupy.empty(final_shape, dtype=seqs[0].dtype) + + # Extract pointers from CuPy arrays, so that we can address + # them in the CUDA kernel. + ptrs = numpy.empty( + ( + len( + seqs, + ) + ), + "int64", + ) + for idx, seq in enumerate(seqs): + ptrs[idx] = seq.data.ptr + ptrs = cupy.array(ptrs) + + stride = reduce(operator.mul, seqs[0].shape[1:], 1) + + if out.dtype == "float32": + pad_kernel_float( + (num_blocks,), + (threads_per_block,), + (out, ptrs, seq_lens, stride, len(seqs), max_seq_len), + ) + elif out.dtype == "float64": + pad_kernel_double( + (num_blocks,), + (threads_per_block,), + (out, ptrs, seq_lens, stride, len(seqs), max_seq_len), + ) + elif out.dtype == "int32": + pad_kernel_int32( + (num_blocks,), + (threads_per_block,), + (out, ptrs, seq_lens, stride, len(seqs), max_seq_len), + ) + elif out.dtype == "int64": + pad_kernel_int64( + (num_blocks,), + (threads_per_block,), + (out, ptrs, seq_lens, stride, len(seqs), max_seq_len), + ) + + return out + + def clipped_linear( X, *, @@ -749,6 +819,18 @@ def _is_float_array(out, *, shape: Optional[Tuple] = None): raise ValueError(msg) +def _is_float_or_int_array(out, *, shape: Optional[Tuple] = None): + assert out.dtype in ( + "float32", + "float64", + "int32", + "int64", + ), "CUDA kernel can only handle float32, float64, int32 and int64" + if shape is not None and out.shape != shape: + msg = f"array has incorrect shape, expected: {shape}, was: {out.shape}" + raise ValueError(msg) + + def _check_lengths(lengths, n_elems: int, *, min_length=0): assert lengths.dtype == "int32", "lengths should be encoded as 32-bit integers" if not cupy.all(lengths >= min_length): diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 9d2c7a34e..506276380 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -94,6 +94,32 @@ def asarray(self, data, dtype=None): return array + def pad(self, seqs, round_to=1): + """Perform padding on a list of arrays so that they each have the same + length, by taking the maximum dimension across each axis. This only + works on non-empty sequences with the same `ndim` and `dtype`. + """ + # TODO: This should be generalized to handle different ranks + if not seqs: + raise ValueError("Cannot pad empty sequence") + if len(set(seq.ndim for seq in seqs)) != 1: + raise ValueError("Cannot pad sequences with different ndims") + if len(set(seq.dtype for seq in seqs)) != 1: + raise ValueError("Cannot pad sequences with different dtypes") + if len(set(seq.shape[1:] for seq in seqs)) != 1: + raise ValueError("Cannot pad sequences that differ on other dimensions") + + # Our CUDA kernel can currently only handle C contiguous arrays. 
+ if not all(seq.flags["C_CONTIGUOUS"] for seq in seqs) or seqs[0].dtype not in ( + "float32", + "float64", + "int32", + "int64", + ): + return super().pad(seqs, round_to) + + return _custom_kernels.pad(seqs, round_to) + def maxout(self, X): if X.dtype in ("float32", "float64"): return _custom_kernels.maxout(X) diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 3237cf78c..8bb770023 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -358,6 +358,11 @@ def pad( # noqa: F811 length, by taking the maximum dimension across each axis. This only works on non-empty sequences with the same `ndim` and `dtype`. """ + if round_to < 1: + raise ValueError( + f"Rounding for padding must at least be 1, was: {round_to}" + ) + # TODO: This should be generalized to handle different ranks if not seqs: raise ValueError("Cannot pad empty sequence") @@ -368,11 +373,11 @@ def pad( # noqa: F811 if len(set(seq.shape[1:] for seq in seqs)) != 1: raise ValueError("Cannot pad sequences that differ on other dimensions") # Find the maximum dimension along each axis. That's what we'll pad to. - length = max(len(seq) for seq in seqs) + max_seq_len = max(len(seq) for seq in seqs) # Round the length to nearest bucket -- helps on GPU, to make similar # array sizes. - length = (length + (round_to - 1)) // round_to * round_to - final_shape = (len(seqs), length) + seqs[0].shape[1:] + max_seq_len += -max_seq_len % round_to + final_shape = (len(seqs), max_seq_len) + seqs[0].shape[1:] output: Array3d = cast(Array3d, self.alloc(final_shape, dtype=seqs[0].dtype)) for i, arr in enumerate(seqs): # It's difficult to convince this that the dtypes will match. diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 0e1cac13b..83dd582ea 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -33,6 +33,7 @@ ALL_OPS = XP_OPS + [VANILLA_OPS] FLOAT_TYPES = ["float32", "float64"] +INT_TYPES = ["int32", "int64"] def create_pytorch_funcs(): @@ -791,6 +792,39 @@ def test_flatten_unflatten_roundtrip(cpu_ops, X): assert_allclose(X, unflat2) +@pytest.mark.parametrize("ops", ALL_OPS) +@pytest.mark.parametrize("dtype", FLOAT_TYPES + INT_TYPES) +def test_pad(ops, dtype): + X = [ops.xp.arange(1, 3, dtype=dtype), ops.xp.arange(1, 5, dtype=dtype)] + ops.xp.testing.assert_allclose(ops.pad(X), [[1, 2, 0, 0], [1, 2, 3, 4]]) + ops.xp.testing.assert_allclose( + ops.pad(X, round_to=8), [[1, 2, 0, 0, 0, 0, 0, 0], [1, 2, 3, 4, 0, 0, 0, 0]] + ) + + X = [ + ops.xp.arange(1, 5, dtype=dtype).reshape(2, 2), + ops.xp.arange(1, 9, dtype=dtype).reshape(4, 2), + ] + ops.xp.testing.assert_allclose( + ops.pad(X), + [ + [[1, 2], [3, 4], [0, 0], [0, 0]], + [[1, 2], [3, 4], [5, 6], [7, 8]], + ], + ) + + ops.xp.testing.assert_allclose( + ops.pad(X, round_to=5), + [ + [[1, 2], [3, 4], [0, 0], [0, 0], [0, 0]], + [[1, 2], [3, 4], [5, 6], [7, 8], [0, 0]], + ], + ) + + with pytest.raises(ValueError, match=r"Rounding for padding must at least be 1"): + ops.pad(X, round_to=0) + + @pytest.mark.parametrize("ops", ALL_OPS) @pytest.mark.parametrize("dtype", FLOAT_TYPES) def test_reduce_sum(ops, dtype): From 86f7282fdbb5020f45b7b85556d51f57c3415bd2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 21 Apr 2023 15:05:43 +0200 Subject: [PATCH 22/48] CI: Disable Azure (#872) --- azure-pipelines.yml | 131 -------------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 
index 633916971..000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,131 +0,0 @@ -trigger: - batch: true - branches: - include: - - '*' - paths: - exclude: - - 'website/*' - - '*.md' -pr: - paths: - exclude: - - 'website/*' - - '*.md' - -jobs: -- job: 'Test' - variables: - NOTEBOOK_KERNEL: "thinc-notebook-tests" - strategy: - matrix: - Python36Windows: - imageName: 'windows-2019' - python.version: '3.6' - Python37Mac: - imageName: 'macos-latest' - python.version: '3.7' - Python38Linux: - imageName: 'ubuntu-latest' - python.version: '3.8' - Python39Windows: - imageName: 'windows-latest' - python.version: '3.9' - Python310Mac: - imageName: 'macos-latest' - python.version: '3.10' - Python311Linux: - imageName: 'ubuntu-latest' - python.version: '3.11' - Python311Windows: - imageName: 'windows-latest' - python.version: '3.11' - Python311Mac: - imageName: 'macos-latest' - python.version: '3.11' - maxParallel: 4 - pool: - vmImage: $(imageName) - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - architecture: 'x64' - allowUnstable: true - - - script: | - python -m pip install --upgrade pip setuptools wheel - pip install -r requirements.txt - displayName: 'Install dependencies' - - - script: | - python setup.py build_ext --inplace - python setup.py sdist --formats=gztar - displayName: 'Build sdist' - - - script: | - python -m mypy thinc --no-implicit-reexport - displayName: 'Run mypy' - condition: ne(variables['python.version'], '3.6') - - - task: DeleteFiles@1 - inputs: - contents: 'thinc' - displayName: 'Delete source directory' - - - script: | - python -m pip freeze - pip freeze --exclude pywin32 > installed.txt - pip uninstall -y -r installed.txt - displayName: 'Uninstall all packages' - - - bash: | - SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - PIP_CONSTRAINT="build-constraints.txt" pip install dist/$SDIST - displayName: 'Install from sdist' - - - bash: | - python -c "import thinc" - displayName: 'Test import' - - - script: | - pip install -r requirements.txt - pip install ipykernel pydot graphviz - python -m ipykernel install --name thinc-notebook-tests --user - python -m pytest --pyargs thinc --cov=thinc --cov-report=term - displayName: 'Run tests without extras' - - # Notes on numpy requirements hacks: - # 1. torch does not have a direct numpy requirement but is compiled - # against a newer version than the oldest supported numpy for windows and - # python 3.10; this version of numpy would not work with - # tensorflow~=2.5.0 as specified above, but there is no release for - # python 3.10 anyway - # 2. restrict to numpy<1.24.0 due to mxnet incompatibility - # 3. 
forbid torch!=1.13.0 due to segfaults with numpy<1.24.0 - - script: | - pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" - pip install "mxnet; sys_platform != 'win32'" - pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu - pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" - pip install "numpy<1.24.0" - pip install -r requirements.txt - pip uninstall -y mypy - displayName: 'Install extras for testing' - - - script: | - python -m pytest --pyargs thinc --cov=thinc --cov-report=term - displayName: 'Run tests with extras' - - - script: | - pip uninstall -y tensorflow - pip install thinc-apple-ops - python -m pytest --pyargs thinc_apple_ops - displayName: 'Run tests for thinc-apple-ops' - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) - - - script: | - python -m pytest --pyargs thinc - displayName: 'Run tests with thinc-apple-ops' - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) From 7503392c8ae65a79083d1a2a7c9164f2aa48400a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 27 Apr 2023 09:58:32 +0200 Subject: [PATCH 23/48] Set version to v8.1.10 (#873) * Set version to v8.1.10 * Temporarily restrict hypothesis version due to incorrect numpy requirements --- requirements.txt | 2 +- thinc/about.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e638e3da7..522291e51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" contextvars>=2.4,<3; python_version < "3.7" # Development dependencies cython>=0.25.0,<3.0 -hypothesis>=3.27.0,<7.0.0 +hypothesis>=3.27.0,<6.72.2 pytest>=5.2.0,!=7.1.0 pytest-cov>=2.7.0,<5.0.0 coverage>=5.0.0,<8.0.0 diff --git a/thinc/about.py b/thinc/about.py index 4be69d41c..2504236b7 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.9" +__version__ = "8.1.10" __release__ = True From df5d60335ce658e7150ebffcc57f7f2c11cf2c0d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 30 May 2023 15:20:32 +0200 Subject: [PATCH 24/48] Run basic tests without warnings (#878) --- .github/workflows/tests.yml | 2 +- thinc/tests/backends/test_ops.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90ea34aa2..0a9687e92 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -84,7 +84,7 @@ jobs: pip install -r requirements.txt pip install ipykernel pydot graphviz python -m ipykernel install --name thinc-notebook-tests --user - python -m pytest --pyargs thinc --cov=thinc --cov-report=term + python -m pytest --pyargs thinc -Werror --cov=thinc --cov-report=term # Notes on numpy requirements hacks: # 1. 
torch does not have a direct numpy requirement but is compiled diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 83dd582ea..ba296e271 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -135,6 +135,7 @@ def test_ops_consistency(op): @pytest.mark.parametrize("ops", ALL_OPS) +@pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_adam_incorrect_inputs(ops): one = ops.xp.zeros(1, dtype="f") two = ops.xp.zeros(2, dtype="f") From e5811586a277fe5edbc62773a6e109dac22691d5 Mon Sep 17 00:00:00 2001 From: Ankush Chander Date: Thu, 1 Jun 2023 22:37:06 +0530 Subject: [PATCH 25/48] Fix typo in example code (#879) * Fix typo in example code * Update backprop101.md --- website/docs/backprop101.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/backprop101.md b/website/docs/backprop101.md index 8dad9314a..2b27f6710 100644 --- a/website/docs/backprop101.md +++ b/website/docs/backprop101.md @@ -284,7 +284,7 @@ def combine_by_addition(workers: List[Estimator]) -> Estimator: for worker in workers: result, callback = worker(inputs) summed += result - callbacks.append(worker) + callbacks.append(callback) def handle_feedback(re_summed: float) -> Array2d: re_input = callbacks[0](re_summed) @@ -323,7 +323,7 @@ def combine_by_average(workers: List[Estimator]) -> Estimator: for worker in workers: result, callback = worker(inputs) summed += result - callbacks.append(worker) + callbacks.append(callback) average = summed / len(workers) def handle_feedback(re_average: float) -> Array2d: From febb00f2a3490cb6b14b5defbcda4a54571b5f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 14 Jun 2023 17:48:58 +0200 Subject: [PATCH 26/48] Configure isort to use the Black profile, recursively isort the `thinc` module (#880) * Use isort with the Black profile * isort the thinc module * Fix import cycles as a result of import sorting * Add isort to requirements --- pyproject.toml | 3 + requirements.txt | 1 + thinc/__init__.py | 1 - thinc/api.py | 207 ++++++++++++++---- thinc/backends/__init__.py | 27 ++- thinc/backends/_cupy_allocators.py | 2 +- thinc/backends/_custom_kernels.py | 11 +- thinc/backends/_param_server.py | 3 +- thinc/backends/cblas.pxd | 1 - thinc/backends/cupy_ops.py | 21 +- thinc/backends/linalg.pxd | 5 +- thinc/backends/mps_ops.py | 4 +- thinc/backends/numpy_ops.pyx | 24 +- thinc/backends/ops.py | 55 ++++- thinc/compat.py | 4 +- thinc/config.py | 3 +- thinc/extra/search.pxd | 4 +- thinc/extra/search.pyx | 5 +- thinc/extra/tests/c_test_search.pyx | 3 +- thinc/initializers.py | 1 + thinc/layers/__init__.py | 94 ++++---- thinc/layers/add.py | 5 +- thinc/layers/array_getitem.py | 6 +- thinc/layers/bidirectional.py | 5 +- thinc/layers/cauchysimilarity.py | 5 +- thinc/layers/chain.py | 7 +- thinc/layers/clipped_linear.py | 10 +- thinc/layers/clone.py | 9 +- thinc/layers/concatenate.py | 20 +- thinc/layers/dish.py | 10 +- thinc/layers/dropout.py | 7 +- thinc/layers/embed.py | 11 +- thinc/layers/expand_window.py | 5 +- thinc/layers/gelu.py | 10 +- thinc/layers/hard_swish.py | 10 +- thinc/layers/hard_swish_mobilenet.py | 10 +- thinc/layers/hashembed.py | 11 +- thinc/layers/layernorm.py | 7 +- thinc/layers/linear.py | 7 +- thinc/layers/list2array.py | 5 +- thinc/layers/list2padded.py | 7 +- thinc/layers/list2ragged.py | 7 +- thinc/layers/logistic.py | 5 +- thinc/layers/lstm.py | 13 +- thinc/layers/map_list.py | 4 +- thinc/layers/maxout.py | 7 +- thinc/layers/mish.py | 9 +- 
thinc/layers/multisoftmax.py | 7 +- thinc/layers/mxnetwrapper.py | 7 +- thinc/layers/noop.py | 5 +- thinc/layers/padded2list.py | 7 +- thinc/layers/parametricattention.py | 5 +- thinc/layers/premap_ids.pyx | 8 +- thinc/layers/pytorchwrapper.py | 16 +- thinc/layers/ragged2list.py | 7 +- thinc/layers/reduce_first.py | 5 +- thinc/layers/reduce_last.py | 4 +- thinc/layers/reduce_max.py | 7 +- thinc/layers/reduce_mean.py | 7 +- thinc/layers/reduce_sum.py | 3 +- thinc/layers/relu.py | 11 +- thinc/layers/remap_ids.py | 8 +- thinc/layers/residual.py | 6 +- thinc/layers/resizable.py | 2 +- thinc/layers/siamese.py | 5 +- thinc/layers/sigmoid.py | 7 +- thinc/layers/sigmoid_activation.py | 4 +- thinc/layers/softmax.py | 9 +- thinc/layers/softmax_activation.py | 5 +- thinc/layers/sparselinear.pyx | 15 +- thinc/layers/strings2arrays.py | 6 +- thinc/layers/swish.py | 10 +- thinc/layers/tensorflowwrapper.py | 14 +- thinc/layers/torchscriptwrapper.py | 7 +- thinc/layers/tuplify.py | 4 +- thinc/layers/uniqued.py | 8 +- thinc/layers/with_array.py | 7 +- thinc/layers/with_array2d.py | 5 +- thinc/layers/with_cpu.py | 5 +- thinc/layers/with_debug.py | 2 +- thinc/layers/with_flatten.py | 4 +- thinc/layers/with_flatten_v2.py | 5 +- thinc/layers/with_getitem.py | 5 +- thinc/layers/with_list.py | 6 +- thinc/layers/with_nvtx_range.py | 3 +- thinc/layers/with_padded.py | 7 +- thinc/layers/with_ragged.py | 7 +- thinc/layers/with_reshape.py | 7 +- thinc/layers/with_signpost_interval.py | 3 +- thinc/loss.py | 17 +- thinc/model.py | 37 +++- thinc/mypy.py | 13 +- thinc/optimizers.py | 6 +- thinc/schedules.py | 1 + thinc/shims/__init__.py | 7 +- thinc/shims/mxnet.py | 14 +- thinc/shims/pytorch.py | 17 +- thinc/shims/shim.py | 6 +- thinc/shims/tensorflow.py | 9 +- thinc/shims/torchscript.py | 3 +- thinc/tests/backends/test_mem.py | 3 +- thinc/tests/backends/test_ops.py | 29 ++- thinc/tests/conftest.py | 5 +- thinc/tests/layers/test_basic_tagger.py | 17 +- thinc/tests/layers/test_combinators.py | 16 +- thinc/tests/layers/test_feed_forward.py | 8 +- thinc/tests/layers/test_hash_embed.py | 1 + thinc/tests/layers/test_layers_api.py | 13 +- thinc/tests/layers/test_linear.py | 7 +- thinc/tests/layers/test_lstm.py | 9 +- thinc/tests/layers/test_mappers.py | 3 +- thinc/tests/layers/test_mnist.py | 16 +- thinc/tests/layers/test_mxnet_wrapper.py | 15 +- thinc/tests/layers/test_pytorch_wrapper.py | 38 +++- thinc/tests/layers/test_reduce.py | 3 +- thinc/tests/layers/test_resizable.py | 8 +- thinc/tests/layers/test_shim.py | 2 + thinc/tests/layers/test_softmax.py | 2 +- thinc/tests/layers/test_sparse_linear.py | 4 +- thinc/tests/layers/test_tensorflow_wrapper.py | 16 +- thinc/tests/layers/test_torchscriptwrapper.py | 9 +- thinc/tests/layers/test_transforms.py | 3 +- thinc/tests/layers/test_uniqued.py | 9 +- thinc/tests/layers/test_with_debug.py | 3 +- thinc/tests/layers/test_with_flatten.py | 1 + thinc/tests/layers/test_with_transforms.py | 19 +- thinc/tests/model/test_model.py | 29 ++- thinc/tests/model/test_validation.py | 13 +- thinc/tests/mypy/modules/fail_no_plugin.py | 2 +- thinc/tests/mypy/modules/fail_plugin.py | 2 +- thinc/tests/mypy/modules/success_no_plugin.py | 2 +- thinc/tests/mypy/modules/success_plugin.py | 2 +- thinc/tests/mypy/test_mypy.py | 2 +- thinc/tests/regression/issue519/program.py | 2 +- thinc/tests/regression/test_issue208.py | 2 +- thinc/tests/shims/test_pytorch_grad_scaler.py | 4 +- thinc/tests/strategies.py | 5 +- thinc/tests/test_config.py | 17 +- thinc/tests/test_import__all__.py | 4 +- 
thinc/tests/test_indexing.py | 5 +- thinc/tests/test_initializers.py | 12 +- thinc/tests/test_loss.py | 11 +- thinc/tests/test_optimizers.py | 5 +- thinc/tests/test_schedules.py | 11 +- thinc/tests/test_serialize.py | 13 +- thinc/tests/test_types.py | 15 +- thinc/tests/test_util.py | 15 +- thinc/tests/util.py | 8 +- thinc/types.py | 31 ++- thinc/util.py | 59 +++-- 150 files changed, 1014 insertions(+), 612 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d3fb69b76..1ad6782d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,3 +9,6 @@ requires = [ "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" + +[tool.isort] +profile = "black" diff --git a/requirements.txt b/requirements.txt index 522291e51..6ae0c270c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,3 +36,4 @@ nbformat>=5.0.4,<5.2.0 # Test to_disk/from_disk against pathlib.Path subclasses pathy>=0.3.5 black>=22.0,<23.0 +isort>=5.0,<6.0 diff --git a/thinc/__init__.py b/thinc/__init__.py index dfa821c4f..8f4a8a5a5 100644 --- a/thinc/__init__.py +++ b/thinc/__init__.py @@ -4,7 +4,6 @@ from .about import __version__ from .config import registry - # fmt: off __all__ = [ "registry", diff --git a/thinc/api.py b/thinc/api.py index 203c501da..6f795237a 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -1,52 +1,163 @@ -from .config import Config, registry, ConfigValidationError -from .initializers import normal_init, uniform_init, glorot_uniform_init, zero_init -from .initializers import configure_normal_init -from .loss import CategoricalCrossentropy, L2Distance, CosineDistance -from .loss import SequenceCategoricalCrossentropy -from .model import Model, serialize_attr, deserialize_attr -from .model import set_dropout_rate, change_attr_values, wrap_model_recursive -from .shims import Shim, PyTorchGradScaler, PyTorchShim, TensorFlowShim, keras_model_fns -from .shims import MXNetShim, TorchScriptShim, maybe_handshake_model -from .optimizers import Adam, RAdam, SGD, Optimizer -from .schedules import cyclic_triangular, warmup_linear, constant, constant_then -from .schedules import decaying, slanted_triangular, compounding -from .types import Ragged, Padded, ArgsKwargs, Unserializable -from .util import fix_random_seed, is_cupy_array, set_active_gpu -from .util import prefer_gpu, require_gpu, require_cpu -from .util import DataValidationError, data_validation -from .util import to_categorical, get_width, get_array_module, to_numpy -from .util import torch2xp, xp2torch, tensorflow2xp, xp2tensorflow, mxnet2xp, xp2mxnet -from .util import get_torch_default_device +from .backends import ( + CupyOps, + MPSOps, + NumpyOps, + Ops, + get_current_ops, + get_ops, + set_current_ops, + set_gpu_allocator, + use_ops, + use_pytorch_for_gpu_memory, + use_tensorflow_for_gpu_memory, +) from .compat import has_cupy -from .backends import get_ops, set_current_ops, get_current_ops, use_ops -from .backends import Ops, CupyOps, MPSOps, NumpyOps, set_gpu_allocator -from .backends import use_pytorch_for_gpu_memory, use_tensorflow_for_gpu_memory - -from .layers import Dropout, Embed, expand_window, HashEmbed, LayerNorm, Linear -from .layers import Maxout, Mish, MultiSoftmax, Relu, softmax_activation, Softmax, LSTM -from .layers import CauchySimilarity, ParametricAttention, Logistic -from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear -from .layers import SparseLinear_v2, ClippedLinear, ReluK, HardTanh, HardSigmoid -from .layers import Dish, HardSwish, HardSwishMobilenet, Swish, Gelu -from .layers import 
PyTorchWrapper, PyTorchRNNWrapper, PyTorchLSTM -from .layers import TensorFlowWrapper, keras_subclass, MXNetWrapper -from .layers import PyTorchWrapper_v2, Softmax_v2, PyTorchWrapper_v3 -from .layers import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper - -from .layers import add, bidirectional, chain, clone, concatenate, noop -from .layers import residual, uniqued, siamese, list2ragged, ragged2list -from .layers import map_list -from .layers import with_array, with_array2d -from .layers import with_padded, with_list, with_ragged, with_flatten -from .layers import with_reshape, with_getitem, strings2arrays, list2array -from .layers import list2ragged, ragged2list, list2padded, padded2list -from .layers import remap_ids, remap_ids_v2, premap_ids -from .layers import array_getitem, with_cpu, with_debug, with_nvtx_range -from .layers import with_signpost_interval -from .layers import tuplify, with_flatten_v2 - -from .layers import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum - +from .config import Config, ConfigValidationError, registry +from .initializers import ( + configure_normal_init, + glorot_uniform_init, + normal_init, + uniform_init, + zero_init, +) +from .layers import ( + LSTM, + CauchySimilarity, + ClippedLinear, + Dish, + Dropout, + Embed, + Gelu, + HardSigmoid, + HardSwish, + HardSwishMobilenet, + HardTanh, + HashEmbed, + LayerNorm, + Linear, + Logistic, + Maxout, + Mish, + MultiSoftmax, + MXNetWrapper, + ParametricAttention, + PyTorchLSTM, + PyTorchRNNWrapper, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, + Relu, + ReluK, + Sigmoid, + Softmax, + Softmax_v2, + SparseLinear, + SparseLinear_v2, + Swish, + TensorFlowWrapper, + TorchScriptWrapper_v1, + add, + array_getitem, + bidirectional, + chain, + clone, + concatenate, + expand_window, + keras_subclass, + list2array, + list2padded, + list2ragged, + map_list, + noop, + padded2list, + premap_ids, + pytorch_to_torchscript_wrapper, + ragged2list, + reduce_first, + reduce_last, + reduce_max, + reduce_mean, + reduce_sum, + remap_ids, + remap_ids_v2, + residual, + resizable, + siamese, + sigmoid_activation, + softmax_activation, + strings2arrays, + tuplify, + uniqued, + with_array, + with_array2d, + with_cpu, + with_debug, + with_flatten, + with_flatten_v2, + with_getitem, + with_list, + with_nvtx_range, + with_padded, + with_ragged, + with_reshape, + with_signpost_interval, +) +from .loss import ( + CategoricalCrossentropy, + CosineDistance, + L2Distance, + SequenceCategoricalCrossentropy, +) +from .model import ( + Model, + change_attr_values, + deserialize_attr, + serialize_attr, + set_dropout_rate, + wrap_model_recursive, +) +from .optimizers import SGD, Adam, Optimizer, RAdam +from .schedules import ( + compounding, + constant, + constant_then, + cyclic_triangular, + decaying, + slanted_triangular, + warmup_linear, +) +from .shims import ( + MXNetShim, + PyTorchGradScaler, + PyTorchShim, + Shim, + TensorFlowShim, + TorchScriptShim, + keras_model_fns, + maybe_handshake_model, +) +from .types import ArgsKwargs, Padded, Ragged, Unserializable +from .util import ( + DataValidationError, + data_validation, + fix_random_seed, + get_array_module, + get_torch_default_device, + get_width, + is_cupy_array, + mxnet2xp, + prefer_gpu, + require_cpu, + require_gpu, + set_active_gpu, + tensorflow2xp, + to_categorical, + to_numpy, + torch2xp, + xp2mxnet, + xp2tensorflow, + xp2torch, +) # fmt: off __all__ = [ diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index c21620126..8973c8836 
100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -1,20 +1,23 @@ import contextlib -from typing import Type, Dict, Any, Callable, Optional, cast - -from contextvars import ContextVar import threading +from contextvars import ContextVar +from typing import Any, Callable, Dict, Optional, Type, cast -from .ops import Ops -from .cupy_ops import CupyOps -from .numpy_ops import NumpyOps -from .mps_ops import MPSOps -from ._cupy_allocators import cupy_tensorflow_allocator, cupy_pytorch_allocator -from ._param_server import ParamServer -from ..util import assert_tensorflow_installed, assert_pytorch_installed -from ..util import get_torch_default_device, is_cupy_array, require_cpu from .. import registry from ..compat import cupy, has_cupy - +from ..util import ( + assert_pytorch_installed, + assert_tensorflow_installed, + get_torch_default_device, + is_cupy_array, + require_cpu, +) +from ._cupy_allocators import cupy_pytorch_allocator, cupy_tensorflow_allocator +from ._param_server import ParamServer +from .cupy_ops import CupyOps +from .mps_ops import MPSOps +from .numpy_ops import NumpyOps +from .ops import Ops context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None) context_pools: ContextVar[dict] = ContextVar("context_pools", default={}) diff --git a/thinc/backends/_cupy_allocators.py b/thinc/backends/_cupy_allocators.py index f2b6faee9..77c958e36 100644 --- a/thinc/backends/_cupy_allocators.py +++ b/thinc/backends/_cupy_allocators.py @@ -1,8 +1,8 @@ from typing import cast +from ..compat import cupy, tensorflow, torch from ..types import ArrayXd from ..util import get_torch_default_device, tensorflow2xp -from ..compat import torch, cupy, tensorflow def cupy_tensorflow_allocator(size_in_bytes: int): diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 0b868e6d6..fa837017d 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -1,12 +1,13 @@ -from typing import Callable, Optional, Tuple -from functools import reduce -import numpy import operator import re -from pathlib import Path from collections import defaultdict -from ..compat import cupy, has_cupy_gpu +from functools import reduce +from pathlib import Path +from typing import Callable, Optional, Tuple +import numpy + +from ..compat import cupy, has_cupy_gpu PWD = Path(__file__).parent KERNELS_SRC = (PWD / "_custom_kernels.cu").read_text(encoding="utf8") diff --git a/thinc/backends/_param_server.py b/thinc/backends/_param_server.py index 4ce374a4e..db7b5a505 100644 --- a/thinc/backends/_param_server.py +++ b/thinc/backends/_param_server.py @@ -1,9 +1,8 @@ -from typing import Dict, Tuple, Optional, Any +from typing import Any, Dict, Optional, Tuple from ..types import FloatsXd from ..util import get_array_module - KeyT = Tuple[int, str] diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index 15837e5e7..73cea1f2d 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -1,6 +1,5 @@ from libcpp.memory cimport shared_ptr - ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K, float alpha, const float* A, int lda, const float *B, int ldb, float beta, float* C, int ldc) nogil diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 506276380..366faf70a 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -1,13 +1,20 @@ import numpy + from .. import registry -from .ops import Ops -from .numpy_ops import NumpyOps -from . 
import _custom_kernels -from ..types import DeviceTypes -from ..util import torch2xp, tensorflow2xp, mxnet2xp -from ..util import is_cupy_array -from ..util import is_torch_cuda_array, is_tensorflow_gpu_array, is_mxnet_gpu_array from ..compat import cupy, cupyx +from ..types import DeviceTypes +from ..util import ( + is_cupy_array, + is_mxnet_gpu_array, + is_tensorflow_gpu_array, + is_torch_cuda_array, + mxnet2xp, + tensorflow2xp, + torch2xp, +) +from . import _custom_kernels +from .numpy_ops import NumpyOps +from .ops import Ops @registry.ops("CupyOps") diff --git a/thinc/backends/linalg.pxd b/thinc/backends/linalg.pxd index 494a26c30..37fb9ea2b 100644 --- a/thinc/backends/linalg.pxd +++ b/thinc/backends/linalg.pxd @@ -2,10 +2,9 @@ # cython: cdivision=True cimport cython -from libc.stdint cimport int32_t -from libc.string cimport memset, memcpy from cymem.cymem cimport Pool - +from libc.stdint cimport int32_t +from libc.string cimport memcpy, memset ctypedef float weight_t diff --git a/thinc/backends/mps_ops.py b/thinc/backends/mps_ops.py index 8ebbd4e4b..c6ba71f11 100644 --- a/thinc/backends/mps_ops.py +++ b/thinc/backends/mps_ops.py @@ -1,8 +1,10 @@ from typing import TYPE_CHECKING + import numpy from .. import registry -from . import NumpyOps, Ops +from .numpy_ops import NumpyOps +from .ops import Ops if TYPE_CHECKING: # Type checking does not work with dynamic base classes, since MyPy cannot diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index c980e6c5d..f64aa29dd 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -1,27 +1,29 @@ # cython: cdivision=True # cython: infer_types=True # cython: profile=True -from typing import Optional from collections.abc import Sized +from typing import Optional + import numpy +cimport blis.cy cimport cython -from libc.string cimport memcpy, memset -from libc.stdlib cimport calloc, malloc, free -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memcpy -from libc.math cimport isnan +cimport numpy as np from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap +from libc.math cimport isnan +from libc.stdint cimport uint32_t, uint64_t +from libc.stdlib cimport calloc, free, malloc +from libc.string cimport memcpy, memset from murmurhash.mrmr cimport hash64 -cimport numpy as np -cimport blis.cy +from preshed.maps cimport PreshMap from .. 
import registry +from ..types import ArrayXd, DeviceTypes, DTypes, Shape from ..util import copy_array, get_array_module -from ..types import DeviceTypes, DTypes, Shape, ArrayXd + from .cblas cimport CBlas, daxpy, saxpy -from .linalg cimport VecVec, Vec +from .linalg cimport Vec, VecVec + from .ops import Ops try: diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 8bb770023..01bb2f852 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -1,18 +1,53 @@ +import itertools import math +from typing import ( + Any, + Iterator, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, + overload, +) -from typing import Optional, List, Tuple, Sequence, Type, Union, cast, TypeVar -from typing import Iterator, overload, Any import numpy -import itertools -from ..types import Xp, Shape, DTypes, DTypesInt, DTypesFloat, List2d, ArrayXd -from ..types import Floats1d, Floats2d, Floats3d, Floats4d -from ..types import Array1d, Array2d, Array3d, Array4d, ListXd -from ..types import FloatsXd, Ints1d, Ints2d, Ints3d, Ints4d, IntsXd, _Floats -from ..types import FloatsXdT -from ..types import DeviceTypes, Generator, Padded, Batchable, SizedGenerator +from ..types import ( + Array1d, + Array2d, + Array3d, + Array4d, + ArrayXd, + Batchable, + DeviceTypes, + DTypes, + DTypesFloat, + DTypesInt, + Floats1d, + Floats2d, + Floats3d, + Floats4d, + FloatsXd, + FloatsXdT, + Generator, + Ints1d, + Ints2d, + Ints3d, + Ints4d, + IntsXd, + List2d, + ListXd, + Padded, + Shape, + SizedGenerator, + Xp, + _Floats, +) from ..util import get_array_module, is_xp_array, to_numpy - from .cblas import CBlas ArrayT = TypeVar("ArrayT", bound=ArrayXd) diff --git a/thinc/compat.py b/thinc/compat.py index 54421e187..52a73669f 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -27,8 +27,8 @@ try: # pragma: no cover - import torch.utils.dlpack import torch + import torch.utils.dlpack has_torch = True has_torch_cuda_gpu = torch.cuda.device_count() != 0 @@ -51,8 +51,8 @@ torch_version = Version("0.0.0") try: # pragma: no cover - import tensorflow.experimental.dlpack import tensorflow + import tensorflow.experimental.dlpack has_tensorflow = True has_tensorflow_gpu = len(tensorflow.config.get_visible_devices("GPU")) > 0 diff --git a/thinc/config.py b/thinc/config.py index e5452819b..434c96085 100644 --- a/thinc/config.py +++ b/thinc/config.py @@ -1,6 +1,7 @@ import catalogue import confection -from confection import Config, ConfigValidationError, Promise, VARIABLE_RE +from confection import VARIABLE_RE, Config, ConfigValidationError, Promise + from .types import Decorator diff --git a/thinc/extra/search.pxd b/thinc/extra/search.pxd index daccbf58e..a27ba0525 100644 --- a/thinc/extra/search.pxd +++ b/thinc/extra/search.pxd @@ -1,7 +1,5 @@ from cymem.cymem cimport Pool - -from libc.stdint cimport uint32_t -from libc.stdint cimport uint64_t +from libc.stdint cimport uint32_t, uint64_t from libcpp.pair cimport pair from libcpp.queue cimport priority_queue from libcpp.vector cimport vector diff --git a/thinc/extra/search.pyx b/thinc/extra/search.pyx index d69756551..71cc85d8b 100644 --- a/thinc/extra/search.pyx +++ b/thinc/extra/search.pyx @@ -1,7 +1,8 @@ # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True cimport cython -from libc.string cimport memset, memcpy -from libc.math cimport log, exp +from libc.math cimport exp, log +from libc.string cimport memcpy, memset + import math from cymem.cymem cimport Pool diff --git a/thinc/extra/tests/c_test_search.pyx 
b/thinc/extra/tests/c_test_search.pyx index a727d3364..70cdf5745 100644 --- a/thinc/extra/tests/c_test_search.pyx +++ b/thinc/extra/tests/c_test_search.pyx @@ -1,5 +1,6 @@ -from thinc.extra.search cimport Beam from cymem.cymem cimport Pool + +from thinc.extra.search cimport Beam from thinc.typedefs cimport class_t, weight_t diff --git a/thinc/initializers.py b/thinc/initializers.py index 1333911a3..feb02889d 100644 --- a/thinc/initializers.py +++ b/thinc/initializers.py @@ -1,4 +1,5 @@ from typing import Callable, cast + import numpy from .backends import Ops diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 4b73a2dce..032af5fde 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -1,48 +1,48 @@ # Weights layers +# Combinators +from .add import add + +# Array manipulation +from .array_getitem import array_getitem +from .bidirectional import bidirectional from .cauchysimilarity import CauchySimilarity +from .chain import chain +from .clipped_linear import ClippedLinear, HardSigmoid, HardTanh, ReluK +from .clone import clone +from .concatenate import concatenate from .dish import Dish from .dropout import Dropout from .embed import Embed from .expand_window import expand_window +from .gelu import Gelu +from .hard_swish import HardSwish +from .hard_swish_mobilenet import HardSwishMobilenet from .hashembed import HashEmbed from .layernorm import LayerNorm from .linear import Linear -from .lstm import LSTM, PyTorchLSTM + +# Data-type transfers +from .list2array import list2array +from .list2padded import list2padded +from .list2ragged import list2ragged from .logistic import Logistic +from .lstm import LSTM, PyTorchLSTM +from .map_list import map_list from .maxout import Maxout from .mish import Mish from .multisoftmax import MultiSoftmax -from .parametricattention import ParametricAttention -from .pytorchwrapper import PyTorchWrapper, PyTorchWrapper_v2, PyTorchWrapper_v3 -from .pytorchwrapper import PyTorchRNNWrapper -from .relu import Relu -from .clipped_linear import ClippedLinear, ReluK, HardSigmoid, HardTanh -from .hard_swish import HardSwish -from .hard_swish_mobilenet import HardSwishMobilenet -from .swish import Swish -from .gelu import Gelu -from .resizable import resizable -from .sigmoid_activation import sigmoid_activation -from .sigmoid import Sigmoid -from .softmax_activation import softmax_activation -from .softmax import Softmax, Softmax_v2 -from .sparselinear import SparseLinear, SparseLinear_v2 -from .tensorflowwrapper import TensorFlowWrapper, keras_subclass -from .torchscriptwrapper import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper from .mxnetwrapper import MXNetWrapper - -# Combinators -from .add import add -from .bidirectional import bidirectional -from .chain import chain -from .clone import clone -from .concatenate import concatenate -from .map_list import map_list from .noop import noop -from .residual import residual -from .uniqued import uniqued -from .siamese import siamese -from .tuplify import tuplify +from .padded2list import padded2list +from .parametricattention import ParametricAttention +from .premap_ids import premap_ids +from .pytorchwrapper import ( + PyTorchRNNWrapper, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, +) +from .ragged2list import ragged2list # Pooling from .reduce_first import reduce_first @@ -50,34 +50,36 @@ from .reduce_max import reduce_max from .reduce_mean import reduce_mean from .reduce_sum import reduce_sum - -# Array manipulation -from .array_getitem import array_getitem - 
-# Data-type transfers -from .list2array import list2array -from .list2ragged import list2ragged -from .list2padded import list2padded -from .ragged2list import ragged2list -from .padded2list import padded2list +from .relu import Relu from .remap_ids import remap_ids, remap_ids_v2 -from .premap_ids import premap_ids +from .residual import residual +from .resizable import resizable +from .siamese import siamese +from .sigmoid import Sigmoid +from .sigmoid_activation import sigmoid_activation +from .softmax import Softmax, Softmax_v2 +from .softmax_activation import softmax_activation +from .sparselinear import SparseLinear, SparseLinear_v2 from .strings2arrays import strings2arrays +from .swish import Swish +from .tensorflowwrapper import TensorFlowWrapper, keras_subclass +from .torchscriptwrapper import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper +from .tuplify import tuplify +from .uniqued import uniqued from .with_array import with_array from .with_array2d import with_array2d from .with_cpu import with_cpu +from .with_debug import with_debug from .with_flatten import with_flatten from .with_flatten_v2 import with_flatten_v2 -from .with_padded import with_padded +from .with_getitem import with_getitem from .with_list import with_list +from .with_nvtx_range import with_nvtx_range +from .with_padded import with_padded from .with_ragged import with_ragged from .with_reshape import with_reshape -from .with_getitem import with_getitem -from .with_debug import with_debug -from .with_nvtx_range import with_nvtx_range from .with_signpost_interval import with_signpost_interval - # fmt: off __all__ = [ "CauchySimilarity", diff --git a/thinc/layers/add.py b/thinc/layers/add.py index 60b1f46b9..a3aa1af17 100644 --- a/thinc/layers/add.py +++ b/thinc/layers/add.py @@ -1,11 +1,10 @@ -from typing import Any, Tuple, Callable, Optional, TypeVar, Dict +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry +from ..model import Model from ..types import ArrayXd, XY_XY_OutT from ..util import get_width - InT = TypeVar("InT", bound=Any) OutT = TypeVar("OutT", bound=ArrayXd) diff --git a/thinc/layers/array_getitem.py b/thinc/layers/array_getitem.py index 17ffcb7ee..219b4ea1c 100644 --- a/thinc/layers/array_getitem.py +++ b/thinc/layers/array_getitem.py @@ -1,7 +1,7 @@ -from typing import Union, Sequence, Tuple, TypeVar -from ..types import ArrayXd, FloatsXd, IntsXd -from ..model import Model +from typing import Sequence, Tuple, TypeVar, Union +from ..model import Model +from ..types import ArrayXd, FloatsXd, IntsXd AxisIndex = Union[int, slice, Sequence[int]] Index = Union[AxisIndex, Tuple[AxisIndex, ...]] diff --git a/thinc/layers/bidirectional.py b/thinc/layers/bidirectional.py index 1ff73f013..8cea04e30 100644 --- a/thinc/layers/bidirectional.py +++ b/thinc/layers/bidirectional.py @@ -1,11 +1,10 @@ -from typing import Optional, Tuple, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..backends import Ops -from ..model import Model from ..config import registry +from ..model import Model from ..types import Padded - InT = Padded OutT = Padded diff --git a/thinc/layers/cauchysimilarity.py b/thinc/layers/cauchysimilarity.py index 25af8d9df..57e5932ec 100644 --- a/thinc/layers/cauchysimilarity.py +++ b/thinc/layers/cauchysimilarity.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry 
+from ..model import Model from ..types import Floats1d, Floats2d from ..util import get_width - InT = Tuple[Floats2d, Floats2d] OutT = Floats1d diff --git a/thinc/layers/chain.py b/thinc/layers/chain.py index 258ee0902..a7e3ee7da 100644 --- a/thinc/layers/chain.py +++ b/thinc/layers/chain.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, Any, Dict, List, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..util import get_width +from ..model import Model from ..types import XY_YZ_OutT - +from ..util import get_width InT = TypeVar("InT") MidT = TypeVar("MidT") diff --git a/thinc/layers/clipped_linear.py b/thinc/layers/clipped_linear.py index 34bb8ade8..efe295fa6 100644 --- a/thinc/layers/clipped_linear.py +++ b/thinc/layers/clipped_linear.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import glorot_uniform_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import glorot_uniform_init, zero_init +from .layernorm import LayerNorm @registry.layers("ClippedLinear.v1") diff --git a/thinc/layers/clone.py b/thinc/layers/clone.py index 8b433407d..1758f5fe7 100644 --- a/thinc/layers/clone.py +++ b/thinc/layers/clone.py @@ -1,10 +1,9 @@ -from typing import TypeVar, cast, List +from typing import List, TypeVar, cast -from .noop import noop -from .chain import chain -from ..model import Model from ..config import registry - +from ..model import Model +from .chain import chain +from .noop import noop InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/concatenate.py b/thinc/layers/concatenate.py index 4cce96954..e810cefc3 100644 --- a/thinc/layers/concatenate.py +++ b/thinc/layers/concatenate.py @@ -1,14 +1,22 @@ -from typing import Any, List, Tuple, Callable, Optional -from typing import TypeVar, cast, Dict, Union, Sequence +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) from ..backends import NumpyOps -from ..model import Model from ..config import registry -from ..types import Array2d, Ragged +from ..model import Model +from ..types import Array2d, Ragged, XY_XY_OutT from ..util import get_width from .noop import noop -from ..types import XY_XY_OutT - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/dish.py b/thinc/layers/dish.py index 1092638e7..dc871ad24 100644 --- a/thinc/layers/dish.py +++ b/thinc/layers/dish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Dish.v1") diff --git a/thinc/layers/dropout.py b/thinc/layers/dropout.py index f4fa29445..7db35261a 100644 --- 
a/thinc/layers/dropout.py +++ b/thinc/layers/dropout.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, List, TypeVar, cast, Union, Sequence +from typing import Callable, List, Sequence, Tuple, TypeVar, Union, cast -from ..model import Model from ..config import registry -from ..types import ArrayXd, Ragged, Padded - +from ..model import Model +from ..types import ArrayXd, Padded, Ragged InT = TypeVar("InT", bound=Union[ArrayXd, Sequence[ArrayXd], Ragged, Padded]) diff --git a/thinc/layers/embed.py b/thinc/layers/embed.py index 703baf475..9d8d34e4a 100644 --- a/thinc/layers/embed.py +++ b/thinc/layers/embed.py @@ -1,13 +1,12 @@ -from typing import Dict, Callable, Tuple, Optional, Union, cast, TypeVar +from typing import Callable, Dict, Optional, Tuple, TypeVar, Union, cast -from .chain import chain -from .array_getitem import ints_getitem -from ..model import Model from ..config import registry -from ..types import Ints1d, Ints2d, Floats1d, Floats2d from ..initializers import uniform_init +from ..model import Model +from ..types import Floats1d, Floats2d, Ints1d, Ints2d from ..util import get_width, partial - +from .array_getitem import ints_getitem +from .chain import chain InT = TypeVar("InT", bound=Union[Ints1d, Ints2d]) OutT = Floats2d diff --git a/thinc/layers/expand_window.py b/thinc/layers/expand_window.py index 1075a49a2..193b82d39 100644 --- a/thinc/layers/expand_window.py +++ b/thinc/layers/expand_window.py @@ -1,10 +1,9 @@ -from typing import Tuple, TypeVar, Callable, Union, cast +from typing import Callable, Tuple, TypeVar, Union, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d, Ragged - InT = TypeVar("InT", Floats2d, Ragged) diff --git a/thinc/layers/gelu.py b/thinc/layers/gelu.py index 686b1f0d8..f51ee4545 100644 --- a/thinc/layers/gelu.py +++ b/thinc/layers/gelu.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Gelu.v1") diff --git a/thinc/layers/hard_swish.py b/thinc/layers/hard_swish.py index 773314a38..2fc135e41 100644 --- a/thinc/layers/hard_swish.py +++ b/thinc/layers/hard_swish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("HardSwish.v1") diff --git a/thinc/layers/hard_swish_mobilenet.py b/thinc/layers/hard_swish_mobilenet.py index 9f5f3fb9f..400622497 100644 --- a/thinc/layers/hard_swish_mobilenet.py +++ b/thinc/layers/hard_swish_mobilenet.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import 
Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("HardSwishMobilenet.v1") diff --git a/thinc/layers/hashembed.py b/thinc/layers/hashembed.py index 8c85fdb02..7ecd9b26a 100644 --- a/thinc/layers/hashembed.py +++ b/thinc/layers/hashembed.py @@ -1,13 +1,12 @@ -from typing import Callable, Dict, Tuple, Optional, Any, Union, cast, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, Union, cast -from .chain import chain -from .array_getitem import ints_getitem -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d, Ints2d, Ints1d from ..initializers import uniform_init +from ..model import Model +from ..types import Floats1d, Floats2d, Ints1d, Ints2d from ..util import partial - +from .array_getitem import ints_getitem +from .chain import chain InT = TypeVar("InT", bound=Union[Ints1d, Ints2d]) OutT = Floats2d diff --git a/thinc/layers/layernorm.py b/thinc/layers/layernorm.py index 684489c54..2090ed9a8 100644 --- a/thinc/layers/layernorm.py +++ b/thinc/layers/layernorm.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model +from ..backends import Ops from ..config import registry +from ..model import Model from ..types import Floats2d -from ..backends import Ops from ..util import get_width - InT = Floats2d diff --git a/thinc/layers/linear.py b/thinc/layers/linear.py index bbf7b7874..ef24ec044 100644 --- a/thinc/layers/linear.py +++ b/thinc/layers/linear.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d from ..initializers import glorot_uniform_init, zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/list2array.py b/thinc/layers/list2array.py index a52d6e6c6..a31d5d80d 100644 --- a/thinc/layers/list2array.py +++ b/thinc/layers/list2array.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, TypeVar, List +from typing import Callable, List, Tuple, TypeVar from ..backends import NumpyOps -from ..model import Model from ..config import registry +from ..model import Model from ..types import Array2d - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/list2padded.py b/thinc/layers/list2padded.py index 2a02f90e0..e98e88a5c 100644 --- a/thinc/layers/list2padded.py +++ b/thinc/layers/list2padded.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..types import Padded, List2d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import List2d, Padded InT = TypeVar("InT", bound=List2d) OutT = Padded diff --git a/thinc/layers/list2ragged.py b/thinc/layers/list2ragged.py index a63237dfe..25ad7bed3 100644 --- a/thinc/layers/list2ragged.py +++ b/thinc/layers/list2ragged.py @@ -1,9 +1,8 @@ -from typing 
import Tuple, List, Callable, cast, TypeVar +from typing import Callable, List, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import ListXd, ArrayXd, Ragged - +from ..model import Model +from ..types import ArrayXd, ListXd, Ragged InT = TypeVar("InT", bound=ListXd) OutT = Ragged diff --git a/thinc/layers/logistic.py b/thinc/layers/logistic.py index cda0c7dd5..43d45a330 100644 --- a/thinc/layers/logistic.py +++ b/thinc/layers/logistic.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable +from typing import Callable, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/lstm.py b/thinc/layers/lstm.py index 266fee6e3..c817cd4db 100644 --- a/thinc/layers/lstm.py +++ b/thinc/layers/lstm.py @@ -1,13 +1,13 @@ -from typing import Optional, Tuple, Callable, cast from functools import partial +from typing import Callable, Optional, Tuple, cast -from ..model import Model +from ..backends import Ops from ..config import registry -from ..util import get_width +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats1d, Floats2d, Floats4d, Padded, Ragged +from ..util import get_width from .noop import noop -from ..initializers import glorot_uniform_init, zero_init -from ..backends import Ops @registry.layers("LSTM.v1") @@ -45,8 +45,9 @@ def PyTorchLSTM( nO: int, nI: int, *, bi: bool = False, depth: int = 1, dropout: float = 0.0 ) -> Model[Padded, Padded]: import torch.nn - from .with_padded import with_padded + from .pytorchwrapper import PyTorchRNNWrapper + from .with_padded import with_padded if depth == 0: return noop() # type: ignore[misc] diff --git a/thinc/layers/map_list.py b/thinc/layers/map_list.py index b05a934b1..aaadf0b55 100644 --- a/thinc/layers/map_list.py +++ b/thinc/layers/map_list.py @@ -1,6 +1,6 @@ -from typing import Callable, TypeVar, List, Tuple, Optional -from ..model import Model +from typing import Callable, List, Optional, Tuple, TypeVar +from ..model import Model InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/maxout.py b/thinc/layers/maxout.py index 72788a5c7..ff0e52037 100644 --- a/thinc/layers/maxout.py +++ b/thinc/layers/maxout.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats2d from ..util import get_width, partial +from .chain import chain from .dropout import Dropout from .layernorm import LayerNorm -from .chain import chain - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/mish.py b/thinc/layers/mish.py index ab7a2a76c..32542b963 100644 --- a/thinc/layers/mish.py +++ b/thinc/layers/mish.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model -from ..initializers import glorot_uniform_init, zero_init from ..config import registry +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats1d, Floats2d from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout - +from .layernorm import LayerNorm InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/multisoftmax.py 
b/thinc/layers/multisoftmax.py index cf55ecc37..d07b684f4 100644 --- a/thinc/layers/multisoftmax.py +++ b/thinc/layers/multisoftmax.py @@ -1,11 +1,10 @@ -from typing import Optional, Tuple, Callable, cast +from typing import Callable, Optional, Tuple, cast -from ..types import Floats2d, Floats1d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/mxnetwrapper.py b/thinc/layers/mxnetwrapper.py index 642d01f38..2303871fb 100644 --- a/thinc/layers/mxnetwrapper.py +++ b/thinc/layers/mxnetwrapper.py @@ -1,11 +1,10 @@ -from typing import Callable, Tuple, Optional, Any, Type +from typing import Any, Callable, Optional, Tuple, Type +from ..config import registry from ..model import Model from ..shims import MXNetShim -from ..config import registry -from ..util import is_xp_array, is_mxnet_array -from ..util import mxnet2xp, xp2mxnet, convert_recursive from ..types import ArgsKwargs +from ..util import convert_recursive, is_mxnet_array, is_xp_array, mxnet2xp, xp2mxnet @registry.layers("MXNetWrapper.v1") diff --git a/thinc/layers/noop.py b/thinc/layers/noop.py index d1c83d1cd..2e855b875 100644 --- a/thinc/layers/noop.py +++ b/thinc/layers/noop.py @@ -1,8 +1,7 @@ -from typing import Tuple, Callable, TypeVar +from typing import Callable, Tuple, TypeVar -from ..model import Model from ..config import registry - +from ..model import Model InOutT = TypeVar("InOutT") diff --git a/thinc/layers/padded2list.py b/thinc/layers/padded2list.py index 8f1bee7e8..a4d374e6b 100644 --- a/thinc/layers/padded2list.py +++ b/thinc/layers/padded2list.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..types import Padded, List2d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import List2d, Padded InT = Padded OutT = TypeVar("OutT", bound=List2d) diff --git a/thinc/layers/parametricattention.py b/thinc/layers/parametricattention.py index d54a2f19e..a03906f51 100644 --- a/thinc/layers/parametricattention.py +++ b/thinc/layers/parametricattention.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Ragged from ..util import get_width - InT = Ragged OutT = Ragged diff --git a/thinc/layers/premap_ids.pyx b/thinc/layers/premap_ids.pyx index 74bc8dc6a..17acafa8e 100644 --- a/thinc/layers/premap_ids.pyx +++ b/thinc/layers/premap_ids.pyx @@ -1,13 +1,15 @@ # cython: binding=True, infer_types=True import numpy + from preshed.maps cimport PreshMap -from typing import Dict, Union, Optional, cast, Callable, Tuple, Mapping -from ..types import Ints1d, Ints2d + +from typing import Callable, Dict, Mapping, Optional, Tuple, Union, cast + from ..config import registry from ..model import Model +from ..types import Ints1d, Ints2d from ..util import to_numpy - InT = Union[Ints1d, Ints2d] OutT = Ints2d diff --git a/thinc/layers/pytorchwrapper.py b/thinc/layers/pytorchwrapper.py index a1b0c462a..39c8b95c1 100644 --- a/thinc/layers/pytorchwrapper.py +++ b/thinc/layers/pytorchwrapper.py @@ -1,12 +1,18 @@ -from typing import Callable, Dict, Tuple, Optional, Any, cast +from typing import Any, Callable, Dict, Optional, Tuple, cast from ..compat import torch +from ..config import registry from ..model 
import Model from ..shims import PyTorchGradScaler, PyTorchShim -from ..config import registry -from ..util import is_xp_array, is_torch_array, partial -from ..util import xp2torch, torch2xp, convert_recursive -from ..types import Floats3d, ArgsKwargs, Padded +from ..types import ArgsKwargs, Floats3d, Padded +from ..util import ( + convert_recursive, + is_torch_array, + is_xp_array, + partial, + torch2xp, + xp2torch, +) @registry.layers("PyTorchRNNWrapper.v1") diff --git a/thinc/layers/ragged2list.py b/thinc/layers/ragged2list.py index 35af28f2f..3d8463f11 100644 --- a/thinc/layers/ragged2list.py +++ b/thinc/layers/ragged2list.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import Ragged, ListXd - +from ..model import Model +from ..types import ListXd, Ragged InT = Ragged OutT = TypeVar("OutT", bound=ListXd) diff --git a/thinc/layers/reduce_first.py b/thinc/layers/reduce_first.py index ab72cb5e3..ede42c5d0 100644 --- a/thinc/layers/reduce_first.py +++ b/thinc/layers/reduce_first.py @@ -1,11 +1,10 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Ragged, Floats2d +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_last.py b/thinc/layers/reduce_last.py index b8194ec2b..d2de6a877 100644 --- a/thinc/layers/reduce_last.py +++ b/thinc/layers/reduce_last.py @@ -1,8 +1,8 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Ragged, Floats2d +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo InT = Ragged diff --git a/thinc/layers/reduce_max.py b/thinc/layers/reduce_max.py index ebafb5172..e6f033e48 100644 --- a/thinc/layers/reduce_max.py +++ b/thinc/layers/reduce_max.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, cast +from typing import Callable, Tuple, cast -from ..types import Floats2d, Ragged -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_mean.py b/thinc/layers/reduce_mean.py index f37ae8253..f1bd04898 100644 --- a/thinc/layers/reduce_mean.py +++ b/thinc/layers/reduce_mean.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, cast +from typing import Callable, Tuple, cast -from ..types import Floats2d, Ragged -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_sum.py b/thinc/layers/reduce_sum.py index e93a362d8..62ade00f6 100644 --- a/thinc/layers/reduce_sum.py +++ b/thinc/layers/reduce_sum.py @@ -1,11 +1,10 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/relu.py b/thinc/layers/relu.py index d1d3ebf74..488a1eff7 100644 --- a/thinc/layers/relu.py +++ b/thinc/layers/relu.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model -from 
..initializers import glorot_uniform_init, zero_init from ..config import registry -from ..types import Floats2d, Floats1d +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout - +from .layernorm import LayerNorm InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/remap_ids.py b/thinc/layers/remap_ids.py index 265b24a9d..3801b703f 100644 --- a/thinc/layers/remap_ids.py +++ b/thinc/layers/remap_ids.py @@ -1,12 +1,10 @@ -from typing import Tuple, Callable, Sequence, cast -from typing import Dict, Union, Optional, Hashable, Any +from typing import Any, Callable, Dict, Hashable, Optional, Sequence, Tuple, Union, cast -from ..model import Model from ..config import registry -from ..types import Ints1d, Ints2d, DTypes +from ..model import Model +from ..types import DTypes, Ints1d, Ints2d from ..util import is_xp_array, to_numpy - InT = Union[Sequence[Hashable], Ints1d, Ints2d] OutT = Ints2d diff --git a/thinc/layers/residual.py b/thinc/layers/residual.py index 3793ee1d5..f213e9bf5 100644 --- a/thinc/layers/residual.py +++ b/thinc/layers/residual.py @@ -1,8 +1,8 @@ -from typing import Tuple, Callable, Optional, List, TypeVar +from typing import Callable, List, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d, Floats3d, Floats4d, FloatsXd, Ragged, Padded +from ..model import Model +from ..types import Floats1d, Floats2d, Floats3d, Floats4d, FloatsXd, Padded, Ragged # fmt: off InT = TypeVar( diff --git a/thinc/layers/resizable.py b/thinc/layers/resizable.py index 2dd4dde1a..606d50dae 100644 --- a/thinc/layers/resizable.py +++ b/thinc/layers/resizable.py @@ -1,7 +1,7 @@ from typing import Callable, Optional, TypeVar -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d InT = TypeVar("InT") diff --git a/thinc/layers/siamese.py b/thinc/layers/siamese.py index 82bafacbb..33579a4de 100644 --- a/thinc/layers/siamese.py +++ b/thinc/layers/siamese.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar +from typing import Callable, Optional, Tuple, TypeVar +from ..config import registry from ..model import Model from ..types import ArrayXd -from ..config import registry from ..util import get_width - LayerT = TypeVar("LayerT") SimT = TypeVar("SimT") InT = Tuple[LayerT, LayerT] diff --git a/thinc/layers/sigmoid.py b/thinc/layers/sigmoid.py index d8933b66e..157047e37 100644 --- a/thinc/layers/sigmoid.py +++ b/thinc/layers/sigmoid.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats2d, Floats1d from ..initializers import zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/sigmoid_activation.py b/thinc/layers/sigmoid_activation.py index b87261075..37e188ab8 100644 --- a/thinc/layers/sigmoid_activation.py +++ b/thinc/layers/sigmoid_activation.py @@ -1,7 +1,7 @@ -from typing import TypeVar, Tuple, Callable, cast +from typing import Callable, Tuple, TypeVar, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import FloatsXdT diff --git 
a/thinc/layers/softmax.py b/thinc/layers/softmax.py index 9d766f1db..8b7301af0 100644 --- a/thinc/layers/softmax.py +++ b/thinc/layers/softmax.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats2d, Floats1d from ..initializers import zero_init -from ..util import get_width, partial, ArrayInfo - +from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import ArrayInfo, get_width, partial InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/softmax_activation.py b/thinc/layers/softmax_activation.py index 858320143..974ed2c8c 100644 --- a/thinc/layers/softmax_activation.py +++ b/thinc/layers/softmax_activation.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable +from typing import Callable, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/sparselinear.pyx b/thinc/layers/sparselinear.pyx index b9a982f4b..a1be75ccc 100644 --- a/thinc/layers/sparselinear.pyx +++ b/thinc/layers/sparselinear.pyx @@ -1,16 +1,15 @@ # cython: infer_types=True, cdivision=True, bounds_check=False, wraparound=False -cimport numpy as np -from libc.stdint cimport uint64_t, int32_t, uint32_t cimport cython +cimport numpy as np +from libc.stdint cimport int32_t, uint32_t, uint64_t -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple -from ..types import ArrayXd -from ..model import Model +from ..backends import CupyOps, NumpyOps from ..config import registry -from ..util import get_width, is_cupy_array, is_numpy_array, get_array_module -from ..backends import NumpyOps, CupyOps - +from ..model import Model +from ..types import ArrayXd +from ..util import get_array_module, get_width, is_cupy_array, is_numpy_array InT = Tuple[ArrayXd, ArrayXd, ArrayXd] OutT = ArrayXd diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index 469b1636d..91a6b1a31 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -1,11 +1,11 @@ -from typing import Tuple, List, Callable, Sequence +from typing import Callable, List, Sequence, Tuple + from murmurhash import hash_unicode -from ..model import Model from ..config import registry +from ..model import Model from ..types import Ints2d - InT = Sequence[Sequence[str]] OutT = List[Ints2d] diff --git a/thinc/layers/swish.py b/thinc/layers/swish.py index 4f3fe49d5..5cf8be50f 100644 --- a/thinc/layers/swish.py +++ b/thinc/layers/swish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Swish.v1") diff --git a/thinc/layers/tensorflowwrapper.py b/thinc/layers/tensorflowwrapper.py index 7e166ea50..a77e0b3af 100644 --- a/thinc/layers/tensorflowwrapper.py +++ b/thinc/layers/tensorflowwrapper.py @@ -2,12 +2,18 @@ import srsly +from ..compat import tensorflow as tf from ..model import 
Model from ..shims import TensorFlowShim, keras_model_fns, maybe_handshake_model -from ..util import xp2tensorflow, tensorflow2xp, assert_tensorflow_installed -from ..util import is_tensorflow_array, convert_recursive, is_xp_array -from ..types import ArrayXd, ArgsKwargs -from ..compat import tensorflow as tf +from ..types import ArgsKwargs, ArrayXd +from ..util import ( + assert_tensorflow_installed, + convert_recursive, + is_tensorflow_array, + is_xp_array, + tensorflow2xp, + xp2tensorflow, +) InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/torchscriptwrapper.py b/thinc/layers/torchscriptwrapper.py index a74db9225..a3a8e1ac0 100644 --- a/thinc/layers/torchscriptwrapper.py +++ b/thinc/layers/torchscriptwrapper.py @@ -3,8 +3,11 @@ from ..compat import torch from ..model import Model from ..shims import PyTorchGradScaler, PyTorchShim, TorchScriptShim -from .pytorchwrapper import forward, convert_pytorch_default_inputs -from .pytorchwrapper import convert_pytorch_default_outputs +from .pytorchwrapper import ( + convert_pytorch_default_inputs, + convert_pytorch_default_outputs, + forward, +) def TorchScriptWrapper_v1( diff --git a/thinc/layers/tuplify.py b/thinc/layers/tuplify.py index 99b4d7589..35dfdc66f 100644 --- a/thinc/layers/tuplify.py +++ b/thinc/layers/tuplify.py @@ -1,7 +1,7 @@ -from typing import Optional, Tuple, Any, TypeVar +from typing import Any, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry +from ..model import Model InT = TypeVar("InT") OutT = Tuple diff --git a/thinc/layers/uniqued.py b/thinc/layers/uniqued.py index 582b31093..26f2cdf16 100644 --- a/thinc/layers/uniqued.py +++ b/thinc/layers/uniqued.py @@ -1,10 +1,10 @@ -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple + import numpy -from ..model import Model from ..config import registry -from ..types import Ints2d, Floats2d - +from ..model import Model +from ..types import Floats2d, Ints2d InT = Ints2d OutT = Floats2d diff --git a/thinc/layers/with_array.py b/thinc/layers/with_array.py index 2511b3c17..31b9fa494 100644 --- a/thinc/layers/with_array.py +++ b/thinc/layers/with_array.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, Union, cast +from typing import Callable, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..model import Model from ..config import registry -from ..types import Padded, Ragged, ArrayXd, Array3d, ListXd - +from ..model import Model +from ..types import Array3d, ArrayXd, ListXd, Padded, Ragged NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_array2d.py b/thinc/layers/with_array2d.py index 740593a26..98eba8b96 100644 --- a/thinc/layers/with_array2d.py +++ b/thinc/layers/with_array2d.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..model import Model from ..config import registry +from ..model import Model from ..types import Array2d, Floats2d, List2d, Padded, Ragged - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_cpu.py b/thinc/layers/with_cpu.py index 3fc7645a8..39e5965f2 100644 --- a/thinc/layers/with_cpu.py +++ b/thinc/layers/with_cpu.py @@ -1,10 +1,11 @@ -from typing import Tuple, Callable, Any +from typing import Any, Callable, Tuple import numpy + from thinc.backends import Ops -from ..model import Model from ..config import registry +from ..model import Model 
@registry.layers("with_cpu.v1") diff --git a/thinc/layers/with_debug.py b/thinc/layers/with_debug.py index 91505c9f6..21790e468 100644 --- a/thinc/layers/with_debug.py +++ b/thinc/layers/with_debug.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..model import Model diff --git a/thinc/layers/with_flatten.py b/thinc/layers/with_flatten.py index 5cf8a85cf..9658a788f 100644 --- a/thinc/layers/with_flatten.py +++ b/thinc/layers/with_flatten.py @@ -1,7 +1,7 @@ -from typing import Tuple, Callable, Sequence, Any, cast, TypeVar, Optional, List +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import ArrayXd, ListXd ItemT = TypeVar("ItemT") diff --git a/thinc/layers/with_flatten_v2.py b/thinc/layers/with_flatten_v2.py index 4dd75e0d1..95549994f 100644 --- a/thinc/layers/with_flatten_v2.py +++ b/thinc/layers/with_flatten_v2.py @@ -1,8 +1,7 @@ -from typing import Tuple, Callable, Sequence, Any, cast, TypeVar, Optional, List +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, cast -from ..model import Model from ..config import registry - +from ..model import Model InItemT = TypeVar("InItemT") OutItemT = TypeVar("OutItemT") diff --git a/thinc/layers/with_getitem.py b/thinc/layers/with_getitem.py index 9f6b93459..fb6a3cccf 100644 --- a/thinc/layers/with_getitem.py +++ b/thinc/layers/with_getitem.py @@ -1,8 +1,7 @@ -from typing import Callable, Optional, Tuple, Any +from typing import Any, Callable, Optional, Tuple -from ..model import Model from ..config import registry - +from ..model import Model InT = Tuple[Any, ...] OutT = Tuple[Any, ...] 
diff --git a/thinc/layers/with_list.py b/thinc/layers/with_list.py index 9f86c24dc..5331758a5 100644 --- a/thinc/layers/with_list.py +++ b/thinc/layers/with_list.py @@ -1,8 +1,8 @@ -from typing import Tuple, Callable, List, Optional, TypeVar, Union, cast +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast -from ..types import Padded, Ragged, Array2d, List2d, Floats2d, Ints2d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Array2d, Floats2d, Ints2d, List2d, Padded, Ragged SeqT = TypeVar("SeqT", Padded, Ragged, List2d, List[Floats2d], List[Ints2d]) diff --git a/thinc/layers/with_nvtx_range.py b/thinc/layers/with_nvtx_range.py index bf270abce..480f82a7c 100644 --- a/thinc/layers/with_nvtx_range.py +++ b/thinc/layers/with_nvtx_range.py @@ -1,9 +1,8 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..model import Model from ..util import use_nvtx_range - _ModelT = TypeVar("_ModelT", bound=Model) diff --git a/thinc/layers/with_padded.py b/thinc/layers/with_padded.py index 379df1bef..b92c6308a 100644 --- a/thinc/layers/with_padded.py +++ b/thinc/layers/with_padded.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar, Union, cast, List +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast -from ..types import Padded, Ragged, Floats3d, Ints1d, List2d, Array2d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Array2d, Floats3d, Ints1d, List2d, Padded, Ragged from ..util import is_xp_array - PaddedData = Tuple[Floats3d, Ints1d, Ints1d, Ints1d] SeqT = TypeVar("SeqT", bound=Union[Padded, Ragged, List2d, Floats3d, PaddedData]) diff --git a/thinc/layers/with_ragged.py b/thinc/layers/with_ragged.py index cbff6f59d..6cf45d9e8 100644 --- a/thinc/layers/with_ragged.py +++ b/thinc/layers/with_ragged.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..types import Padded, Ragged, Array2d, ListXd, List2d, Ints1d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import Array2d, Ints1d, List2d, ListXd, Padded, Ragged NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_reshape.py b/thinc/layers/with_reshape.py index 5bd3e9025..b40ada757 100644 --- a/thinc/layers/with_reshape.py +++ b/thinc/layers/with_reshape.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, Optional, cast, TypeVar, List +from typing import Callable, List, Optional, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import Array3d, Array2d - +from ..model import Model +from ..types import Array2d, Array3d InT = TypeVar("InT", bound=Array3d) OutT = TypeVar("OutT", bound=Array2d) diff --git a/thinc/layers/with_signpost_interval.py b/thinc/layers/with_signpost_interval.py index 9a468d896..58f5d4165 100644 --- a/thinc/layers/with_signpost_interval.py +++ b/thinc/layers/with_signpost_interval.py @@ -1,9 +1,8 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..compat import has_os_signpost, os_signpost from ..model import Model - _ModelT = TypeVar("_ModelT", bound=Model) diff --git a/thinc/loss.py b/thinc/loss.py index e8edb194d..756dac4c3 100644 --- a/thinc/loss.py +++ 
b/thinc/loss.py @@ -1,11 +1,20 @@ -from typing import Tuple, Sequence, cast, TypeVar, Generic, Any, Union, Optional, List -from typing import Dict from abc import abstractmethod +from typing import ( + Any, + Dict, + Generic, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) +from .config import registry from .types import Floats2d, Ints1d from .util import get_array_module, to_categorical -from .config import registry - LossT = TypeVar("LossT") GradT = TypeVar("GradT") diff --git a/thinc/model.py b/thinc/model.py index e094d5294..ba49215c1 100644 --- a/thinc/model.py +++ b/thinc/model.py @@ -1,20 +1,39 @@ -from typing import Dict, List, Callable, Optional, Any, Union, Iterable, Set, cast -from typing import Generic, Sequence, Tuple, TypeVar, Iterator import contextlib -from contextvars import ContextVar -import srsly -from pathlib import Path import copy import functools import threading +from contextvars import ContextVar +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + Union, + cast, +) + +import srsly -from .backends import ParamServer, Ops, NumpyOps, CupyOps, get_current_ops +from .backends import CupyOps, NumpyOps, Ops, ParamServer, get_current_ops from .optimizers import Optimizer # noqa: F401 from .shims import Shim -from .util import convert_recursive, is_xp_array, DATA_VALIDATION -from .util import partial, validate_fwd_input_output from .types import FloatsXd - +from .util import ( + DATA_VALIDATION, + convert_recursive, + is_xp_array, + partial, + validate_fwd_input_output, +) InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/mypy.py b/thinc/mypy.py index e02f6d5be..73c6e72f6 100644 --- a/thinc/mypy.py +++ b/thinc/mypy.py @@ -1,13 +1,14 @@ -from typing import Dict, List import itertools -from mypy.errors import Errors +from typing import Dict, List + +from mypy.checker import TypeChecker from mypy.errorcodes import ErrorCode +from mypy.errors import Errors +from mypy.nodes import CallExpr, Decorator, Expression, FuncDef, MypyFile, NameExpr from mypy.options import Options -from mypy.plugin import FunctionContext, Plugin, CheckerPluginInterface -from mypy.types import Instance, Type, CallableType, TypeVarType -from mypy.nodes import Expression, CallExpr, NameExpr, FuncDef, Decorator, MypyFile -from mypy.checker import TypeChecker +from mypy.plugin import CheckerPluginInterface, FunctionContext, Plugin from mypy.subtypes import is_subtype +from mypy.types import CallableType, Instance, Type, TypeVarType thinc_model_fullname = "thinc.model.Model" chained_out_fullname = "thinc.types.XY_YZ_OutT" diff --git a/thinc/optimizers.py b/thinc/optimizers.py index f34cd2ff8..4b4eca2b6 100644 --- a/thinc/optimizers.py +++ b/thinc/optimizers.py @@ -1,12 +1,10 @@ import math - -from typing import Dict, Optional, Union, Tuple, List, cast from collections import defaultdict +from typing import Dict, List, Optional, Tuple, Union, cast from .backends import get_array_ops -from .types import Generator, FloatsXd from .config import registry - +from .types import FloatsXd, Generator KeyT = Tuple[int, str] FloatOrSeq = Union[float, List[float], Generator] diff --git a/thinc/schedules.py b/thinc/schedules.py index 87581af74..c13868a5d 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,5 +1,6 @@ """Generators that provide different rates, schedules, decays or series.""" from typing import Iterable + import numpy from .config import 
registry diff --git a/thinc/shims/__init__.py b/thinc/shims/__init__.py index 9cd8bd030..fb246c9f2 100644 --- a/thinc/shims/__init__.py +++ b/thinc/shims/__init__.py @@ -1,10 +1,9 @@ -from .shim import Shim +from .mxnet import MXNetShim from .pytorch import PyTorchShim from .pytorch_grad_scaler import PyTorchGradScaler -from .tensorflow import keras_model_fns, TensorFlowShim, maybe_handshake_model +from .shim import Shim +from .tensorflow import TensorFlowShim, keras_model_fns, maybe_handshake_model from .torchscript import TorchScriptShim -from .mxnet import MXNetShim - # fmt: off __all__ = [ diff --git a/thinc/shims/mxnet.py b/thinc/shims/mxnet.py index 3962a2ef5..2dd36a62f 100644 --- a/thinc/shims/mxnet.py +++ b/thinc/shims/mxnet.py @@ -1,13 +1,19 @@ +import copy from typing import Any, cast + import srsly -import copy -from ..util import mxnet2xp, convert_recursive, make_tempfile, xp2mxnet -from ..util import get_array_module +from ..compat import mxnet as mx from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd +from ..util import ( + convert_recursive, + get_array_module, + make_tempfile, + mxnet2xp, + xp2mxnet, +) from .shim import Shim -from ..compat import mxnet as mx class MXNetShim(Shim): diff --git a/thinc/shims/pytorch.py b/thinc/shims/pytorch.py index 9582c8616..505669867 100644 --- a/thinc/shims/pytorch.py +++ b/thinc/shims/pytorch.py @@ -1,16 +1,21 @@ -from typing import Any, Dict, Optional, cast, Callable import contextlib -from io import BytesIO import itertools +from io import BytesIO +from typing import Any, Callable, Dict, Optional, cast + import srsly -from ..util import torch2xp, xp2torch, convert_recursive, iterate_recursive -from ..util import get_torch_default_device +from ..backends import CupyOps, context_pools, get_current_ops, set_gpu_allocator from ..compat import torch -from ..backends import get_current_ops, context_pools, CupyOps -from ..backends import set_gpu_allocator from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd +from ..util import ( + convert_recursive, + get_torch_default_device, + iterate_recursive, + torch2xp, + xp2torch, +) from .pytorch_grad_scaler import PyTorchGradScaler from .shim import Shim diff --git a/thinc/shims/shim.py b/thinc/shims/shim.py index 0c246e8d4..ef88408a3 100644 --- a/thinc/shims/shim.py +++ b/thinc/shims/shim.py @@ -1,8 +1,8 @@ -from typing import Any, Optional, Tuple, Callable, Dict, Union -import copy import contextlib -from pathlib import Path +import copy import threading +from pathlib import Path +from typing import Any, Callable, Dict, Optional, Tuple, Union class Shim: # pragma: no cover diff --git a/thinc/shims/tensorflow.py b/thinc/shims/tensorflow.py index d630d86f9..bcaae3aac 100644 --- a/thinc/shims/tensorflow.py +++ b/thinc/shims/tensorflow.py @@ -1,17 +1,18 @@ -from typing import Any, Dict, List, Optional -import catalogue import contextlib import copy from io import BytesIO +from typing import Any, Dict, List, Optional + +import catalogue import numpy from ..backends import Ops, get_current_ops +from ..compat import cupy, h5py +from ..compat import tensorflow as tf from ..optimizers import Optimizer from ..types import ArgsKwargs, ArrayXd from ..util import get_array_module from .shim import Shim -from ..compat import tensorflow as tf -from ..compat import cupy, h5py keras_model_fns = catalogue.create("thinc", "keras", entry_points=True) diff --git a/thinc/shims/torchscript.py b/thinc/shims/torchscript.py index 675718cd1..6c05c8a9b 100644 --- 
a/thinc/shims/torchscript.py +++ b/thinc/shims/torchscript.py @@ -1,5 +1,6 @@ -from typing import Any, Optional from io import BytesIO +from typing import Any, Optional + import srsly from ..compat import torch diff --git a/thinc/tests/backends/test_mem.py b/thinc/tests/backends/test_mem.py index cb26e24e0..bf867726d 100644 --- a/thinc/tests/backends/test_mem.py +++ b/thinc/tests/backends/test_mem.py @@ -1,6 +1,7 @@ -from thinc.backends._param_server import ParamServer import numpy +from thinc.backends._param_server import ParamServer + def test_param_server_init(): array = numpy.zeros((5,), dtype="f") diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index ba296e271..d5235ecc3 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1,26 +1,32 @@ +import inspect +import platform from typing import Tuple, cast -import pytest import numpy -import platform +import pytest from hypothesis import given, settings from hypothesis.strategies import composite, integers from numpy.testing import assert_allclose from packaging.version import Version -from thinc.api import NumpyOps, CupyOps, Ops, get_ops -from thinc.api import get_current_ops, use_ops -from thinc.util import torch2xp, xp2torch + +from thinc.api import ( + LSTM, + CupyOps, + NumpyOps, + Ops, + fix_random_seed, + get_current_ops, + get_ops, + use_ops, +) +from thinc.backends._custom_kernels import KERNELS, KERNELS_LIST, compile_mmh from thinc.compat import has_cupy_gpu, has_torch, torch_version -from thinc.api import fix_random_seed -from thinc.api import LSTM from thinc.types import Floats2d -from thinc.backends._custom_kernels import KERNELS_LIST, KERNELS, compile_mmh -import inspect +from thinc.util import torch2xp, xp2torch from .. 
import strategies from ..strategies import arrays_BI, ndarrays_of_shape - MAX_EXAMPLES = 10 VANILLA_OPS = Ops(numpy) # type:ignore @@ -37,9 +43,10 @@ def create_pytorch_funcs(): - import torch import math + import torch + def torch_relu(x): return torch.nn.functional.relu(x) diff --git a/thinc/tests/conftest.py b/thinc/tests/conftest.py index 19b5137d3..026f3eb06 100644 --- a/thinc/tests/conftest.py +++ b/thinc/tests/conftest.py @@ -52,9 +52,10 @@ def getopt(opt): @pytest.fixture() def pathy_fixture(): pytest.importorskip("pathy") - import tempfile import shutil - from pathy import use_fs, Pathy + import tempfile + + from pathy import Pathy, use_fs temp_folder = tempfile.mkdtemp(prefix="thinc-pathy") use_fs(temp_folder) diff --git a/thinc/tests/layers/test_basic_tagger.py b/thinc/tests/layers/test_basic_tagger.py index 3046c1b04..855a6d6ad 100644 --- a/thinc/tests/layers/test_basic_tagger.py +++ b/thinc/tests/layers/test_basic_tagger.py @@ -1,7 +1,18 @@ -import pytest import random -from thinc.api import Model, Relu, Softmax, HashEmbed, expand_window -from thinc.api import chain, with_array, Adam, strings2arrays + +import pytest + +from thinc.api import ( + Adam, + HashEmbed, + Model, + Relu, + Softmax, + chain, + expand_window, + strings2arrays, + with_array, +) @pytest.fixture(scope="module") diff --git a/thinc/tests/layers/test_combinators.py b/thinc/tests/layers/test_combinators.py index ea5583108..c7b4fbe9f 100644 --- a/thinc/tests/layers/test_combinators.py +++ b/thinc/tests/layers/test_combinators.py @@ -1,8 +1,18 @@ -import pytest import numpy +import pytest from numpy.testing import assert_allclose -from thinc.api import clone, concatenate, noop, add, map_list -from thinc.api import Linear, Dropout, Model, NumpyOps + +from thinc.api import ( + Dropout, + Linear, + Model, + NumpyOps, + add, + clone, + concatenate, + map_list, + noop, +) from thinc.layers import chain, tuplify diff --git a/thinc/tests/layers/test_feed_forward.py b/thinc/tests/layers/test_feed_forward.py index b18a0fc0b..a808bb445 100644 --- a/thinc/tests/layers/test_feed_forward.py +++ b/thinc/tests/layers/test_feed_forward.py @@ -1,8 +1,10 @@ -import pytest -import numpy from functools import partial + +import numpy +import pytest from numpy.testing import assert_allclose -from thinc.api import chain, Linear, Relu, NumpyOps + +from thinc.api import Linear, NumpyOps, Relu, chain @pytest.fixture(params=[1, 2, 9]) diff --git a/thinc/tests/layers/test_hash_embed.py b/thinc/tests/layers/test_hash_embed.py index 8df50a03f..5b79539fa 100644 --- a/thinc/tests/layers/test_hash_embed.py +++ b/thinc/tests/layers/test_hash_embed.py @@ -1,4 +1,5 @@ import numpy + from thinc.api import HashEmbed diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 761cad880..0ef559d96 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -1,14 +1,15 @@ from typing import List, Optional -from numpy.testing import assert_almost_equal -from thinc.api import registry, with_padded, Dropout, NumpyOps, Model -from thinc.backends import NumpyOps -from thinc.util import data_validation, get_width -from thinc.types import Ragged, Padded, Array2d, Floats2d, FloatsXd, Shape -from thinc.compat import has_torch import numpy import pytest import srsly +from numpy.testing import assert_almost_equal + +from thinc.api import Dropout, Model, NumpyOps, registry, with_padded +from thinc.backends import NumpyOps +from thinc.compat import has_torch +from thinc.types import Array2d, 
Floats2d, FloatsXd, Padded, Ragged, Shape +from thinc.util import data_validation, get_width OPS = NumpyOps() diff --git a/thinc/tests/layers/test_linear.py b/thinc/tests/layers/test_linear.py index 2362b556b..345669d87 100644 --- a/thinc/tests/layers/test_linear.py +++ b/thinc/tests/layers/test_linear.py @@ -1,9 +1,10 @@ +import numpy import pytest -from mock import MagicMock from hypothesis import given, settings -import numpy +from mock import MagicMock from numpy.testing import assert_allclose -from thinc.api import Linear, chain, Dropout, SGD + +from thinc.api import SGD, Dropout, Linear, chain from ..strategies import arrays_OI_O_BI from ..util import get_model, get_shape diff --git a/thinc/tests/layers/test_lstm.py b/thinc/tests/layers/test_lstm.py index 208ffb58b..44c90ed4c 100644 --- a/thinc/tests/layers/test_lstm.py +++ b/thinc/tests/layers/test_lstm.py @@ -1,10 +1,11 @@ -import numpy import timeit -from thinc.api import NumpyOps, LSTM, PyTorchLSTM, with_padded, fix_random_seed -from thinc.api import Ops -from thinc.compat import has_torch + +import numpy import pytest +from thinc.api import LSTM, NumpyOps, Ops, PyTorchLSTM, fix_random_seed, with_padded +from thinc.compat import has_torch + @pytest.fixture(params=[1, 6]) def nI(request): diff --git a/thinc/tests/layers/test_mappers.py b/thinc/tests/layers/test_mappers.py index e890dd086..85e984bc4 100644 --- a/thinc/tests/layers/test_mappers.py +++ b/thinc/tests/layers/test_mappers.py @@ -1,5 +1,6 @@ -import pytest import numpy +import pytest + from thinc.layers import premap_ids, remap_ids, remap_ids_v2 diff --git a/thinc/tests/layers/test_mnist.py b/thinc/tests/layers/test_mnist.py index 321de3a0f..060007cfd 100644 --- a/thinc/tests/layers/test_mnist.py +++ b/thinc/tests/layers/test_mnist.py @@ -1,8 +1,16 @@ import pytest -from thinc.api import Relu, Softmax, chain, clone, Adam -from thinc.api import PyTorchWrapper, TensorFlowWrapper -from thinc.api import get_current_ops -from thinc.compat import has_torch, has_tensorflow + +from thinc.api import ( + Adam, + PyTorchWrapper, + Relu, + Softmax, + TensorFlowWrapper, + chain, + clone, + get_current_ops, +) +from thinc.compat import has_tensorflow, has_torch @pytest.fixture(scope="module") diff --git a/thinc/tests/layers/test_mxnet_wrapper.py b/thinc/tests/layers/test_mxnet_wrapper.py index b954a8ec5..8ddf5dfce 100644 --- a/thinc/tests/layers/test_mxnet_wrapper.py +++ b/thinc/tests/layers/test_mxnet_wrapper.py @@ -2,10 +2,19 @@ import numpy import pytest -from thinc.api import Adam, ArgsKwargs, Model, Ops, MXNetWrapper -from thinc.api import get_current_ops, mxnet2xp, xp2mxnet -from thinc.types import Array2d, Array1d, IntsXd + +from thinc.api import ( + Adam, + ArgsKwargs, + Model, + MXNetWrapper, + Ops, + get_current_ops, + mxnet2xp, + xp2mxnet, +) from thinc.compat import has_cupy_gpu, has_mxnet +from thinc.types import Array1d, Array2d, IntsXd from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index f4f83cb60..aa40d9044 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -1,20 +1,34 @@ -from thinc.api import Linear, SGD, PyTorchWrapper, PyTorchWrapper_v2, PyTorchWrapper_v3 -from thinc.api import xp2torch, torch2xp, ArgsKwargs, use_ops -from thinc.api import chain, get_current_ops, Relu -from thinc.api import CupyOps, MPSOps, NumpyOps +import numpy +import pytest + +from thinc.api 
import ( + SGD, + ArgsKwargs, + CupyOps, + Linear, + MPSOps, + NumpyOps, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, + Relu, + chain, + get_current_ops, + torch2xp, + use_ops, + xp2torch, +) from thinc.backends import context_pools +from thinc.compat import has_cupy_gpu, has_torch, has_torch_amp, has_torch_mps_gpu from thinc.layers.pytorchwrapper import PyTorchWrapper_v3 +from thinc.shims.pytorch import ( + default_deserialize_torch_model, + default_serialize_torch_model, +) from thinc.shims.pytorch_grad_scaler import PyTorchGradScaler -from thinc.shims.pytorch import default_deserialize_torch_model -from thinc.shims.pytorch import default_serialize_torch_model -from thinc.compat import has_torch, has_torch_amp -from thinc.compat import has_cupy_gpu, has_torch_mps_gpu -import numpy -import pytest from thinc.util import get_torch_default_device -from ..util import make_tempdir, check_input_converters - +from ..util import check_input_converters, make_tempdir XP_OPS = [NumpyOps()] if has_cupy_gpu: diff --git a/thinc/tests/layers/test_reduce.py b/thinc/tests/layers/test_reduce.py index d26065c4a..608561e13 100644 --- a/thinc/tests/layers/test_reduce.py +++ b/thinc/tests/layers/test_reduce.py @@ -1,5 +1,6 @@ -import pytest import numpy +import pytest + from thinc.api import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum from thinc.types import Ragged diff --git a/thinc/tests/layers/test_resizable.py b/thinc/tests/layers/test_resizable.py index dfb6c67fd..ffa256de5 100644 --- a/thinc/tests/layers/test_resizable.py +++ b/thinc/tests/layers/test_resizable.py @@ -1,7 +1,9 @@ -import pytest from functools import partial -from thinc.api import resizable, Linear -from thinc.layers.resizable import resize_model, resize_linear_weighted + +import pytest + +from thinc.api import Linear, resizable +from thinc.layers.resizable import resize_linear_weighted, resize_model @pytest.fixture diff --git a/thinc/tests/layers/test_shim.py b/thinc/tests/layers/test_shim.py index bacde5cf6..dcb43ab1e 100644 --- a/thinc/tests/layers/test_shim.py +++ b/thinc/tests/layers/test_shim.py @@ -1,5 +1,7 @@ from typing import List + from thinc.shims.shim import Shim + from ..util import make_tempdir diff --git a/thinc/tests/layers/test_softmax.py b/thinc/tests/layers/test_softmax.py index 69072b558..95e2f41c7 100644 --- a/thinc/tests/layers/test_softmax.py +++ b/thinc/tests/layers/test_softmax.py @@ -1,8 +1,8 @@ from typing import Tuple, cast import numpy -from numpy.testing import assert_allclose import pytest +from numpy.testing import assert_allclose from thinc.api import Model, NumpyOps, Softmax_v2 from thinc.types import Floats2d, Ints1d diff --git a/thinc/tests/layers/test_sparse_linear.py b/thinc/tests/layers/test_sparse_linear.py index 87c5a3a75..cce0d1023 100644 --- a/thinc/tests/layers/test_sparse_linear.py +++ b/thinc/tests/layers/test_sparse_linear.py @@ -1,7 +1,9 @@ import math + import numpy import pytest -from thinc.api import SGD, to_categorical, SparseLinear, SparseLinear_v2 + +from thinc.api import SGD, SparseLinear, SparseLinear_v2, to_categorical @pytest.fixture diff --git a/thinc/tests/layers/test_tensorflow_wrapper.py b/thinc/tests/layers/test_tensorflow_wrapper.py index c1b85da3b..4741f6dc3 100644 --- a/thinc/tests/layers/test_tensorflow_wrapper.py +++ b/thinc/tests/layers/test_tensorflow_wrapper.py @@ -1,9 +1,19 @@ import numpy import pytest -from thinc.api import Adam, ArgsKwargs, Linear, Model, TensorFlowWrapper -from thinc.api import get_current_ops, keras_subclass, 
tensorflow2xp, xp2tensorflow -from thinc.util import to_categorical + +from thinc.api import ( + Adam, + ArgsKwargs, + Linear, + Model, + TensorFlowWrapper, + get_current_ops, + keras_subclass, + tensorflow2xp, + xp2tensorflow, +) from thinc.compat import has_cupy_gpu, has_tensorflow +from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_torchscriptwrapper.py b/thinc/tests/layers/test_torchscriptwrapper.py index 37ff9ef08..b37afa3c3 100644 --- a/thinc/tests/layers/test_torchscriptwrapper.py +++ b/thinc/tests/layers/test_torchscriptwrapper.py @@ -1,8 +1,11 @@ -import pytest import numpy +import pytest -from thinc.api import PyTorchWrapper_v2, TorchScriptWrapper_v1 -from thinc.api import pytorch_to_torchscript_wrapper +from thinc.api import ( + PyTorchWrapper_v2, + TorchScriptWrapper_v1, + pytorch_to_torchscript_wrapper, +) from thinc.compat import has_torch, torch diff --git a/thinc/tests/layers/test_transforms.py b/thinc/tests/layers/test_transforms.py index 8de5341d7..3a9a110f1 100644 --- a/thinc/tests/layers/test_transforms.py +++ b/thinc/tests/layers/test_transforms.py @@ -1,7 +1,8 @@ -from thinc.api import strings2arrays, NumpyOps, Ragged, registry import numpy import pytest +from thinc.api import NumpyOps, Ragged, registry, strings2arrays + from ..util import get_data_checker diff --git a/thinc/tests/layers/test_uniqued.py b/thinc/tests/layers/test_uniqued.py index 9cb207ca5..685da1deb 100644 --- a/thinc/tests/layers/test_uniqued.py +++ b/thinc/tests/layers/test_uniqued.py @@ -1,10 +1,11 @@ -import pytest import numpy +import pytest +from hypothesis import given, settings +from hypothesis.strategies import composite, integers, lists +from numpy.testing import assert_allclose + from thinc.layers import Embed from thinc.layers.uniqued import uniqued -from numpy.testing import assert_allclose -from hypothesis import given, settings -from hypothesis.strategies import integers, lists, composite ROWS = 10 diff --git a/thinc/tests/layers/test_with_debug.py b/thinc/tests/layers/test_with_debug.py index 679c1f21e..3f65a3ac3 100644 --- a/thinc/tests/layers/test_with_debug.py +++ b/thinc/tests/layers/test_with_debug.py @@ -1,5 +1,6 @@ from mock import MagicMock -from thinc.api import with_debug, Linear + +from thinc.api import Linear, with_debug def test_with_debug(): diff --git a/thinc/tests/layers/test_with_flatten.py b/thinc/tests/layers/test_with_flatten.py index 1ff622026..86d18eb67 100644 --- a/thinc/tests/layers/test_with_flatten.py +++ b/thinc/tests/layers/test_with_flatten.py @@ -1,4 +1,5 @@ from typing import List + from thinc.api import Model, with_flatten_v2 INPUT = [[1, 2, 3], [4, 5], [], [6, 7, 8]] diff --git a/thinc/tests/layers/test_with_transforms.py b/thinc/tests/layers/test_with_transforms.py index c23db1463..82cdaed36 100644 --- a/thinc/tests/layers/test_with_transforms.py +++ b/thinc/tests/layers/test_with_transforms.py @@ -1,11 +1,20 @@ -import pytest import numpy import numpy.testing -from thinc.api import NumpyOps, Model, Linear, noop -from thinc.api import with_array2d, with_array, with_padded, with_list -from thinc.api import with_ragged, with_getitem -from thinc.types import Padded, Ragged +import pytest +from thinc.api import ( + Linear, + Model, + NumpyOps, + noop, + with_array, + with_array2d, + with_getitem, + with_list, + with_padded, + with_ragged, +) +from thinc.types import Padded, Ragged from ..util import get_data_checker diff --git a/thinc/tests/model/test_model.py 
b/thinc/tests/model/test_model.py index 733b3329f..f93b46c8c 100644 --- a/thinc/tests/model/test_model.py +++ b/thinc/tests/model/test_model.py @@ -1,13 +1,28 @@ -from collections import Counter -import pytest import threading import time -from thinc.api import Adam, CupyOps, Dropout, Linear, Model, Relu -from thinc.api import Shim, Softmax, chain, change_attr_values -from thinc.api import concatenate, set_dropout_rate -from thinc.api import use_ops, with_debug, wrap_model_recursive -from thinc.compat import has_cupy_gpu +from collections import Counter + import numpy +import pytest + +from thinc.api import ( + Adam, + CupyOps, + Dropout, + Linear, + Model, + Relu, + Shim, + Softmax, + chain, + change_attr_values, + concatenate, + set_dropout_rate, + use_ops, + with_debug, + wrap_model_recursive, +) +from thinc.compat import has_cupy_gpu from ..util import make_tempdir diff --git a/thinc/tests/model/test_validation.py b/thinc/tests/model/test_validation.py index adecdd6d5..c58efd015 100644 --- a/thinc/tests/model/test_validation.py +++ b/thinc/tests/model/test_validation.py @@ -1,6 +1,15 @@ import pytest -from thinc.api import chain, Relu, reduce_max, Softmax, with_ragged -from thinc.api import ParametricAttention, list2ragged, reduce_sum + +from thinc.api import ( + ParametricAttention, + Relu, + Softmax, + chain, + list2ragged, + reduce_max, + reduce_sum, + with_ragged, +) from thinc.util import DataValidationError, data_validation diff --git a/thinc/tests/mypy/modules/fail_no_plugin.py b/thinc/tests/mypy/modules/fail_no_plugin.py index 807fd672b..f53e33ef3 100644 --- a/thinc/tests/mypy/modules/fail_no_plugin.py +++ b/thinc/tests/mypy/modules/fail_no_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add +from thinc.api import Relu, Softmax, add, chain, reduce_max bad_model = chain(Relu(10), reduce_max(), Softmax()) diff --git a/thinc/tests/mypy/modules/fail_plugin.py b/thinc/tests/mypy/modules/fail_plugin.py index b14fcecf0..6f23c82b1 100644 --- a/thinc/tests/mypy/modules/fail_plugin.py +++ b/thinc/tests/mypy/modules/fail_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add, concatenate +from thinc.api import Relu, Softmax, add, chain, concatenate, reduce_max bad_model = chain(Relu(10), reduce_max(), Softmax()) diff --git a/thinc/tests/mypy/modules/success_no_plugin.py b/thinc/tests/mypy/modules/success_no_plugin.py index b17cff053..058573e5b 100644 --- a/thinc/tests/mypy/modules/success_no_plugin.py +++ b/thinc/tests/mypy/modules/success_no_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add +from thinc.api import Relu, Softmax, add, chain, reduce_max good_model = chain(Relu(10), Relu(10), Softmax()) reveal_type(good_model) diff --git a/thinc/tests/mypy/modules/success_plugin.py b/thinc/tests/mypy/modules/success_plugin.py index 85879a88a..3214bdcb7 100644 --- a/thinc/tests/mypy/modules/success_plugin.py +++ b/thinc/tests/mypy/modules/success_plugin.py @@ -1,6 +1,6 @@ from typing import Any, TypeVar -from thinc.api import chain, Relu, reduce_max, Softmax, add, Model +from thinc.api import Model, Relu, Softmax, add, chain, reduce_max good_model = chain(Relu(10), Relu(10), Softmax()) reveal_type(good_model) diff --git a/thinc/tests/mypy/test_mypy.py b/thinc/tests/mypy/test_mypy.py index 2f2976882..f144128f4 100644 --- a/thinc/tests/mypy/test_mypy.py +++ b/thinc/tests/mypy/test_mypy.py @@ -1,8 +1,8 @@ import os import re -from pathlib import Path import shutil import sys +from pathlib 
import Path import pytest diff --git a/thinc/tests/regression/issue519/program.py b/thinc/tests/regression/issue519/program.py index b3e6dc9ba..bce5f3234 100644 --- a/thinc/tests/regression/issue519/program.py +++ b/thinc/tests/regression/issue519/program.py @@ -1,4 +1,4 @@ -from thinc.api import chain, concatenate, Relu, Softmax +from thinc.api import Relu, Softmax, chain, concatenate from thinc.model import Model from thinc.types import Floats2d diff --git a/thinc/tests/regression/test_issue208.py b/thinc/tests/regression/test_issue208.py index 25d7280f1..0c574d6d1 100644 --- a/thinc/tests/regression/test_issue208.py +++ b/thinc/tests/regression/test_issue208.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Linear +from thinc.api import Linear, chain def test_issue208(): diff --git a/thinc/tests/shims/test_pytorch_grad_scaler.py b/thinc/tests/shims/test_pytorch_grad_scaler.py index 2ab0fa738..d4ac10fec 100644 --- a/thinc/tests/shims/test_pytorch_grad_scaler.py +++ b/thinc/tests/shims/test_pytorch_grad_scaler.py @@ -1,10 +1,10 @@ import pytest - from hypothesis import given, settings from hypothesis.strategies import lists, one_of, tuples + +from thinc.api import PyTorchGradScaler from thinc.compat import has_torch, has_torch_amp, has_torch_cuda_gpu, torch from thinc.util import is_torch_array -from thinc.api import PyTorchGradScaler from ..strategies import ndarrays diff --git a/thinc/tests/strategies.py b/thinc/tests/strategies.py index 322728cd9..bc12975aa 100644 --- a/thinc/tests/strategies.py +++ b/thinc/tests/strategies.py @@ -1,7 +1,8 @@ import numpy -from hypothesis.strategies import just, tuples, integers, floats from hypothesis.extra.numpy import arrays -from thinc.api import NumpyOps, Linear +from hypothesis.strategies import floats, integers, just, tuples + +from thinc.api import Linear, NumpyOps def get_ops(): diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index 0dceadfc4..fe2118e25 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -1,20 +1,21 @@ -import pytest -from typing import Iterable, Union, Optional, List, Callable, Dict, Any +import inspect +import pickle from types import GeneratorType -from pydantic import BaseModel, StrictBool, StrictFloat, PositiveInt, constr +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + import catalogue +import numpy +import pytest +from pydantic import BaseModel, PositiveInt, StrictBool, StrictFloat, constr + import thinc.config +from thinc.api import Config, Model, NumpyOps, RAdam from thinc.config import ConfigValidationError from thinc.types import Generator, Ragged -from thinc.api import Config, RAdam, Model, NumpyOps from thinc.util import partial -import numpy -import inspect -import pickle from .util import make_tempdir - EXAMPLE_CONFIG = """ [optimizer] @optimizers = "Adam.v1" diff --git a/thinc/tests/test_import__all__.py b/thinc/tests/test_import__all__.py index 226783ec2..fb0a08a20 100644 --- a/thinc/tests/test_import__all__.py +++ b/thinc/tests/test_import__all__.py @@ -1,9 +1,9 @@ import ast +import importlib from collections import namedtuple -from typing import Tuple, List +from typing import List, Tuple import pytest -import importlib _Import = namedtuple("_Import", ["module", "name", "alias"]) diff --git a/thinc/tests/test_indexing.py b/thinc/tests/test_indexing.py index 98fbc4437..2703e5dfa 100644 --- a/thinc/tests/test_indexing.py +++ b/thinc/tests/test_indexing.py @@ -1,7 +1,8 @@ -import pytest import numpy +import pytest from numpy.testing import 
assert_allclose -from thinc.types import Ragged, Pairs + +from thinc.types import Pairs, Ragged @pytest.fixture diff --git a/thinc/tests/test_initializers.py b/thinc/tests/test_initializers.py index 4f7c8f2cc..628398be0 100644 --- a/thinc/tests/test_initializers.py +++ b/thinc/tests/test_initializers.py @@ -1,8 +1,14 @@ +import numpy import pytest -from thinc.api import glorot_uniform_init, zero_init, uniform_init, normal_init -from thinc.api import NumpyOps + from thinc import registry -import numpy +from thinc.api import ( + NumpyOps, + glorot_uniform_init, + normal_init, + uniform_init, + zero_init, +) @pytest.mark.parametrize( diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 75206d240..fc100dd3a 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,8 +1,13 @@ -import pytest import numpy -from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance +import pytest + from thinc import registry +from thinc.api import ( + CategoricalCrossentropy, + CosineDistance, + L2Distance, + SequenceCategoricalCrossentropy, +) # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") diff --git a/thinc/tests/test_optimizers.py b/thinc/tests/test_optimizers.py index a31dbce32..4e336640b 100644 --- a/thinc/tests/test_optimizers.py +++ b/thinc/tests/test_optimizers.py @@ -1,6 +1,7 @@ -import pytest -from thinc.api import registry, Optimizer import numpy +import pytest + +from thinc.api import Optimizer, registry def _test_schedule_valid(): diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index d975d2dbd..31a8f4e3b 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,5 +1,12 @@ -from thinc.api import decaying, compounding, slanted_triangular, constant_then -from thinc.api import constant, warmup_linear, cyclic_triangular +from thinc.api import ( + compounding, + constant, + constant_then, + cyclic_triangular, + decaying, + slanted_triangular, + warmup_linear, +) def test_decaying_rate(): diff --git a/thinc/tests/test_serialize.py b/thinc/tests/test_serialize.py index b89fc2d94..a457cd237 100644 --- a/thinc/tests/test_serialize.py +++ b/thinc/tests/test_serialize.py @@ -1,7 +1,16 @@ import pytest import srsly -from thinc.api import with_array, Linear, Maxout, chain, Model, Shim -from thinc.api import serialize_attr, deserialize_attr + +from thinc.api import ( + Linear, + Maxout, + Model, + Shim, + chain, + deserialize_attr, + serialize_attr, + with_array, +) @pytest.fixture diff --git a/thinc/tests/test_types.py b/thinc/tests/test_types.py index 249ce2b80..ebfbb6fb6 100644 --- a/thinc/tests/test_types.py +++ b/thinc/tests/test_types.py @@ -1,8 +1,17 @@ import numpy -from pydantic import create_model, ValidationError -from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d -from thinc.types import Ints1d, Ints2d, Ints3d, Ints4d import pytest +from pydantic import ValidationError, create_model + +from thinc.types import ( + Floats1d, + Floats2d, + Floats3d, + Floats4d, + Ints1d, + Ints2d, + Ints3d, + Ints4d, +) @pytest.mark.parametrize( diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py index 8d2d0058d..77f6a7b86 100644 --- a/thinc/tests/test_util.py +++ b/thinc/tests/test_util.py @@ -1,11 +1,16 @@ -import pytest import numpy +import pytest from hypothesis import given -from thinc.api import get_width, Ragged, Padded -from thinc.util import get_array_module, is_numpy_array, to_categorical -from thinc.util import 
is_cupy_array -from thinc.util import convert_recursive + +from thinc.api import Padded, Ragged, get_width from thinc.types import ArgsKwargs +from thinc.util import ( + convert_recursive, + get_array_module, + is_cupy_array, + is_numpy_array, + to_categorical, +) from . import strategies diff --git a/thinc/tests/util.py b/thinc/tests/util.py index 7440a4b6e..defb9a2f6 100644 --- a/thinc/tests/util.py +++ b/thinc/tests/util.py @@ -1,10 +1,12 @@ import contextlib -from pathlib import Path -import tempfile import shutil -from thinc.api import Linear, Ragged, Padded, ArgsKwargs +import tempfile +from pathlib import Path + import numpy import pytest + +from thinc.api import ArgsKwargs, Linear, Padded, Ragged from thinc.util import has_cupy, is_cupy_array, is_numpy_array diff --git a/thinc/types.py b/thinc/types.py index c7e6a00f6..9a9487cb4 100644 --- a/thinc/types.py +++ b/thinc/types.py @@ -1,11 +1,28 @@ -from typing import Union, Tuple, Sized, Container, Any, TypeVar, Callable -from typing import Iterable, Iterator, Sequence, Dict, Generic, cast -from typing import Optional, List, overload +import sys from abc import abstractmethod from dataclasses import dataclass +from typing import ( + Any, + Callable, + Container, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Sized, + Tuple, + TypeVar, + Union, + cast, + overload, +) + import numpy -import sys -from .compat import has_cupy, cupy + +from .compat import cupy, has_cupy if has_cupy: get_array_module = cupy.get_array_module @@ -14,9 +31,9 @@ # Use typing_extensions for Python versions < 3.8 if sys.version_info < (3, 8): - from typing_extensions import Protocol, Literal + from typing_extensions import Literal, Protocol else: - from typing import Protocol, Literal # noqa: F401 + from typing import Literal, Protocol # noqa: F401 # fmt: off diff --git a/thinc/util.py b/thinc/util.py index aabab9ecb..9a1aaf65b 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,30 +1,55 @@ -from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping, Tuple -import numpy -import platform -from packaging.version import Version -import random +import contextlib import functools -from wasabi import table -from pydantic import create_model, ValidationError import inspect import os +import platform +import random import tempfile import threading -import contextlib from contextvars import ContextVar from dataclasses import dataclass -from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow -from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu -from .compat import has_torch_mps -from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) + +import numpy +from packaging.version import Version +from pydantic import ValidationError, create_model +from wasabi import table + +from .compat import ( + cupy, + cupy_from_dlpack, + has_cupy, + has_cupy_gpu, + has_gpu, + has_mxnet, + has_tensorflow, + has_torch, + has_torch_cuda_gpu, + has_torch_mps, +) +from .compat import mxnet as mx +from .compat import tensorflow as tf +from .compat import torch DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 -from . import types # noqa: E402 from typing import TYPE_CHECKING +from . 
import types # noqa: E402 +from .types import ArgsKwargs, ArrayXd, FloatsXd, IntsXd, Padded, Ragged # noqa: E402 + if TYPE_CHECKING: from .api import Ops @@ -174,7 +199,7 @@ def set_active_gpu(gpu_id: int) -> "cupy.cuda.Device": # pragma: no cover def require_cpu() -> bool: # pragma: no cover """Use CPU through best available backend.""" - from .backends import set_current_ops, get_ops + from .backends import get_ops, set_current_ops ops = get_ops("cpu") set_current_ops(ops) @@ -190,7 +215,7 @@ def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover def require_gpu(gpu_id: int = 0) -> bool: # pragma: no cover - from .backends import set_current_ops, CupyOps, MPSOps + from .backends import CupyOps, MPSOps, set_current_ops if platform.system() == "Darwin" and not has_torch_mps: if has_torch: From e1f12c55df0d1d0b241c57ccd1df3e87b7cccb17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 16 Jun 2023 00:31:20 +0200 Subject: [PATCH 27/48] CI: add black/isort/flake8 validate check (#881) --- .github/workflows/tests.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0a9687e92..81d060667 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,8 +12,35 @@ on: - "*.md" jobs: + validate: + name: Validate + if: github.repository_owner == 'explosion' + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: "3.7" + architecture: x64 + + - name: black + run: | + python -m pip install black -c requirements.txt + python -m black thinc --check + - name: isort + run: | + python -m pip install isort -c requirements.txt + python -m isort thinc --check + - name: flake8 + run: | + python -m pip install flake8==5.0.4 + python -m flake8 thinc --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics tests: name: Test + needs: Validate if: github.repository_owner == 'explosion' strategy: fail-fast: false From 36516645863b705523f9c8d7a12b8e482132133d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jul 2023 09:57:26 +0200 Subject: [PATCH 28/48] Switch CI badge to GHA (#884) --- README.md | 77 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 488f5c9db..47dc81701 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,17 @@ ### From the makers of [spaCy](https://spacy.io) and [Prodigy](https://prodi.gy) -[Thinc](https://thinc.ai) is a **lightweight deep learning library** that offers an elegant, -type-checked, functional-programming API for **composing models**, with support -for layers defined in other frameworks such as **PyTorch, TensorFlow and MXNet**. You -can use Thinc as an interface layer, a standalone toolkit or a flexible way to -develop new models. Previous versions of Thinc have been running quietly in -production in thousands of companies, via both [spaCy](https://spacy.io) and -[Prodigy](https://prodi.gy). We wrote the new version to let users **compose, -configure and deploy custom models** built with their favorite framework. 
- -[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/7/master.svg?logo=azure-pipelines&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=7) +[Thinc](https://thinc.ai) is a **lightweight deep learning library** that offers +an elegant, type-checked, functional-programming API for **composing models**, +with support for layers defined in other frameworks such as **PyTorch, +TensorFlow and MXNet**. You can use Thinc as an interface layer, a standalone +toolkit or a flexible way to develop new models. Previous versions of Thinc have +been running quietly in production in thousands of companies, via both +[spaCy](https://spacy.io) and [Prodigy](https://prodi.gy). We wrote the new +version to let users **compose, configure and deploy custom models** built with +their favorite framework. + +[![tests](https://github.com/explosion/thinc/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/thinc/actions/workflows/tests.yml) [![Current Release Version](https://img.shields.io/github/v/release/explosion/thinc.svg?include_prereleases&sort=semver&style=flat-square&logo=github)](https://github.com/explosion/thinc/releases) [![PyPi Version](https://img.shields.io/pypi/v/thinc.svg?include_prereleases&sort=semver&style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/thinc) [![conda Version](https://img.shields.io/conda/vn/conda-forge/thinc.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/thinc) @@ -23,9 +24,11 @@ configure and deploy custom models** built with their favorite framework. ## 🔥 Features -- **Type-check** your model definitions with custom types and [`mypy`](https://mypy.readthedocs.io/en/latest/) plugin. +- **Type-check** your model definitions with custom types and + [`mypy`](https://mypy.readthedocs.io/en/latest/) plugin. - Wrap **PyTorch**, **TensorFlow** and **MXNet** models for use in your network. -- Concise **functional-programming** approach to model definition, using composition rather than inheritance. +- Concise **functional-programming** approach to model definition, using + composition rather than inheritance. - Optional custom infix notation via **operator overloading**. - Integrated **config system** to describe trees of objects and hyperparameters. - Choice of **extensible backends**. @@ -33,8 +36,8 @@ configure and deploy custom models** built with their favorite framework. ## 🚀 Quickstart -Thinc is compatible with **Python 3.6+** and runs on **Linux**, -**macOS** and **Windows**. The latest releases with binary wheels are available from +Thinc is compatible with **Python 3.6+** and runs on **Linux**, **macOS** and +**Windows**. The latest releases with binary wheels are available from [pip](https://pypi.python.org/pypi/thinc). Before you install Thinc and its dependencies, make sure that your `pip`, `setuptools` and `wheel` are up to date. For the most recent releases, pip 19.3 or newer is recommended. @@ -44,15 +47,23 @@ pip install -U pip setuptools wheel pip install thinc ``` -See the [extended installation docs](https://thinc.ai/docs/install#extended) for details on optional dependencies for different backends and GPU. You might also want to [set up static type checking](https://thinc.ai/docs/install#type-checking) to take advantage of Thinc's type system. +See the [extended installation docs](https://thinc.ai/docs/install#extended) for +details on optional dependencies for different backends and GPU. 
You might also +want to +[set up static type checking](https://thinc.ai/docs/install#type-checking) to +take advantage of Thinc's type system. > ⚠️ If you have installed PyTorch and you are using Python 3.7+, uninstall the -> package `dataclasses` with `pip uninstall dataclasses`, since it may have -> been installed by PyTorch and is incompatible with Python 3.7+. +> package `dataclasses` with `pip uninstall dataclasses`, since it may have been +> installed by PyTorch and is incompatible with Python 3.7+. ### 📓 Selected examples and notebooks -Also see the [`/examples`](examples) directory and [usage documentation](https://thinc.ai/docs) for more examples. Most examples are Jupyter notebooks – to launch them on [Google Colab](https://colab.research.google.com) (with GPU support!) click on the button next to the notebook name. +Also see the [`/examples`](examples) directory and +[usage documentation](https://thinc.ai/docs) for more examples. Most examples +are Jupyter notebooks – to launch them on +[Google Colab](https://colab.research.google.com) (with GPU support!) click on +the button next to the notebook name. | Notebook | Description | | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -63,15 +74,20 @@ Also see the [`/examples`](examples) directory and [usage documentation](https:/ **[View more →](examples)** -[colab]: https://gistcdn.githack.com/ines/dcf354aa71a7665ae19871d7fd14a4e0/raw/461fc1f61a7bc5860f943cd4b6bcfabb8c8906e7/colab-badge.svg +[colab]: + https://gistcdn.githack.com/ines/dcf354aa71a7665ae19871d7fd14a4e0/raw/461fc1f61a7bc5860f943cd4b6bcfabb8c8906e7/colab-badge.svg [intro_to_thinc]: examples/00_intro_to_thinc.ipynb -[intro_to_thinc_colab]: https://colab.research.google.com/github/explosion/thinc/blob/master/examples/00_intro_to_thinc.ipynb +[intro_to_thinc_colab]: + https://colab.research.google.com/github/explosion/thinc/blob/master/examples/00_intro_to_thinc.ipynb [transformers_tagger_bert]: examples/02_transformers_tagger_bert.ipynb -[transformers_tagger_bert_colab]: https://colab.research.google.com/github/explosion/thinc/blob/master/examples/02_transformers_tagger_bert.ipynb +[transformers_tagger_bert_colab]: + https://colab.research.google.com/github/explosion/thinc/blob/master/examples/02_transformers_tagger_bert.ipynb [pos_tagger_basic_cnn]: examples/03_pos_tagger_basic_cnn.ipynb -[pos_tagger_basic_cnn_colab]: https://colab.research.google.com/github/explosion/thinc/blob/master/examples/03_pos_tagger_basic_cnn.ipynb +[pos_tagger_basic_cnn_colab]: + https://colab.research.google.com/github/explosion/thinc/blob/master/examples/03_pos_tagger_basic_cnn.ipynb [parallel_training_ray]: examples/04_parallel_training_ray.ipynb -[parallel_training_ray_colab]: https://colab.research.google.com/github/explosion/thinc/blob/master/examples/04_parallel_training_ray.ipynb +[parallel_training_ray_colab]: + https://colab.research.google.com/github/explosion/thinc/blob/master/examples/04_parallel_training_ray.ipynb ### 📖 Documentation & usage guides @@ -103,7 +119,12 @@ Also see the [`/examples`](examples) directory and [usage documentation](https:/ ## 🐍 Development notes -Thinc uses [`black`](https://github.com/psf/black) for auto-formatting, [`flake8`](http://flake8.pycqa.org/en/latest/) for linting and 
[`mypy`](https://mypy.readthedocs.io/en/latest/) for type checking. All code is written compatible with **Python 3.6+**, with type hints wherever possible. See the [type reference](https://thinc.ai/docs/api-types) for more details on Thinc's custom types. +Thinc uses [`black`](https://github.com/psf/black) for auto-formatting, +[`flake8`](http://flake8.pycqa.org/en/latest/) for linting and +[`mypy`](https://mypy.readthedocs.io/en/latest/) for type checking. All code is +written compatible with **Python 3.6+**, with type hints wherever possible. See +the [type reference](https://thinc.ai/docs/api-types) for more details on +Thinc's custom types. ### 👷‍♀️ Building Thinc from source @@ -138,7 +159,8 @@ python setup.py build_ext --inplace ### 🚦 Running tests -Thinc comes with an [extensive test suite](thinc/tests). The following should all pass and not report any warnings or errors: +Thinc comes with an [extensive test suite](thinc/tests). The following should +all pass and not report any warnings or errors: ```bash python -m pytest thinc # test suite @@ -146,4 +168,7 @@ python -m mypy thinc # type checks python -m flake8 thinc # linting ``` -To view test coverage, you can run `python -m pytest thinc --cov=thinc`. We aim for a 100% test coverage. This doesn't mean that we meticulously write tests for every single line – we ignore blocks that are not relevant or difficult to test and make sure that the tests execute all code paths. +To view test coverage, you can run `python -m pytest thinc --cov=thinc`. We aim +for a 100% test coverage. This doesn't mean that we meticulously write tests for +every single line – we ignore blocks that are not relevant or difficult to test +and make sure that the tests execute all code paths. From 5a52314330b1a4647f09acec4e7b52930bbc03d7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jul 2023 10:52:14 +0200 Subject: [PATCH 29/48] Update numpy build constraints for numpy 1.25 (#885) Starting in numpy 1.25 (see https://github.com/numpy/numpy/releases/tag/v1.25.0), the numpy C API is backwards-compatible by default. For python 3.9+, we should be able to drop the specific numpy build requirements and use `numpy>=1.25`, which is currently backwards-compatible to `numpy>=1.19`. In the future, the python <3.9 requirements could be dropped and the lower numpy pin could correspond to the oldest supported version for the current lower python pin. 
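For reference, a minimal sketch (not part of the committed changes) of how the environment markers used in these constraints behave, using the `packaging` library that is already a Thinc dependency; the Python 3.11 environment below is only an illustrative assumption:

```python
# Illustrative only: evaluate the version markers used in build-constraints.txt
# and pyproject.toml against a hypothetical Python 3.11 environment.
from packaging.markers import Marker

legacy_pin = Marker("python_version < '3.9'")   # guards the numpy>=1.15.0 pins
modern_pin = Marker("python_version >= '3.9'")  # guards the numpy>=1.25.0 pin

env = {"python_version": "3.11"}
assert not legacy_pin.evaluate(env)  # legacy pin is skipped on 3.11
assert modern_pin.evaluate(env)      # numpy>=1.25.0 pin applies on 3.11
```
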
--- build-constraints.txt | 5 +---- pyproject.toml | 3 ++- setup.cfg | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/build-constraints.txt b/build-constraints.txt index c1e82f1b0..5540d634d 100644 --- a/build-constraints.txt +++ b/build-constraints.txt @@ -3,7 +3,4 @@ numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64' numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' -numpy==1.19.3; python_version=='3.9' -numpy==1.21.3; python_version=='3.10' -numpy==1.23.2; python_version=='3.11' -numpy; python_version>='3.12' +numpy>=1.25.0; python_version>='3.9' diff --git a/pyproject.toml b/pyproject.toml index 1ad6782d2..aa7dd3a3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,8 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "blis>=0.7.8,<0.8.0", - "numpy>=1.15.0", + "numpy>=1.15.0; python_version < '3.9'", + "numpy>=1.25.0; python_version >= '3.9'", ] build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index 52477183f..d38e994fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,8 @@ install_requires = confection>=0.0.1,<1.0.0 # Third-party dependencies setuptools - numpy>=1.15.0 + numpy>=1.15.0; python_version < "3.9" + numpy>=1.19.0; python_version >= "3.9" pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 packaging>=20.0 # Backports of modern Python features From 7e146767d73250f32c9387990c9bc18068989de4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Jul 2023 11:13:42 +0200 Subject: [PATCH 30/48] Update requirements.txt for numpy 1.25 (#886) --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6ae0c270c..d0f8b055b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,8 @@ confection>=0.0.1,<1.0.0 ml_datasets>=0.2.0,<0.3.0; python_version < "3.11" # Third-party dependencies pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 -numpy>=1.15.0 +numpy>=1.15.0; python_version < "3.9" +numpy>=1.19.0; python_version >= "3.9" packaging>=20.0 # Backports of modern Python features dataclasses>=0.6,<1.0; python_version < "3.7" From 9a90a6e2cab4d7002eced5a4f52a1c0a2de1700e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 24 Jul 2023 17:15:17 +0200 Subject: [PATCH 31/48] Switch from distutils to setuptools/sysconfig (#888) Additionally remove outdated `is_new_osx` check and settings. 
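As a quick illustration (not part of the patch itself), the standard-library replacement for the removed distutils call looks like the sketch below; the printed paths are environment-dependent examples, not fixed values:

```python
# Minimal sketch of the swap: sysconfig.get_path("include") replaces
# distutils.sysconfig.get_python_inc(plat_specific=True) for locating the
# Python headers used when building the Cython extensions.
from sysconfig import get_path

import numpy

include_dirs = [numpy.get_include(), get_path("include")]
print(include_dirs)  # e.g. ['.../numpy/core/include', '.../include/python3.11']
```
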
--- setup.py | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index 30962ed83..028376c19 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,7 @@ #!/usr/bin/env python import sys -import distutils.util -from distutils.command.build_ext import build_ext -from distutils.sysconfig import get_python_inc +from setuptools.command.build_ext import build_ext +from sysconfig import get_path from setuptools import Extension, setup, find_packages from pathlib import Path import numpy @@ -36,30 +35,6 @@ LINK_OPTIONS = {"msvc": [], "other": []} -def is_new_osx(): - """Check whether we're on OSX >= 10.10""" - name = distutils.util.get_platform() - if sys.platform != "darwin": - return False - elif name.startswith("macosx-10"): - minor_version = int(name.split("-")[1].split(".")[1]) - if minor_version >= 7: - return True - else: - return False - else: - return False - - -if is_new_osx(): - # On Mac, use libc++ because Apple deprecated use of libstdc - COMPILE_OPTIONS["other"].append("-stdlib=libc++") - LINK_OPTIONS["other"].append("-lc++") - # g++ (used by unix compiler on mac) links to libstdc++ as a default lib. - # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc - LINK_OPTIONS["other"].append("-nodefaultlibs") - - # By subclassing build_extensions we have the actual compiler that will be used # which is really known only after finalize_options # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used @@ -100,7 +75,7 @@ def setup_package(): about = {} exec(f.read(), about) - include_dirs = [numpy.get_include(), get_python_inc(plat_specific=True)] + include_dirs = [numpy.get_include(), get_path("include")] ext_modules = [] for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" From 2fc7f8f6a528016e93f46d50e36a238554d038ab Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Jul 2023 08:36:13 +0200 Subject: [PATCH 32/48] Set version to v8.2.0.dev0 --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 2504236b7..5abd31409 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.10" +__version__ = "8.2.0.dev0" __release__ = True From eae8c782777394093d1ec567d014151143320867 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Aug 2023 14:38:12 +0200 Subject: [PATCH 33/48] Allow pydantic v2 using transitional v1 support (#891) --- requirements.txt | 2 +- setup.cfg | 2 +- thinc/tests/test_config.py | 6 +++++- thinc/tests/test_types.py | 7 ++++++- thinc/util.py | 8 ++++++-- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index d0f8b055b..b7682e738 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ catalogue>=2.0.4,<2.1.0 confection>=0.0.1,<1.0.0 ml_datasets>=0.2.0,<0.3.0; python_version < "3.11" # Third-party dependencies -pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index d38e994fb..f80422a8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = setuptools numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" - pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 packaging>=20.0 # Backports of modern Python features dataclasses>=0.6,<1.0; python_version < "3.7" 
diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index fe2118e25..a3f4ede46 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -6,7 +6,11 @@ import catalogue import numpy import pytest -from pydantic import BaseModel, PositiveInt, StrictBool, StrictFloat, constr + +try: + from pydantic.v1 import BaseModel, PositiveInt, StrictBool, StrictFloat, constr +except ImportError: + from pydantic import BaseModel, PositiveInt, StrictBool, StrictFloat, constr # type: ignore import thinc.config from thinc.api import Config, Model, NumpyOps, RAdam diff --git a/thinc/tests/test_types.py b/thinc/tests/test_types.py index ebfbb6fb6..738a309f9 100644 --- a/thinc/tests/test_types.py +++ b/thinc/tests/test_types.py @@ -1,6 +1,11 @@ import numpy import pytest -from pydantic import ValidationError, create_model + +try: + from pydantic.v1 import ValidationError, create_model +except ImportError: + from pydantic import ValidationError, create_model # type: ignore + from thinc.types import ( Floats1d, diff --git a/thinc/util.py b/thinc/util.py index 9a1aaf65b..04abe0170 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -24,7 +24,12 @@ import numpy from packaging.version import Version -from pydantic import ValidationError, create_model + +try: + from pydantic.v1 import ValidationError, create_model +except ImportError: + from pydantic import ValidationError, create_model # type: ignore + from wasabi import table from .compat import ( @@ -251,7 +256,6 @@ def to_categorical( *, label_smoothing: float = 0.0, ) -> FloatsXd: - if n_classes is None: n_classes = int(numpy.max(Y) + 1) # type: ignore From 049d37e24b0361a212212732eb1b68db4799732a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Aug 2023 15:04:30 +0200 Subject: [PATCH 34/48] Set version to v8.1.11 (#892) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 2504236b7..f9f48a287 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.10" +__version__ = "8.1.11" __release__ = True From 4db3879a18529f7f8e906a5880ab8d924d37477c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Aug 2023 09:37:44 +0200 Subject: [PATCH 35/48] Import mxnet and tensorflow only if explicitly enabled (#890) * Import mxnet and tensorflow only if explicitly enabled * Ignore import errors for mxnet/tensorflow in tests * Add enable_{mxnet,tensorflow} to thinc.api and docs * Update intro example notebook * Add warnings/info to docs * Add deprecation warnings to enable_ methods * Extend error messages in assert_{mxnet,tensorflow}_installed --- .github/workflows/tests.yml | 2 +- examples/00_intro_to_thinc.ipynb | 8 +++--- thinc/api.py | 4 ++- thinc/backends/_cupy_allocators.py | 2 +- thinc/compat.py | 40 +++++++++++++++++++++++------- thinc/layers/tensorflowwrapper.py | 1 + thinc/shims/mxnet.py | 1 + thinc/shims/tensorflow.py | 1 + thinc/tests/enable_mxnet.py | 6 +++++ thinc/tests/enable_tensorflow.py | 6 +++++ thinc/util.py | 40 ++++++++++++++++-------------- website/docs/api-layers.md | 22 +++++++++++++++- website/docs/api-util.md | 8 ++++++ website/docs/usage-frameworks.md | 10 ++++++++ 14 files changed, 116 insertions(+), 35 deletions(-) create mode 100644 thinc/tests/enable_mxnet.py create mode 100644 thinc/tests/enable_tensorflow.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 81d060667..4ad90be7d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -136,7 +136,7 @@ jobs: 
shell: bash --noprofile --norc -o pipefail {0} - name: Run tests with extras - run: python -m pytest --pyargs thinc --cov=thinc --cov-report=term + run: python -m pytest --pyargs thinc --cov=thinc --cov-report=term -p thinc.tests.enable_tensorflow -p thinc.tests.enable_mxnet - name: Run tests for thinc-apple-ops run: | diff --git a/examples/00_intro_to_thinc.ipynb b/examples/00_intro_to_thinc.ipynb index 80e0c25a6..c7fbf5292 100644 --- a/examples/00_intro_to_thinc.ipynb +++ b/examples/00_intro_to_thinc.ipynb @@ -23,7 +23,7 @@ }, "outputs": [], "source": [ - "!pip install \"thinc>=8.0.0\" \"ml_datasets>=0.2.0\" \"tqdm>=4.41\"" + "!pip install \"thinc>=8.2.0\" \"ml_datasets>=0.2.0\" \"tqdm>=4.41\"" ] }, { @@ -1050,7 +1050,8 @@ "source": [ "from tensorflow.keras.layers import Dense, Dropout\n", "from tensorflow.keras.models import Sequential\n", - "from thinc.api import TensorFlowWrapper, Adam\n", + "from thinc.api import enable_tensorflow, TensorFlowWrapper, Adam\n", + "enable_tensorflow()\n", "\n", "width = 32\n", "nO = 10\n", @@ -1373,8 +1374,9 @@ "outputs": [], "source": [ "from mxnet.gluon.nn import Dense, Sequential, Dropout\n", - "from thinc.api import MXNetWrapper, chain, Softmax\n", + "from thinc.api import enable_mxnet, MXNetWrapper, chain, Softmax\n", "import thinc.util\n", + "enable_mxnet()\n", "\n", "assert thinc.util.has_mxnet\n", "\n", diff --git a/thinc/api.py b/thinc/api.py index 6f795237a..b2bc346a0 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -11,7 +11,7 @@ use_pytorch_for_gpu_memory, use_tensorflow_for_gpu_memory, ) -from .compat import has_cupy +from .compat import enable_mxnet, enable_tensorflow, has_cupy from .config import Config, ConfigValidationError, registry from .initializers import ( configure_normal_init, @@ -190,6 +190,8 @@ "torch2xp", "xp2torch", "tensorflow2xp", "xp2tensorflow", "mxnet2xp", "xp2mxnet", "get_torch_default_device", # .compat + "enable_mxnet", + "enable_tensorflow", "has_cupy", # .backends "get_ops", "set_current_ops", "get_current_ops", "use_ops", diff --git a/thinc/backends/_cupy_allocators.py b/thinc/backends/_cupy_allocators.py index 77c958e36..09322ac00 100644 --- a/thinc/backends/_cupy_allocators.py +++ b/thinc/backends/_cupy_allocators.py @@ -12,7 +12,7 @@ def cupy_tensorflow_allocator(size_in_bytes: int): sitting in the other library's pool. """ size_in_bytes = max(1024, size_in_bytes) - tensor = tensorflow.zeros((size_in_bytes // 4,), dtype=tensorflow.dtypes.float32) + tensor = tensorflow.zeros((size_in_bytes // 4,), dtype=tensorflow.dtypes.float32) # type: ignore # We convert to cupy via dlpack, so that we can get a memory pointer. cupy_array = cast(ArrayXd, tensorflow2xp(tensor)) address = int(cupy_array.data) diff --git a/thinc/compat.py b/thinc/compat.py index 52a73669f..5d600796a 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -1,3 +1,5 @@ +import warnings + from packaging.version import Version try: # pragma: no cover @@ -50,25 +52,45 @@ has_torch_amp = False torch_version = Version("0.0.0") -try: # pragma: no cover + +def enable_tensorflow(): + warn_msg = ( + "Built-in TensorFlow support will be removed in Thinc v9. If you need " + "TensorFlow support in the future, you can transition to using a " + "custom copy of the current TensorFlowWrapper in your package or " + "project." 
+ ) + warnings.warn(warn_msg, DeprecationWarning) + global tensorflow, has_tensorflow, has_tensorflow_gpu import tensorflow import tensorflow.experimental.dlpack has_tensorflow = True has_tensorflow_gpu = len(tensorflow.config.get_visible_devices("GPU")) > 0 -except ImportError: # pragma: no cover - tensorflow = None - has_tensorflow = False - has_tensorflow_gpu = False -try: # pragma: no cover +tensorflow = None +has_tensorflow = False +has_tensorflow_gpu = False + + +def enable_mxnet(): + warn_msg = ( + "Built-in MXNet support will be removed in Thinc v9. If you need " + "MXNet support in the future, you can transition to using a " + "custom copy of the current MXNetWrapper in your package or " + "project." + ) + warnings.warn(warn_msg, DeprecationWarning) + global mxnet, has_mxnet import mxnet has_mxnet = True -except ImportError: # pragma: no cover - mxnet = None - has_mxnet = False + + +mxnet = None +has_mxnet = False + try: import h5py diff --git a/thinc/layers/tensorflowwrapper.py b/thinc/layers/tensorflowwrapper.py index a77e0b3af..302764c3f 100644 --- a/thinc/layers/tensorflowwrapper.py +++ b/thinc/layers/tensorflowwrapper.py @@ -1,3 +1,4 @@ +# mypy: ignore-errors from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar import srsly diff --git a/thinc/shims/mxnet.py b/thinc/shims/mxnet.py index 2dd36a62f..544db59e9 100644 --- a/thinc/shims/mxnet.py +++ b/thinc/shims/mxnet.py @@ -1,3 +1,4 @@ +# mypy: ignore-errors import copy from typing import Any, cast diff --git a/thinc/shims/tensorflow.py b/thinc/shims/tensorflow.py index bcaae3aac..0668ba92c 100644 --- a/thinc/shims/tensorflow.py +++ b/thinc/shims/tensorflow.py @@ -1,3 +1,4 @@ +# mypy: ignore-errors import contextlib import copy from io import BytesIO diff --git a/thinc/tests/enable_mxnet.py b/thinc/tests/enable_mxnet.py new file mode 100644 index 000000000..b7ccb3e6e --- /dev/null +++ b/thinc/tests/enable_mxnet.py @@ -0,0 +1,6 @@ +from thinc.compat import enable_mxnet + +try: + enable_mxnet() +except ImportError: + pass diff --git a/thinc/tests/enable_tensorflow.py b/thinc/tests/enable_tensorflow.py new file mode 100644 index 000000000..bd1ac7667 --- /dev/null +++ b/thinc/tests/enable_tensorflow.py @@ -0,0 +1,6 @@ +from thinc.compat import enable_tensorflow + +try: + enable_tensorflow() +except ImportError: + pass diff --git a/thinc/util.py b/thinc/util.py index 9a1aaf65b..ce8fcbb78 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -151,7 +151,7 @@ def is_torch_mps_array(obj: Any) -> bool: # pragma: no cover def is_tensorflow_array(obj: Any) -> bool: # pragma: no cover if not has_tensorflow: return False - elif isinstance(obj, tf.Tensor): + elif isinstance(obj, tf.Tensor): # type: ignore return True else: return False @@ -164,7 +164,7 @@ def is_tensorflow_gpu_array(obj: Any) -> bool: # pragma: no cover def is_mxnet_array(obj: Any) -> bool: # pragma: no cover if not has_mxnet: return False - elif isinstance(obj, mx.nd.NDArray): + elif isinstance(obj, mx.nd.NDArray): # type: ignore return True else: return False @@ -316,15 +316,17 @@ def get_width( def assert_tensorflow_installed() -> None: # pragma: no cover """Raise an ImportError if TensorFlow is not installed.""" - template = "TensorFlow support requires {pkg}: pip install thinc[tensorflow]" + template = "TensorFlow support requires {pkg}: pip install thinc[tensorflow]\n\nEnable TensorFlow support with thinc.api.enable_tensorflow()" if not has_tensorflow: - raise ImportError(template.format(pkg="tensorflow>=2.0.0")) + raise 
ImportError(template.format(pkg="tensorflow>=2.0.0,<2.6.0")) def assert_mxnet_installed() -> None: # pragma: no cover """Raise an ImportError if MXNet is not installed.""" if not has_mxnet: - raise ImportError("MXNet support requires mxnet: pip install thinc[mxnet]") + raise ImportError( + "MXNet support requires mxnet: pip install thinc[mxnet]\n\nEnable MXNet support with thinc.api.enable_mxnet()" + ) def assert_pytorch_installed() -> None: # pragma: no cover @@ -429,32 +431,32 @@ def torch2xp( def xp2tensorflow( xp_tensor: ArrayXd, requires_grad: bool = False, as_variable: bool = False -) -> "tf.Tensor": # pragma: no cover +) -> "tf.Tensor": # type: ignore # pragma: no cover """Convert a numpy or cupy tensor to a TensorFlow Tensor or Variable""" assert_tensorflow_installed() if hasattr(xp_tensor, "toDlpack"): dlpack_tensor = xp_tensor.toDlpack() # type: ignore - tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) + tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) # type: ignore elif hasattr(xp_tensor, "__dlpack__"): dlpack_tensor = xp_tensor.__dlpack__() # type: ignore - tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) + tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) # type: ignore else: - tf_tensor = tf.convert_to_tensor(xp_tensor) + tf_tensor = tf.convert_to_tensor(xp_tensor) # type: ignore if as_variable: # tf.Variable() automatically puts in GPU if available. # So we need to control it using the context manager - with tf.device(tf_tensor.device): - tf_tensor = tf.Variable(tf_tensor, trainable=requires_grad) + with tf.device(tf_tensor.device): # type: ignore + tf_tensor = tf.Variable(tf_tensor, trainable=requires_grad) # type: ignore if requires_grad is False and as_variable is False: # tf.stop_gradient() automatically puts in GPU if available. # So we need to control it using the context manager - with tf.device(tf_tensor.device): - tf_tensor = tf.stop_gradient(tf_tensor) + with tf.device(tf_tensor.device): # type: ignore + tf_tensor = tf.stop_gradient(tf_tensor) # type: ignore return tf_tensor def tensorflow2xp( - tf_tensor: "tf.Tensor", *, ops: Optional["Ops"] = None + tf_tensor: "tf.Tensor", *, ops: Optional["Ops"] = None # type: ignore ) -> ArrayXd: # pragma: no cover """Convert a Tensorflow tensor to numpy or cupy tensor depending on the `ops` parameter. If `ops` is `None`, the type of the resultant tensor will be determined by the source tensor's device. 
@@ -466,7 +468,7 @@ def tensorflow2xp( if isinstance(ops, NumpyOps): return tf_tensor.numpy() else: - dlpack_tensor = tf.experimental.dlpack.to_dlpack(tf_tensor) + dlpack_tensor = tf.experimental.dlpack.to_dlpack(tf_tensor) # type: ignore return cupy_from_dlpack(dlpack_tensor) else: if isinstance(ops, NumpyOps) or ops is None: @@ -477,21 +479,21 @@ def tensorflow2xp( def xp2mxnet( xp_tensor: ArrayXd, requires_grad: bool = False -) -> "mx.nd.NDArray": # pragma: no cover +) -> "mx.nd.NDArray": # type: ignore # pragma: no cover """Convert a numpy or cupy tensor to a MXNet tensor.""" assert_mxnet_installed() if hasattr(xp_tensor, "toDlpack"): dlpack_tensor = xp_tensor.toDlpack() # type: ignore - mx_tensor = mx.nd.from_dlpack(dlpack_tensor) + mx_tensor = mx.nd.from_dlpack(dlpack_tensor) # type: ignore else: - mx_tensor = mx.nd.from_numpy(xp_tensor) + mx_tensor = mx.nd.from_numpy(xp_tensor) # type: ignore if requires_grad: mx_tensor.attach_grad() return mx_tensor def mxnet2xp( - mx_tensor: "mx.nd.NDArray", *, ops: Optional["Ops"] = None + mx_tensor: "mx.nd.NDArray", *, ops: Optional["Ops"] = None # type: ignore ) -> ArrayXd: # pragma: no cover """Convert a MXNet tensor to a numpy or cupy tensor.""" from .api import NumpyOps diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index 45ad8c824..dbdde5b20 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -1003,7 +1003,7 @@ model, e.g. `chain(f, g)` computes `g(f(x))`. | Argument | Type | Description | | ----------- | -------------- | --------------------------------- | -| `layer1 ` | Model | The first model to compose. | +| `layer1` | Model | The first model to compose. | | `layer2` | Model | The second model to compose. | | `*layers` | Model | Any additional models to compose. | | **RETURNS** | Model | The composed feed-forward model. | @@ -1795,6 +1795,16 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/torchscriptwrapper.p + +In Thinc v8.2+, TensorFlow support is not enabled by default. To enable TensorFlow: + +```python +from thinc.api import enable_tensorflow +enable_tensorflow() +``` + + + Wrap a [TensorFlow](https://tensorflow.org) model, so that it has the same API as Thinc models. To optimize the model, you'll need to create a TensorFlow optimizer and call `optimizer.apply_gradients` after each batch. To allow @@ -1820,6 +1830,16 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/tensorflowwrapper.py + +In Thinc v8.2+, MXNet support is not enabled by default. To enable MXNet: + +```python +from thinc.api import enable_mxnet +enable_mxnet() +``` + + + Wrap a [MXNet](https://mxnet.apache.org/) model, so that it has the same API as Thinc models. To optimize the model, you'll need to create a MXNet optimizer and call `optimizer.step()` after each batch. To allow maximum flexibility, the diff --git a/website/docs/api-util.md b/website/docs/api-util.md index add7c12e1..c413f3503 100644 --- a/website/docs/api-util.md +++ b/website/docs/api-util.md @@ -141,6 +141,14 @@ Converts a class vector (integers) to binary class matrix. Based on | `label_smoothing` | float | Smoothing-coefficient for label-smoothing. | | **RETURNS** | Floats2d | A binary matrix representation of the input. The axis representing the classes is placed last. | +### enable_mxnet {#enable_mxnet tag="function" new="8.2.0"} + +Import and enable internal support for MXNet. + +### enable_tensorflow {#enable_tensorflow tag="function" new="8.2.0"} + +Import and enable internal support for TensorFlow. 
+ ### xp2torch {#xp2torch tag="function"} Convert a `numpy` or `cupy` tensor to a PyTorch tensor. diff --git a/website/docs/usage-frameworks.md b/website/docs/usage-frameworks.md index 50dbc3da2..ea0f215b1 100644 --- a/website/docs/usage-frameworks.md +++ b/website/docs/usage-frameworks.md @@ -81,6 +81,16 @@ Y, backprop = model(X, is_train=True) dX = backprop(Y) ``` + +In Thinc v8.2+, TensorFlow support is not enabled by default. To enable TensorFlow: + +```python +from thinc.api import enable_tensorflow +enable_tensorflow() +``` + + + ```python ### TensorFlow Example {highlight="6"} from thinc.api import TensorFlowWrapper, chain, Linear From 302296eca338b173e7ba7a71d5895fc387bddf0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 7 Aug 2023 13:29:53 +0200 Subject: [PATCH 36/48] Support zero-length batches and hidden sizes in reduce_{max,mean,sum} (#882) * Support zero-length batches and hidden sizes in reduce_{max,mean,sum} Before this change we would fail with an assertion, but it is valid to do reductions over zero-length arrays. (As long as the length of a sequence is not zero in the case of max and mean, but we check for that separately.) * Exhaustively test zero-length and zero dimension reductions * Update docs to describe all zero-length cases for reductions --- thinc/backends/numpy_ops.pyx | 47 +++++++++++++------ thinc/backends/ops.py | 10 ++-- thinc/tests/backends/test_ops.py | 79 ++++++++++++++++++++++++++++++++ website/docs/api-backends.md | 50 +++++++++++++------- 4 files changed, 151 insertions(+), 35 deletions(-) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index f64aa29dd..5ab4d0d8f 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -317,8 +317,11 @@ class NumpyOps(Ops): cdef int O = X.shape[1] cdef int T = X.shape[0] - assert B != 0 - assert O != 0 + if B == 0 or O == 0: + if reals2d_ft is float2d_t: + return numpy.zeros(shape=(B, O), dtype="float32") + else: + return numpy.zeros(shape=(B, O), dtype="float64") cdef np.ndarray means if reals2d_ft is float2d_t: @@ -340,8 +343,11 @@ class NumpyOps(Ops): raise ValueError(f"all sequence lengths must be >= 0, got {length}") T += length - assert T != 0 - assert O != 0 + if T == 0 or O == 0: + if reals2d_ft is float2d_t: + return numpy.zeros(shape=(T, O), dtype="float32") + else: + return numpy.zeros(shape=(T, O), dtype="float64") cdef np.ndarray dX if reals2d_ft is float2d_t: @@ -358,8 +364,11 @@ class NumpyOps(Ops): cdef int O = X.shape[1] cdef int T = X.shape[0] - assert B != 0 - assert O != 0 + if B == 0 or O == 0: + if reals2d_ft is float2d_t: + return numpy.zeros(shape=(B, O), dtype="float32") + else: + return numpy.zeros(shape=(B, O), dtype="float64") cdef np.ndarray sums if reals2d_ft is float2d_t: @@ -381,8 +390,11 @@ class NumpyOps(Ops): raise ValueError(f"all sequence lengths must be >= 0, got {length}") T += length - assert T != 0 - assert O != 0 + if T == 0 or O == 0: + if reals2d_ft is float2d_t: + return numpy.zeros(shape=(T, O), dtype="float32") + else: + return numpy.zeros(shape=(T, O), dtype="float64") cdef np.ndarray dX if reals2d_ft is float2d_t: @@ -399,12 +411,16 @@ class NumpyOps(Ops): cdef int O = X.shape[1] cdef int T = X.shape[0] - assert B != 0 - assert O != 0 - - cdef np.ndarray maxes # Needs to be zero-initialized as we start by assuming that the first element is the max value. 
cdef np.ndarray which = self.alloc(shape=(B, O), dtype="i", zeros=True) + + if B == 0 or O == 0: + if reals2d_ft is float2d_t: + return numpy.zeros(shape=(B, O), dtype="float32"), which + else: + return numpy.zeros(shape=(B, O), dtype="float64"), which + + cdef np.ndarray maxes if reals2d_ft is float2d_t: maxes = self.alloc(shape=(B, O), dtype="float32", zeros=False) cpu_reduce_max(maxes.data, which.data, &X[0, 0], &lengths[0], B, T, O) @@ -424,8 +440,11 @@ class NumpyOps(Ops): raise ValueError(f"all sequence lengths must be > 0, got {length}") T += length - assert T != 0 - assert O != 0 + if T == 0 or O == 0: + if reals2d_ft is float2d_t: + return numpy.zeros(shape=(T, O), dtype="float32") + else: + return numpy.zeros(shape=(T, O), dtype="float64") cdef np.ndarray dX if reals2d_ft is float2d_t: diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 01bb2f852..e3fec5c86 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -1289,8 +1289,10 @@ def reduce_max(self, X: Floats2d, lengths: Ints1d) -> Tuple[Floats2d, Ints2d]: def backprop_reduce_first( self, d_firsts: Floats2d, starts_ends: Ints1d ) -> Floats2d: - if starts_ends.size < 2: - raise ValueError(f"starts_ends should least have size 2") + if starts_ends.size == 0: + return self.alloc2f(0, d_firsts.shape[1], dtype=d_firsts.dtype, zeros=True) + elif starts_ends.size == 1: + raise ValueError(f"starts_ends must not have size 1") dX = self.alloc2f( int(starts_ends[-1]), d_firsts.shape[1], dtype=d_firsts.dtype, zeros=True ) @@ -1298,8 +1300,8 @@ def backprop_reduce_first( return dX def backprop_reduce_last(self, d_lasts: Floats2d, lasts: Ints1d) -> Floats2d: - if lasts.size < 1: - raise ValueError(f"lasts should least have size 2") + if lasts.size == 0: + return self.alloc2f(0, d_lasts.shape[1], dtype=d_lasts.dtype, zeros=True) dX = self.alloc2f( int(lasts[-1]) + 1, d_lasts.shape[1], dtype=d_lasts.dtype, zeros=True ) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index d5235ecc3..b867b14e4 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -41,6 +41,20 @@ FLOAT_TYPES = ["float32", "float64"] INT_TYPES = ["int32", "int64"] +REDUCTIONS = ["reduce_first", "reduce_last", "reduce_max", "reduce_mean", "reduce_sum"] + +REDUCE_ZERO_LENGTH_RAISES = [ + ("reduce_first", True), + ("reduce_last", True), + ("reduce_max", True), + # From a mathematical perspective we'd want mean reduction to raise for + # zero-length sequences, since floating point numbers are not a monoid + # under averaging. However, floret relies on reduce_mean to return a + # zero-vector in this case. 
+ ("reduce_mean", False), + ("reduce_sum", False), +] + def create_pytorch_funcs(): import math @@ -1077,6 +1091,71 @@ def test_backprop_reduce_mean(ops, dtype): ) +@pytest.mark.parametrize("ops", ALL_OPS) +@pytest.mark.parametrize("dtype", FLOAT_TYPES) +@pytest.mark.parametrize("reduction", REDUCTIONS) +def test_reduce_empty_batch(ops, dtype, reduction): + func = getattr(ops, reduction) + backprop_func = getattr(ops, f"backprop_{reduction}") + + lengths = ops.asarray1i([]) + Y = func(ops.alloc((0, 10), dtype=dtype), lengths) + + if reduction == "reduce_max": + Y, which = Y + dX = backprop_func(Y, which, lengths) + elif isinstance(Y, tuple): + Y, extra = Y + dX = backprop_func(Y, extra) + else: + dX = backprop_func(Y, lengths) + + assert Y.shape == (0, 10) + assert dX.shape == (0, 10) + + +@pytest.mark.parametrize("ops", ALL_OPS) +@pytest.mark.parametrize("dtype", FLOAT_TYPES) +@pytest.mark.parametrize("reduction", REDUCTIONS) +def test_reduce_empty_hidden(ops, dtype, reduction): + func = getattr(ops, reduction) + backprop_func = getattr(ops, f"backprop_{reduction}") + + lengths = ops.asarray1i([2, 3]) + Y = func(ops.alloc((5, 0), dtype=dtype), lengths) + + if reduction == "reduce_max": + Y, which = Y + dX = backprop_func(Y, which, lengths) + elif isinstance(Y, tuple): + Y, extra = Y + dX = backprop_func(Y, extra) + else: + dX = backprop_func(Y, lengths) + + assert Y.shape == (2, 0) + assert dX.shape == (5, 0) + + +@pytest.mark.parametrize("ops", ALL_OPS) +@pytest.mark.parametrize("dtype", FLOAT_TYPES) +@pytest.mark.parametrize("reduction_raises", REDUCE_ZERO_LENGTH_RAISES) +def test_reduce_zero_seq_length(ops, dtype, reduction_raises): + reduction_str, raises = reduction_raises + reduction = getattr(ops, reduction_str) + X = ops.asarray2f( + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [1.0, 2.0], [3.0, 4.0]], dtype=dtype + ) + lengths = ops.asarray1i([3, 0, 2]) + + if raises: + with pytest.raises(ValueError): + reduction(X, lengths) + else: + # All non-raising reductions have zero as their identity element. + ops.xp.testing.assert_allclose(reduction(X, lengths)[1], [0.0, 0.0]) + + @pytest.mark.parametrize("ops", ALL_OPS) @settings(max_examples=MAX_EXAMPLES, deadline=None) @given(X=strategies.arrays_BI()) diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md index c5a54cff8..fc69a775d 100644 --- a/website/docs/api-backends.md +++ b/website/docs/api-backends.md @@ -937,9 +937,10 @@ Backpropagate the Swish activation -Dish or "Daniël's Swish-like activation" is an activation function with a non-monotinic shape similar to -[GELU](#gelu), [Swish](#swish) and [Mish](#mish). However, Dish does not rely on -elementary functions like `exp` or `erf`, making it much +Dish or "Daniël's Swish-like activation" is an activation function with a +non-monotinic shape similar to [GELU](#gelu), [Swish](#swish) and [Mish](#mish). +However, Dish does not rely on elementary functions like `exp` or `erf`, making +it much [faster to compute](https://twitter.com/danieldekok/status/1484898130441166853) in most cases. @@ -1264,9 +1265,12 @@ Backpropagate the hard Swish MobileNet activation. -Perform sequence-wise first pooling for data in the ragged format. Zero-length -sequences are not allowed. A `ValueError` is raised if any element in `lengths` -is zero. +Perform sequence-wise first pooling for data in the ragged format. + +- Zero-length sequences are not allowed. A `ValueError` is raised if any element + in `lengths` is zero. +- Batch and hidden dimensions can have a size of zero. 
In these cases the + corresponding dimensions in the output also have a size of zero. | Argument | Type | Description | | ----------- | ------------------------------- | --------------------------------------------------------------------- | @@ -1302,9 +1306,12 @@ Backpropagate the `reduce_first` operation. -Perform sequence-wise last pooling for data in the ragged format. Zero-length -sequences are not allowed. A `ValueError` is raised if any element in `lengths` -is zero. +Perform sequence-wise last pooling for data in the ragged format. + +- Zero-length sequences are not allowed. A `ValueError` is raised if any element + in `lengths` is zero. +- Batch and hidden dimensions can have a size of zero. In these cases the + corresponding dimensions in the output also have a size of zero. | Argument | Type | Description | | ----------- | ------------------------------- | ------------------------------------------------------------------------------- | @@ -1340,8 +1347,11 @@ Backpropagate the `reduce_last` operation. -Perform sequence-wise summation for data in the ragged format. Zero-length -sequences are reduced to the zero vector. +Perform sequence-wise summation for data in the ragged format. + +- Zero-length sequences are reduced to all-zero vectors. +- Batch and hidden dimensions can have a size of zero. In these cases the + corresponding dimensions in the output also have a size of zero. | Argument | Type | Description | | ----------- | ----------------- | ----------------------------- | @@ -1377,8 +1387,11 @@ Backpropagate the `reduce_sum` operation. -Perform sequence-wise averaging for data in the ragged format. Zero-length -sequences are reduced to the zero vector. +Perform sequence-wise averaging for data in the ragged format. + +- Zero-length sequences are reduced to all-zero vectors. +- Batch and hidden dimensions can have a size of zero. In these cases the + corresponding dimensions in the output also have a size of zero. | Argument | Type | Description | | ----------- | ----------------- | --------------------------- | @@ -1415,8 +1428,12 @@ Backpropagate the `reduce_mean` operation. Perform sequence-wise max pooling for data in the ragged format. Zero-length -sequences are not allowed. A `ValueError` is raised if any element in `lengths` -is zero. +sequences are not allowed. + +- Zero-length sequences are not allowed. A `ValueError` is raised if any element + in `lengths` is zero. +- Batch and hidden dimensions can have a size of zero. In these cases the + corresponding dimensions in the output also have a size of zero. | Argument | Type | Description | | ----------- | -------------------------------- | --------------------------- | @@ -1434,8 +1451,7 @@ is zero. -Backpropagate the `reduce_max` operation. A `ValueError` is raised if any -element in `lengths` is zero. +Backpropagate the `reduce_max` operation. | Argument | Type | Description | | ----------- | ----------------- | ------------------------------------------- | From 37419e5a1bb57349f0299038fe686c5c3fc1f539 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Aug 2023 08:59:43 +0200 Subject: [PATCH 37/48] Preserve values with dtype for NumpyOps/CupyOps.asarray (#897) * Preserve values with dtype for NumpyOps/CupyOps.asarray Always specify `dtype` when creating new arrays so that large integer values are preserved and not at risk of going through an intermediate `float64` conversion. 
* Fix integer conversions for strings2arrays * Fix types and shape casting in strings2arrays * Format * Rename list in test * Pass dtype=None --- thinc/backends/cupy_ops.py | 2 +- thinc/backends/numpy_ops.pyx | 2 +- thinc/layers/strings2arrays.py | 6 ++++-- thinc/tests/backends/test_ops.py | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 366faf70a..1e1e5b92b 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -94,7 +94,7 @@ def asarray(self, data, dtype=None): elif is_mxnet_gpu_array(data): array = mxnet2xp(data) else: - array = self.xp.array(data) + array = self.xp.array(data, dtype=dtype) if dtype is not None: array = array.astype(dtype=dtype, copy=False) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 5ab4d0d8f..4ecad4271 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -72,7 +72,7 @@ class NumpyOps(Ops): elif hasattr(data, "get"): array = data.get() else: - array = self.xp.array(data) + array = self.xp.array(data, dtype=dtype) if dtype is not None: array = array.astype(dtype=dtype, copy=False) diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index 91a6b1a31..ed40b1e88 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -17,8 +17,10 @@ def strings2arrays() -> Model[InT, OutT]: def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]: - hashes = [[hash_unicode(word) for word in X] for X in Xs] - hash_arrays = [model.ops.asarray2i(h, dtype="uint64") for h in hashes] + hashes = model.ops.asarray2i( + [[hash_unicode(word) for word in X] for X in Xs], dtype="int32" + ) + hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes] arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays] def backprop(dX: OutT) -> InT: diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index b867b14e4..9f03c0438 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1597,3 +1597,10 @@ def test_custom_kernel_compilation(): assert compiled_kernel is not None assert compile_mmh() is not None + + +@pytest.mark.parametrize("ops", ALL_OPS) +def test_asarray_from_list_uint64(ops): + # list contains int values both above and below int64.max + uint64_list = [16, 11648197037703959513] + assert uint64_list == list(ops.asarray(uint64_list, dtype="uint64")) From cc91edcfc86fe8b684b1dcebd04fce08893b6ae4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Aug 2023 09:15:00 +0200 Subject: [PATCH 38/48] Set version to v8.1.12 (#898) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index f9f48a287..a55cc7b91 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.11" +__version__ = "8.1.12" __release__ = True From 7a31415b87448dac26fb8db0bdbf2e6eb5a2a22b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Aug 2023 09:55:31 +0200 Subject: [PATCH 39/48] Set version to v8.2.0 (#900) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 5abd31409..c2f698250 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.2.0.dev0" +__version__ = "8.2.0" __release__ = True From 9d3ec3b941b0de573bfb4bced395805922bcaa04 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 08:39:29 
+0200 Subject: [PATCH 40/48] Redesign cython profile defaults due to python 3.12 (#904) Profiling support for python 3.12 will not be available in cython 0.29, so toggle internal defaults to be able to disable profiling for python 3.12 completely in `setup.py`. The cython `profile` compiler directive in `setup.py` is overridden by any file-specific or function-specific settings. * Swap file-specific `profile` settings to `False` * In setup, set `profile` default to: * `True` for python < 3.12 * `False` for python >= 3.12 --- setup.py | 7 +++++-- thinc/backends/cblas.pyx | 1 + thinc/backends/linalg.pyx | 1 + thinc/backends/numpy_ops.pyx | 1 - thinc/extra/search.pyx | 2 +- thinc/extra/tests/c_test_search.pyx | 1 + thinc/layers/premap_ids.pyx | 2 +- thinc/layers/sparselinear.pyx | 2 +- 8 files changed, 11 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 028376c19..d2c717be9 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ "thinc.backends.numpy_ops", "thinc.extra.search", "thinc.layers.sparselinear", - "thinc.layers.premap_ids" + "thinc.layers.premap_ids", ] COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], @@ -31,6 +31,7 @@ "language_level": 3, "embedsignature": True, "annotation_typing": False, + "profile": sys.version_info < (3, 12), } LINK_OPTIONS = {"msvc": [], "other": []} @@ -82,7 +83,9 @@ def setup_package(): ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) ext_modules.append(ext) print("Cythonizing sources") - ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES, language_level=2) + ext_modules = cythonize( + ext_modules, compiler_directives=COMPILER_DIRECTIVES, language_level=2 + ) setup( name="thinc", diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx index 9eb4514d8..e35169417 100644 --- a/thinc/backends/cblas.pyx +++ b/thinc/backends/cblas.pyx @@ -1,3 +1,4 @@ +# cython: profile=False cimport blis.cy from cython.operator cimport dereference as deref from libcpp.memory cimport make_shared diff --git a/thinc/backends/linalg.pyx b/thinc/backends/linalg.pyx index 4979e8aa9..64a360731 100644 --- a/thinc/backends/linalg.pyx +++ b/thinc/backends/linalg.pyx @@ -1,3 +1,4 @@ +# cython: profile=False try: import blis.py except ImportError: diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 4ecad4271..78eee6ada 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -1,6 +1,5 @@ # cython: cdivision=True # cython: infer_types=True -# cython: profile=True from collections.abc import Sized from typing import Optional diff --git a/thinc/extra/search.pyx b/thinc/extra/search.pyx index 71cc85d8b..651e6ff04 100644 --- a/thinc/extra/search.pyx +++ b/thinc/extra/search.pyx @@ -1,4 +1,4 @@ -# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True +# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True cimport cython from libc.math cimport exp, log from libc.string cimport memcpy, memset diff --git a/thinc/extra/tests/c_test_search.pyx b/thinc/extra/tests/c_test_search.pyx index 70cdf5745..81327f5a9 100644 --- a/thinc/extra/tests/c_test_search.pyx +++ b/thinc/extra/tests/c_test_search.pyx @@ -1,3 +1,4 @@ +# cython: profile=False from cymem.cymem cimport Pool from thinc.extra.search cimport Beam diff --git a/thinc/layers/premap_ids.pyx b/thinc/layers/premap_ids.pyx index 17acafa8e..6473a0338 100644 --- a/thinc/layers/premap_ids.pyx +++ b/thinc/layers/premap_ids.pyx @@ -1,4 +1,4 @@ -# cython: binding=True, 
infer_types=True +# cython: binding=True, infer_types=True, profile=False import numpy from preshed.maps cimport PreshMap diff --git a/thinc/layers/sparselinear.pyx b/thinc/layers/sparselinear.pyx index a1be75ccc..84c17330f 100644 --- a/thinc/layers/sparselinear.pyx +++ b/thinc/layers/sparselinear.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, cdivision=True, bounds_check=False, wraparound=False +# cython: infer_types=True, cdivision=True, bounds_check=False, wraparound=False, profile=False cimport cython cimport numpy as np from libc.stdint cimport int32_t, uint32_t, uint64_t From ea451e481a589c206c1460f5ff966c6859ff6d7f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 09:20:57 +0200 Subject: [PATCH 41/48] CI: Add python 3.12.0rc2 (#905) * CI: Add python 3.12.0rc2 * Skip notebook test for python 3.12 * Skip mxnet for python 3.12 --- .github/workflows/tests.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4ad90be7d..c3c4ab666 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -46,7 +46,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python_version: ["3.11"] + python_version: ["3.11", "3.12.0-rc.2"] include: - os: windows-2019 python_version: "3.6" @@ -106,11 +106,18 @@ jobs: - name: Test import run: python -c "import thinc" - - name: Run tests without extras + - name: Install test requirements run: | pip install -r requirements.txt + + - name: Install notebook test requirements + run: | pip install ipykernel pydot graphviz python -m ipykernel install --name thinc-notebook-tests --user + if: matrix.python_version != '3.12.0-rc.2' + + - name: Run tests without extras + run: | python -m pytest --pyargs thinc -Werror --cov=thinc --cov-report=term # Notes on numpy requirements hacks: @@ -127,7 +134,7 @@ jobs: - name: Install extras for testing run: | pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" - pip install "mxnet; sys_platform != 'win32'" + pip install "mxnet; sys_platform != 'win32' and python_version < '3.12'" pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" pip install "numpy<1.24.0" From cf51ac568d3df5e9926638b222842eb94b4f38b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 Sep 2023 10:02:58 +0200 Subject: [PATCH 42/48] Set version to v8.2.1, update for python 3.12 (#906) --- setup.cfg | 1 + thinc/about.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f80422a8c..9125d6304 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,6 +23,7 @@ classifiers = Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 Topic :: Scientific/Engineering [options] diff --git a/thinc/about.py b/thinc/about.py index c2f698250..b358a590d 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.2.0" +__version__ = "8.2.1" __release__ = True From 1df2033360792ea877d1fa3080018f669c6887de Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Nov 2023 07:33:20 +0100 Subject: [PATCH 43/48] Repo: update issue-manager version (#908) --- .github/workflows/issue-manager.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 23cf446da..7c15e00b0 
100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -15,7 +15,7 @@ jobs: issue-manager: runs-on: ubuntu-latest steps: - - uses: tiangolo/issue-manager@0.2.1 + - uses: tiangolo/issue-manager@0.4.0 with: token: ${{ secrets.GITHUB_TOKEN }} config: > @@ -26,4 +26,4 @@ jobs: "remove_label_on_comment": true, "remove_label_on_close": true } - } \ No newline at end of file + } From a180b5db565fdcaba4b926039008d30448974b13 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Nov 2023 07:35:19 +0100 Subject: [PATCH 44/48] CI: Remove autoblack workflow (#909) --- .github/workflows/autoblack.yml | 44 --------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 .github/workflows/autoblack.yml diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml deleted file mode 100644 index a483dadb1..000000000 --- a/.github/workflows/autoblack.yml +++ /dev/null @@ -1,44 +0,0 @@ -# GitHub Action that uses Black to reformat all Python code and submits a PR -# in regular intervals. Inspired by: https://github.com/cclauss/autoblack - -name: autoblack -on: - workflow_dispatch: # allow manual trigger - schedule: - - cron: '0 8 * * 5' # every Friday at 8am UTC - -jobs: - autoblack: - if: github.repository_owner == 'explosion' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.head_ref }} - - uses: actions/setup-python@v4 - - run: pip install black -c requirements.txt - - name: Auto-format code if needed - run: black thinc - # We can't run black --check here because that returns a non-zero excit - # code and makes GitHub think the action failed - - name: Check for modified files - id: git-check - run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT - - name: Create Pull Request - if: steps.git-check.outputs.modified == 'true' - uses: peter-evans/create-pull-request@v3 - with: - title: Auto-format code with black - labels: meta - commit-message: Auto-format code with black - committer: GitHub - author: explosion-bot - body: _This PR is auto-generated._ - branch: autoblack - delete-branch: true - draft: false - - name: Check outputs - if: steps.git-check.outputs.modified == 'true' - run: | - echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" - echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" From 05515c5b5e9b416f055a50d12d39179cf3a364f4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Nov 2023 09:17:56 +0100 Subject: [PATCH 45/48] CI: Use stable python 3.12 (#910) * CI: Use stable python 3.12 * Require future version of torch for macos --- .github/workflows/tests.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c3c4ab666..035be0baf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -46,7 +46,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python_version: ["3.11", "3.12.0-rc.2"] + python_version: ["3.12"] include: - os: windows-2019 python_version: "3.6" @@ -58,6 +58,8 @@ jobs: python_version: "3.9" - os: macos-latest python_version: "3.10" + - os: ubuntu-latest + python_version: "3.11" runs-on: ${{ matrix.os }} env: @@ -114,7 +116,7 @@ jobs: run: | pip install ipykernel pydot graphviz python -m ipykernel install --name thinc-notebook-tests --user - if: matrix.python_version != '3.12.0-rc.2' + if: 
matrix.python_version != '3.12' - name: Run tests without extras run: | @@ -135,7 +137,11 @@ jobs: run: | pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" pip install "mxnet; sys_platform != 'win32' and python_version < '3.12'" - pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu + pip install "torch!=1.13.0; sys_platform!='darwin'" --extra-index-url https://download.pytorch.org/whl/cpu + # there is a bug related to MPS devices in github macos runners that + # will be fixed in torch v2.1.1 + # https://github.com/pytorch/pytorch/pull/111576 + pip install "torch>=2.1.1; sys_platform=='darwin'" --extra-index-url https://download.pytorch.org/whl/cpu pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" pip install "numpy<1.24.0" pip install -r requirements.txt From c16f552f46d6e47d05d20669fa29307c72a59e19 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Nov 2023 16:28:17 +0100 Subject: [PATCH 46/48] Docs quickstart: replace cuda-autodetect with specific extras (#911) --- setup.cfg | 2 ++ website/docs/_quickstart.json | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9125d6304..fc154e629 100644 --- a/setup.cfg +++ b/setup.cfg @@ -98,6 +98,8 @@ cuda117 = cupy-cuda117>=5.0.0b4 cuda11x = cupy-cuda11x>=11.0.0 +cuda12x = + cupy-cuda12x>=11.5.0 cuda-autodetect = cupy-wheel>=11.0.0 datasets = diff --git a/website/docs/_quickstart.json b/website/docs/_quickstart.json index bed6629a8..aad0cdff7 100644 --- a/website/docs/_quickstart.json +++ b/website/docs/_quickstart.json @@ -12,7 +12,11 @@ { "label": "9.2", "value": "cuda92" }, { "label": "10.0", "value": "cuda100" }, { "label": "10.1", "value": "cuda101" }, - { "label": "10.2, 11.0+", "value": "cuda-autodetect" } + { "label": "10.2", "value": "cuda102" }, + { "label": "11.0", "value": "cuda110" }, + { "label": "11.1", "value": "cuda111" }, + { "label": "11.2-11.x", "value": "cuda11x" }, + { "label": "12.x", "value": "cuda12x" } ] }, { From 88dc49d6569303d193238b500c53f9bc1fdc97e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 14 Dec 2023 11:08:07 +0100 Subject: [PATCH 47/48] Add ParametricAttention.v2 (#913) * Add ParametricAttention.v2 This layer is an extension of the existing `ParametricAttention` layer, adding support for transformations (such as a non-linear layer) of the key representation. This brings the model closer to the paper that suggested it (Yang et al, 2016) and gave slightly better results in experiments. 
* Use `noop` for when `key_transform` is `None` * Remove stray import * Add constant for key transform ref * Check that we correctly set the key transform * isooooooort * Update citation to ACL link Co-authored-by: Adriane Boyd --------- Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd --- thinc/api.py | 3 +- thinc/layers/__init__.py | 2 + thinc/layers/parametricattention_v2.py | 100 ++++++++++++++++++ thinc/tests/layers/test_layers_api.py | 2 + .../layers/test_parametric_attention_v2.py | 10 ++ website/docs/api-layers.md | 38 +++++++ 6 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 thinc/layers/parametricattention_v2.py create mode 100644 thinc/tests/layers/test_parametric_attention_v2.py diff --git a/thinc/api.py b/thinc/api.py index b2bc346a0..204aa386e 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -41,6 +41,7 @@ MultiSoftmax, MXNetWrapper, ParametricAttention, + ParametricAttention_v2, PyTorchLSTM, PyTorchRNNWrapper, PyTorchWrapper, @@ -207,7 +208,7 @@ "PyTorchWrapper", "PyTorchRNNWrapper", "PyTorchLSTM", "TensorFlowWrapper", "keras_subclass", "MXNetWrapper", "PyTorchWrapper_v2", "Softmax_v2", "PyTorchWrapper_v3", - "SparseLinear_v2", "TorchScriptWrapper_v1", + "SparseLinear_v2", "TorchScriptWrapper_v1", "ParametricAttention_v2", "add", "bidirectional", "chain", "clone", "concatenate", "noop", "residual", "uniqued", "siamese", "list2ragged", "ragged2list", diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 032af5fde..841e6c072 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -35,6 +35,7 @@ from .noop import noop from .padded2list import padded2list from .parametricattention import ParametricAttention +from .parametricattention_v2 import ParametricAttention_v2 from .premap_ids import premap_ids from .pytorchwrapper import ( PyTorchRNNWrapper, @@ -94,6 +95,7 @@ "Mish", "MultiSoftmax", "ParametricAttention", + "ParametricAttention_v2", "PyTorchLSTM", "PyTorchWrapper", "PyTorchWrapper_v2", diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py new file mode 100644 index 000000000..e252dd7d2 --- /dev/null +++ b/thinc/layers/parametricattention_v2.py @@ -0,0 +1,100 @@ +from typing import Callable, Optional, Tuple, cast + +from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged +from ..util import get_width +from .noop import noop + +InT = Ragged +OutT = Ragged + +KEY_TRANSFORM_REF: str = "key_transform" + + +@registry.layers("ParametricAttention.v2") +def ParametricAttention_v2( + *, + key_transform: Optional[Model[Floats2d, Floats2d]] = None, + nO: Optional[int] = None +) -> Model[InT, OutT]: + if key_transform is None: + key_transform = noop() + + """Weight inputs by similarity to a learned vector""" + return Model( + "para-attn", + forward, + init=init, + params={"Q": None}, + dims={"nO": nO}, + refs={KEY_TRANSFORM_REF: key_transform}, + layers=[key_transform], + ) + + +def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: + Q = model.get_param("Q") + key_transform = model.get_ref(KEY_TRANSFORM_REF) + + attention, bp_attention = _get_attention( + model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train + ) + output, bp_output = _apply_attention(model.ops, attention, Xr.dataXd, Xr.lengths) + + def backprop(dYr: OutT) -> InT: + dX, d_attention = bp_output(dYr.dataXd) + dQ, dX2 = bp_attention(d_attention) + model.inc_grad("Q", dQ.ravel()) + dX += dX2 + return Ragged(dX, dYr.lengths) + + return 
Ragged(output, Xr.lengths), backprop + + +def init( + model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None +) -> None: + key_transform = model.get_ref(KEY_TRANSFORM_REF) + width = get_width(X) if X is not None else None + if width: + model.set_dim("nO", width) + if key_transform.has_dim("nO"): + key_transform.set_dim("nO", width) + + # Randomly initialize the parameter, as though it were an embedding. + Q = model.ops.alloc1f(model.get_dim("nO")) + Q += model.ops.xp.random.uniform(-0.1, 0.1, Q.shape) + model.set_param("Q", Q) + + X_array = X.dataXd if X is not None else None + Y_array = Y.dataXd if Y is not None else None + + key_transform.initialize(X_array, Y_array) + + +def _get_attention(ops, Q, key_transform, X, lengths, is_train): + K, K_bp = key_transform(X, is_train=is_train) + + attention = ops.gemm(K, ops.reshape2f(Q, -1, 1)) + attention = ops.softmax_sequences(attention, lengths) + + def get_attention_bwd(d_attention): + d_attention = ops.backprop_softmax_sequences(d_attention, attention, lengths) + dQ = ops.gemm(K, d_attention, trans1=True) + dY = ops.xp.outer(d_attention, Q) + dX = K_bp(dY) + return dQ, dX + + return attention, get_attention_bwd + + +def _apply_attention(ops, attention, X, lengths): + output = X * attention + + def apply_attention_bwd(d_output): + d_attention = (X * d_output).sum(axis=1, keepdims=True) + dX = d_output * attention + return dX, d_attention + + return output, apply_attention_bwd diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 0ef559d96..046d98940 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -129,6 +129,8 @@ def assert_data_match(Y, out_data): ("MultiSoftmax.v1", {"nOs": (1, 3)}, array2d, array2d), # ("CauchySimilarity.v1", {}, (array2d, array2d), array1d), ("ParametricAttention.v1", {}, ragged, ragged), + ("ParametricAttention.v2", {}, ragged, ragged), + ("ParametricAttention.v2", {"key_transform": {"@layers": "Gelu.v1"}}, ragged, ragged), ("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("SparseLinear.v2", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint), diff --git a/thinc/tests/layers/test_parametric_attention_v2.py b/thinc/tests/layers/test_parametric_attention_v2.py new file mode 100644 index 000000000..fd88880f4 --- /dev/null +++ b/thinc/tests/layers/test_parametric_attention_v2.py @@ -0,0 +1,10 @@ +from thinc.layers.gelu import Gelu +from thinc.layers.parametricattention_v2 import ( + KEY_TRANSFORM_REF, + ParametricAttention_v2, +) + + +def test_key_transform_used(): + attn = ParametricAttention_v2(key_transform=Gelu()) + assert attn.get_ref(KEY_TRANSFORM_REF).name == "gelu" diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index dbdde5b20..442ecb463 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -686,6 +686,44 @@ attention mechanism. https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention.py ``` +### ParametricAttention_v2 {#parametricattention_v2 tag="function"} + + + +- **Input:** Ragged +- **Output:** Ragged +- **Parameters:** Q + + + +A layer that uses the parametric attention scheme described by +[Yang et al. (2016)](https://aclanthology.org/N16-1174). +The layer learns a parameter vector that is used as the keys in a single-headed +attention mechanism. 
+ + + +The original `ParametricAttention` layer uses the hidden representation as-is +for the keys in the attention. This differs from the paper that introduces +parametric attention (Equation 5). `ParametricAttention_v2` adds the option to +transform the key representation in line with the paper by passing such a +transformation through the `key_transform` parameter. + + + + +| Argument | Type | Description | +|-----------------|----------------------------------------------|------------------------------------------------------------------------| +| `key_transform` | Optional[Model[Floats2d, Floats2d]] | Transformation to apply to the key representations. Defaults to `None` | +| `nO` | Optional[int] | The size of the output vectors. | +| **RETURNS** | Model[Ragged, Ragged] | The created attention layer. | + +```python +https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention_v2.py +``` + + + ### Relu {#relu tag="function"} From b183006daa740e648cca997e0cca29239e18e575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 14 Dec 2023 13:33:32 +0100 Subject: [PATCH 48/48] Set version to v8.2.2 (#914) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index b358a590d..394a8253e 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.2.1" +__version__ = "8.2.2" __release__ = True