diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d510388..e47bb9b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -20,3 +20,4 @@ jobs:
         docker run --rm scispacy black scispacy --check --line-length 88
         docker run --rm scispacy bash scripts/mypy.sh
         docker run --rm scispacy pytest tests/ --cov scispacy --cov-fail-under=20
+
diff --git a/.github/workflows/old_scipy.yml b/.github/workflows/old_scipy.yml
new file mode 100644
index 0000000..bc1136f
--- /dev/null
+++ b/.github/workflows/old_scipy.yml
@@ -0,0 +1,17 @@
+name: CI (old scipy)
+
+on:
+  pull_request:
+    branches:
+    - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Build and test with Docker using an older version of scipy
+      run: |
+        docker build --tag scispacy .
+        docker run --rm scispacy bash -c "pip install 'scipy<1.11' && pytest tests/"
diff --git a/Dockerfile b/Dockerfile
index 4cd09be..48422b2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,7 @@ WORKDIR /work
 COPY requirements.in .
 
 RUN pip install -r requirements.in
-RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz
+RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
 RUN python -m spacy download en_core_web_sm
 RUN python -m spacy download en_core_web_md
 
diff --git a/requirements.in b/requirements.in
index 795a5de..50e9155 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,8 @@
 numpy
-scipy<1.11
+# NOTE: scipy<1.11 is required when creating the linkers, so that's currently
+# only supported on Python<3.11
+# https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
+scipy
 spacy>=3.7.0,<3.8.0
 spacy-lookups-data
 pandas
@@ -8,7 +11,10 @@ conllu
 
 # Candidate generation and entity linking
 joblib
-nmslib>=1.7.3.6
+nmslib>=1.7.3.6; python_version < '3.11'
+# Use the metabrainz fork until nmslib supports installing on Python 3.11+
+# https://github.com/nmslib/nmslib/issues/555
+nmslib-metabrainz==2.1.3; python_version >= '3.11'
 scikit-learn>=0.20.3
 
 # Required for testing.
diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py
index c988fcd..1abb0a8 100644
--- a/scispacy/candidate_generation.py
+++ b/scispacy/candidate_generation.py
@@ -10,6 +10,7 @@
 import nmslib
 from nmslib.dist import FloatIndex
 
+from scispacy.util import scipy_supports_sparse_float16
 from scispacy.file_cache import cached_path
 from scispacy.linking_utils import (
     KnowledgeBase,
@@ -375,6 +376,11 @@ def create_tfidf_ann_index(
         The kb items to generate the index and vectors for.
 
     """
+    if not scipy_supports_sparse_float16():
+        raise RuntimeError(
+            "This function requires scipy<1.11, which only runs on Python<3.11."
+        )
+
     tfidf_vectorizer_path = f"{out_path}/tfidf_vectorizer.joblib"
     ann_index_path = f"{out_path}/nmslib_index.bin"
     tfidf_vectors_path = f"{out_path}/tfidf_vectors_sparse.npz"
diff --git a/scispacy/util.py b/scispacy/util.py
index 53f5ece..d87d040 100644
--- a/scispacy/util.py
+++ b/scispacy/util.py
@@ -1,4 +1,6 @@
+from packaging.version import Version
 import spacy
+import scipy
 
 from spacy.language import Language
 from spacy.tokens import Doc
@@ -17,6 +19,11 @@ def create_combined_rule_model() -> Language:
     return nlp
 
 
+def scipy_supports_sparse_float16() -> bool:
+    # https://github.com/scipy/scipy/issues/7408
+    return Version(scipy.__version__) < Version("1.11")
+
+
 class WhitespaceTokenizer:
     """
     Spacy doesn't assume that text is tokenised. Sometimes this
diff --git a/setup.py b/setup.py
index 12a8b10..e91457f 100644
--- a/setup.py
+++ b/setup.py
@@ -42,12 +42,13 @@
     license="Apache",
     install_requires=[
         "spacy>=3.7.0,<3.8.0",
-        "scipy<1.11",
+        "scipy",
         "requests>=2.0.0,<3.0.0",
         "conllu",
         "numpy",
         "joblib",
-        "nmslib>=1.7.3.6",
+        "nmslib>=1.7.3.6; python_version < '3.11'",
+        "nmslib-metabrainz==2.1.3; python_version >= '3.11'",
         "scikit-learn>=0.20.3",
         "pysbd",
     ],
diff --git a/tests/test_candidate_generation.py b/tests/test_candidate_generation.py
index 660512c..3abef6a 100644
--- a/tests/test_candidate_generation.py
+++ b/tests/test_candidate_generation.py
@@ -3,10 +3,17 @@
 
 from scispacy.candidate_generation import CandidateGenerator, create_tfidf_ann_index, MentionCandidate
 from scispacy.umls_utils import UmlsKnowledgeBase
+from scispacy.util import scipy_supports_sparse_float16
 
 
 class TestCandidateGeneration(unittest.TestCase):
 
+    def setUp(self):
+        super().setUp()
+        if not scipy_supports_sparse_float16():
+            # https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
+            self.skipTest("Candidate generation isn't supported for scipy>=1.11")
+
     def test_create_index(self):
 
         umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
diff --git a/tests/test_linking.py b/tests/test_linking.py
index f1cbbc5..8585a0b 100644
--- a/tests/test_linking.py
+++ b/tests/test_linking.py
@@ -7,11 +7,16 @@
 from scispacy.linking import EntityLinker
 from scispacy.umls_utils import UmlsKnowledgeBase
 from scispacy.abbreviation import AbbreviationDetector
+from scispacy.util import scipy_supports_sparse_float16
 
 
 class TestLinker(unittest.TestCase):
     def setUp(self):
         super().setUp()
+        if not scipy_supports_sparse_float16():
+            # https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
+            self.skipTest("Candidate generation isn't supported for scipy>=1.11")
+
         self.nlp = spacy.load("en_core_web_sm")
 
         umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json", "tests/fixtures/test_umls_tree.tsv")