Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Python 3.12 via newer scipy and nmslib-metabrainz #523

Merged
merged 13 commits into from
Sep 15, 2024
Merged
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ jobs:
docker run --rm scispacy black scispacy --check --line-length 88
docker run --rm scispacy bash scripts/mypy.sh
docker run --rm scispacy pytest tests/ --cov scispacy --cov-fail-under=20

17 changes: 17 additions & 0 deletions .github/workflows/old_scipy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
name: CI (old scipy)

on:
pull_request:
branches:
- main

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
- name: Build and test with Docker using an older version of scipy
run: |
docker build --tag scispacy .
docker run --rm scispacy bash -c "pip install 'scipy<1.11' && pytest tests/"
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ WORKDIR /work
COPY requirements.in .

RUN pip install -r requirements.in
RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz
RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
RUN python -m spacy download en_core_web_sm
RUN python -m spacy download en_core_web_md

Expand Down
10 changes: 8 additions & 2 deletions requirements.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
numpy
scipy<1.11
# NOTE: creating the linkers requires scipy<1.11, so linker creation is
# currently only supported on Python<3.11
# https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
scipy
spacy>=3.7.0,<3.8.0
spacy-lookups-data
pandas
Expand All @@ -8,7 +11,10 @@ conllu

# Candidate generation and entity linking
joblib
nmslib>=1.7.3.6
nmslib>=1.7.3.6; python_version < '3.11'
# Use the metabrainz fork until nmslib supports installing on Python 3.11+
# https://github.com/nmslib/nmslib/issues/555
nmslib-metabrainz==2.1.3; python_version >= '3.11'
scikit-learn>=0.20.3

# Required for testing.
Expand Down
6 changes: 6 additions & 0 deletions scispacy/candidate_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import nmslib
from nmslib.dist import FloatIndex

from scispacy.util import scipy_supports_sparse_float16
from scispacy.file_cache import cached_path
from scispacy.linking_utils import (
KnowledgeBase,
Expand Down Expand Up @@ -375,6 +376,11 @@ def create_tfidf_ann_index(
The kb items to generate the index and vectors for.

"""
if not scipy_supports_sparse_float16():
dakinggg marked this conversation as resolved.
Show resolved Hide resolved
raise RuntimeError(
"This function requires scipy<1.11, which only runs on Python<3.11."
)

tfidf_vectorizer_path = f"{out_path}/tfidf_vectorizer.joblib"
ann_index_path = f"{out_path}/nmslib_index.bin"
tfidf_vectors_path = f"{out_path}/tfidf_vectors_sparse.npz"
Expand Down
7 changes: 7 additions & 0 deletions scispacy/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from packaging.version import Version
import spacy
import scipy
from spacy.language import Language
from spacy.tokens import Doc

Expand All @@ -17,6 +19,11 @@ def create_combined_rule_model() -> Language:
return nlp


def scipy_supports_sparse_float16() -> bool:
    """
    Report whether the installed scipy is from a release series older than
    1.11, which this project treats as the cutoff for float16 sparse-matrix
    support (see https://github.com/scipy/scipy/issues/7408).

    Only the (major, minor) release numbers are compared. Under PEP 440,
    pre-release and development builds such as ``1.11.0rc1`` order *before*
    ``1.11``, so a plain ``Version(...) < Version("1.11")`` comparison would
    classify them as older than 1.11.

    Returns
    -------
    bool
        True if ``scipy.__version__`` belongs to a release series below 1.11.
    """
    # https://github.com/scipy/scipy/issues/7408
    return Version(scipy.__version__).release[:2] < (1, 11)


class WhitespaceTokenizer:
"""
Spacy doesn't assume that text is tokenised. Sometimes this
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,13 @@
license="Apache",
install_requires=[
"spacy>=3.7.0,<3.8.0",
"scipy<1.11",
"scipy",
"requests>=2.0.0,<3.0.0",
"conllu",
"numpy",
"joblib",
"nmslib>=1.7.3.6",
"nmslib>=1.7.3.6; python_version < '3.11'",
"nmslib-metabrainz==2.1.3; python_version >= '3.11'",
"scikit-learn>=0.20.3",
"pysbd",
],
Expand Down
7 changes: 7 additions & 0 deletions tests/test_candidate_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@

from scispacy.candidate_generation import CandidateGenerator, create_tfidf_ann_index, MentionCandidate
from scispacy.umls_utils import UmlsKnowledgeBase
from scispacy.util import scipy_supports_sparse_float16


class TestCandidateGeneration(unittest.TestCase):

def setUp(self):
    """
    Run the base-class setup, then skip the test when the installed scipy
    does not support sparse float16.
    """
    super().setUp()
    if scipy_supports_sparse_float16():
        return
    # https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
    self.skipTest("Candidate generation isn't supported for scipy>=1.11")

def test_create_index(self):

umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
Expand Down
5 changes: 5 additions & 0 deletions tests/test_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@
from scispacy.linking import EntityLinker
from scispacy.umls_utils import UmlsKnowledgeBase
from scispacy.abbreviation import AbbreviationDetector
from scispacy.util import scipy_supports_sparse_float16


class TestLinker(unittest.TestCase):
def setUp(self):
super().setUp()
if not scipy_supports_sparse_float16():
# https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
self.skipTest("Candidate generation isn't supported for scipy>=1.11")

self.nlp = spacy.load("en_core_web_sm")

umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json", "tests/fixtures/test_umls_tree.tsv")
Expand Down