CU-38g55wn / CU-39cmv82 Support for python 3.11 (and 3.10) (#285)
* CU-38g55wn Move dependencies to (hopefully) support python 3.11 on Ubuntu

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim)

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x2

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x3

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x4

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x5 - fix missing comma

* CU-38g55wn Remove erroneous package from setup.py

* CU-38g55wn Bump spacy version so as to (hopefully) fix pydantic issues

* CU-38g55wn Bump spacy en_core_web_md version so as to (hopefully) fix requirements issues

* CU-38g55wn Fix test typo that was fixed in newer en_core_web_md

* CU-38g55wn Fix small issue in NER test

* CU-38g55wn Fix small issue with NER test (int conversion)

* CU-38g55wn Mark some places as ignore where newer mypy complains

* CU-38g55wn Bump mypy dev requirement version

* CU-38g55wn Add python 3.11 and 3.10 to workflow

* CU-38g55wn Trying to install gensim over HTTPS rather than SSH

* CU-38g55wn Make python versions strings in GH workflow so 3.10 doesn't get 'rounded' to 3.1 when read

* CU-38g55wn Remove python 3.7 from workflow since it's not compatible with required versions of numpy and scipy

* CU-38g55wn Universally fixing NER test regarding the 'movar~viruse' -> 'movar~virus' thing

* CU-38g55wn Bump gensim version to 4.3.0 - the first to support 3.11

* CU-862hyd5wx Unify rosalind/vocab downloading in tests, identify and fail meaningfully in case of 503

* CU-862hyd5wx Remove unused imports in tests due to last commit

* CU-862hyd5wx Add possibility of generating and using a simple vocab when Rosalind is down

* CU-862hyd5wx Remove python 3.7 and add 3.10/3.11 to classifiers

* CU-862hyd5wx Reorder python versions in GitHub workflow

* CU-862hyd5wx Attempt to fix GHA by importing unittest.mock explicitly
mart-r authored Jan 20, 2023
1 parent c38bd0c commit 43e0346
Showing 13 changed files with 66 additions and 61 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.7, 3.8, 3.9 ]
+        python-version: [ '3.8', '3.9', '3.10', '3.11' ]
       max-parallel: 4

     steps:
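
The version strings are quoted deliberately: YAML parses an unquoted 3.10 as the float 3.1, which is exactly what the "Make python versions strings" commit above fixes. A minimal demonstration of the pitfall (a sketch using PyYAML rather than the Actions parser):

import yaml  # PyYAML, assumed installed

print(yaml.safe_load("versions: [3.10, '3.10']"))
# {'versions': [3.1, '3.10']} - the unquoted 3.10 collapses to the float 3.1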
32 changes: 16 additions & 16 deletions medcat/cat.py
@@ -378,7 +378,7 @@ def __call__(self, text: Optional[str], do_train: bool = False) -> Optional[Doc]
             return None
         else:
             text = self._get_trimmed_text(str(text))
-            return self.pipe(text)
+            return self.pipe(text)  # type: ignore

     def __repr__(self):
         """Prints the model_card for this CAT instance.
@@ -473,7 +473,7 @@ def _print_stats(self,
                 else:
                     local_filters.cuis = {'empty'}

-                spacy_doc: Doc = self(doc['text'])
+                spacy_doc: Doc = self(doc['text'])  # type: ignore

                 if use_overlaps:
                     p_anns = spacy_doc._.ents
@@ -786,7 +786,7 @@ def add_and_train_concept(self,

         if spacy_entity is not None and spacy_doc is not None:
             # Train Linking
-            self.linker.context_model.train(cui=cui, entity=spacy_entity, doc=spacy_doc, negative=negative, names=names)
+            self.linker.context_model.train(cui=cui, entity=spacy_entity, doc=spacy_doc, negative=negative, names=names)  # type: ignore

         if not negative and devalue_others:
             # Find all cuis
@@ -798,7 +798,7 @@ def add_and_train_concept(self,
                 cuis.remove(cui)
             # Add negative training for all other CUIs that link to these names
             for _cui in cuis:
-                self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True)
+                self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True)  # type: ignore

     def train_supervised(self,
                          data_path: str,
@@ -970,7 +970,7 @@ def train_supervised(self,

             for idx_doc in trange(current_document, len(project['documents']), initial=current_document, total=len(project['documents']), desc='Document', leave=False):
                 doc = project['documents'][idx_doc]
-                spacy_doc: Doc = self(doc['text'])
+                spacy_doc: Doc = self(doc['text'])  # type: ignore

                 # Compatibility with old output where annotations are a list
                 doc_annotations = self._get_doc_annotations(doc)
@@ -991,8 +991,8 @@ def train_supervised(self,
                 if train_from_false_positives:
                     fps: List[Span] = get_false_positives(doc, spacy_doc)

-                    for fp in fps:
-                        fp_: Span = fp
+                    for fp in fps:  # type: ignore
+                        fp_: Span = fp  # type: ignore
                         self.add_and_train_concept(cui=fp_._.cui,
                                                    name=fp_.text,
                                                    spacy_doc=spacy_doc,
@@ -1034,7 +1034,7 @@ def get_entities(self,
                      only_cui: bool = False,
                      addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed']) -> Dict:
         doc = self(text)
-        out = self._doc_to_out(doc, only_cui, addl_info)
+        out = self._doc_to_out(doc, only_cui, addl_info)  # type: ignore
         return out

     def get_entities_multi_texts(self,
@@ -1060,7 +1060,7 @@ def get_entities_multi_texts(self,
         if n_process is None:
             texts_ = self._generate_trimmed_texts(texts)
             for text in texts_:
-                out.append(self._doc_to_out(self(text), only_cui, addl_info))
+                out.append(self._doc_to_out(self(text), only_cui, addl_info))  # type: ignore
         else:
             self.pipe.set_error_handler(self._pipe_error_handler)
             try:
@@ -1077,9 +1077,9 @@ def get_entities_multi_texts(self,
                     logger.warning("Found at least one failed batch and set output for enclosed texts to empty")
                 for i, text in enumerate(texts_):
                     if i == len(out):
-                        out.append(self._doc_to_out(None, only_cui, addl_info))
+                        out.append(self._doc_to_out(None, only_cui, addl_info))  # type: ignore
                     elif out[i].get('text', '') != text:
-                        out.insert(i, self._doc_to_out(None, only_cui, addl_info))
+                        out.insert(i, self._doc_to_out(None, only_cui, addl_info))  # type: ignore

         cnf_annotation_output = self.config.annotation_output
         if not(cnf_annotation_output.include_text_in_output):
@@ -1487,7 +1487,7 @@ def _doc_to_out(self,
                     entity._.meta_anns = _ent['meta_anns']
                 _ents.append(entity)
         else:
-            _ents = doc.ents
+            _ents = doc.ents  # type: ignore

         if cnf_annotation_output.lowercase_context:
             doc_tokens = [tkn.text_with_ws.lower() for tkn in list(doc)]
@@ -1570,10 +1570,10 @@ def _pipe_error_handler(proc_name: str, proc: "Pipe", docs: List[Doc], e: Except

     @staticmethod
     def _get_doc_annotations(doc: Doc):
-        if type(doc['annotations']) == list:
-            return doc['annotations']
-        if type(doc['annotations']) == dict:
-            return doc['annotations'].values()
+        if type(doc['annotations']) == list:  # type: ignore
+            return doc['annotations']  # type: ignore
+        if type(doc['annotations']) == dict:  # type: ignore
+            return doc['annotations'].values()  # type: ignore
         return None

     def destroy_pipe(self):
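
Most of the edits in this file only append `# type: ignore`. Newer mypy releases are stricter about values typed as a Union — for example, `Pipe.__call__` here is annotated to return `Union[Doc, List[Doc]]`, so assigning its result to a plain `Doc` no longer type-checks. A minimal sketch of the pattern and the targeted suppression (hypothetical code, not from this repo):

from typing import List, Union

def pipe(text: str) -> Union[str, List[str]]:
    """Stand-in for a call whose declared return type is a Union."""
    return text

doc: str = pipe("some text")   # mypy error: incompatible types in assignment
doc2: str = pipe("some text")  # type: ignore  # suppresses the error on this line only

The trade-off is the usual one: a line-level suppression is quicker than reworking the annotations, at the cost of hiding any future type error on that line.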
3 changes: 2 additions & 1 deletion medcat/linking/context_based_linker.py
@@ -55,7 +55,8 @@ def _train(self, cui: str, entity: Span, doc: Doc, add_negative: bool = True) ->

     # Override
     def __call__(self, doc: Doc) -> Doc:
-        doc.ents = []  # Reset main entities, will be recreated later
+        # Reset main entities, will be recreated later
+        doc.ents = []  # type: ignore
         cnf_l = self.config.linking
         linked_entities = []
6 changes: 3 additions & 3 deletions medcat/meta_cat.py
@@ -432,17 +432,17 @@ def pipe(self, stream: Iterable[Union[Doc, FakeDoc]], *args, **kwargs) -> Iterat
         batch_size_chars = config.general['pipe_batch_size_in_chars']

         if config.general['device'] == 'cpu' or config.general['disable_component_lock']:
-            yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)
+            yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)  # type: ignore
         else:
             with MetaCAT._component_lock:
-                yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)
+                yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)  # type: ignore

     def _set_meta_anns(self,
                        stream: Iterable[Union[Doc, FakeDoc]],
                        batch_size_chars: int,
                        config: ConfigMetaCAT,
                        id2category_value: Dict) -> Iterator[Optional[Doc]]:
-        for docs in self.batch_generator(stream, batch_size_chars):
+        for docs in self.batch_generator(stream, batch_size_chars):  # type: ignore
             try:
                 if not config.general['save_and_reuse_tokens'] or docs[0]._.share_tokens is None:
                     if config.general['lowercase']:
6 changes: 3 additions & 3 deletions medcat/ner/transformers_ner.py
@@ -331,19 +331,19 @@ def pipe(self, stream: Iterable[Union[Doc, None]], *args, **kwargs) -> Iterator[
             return stream

         batch_size_chars = self.config.general['pipe_batch_size_in_chars']
-        yield from self._process(stream, batch_size_chars)
+        yield from self._process(stream, batch_size_chars)  # type: ignore

     def _process(self,
                  stream: Iterable[Union[Doc, None]],
                  batch_size_chars: int) -> Iterator[Optional[Doc]]:
-        for docs in self.batch_generator(stream, batch_size_chars):
+        for docs in self.batch_generator(stream, batch_size_chars):  # type: ignore
             #all_text = [doc.text for doc in docs]
             #all_text_processed = self.tokenizer.encode_eval(all_text)
             # For now we will process the documents one by one, should be improved in the future to use batching
             for doc in docs:
                 try:
                     res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy'])
-                    doc.ents = []
+                    doc.ents = []  # type: ignore
                     for r in res:
                         inds = []
                         for ind, word in enumerate(doc):
8 changes: 4 additions & 4 deletions medcat/pipe.py
@@ -139,7 +139,7 @@ def add_meta_cat(self, meta_cat: MetaCAT, name: Optional[str] = None) -> None:
     def add_addl_ner(self, addl_ner: TransformersNER, name: Optional[str] = None) -> None:
         component_name = spacy.util.get_object_name(addl_ner)
         name = name if name is not None else component_name
-        Language.component(name=component_name, func=addl_ner)
+        Language.component(name=component_name, func=addl_ner)  # type: ignore
         self._nlp.add_pipe(component_name, name=name, last=True)

         Doc.set_extension('ents', default=[], force=True)
@@ -176,7 +176,7 @@ def batch_multi_process(self,
             self._nlp.get_pipe(instance_name)
         except KeyError:
             component_name = spacy.util.get_object_name(self._ensure_serializable)
-            Language.component(name=component_name, func=self._ensure_serializable)
+            Language.component(name=component_name, func=self._ensure_serializable)  # type: ignore
             self._nlp.add_pipe(component_name, name=instance_name, last=True)

         n_process = n_process if n_process is not None else max(cpu_count() - 1, 1)
@@ -238,7 +238,7 @@ def _ensure_serializable(doc: Doc) -> Doc:

     def __call__(self, text: Union[str, Iterable[str]]) -> Union[Doc, List[Doc]]:
         if isinstance(text, str):
-            return self._nlp(text) if len(text) > 0 else None
+            return self._nlp(text) if len(text) > 0 else None  # type: ignore
         elif isinstance(text, Iterable):
             docs = []
             for t in text if isinstance(text, types.GeneratorType) else tqdm(text, total=len(list(text))):
@@ -249,7 +249,7 @@ def __call__(self, text: Union[str, Iterable[str]]) -> Union[Doc, List[Doc]]:
                     logger.warning(e, exc_info=True, stack_info=True)
                     doc = None
                 docs.append(doc)
-            return docs
+            return docs  # type: ignore
         else:
             logger.error("The input text should be either a string or a sequence of strings but got: %s", type(text))
             return None
8 changes: 4 additions & 4 deletions medcat/pipeline/pipe_runner.py
@@ -24,7 +24,7 @@ def __call__(self, doc: Doc):
         raise NotImplementedError("Method __call__ has not been implemented.")

     # Override
-    def pipe(self, stream: Iterable[Doc], batch_size: int, **kwargs) -> Union[Generator[Doc, None, None], Iterator[Doc]]:
+    def pipe(self, stream: Iterable[Doc], batch_size: int, **kwargs) -> Union[Generator[Doc, None, None], Iterator[Doc]]:  # type: ignore
         error_handler = self.get_error_handler()
         if kwargs.get("parallel", False):
             PipeRunner._execute, PipeRunner._delayed = self._lazy_init_pool()
@@ -35,14 +35,14 @@ def pipe(self, stream: Iterable[Doc], batch_size: int, **kwargs) -> Union[Genera
                 for output_doc in PipeRunner._execute(tasks):
                     yield PipeRunner.deserialize_entities(output_doc)
             except Exception as e:
-                error_handler(self.name, self, docs, e)
+                error_handler(self.name, self, docs, e)  # type: ignore
                 yield from [None] * len(docs)
         else:
             for doc in stream:
                 try:
                     yield self(doc)
                 except Exception as e:
-                    error_handler(self.name, self, [doc], e)
+                    error_handler(self.name, self, [doc], e)  # type: ignore
                     yield None

     @staticmethod
@@ -89,7 +89,7 @@ def deserialize_entities(doc: Doc):

     @staticmethod
     def _run_pipe_on_one(call: Callable, doc: Doc, underscore_state: Tuple) -> Doc:
-        Underscore.load_state(underscore_state)
+        Underscore.load_state(underscore_state)  # type: ignore
         doc = PipeRunner.deserialize_entities(doc)
         doc = call(doc)
         doc = PipeRunner.serialize_entities(doc)
4 changes: 2 additions & 2 deletions medcat/utils/postprocessing.py
@@ -31,7 +31,7 @@ def make_pretty_labels(cdb: CDB, doc: Doc, style: Optional[LabelStyle] = None) -
             setattr(n_ent._, attr, getattr(ent._, attr))
         n_ents.append(n_ent)

-    doc.ents = n_ents
+    doc.ents = n_ents  # type: ignore


 def create_main_ann(cdb: CDB, doc: Doc, tuis: Optional[List] = None) -> None:
@@ -59,4 +59,4 @@ def create_main_ann(cdb: CDB, doc: Doc, tuis: Optional[List] = None) -> None:
                 tkns_in.add(tkn)
             main_anns.append(ent)

-    doc.ents = list(doc.ents) + main_anns
+    doc.ents = list(doc.ents) + main_anns  # type: ignore
4 changes: 2 additions & 2 deletions requirements-dev.txt
@@ -1,7 +1,7 @@
 .
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl
 flake8==4.0.1
-mypy==0.931
+mypy==0.981
 mypy-extensions==0.4.3
 types-aiofiles==0.8.3
 types-PyYAML==6.0.3
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
 .
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl
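
Both requirements files bump the bundled spaCy model wheel from en_core_web_md 3.1.0 to 3.4.0, matching the relaxed `spacy>=3.1.0` pin in setup.py. A quick compatibility check after installing (a sketch; assumes spacy and the model wheel are present):

import spacy

nlp = spacy.load("en_core_web_md")
# The model's version line should match the installed spaCy minor release (e.g. 3.4.x)
print(spacy.__version__, nlp.meta["version"])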
48 changes: 25 additions & 23 deletions setup.py
@@ -18,40 +18,42 @@
               'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.neo', 'medcat.utils.ner',
               'medcat.utils.saving', 'medcat.utils.regression'],
     install_requires=[
-        'numpy>=1.21.4',
-        'pandas>=1.1.5',
-        'gensim~=4.1.2',
-        'spacy<3.1.4,>=3.1.0',
-        'scipy>=1.5.4',
-        'transformers~=4.19.2',
-        'torch>=1.0',
+        'numpy>=1.22.0', # first to support 3.11
+        'pandas>=1.4.2', # first to support 3.11
+        'gensim>=4.3.0', # first to support 3.11
+        'spacy>=3.1.0',
+        'scipy~=1.9.2', # first to support 3.11
+        'transformers>=4.19.2',
+        'torch>=1.13.0', # first to support 3.11
         'tqdm>=4.27',
-        'scikit-learn<1.2.0',
+        'scikit-learn>=1.1.3', # first to support 3.11
         'elasticsearch>=8.3,<9', # Check if this is compatible with opensearch otherwise: 'elasticsearch>=7.10,<8.0.0',
         'eland>=8.3.0,<9',
-        'dill~=0.3.4,<0.3.5', # less than 0.3.5 due to datasets requirement
-        'datasets~=2.2.2',
-        'jsonpickle~=2.0.0',
+        'dill>=0.3.4', # allow later versions with later versions of datasets (tested with 0.3.6)
+        'datasets>=2.2.2', # allow later versions, tested with 2.7.1
+        'jsonpickle>=2.0.0', # allow later versions, tested with 3.0.0
         'psutil>=5.8.0',
-        # 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets
-        'multiprocess==0.70.12', # seems to work better than standard mp
-        'py2neo==2021.2.3',
-        'aiofiles~=0.8.0',
-        'ipywidgets~=7.6.5',
-        'xxhash==3.0.0',
-        'blis<=0.7.5',
-        'click<=8.0.4', # Spacy breaks without this
-        'pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4', # identical constraints to thinc and spacy
+        'multiprocess~=0.70.12', # 0.70.14 seemed to work just fine
+        'py2neo~=2021.2.3',
+        'aiofiles>=0.8.0', # allow later versions, tested with 22.1.0
+        'ipywidgets>=7.6.5', # allow later versions, tested with 0.8.0
+        'xxhash>=3.0.0', # allow later versions, tested with 3.1.0
+        'blis>=0.7.5', # allow later versions, tested with 0.7.9
+        'click>=8.0.4', # allow later versions, tested with 8.1.3
+        'pydantic>=1.10.0', # for spacy compatibility
         # the following are not direct dependencies of MedCAT but needed for docs/building
-        'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec
-        'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy
-        'joblib~=1.2',
+        # hopefully will no longer need the transitive dependencies
+        # 'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec
+        # 'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy
+        # 'joblib~=1.2',
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
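
The pattern throughout setup.py is the same: upper-bounded pins (`==`, `<=`, `~=`) become `>=` floors set at the first release with CPython 3.11 support. A sketch for sanity-checking such floors in an installed environment (floor values copied from the diff; the check itself is hypothetical, not part of the repo):

from importlib.metadata import version
from packaging.version import Version  # provided by the 'packaging' package

floors = {"numpy": "1.22.0", "gensim": "4.3.0", "torch": "1.13.0", "scikit-learn": "1.1.3"}
for pkg, floor in floors.items():
    installed = Version(version(pkg))
    assert installed >= Version(floor), f"{pkg} {installed} is below the floor {floor}"
    print(f"{pkg} {installed} satisfies >= {floor}")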
1 change: 1 addition & 0 deletions tests/helper.py
@@ -1,6 +1,7 @@
 import os
 import requests
 import unittest
+import unittest.mock

 import numpy as np
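
The visible change is only an explicit `unittest.mock` import (importing `unittest` alone does not expose the `mock` submodule), but the CU-862hyd5wx commits above also describe unifying vocab downloads and failing meaningfully on HTTP 503 when Rosalind is down. A hypothetical sketch of that pattern (function name and error handling assumed, not the repo's actual helper):

import requests

def download_vocab(url: str, dest: str) -> None:
    """Fetch a vocab file, failing with a clear message if the host is down."""
    response = requests.get(url, timeout=30)
    if response.status_code == 503:
        raise RuntimeError(
            f"Vocab host returned 503 (service unavailable) for {url}; "
            "consider generating a simple local vocab instead.")
    response.raise_for_status()
    with open(dest, "wb") as f:
        f.write(response.content)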
3 changes: 2 additions & 1 deletion tests/test_ner.py
@@ -67,7 +67,8 @@ def tearDownClass(cls) -> None:
         cls.pipe.destroy()

     def test_aa_cdb_names_output(self):
-        target_result = {'S-229004': {'movar~viruse', 'movar', 'movar~viruses'}, 'S-229005': {'cdb'}}
+        print("Fixing 'movar~viruse' -> 'movar~virus' for newer en_core_web_md")
+        target_result = {'S-229004': {'movar~virus', 'movar', 'movar~viruses'}, 'S-229005': {'cdb'}}
         self.assertEqual(self.cdb.cui2names, target_result)

     def test_ab_entities_length(self):
