CU-38g55wn / CU-39cmv82 Support for python 3.11 (and 3.10) (#285)
* CU-38g55wn Move dependencies to (hopefully) support python 3.11 on Ubuntu

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim)

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x2

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x3

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x4

* CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x5 - fix missing comma

* CU-38g55wn Remove erroneous package from setup.py

* CU-38g55wn Bump spacy version so as to (hopefully) fix pydantic issues

* CU-38g55wn Bump spacy en_core_web_md version so as to (hopefully) fix requirements issues

* CU-38g55wn Fix test typo that was fixed in newer en_core_web_md

* CU-38g55wn Fix small issue in NER test

* CU-38g55wn Fix small issue with NER test (int conversion)

* CU-38g55wn Mark some places as ignore where newer mypy complains

* CU-38g55wn Bump mypy dev requirement version

* CU-38g55wn Add python 3.11 and 3.10 to workflow

* CU-38g55wn Trying to install gensim over HTTPS rather than SSH

* CU-38g55wn Make python versions strings in GH workflow so 3.10 doesn't get 'rounded' to 3.1 when read

* CU-38g55wn Remove python 3.7 from workflow since it's not compatible with required versions of numpy and scipy

* CU-38g55wn Universally fixing NER test regarding the 'movar~viruse' -> 'movar~virus' thing

* CU-38g55wn Bump gensim version to 4.3.0 - the first to support 3.11

* CU-862hyd5wx Unify rosalind/vocab downloading in tests, identify and fail meaningfully in case of 503

* CU-862hyd5wx Remove unused imports in tests due to last commit

* CU-862hyd5wx Add possibility of generating and using a simple vocab when Rosalind is down

* CU-862hyd5wx Remove python 3.7 and add 3.10/3.11 to classifiers

* CU-862hyd5wx Reorder python versions in GitHub workflow

* CU-862hyd5wx Attempt to fix GHA by importing unittest.mock explicitly
mart-r authored Jan 20, 2023
1 parent c38bd0c commit 43e0346
Showing 13 changed files with 66 additions and 61 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.7, 3.8, 3.9 ]
+        python-version: [ '3.8', '3.9', '3.10', '3.11' ]
       max-parallel: 4

     steps:
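
The version strings are quoted deliberately: YAML parses an unquoted 3.10 as the float 3.1, which is exactly what the "Make python versions strings" commit above fixes. A minimal demonstration of the pitfall (a sketch using PyYAML rather than the Actions parser):

import yaml  # PyYAML, assumed installed

print(yaml.safe_load("versions: [3.10, '3.10']"))
# {'versions': [3.1, '3.10']} - the unquoted 3.10 collapses to the float 3.1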
32 changes: 16 additions & 16 deletions medcat/cat.py
@@ -378,7 +378,7 @@ def __call__(self, text: Optional[str], do_train: bool = False) -> Optional[Doc]
             return None
         else:
             text = self._get_trimmed_text(str(text))
-            return self.pipe(text)
+            return self.pipe(text)  # type: ignore

     def __repr__(self):
         """Prints the model_card for this CAT instance.
@@ -473,7 +473,7 @@ def _print_stats(self,
                 else:
                     local_filters.cuis = {'empty'}

-                spacy_doc: Doc = self(doc['text'])
+                spacy_doc: Doc = self(doc['text'])  # type: ignore

                 if use_overlaps:
                     p_anns = spacy_doc._.ents
@@ -786,7 +786,7 @@ def add_and_train_concept(self,

         if spacy_entity is not None and spacy_doc is not None:
             # Train Linking
-            self.linker.context_model.train(cui=cui, entity=spacy_entity, doc=spacy_doc, negative=negative, names=names)
+            self.linker.context_model.train(cui=cui, entity=spacy_entity, doc=spacy_doc, negative=negative, names=names)  # type: ignore

         if not negative and devalue_others:
             # Find all cuis
@@ -798,7 +798,7 @@ def add_and_train_concept(self,
                 cuis.remove(cui)
             # Add negative training for all other CUIs that link to these names
             for _cui in cuis:
-                self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True)
+                self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True)  # type: ignore

     def train_supervised(self,
                          data_path: str,
@@ -970,7 +970,7 @@ def train_supervised(self,

             for idx_doc in trange(current_document, len(project['documents']), initial=current_document, total=len(project['documents']), desc='Document', leave=False):
                 doc = project['documents'][idx_doc]
-                spacy_doc: Doc = self(doc['text'])
+                spacy_doc: Doc = self(doc['text'])  # type: ignore

                 # Compatibility with old output where annotations are a list
                 doc_annotations = self._get_doc_annotations(doc)
@@ -991,8 +991,8 @@ def train_supervised(self,
                 if train_from_false_positives:
                     fps: List[Span] = get_false_positives(doc, spacy_doc)

-                    for fp in fps:
-                        fp_: Span = fp
+                    for fp in fps:  # type: ignore
+                        fp_: Span = fp  # type: ignore
                         self.add_and_train_concept(cui=fp_._.cui,
                                                    name=fp_.text,
                                                    spacy_doc=spacy_doc,
@@ -1034,7 +1034,7 @@ def get_entities(self,
                      only_cui: bool = False,
                      addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed']) -> Dict:
         doc = self(text)
-        out = self._doc_to_out(doc, only_cui, addl_info)
+        out = self._doc_to_out(doc, only_cui, addl_info)  # type: ignore
         return out

     def get_entities_multi_texts(self,
@@ -1060,7 +1060,7 @@ def get_entities_multi_texts(self,
         if n_process is None:
             texts_ = self._generate_trimmed_texts(texts)
             for text in texts_:
-                out.append(self._doc_to_out(self(text), only_cui, addl_info))
+                out.append(self._doc_to_out(self(text), only_cui, addl_info))  # type: ignore
         else:
             self.pipe.set_error_handler(self._pipe_error_handler)
             try:
@@ -1077,9 +1077,9 @@ def get_entities_multi_texts(self,
                     logger.warning("Found at least one failed batch and set output for enclosed texts to empty")
                 for i, text in enumerate(texts_):
                     if i == len(out):
-                        out.append(self._doc_to_out(None, only_cui, addl_info))
+                        out.append(self._doc_to_out(None, only_cui, addl_info))  # type: ignore
                     elif out[i].get('text', '') != text:
-                        out.insert(i, self._doc_to_out(None, only_cui, addl_info))
+                        out.insert(i, self._doc_to_out(None, only_cui, addl_info))  # type: ignore

         cnf_annotation_output = self.config.annotation_output
         if not(cnf_annotation_output.include_text_in_output):
@@ -1487,7 +1487,7 @@ def _doc_to_out(self,
                     entity._.meta_anns = _ent['meta_anns']
                 _ents.append(entity)
         else:
-            _ents = doc.ents
+            _ents = doc.ents  # type: ignore

         if cnf_annotation_output.lowercase_context:
             doc_tokens = [tkn.text_with_ws.lower() for tkn in list(doc)]
@@ -1570,10 +1570,10 @@ def _pipe_error_handler(proc_name: str, proc: "Pipe", docs: List[Doc], e: Except

     @staticmethod
     def _get_doc_annotations(doc: Doc):
-        if type(doc['annotations']) == list:
-            return doc['annotations']
-        if type(doc['annotations']) == dict:
-            return doc['annotations'].values()
+        if type(doc['annotations']) == list:  # type: ignore
+            return doc['annotations']  # type: ignore
+        if type(doc['annotations']) == dict:  # type: ignore
+            return doc['annotations'].values()  # type: ignore
         return None

     def destroy_pipe(self):
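
Most of the edits in this file only append `# type: ignore`. Newer mypy releases are stricter about values typed as a Union — for example, `Pipe.__call__` here is annotated to return `Union[Doc, List[Doc]]`, so assigning its result to a plain `Doc` no longer type-checks. A minimal sketch of the pattern and the targeted suppression (hypothetical code, not from this repo):

from typing import List, Union

def pipe(text: str) -> Union[str, List[str]]:
    """Stand-in for a call whose declared return type is a Union."""
    return text

doc: str = pipe("some text")   # mypy error: incompatible types in assignment
doc2: str = pipe("some text")  # type: ignore  # suppresses the error on this line only

The trade-off is the usual one: a line-level suppression is quicker than reworking the annotations, at the cost of hiding any future type error on that line.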
3 changes: 2 additions & 1 deletion medcat/linking/context_based_linker.py
@@ -55,7 +55,8 @@ def _train(self, cui: str, entity: Span, doc: Doc, add_negative: bool = True) ->

     # Override
     def __call__(self, doc: Doc) -> Doc:
-        doc.ents = []  # Reset main entities, will be recreated later
+        # Reset main entities, will be recreated later
+        doc.ents = []  # type: ignore
         cnf_l = self.config.linking
         linked_entities = []
6 changes: 3 additions & 3 deletions medcat/meta_cat.py
@@ -432,17 +432,17 @@ def pipe(self, stream: Iterable[Union[Doc, FakeDoc]], *args, **kwargs) -> Iterat
         batch_size_chars = config.general['pipe_batch_size_in_chars']

         if config.general['device'] == 'cpu' or config.general['disable_component_lock']:
-            yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)
+            yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)  # type: ignore
         else:
             with MetaCAT._component_lock:
-                yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)
+                yield from self._set_meta_anns(stream, batch_size_chars, config, id2category_value)  # type: ignore

     def _set_meta_anns(self,
                        stream: Iterable[Union[Doc, FakeDoc]],
                        batch_size_chars: int,
                        config: ConfigMetaCAT,
                        id2category_value: Dict) -> Iterator[Optional[Doc]]:
-        for docs in self.batch_generator(stream, batch_size_chars):
+        for docs in self.batch_generator(stream, batch_size_chars):  # type: ignore
             try:
                 if not config.general['save_and_reuse_tokens'] or docs[0]._.share_tokens is None:
                     if config.general['lowercase']:
6 changes: 3 additions & 3 deletions medcat/ner/transformers_ner.py
@@ -331,19 +331,19 @@ def pipe(self, stream: Iterable[Union[Doc, None]], *args, **kwargs) -> Iterator[
             return stream

         batch_size_chars = self.config.general['pipe_batch_size_in_chars']
-        yield from self._process(stream, batch_size_chars)
+        yield from self._process(stream, batch_size_chars)  # type: ignore

     def _process(self,
                  stream: Iterable[Union[Doc, None]],
                  batch_size_chars: int) -> Iterator[Optional[Doc]]:
-        for docs in self.batch_generator(stream, batch_size_chars):
+        for docs in self.batch_generator(stream, batch_size_chars):  # type: ignore
             #all_text = [doc.text for doc in docs]
             #all_text_processed = self.tokenizer.encode_eval(all_text)
             # For now we will process the documents one by one, should be improved in the future to use batching
             for doc in docs:
                 try:
                     res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy'])
-                    doc.ents = []
+                    doc.ents = []  # type: ignore
                     for r in res:
                         inds = []
                         for ind, word in enumerate(doc):
8 changes: 4 additions & 4 deletions medcat/pipe.py
@@ -139,7 +139,7 @@ def add_meta_cat(self, meta_cat: MetaCAT, name: Optional[str] = None) -> None:
     def add_addl_ner(self, addl_ner: TransformersNER, name: Optional[str] = None) -> None:
         component_name = spacy.util.get_object_name(addl_ner)
         name = name if name is not None else component_name
-        Language.component(name=component_name, func=addl_ner)
+        Language.component(name=component_name, func=addl_ner)  # type: ignore
         self._nlp.add_pipe(component_name, name=name, last=True)

         Doc.set_extension('ents', default=[], force=True)
@@ -176,7 +176,7 @@ def batch_multi_process(self,
             self._nlp.get_pipe(instance_name)
         except KeyError:
             component_name = spacy.util.get_object_name(self._ensure_serializable)
-            Language.component(name=component_name, func=self._ensure_serializable)
+            Language.component(name=component_name, func=self._ensure_serializable)  # type: ignore
             self._nlp.add_pipe(component_name, name=instance_name, last=True)

         n_process = n_process if n_process is not None else max(cpu_count() - 1, 1)
@@ -238,7 +238,7 @@ def _ensure_serializable(doc: Doc) -> Doc:

     def __call__(self, text: Union[str, Iterable[str]]) -> Union[Doc, List[Doc]]:
         if isinstance(text, str):
-            return self._nlp(text) if len(text) > 0 else None
+            return self._nlp(text) if len(text) > 0 else None  # type: ignore
         elif isinstance(text, Iterable):
             docs = []
             for t in text if isinstance(text, types.GeneratorType) else tqdm(text, total=len(list(text))):
@@ -249,7 +249,7 @@ def __call__(self, text: Union[str, Iterable[str]]) -> Union[Doc, List[Doc]]:
                     logger.warning(e, exc_info=True, stack_info=True)
                     doc = None
                 docs.append(doc)
-            return docs
+            return docs  # type: ignore
         else:
             logger.error("The input text should be either a string or a sequence of strings but got: %s", type(text))
             return None
8 changes: 4 additions & 4 deletions medcat/pipeline/pipe_runner.py
@@ -24,7 +24,7 @@ def __call__(self, doc: Doc):
         raise NotImplementedError("Method __call__ has not been implemented.")

     # Override
-    def pipe(self, stream: Iterable[Doc], batch_size: int, **kwargs) -> Union[Generator[Doc, None, None], Iterator[Doc]]:
+    def pipe(self, stream: Iterable[Doc], batch_size: int, **kwargs) -> Union[Generator[Doc, None, None], Iterator[Doc]]:  # type: ignore
         error_handler = self.get_error_handler()
         if kwargs.get("parallel", False):
             PipeRunner._execute, PipeRunner._delayed = self._lazy_init_pool()
@@ -35,14 +35,14 @@ def pipe(self, stream: Iterable[Doc], batch_size: int, **kwargs) -> Union[Genera
                 for output_doc in PipeRunner._execute(tasks):
                     yield PipeRunner.deserialize_entities(output_doc)
             except Exception as e:
-                error_handler(self.name, self, docs, e)
+                error_handler(self.name, self, docs, e)  # type: ignore
                 yield from [None] * len(docs)
         else:
             for doc in stream:
                 try:
                     yield self(doc)
                 except Exception as e:
-                    error_handler(self.name, self, [doc], e)
+                    error_handler(self.name, self, [doc], e)  # type: ignore
                     yield None

     @staticmethod
@@ -89,7 +89,7 @@ def deserialize_entities(doc: Doc):

     @staticmethod
     def _run_pipe_on_one(call: Callable, doc: Doc, underscore_state: Tuple) -> Doc:
-        Underscore.load_state(underscore_state)
+        Underscore.load_state(underscore_state)  # type: ignore
         doc = PipeRunner.deserialize_entities(doc)
         doc = call(doc)
         doc = PipeRunner.serialize_entities(doc)
4 changes: 2 additions & 2 deletions medcat/utils/postprocessing.py
@@ -31,7 +31,7 @@ def make_pretty_labels(cdb: CDB, doc: Doc, style: Optional[LabelStyle] = None) -
             setattr(n_ent._, attr, getattr(ent._, attr))
         n_ents.append(n_ent)

-    doc.ents = n_ents
+    doc.ents = n_ents  # type: ignore


 def create_main_ann(cdb: CDB, doc: Doc, tuis: Optional[List] = None) -> None:
@@ -59,4 +59,4 @@ def create_main_ann(cdb: CDB, doc: Doc, tuis: Optional[List] = None) -> None:
                 tkns_in.add(tkn)
             main_anns.append(ent)

-    doc.ents = list(doc.ents) + main_anns
+    doc.ents = list(doc.ents) + main_anns  # type: ignore
4 changes: 2 additions & 2 deletions requirements-dev.txt
@@ -1,7 +1,7 @@
 .
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl
 flake8==4.0.1
-mypy==0.931
+mypy==0.981
 mypy-extensions==0.4.3
 types-aiofiles==0.8.3
 types-PyYAML==6.0.3
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
 .
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl
+https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl
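
Both requirements files bump the bundled spaCy model wheel from en_core_web_md 3.1.0 to 3.4.0, matching the relaxed `spacy>=3.1.0` pin in setup.py. A quick compatibility check after installing (a sketch; assumes spacy and the model wheel are present):

import spacy

nlp = spacy.load("en_core_web_md")
# The model's version line should match the installed spaCy minor release (e.g. 3.4.x)
print(spacy.__version__, nlp.meta["version"])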
48 changes: 25 additions & 23 deletions setup.py
@@ -18,40 +18,42 @@
               'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.neo', 'medcat.utils.ner',
               'medcat.utils.saving', 'medcat.utils.regression'],
     install_requires=[
-        'numpy>=1.21.4',
-        'pandas>=1.1.5',
-        'gensim~=4.1.2',
-        'spacy<3.1.4,>=3.1.0',
-        'scipy>=1.5.4',
-        'transformers~=4.19.2',
-        'torch>=1.0',
+        'numpy>=1.22.0', # first to support 3.11
+        'pandas>=1.4.2', # first to support 3.11
+        'gensim>=4.3.0', # first to support 3.11
+        'spacy>=3.1.0',
+        'scipy~=1.9.2', # first to support 3.11
+        'transformers>=4.19.2',
+        'torch>=1.13.0', # first to support 3.11
         'tqdm>=4.27',
-        'scikit-learn<1.2.0',
+        'scikit-learn>=1.1.3', # first to support 3.11
         'elasticsearch>=8.3,<9', # Check if this is compatible with opensearch otherwise: 'elasticsearch>=7.10,<8.0.0',
         'eland>=8.3.0,<9',
-        'dill~=0.3.4,<0.3.5', # less than 0.3.5 due to datasets requirement
-        'datasets~=2.2.2',
-        'jsonpickle~=2.0.0',
+        'dill>=0.3.4', # allow later versions with later versions of datasets (tested with 0.3.6)
+        'datasets>=2.2.2', # allow later versions, tested with 2.7.1
+        'jsonpickle>=2.0.0', # allow later versions, tested with 3.0.0
         'psutil>=5.8.0',
-        # 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets
-        'multiprocess==0.70.12', # seems to work better than standard mp
-        'py2neo==2021.2.3',
-        'aiofiles~=0.8.0',
-        'ipywidgets~=7.6.5',
-        'xxhash==3.0.0',
-        'blis<=0.7.5',
-        'click<=8.0.4', # Spacy breaks without this
-        'pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4', # identical constraints to thinc and spacy
+        'multiprocess~=0.70.12', # 0.70.14 seemed to work just fine
+        'py2neo~=2021.2.3',
+        'aiofiles>=0.8.0', # allow later versions, tested with 22.1.0
+        'ipywidgets>=7.6.5', # allow later versions, tested with 0.8.0
+        'xxhash>=3.0.0', # allow later versions, tested with 3.1.0
+        'blis>=0.7.5', # allow later versions, tested with 0.7.9
+        'click>=8.0.4', # allow later versions, tested with 8.1.3
+        'pydantic>=1.10.0', # for spacy compatibility
         # the following are not direct dependencies of MedCAT but needed for docs/building
-        'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec
-        'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy
-        'joblib~=1.2',
+        # hopefully will no longer need the transitive dependencies
+        # 'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec
+        # 'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy
+        # 'joblib~=1.2',
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
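
The pattern throughout setup.py is the same: upper-bounded pins (`==`, `<=`, `~=`) become `>=` floors set at the first release with CPython 3.11 support. A sketch for sanity-checking such floors in an installed environment (floor values copied from the diff; the check itself is hypothetical, not part of the repo):

from importlib.metadata import version
from packaging.version import Version  # provided by the 'packaging' package

floors = {"numpy": "1.22.0", "gensim": "4.3.0", "torch": "1.13.0", "scikit-learn": "1.1.3"}
for pkg, floor in floors.items():
    installed = Version(version(pkg))
    assert installed >= Version(floor), f"{pkg} {installed} is below the floor {floor}"
    print(f"{pkg} {installed} satisfies >= {floor}")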
1 change: 1 addition & 0 deletions tests/helper.py
@@ -1,6 +1,7 @@
 import os
 import requests
 import unittest
+import unittest.mock

 import numpy as np
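
The visible change is only an explicit `unittest.mock` import (importing `unittest` alone does not expose the `mock` submodule), but the CU-862hyd5wx commits above also describe unifying vocab downloads and failing meaningfully on HTTP 503 when Rosalind is down. A hypothetical sketch of that pattern (function name and error handling assumed, not the repo's actual helper):

import requests

def download_vocab(url: str, dest: str) -> None:
    """Fetch a vocab file, failing with a clear message if the host is down."""
    response = requests.get(url, timeout=30)
    if response.status_code == 503:
        raise RuntimeError(
            f"Vocab host returned 503 (service unavailable) for {url}; "
            "consider generating a simple local vocab instead.")
    response.raise_for_status()
    with open(dest, "wb") as f:
        f.write(response.content)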
3 changes: 2 additions & 1 deletion tests/test_ner.py
@@ -67,7 +67,8 @@ def tearDownClass(cls) -> None:
         cls.pipe.destroy()

     def test_aa_cdb_names_output(self):
-        target_result = {'S-229004': {'movar~viruse', 'movar', 'movar~viruses'}, 'S-229005': {'cdb'}}
+        print("Fixing 'movar~viruse' -> 'movar~virus' for newer en_core_web_md")
+        target_result = {'S-229004': {'movar~virus', 'movar', 'movar~viruses'}, 'S-229005': {'cdb'}}
         self.assertEqual(self.cdb.cui2names, target_result)

     def test_ab_entities_length(self):
