Merge branch 'master' into production

CogStack · Aug 28, 2024 · 34e5cde · 34e5cde
2 parents f6f3654 + 540224c
commit 34e5cde
Show file tree

Hide file tree

Showing 76 changed files with 3,961 additions and 3,358 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -40,7 +40,8 @@ jobs:
           second_half_nl=$(echo "$all_files" | tail -n +$(($midpoint + 1)))
           timeout 25m python -m unittest ${first_half_nl[@]}
           timeout 25m python -m unittest ${second_half_nl[@]}
-
+      - name: Regression
+        run: source tests/resources/regression/run_regression.sh
       - name: Get the latest release version
         id: get_latest_release
         uses: actions/github-script@v6

diff --git a/configs/default_regression_tests.yml b/configs/default_regression_tests.yml
@@ -1,79 +1,142 @@
-# # Example of some test cases
-# # They will try to cover as many possible use cases as possible
-# # The idea is that the CUI corresponding to the name is expected to be
-# # obtained by MedCAT
-# # Only the 'filters' under 'targeting' and the 'phrases' under
-# # the test case are the two required sections, the rest is optional 
-#
-# test-case-name-1: # name of this test case
-#   targeting: # info regarding targets of this test case
-#     strategy: "ALL" # the strategy for dealing with the filters below
-#                     # so "ALL" means the targets need to match all the below filters
-#                     # and "ANY" means that the targets need to match at least one of the filters
-#                     # if only one type of target it specified, this is irrelevant
-#                     # the default value is "ALL" if not specified
-#     prefname-only: False # set to True if only prefered names should be checked (defaults to False)
-#     targfiltersets: # the filters for this specific test case
-#                     # there has to be one type of target, but multiple can be specified
-#                     # if multiple types are target, the strategy defined above is taken into affect
-#                     # each type can specify one or multiple values
-#                     #  this example shows has one values 
-#                     #  the next example (below) will have multiple values
-#       type_id: "0123" # type_id or type_ids
-#       cui: "01230" # the target CUI (or list of CUIS)
-#       name: "name0" # the target names
-#                      # all specified names need to exist within the CDB
-#   phrases: "The quick brown %s jumped over the lazy cat" # the phrases to go through
-#                                                          # for each phrases, '%s' is replaced
-#                                                          # by each name that is to be tested
-# test-case-name-2: # name of this test case
-#   targeting:
-#     filters:
-#       type_id: # multiple target type IDs
-#       - "123"
-#       - "223"
-#       cui: # multiple target CUI
-#       - "1234"
-#       - "2234"
-#       name: # multiple names
-#       - "name1"
-#       - "name2"
-#       cui_and_children: # an example with CUI and children
-#         cui: '111' # the CUI (or CUIs)
-#         depth: 2   # and the depth of children
-#   phrases:
-#   - "The %s was measured"
-#   - "The %s was not measured"
-#
-# # The following example was (rather arbitrarily) created and should work for
-# # the included SNOMED models
-test-case-1:
-  targeting:
-    strategy: "ALL"
-    filters:
-      type_id: "2680757"
-  phrases:
-  - "The %s was measured"
+# this is an example test case
+# it is based on SNOMED-CT
+test-case-1:  # The (somewhat) arbitrary name of the test case
+  targeting:  # the description of the replacement targets in the phrase(s)
+    placeholders:  # the placeholders to replace in the phrase(s)
+                   # Note that only 1 concept will be tested for at one time.
+                   # So if the phrase(s) has/have more than 1 placeholder, the
+                   # rest of them will be substituted in without care for whether
+                   # or how accurately the model is able to recognise them.
+                   # For the concepts that are not under test at a given time
+                   # the "first" name is used (because the implementation has
+                   # names in a set, there is possibility for run-to-run variance
+                   # because of different names being used).
+                   #
+                   # There are 2 modes for the placeholders:
+                   # 1. any-combination: false
+                   #   In this mode, only the concepts in the same position
+                   #   in the various lists are used in conjunction with one another.
+                   #   Though this also means that it is expected that all of the
+                   #   placeholders have the same number of CUIs to use.
+                   #   Assuming each of the N placeholders defines M replacement
+                   #   cuis, this approach produces M*N cases.
+                   # 2. any-combination: true
+                   #   In this mode, any combination of the replacement CUIs is
+                   #   allowed. This means that quite a few different combinations
+                   #   will be generated and used. It also means that different
+                   #   placeholders can have a different number of concepts suitable
+                   #   for them.
+                   #   Assuming each of the N placeholders defines M replacement
+                   #   cuis, this approach produces N * N^M (where `^` is power)
+                   #   cases. But for a more complicated setup (i.e. where different
+                   #   placeholders have a different number of swappable CUIs)
+                   #   this calculation is not as straightforward.
+                   #
+                   # NOTE: The above description does not take into account different
+                   #       number of names associated with different concepts. For each
+                   #       of the "primary" concepts, each possible name is attempted.
+      - placeholder: '[DISORDER]'  # the placeholder that will be substituted in the phrase(s)
+        cuis: ['4473006',  # Intracerebral hemorrhage
+               '85189001',  # Acute appendicitis
+               '186738001',  # vestibular neuritis
+               '186738001',  # vestibular neuritis
+              ]
+      - placeholder: '[FINDING1]'
+        cuis: ['162300006',  # unilateral headache
+               '21522001',  # abdominal pain
+               '103298005',  # severe vertigo
+               '103298005',  # severe vertigo
+              ]
+        prefname-only: false  # this is an optional keyword for each placeholder
+                              # if set to true, only the preferred name will be used for
+                              # this concept. Otherwise, all names will be used as
+                              # different sub-cases
+      - placeholder: '[FINDING2]'
+        cuis: ['409668002',  # photophobia
+               '422587007',  # nausea
+               '422587007',  # nausea
+               '422587007',  # nausea
+              ]
+      - placeholder: '[FINDING3]'
+        cuis: ['2228002',  # scintillating scotoma
+               '386661006',  # fever
+               '81756001',  # horizontal nystagmus
+               '81756001',  # horizontal nystagmus
+              ]
+      - placeholder: '[NEGFINDING]'
+        cuis: ['386661006',  # fever
+               '62315008',  # diarrhea
+               '15188001',  # hearing loss
+               '60862001',  # tinnitus
+              ]
+    any-combination: false  # if set to false, same length of CUIs is expected
+                            # for each placeholder and only same-position CUIs are combined
+  phrases:  # The list of phrases
+  - >
+      Description: [DISORDER]
+
+      CC: [FINDING1] on presentation; then developed [FINDING3]
+
+      HX: On the day of presentation, this 32 y/o RHM suddenly developed [FINDING1] and [FINDING2].
+      Four hours later he experienced sudden [FINDING3] lasting two hours.
+      There were no other associated symptoms except for the [FINDING1] and [FINDING2].
+      He denied [NEGFINDING].
 test-case-2:
   targeting:
-    filters:
-      type_id: "9090192"
-  phrases:
-  - "Patient presented with %s"
-  - "No %s was present"
-test-case-3:
-  targeting:
-    filters:
-      type_id: "67667581"
-  phrases:
-  - "The patient has been diagnosed with %s"
-  - "There are no signs of %s"
-test-case-4:
-  targeting:
-    strategy: "ALL"
-    filters:
-      cui_and_children:
-       cui: "364075005" # 'heart rate'
-       depth: 4         # and children 4 deep
+    placeholders:
+      - placeholder: '[FINDING1]'
+        cuis: ['49727002',  # cough
+               '29857009',  # chest pain
+               '21522001',  # abdominal pain
+               '57676002',  # joint pain
+               '25064002',  # headache
+               '271807003',  # fever
+               '162397003',  # hematuria (blood in urine)
+               '271757001',  # fatigue
+               '386661006',  # weight loss
+               '62315008',  # dysuria (painful urination)
+              ]
+      - placeholder: '[FINDING2]'
+        cuis: ['267036007',  # shortness of breath
+               '68962001',  # palpatations
+               '422587007',  # nausea
+               '182888003',  # swelling
+               '404640003',  # dizziness
+               '422400008',  # sore throat
+               '267036007',  # shortness of breath
+               '267064002',  # night sweats
+               '162607003',  # back pain
+               '267102003',  # urinary frequency
+              ]
+      - placeholder: '[DISORDER]'
+        cuis: ['195967001',  # asthma
+               '194828000',  # angina pectoris
+               '25374005',  # gastroenteritis
+               '69896004',  # rheumatoid arthritis
+               '37796009',  # migraine
+               '186747009',  # influenza
+               '106063007',  # urinary tract infection
+               '444814009',  # chronic fatigue syndrome
+               '95281007',  # tuberculosis
+               '431855005',  # cystitis
+        ]
+    any-combination: false
   phrases:
-  - "The patient's %s was 82 bps"
+  - >
+      The patient presents with [FINDING1] and [FINDING2]. These findings are suggestive of [DISORDER].
+      Further diagnostic evaluation and investigations are required to confirm the diagnosis.
+  - >
+      The patient reports [FINDING1] and has also been experiencing [FINDING2]. These symptoms are consistent with a clinical presentation of [DISORDER].
+      Further assessment and diagnostic tests are required to establish the underlying cause.
+  - >
+      Upon evaluation, the patient exhibits [FINDING1] along with [FINDING2]. This combination of findings raises suspicion for [DISORDER].
+      Comprehensive diagnostic workup is advised to confirm the diagnosis and plan appropriate management.
+  - >
+      During the consultation, the patient described [FINDING1] and noted a recent history of [FINDING2]. These clinical features are suggestive of [DISORDER].
+      Further investigation is necessary to verify the diagnosis and rule out other potential causes.
+  - >
+      The patient's symptoms include [FINDING1] and [FINDING2], which are commonly associated with [DISORDER].
+      It is recommended that additional diagnostic procedures be performed to confirm this working diagnosis.
+  - >
+      The clinical presentation of [FINDING1] and [FINDING2] is indicative of [DISORDER].
+      To ensure accurate diagnosis, further clinical evaluation and diagnostic tests are required.
diff --git a/install_requires.txt b/install_requires.txt
@@ -17,7 +17,7 @@
 'aiofiles>=0.8.0' # allow later versions, tested with 22.1.0
 'ipywidgets>=7.6.5' # allow later versions, tested with 0.8.0
 'xxhash>=3.0.0' # allow later versions, tested with 3.1.0
-'blis>=0.7.5' # allow later versions, tested with 0.7.9
+'blis>=0.7.5,<1.0.0' # allow later versions, tested with 0.7.9, avoid 1.0.0 (depends on numpy 2)
 'click>=8.0.4' # allow later versions, tested with 8.1.3
 'pydantic>=1.10.0,<2.0' # for spacy compatibility; avoid 2.0 due to breaking changes
 "humanfriendly~=10.0"  # for human readable file / RAM sizes

diff --git a/medcat/cat.py b/medcat/cat.py
@@ -41,6 +41,7 @@
 from medcat.utils.saving.envsnapshot import get_environment_info, ENV_SNAPSHOT_FILE_NAME
 from medcat.stats.stats import get_stats
 from medcat.utils.filters import set_project_filters
+from medcat.utils.usage_monitoring import UsageMonitor
 
 
 logger = logging.getLogger(__name__) # separate logger from the package-level one
@@ -53,7 +54,7 @@
 
 class CAT(object):
     """The main MedCAT class used to annotate documents, it is built on top of spaCy
-    and works as a spaCy pipline. Creates an instance of a spaCy pipline that can
+    and works as a spaCy pipeline. Creates an instance of a spaCy pipeline that can
     be used as a spacy nlp model.
 
     Args:
@@ -108,6 +109,7 @@ def __init__(self,
         self._rel_cats = rel_cats
         self._addl_ner = addl_ner if isinstance(addl_ner, list) else [addl_ner]
         self._create_pipeline(self.config)
+        self.usage_monitor = UsageMonitor(self.config.version.id, self.config.general.usage_monitor)
 
     def _create_pipeline(self, config: Config):
         # Set log level
@@ -158,6 +160,10 @@ def get_hash(self, force_recalc: bool = False) -> str:
             str: The resulting hash
         """
         hasher = Hasher()
+        if self.config.general.simple_hash:
+            logger.info("Using simplified hashing that only takes into account the model card")
+            hasher.update(self.get_model_card())
+            return hasher.hexdigest()
         hasher.update(self.cdb.get_hash(force_recalc))
 
         hasher.update(self.config.get_hash())
@@ -258,7 +264,7 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
         if cdb_format.lower() == 'json':
             json_path = save_dir_path # in the same folder!
         else:
-            json_path = None # use dill formating
+            json_path = None # use dill formatting
         logger.info('Saving model pack with CDB in %s format', cdb_format)
 
         # expand user path to make this work with '~'
@@ -339,7 +345,7 @@ def attempt_unpack(cls, zip_path: str) -> str:
 
         model_pack_path = os.path.join(base_dir, foldername)
         if os.path.exists(model_pack_path):
-            logger.info("Found an existing unziped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
+            logger.info("Found an existing unzipped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
         else:
             logger.info("Unziping the model pack and loading models.")
             shutil.unpack_archive(zip_path, extract_dir=model_pack_path)
@@ -350,6 +356,7 @@ def load_model_pack(cls,
                         zip_path: str,
                         meta_cat_config_dict: Optional[Dict] = None,
                         ner_config_dict: Optional[Dict] = None,
+                        medcat_config_dict: Optional[Dict] = None,
                         load_meta_models: bool = True,
                         load_addl_ner: bool = True,
                         load_rel_models: bool = True) -> "CAT":
@@ -367,6 +374,10 @@ def load_model_pack(cls,
                 A config dict that will overwrite existing configs in transformers ner.
                 e.g. ner_config_dict = {'general': {'chunking_overlap_window': 6}.
                 Defaults to None.
+            medcat_config_dict (Optional[Dict]):
+                A config dict that will overwrite existing configs in the main medcat config
+                before pipe initialisation. This can be useful if wanting to change something
+                that only takes effect at init time (e.g spacy model). Defaults to None.
             load_meta_models (bool):
                 Whether to load MetaCAT models if present (Default value True).
             load_addl_ner (bool):
@@ -389,7 +400,7 @@ def load_model_pack(cls,
 
         # load config
         config_path = os.path.join(model_pack_path, "config.json")
-        cdb.load_config(config_path)
+        cdb.load_config(config_path, medcat_config_dict)
 
         # TODO load addl_ner
 
@@ -493,8 +504,26 @@ def __call__(self, text: Optional[str], do_train: bool = False) -> Optional[Doc]
             logger.error("The input text should be either a string or a sequence of strings but got %s", type(text))
             return None
         else:
-            text = self._get_trimmed_text(str(text))
-            return self.pipe(text)  # type: ignore
+            text = str(text)  # NOTE: shouldn't be necessary but left it in
+            if self.config.general.usage_monitor.enabled:
+                l1 = len(text)
+                text = self._get_trimmed_text(text)
+                l2 = len(text)
+                rval = self.pipe(text)
+                # NOTE: pipe returns Doc (not List[Doc]) since we passed str (not List[str])
+                #       that's why we ignore type here
+                #       But it could still be None if the text is empty
+                if rval is None:
+                    nents = 0
+                elif self.config.general.show_nested_entities:
+                    nents = len(rval._.ents)  # type: ignore
+                else:
+                    nents = len(rval.ents)  # type: ignore
+                self.usage_monitor.log_inference(l1, l2, nents)
+                return rval  # type: ignore
+            else:
+                text = self._get_trimmed_text(text)
+                return self.pipe(text)  # type: ignore
 
     def __repr__(self) -> str:
         """Prints the model_card for this CAT instance.
@@ -525,7 +554,7 @@ def _print_stats(self,
                 Each project in MedCATtrainer can have filters, do we want to respect those filters
                 when calculating metrics.
             use_overlaps (bool):
-                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entites.
+                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
             use_cui_doc_limit (bool):
                 If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
                 if the document was annotated for that CUI. Useful in very specific situations when during the annotation
@@ -641,7 +670,7 @@ def add_cui_to_group(self, cui: str, group_name: str) -> None:
             cui (str):
                 The concept to be added.
             group_name (str):
-                The group to whcih the concept will be added.
+                The group to which the concept will be added.
 
         Examples:
 
@@ -1193,7 +1222,7 @@ def _run_nn_components(self, docs: Dict, nn_components: List, id2text: Dict) ->
         for name, component in nn_components:
             component.config.general['disable_component_lock'] = True
 
-        # For meta_cat compoments 
+        # For meta_cat components
         for name, component in [c for c in nn_components if isinstance(c[1], MetaCAT)]:
             spacy_docs = component.pipe(spacy_docs)
         for spacy_doc in spacy_docs:
@@ -1341,7 +1370,7 @@ def multiprocessing_batch_char_size(self,
 
         docs = {}
         _start_time = time.time()
-        _batch_counter = 0 # Used for splitting the output, counts batches inbetween saves
+        _batch_counter = 0 # Used for splitting the output, counts batches between saves
         for batch in self._batch_generator(iterator, batch_size_chars, skip_ids=set(annotated_ids)):
             logger.info("Annotated until now: %s docs; Current BS: %s docs; Elapsed time: %.2f minutes",
                           len(annotated_ids),