From f9c19fca0288b3b918c788a59e057e8e8998ae35 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Sun, 3 Apr 2022 16:18:04 +0200 Subject: [PATCH] added a flag to include classifying each sentence in spaCy. --- classy_classification/__init__.py | 58 ++++++------ .../classifiers/spacy_few_shot_external.py | 47 +++++++--- .../classifiers/spacy_internal.py | 52 +++++++---- .../classifiers/spacy_zero_shot_external.py | 93 ++++++++++--------- .../examples/spacy_few_shot_external.py | 6 +- .../examples/spacy_internal_embeddings.py | 8 +- .../examples/spacy_zero_shot_external.py | 6 +- pyproject.toml | 2 +- test.py | 9 ++ 9 files changed, 168 insertions(+), 113 deletions(-) create mode 100644 test.py diff --git a/classy_classification/__init__.py b/classy_classification/__init__.py index f3b6d1c..d5a0f5e 100644 --- a/classy_classification/__init__.py +++ b/classy_classification/__init__.py @@ -5,18 +5,13 @@ from spacy.language import Language from spacy.tokens import Doc -from .classifiers.sentence_transformer import \ - classySentenceTransformer as classyClassifier +from .classifiers.sentence_transformer import classySentenceTransformer as classyClassifier from .classifiers.spacy_few_shot_external import classySpacyFewShotExternal from .classifiers.spacy_internal import classySpacyInternal from .classifiers.spacy_zero_shot_external import classySpacyZeroShotExternal -__all__ = [ - 'classyClassifier', - 'classySpacyFewShotExternal', - 'classySpacyZeroShotExternal', - 'classySpacyInternal' -] +__all__ = ["classyClassifier", "classySpacyFewShotExternal", "classySpacyZeroShotExternal", "classySpacyInternal"] + @Language.factory( "text_categorizer", @@ -24,12 +19,10 @@ "data": None, "model": None, "device": "cpu", - "config": { - "C": [1, 2, 5, 10, 20, 100], - "kernels": ["linear"], - "max_cross_validation_folds": 5 - }, - "cat_type": 'few' + "config": {"C": [1, 2, 5, 10, 20, 100], "kernels": ["linear"], "max_cross_validation_folds": 5}, + "cat_type": "few", + "include_doc": True, + "include_sent": False, }, ) def make_text_categorizer( @@ -39,48 +32,57 @@ def make_text_categorizer( device: str, config: dict, model: str = None, - cat_type: str = 'few', -): - if model == 'spacy': - if cat_type == 'zero': - raise NotImplementedError('cannot use spacy internal embeddings with zero-shot classification') + cat_type: str = "few", + include_doc: bool = True, + include_sent: bool = False, +): + if model == "spacy": + if cat_type == "zero": + raise NotImplementedError("cannot use spacy internal embeddings with zero-shot classification") return classySpacyInternal( - nlp=nlp, - name=name, - data=data, - config=config, + nlp=nlp, name=name, data=data, config=config, include_doc=include_doc, include_sent=include_sent ) else: - if cat_type == 'zero': + if cat_type == "zero": if model: return classySpacyZeroShotExternal( + nlp=nlp, name=name, data=data, device=device, - model=model + model=model, + include_doc=include_doc, + include_sent=include_sent, ) else: return classySpacyZeroShotExternal( + nlp=nlp, name=name, data=data, device=device, - model=model + model=model, + include_doc=include_doc, + include_sent=include_sent, ) else: if model: return classySpacyFewShotExternal( + nlp=nlp, name=name, data=data, device=device, model=model, config=config, + include_doc=include_doc, + include_sent=include_sent, ) else: return classySpacyFewShotExternal( + nlp=nlp, name=name, data=data, device=device, config=config, + include_doc=include_doc, + include_sent=include_sent, ) - - diff --git a/classy_classification/classifiers/spacy_few_shot_external.py b/classy_classification/classifiers/spacy_few_shot_external.py index 691bdd7..eabb4e9 100644 --- a/classy_classification/classifiers/spacy_few_shot_external.py +++ b/classy_classification/classifiers/spacy_few_shot_external.py @@ -1,17 +1,21 @@ -import os - from spacy import util -from spacy.language import Language -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from .sentence_transformer import classySentenceTransformer class classySpacyFewShotExternal(classySentenceTransformer): - def __init__(self, name, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, nlp, name, data, device, config, include_doc, include_sent, *args, **kwargs): + super().__init__(data=data, device=device, config=config, *args, **kwargs) self.name = name - Doc.set_extension("cats", default=None, force=True) + self.include_doc = include_doc + self.include_sent = include_sent + if include_sent: + Span.set_extension("cats", default=None, force=True) + if "sentencizer" not in nlp.pipe_names: + nlp.add_pipe("sentencizer") + if include_doc: + Doc.set_extension("cats", default=None, force=True) def __call__(self, doc: Doc) -> Doc: """ @@ -23,8 +27,11 @@ def __call__(self, doc: Doc) -> Doc: Returns: Doc: spacy doc with ._.cats key-class proba-value dict """ - pred_result = super(self.__class__, self).__call__(doc.text.replace("\n", " ")) - doc._.cats = pred_result + if self.include_doc: + pred_result = super(self.__class__, self).__call__(doc.text.replace("\n", " ")) + doc._.cats = pred_result + if self.include_sent: + doc = self.set_pred_results_for_doc(doc) return doc @@ -39,11 +46,21 @@ def pipe(self, stream, batch_size=128): Doc: spacy doc with ._.cats key-class proba-value dict """ for docs in util.minibatch(stream, size=batch_size): - texts = [doc.text.replace("\n", " ") for doc in docs] - pred_results = super(self.__class__, self).pipe(texts) - + pred_results = [doc.text.replace("\n", " ") for doc in docs] + + if self.include_doc: + pred_results = super(self.__class__, self).pipe(pred_results) + for doc, pred_result in zip(docs, pred_results): - doc._.cats = pred_result - + if self.include_doc: + doc._.cats = pred_result + if self.include_sent: + doc = self.set_pred_results_for_doc(doc) yield doc - \ No newline at end of file + + def set_pred_results_for_doc(self, doc: Doc): + pred_results = super(self.__class__, self).pipe([sent.text for sent in list(doc.sents)]) + for sent, pred in zip(doc.sents, pred_results): + sent._.cats = pred + + return doc diff --git a/classy_classification/classifiers/spacy_internal.py b/classy_classification/classifiers/spacy_internal.py index dd7ac65..54721f6 100644 --- a/classy_classification/classifiers/spacy_internal.py +++ b/classy_classification/classifiers/spacy_internal.py @@ -1,22 +1,29 @@ from typing import List from spacy import util -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from .classy_skeleton import classySkeleton class classySpacyInternal(classySkeleton): - def __init__(self, nlp, name, *args, **kwargs): - super().__init__(*args, **kwargs) - Doc.set_extension("cats", default=None, force=True) + def __init__(self, nlp, name, data, config, include_doc, include_sent): + super().__init__(data=data, config=config) + self.include_doc = include_doc + self.include_sent = include_sent + if include_sent: + Span.set_extension("cats", default=None, force=True) + if "sentencizer" not in nlp.pipe_names: + nlp.add_pipe("sentencizer") + if include_doc: + Doc.set_extension("cats", default=None, force=True) self.name = name self.nlp = nlp self.set_training_data() self.set_svc() def get_embeddings(self, text: List[str]) -> List[float]: - """ Retrieve embeddings from text. + """Retrieve embeddings from text. Overwrites function from the classySkeleton that is used to get embeddings for training data. Args: @@ -27,9 +34,9 @@ def get_embeddings(self, text: List[str]) -> List[float]: """ docs = self.nlp.pipe(text) embeddings = [self.get_embeddings_from_doc(doc) for doc in docs] - + return embeddings - + def get_embeddings_from_doc(self, doc: Doc) -> List[float]: """Retrieve a vector from a spacy doc and internal embeddings. @@ -59,9 +66,12 @@ def __call__(self, doc: Doc): Returns: Doc: spacy doc with ._.cats key-class proba-value dict """ - embeddings = self.get_embeddings_from_doc(doc) - embeddings = embeddings.reshape(1, -1) - doc._.cats = self.get_prediction(embeddings)[0] + if self.include_doc: + embeddings = self.get_embeddings_from_doc(doc) + embeddings = embeddings.reshape(1, -1) + doc._.cats = self.get_prediction(embeddings)[0] + if self.include_sent: + doc = self.set_pred_results_for_doc(doc) return doc @@ -76,11 +86,21 @@ def pipe(self, stream, batch_size=128): Doc: spacy doc with ._.cats key-class proba-value dict """ for docs in util.minibatch(stream, size=batch_size): - embeddings = [self.get_embeddings_from_doc(doc) for doc in docs] - pred_results = self.get_prediction(embeddings) - + pred_results = [self.get_embeddings_from_doc(doc) for doc in docs] + if self.include_doc: + pred_results = self.get_prediction(pred_results) + for doc, pred_result in zip(docs, pred_results): - doc._.cats = pred_result - + if self.include_doc: + doc._.cats = pred_result + if self.include_sent: + doc = self.set_pred_results_for_doc(doc) + yield doc - \ No newline at end of file + + def set_pred_results_for_doc(self, doc: Doc): + embeddings = [sent.as_doc().vector for sent in list(doc.sents)] + pred_results = self.get_prediction(embeddings) + for sent, pred in zip(doc.sents, pred_results): + sent._.cats = pred + return doc diff --git a/classy_classification/classifiers/spacy_zero_shot_external.py b/classy_classification/classifiers/spacy_zero_shot_external.py index cd6d37f..a5da2ef 100644 --- a/classy_classification/classifiers/spacy_zero_shot_external.py +++ b/classy_classification/classifiers/spacy_zero_shot_external.py @@ -1,20 +1,31 @@ -from spacy import util -from spacy.tokens import Doc +from spacy import Language, util +from spacy.tokens import Doc, Span from transformers import pipeline class classySpacyZeroShotExternal(object): - def __init__(self, - name: str, - data: dict, - model: str = 'facebook/bart-large-mnli', - device: str = 'cpu', - ): + def __init__( + self, + nlp: Language, + name: str, + data: dict, + model: str = "facebook/bart-large-mnli", + device: str = "cpu", + include_doc: bool = False, + include_sent: bool = False, + ): self.data = data self.name = name self.device = device self.model = model - Doc.set_extension("cats", default=None, force=True) + self.include_doc = include_doc + self.include_sent = include_sent + if include_sent: + Span.set_extension("cats", default=None, force=True) + if "sentencizer" not in nlp.pipe_names: + nlp.add_pipe("sentencizer") + if include_doc: + Doc.set_extension("cats", default=None, force=True) self.set_classification_model() def __call__(self, doc: Doc) -> Doc: @@ -27,11 +38,21 @@ def __call__(self, doc: Doc) -> Doc: Returns: Doc: spacy doc with ._.cats key-class proba-value dict """ - pred_result = self.pipeline(doc.text, self.data) - doc._.cats = self.format_prediction(pred_result) + if self.include_doc: + pred_result = self.pipeline(doc.text, self.data) + doc._.cats = self.format_prediction(pred_result) + if self.include_sent: + doc = self.set_pred_results_for_doc(doc) return doc + def set_pred_results_for_doc(self, doc: Doc): + pred_results = self.pipeline([sent.text for sent in list(doc.sents)], self.data) + pred_results = [self.format_prediction(pred) for pred in pred_results] + for sent, pred in zip(doc.sents, pred_results): + sent._.cats = pred + return doc + def pipe(self, stream, batch_size=128): """ predict the class for a spacy Doc stream @@ -43,48 +64,34 @@ def pipe(self, stream, batch_size=128): Doc: spacy doc with ._.cats key-class proba-value dict """ for docs in util.minibatch(stream, size=batch_size): - texts = [doc.text.replace("\n", " ") for doc in docs] - predictions = self.pipeline(texts, self.data) - predictions = [self.format_prediction(pred) for pred in predictions] + predictions = [doc.text.replace("\n", " ") for doc in docs] + if self.include_doc: + predictions = self.pipeline(predictions, self.data) + predictions = [self.format_prediction(pred) for pred in predictions] for doc, pred_result in zip(docs, predictions): - doc._.cats = pred_result - + if self.include_doc: + doc._.cats = pred_result + if self.include_sent: + doc = self.set_pred_results_for_doc(doc) + yield doc - + def set_classification_model(self, model: str = None, device: str = None): - """ set the embedding model based on a sentencetransformer model or path + """set the embedding model based on a sentencetransformer model or path Args: model (str, optional): the model name. Defaults to self.model, if no model is provided. """ - if model: # update if overwritten + if model: # update if overwritten self.model = model if device: self.device = device - - if self.device == 'gpu': - self.pipeline = pipeline( - "zero-shot-classification", - model=self.model, - device=0 - ) + + if self.device == "gpu": + self.pipeline = pipeline("zero-shot-classification", model=self.model, device=0) else: - self.pipeline = pipeline( - "zero-shot-classification", - model=self.model - ) - + self.pipeline = pipeline("zero-shot-classification", model=self.model) + @staticmethod def format_prediction(prediction): - return [{label: score} for label, score in zip(prediction['labels'], prediction['scores'])] - - - - - - - - - - - + return [{label: score} for label, score in zip(prediction["labels"], prediction["scores"])] diff --git a/classy_classification/examples/spacy_few_shot_external.py b/classy_classification/examples/spacy_few_shot_external.py index c0cb5a7..0ce5cf0 100644 --- a/classy_classification/examples/spacy_few_shot_external.py +++ b/classy_classification/examples/spacy_few_shot_external.py @@ -3,7 +3,7 @@ from .data import training_data, validation_data -nlp = spacy.blank('en') -nlp.add_pipe('text_categorizer', config={'data': training_data}) -print(nlp(validation_data[0])._.cats) +nlp = spacy.blank("en") +nlp.add_pipe("text_categorizer", config={"data": training_data, "include_sent": True}) +print([sent._.cats for sent in nlp(validation_data[0]).sents]) print([doc._.cats for doc in nlp.pipe(validation_data)]) diff --git a/classy_classification/examples/spacy_internal_embeddings.py b/classy_classification/examples/spacy_internal_embeddings.py index 58cf7eb..8571290 100644 --- a/classy_classification/examples/spacy_internal_embeddings.py +++ b/classy_classification/examples/spacy_internal_embeddings.py @@ -1,9 +1,9 @@ +import classy_classification import spacy -import classy_classification from .data import training_data, validation_data -nlp = spacy.load('en_core_web_md') -nlp.add_pipe('text_categorizer', config={'data': training_data, 'model': 'spacy'}) -print(nlp(validation_data[0])._.cats) +nlp = spacy.load("en_core_web_md") +nlp.add_pipe("text_categorizer", config={"data": training_data, "model": "spacy", "include_sent": True}) +print([sent._.cats for sent in nlp(validation_data[0]).sents]) print([doc._.cats for doc in nlp.pipe(validation_data)]) diff --git a/classy_classification/examples/spacy_zero_shot_external.py b/classy_classification/examples/spacy_zero_shot_external.py index 12a05bc..dc98b19 100644 --- a/classy_classification/examples/spacy_zero_shot_external.py +++ b/classy_classification/examples/spacy_zero_shot_external.py @@ -3,7 +3,7 @@ from .data import training_data, validation_data -nlp = spacy.blank('en') -nlp.add_pipe('text_categorizer', config={'data': list(training_data.keys()), 'cat_type': 'zero'}) -print(nlp(validation_data[0])._.cats) +nlp = spacy.blank("en") +nlp.add_pipe("text_categorizer", config={"data": list(training_data.keys()), "cat_type": "zero", "include_sent": True}) +print([sent._.cats for sent in nlp(validation_data[0]).sents]) print([doc._.cats for doc in nlp.pipe(validation_data)]) diff --git a/pyproject.toml b/pyproject.toml index 0d7db3a..9ca02cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "classy-classification" -version = "0.3.6" +version = "0.4.0" description = "Have you every struggled with needing a Spacy TextCategorizer but didn't have the time to train one from scratch? Classy Classification is the way to go!" authors = ["David Berenstein "] license = "MIT" diff --git a/test.py b/test.py new file mode 100644 index 0000000..df19cb7 --- /dev/null +++ b/test.py @@ -0,0 +1,9 @@ +import spacy + +import classy_classification +from classy_classification.examples.data import training_data, validation_data + +nlp = spacy.blank("en") +nlp.add_pipe("text_categorizer", config={"data": list(training_data.keys()), "cat_type": "zero", "include_sent": True}) +print([sent._.cats for sent in nlp(validation_data[0]).sents]) +print([doc._.cats for doc in nlp.pipe(validation_data)])