Commit 7db2c91

removed overhead fasttext via duplicating only relevant txtai code, updated dependencies

davidberenstein1957 committed Sep 25, 2022
1 parent 4b4a68e commit 7db2c91
Showing 7 changed files with 1,053 additions and 1,321 deletions.
47 changes: 23 additions & 24 deletions .github/workflows/python-package.yml
@@ -5,37 +5,36 @@ name: Python package

on:
  push:
-    branches: [ main ]
+    branches: [main]
  pull_request:
-    branches: [ main ]
+    branches: [main]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.8", "3.9", "3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest pytest-cov
          python -m pip install poetry
          poetry export -f requirements.txt -o requirements.txt --without-hashes
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
          python -m spacy download en_core_web_md
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --max-complexity=18 --enable=W0614 --select=C,E,F,W,B,B950 --ignore=E203,E266,E501,W503 --exclude=.git,__pycache__,build,dist --max-line-length=119 --show-source --statistics
      - name: Test with pytest
        run: |
          pytest --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
5 changes: 2 additions & 3 deletions .gitignore
@@ -129,6 +129,5 @@ dmypy.json
.pyre/

# Downloaded models
-*.model
-*.model.*
+/models
*.onnx
58 changes: 40 additions & 18 deletions README.md
# Classy Classification
Have you ever struggled with needing a [Spacy TextCategorizer](https://spacy.io/api/textcategorizer) but didn't have the time to train one from scratch? Classy Classification is the way to go! For few-shot classification using [sentence-transformers](https://github.com/UKPLab/sentence-transformers) or [spaCy models](https://spacy.io/usage/models), provide a dictionary with labels and examples, or just provide a list of labels for zero-shot classification with [Huggingface zero-shot classifiers](https://huggingface.co/models?pipeline_tag=zero-shot-classification).

[![Current Release Version](https://img.shields.io/github/release/pandora-intelligence/classy-classification.svg?style=flat-square&logo=github)](https://github.com/pandora-intelligence/classy-classification/releases)
[![pypi Version](https://img.shields.io/pypi/v/classy-classification.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/classy-classification/)
`pip install classy-classification`
# Quickstart
## SpaCy embeddings
```python
import spacy
import classy_classification

data = {
    "furniture": ["This text is about chairs.",
                  "Couches, benches and televisions.",
                  "I really need to get a new sofa."],
    "kitchen": ["There also exist things like fridges.",
                "I hope to be getting a new stove today.",
                "Do you also have some ovens."]
}

nlp = spacy.load("en_core_web_md")
nlp.add_pipe(
    "text_categorizer",
    config={
        "data": data,
        "model": "spacy"
    }
)

print(nlp("I am looking for kitchen appliances.")._.cats)

# [{"label": "furniture", "score": 0.21}, {"label": "kitchen", "score": 0.79}]
```
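Under the hood, the few-shot route embeds both the labelled examples and the incoming text and compares them in vector space. The following is a conceptual numpy sketch of that idea using cosine similarity, with made-up 4-dimensional vectors standing in for `nlp(text).vector`; it is an illustration only, not classy-classification's actual scoring code (which fits a classifier on the example embeddings).

```python
import numpy as np

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical 4-dim "embeddings" standing in for real document vectors
label_vectors = {
    "furniture": np.array([1.0, 0.2, 0.0, 0.1]),
    "kitchen": np.array([0.1, 1.0, 0.3, 0.0]),
}
doc_vector = np.array([0.2, 0.9, 0.4, 0.0])

# Score the document against each label's representative vector
scores = {label: cosine(doc_vector, vec) for label, vec in label_vectors.items()}
best = max(scores, key=scores.get)
print(best)  # "kitchen"
```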
## Sentence-transformer embeddings
```python
import spacy
import classy_classification

data = {
    "furniture": ["This text is about chairs.",
                  "Couches, benches and televisions.",
                  "I really need to get a new sofa."],
    "kitchen": ["There also exist things like fridges.",
                "I hope to be getting a new stove today.",
                "Do you also have some ovens."]
}

nlp = spacy.blank("en")
nlp.add_pipe(
    "text_categorizer",
    config={
        "data": data,
        "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        "device": "gpu"
    }
)

print(nlp("I am looking for kitchen appliances.")._.cats)

# [{"label": "furniture", "score": 0.21}, {"label": "kitchen", "score": 0.79}]
```
## Huggingface zero-shot classifiers
```python
import spacy
import classy_classification

data = ["furniture", "kitchen"]

nlp = spacy.blank("en")
nlp.add_pipe(
    "text_categorizer",
    config={
        "data": data,
        "model": "facebook/bart-large-mnli",
        "cat_type": "zero",
        "device": "gpu"
    }
)

print(nlp("I am looking for kitchen appliances.")._.cats)

```
# Credits
## Inspiration Drawn From
[Huggingface](https://huggingface.co/) does offer some nice models for few/zero-shot classification, but these are not tailored to multi-lingual approaches. Rasa NLU has [a nice approach](https://rasa.com/blog/rasa-nlu-in-depth-part-1-intent-classification/) for this, but it's too embedded in their codebase for easy usage outside of Rasa/chatbots. Additionally, it made sense to integrate [sentence-transformers](https://github.com/UKPLab/sentence-transformers) and [Huggingface zero-shot](https://huggingface.co/models?pipeline_tag=zero-shot-classification), instead of default [word embeddings](https://arxiv.org/abs/1301.3781). Finally, I decided to integrate with Spacy, since training a custom [Spacy TextCategorizer](https://spacy.io/api/textcategorizer) seems like a lot of hassle if you want something quick and dirty.

- [Scikit-learn](https://github.com/scikit-learn/scikit-learn)
- [Rasa NLU](https://github.com/RasaHQ/rasa)
- [Sentence Transformers](https://github.com/UKPLab/sentence-transformers)
- [Spacy](https://github.com/explosion/spaCy)


# Standalone usage without spaCy

```python
from classy_classification import classyClassifier

data = {
    "furniture": ["This text is about chairs.",
                  "Couches, benches and televisions.",
                  "I really need to get a new sofa."],
    "kitchen": ["There also exist things like fridges.",
                "I hope to be getting a new stove today.",
                "Do you also have some ovens."]
}

classifier = classyClassifier(data=data)
classifier("I am looking for kitchen appliances.")

# overwrite SVC config
classifier.set_svc(
    config={
        "C": [1, 2, 5, 10, 20, 100],
        "kernels": ["linear"],
        "max_cross_validation_folds": 5
    }
)
classifier("I am looking for kitchen appliances.")
```
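The `set_svc` config above reads like a scikit-learn style hyperparameter grid. As a rough sketch of what such a search could look like, using synthetic stand-in embeddings and a hypothetical mapping of `kernels`/`max_cross_validation_folds` onto `GridSearchCV` arguments (the package's internals may differ):

```python
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

rng = np.random.default_rng(0)
# Synthetic stand-in "sentence embeddings": two well-separated classes
X = np.vstack([rng.normal(0.0, 0.5, (10, 16)), rng.normal(3.0, 0.5, (10, 16))])
y = ["furniture"] * 10 + ["kitchen"] * 10

config = {
    "C": [1, 2, 5, 10, 20, 100],
    "kernels": ["linear"],
    "max_cross_validation_folds": 5,
}
# Grid-search an SVC over the C values and kernels from the config
search = GridSearchCV(
    SVC(probability=True),
    param_grid={"C": config["C"], "kernel": config["kernels"]},
    cv=config["max_cross_validation_folds"],
)
search.fit(X, y)

# Probability per class for a query near the "kitchen" cluster
probs = search.predict_proba(np.full((1, 16), 3.0))  # shape (1, 2)
```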

## Save and load models
```python
import pickle

data = {
    "furniture": ["This text is about chairs.",
                  "Couches, benches and televisions.",
                  "I really need to get a new sofa."],
    "kitchen": ["There also exist things like fridges.",
                "I hope to be getting a new stove today.",
                "Do you also have some ovens."]
}
classifier = classyClassifier(data=data)

with open("./classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)

with open("./classifier.pkl", "rb") as f:
    classifier = pickle.load(f)

classifier("I am looking for kitchen appliances.")
```


# Todo

- [ ] look into a way to integrate spacy trf models.
2 changes: 1 addition & 1 deletion classy_classification/classifiers/sentence_transformer.py
@@ -1,9 +1,9 @@
from typing import List, Union

import onnxruntime
+from fast_sentence_transformers.txtai import HFOnnx
from onnxruntime import InferenceSession, SessionOptions
from transformers import AutoTokenizer
-from txtai.pipeline import HFOnnx

from .classy_skeleton import classySkeleton

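For context on the import swap above: an exported ONNX encoder run through `InferenceSession` returns token-level embeddings, and sentence-transformers-style pipelines typically reduce these to a single sentence vector by attention-mask-weighted mean pooling. A toy numpy sketch of that pooling step, with illustrative shapes only (not this package's actual code):

```python
import numpy as np

def mean_pool(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """Average token vectors over the sequence axis, ignoring padding positions."""
    mask = attention_mask[..., None].astype(token_embeddings.dtype)  # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(axis=1)
    counts = np.clip(mask.sum(axis=1), 1e-9, None)  # avoid division by zero
    return summed / counts

# Toy batch: 1 sentence, 4 token positions (the last is padding), dim 3
tokens = np.array([[[1.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0],
                    [0.0, 0.0, 1.0],
                    [9.0, 9.0, 9.0]]])  # padding row must be ignored
mask = np.array([[1, 1, 1, 0]])

sentence_vec = mean_pool(tokens, mask)  # each dimension averages to 1/3
```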
@@ -1,6 +1,7 @@
+from fast_sentence_transformers.txtai import HFOnnx
+from fast_sentence_transformers.txtai.text import Labels
from spacy import Language, util
from spacy.tokens import Doc, Span
-from txtai.pipeline import HFOnnx, Labels


class classySpacyZeroShotExternal(object):