initial commit
Guokan SHANG committed Nov 29, 2021
0 parents commit 85ea60f
Showing 13 changed files with 451 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
assets
18 changes: 18 additions & 0 deletions Dockerfile
@@ -0,0 +1,18 @@
FROM lintoai/linto-platform-nlp-core:latest
LABEL maintainer="[email protected]"

WORKDIR /app

VOLUME /app/assets
ENV ASSETS_PATH=/app/assets

COPY ./requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

COPY ./scripts /app/scripts
COPY ./components /app/components

HEALTHCHECK --interval=15s CMD curl -fs http://0.0.0.0/health || exit 1

ENTRYPOINT ["/home/user/miniconda/bin/uvicorn", "scripts.main:app", "--host", "0.0.0.0", "--port", "80"]
CMD ["--workers", "1"]
51 changes: 51 additions & 0 deletions Jenkinsfile
@@ -0,0 +1,51 @@
pipeline {
    agent any
    environment {
        DOCKER_HUB_REPO = "lintoai/linto-platform-nlp-keyphrase-extraction"
        DOCKER_HUB_CRED = 'docker-hub-credentials'

        VERSION = ''
    }

    stages {
        stage('Docker build for master branch') {
            when {
                branch 'master'
            }
            steps {
                echo 'Publishing latest'
                script {
                    image = docker.build(env.DOCKER_HUB_REPO)
                    // Extract the release version (e.g. 0.1.0) from the first heading in RELEASE.md
                    VERSION = sh(
                        returnStdout: true,
                        script: "awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //'"
                    ).trim()

                    docker.withRegistry('https://registry.hub.docker.com', env.DOCKER_HUB_CRED) {
                        image.push("${VERSION}")
                        image.push('latest')
                    }
                }
            }
        }

        stage('Docker build for next (unstable) branch') {
            when {
                branch 'next'
            }
            steps {
                echo 'Publishing unstable'
                script {
                    image = docker.build(env.DOCKER_HUB_REPO)
                    VERSION = sh(
                        returnStdout: true,
                        script: "awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //'"
                    ).trim()
                    docker.withRegistry('https://registry.hub.docker.com', env.DOCKER_HUB_CRED) {
                        image.push('latest-unstable')
                    }
                }
            }
        }
    } // end stages
}
201 changes: 201 additions & 0 deletions README.md
@@ -0,0 +1,201 @@
# linto-platform-nlp-keyphrase-extraction

## Description
This repository builds a Docker image for LinTO's NLP service: Keyphrase Extraction. It is built on the basis of [linto-platform-nlp-core](https://github.com/linto-ai/linto-platform-nlp-core) and can be deployed along with the [LinTO stack](https://github.com/linto-ai/linto-platform-stack) or in a standalone way (see the Develop section below).

linto-platform-nlp-keyphrase-extraction is backed by [spaCy](https://spacy.io/) v3.0+, featuring transformer-based pipelines; deploying with GPU support is therefore highly recommended for inference efficiency.

LinTO's NLP services adopt spaCy's basic design concept of [components and pipelines](https://spacy.io/usage/processing-pipelines): components are decoupled from the service and can easily be re-used in other projects, and components are organised into pipelines to carry out specific NLP tasks.

This service uses [FastAPI](https://fastapi.tiangolo.com/) to serve custom spaCy components as pipelines:
- `kpe`: Keyphrase Extraction
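
To illustrate the component/pipeline design, here is a minimal, generic spaCy sketch (using the built-in `sentencizer` component for illustration, not this service's actual wiring):

```python
import spacy

# Build an empty English pipeline and add a named component to it.
# Components are registered by name and composed into a pipeline.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # a built-in component, used here only as an example

doc = nlp("Apple Inc. is a technology company. It designs consumer electronics.")
print([sent.text for sent in doc.sents])
```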

## Usage

See the documentation: [https://doc.linto.ai](https://doc.linto.ai)

## Deploy

With our proposed stack [https://github.com/linto-ai/linto-platform-stack](https://github.com/linto-ai/linto-platform-stack)

## Develop

### Build and run
1 Create a named volume for storing models.
```bash
sudo docker volume create linto-platform-nlp-assets
```

2 Download models into `assets/` on the host machine. Make sure that [`git-lfs`](https://git-lfs.github.com/) (Git Large File Storage) is installed and available at `/usr/local/bin/git-lfs`.
```bash
cd linto-platform-nlp-keyphrase-extraction/
bash scripts/download_models.sh
```

3 Copy the downloaded models into the created volume `linto-platform-nlp-assets`.
```bash
sudo docker container create --name cp_helper -v linto-platform-nlp-assets:/root hello-world
sudo docker cp assets/* cp_helper:/root
sudo docker rm cp_helper
```

4 Build the image.
```bash
sudo docker build --tag lintoai/linto-platform-nlp-keyphrase-extraction:latest .
```

5 Run the container (with GPU support); make sure that the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian) and a GPU driver are installed.
```bash
sudo docker run --gpus all \
--rm -d -p 80:80 \
-v linto-platform-nlp-assets:/app/assets:ro \
--env APP_LANG="fr en" \
lintoai/linto-platform-nlp-keyphrase-extraction:latest
```
<details>
<summary>Alternatively, run with CPU only</summary>

```bash
sudo docker run \
--rm -d -p 80:80 \
-v linto-platform-nlp-assets:/app/assets:ro \
--env APP_LANG="fr en" \
lintoai/linto-platform-nlp-keyphrase-extraction:latest
```
</details>

To specify the language(s) served by the container, set `APP_LANG` accordingly, e.g. `APP_LANG="fr en"` or `APP_LANG="fr"`.

To launch with multiple workers, append `--workers INTEGER` to the above command.

6 Navigate to `http://localhost/docs` or `http://localhost/redoc` in your browser to explore the REST API interactively. See the examples below for how to query the API.
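
For instance, a minimal query against the English pipeline might look like this (a sketch assuming the container is exposed on port 80 as above and that the endpoint accepts POST requests with the JSON body described in the Request section below):

```bash
curl -X POST http://localhost/kpe/en \
  -H "Content-Type: application/json" \
  -d '{"articles": [{"text": "Apple Inc. is an American multinational technology company."}]}'
```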


## Specification for `http://localhost/kpe/{lang}`

### Supported languages
| {lang} | Model | Size |
| --- | --- | --- |
| `en` | [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 80 MB |
| `fr` | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | 418 MB |

### Request
```json
{
"articles": [
{
"text": "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services."
},
{
"text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set."
}
]
}
```

### Response
```json
{
"kpe": [
{
"text": "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services.",
"keyphrases": [
{
"text": "apple",
"score": 0.6539
},
{
"text": "inc",
"score": 0.3941
},
{
"text": "company",
"score": 0.2985
},
{
"text": "multinational",
"score": 0.2635
},
{
"text": "electronics",
"score": 0.2143
}
]
},
{
"text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set.",
"keyphrases": [
{
"text": "unsupervised",
"score": 0.6663
},
{
"text": "learning",
"score": 0.3155
},
{
"text": "algorithms",
"score": 0.3128
},
{
"text": "algorithm",
"score": 0.2494
},
{
"text": "patterns",
"score": 0.2476
}
]
}
]
}
```

### Component configuration
This component is a wrapper around [KeyBERT](https://github.com/MaartenGr/KeyBERT); each keyphrase `score` is the similarity between the keyphrase embedding and the document embedding, as computed by KeyBERT.

| Parameter | Type | Default value | Description |
| --- | --- | --- | --- |
| candidates | List[str] | null | Candidate keywords/keyphrases to use instead of extracting them from the document(s) |
| diversity | float | 0.5 | The diversity of results between 0 and 1, if use_mmr is True |
| keyphrase_ngram_range | Tuple[int, int] | [1,1] | Length, in words, of the extracted keywords/keyphrases |
| min_df | int | 1 | Minimum document frequency of a word across all documents, if keywords for multiple documents need to be extracted |
| nr_candidates | int | 20 | The number of candidates to consider, if use_maxsum is set to True |
| seed_keywords | List[str] | null | Seed keywords that may guide the extraction of keywords by steering the similarities towards the seeded keywords |
| stop_words | Union[str, List[str]] | null | Stopwords to remove from the document |
| top_n | int | 5 | Return the top n keywords/keyphrases |
| use_maxsum | bool | false | Whether to use Max Sum Similarity for the selection of keywords/keyphrases |
| use_mmr | bool | false | Whether to use Maximal Marginal Relevance (MMR) for the selection of keywords/keyphrases |

The component's default values can be modified in [`components/config.cfg`](components/config.cfg), or overridden per API request at runtime:

```json
{
"articles": [
{
"text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set."
}
],
"component_cfg": {
"kpe": {"keyphrase_ngram_range": [2,2], "top_n": 1}
}
}
```

which returns:
```json
{
"kpe": [
{
"text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set.",
"keyphrases": [
{
"text": "unsupervised learning",
"score": 0.7252
}
]
}
]
}
```

### Advanced usage
For advanced usage, such as Max Sum Similarity and Maximal Marginal Relevance for diversifying extraction results, please refer to the [KeyBERT documentation](https://maartengr.github.io/KeyBERT/guides/quickstart.html#usage) and this [Medium post](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) to see how they work. A request sketch using MMR follows below.
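
As a sketch, MMR could be enabled per request through the same `component_cfg` mechanism shown above (parameter names as in the table; values illustrative):

```json
{
  "articles": [
    {
      "text": "Unsupervised learning algorithms must self-discover naturally occurring patterns in the training data."
    }
  ],
  "component_cfg": {
    "kpe": {"use_mmr": true, "diversity": 0.7}
  }
}
```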
3 changes: 3 additions & 0 deletions RELEASE.md
@@ -0,0 +1,3 @@
# 0.1.0
- Initial commit.
- Keyphrase Extraction.
37 changes: 37 additions & 0 deletions components/__init__.py
@@ -0,0 +1,37 @@
import spacy
from spacy.language import Language
from typing import List, Union, Tuple
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from thinc.api import Config
from components.keyphrase_extractor import KeyphraseExtractor

# Load components' default configuration
config = Config().from_disk("components/config.cfg")

@Language.factory("kpe", default_config=config["components"]["kpe"])
def make_keyphrase_extractor(
    nlp: Language,
    name: str,
    model: SentenceTransformer,
    candidates: List[str] = None,
    keyphrase_ngram_range: Tuple[int, int] = (1, 1),
    stop_words: Union[str, List[str]] = None,
    top_n: int = 5,
    min_df: int = 1,
    use_maxsum: bool = False,
    use_mmr: bool = False,
    diversity: float = 0.5,
    nr_candidates: int = 20,
    vectorizer: CountVectorizer = None,
    highlight: bool = False,
    seed_keywords: List[str] = None
):
    # Collect all extraction parameters, dropping the spaCy factory
    # arguments (nlp, name) and the embedding model itself.
    kwargs = locals()
    del kwargs['nlp']
    del kwargs['name']
    del kwargs['model']

    return KeyphraseExtractor(model, **kwargs)

15 changes: 15 additions & 0 deletions components/config.cfg
@@ -0,0 +1,15 @@
[components]

[components.kpe]
candidates = null
diversity = 0.5
highlight = false
keyphrase_ngram_range = [1,1]
min_df = 1
nr_candidates = 20
seed_keywords = null
stop_words = null
top_n = 5
use_maxsum = false
use_mmr = false
vectorizer = null
20 changes: 20 additions & 0 deletions components/keyphrase_extractor.py
@@ -0,0 +1,20 @@
from spacy.tokens import Doc
from keybert import KeyBERT

class KeyphraseExtractor:
    """
    Wrapper class for KeyBERT.
    """
    def __init__(self, model, **kwargs):
        self.model = KeyBERT(model)
        self.kwargs = kwargs
        if not Doc.has_extension("keyphrases"):
            Doc.set_extension("keyphrases", default=[])

    def __call__(self, doc, **kwargs):
        # Runtime kwargs (e.g. from a request's component_cfg) override
        # the defaults captured at construction time.
        runtime_kwargs = {}
        runtime_kwargs.update(self.kwargs)
        runtime_kwargs.update(kwargs)
        doc._.keyphrases = self.model.extract_keywords(doc.text, **runtime_kwargs)

        return doc
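
A minimal standalone sketch of how this wrapper might be exercised outside the FastAPI service (the model name and text are illustrative; assumes `sentence-transformers` is installed and the model is available locally or via the Hugging Face Hub):

```python
import spacy
from sentence_transformers import SentenceTransformer

from components.keyphrase_extractor import KeyphraseExtractor

# Load an embedding model and wrap it with the extractor.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
kpe = KeyphraseExtractor(model, top_n=5)

# Build a bare pipeline just to create Doc objects.
nlp = spacy.blank("en")
doc = kpe(nlp("Apple Inc. is an American multinational technology company."))

# Each entry is a (keyphrase, score) pair produced by KeyBERT.
print(doc._.keyphrases)
```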
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
# keyphrase extraction
keybert==0.5.0
Empty file added scripts/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions scripts/download_models.sh
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

mkdir -p assets
cd assets

mkdir -p sentence-transformers
cd sentence-transformers

# Fetch the models from the Hugging Face Hub (requires git-lfs)
git lfs install
git clone https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2