diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f40fe05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+assets
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3a4d02c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM lintoai/linto-platform-nlp-core:latest
+LABEL maintainer="gshang@linagora.com"
+
+WORKDIR /app
+
+VOLUME /app/assets
+ENV ASSETS_PATH=/app/assets
+
+COPY ./requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY ./scripts /app/scripts
+COPY ./components /app/components
+
+HEALTHCHECK --interval=15s CMD curl -fs http://0.0.0.0/health || exit 1
+
+ENTRYPOINT ["/home/user/miniconda/bin/uvicorn", "scripts.main:app", "--host", "0.0.0.0", "--port", "80"]
+CMD ["--workers", "1"]
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..207eeea
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,51 @@
+pipeline {
+    agent any
+    environment {
+        DOCKER_HUB_REPO = "lintoai/linto-platform-nlp-keyphrase-extraction"
+        DOCKER_HUB_CRED = 'docker-hub-credentials'
+
+        VERSION = ''
+    }
+
+    stages {
+        stage('Docker build for master branch') {
+            when {
+                branch 'master'
+            }
+            steps {
+                echo 'Publishing latest'
+                script {
+                    image = docker.build(env.DOCKER_HUB_REPO)
+                    VERSION = sh(
+                        returnStdout: true,
+                        script: "awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //'"
+                    ).trim()
+
+                    docker.withRegistry('https://registry.hub.docker.com', env.DOCKER_HUB_CRED) {
+                        image.push("${VERSION}")
+                        image.push('latest')
+                    }
+                }
+            }
+        }
+
+        stage('Docker build for next (unstable) branch') {
+            when {
+                branch 'next'
+            }
+            steps {
+                echo 'Publishing unstable'
+                script {
+                    image = docker.build(env.DOCKER_HUB_REPO)
+                    VERSION = sh(
+                        returnStdout: true,
+                        script: "awk -v RS='' '/#/ {print; exit}' RELEASE.md | head -1 | sed 's/#//' | sed 's/ //'"
+                    ).trim()
+                    docker.withRegistry('https://registry.hub.docker.com', env.DOCKER_HUB_CRED) {
+                        image.push('latest-unstable')
+                    }
+                }
+            }
+        }
+    } // end stages
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4614a57
--- /dev/null
+++ b/README.md
@@ -0,0 +1,201 @@
+# linto-platform-nlp-keyphrase-extraction
+
+## Description
+This repository builds a Docker image for LinTO's NLP service: Keyphrase Extraction. It is based on [linto-platform-nlp-core](https://github.com/linto-ai/linto-platform-nlp-core) and can be deployed along with the [LinTO stack](https://github.com/linto-ai/linto-platform-stack) or in a standalone way (see the Develop section below).
+
+linto-platform-nlp-keyphrase-extraction is backed by [spaCy](https://spacy.io/) v3.0+ featuring transformer-based pipelines, so deploying with GPU support is highly recommended for inference efficiency.
+
+LinTO's NLP services adopt the basic design concept of spaCy: [component and pipeline](https://spacy.io/usage/processing-pipelines). Components are decoupled from the service and can easily be re-used in other projects; components are organised into pipelines for realising specific NLP tasks.
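+
+For example, the `kpe` component shipped in this repository can be dropped into any spaCy v3 pipeline outside of this service. Below is a minimal sketch, assuming the repository's requirements are installed and the snippet is run from the repository root (so that `components/config.cfg` is found); here the model is fetched by name through sentence-transformers rather than from the assets volume:
+
+```python
+import spacy
+from sentence_transformers import SentenceTransformer
+
+import components  # registers the "kpe" pipeline factory
+
+# The "kpe" factory expects a SentenceTransformer supplied via a registered function.
+@spacy.registry.misc("get_model")
+def get_model(name):
+    return SentenceTransformer(name)
+
+nlp = spacy.blank("en")
+nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": "sentence-transformers/all-MiniLM-L6-v2"}})
+
+doc = nlp("Apple Inc. is an American multinational technology company.")
+print(doc._.keyphrases)  # [(keyphrase, score), ...]
+```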
+
+This service uses [FastAPI](https://fastapi.tiangolo.com/) to serve custom spaCy components as pipelines:
+- `kpe`: Keyphrase Extraction
+
+## Usage
+
+See the documentation: [https://doc.linto.ai](https://doc.linto.ai)
+
+## Deploy
+
+With our proposed stack: [https://github.com/linto-ai/linto-platform-stack](https://github.com/linto-ai/linto-platform-stack)
+
+# Develop
+
+## Build and run
+1. Create a named volume for storing models.
+```bash
+sudo docker volume create linto-platform-nlp-assets
+```
+
+2. Download the models into `assets/` on the host machine; make sure that [`git-lfs`](https://git-lfs.github.com/) (Git Large File Storage) is installed and available at `/usr/local/bin/git-lfs`.
+```bash
+cd linto-platform-nlp-keyphrase-extraction/
+bash scripts/download_models.sh
+```
+
+3. Copy the downloaded models into the created volume `linto-platform-nlp-assets`.
+```bash
+sudo docker container create --name cp_helper -v linto-platform-nlp-assets:/root hello-world
+sudo docker cp assets/* cp_helper:/root
+sudo docker rm cp_helper
+```
+
+4. Build the image.
+```bash
+sudo docker build --tag lintoai/linto-platform-keyphrase-extraction:latest .
+```
+
+5. Run the container (with GPU support); make sure that the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian) and the GPU driver are installed.
+```bash
+sudo docker run --gpus all \
+--rm -d -p 80:80 \
+-v linto-platform-nlp-assets:/app/assets:ro \
+--env APP_LANG="fr en" \
+lintoai/linto-platform-keyphrase-extraction:latest
+```
+
+To run with a CPU-only setting:
+```bash
+sudo docker run \
+--rm -d -p 80:80 \
+-v linto-platform-nlp-assets:/app/assets:ro \
+--env APP_LANG="fr en" \
+lintoai/linto-platform-keyphrase-extraction:latest
+```
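+
+Either way, you can check that the service is up through its health endpoint, the same one polled by the Dockerfile's `HEALTHCHECK`. A quick sanity check; the response body shown is an assumption based on the `/health` route defined in `scripts/main.py`:
+```bash
+curl -fs http://localhost/health
+# expected output: {"linto-platform-nlp-keyphrase-extraction": "online"}
+```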
+
+To specify the languages served by the container, set `APP_LANG="fr en"`, `APP_LANG="fr"`, etc.
+
+To launch with multiple workers, add `--workers INTEGER` at the end of the above command.
+
+6. Navigate to `http://localhost/docs` or `http://localhost/redoc` in your browser to explore the REST API interactively. See the examples below for how to query the API.
+
+
+## Specification for `http://localhost/kpe/{lang}`
+
+### Supported languages
+| {lang} | Model | Size |
+| --- | --- | --- |
+| `en` | [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 80 MB |
+| `fr` | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | 418 MB |
+
+### Request
+```json
+{
+    "articles": [
+        {
+            "text": "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services."
+        },
+        {
+            "text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set."
+        }
+    ]
+}
+```
+
+### Response
+```json
+{
+    "kpe": [
+        {
+            "text": "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services.",
+            "keyphrases": [
+                {
+                    "text": "apple",
+                    "score": 0.6539
+                },
+                {
+                    "text": "inc",
+                    "score": 0.3941
+                },
+                {
+                    "text": "company",
+                    "score": 0.2985
+                },
+                {
+                    "text": "multinational",
+                    "score": 0.2635
+                },
+                {
+                    "text": "electronics",
+                    "score": 0.2143
+                }
+            ]
+        },
+        {
+            "text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set.",
+            "keyphrases": [
+                {
+                    "text": "unsupervised",
+                    "score": 0.6663
+                },
+                {
+                    "text": "learning",
+                    "score": 0.3155
+                },
+                {
+                    "text": "algorithms",
+                    "score": 0.3128
+                },
+                {
+                    "text": "algorithm",
+                    "score": 0.2494
+                },
+                {
+                    "text": "patterns",
+                    "score": 0.2476
+                }
+            ]
+        }
+    ]
+}
+```
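+
+For example, with `curl` (assuming the container is exposed on port 80, as in the run commands above):
+```bash
+curl -X POST "http://localhost/kpe/en" \
+  -H "Content-Type: application/json" \
+  -d '{"articles": [{"text": "Apple Inc. is an American multinational technology company."}]}'
+```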
+
+### Component configuration
+This component is a wrapper built on top of [KeyBERT](https://github.com/MaartenGr/KeyBERT).
+
+| Parameter | Type | Default value | Description |
+| --- | --- | --- | --- |
+| candidates | List[str] | null | Candidate keywords/keyphrases to use instead of extracting them from the document(s) |
+| diversity | float | 0.5 | The diversity of results, between 0 and 1, if `use_mmr` is true |
+| keyphrase_ngram_range | Tuple[int, int] | [1,1] | Length, in words, of the extracted keywords/keyphrases |
+| min_df | int | 1 | Minimum document frequency of a word across all documents, if keywords for multiple documents need to be extracted |
+| nr_candidates | int | 20 | The number of candidates to consider, if `use_maxsum` is true |
+| seed_keywords | List[str] | null | Seed keywords that may guide the extraction of keywords by steering the similarities towards the seeded keywords |
+| stop_words | Union[str, List[str]] | null | Stopwords to remove from the document |
+| top_n | int | 5 | Return the top n keywords/keyphrases |
+| use_maxsum | bool | false | Whether to use Max Sum Similarity for the selection of keywords/keyphrases |
+| use_mmr | bool | false | Whether to use Maximal Marginal Relevance (MMR) for the selection of keywords/keyphrases |
+
+The component's default values can be modified in [`components/config.cfg`](components/config.cfg), or overridden per API request at runtime:
+
+```json
+{
+    "articles": [
+        {
+            "text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set."
+        }
+    ],
+    "component_cfg": {
+        "kpe": {"keyphrase_ngram_range": [2,2], "top_n": 1}
+    }
+}
+```
+
+```json
+{
+    "kpe": [
+        {
+            "text": "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set.",
+            "keyphrases": [
+                {
+                    "text": "unsupervised learning",
+                    "score": 0.7252
+                }
+            ]
+        }
+    ]
+}
+```
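+
+The same mechanism works for any parameter in the table above. For instance, an illustrative request enabling Maximal Marginal Relevance to diversify the extracted keyphrases (the values here are chosen for demonstration only):
+
+```json
+{
+    "articles": [
+        {
+            "text": "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services."
+        }
+    ],
+    "component_cfg": {
+        "kpe": {"use_mmr": true, "diversity": 0.7}
+    }
+}
+```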
+
+### Advanced usage
+For advanced usage, such as Max Sum Similarity and Maximal Marginal Relevance for diversifying extraction results, please refer to the [KeyBERT documentation](https://maartengr.github.io/KeyBERT/guides/quickstart.html#usage) and this [Medium post](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) describing how they work.
\ No newline at end of file
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 0000000..86038ce
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,3 @@
+# 0.1.0
+- Initial commit.
+- Keyphrase Extraction.
\ No newline at end of file
diff --git a/components/__init__.py b/components/__init__.py
new file mode 100644
index 0000000..3fabafc
--- /dev/null
+++ b/components/__init__.py
@@ -0,0 +1,37 @@
+import spacy
+from spacy.language import Language
+from typing import List, Union, Tuple
+from sklearn.feature_extraction.text import CountVectorizer
+from sentence_transformers import SentenceTransformer
+from thinc.api import Config
+from components.keyphrase_extractor import KeyphraseExtractor
+
+# Load the components' default configuration
+config = Config().from_disk("components/config.cfg")
+
+@Language.factory("kpe", default_config=config["components"]["kpe"])
+def make_keyphrase_extractor(
+    nlp: Language,
+    name: str,
+    model: SentenceTransformer,
+    candidates: List[str] = None,
+    keyphrase_ngram_range: Tuple[int, int] = (1, 1),
+    stop_words: Union[str, List[str]] = None,
+    top_n: int = 5,
+    min_df: int = 1,
+    use_maxsum: bool = False,
+    use_mmr: bool = False,
+    diversity: float = 0.5,
+    nr_candidates: int = 20,
+    vectorizer: CountVectorizer = None,
+    highlight: bool = False,
+    seed_keywords: List[str] = None
+    ):
+
+    # Gather all KeyBERT parameters, dropping the factory-specific arguments.
+    kwargs = locals()
+    del kwargs['nlp']
+    del kwargs['name']
+    del kwargs['model']
+
+    return KeyphraseExtractor(model, **kwargs)
+
diff --git a/components/config.cfg b/components/config.cfg
new file mode 100644
index 0000000..cc853fa
--- /dev/null
+++ b/components/config.cfg
@@ -0,0 +1,15 @@
+[components]
+
+[components.kpe]
+candidates = null
+diversity = 0.5
+highlight = false
+keyphrase_ngram_range = [1,1]
+min_df = 1
+nr_candidates = 20
+seed_keywords = null
+stop_words = null
+top_n = 5
+use_maxsum = false
+use_mmr = false
+vectorizer = null
diff --git a/components/keyphrase_extractor.py b/components/keyphrase_extractor.py
new file mode 100644
index 0000000..d8aa013
--- /dev/null
+++ b/components/keyphrase_extractor.py
@@ -0,0 +1,20 @@
+from spacy.tokens import Doc
+from keybert import KeyBERT
+
+class KeyphraseExtractor:
+    """
+    Wrapper class for KeyBERT.
+    """
+    def __init__(self, model, **kwargs):
+        self.model = KeyBERT(model)
+        self.kwargs = kwargs
+        if not Doc.has_extension("keyphrases"):
+            Doc.set_extension("keyphrases", default=[])
+
+    def __call__(self, doc, **kwargs):
+        # Per-call configuration overrides the defaults given at construction time.
+        runtime_kwargs = {}
+        runtime_kwargs.update(self.kwargs)
+        runtime_kwargs.update(kwargs)
+        doc._.keyphrases = self.model.extract_keywords(doc.text, **runtime_kwargs)
+
+        return doc
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..00d0d73
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+# keyphrase extraction
+keybert==0.5.0
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/download_models.sh b/scripts/download_models.sh
new file mode 100644
index 0000000..b5ab27f
--- /dev/null
+++ b/scripts/download_models.sh
@@ -0,0 +1,8 @@
+mkdir -p assets
+cd assets
+
+mkdir -p sentence-transformers
+cd sentence-transformers
+git lfs install
+git clone https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
+git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
diff --git a/scripts/main.py b/scripts/main.py
new file mode 100644
index 0000000..d0c0fa5
--- /dev/null
+++ b/scripts/main.py
@@ -0,0 +1,68 @@
+import os
+import spacy
+import components
+from scripts.schemas import *
+from spacy.tokens import Doc
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi_health import health
+from sentence_transformers import SentenceTransformer
+
+# To force the GPU usage: spacy.require_gpu()
+spacy.prefer_gpu()
+
+# Supported languages and corresponding model names
+LM_MAP = {
+    "fr": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    "en": "sentence-transformers/all-MiniLM-L6-v2"
+    }
+
+# Load models
+MODELS = {LM_MAP[lang]: SentenceTransformer(os.environ.get("ASSETS_PATH") + '/' + LM_MAP[lang]) for lang in os.environ.get("APP_LANG").split(" ")}
+print(f"Loaded {len(MODELS)} models: {list(MODELS.keys())}")
+
+@spacy.registry.misc("get_model")
+def get_model(name):
+    return MODELS[name]
+
+# Set up the FastAPI app and define the endpoints
+app = FastAPI()
+app.add_middleware(CORSMiddleware, allow_origins=["*"])
+
+# Health check
+def healthy():
+    return {"linto-platform-nlp-keyphrase-extraction": "online"}
+app.add_api_route("/health", health([healthy]))
+
+# Keyphrase Extraction
+def get_data(doc: Doc) -> Dict[str, Any]:
+    """Extract the data to return from the REST API given a Doc object. Modify
+    this function to include other data."""
+    keyphrases = [
+        {
+            "text": keyphrase[0],
+            "score": keyphrase[1]
+        }
+        for keyphrase in doc._.keyphrases
+    ]
+    return {"text": doc.text, "keyphrases": keyphrases}
+
+@app.post("/kpe/{lang}", summary="Keyphrase Extraction", response_model=KpeResponseModel)
+def kpe(lang: str, query: RequestModel):
+    """Process a batch of articles and return the keyphrases predicted by the
+    given model. Each record in the data should have a key "text".
+    """
+    if lang in LM_MAP:
+        model_name = LM_MAP[lang]
+        if model_name not in MODELS:
+            raise RuntimeError(f"Model {model_name} for language {lang} is not loaded.")
+        nlp = spacy.blank(lang)
+        nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": model_name}})
+    else:
+        raise ValueError(f"Language {lang} is not supported.")
+
+    response_body = []
+    texts = (article.text for article in query.articles)
+    for doc in nlp.pipe(texts, component_cfg=query.component_cfg):
+        response_body.append(get_data(doc))
+    return {"kpe": response_body}
\ No newline at end of file
diff --git a/scripts/schemas.py b/scripts/schemas.py
new file mode 100644
index 0000000..3fc7bcc
--- /dev/null
+++ b/scripts/schemas.py
@@ -0,0 +1,27 @@
+from typing import List, Dict, Any, Optional
+from pydantic import BaseModel
+
+class Article(BaseModel):
+    # Schema for a single article in a batch of articles to process
+    text: str
+
+
+class RequestModel(BaseModel):
+    # Schema for a request consisting of a batch of articles and an optional component configuration
+    articles: List[Article]
+    component_cfg: Optional[Dict[str, Dict[str, Any]]] = None
+
+
+class KpeResponseModel(BaseModel):
+    # This is the schema of the expected response and depends on what you
+    # return from get_data.
+
+    class Batch(BaseModel):
+        class Keyphrase(BaseModel):
+            text: str
+            score: float
+
+        text: str
+        keyphrases: List[Keyphrase] = []
+
+    kpe: List[Batch]