Merge pull request #1 from linto-ai/0.2.0
0.2.0
guokan-shang authored Jun 14, 2023
2 parents 8760ecb + 0049288 commit c7a8f71
Showing 24 changed files with 1,404 additions and 126 deletions.
13 changes: 12 additions & 1 deletion .envdefault
@@ -1,4 +1,15 @@
# APPLICATION PARAMETERS
APP_LANG=fr en
ASSETS_PATH_ON_HOST=./assets
ASSETS_PATH_IN_CONTAINER=/app/assets
WORKER_NUMBER=1
LM_MAP={"fr":"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2","en":"sentence-transformers/all-MiniLM-L6-v2"}

# SERVING PARAMETERS
SERVICE_MODE=http
CONCURRENCY=1
USE_GPU=True

# MICRO-SERVICE PARAMETERS
SERVICE_NAME=kpe
SERVICES_BROKER=redis://172.17.0.1:6379
BROKER_PASS=
20 changes: 20 additions & 0 deletions .github/workflows/dockerhub-description.yml
@@ -0,0 +1,20 @@
name: Update Docker Hub Description
on:
  push:
    branches:
      - master
    paths:
      - README.md
      - .github/workflows/dockerhub-description.yml
jobs:
  dockerHubDescription:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Docker Hub Description
        uses: peter-evans/dockerhub-description@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
          repository: lintoai/linto-platform-nlp-keyphrase-extraction
          readme-filepath: ./README.md
20 changes: 12 additions & 8 deletions Dockerfile
@@ -1,16 +1,20 @@
FROM lintoai/linto-platform-nlp-core:latest
LABEL maintainer="[email protected]"

WORKDIR /app
WORKDIR /usr/src/app

COPY ./requirements.txt /app/
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY .envdefault /app/
COPY ./scripts /app/scripts
COPY ./components /app/components
COPY kpe /usr/src/app/kpe
COPY components /usr/src/app/components
COPY celery_app /usr/src/app/celery_app
COPY http_server /usr/src/app/http_server
COPY document /usr/src/app/document
COPY docker-entrypoint.sh wait-for-it.sh healthcheck.sh ./

HEALTHCHECK --interval=15s CMD curl -fs http://0.0.0.0/health || exit 1
ENV PYTHONPATH="${PYTHONPATH}:/usr/src/app/kpe"

ENTRYPOINT ["/opt/conda/bin/gunicorn", "scripts.main:app", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:80", "--access-logfile", "-", "--error-logfile", "-"]
CMD ["--workers", "1"]
HEALTHCHECK CMD ./healthcheck.sh

ENTRYPOINT ["./docker-entrypoint.sh"]
661 changes: 661 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

73 changes: 57 additions & 16 deletions README.md
@@ -3,12 +3,9 @@
## Description
This repository is for building a Docker image for LinTO's NLP service: Keyphrase Extraction. Built on top of [linto-platform-nlp-core](https://github.com/linto-ai/linto-platform-nlp-core), it can be deployed along with the [LinTO stack](https://github.com/linto-ai/linto-platform-stack) or in a standalone way (see the Develop section below).

linto-platform-nlp-keyphrase-extraction is backed by [spaCy](https://spacy.io/) v3.0+ featuring transformer-based pipelines, so deploying with GPU support is highly recommended for inference efficiency.
LinTO's NLP services adopt the basic design concept of spaCy: [component and pipeline](https://spacy.io/usage/processing-pipelines). Components (located under the `components/` folder) are decoupled from the service and can easily be re-used in other spaCy projects; they are organised into pipelines that realise specific NLP tasks.

LinTO's NLP services adopt the basic design concept of spaCy: [component and pipeline](https://spacy.io/usage/processing-pipelines). Components are decoupled from the service and can easily be re-used in other projects; they are organised into pipelines that realise specific NLP tasks.

This service uses [FastAPI](https://fastapi.tiangolo.com/) to serve custom spaCy's components as pipelines:
- `kpe`: Keyphrase Extraction
This service can be launched in two ways: as a REST API or as a Celery task, each with or without GPU support.
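
For a quick illustration of the component-and-pipeline design, here is a minimal sketch of how the `kpe` component is attached to a blank spaCy pipeline. It mirrors the usage in `celery_app/tasks.py` below and assumes the models have been downloaded and the repository's `components` package is importable:

```python
import spacy
import components  # registers the custom "kpe" component and the "get_model" loader

# Build a blank pipeline and attach the keyphrase-extraction component,
# pointing it at one of the sentence-transformers models from LM_MAP.
nlp = spacy.blank("en")
nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": "sentence-transformers/all-MiniLM-L6-v2"}})

# Keyphrases are computed when the text is processed.
doc = nlp("Apple Inc. is an American multinational technology company.")
```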

## Usage

@@ -29,14 +26,22 @@ bash scripts/download_models.sh

2 Configure running environment variables
```bash
mv .envdefault .env
# cat .envdefault
# APP_LANG=fr en | Running language of application, "fr en", "fr", etc.
# ASSETS_PATH_ON_HOST=./assets | Storage path of models on host. (only applicable when docker-compose is used)
# ASSETS_PATH_IN_CONTAINER=/app/assets | Volume mount point of models in container. (only applicable when docker-compose is used)
# WORKER_NUMBER=1 | Number of processing workers. (only applicable when docker-compose is used)
cp .envdefault .env
```

| Environment Variable | Description | Default Value |
| --- | --- | --- |
| `APP_LANG` | A space-separated list of supported languages for the application | fr en |
| `ASSETS_PATH_ON_HOST` | The path to the assets folder on the host machine | ./assets |
| `ASSETS_PATH_IN_CONTAINER` | The volume mount point of models in container | /app/assets |
| `LM_MAP` | A JSON string that maps each supported language to its corresponding language model | {"fr":"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2","en":"sentence-transformers/all-MiniLM-L6-v2"} |
| `SERVICE_MODE` | The mode in which the service is served, either "http" (REST API) or "task" (Celery task) | "http" |
| `CONCURRENCY` | The maximum number of requests that can be handled concurrently | 1 |
| `USE_GPU` | A flag indicating whether to use GPU for computation or not, either "True" or "False" | True |
| `SERVICE_NAME` | The name of the micro-service | kpe |
| `SERVICES_BROKER` | The URL of the broker server used for communication between micro-services | "redis://localhost:6379" |
| `BROKER_PASS` | The password for accessing the broker server | None |
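
As a rough sketch of how these variables are meant to be consumed (hypothetical parsing code for illustration, not the service's actual loader):

```python
import json
import os

# APP_LANG is a space-separated list of language codes; LM_MAP is a JSON
# object mapping each language to a sentence-transformers model name.
app_langs = os.environ.get("APP_LANG", "fr en").split()
lm_map = json.loads(os.environ.get("LM_MAP", "{}"))
models = {lang: lm_map[lang] for lang in app_langs if lang in lm_map}
```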

4 Build image
```bash
sudo docker build --tag lintoai/linto-platform-nlp-keyphrase-extraction:latest .
@@ -52,22 +57,29 @@ sudo docker run --gpus all \
--rm -p 80:80 \
-v $PWD/assets:/app/assets:ro \
--env-file .env \
lintoai/linto-platform-nlp-keyphrase-extraction:latest \
--workers 1
lintoai/linto-platform-nlp-keyphrase-extraction:latest
```
<details>
<summary>Running with CPU only</summary>

- remove `--gpus all` from the `docker run` command above.
- set `USE_GPU=False` in the `.env`.
</details>

or

```bash
sudo docker-compose up
```
<details>
<summary>Running with CPU only</summary>

- remove `runtime: nvidia` from the `docker-compose.yml` file.
- set `USE_GPU=False` in the `.env`.
</details>


6 Navigate to `http://localhost/docs` or `http://localhost/redoc` in your browser, to explore the REST API interactively. See the examples for how to query the API.
6 If running under `SERVICE_MODE=http`, navigate to `http://localhost/docs` or `http://localhost/redoc` in your browser to explore the REST API interactively. See the examples for how to query the API. If running under `SERVICE_MODE=task`, please refer to the dedicated section at the end of this README.


## Specification for `http://localhost/kpe/{lang}`
@@ -198,4 +210,33 @@ Component's config can be modified in [`components/config.cfg`](components/config.cfg)
```
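
Per the request and response schemas in `document/swagger.yml` (added in this PR), here is a sketch of a client call, assuming the service runs locally in `http` mode on port 80 (`requests` is not bundled with the service; install it separately):

```python
import requests

# Body shape follows document/swagger.yml: an "articles" array plus an
# optional per-component configuration object.
body = {
    "articles": [{"text": "Apple Inc. is an American multinational technology company."}],
    "component_cfg": {"kpe": {"top_n": 3}},
}
r = requests.post("http://localhost/kpe/en", json=body)
r.raise_for_status()
print(r.json())  # expected shape: {"kpe": [{"text": ..., "keyphrases": [{"text": ..., "score": ...}]}]}
```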

### Advanced usage
For advanced usage, such as Max Sum Similarity and Maximal Marginal Relevance for diversifying extraction results, please refer to the [KeyBERT documentation](https://maartengr.github.io/KeyBERT/guides/quickstart.html#usage) and this [Medium post](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) to learn how it works.


## Testing Celery mode locally
1 Install Redis on your local machine, and run it with:
```bash
redis-server --protected-mode no --bind 0.0.0.0 --loglevel debug
```

2 Make sure that these two variables are set correctly in your `.env`: `SERVICE_MODE=task` and `SERVICES_BROKER=redis://172.17.0.1:6379`.

Then start your docker container with either `docker run` or `docker-compose up`, as shown in the previous section.

3 On your local computer, run this Python script:
```python
from celery import Celery

# Broker on Redis DB 0 and result backend on DB 1, matching celery_app/celeryapp.py.
celery = Celery(broker='redis://localhost:6379/0', backend='redis://localhost:6379/1')

# Send two English articles to the "kpe" queue, asking for the top 3 keyphrases each.
r = celery.send_task(
    'kpe_task',
    (
        'en',
        [
            "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services.",
            "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set."
        ],
        {"kpe": {"top_n": 3}}
    ),
    queue='kpe')
r.get()  # blocks until the worker returns the result
```
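
The task returns a dictionary keyed by `kpe` (see `celery_app/tasks.py`). Assuming each entry follows the `batch` schema from `document/swagger.yml` (an assumption, since `get_data` itself is not shown in this diff), the result can be unpacked like this, continuing from the script above:

```python
# Presumed result shape, inferred from tasks.py and the swagger "batch" schema.
result = r.get(timeout=120)  # blocks until the worker responds
for article in result["kpe"]:
    print(article["text"][:60], "->", [kp["text"] for kp in article["keyphrases"]])
```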
3 changes: 3 additions & 0 deletions RELEASE.md
@@ -1,3 +1,6 @@
# 0.2.0
- Migration to the [template](https://github.com/linto-ai/linto-template-microservice) of LinTO microservices.

# 0.1.0
- Initial commit.
- Keyphrase Extraction.
File renamed without changes.
28 changes: 28 additions & 0 deletions celery_app/celeryapp.py
@@ -0,0 +1,28 @@
import os
from celery import Celery

from kpe import logger

celery = Celery(__name__, include=['celery_app.tasks'])
service_name = os.environ.get("SERVICE_NAME", "kpe")
broker_url = os.environ.get("SERVICES_BROKER")
if os.environ.get("BROKER_PASS", False):
    # Inject the broker password into the URL: redis://:password@host:port
    components = broker_url.split('//')
    broker_url = f'{components[0]}//:{os.environ.get("BROKER_PASS")}@{components[1]}'
celery.conf.broker_url = "{}/0".format(broker_url)  # broker on Redis DB 0
celery.conf.result_backend = "{}/1".format(broker_url)  # results on Redis DB 1
celery.conf.update(
    result_expires=3600,
    task_acks_late=True,
    task_track_started=True)

# Queues
celery.conf.update(
    {'task_routes': {
        'kpe_task': {'queue': 'kpe'},
    }}
)

logger.info(
    f"Celery configured for broker located at {broker_url} with service name {service_name}"
)
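
For clarity, a small runnable sketch of what the password-injection logic above produces (illustrative values only, not real credentials):

```python
# Same rewrite as in celeryapp.py: splice the password in after the scheme.
broker_url = "redis://172.17.0.1:6379"
broker_pass = "secret"  # illustrative only

scheme, rest = broker_url.split('//')
broker_url = f"{scheme}//:{broker_pass}@{rest}"
print(broker_url + "/0")  # redis://:secret@172.17.0.1:6379/0 (broker, Redis DB 0)
```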
35 changes: 35 additions & 0 deletions celery_app/tasks.py
@@ -0,0 +1,35 @@
import spacy
import components

from typing import Dict, List

from celery_app.celeryapp import celery

from kpe import logger
from kpe.processing import LM_MAP, MODELS, get_model
from kpe.processing.utils import get_data


@celery.task(name="kpe_task")
def kpe_task(lang: str, texts: List[str], component_cfg: Dict = {}):
"""Process a batch of articles and return the Keyphrases predicted by the
given model. Each record in the data should have a key "text".
"""
logger.info('KPE task received')

# Check language availability
if lang in LM_MAP.keys():
model_name = LM_MAP[lang]
if model_name not in MODELS.keys():
raise RuntimeError(f"Model {model_name} for language {lang} is not loaded.")
nlp = spacy.blank(lang)
nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": model_name}})
else:
raise ValueError(f"Language {lang} is not supported.")

response_body = []

for doc in nlp.pipe(texts, component_cfg=component_cfg):
response_body.append(get_data(doc))

return {"kpe": response_body}
1 change: 0 additions & 1 deletion docker-compose.yml
@@ -9,5 +9,4 @@ services:
- .env
volumes:
- $ASSETS_PATH_ON_HOST:$ASSETS_PATH_IN_CONTAINER:ro
command: ["--workers", $WORKER_NUMBER]
runtime: nvidia
33 changes: 33 additions & 0 deletions docker-entrypoint.sh
@@ -0,0 +1,33 @@
#!/bin/bash
set -ea

echo "RUNNING SERVICE"

# Launch parameters, environment variables and dependencies check
if [ -z "$SERVICE_MODE" ]
then
    echo "ERROR: Must specify a serving mode: [ http | task ]"
    exit 1
else
    if [ "$SERVICE_MODE" = "http" ]
    then
        echo "RUNNING HTTP SERVER"
        python http_server/ingress.py --debug
    elif [ "$SERVICE_MODE" == "task" ]
    then
        if [[ -z "$SERVICES_BROKER" ]]
        then
            echo "ERROR: SERVICES_BROKER variable not specified, cannot start celery worker."
            exit 1
        fi
        /usr/src/app/wait-for-it.sh $(echo $SERVICES_BROKER | cut -d'/' -f 3) --timeout=20 --strict -- echo " $SERVICES_BROKER (Service Broker) is up"
        echo "RUNNING CELERY WORKER"
        # gevent pool for GPU inference, prefork otherwise
        POOL=$([ "$USE_GPU" == "True" ] && echo "gevent" || echo "prefork")
        celery --app=celery_app.celeryapp worker -Ofair -n nlp_${SERVICE_NAME}_worker@%h --queues=${SERVICE_NAME} -c ${CONCURRENCY} --pool=$POOL
    else
        echo "ERROR: Wrong serving mode: $SERVICE_MODE"
        exit 1
    fi
fi

echo "Service stopped"
83 changes: 83 additions & 0 deletions document/swagger.yml
@@ -0,0 +1,83 @@
openapi: 3.0.1
info:
  title: Keyphrase Extraction API
  description: API to detect keyphrases in text.
  version: 0.2.0

servers:
  - url: /

paths:
  /kpe/{lang}:
    post:
      tags:
        - Keyphrase Extraction API
      summary: Perform Keyphrase Extraction
      parameters:
        - name: lang
          in: path
          required: true
          description: Language
          schema:
            type: string
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/request'
      responses:
        200:
          description: "Job successfully finished"
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/responsemodel'
        400:
          description: "Bad request"
        500:
          description: "Server error"

components:
  schemas:
    article:
      type: object
      properties:
        text:
          type: string
          default: This is an article.
    request:
      type: object
      required:
        - articles
      properties:
        articles:
          type: array
          items:
            $ref: '#/components/schemas/article'
        component_cfg:
          type: object

    keyphrase:
      type: object
      properties:
        text:
          type: string
        score:
          type: number
          format: float
    batch:
      type: object
      properties:
        text:
          type: string
        keyphrases:
          type: array
          items:
            $ref: '#/components/schemas/keyphrase'

    responsemodel:
      type: object
      properties:
        kpe:
          type: array
          items:
            $ref: '#/components/schemas/batch'

10 changes: 10 additions & 0 deletions healthcheck.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

set -eax

if [ "$SERVICE_MODE" = "http" ]
then
    curl --fail http://localhost:80/healthcheck || exit 1
else
    # Worker name must match the -n flag in docker-entrypoint.sh (nlp_${SERVICE_NAME}_worker@%h)
    celery --app=celery_app.celeryapp inspect ping -d nlp_${SERVICE_NAME}_worker@$HOSTNAME || exit 1
fi
Empty file added http_server/__init__.py
Empty file.