From 732b8c558a45c17e1a805fe7f9131fa5c1328bf7 Mon Sep 17 00:00:00 2001
From: Jeronymous
Date: Tue, 13 Jun 2023 13:38:57 +0200
Subject: [PATCH] Update requirements. Document models

---
 Dockerfile.cpu       | 40 ++++++++++++++++++++++++++++++++++++++++
 README.md            | 13 ++++++++++++-
 requirements.cpu.txt | 11 +++++++++++
 requirements.txt     |  7 +++----
 4 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 Dockerfile.cpu
 create mode 100644 requirements.cpu.txt

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
new file mode 100644
index 0000000..5547e06
--- /dev/null
+++ b/Dockerfile.cpu
@@ -0,0 +1,40 @@
+FROM python:3.9
+LABEL maintainer="jlouradour@linagora.com"
+
+# System packages; the apt lists are removed in the same layer so they do not bloat the image
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    g++ \
+    openjdk-11-jre-headless \
+    wget && \
+    rm -rf /var/lib/apt/lists/*
+
+# Rust compiler for tokenizers
+# The installer is downloaded to a file first so that a failed download aborts the build
+# (with `curl | bash`, /bin/sh only reports the exit status of the last command of the pipe)
+RUN curl -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh && \
+    bash /tmp/rustup-init.sh -y && \
+    rm -f /tmp/rustup-init.sh
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /usr/src/app
+
+# Python dependencies
+COPY requirements.cpu.txt .
+RUN pip3 install --no-cache-dir -r requirements.cpu.txt -f https://download.pytorch.org/whl/torch_stable.html
+
+# Application code and runtime directories
+COPY celery_app /usr/src/app/celery_app
+COPY http_server /usr/src/app/http_server
+COPY document /usr/src/app/document
+COPY punctuation /usr/src/app/punctuation
+RUN mkdir -p /usr/src/app/model-store /usr/src/app/tmp
+COPY docker-entrypoint.sh wait-for-it.sh healthcheck.sh ./
+
+ENV PYTHONPATH="${PYTHONPATH}:/usr/src/app/punctuation"
+HEALTHCHECK CMD ./healthcheck.sh
+
+ENV TEMP=/usr/src/app/tmp
+ENTRYPOINT ["./docker-entrypoint.sh"]
diff --git a/README.md b/README.md
index ee896a4..0b0816d 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,18 @@ LinTO-platform-punctuation can either be used as a standalone punctuation servic
 ### Models
 The punctuation service relies on a trained recasing and punctuation prediction model.
 We provide homebrew models on [dl.linto.ai](https://dl.linto.ai/downloads/model-distribution/punctuation_models/). 
+Some models trained on [Common Crawl](http://data.statmt.org/cc-100/) are available on [recasepunc](https://github.com/benob/recasepunc) for the following languages: +* French + * [fr-txt.large.19000](https://github.com/benob/recasepunc/releases/download/0.3/fr-txt.large.19000) + * [fr.22000](https://github.com/benob/recasepunc/releases/download/0.3/fr.22000) +* English + * [en.23000](https://github.com/benob/recasepunc/releases/download/0.3/en.23000) +* Italian + * [it.22000](https://github.com/CoffeePerry/recasepunc/releases/download/v0.1.0/it.22000) +* Chinese + * [zh.24000](https://github.com/benob/recasepunc/releases/download/0.3/zh.24000) + + ### Docker The punctuation service requires docker up and running. diff --git a/requirements.cpu.txt b/requirements.cpu.txt new file mode 100644 index 0000000..a24a7b6 --- /dev/null +++ b/requirements.cpu.txt @@ -0,0 +1,11 @@ +celery[redis,auth,msgpack]>=4.4.7 +flask>=1.1.2 +flask-cors>=3.0.10 +flask-swagger-ui>=3.36.0 +gevent>=22.10.2 +gunicorn>=20.1.0 +git+https://github.com/benob/mosestokenizer.git@169bd3a504fe20a3e51b9a7af3f0ca359c2d36c9 +numpy==1.19.5 +regex==2021.8.28 +torch==1.9.0+cpu +transformers==4.10.0 diff --git a/requirements.txt b/requirements.txt index 3499eab..edaaa92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,10 @@ celery[redis,auth,msgpack]>=4.4.7 flask>=1.1.2 flask-cors>=3.0.10 flask-swagger-ui>=3.36.0 -gevent>=22.10.2 # NOCOMMIT -waitress>=2.1.2 # NOCOMMIT +gevent>=22.10.2 gunicorn>=20.1.0 -git+https://github.com/benob/mosestokenizer.git +git+https://github.com/benob/mosestokenizer.git@169bd3a504fe20a3e51b9a7af3f0ca359c2d36c9 numpy==1.19.5 regex==2021.8.28 -torch==1.9.0+cpu # NOCOMMIT : use cu111? +torch==1.9.0 transformers==4.10.0