diff --git a/.envdefault b/.envdefault index 1e817c4..1503427 100644 --- a/.envdefault +++ b/.envdefault @@ -1,4 +1,15 @@ +# APPLICATION PARAMETERS APP_LANG=fr en ASSETS_PATH_ON_HOST=./assets ASSETS_PATH_IN_CONTAINER=/app/assets -WORKER_NUMBER=1 \ No newline at end of file +LM_MAP={"fr":"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2","en":"sentence-transformers/all-MiniLM-L6-v2"} + +# SERVING PARAMETERS +SERVICE_MODE=http +CONCURRENCY=1 +USE_GPU=True + +# MICRO-SERVICE PARAMETERS +SERVICE_NAME=kpe +SERVICES_BROKER=redis://172.17.0.1:6379 +BROKER_PASS= \ No newline at end of file diff --git a/.github/workflows/dockerhub-description.yml b/.github/workflows/dockerhub-description.yml new file mode 100644 index 0000000..c2b874c --- /dev/null +++ b/.github/workflows/dockerhub-description.yml @@ -0,0 +1,20 @@ +name: Update Docker Hub Description +on: + push: + branches: + - master + paths: + - README.md + - .github/workflows/dockerhub-description.yml +jobs: + dockerHubDescription: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Docker Hub Description + uses: peter-evans/dockerhub-description@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + repository: lintoai/linto-platform-nlp-keyphrase-extraction + readme-filepath: ./README.md \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 36a97a5..b56d7fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,20 @@ FROM lintoai/linto-platform-nlp-core:latest LABEL maintainer="gshang@linagora.com" -WORKDIR /app +WORKDIR /usr/src/app -COPY ./requirements.txt /app/ +COPY requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt -COPY .envdefault /app/ -COPY ./scripts /app/scripts -COPY ./components /app/components +COPY kpe /usr/src/app/kpe +COPY components /usr/src/app/components +COPY celery_app /usr/src/app/celery_app +COPY http_server /usr/src/app/http_server +COPY document /usr/src/app/document +COPY 
docker-entrypoint.sh wait-for-it.sh healthcheck.sh ./ -HEALTHCHECK --interval=15s CMD curl -fs http://0.0.0.0/health || exit 1 +ENV PYTHONPATH="${PYTHONPATH}:/usr/src/app/kpe" -ENTRYPOINT ["/opt/conda/bin/gunicorn", "scripts.main:app", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:80", "--access-logfile", "-", "--error-logfile", "-"] -CMD ["--workers", "1"] \ No newline at end of file +HEALTHCHECK CMD ./healthcheck.sh + +ENTRYPOINT ["./docker-entrypoint.sh"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c39e3a4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. 
+ + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you".
"Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2.
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures.
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged.
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange.
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License.
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party.
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License.
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License.
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16.
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code.
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/README.md b/README.md index 105da15..b1fe0da 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,9 @@ ## Description This repository is for building a Docker image for LinTO's NLP service: Keyphrase Extraction on the basis of [linto-platform-nlp-core](https://github.com/linto-ai/linto-platform-nlp-core), can be deployed along with [LinTO stack](https://github.com/linto-ai/linto-platform-stack) or in a standalone way (see Develop section in below). -linto-platform-nlp-keyphrase-extraction is backed by [spaCy](https://spacy.io/) v3.0+ featuring transformer-based pipelines, thus deploying with GPU support is highly recommeded for inference efficiency. +LinTo's NLP services adopt the basic design concept of spaCy: [component and pipeline](https://spacy.io/usage/processing-pipelines), components (located under the folder `components/`) are decoupled from the service and can be easily re-used in other spaCy projects, components are organised into pipelines for realising specific NLP tasks. -LinTo's NLP services adopt the basic design concept of spaCy: [component and pipeline](https://spacy.io/usage/processing-pipelines), components are decoupled from the service and can be easily re-used in other projects, components are organised into pipelines for realising specific NLP tasks. - -This service uses [FastAPI](https://fastapi.tiangolo.com/) to serve custom spaCy's components as pipelines: -- `kpe`: Keyphrase Extraction +This service can be launched in two ways: REST API and Celery task, with and without GPU support.
## Usage @@ -29,14 +26,22 @@ bash scripts/download_models.sh 2 configure running environment variables ```bash -mv .envdefault .env -# cat .envdefault -# APP_LANG=fr en | Running language of application, "fr en", "fr", etc. -# ASSETS_PATH_ON_HOST=./assets | Storage path of models on host. (only applicable when docker-compose is used) -# ASSETS_PATH_IN_CONTAINER=/app/assets | Volume mount point of models in container. (only applicable when docker-compose is used) -# WORKER_NUMBER=1 | Number of processing workers. (only applicable when docker-compose is used) +cp .envdefault .env ``` +| Environment Variable | Description | Default Value | +| --- | --- | --- | +| `APP_LANG` | A space-separated list of supported languages for the application | fr en | +| `ASSETS_PATH_ON_HOST` | The path to the assets folder on the host machine | ./assets | +| `ASSETS_PATH_IN_CONTAINER` | The volume mount point of models in container | /app/assets | +| `LM_MAP` | A JSON string that maps each supported language to its corresponding language model | {"fr":"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2","en":"sentence-transformers/all-MiniLM-L6-v2"} | +| `SERVICE_MODE` | The mode in which the service is served, either "http" (REST API) or "task" (Celery task) | "http" | +| `CONCURRENCY` | The maximum number of requests that can be handled concurrently | 1 | +| `USE_GPU` | A flag indicating whether to use GPU for computation or not, either "True" or "False" | True | +| `SERVICE_NAME` | The name of the micro-service | kpe | +| `SERVICES_BROKER` | The URL of the broker server used for communication between micro-services | "redis://localhost:6379" | +| `BROKER_PASS` | The password for accessing the broker server | None | + 4 Build image ```bash sudo docker build --tag lintoai/linto-platform-nlp-keyphrase-extraction:latest . 
@@ -52,22 +57,29 @@ sudo docker run --gpus all \ --rm -p 80:80 \ -v $PWD/assets:/app/assets:ro \ --env-file .env \ -lintoai/linto-platform-nlp-keyphrase-extraction:latest \ ---workers 1 +lintoai/linto-platform-nlp-keyphrase-extraction:latest ``` +
+ Check running with CPU only setting + + - remove `--gpus all` from the first command. + - set `USE_GPU=False` in the `.env`. +
+ or + ```bash sudo docker-compose up ```
Check running with CPU only setting - - remove `--gpus all` from the first command. - remove `runtime: nvidia` from the `docker-compose.yml` file. + - set `USE_GPU=False` in the `.env`.
-6 Navigate to `http://localhost/docs` or `http://localhost/redoc` in your browser, to explore the REST API interactively. See the examples for how to query the API. +6 If running under `SERVICE_MODE=http`, navigate to `http://localhost/docs` or `http://localhost/redoc` in your browser, to explore the REST API interactively. See the examples for how to query the API. If running under `SERVICE_MODE=task`, please refer to the dedicated section at the end of this README. ## Specification for `http://localhost/kpe/{lang}` @@ -198,4 +210,33 @@ Component's config can be modified in [`components/config.cfg`](components/confi ``` ### Advanced usage -For advanced usage, such as Max Sum Similarity and Maximal Marginal Relevance for diversifying extraction results, please refer to the documentation of [KeyBERT](https://maartengr.github.io/KeyBERT/guides/quickstart.html#usage) and [medium post](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) to know how it works. \ No newline at end of file +For advanced usage, such as Max Sum Similarity and Maximal Marginal Relevance for diversifying extraction results, please refer to the documentation of [KeyBERT](https://maartengr.github.io/KeyBERT/guides/quickstart.html#usage) and [medium post](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) to know how it works. + + +## Testing Celery mode locally +1 Install Redis on your local machine, and run it with: +```bash +redis-server --protected-mode no --bind 0.0.0.0 --loglevel debug +``` + +2 Make sure in your `.env`, these two variables are set correctly as `SERVICE_MODE=task` and `SERVICES_BROKER=redis://172.17.0.1:6379` + +Then start your docker container with either `docker run` or `docker-compose up` as shown in the previous section.
+ +3 On your local computer, run this python script: +```python +from celery import Celery +celery = Celery(broker='redis://localhost:6379/0', backend='redis://localhost:6379/1') +r = celery.send_task( + 'kpe_task', + ( + 'en', + [ + "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software and online services.", + "Unsupervised learning is a type of machine learning in which the algorithm is not provided with any pre-assigned labels or scores for the training data. As a result, unsupervised learning algorithms must first self-discover any naturally occurring patterns in that training data set." + ], + {"kpe": {"top_n": 3}} + ), + queue='kpe') +r.get() +``` diff --git a/RELEASE.md b/RELEASE.md index 86038ce..9592a84 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,6 @@ +# 0.2.0 +- Migration to the [template](https://github.com/linto-ai/linto-template-microservice) of LinTO microservices. + # 0.1.0 - Initial commit. - Keyphrase Extraction.
\ No newline at end of file diff --git a/scripts/__init__.py b/celery_app/__init__.py similarity index 100% rename from scripts/__init__.py rename to celery_app/__init__.py diff --git a/celery_app/celeryapp.py b/celery_app/celeryapp.py new file mode 100644 index 0000000..7be9926 --- /dev/null +++ b/celery_app/celeryapp.py @@ -0,0 +1,28 @@ +import os +from celery import Celery + +from kpe import logger + +celery = Celery(__name__, include=['celery_app.tasks']) +service_name = os.environ.get("SERVICE_NAME", "kpe") +broker_url = os.environ.get("SERVICES_BROKER") +if os.environ.get("BROKER_PASS", False): + components = broker_url.split('//') + broker_url = f'{components[0]}//:{os.environ.get("BROKER_PASS")}@{components[1]}' +celery.conf.broker_url = "{}/0".format(broker_url) +celery.conf.result_backend = "{}/1".format(broker_url) +celery.conf.update( + result_expires=3600, + task_acks_late=True, + task_track_started = True) + +# Queues +celery.conf.update( + {'task_routes': { + 'kpe_task' : {'queue': 'kpe'},} + } +) + +logger.info( + f"Celery configured for broker located at {broker_url} with service name {service_name}" +) \ No newline at end of file diff --git a/celery_app/tasks.py b/celery_app/tasks.py new file mode 100644 index 0000000..c69ac9e --- /dev/null +++ b/celery_app/tasks.py @@ -0,0 +1,35 @@ +import spacy +import components + +from typing import Dict, List + +from celery_app.celeryapp import celery + +from kpe import logger +from kpe.processing import LM_MAP, MODELS, get_model +from kpe.processing.utils import get_data + + +@celery.task(name="kpe_task") +def kpe_task(lang: str, texts: List[str], component_cfg: Dict = {}): + """Process a batch of articles and return the Keyphrases predicted by the + given model. Each record in the data should have a key "text". 
+ """ + logger.info('KPE task received') + + # Check language availability + if lang in LM_MAP.keys(): + model_name = LM_MAP[lang] + if model_name not in MODELS.keys(): + raise RuntimeError(f"Model {model_name} for language {lang} is not loaded.") + nlp = spacy.blank(lang) + nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": model_name}}) + else: + raise ValueError(f"Language {lang} is not supported.") + + response_body = [] + + for doc in nlp.pipe(texts, component_cfg=component_cfg): + response_body.append(get_data(doc)) + + return {"kpe": response_body} diff --git a/docker-compose.yml b/docker-compose.yml index 2882e6f..97e09f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,5 +9,4 @@ services: - .env volumes: - $ASSETS_PATH_ON_HOST:$ASSETS_PATH_IN_CONTAINER:ro - command: ["--workers", $WORKER_NUMBER] runtime: nvidia \ No newline at end of file diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh new file mode 100755 index 0000000..a697890 --- /dev/null +++ b/docker-entrypoint.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -ea + +echo "RUNNING SERVICE" + +# Launch parameters, environement variables and dependencies check +if [ -z "$SERVICE_MODE" ] +then + echo "ERROR: Must specify a serving mode: [ http | task ]" + exit -1 +else + if [ "$SERVICE_MODE" = "http" ] + then + echo "RUNNING HTTP SERVER" + python http_server/ingress.py --debug + elif [ "$SERVICE_MODE" == "task" ] + then + if [[ -z "$SERVICES_BROKER" ]] + then + echo "ERROR: SERVICES_BROKER variable not specified, cannot start celery worker." 
+ return -1 + fi + /usr/src/app/wait-for-it.sh $(echo $SERVICES_BROKER | cut -d'/' -f 3) --timeout=20 --strict -- echo " $SERVICES_BROKER (Service Broker) is up" + echo "RUNNING CELERY WORKER" + POOL=$([ $USE_GPU == "True" ] && echo "gevent" || echo "prefork") + celery --app=celery_app.celeryapp worker -Ofair -n nlp_${SERVICE_NAME}_worker@%h --queues=${SERVICE_NAME} -c ${CONCURRENCY} --pool=$POOL + else + echo "ERROR: Wrong serving command: $1" + exit -1 + fi +fi + +echo "Service stopped" \ No newline at end of file diff --git a/document/swagger.yml b/document/swagger.yml new file mode 100644 index 0000000..a6c97da --- /dev/null +++ b/document/swagger.yml @@ -0,0 +1,83 @@ +openapi: 3.0.1 +info: + title: Keyphrase Extraction API + description: API to detect keyphrases in text. + version: 0.2.0 + +servers: +- url: / + +paths: + /kpe/{lang}: + post: + tags: + - Keyphrase Extraction API + summary: Perform Keyphrase Extraction + parameters: + - name: lang + in: path + required: true + description: Language + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/request' + responses: + 200: + description: "Job successfully finished" + content: + application/json: + schema: + $ref: '#/components/schemas/responsemodel' + 400: + description: "Bad request" + 500: + description: "Server error" + +components: + schemas: + article: + type: object + properties: + text: + type: string + default: This is an article. 
+ request: + type: object + properties: + articles: + type: array + required: true + items: + $ref: '#/components/schemas/article' + component_cfg: + type: object + + keyphrase: + type: object + properties: + text: + type: string + score: + type: float + batch: + type: object + properties: + text: + type: string + keyphrases: + type: array + items: + $ref: '#/components/schemas/keyphrase' + + responsemodel: + type: object + properties: + kpe: + type: array + items: + $ref: '#/components/schemas/batch' + \ No newline at end of file diff --git a/healthcheck.sh b/healthcheck.sh new file mode 100755 index 0000000..f1a1b21 --- /dev/null +++ b/healthcheck.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +set -eax + +if [ "$SERVICE_MODE" = "http" ] +then + curl --fail http://localhost:80/healthcheck || exit 1 +else + celery --app=celery_app.celeryapp inspect ping -d ${SERVICE_NAME}_worker@$HOSTNAME || exit 1 +fi diff --git a/http_server/__init__.py b/http_server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/http_server/confparser.py b/http_server/confparser.py new file mode 100644 index 0000000..882c7f1 --- /dev/null +++ b/http_server/confparser.py @@ -0,0 +1,51 @@ +import os +import argparse + +__all__ = ["createParser"] + +def createParser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + # SERVICE + parser.add_argument( + '--service_name', + type=str, + help='Service Name', + default=os.environ.get('SERVICE_NAME', 'kpe')) + + #GUNICORN + parser.add_argument( + '--service_port', + type=int, + help='Service port', + default=80) + parser.add_argument( + '--workers', + type=int, + help="Number of Gunicorn workers (default=CONCURRENCY + 1)", + default=int(os.environ.get('CONCURRENCY', 1)) + 1) + + #SWAGGER + parser.add_argument( + '--swagger_url', + type=str, + help='Swagger interface url', + default='/docs') + parser.add_argument( + '--swagger_prefix', + type=str, + help='Swagger prefix', + default=os.environ.get('SWAGGER_PREFIX', '')) 
+ parser.add_argument( + '--swagger_path', + type=str, + help='Swagger file path', + default=os.environ.get('SWAGGER_PATH', '/usr/src/app/document/swagger.yml')) + + #MISC + parser.add_argument( + '--debug', + action='store_true', + help='Display debug logs') + + return parser \ No newline at end of file diff --git a/http_server/ingress.py b/http_server/ingress.py new file mode 100644 index 0000000..4f35b03 --- /dev/null +++ b/http_server/ingress.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import os +import json +import logging +from time import time + +import spacy +import components + +from flask import Flask, request, abort, Response, json +from serving import GeventServing, GunicornServing +from confparser import createParser +from swagger import setupSwaggerUI + +from kpe.processing import LM_MAP, MODELS, get_model +from kpe.processing.utils import get_data + + +app = Flask("__kpe-worker__") +app.config["JSON_AS_ASCII"] = False +app.config["JSON_SORT_KEYS"] = False + +logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s', datefmt='%d/%m/%Y %H:%M:%S') +logger = logging.getLogger("__kpe-worker__") + +@app.route('/healthcheck', methods=['GET']) +def healthcheck(): + return json.dumps({"healthcheck": "OK"}), 200 + +@app.route("/oas_docs", methods=['GET']) +def oas_docs(): + return "Not Implemented", 501 + +@app.route("/kpe/", methods=['POST']) +def kpe(lang: str): + """Process a batch of articles and return the Keyphrases predicted by the + given model. Each record in the data should have a key "text". 
+ """ + logger.info('KPE request received') + + # Check language availability + if lang in LM_MAP.keys(): + model_name = LM_MAP[lang] + if model_name not in MODELS.keys(): + raise RuntimeError(f"Model {model_name} for language {lang} is not loaded.") + nlp = spacy.blank(lang) + nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": model_name}}) + else: + raise ValueError(f"Language {lang} is not supported.") + + response_body = [] + request_body = json.loads(request.data) + texts = [article["text"] for article in request_body.get("articles", [])] + + component_cfg = request_body.get("component_cfg", {}) + + for doc in nlp.pipe(texts, component_cfg=component_cfg): + response_body.append(get_data(doc)) + + return {"kpe": response_body}, 200 + +# Rejected request handlers +@app.errorhandler(405) +def method_not_allowed(error): + return 'The method is not allowed for the requested URL', 405 + +@app.errorhandler(404) +def page_not_found(error): + return 'The requested URL was not found', 404 + +@app.errorhandler(500) +def server_error(error): + logger.error(error) + return 'Server Error', 500 + +if __name__ == '__main__': + logger.info("Startup...") + + parser = createParser() + args = parser.parse_args() + logger.setLevel(logging.DEBUG if args.debug else logging.INFO) + try: + # Setup SwaggerUI + if args.swagger_path is not None: + setupSwaggerUI(app, args) + logger.debug("Swagger UI set.") + except Exception as e: + logger.warning("Could not setup swagger: {}".format(str(e))) + + if os.environ.get("USE_GPU", "True") == "True": + serving_type = GeventServing + logger.debug("Serving with gevent") + else: + serving_type = GunicornServing + logger.debug("Serving with gunicorn") + + serving = serving_type(app, {'bind': '{}:{}'.format("0.0.0.0", args.service_port), + 'workers': args.workers,}) + logger.info(args) + try: + serving.run() + except KeyboardInterrupt: + logger.info("Process interrupted by user") + except Exception as e: + logger.error(str(e)) + 
logger.critical("Service is shut down (Error)") + exit(e) diff --git a/http_server/serving.py b/http_server/serving.py new file mode 100644 index 0000000..68d32e5 --- /dev/null +++ b/http_server/serving.py @@ -0,0 +1,39 @@ +import gunicorn.app.base +import gevent.pywsgi +import gevent.monkey +gevent.monkey.patch_all() + +class GunicornServing(gunicorn.app.base.BaseApplication): + + def __init__(self, app, options=None): + self.options = options or {} + self.application = app + super().__init__() + + def load_config(self): + config = {key: value for key, value in self.options.items() + if key in self.cfg.settings and value is not None} + for key, value in config.items(): + self.cfg.set(key.lower(), value) + + def load(self): + return self.application + +class GeventServing(): + + def __init__(self, app, options=None): + self.options = options or {} + self.application = app + + def run(self): + bind = self.options.get('bind', "0.0.0.0:8080") + workers = self.options.get('workers', 1) + listener = bind.split(':') + try: + assert len(listener) == 2 + listener = (listener[0], int(listener[1])) + except: + print(f"Invalid bind address {bind}") + + server = gevent.pywsgi.WSGIServer(listener, self.application, spawn = workers) + server.serve_forever() \ No newline at end of file diff --git a/http_server/swagger.py b/http_server/swagger.py new file mode 100644 index 0000000..4388252 --- /dev/null +++ b/http_server/swagger.py @@ -0,0 +1,17 @@ +import yaml +from flask_swagger_ui import get_swaggerui_blueprint + +def setupSwaggerUI(app, args): + '''Setup Swagger UI within the app''' + swagger_yml = yaml.load( + open(args.swagger_path, 'r'), Loader=yaml.Loader) + swaggerui = get_swaggerui_blueprint( + # Swagger UI static files will be mapped to '{SWAGGER_URL}/dist/' + args.swagger_prefix + args.swagger_url, + args.swagger_path, + config={ # Swagger UI config overrides + 'app_name': "LinTO Platform NLP Keyphrase Extraction", + 'spec': swagger_yml + } + ) + 
app.register_blueprint(swaggerui, url_prefix=args.swagger_url) \ No newline at end of file diff --git a/kpe/__init__.py b/kpe/__init__.py new file mode 100644 index 0000000..6ce48d9 --- /dev/null +++ b/kpe/__init__.py @@ -0,0 +1,8 @@ +import logging + +logging.basicConfig( + format="%(asctime)s %(name)s %(levelname)s: %(message)s", + datefmt="%d/%m/%Y %H:%M:%S", +) +logger = logging.getLogger("__kpe__") +logger.setLevel(logging.INFO) diff --git a/kpe/processing/__init__.py b/kpe/processing/__init__.py new file mode 100644 index 0000000..086da68 --- /dev/null +++ b/kpe/processing/__init__.py @@ -0,0 +1,29 @@ +import os +import ast +import sys +import spacy +from time import time + +from kpe import logger +from kpe.processing.utils import get_data + +from sentence_transformers import SentenceTransformer + +__all__ = ["logger", "get_data", "LM_MAP", "MODELS", "get_model"] + +logger.info("Loading language model(s)...") +start = time() + +LM_MAP = ast.literal_eval(os.environ["LM_MAP"]) + +try: + MODELS = {LM_MAP[lang]: SentenceTransformer(os.environ.get("ASSETS_PATH_IN_CONTAINER") + '/' + LM_MAP[lang]) for lang in os.environ.get("APP_LANG").split(" ")} +except Exception as err: + raise Exception("Failed to load model(s): {}".format(str(err))) from err + sys.exit(-1) + +@spacy.registry.misc("get_model") +def get_model(name): + return MODELS[name] + +logger.info(f"(t={time() - start}s). Loaded {len(MODELS)} models: {MODELS.keys()}.") \ No newline at end of file diff --git a/kpe/processing/utils.py b/kpe/processing/utils.py new file mode 100644 index 0000000..a13a2d8 --- /dev/null +++ b/kpe/processing/utils.py @@ -0,0 +1,16 @@ +from typing import Dict, Any + +from spacy.tokens import Doc + +def get_data(doc: Doc) -> Dict[str, Any]: + """Extract the data to return from the REST API given a Doc object. 
Modify + this function to include other data.""" + keyphrases = [ + { + "text": keyphrase[0], + "score": keyphrase[1] + } + for keyphrase in doc._.keyphrases + ] + return {"text": doc.text, "keyphrases": keyphrases} + \ No newline at end of file diff --git a/scripts/main.py b/scripts/main.py deleted file mode 100644 index ff4e52c..0000000 --- a/scripts/main.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import spacy -import components -from scripts.schemas import * -from spacy.tokens import Doc -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from fastapi_health import health -from sentence_transformers import SentenceTransformer -from dotenv import load_dotenv - -# To force the GPU usage: spacy.require_gpu() -spacy.prefer_gpu() - -# Parse environment variables. -# variable and its value in the .envdefault file will be set, only if the variable is missing or empty in the current enviroment. -load_dotenv(".envdefault", override=False) - -# Supported languages and corresponding model names -LM_MAP = { - "fr": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - "en": "sentence-transformers/all-MiniLM-L6-v2" - } - -# Load models -MODELS = {LM_MAP[lang]: SentenceTransformer(os.environ.get("ASSETS_PATH_IN_CONTAINER") + '/' + LM_MAP[lang]) for lang in os.environ.get("APP_LANG").split(" ")} -print(f"Loaded {len(MODELS)} models: {MODELS.keys()}") - -@spacy.registry.misc("get_model") -def get_model(name): - return MODELS[name] - -# Set up the FastAPI app and define the endpoints -app = FastAPI() -app.add_middleware(CORSMiddleware, allow_origins=["*"]) - -# Health check -def healthy(): - return {"linto-platform-nlp-keyphrase-extraction": "online"} -app.add_api_route("/health", health([healthy])) - -# Keyphrase Extraction -def get_data(doc: Doc) -> Dict[str, Any]: - """Extract the data to return from the REST API given a Doc object. 
Modify - this function to include other data.""" - keyphrases = [ - { - "text": keyphrase[0], - "score": keyphrase[1] - } - for keyphrase in doc._.keyphrases - ] - return {"text": doc.text, "keyphrases": keyphrases} - -@app.post("/kpe/{lang}", summary="Keyphrase Extraction", response_model=KpeResponseModel) -def kpe(lang: str, query: RequestModel): - """Process a batch of articles and return the Keyphrases predicted by the - given model. Each record in the data should have a key "text". - """ - if lang in LM_MAP.keys(): - model_name = LM_MAP[lang] - if model_name not in MODELS.keys(): - raise RuntimeError(f"Model {model_name} for language {lang} is not loaded.") - nlp = spacy.blank(lang) - nlp.add_pipe("kpe", config={"model": {"@misc": "get_model", "name": model_name}}) - else: - raise ValueError(f"Language {lang} is not supported.") - - response_body = [] - texts = (article.text for article in query.articles) - for doc in nlp.pipe(texts, component_cfg=query.component_cfg): - response_body.append(get_data(doc)) - return {"kpe": response_body} \ No newline at end of file diff --git a/scripts/schemas.py b/scripts/schemas.py deleted file mode 100644 index 3fc7bcc..0000000 --- a/scripts/schemas.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import List, Dict, Any, Optional -from pydantic import BaseModel - -class Article(BaseModel): - # Schema for a single article in a batch of articles to process - text: str - - -class RequestModel(BaseModel): - # Schema for a request consisting a batch of articles, and component configuration - articles: List[Article] - component_cfg: Optional[Dict[str, Dict[str, Any]]] = None - - -class KpeResponseModel(BaseModel): - # This is the schema of the expected response and depends on what you - # return from get_data. 
- - class Batch(BaseModel): - class Kyephrase(BaseModel): - text: str - score: float - - text: str - keyphrases: List[Kyephrase] = [] - - kpe: List[Batch] diff --git a/wait-for-it.sh b/wait-for-it.sh new file mode 100755 index 0000000..92cbdbb --- /dev/null +++ b/wait-for-it.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# Use this script to test if a given TCP host/port are available + +WAITFORIT_cmdname=${0##*/} + +echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } + +usage() +{ + cat << USAGE >&2 +Usage: + $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] + -h HOST | --host=HOST Host or IP under test + -p PORT | --port=PORT TCP port under test + Alternatively, you specify the host and port as host:port + -s | --strict Only execute subcommand if the test succeeds + -q | --quiet Don't output any status messages + -t TIMEOUT | --timeout=TIMEOUT + Timeout in seconds, zero for no timeout + -- COMMAND ARGS Execute command with args after the test finishes +USAGE + exit 1 +} + +wait_for() +{ + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + else + echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" + fi + WAITFORIT_start_ts=$(date +%s) + while : + do + if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then + nc -z $WAITFORIT_HOST $WAITFORIT_PORT + WAITFORIT_result=$? + else + (echo > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 + WAITFORIT_result=$? 
+ fi + if [[ $WAITFORIT_result -eq 0 ]]; then + WAITFORIT_end_ts=$(date +%s) + echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" + break + fi + sleep 1 + done + return $WAITFORIT_result +} + +wait_for_wrapper() +{ + # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 + if [[ $WAITFORIT_QUIET -eq 1 ]]; then + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + else + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + fi + WAITFORIT_PID=$! + trap "kill -INT -$WAITFORIT_PID" INT + wait $WAITFORIT_PID + WAITFORIT_RESULT=$? + if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + ;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == 
"" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# Check to see if timeout is from busybox? +WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) + +WAITFORIT_BUSYTIMEFLAG="" +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + # Check if busybox timeout uses -t flag + # (recent Alpine versions don't support -t anymore) + if timeout &>/dev/stdout | grep -q -e '-t '; then + WAITFORIT_BUSYTIMEFLAG="-t" + fi +else + WAITFORIT_ISBUSY=0 +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? + fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi \ No newline at end of file