diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 000000000..3ac87141b --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,27 @@ +name: Run python tests with pytest + +on: [push, pull_request] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + cd application + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: | + python -m pytest diff --git a/application/Dockerfile b/application/Dockerfile index 12859724d..8c0839267 100644 --- a/application/Dockerfile +++ b/application/Dockerfile @@ -14,10 +14,10 @@ FROM python:3.10-slim-bullseye COPY --from=builder /usr/local/ /usr/local/ WORKDIR /app -COPY . /app +COPY . /app/application ENV FLASK_APP=app.py ENV FLASK_DEBUG=true EXPOSE 7091 -CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "wsgi:app"] +CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"] diff --git a/application/__init__.py b/application/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/application/app.py b/application/app.py index 156e34a7a..3ba2de69a 100644 --- a/application/app.py +++ b/application/app.py @@ -37,9 +37,9 @@ from pymongo import MongoClient from werkzeug.utils import secure_filename -from core.settings import settings -from error import bad_request -from worker import ingest_worker +from application.core.settings import settings +from application.error import bad_request +from application.worker import ingest_worker from bson.objectid import ObjectId # os.environ["LANGCHAIN_HANDLER"] = "langchain" @@ -68,19 +68,20 @@ dotenv.load_dotenv() # load the prompts -with open("prompts/combine_prompt.txt", "r") as f: +current_dir = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(current_dir, "prompts", "combine_prompt.txt"), "r") as f: template = f.read() -with open("prompts/combine_prompt_hist.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "combine_prompt_hist.txt"), "r") as f: template_hist = f.read() -with open("prompts/question_prompt.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "question_prompt.txt"), "r") as f: template_quest = f.read() -with open("prompts/chat_combine_prompt.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "chat_combine_prompt.txt"), "r") as f: chat_combine_template = f.read() -with open("prompts/chat_reduce_prompt.txt", "r") as f: +with open(os.path.join(current_dir, "prompts", "chat_reduce_prompt.txt"), "r") as f: chat_reduce_template = f.read() api_key_set = settings.API_KEY is not None @@ -92,7 +93,7 @@ app.config["CELERY_RESULT_BACKEND"] = settings.CELERY_RESULT_BACKEND app.config["MONGO_URI"] = settings.MONGO_URI celery = Celery() -celery.config_from_object("celeryconfig") +celery.config_from_object("application.celeryconfig") mongo = MongoClient(app.config["MONGO_URI"]) db = mongo["docsgpt"] vectors_collection = db["vectors"] @@ -129,6 +130,7 @@ def get_vectorstore(data): vectorstore = "" else: vectorstore = "" + vectorstore = os.path.join("application", vectorstore) return vectorstore diff --git a/application/parser/file/__init__.py b/application/parser/file/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/application/parser/file/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/file/base.py b/application/parser/file/base.py index 2fe9a75d1..f63e8ef64 100644 --- a/application/parser/file/base.py +++ b/application/parser/file/base.py @@ -3,7 +3,7 @@ from typing import Any, List from langchain.docstore.document import Document as LCDocument -from parser.schema.base import Document +from application.parser.schema.base import Document class BaseReader: diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 2be8e3289..593681e29 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -3,15 +3,15 @@ from pathlib import Path from typing import Callable, Dict, List, Optional, Union -from parser.file.base import BaseReader -from parser.file.base_parser import BaseParser -from parser.file.docs_parser import DocxParser, PDFParser -from parser.file.epub_parser import EpubParser -from parser.file.html_parser import HTMLParser -from parser.file.markdown_parser import MarkdownParser -from parser.file.rst_parser import RstParser -from parser.file.tabular_parser import PandasCSVParser -from parser.schema.base import Document +from application.parser.file.base import BaseReader +from application.parser.file.base_parser import BaseParser +from application.parser.file.docs_parser import DocxParser, PDFParser +from application.parser.file.epub_parser import EpubParser +from application.parser.file.html_parser import HTMLParser +from application.parser.file.markdown_parser import MarkdownParser +from application.parser.file.rst_parser import RstParser +from application.parser.file.tabular_parser import PandasCSVParser +from application.parser.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py index 0cde40769..861e8e589 100644 --- a/application/parser/file/docs_parser.py +++ b/application/parser/file/docs_parser.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Dict -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class PDFParser(BaseParser): diff --git a/application/parser/file/epub_parser.py b/application/parser/file/epub_parser.py index 6ece5ecfe..4f5e87115 100644 --- a/application/parser/file/epub_parser.py +++ b/application/parser/file/epub_parser.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Dict -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class EpubParser(BaseParser): diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py index 96460c7c2..f6f885fc8 100644 --- a/application/parser/file/html_parser.py +++ b/application/parser/file/html_parser.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Dict, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class HTMLParser(BaseParser): diff --git a/application/parser/file/markdown_parser.py b/application/parser/file/markdown_parser.py index d8aeb3b08..d906e9b6c 100644 --- a/application/parser/file/markdown_parser.py +++ b/application/parser/file/markdown_parser.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast import tiktoken -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class MarkdownParser(BaseParser): diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py index f8feff708..4bd0e6f42 100644 --- a/application/parser/file/rst_parser.py +++ b/application/parser/file/rst_parser.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class RstParser(BaseParser): diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index d7c6402a4..81355ae07 100644 --- a/application/parser/file/tabular_parser.py +++ b/application/parser/file/tabular_parser.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Dict, List, Union -from parser.file.base_parser import BaseParser +from application.parser.file.base_parser import BaseParser class CSVParser(BaseParser): diff --git a/application/parser/schema/__init__.py b/application/parser/schema/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/application/parser/schema/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/schema/base.py b/application/parser/schema/base.py index 3dafda1a0..61670f9a6 100644 --- a/application/parser/schema/base.py +++ b/application/parser/schema/base.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from langchain.docstore.document import Document as LCDocument -from parser.schema.schema import BaseDocument +from application.parser.schema.schema import BaseDocument @dataclass diff --git a/application/parser/token_func.py b/application/parser/token_func.py index aada673fa..14b231fcd 100644 --- a/application/parser/token_func.py +++ b/application/parser/token_func.py @@ -3,7 +3,7 @@ from typing import List import tiktoken -from parser.schema.base import Document +from application.parser.schema.base import Document def separate_header_and_body(text): diff --git a/application/requirements.txt b/application/requirements.txt index fc8d2a852..5bb6780b4 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -73,6 +73,7 @@ pymongo==4.3.3 pyowm==3.3.0 PyPDF2==3.0.1 PySocks==1.7.1 +pytest python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 diff --git a/application/worker.py b/application/worker.py index 2a3ff24a0..da955a7ec 100644 --- a/application/worker.py +++ b/application/worker.py @@ -7,11 +7,11 @@ import nltk import requests -from core.settings import settings -from parser.file.bulk import SimpleDirectoryReader -from parser.open_ai_func import call_openai_api -from parser.schema.base import Document -from parser.token_func import group_split +from application.core.settings import settings +from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.open_ai_func import call_openai_api +from application.parser.schema.base import Document +from application.parser.token_func import group_split try: nltk.download('punkt', quiet=True) diff --git a/application/wsgi.py b/application/wsgi.py index 6b8b4d0cc..5160e115e 100644 --- a/application/wsgi.py +++ b/application/wsgi.py @@ -1,4 +1,4 @@ -from app import app +from application.app import app if __name__ == "__main__": app.run(debug=True, port=7091) diff --git a/docker-compose-azure.yaml b/docker-compose-azure.yaml index a015eef21..70a168085 100644 --- a/docker-compose-azure.yaml +++ b/docker-compose-azure.yaml @@ -27,16 +27,16 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/app/indexes - - ./application/inputs:/app/inputs - - ./application/vectors:/app/vectors + - ./application/indexes:/app/application/indexes + - ./application/inputs:/app/application/inputs + - ./application/vectors:/app/application/vectors depends_on: - redis - mongo worker: build: ./application - command: celery -A app.celery worker -l INFO + command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY diff --git a/docker-compose.yaml b/docker-compose.yaml index a8917af49..d5dd10e5b 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -22,16 +22,16 @@ services: ports: - "7091:7091" volumes: - - ./application/indexes:/app/indexes - - ./application/inputs:/app/inputs - - ./application/vectors:/app/vectors + - ./application/indexes:/app/application/indexes + - ./application/inputs:/app/application/inputs + - ./application/vectors:/app/application/vectors depends_on: - redis - mongo worker: build: ./application - command: celery -A app.celery worker -l INFO + command: celery -A application.app.celery worker -l INFO environment: - API_KEY=$OPENAI_API_KEY - EMBEDDINGS_KEY=$OPENAI_API_KEY diff --git a/scripts/requirements.txt b/scripts/requirements.txt index f6b0b451e..c56feab45 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -110,8 +110,6 @@ tenacity==8.2.2 threadpoolctl==3.2.0 tiktoken==0.4.0 tokenizers==0.13.3 -torch==2.0.1 -torchvision==0.15.2 tqdm==4.65.0 transformers==4.31.0 typer==0.9.0 diff --git a/tests/test_app.py b/tests/test_app.py new file mode 100644 index 000000000..8ae9ee340 --- /dev/null +++ b/tests/test_app.py @@ -0,0 +1,28 @@ +from application.app import get_vectorstore +import os + + +# Test cases for get_vectorstore function +def test_no_active_docs(): + data = {} + assert get_vectorstore(data) == os.path.join("application", "") + + +def test_local_default_active_docs(): + data = {"active_docs": "local/default"} + assert get_vectorstore(data) == os.path.join("application", "") + + +def test_local_non_default_active_docs(): + data = {"active_docs": "local/something"} + assert get_vectorstore(data) == os.path.join("application", "indexes/local/something") + + +def test_default_active_docs(): + data = {"active_docs": "default"} + assert get_vectorstore(data) == os.path.join("application", "") + + +def test_complex_active_docs(): + data = {"active_docs": "local/other/path"} + assert get_vectorstore(data) == os.path.join("application", "indexes/local/other/path")