diff --git a/core/pyproject.toml b/core/pyproject.toml index bb6d9c44e839..2bb0abc8bf5d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "markupsafe>=2.1.5", "megaparse-sdk>=0.1.9", "langchain-mistralai>=0.2.3", + "fasttext-langdetect>=1.0.5", "langfuse>=2.57.0", ] readme = "README.md" diff --git a/core/quivr_core/brain/brain.py b/core/quivr_core/brain/brain.py index 0c6220a8e2a4..e8d657fcfbf5 100644 --- a/core/quivr_core/brain/brain.py +++ b/core/quivr_core/brain/brain.py @@ -10,9 +10,7 @@ from langchain_core.embeddings import Embeddings from langchain_core.messages import AIMessage, HumanMessage from langchain_core.vectorstores import VectorStore -from quivr_core.rag.entities.models import ParsedRAGResponse from langchain_openai import OpenAIEmbeddings -from quivr_core.rag.quivr_rag import QuivrQARAG from rich.console import Console from rich.panel import Panel @@ -24,16 +22,18 @@ LocalStorageConfig, TransparentStorageConfig, ) -from quivr_core.rag.entities.chat import ChatHistory -from quivr_core.rag.entities.config import RetrievalConfig from quivr_core.files.file import load_qfile from quivr_core.llm import LLMEndpoint +from quivr_core.processor.registry import get_processor_class +from quivr_core.rag.entities.chat import ChatHistory +from quivr_core.rag.entities.config import RetrievalConfig from quivr_core.rag.entities.models import ( ParsedRAGChunkResponse, + ParsedRAGResponse, QuivrKnowledge, SearchResult, ) -from quivr_core.processor.registry import get_processor_class +from quivr_core.rag.quivr_rag import QuivrQARAG from quivr_core.rag.quivr_rag_langgraph import QuivrQARAGLangGraph from quivr_core.storage.local_storage import LocalStorage, TransparentStorage from quivr_core.storage.storage_base import StorageBase @@ -567,6 +567,7 @@ async def aask( Returns: ParsedRAGResponse: The generated answer. 
class Language(str, Enum):
    """Language codes that ``detect_language`` can return.

    Each member's value is the short code string for that language. Because
    the enum also derives from ``str``, members compare equal to their code
    (``Language.EN == "en"``).

    ``UNKNOWN`` is a sentinel, not a language: ``detect_language`` returns it
    when no other member matches.

    NOTE(review): the member list presumably mirrors the label set of the
    fastText language-identification model used by ``ftlangdetect`` --
    confirm against that model before adding or removing members.
    """

    AF = "af"  # Afrikaans
    ALS = "als"  # Alemannic
    AM = "am"  # Amharic
    AN = "an"  # Aragonese
    AR = "ar"  # Arabic
    ARZ = "arz"  # Egyptian Arabic
    AS = "as"  # Assamese
    AST = "ast"  # Asturian
    AV = "av"  # Avaric
    AZ = "az"  # Azerbaijani
    AZB = "azb"  # South Azerbaijani
    BA = "ba"  # Bashkir
    BAR = "bar"  # Bavarian
    BCL = "bcl"  # Central Bikol
    BE = "be"  # Belarusian
    BG = "bg"  # Bulgarian
    BH = "bh"  # Bihari
    BN = "bn"  # Bengali
    BO = "bo"  # Tibetan
    BPY = "bpy"  # Bishnupriya Manipuri
    BR = "br"  # Breton
    BS = "bs"  # Bosnian
    BXR = "bxr"  # Buryat
    CA = "ca"  # Catalan
    CBK = "cbk"  # Chavacano
    CE = "ce"  # Chechen
    CEB = "ceb"  # Cebuano
    CKB = "ckb"  # Central Kurdish
    CO = "co"  # Corsican
    CS = "cs"  # Czech
    CV = "cv"  # Chuvash
    CY = "cy"  # Welsh
    DA = "da"  # Danish
    DE = "de"  # German
    DIQ = "diq"  # Zazaki
    DSB = "dsb"  # Lower Sorbian
    DTY = "dty"  # Doteli
    DV = "dv"  # Dhivehi
    EL = "el"  # Greek
    EML = "eml"  # Emilian-Romagnol
    EN = "en"  # English
    EO = "eo"  # Esperanto
    ES = "es"  # Spanish
    ET = "et"  # Estonian
    EU = "eu"  # Basque
    FA = "fa"  # Persian
    FI = "fi"  # Finnish
    FR = "fr"  # French
    FRR = "frr"  # North Frisian
    FY = "fy"  # Western Frisian
    GA = "ga"  # Irish
    GD = "gd"  # Scottish Gaelic
    GL = "gl"  # Galician
    GN = "gn"  # Guarani
    GOM = "gom"  # Goan Konkani
    GU = "gu"  # Gujarati
    GV = "gv"  # Manx
    HE = "he"  # Hebrew
    HI = "hi"  # Hindi
    HIF = "hif"  # Fiji Hindi
    HR = "hr"  # Croatian
    HSB = "hsb"  # Upper Sorbian
    HT = "ht"  # Haitian Creole
    HU = "hu"  # Hungarian
    HY = "hy"  # Armenian
    IA = "ia"  # Interlingua
    ID = "id"  # Indonesian
    IE = "ie"  # Interlingue
    ILO = "ilo"  # Iloko
    IO = "io"  # Ido
    IS = "is"  # Icelandic
    IT = "it"  # Italian
    JA = "ja"  # Japanese
    JBO = "jbo"  # Lojban
    JV = "jv"  # Javanese
    KA = "ka"  # Georgian
    KK = "kk"  # Kazakh
    KM = "km"  # Khmer
    KN = "kn"  # Kannada
    KO = "ko"  # Korean
    KRC = "krc"  # Karachay-Balkar
    KU = "ku"  # Kurdish
    KV = "kv"  # Komi
    KW = "kw"  # Cornish
    KY = "ky"  # Kyrgyz
    LA = "la"  # Latin
    LB = "lb"  # Luxembourgish
    LEZ = "lez"  # Lezghian
    LI = "li"  # Limburgish
    LMO = "lmo"  # Lombard
    LO = "lo"  # Lao
    LRC = "lrc"  # Northern Luri
    LT = "lt"  # Lithuanian
    LV = "lv"  # Latvian
    MAI = "mai"  # Maithili
    MG = "mg"  # Malagasy
    MHR = "mhr"  # Eastern Mari
    MIN = "min"  # Minangkabau
    MK = "mk"  # Macedonian
    ML = "ml"  # Malayalam
    MN = "mn"  # Mongolian
    MR = "mr"  # Marathi
    MRJ = "mrj"  # Western Mari
    MS = "ms"  # Malay
    MT = "mt"  # Maltese
    MWL = "mwl"  # Mirandese
    MY = "my"  # Burmese
    MYV = "myv"  # Erzya
    MZN = "mzn"  # Mazanderani
    NAH = "nah"  # Nahuatl
    NAP = "nap"  # Neapolitan
    NDS = "nds"  # Low German
    NE = "ne"  # Nepali
    NEW = "new"  # Newari
    NL = "nl"  # Dutch
    NN = "nn"  # Norwegian Nynorsk
    NO = "no"  # Norwegian
    OC = "oc"  # Occitan
    OR = "or"  # Odia
    OS = "os"  # Ossetian
    PA = "pa"  # Punjabi
    PAM = "pam"  # Pampanga
    PFL = "pfl"  # Palatine German
    PL = "pl"  # Polish
    PMS = "pms"  # Piedmontese
    PNB = "pnb"  # Western Punjabi
    PS = "ps"  # Pashto
    PT = "pt"  # Portuguese
    QU = "qu"  # Quechua
    RM = "rm"  # Romansh
    RO = "ro"  # Romanian
    RU = "ru"  # Russian
    RUE = "rue"  # Rusyn
    SA = "sa"  # Sanskrit
    SAH = "sah"  # Yakut
    SC = "sc"  # Sardinian
    SCN = "scn"  # Sicilian
    SCO = "sco"  # Scots
    SD = "sd"  # Sindhi
    SH = "sh"  # Serbo-Croatian
    SI = "si"  # Sinhala
    SK = "sk"  # Slovak
    SL = "sl"  # Slovenian
    SO = "so"  # Somali
    SQ = "sq"  # Albanian
    SR = "sr"  # Serbian
    SU = "su"  # Sundanese
    SV = "sv"  # Swedish
    SW = "sw"  # Swahili
    TA = "ta"  # Tamil
    TE = "te"  # Telugu
    TG = "tg"  # Tajik
    TH = "th"  # Thai
    TK = "tk"  # Turkmen
    TL = "tl"  # Tagalog
    TR = "tr"  # Turkish
    TT = "tt"  # Tatar
    TYV = "tyv"  # Tuvan
    UG = "ug"  # Uyghur
    UK = "uk"  # Ukrainian
    UR = "ur"  # Urdu
    UZ = "uz"  # Uzbek
    VEC = "vec"  # Venetian
    VEP = "vep"  # Veps
    VI = "vi"  # Vietnamese
    VLS = "vls"  # West Flemish
    VO = "vo"  # Volapük
    WA = "wa"  # Walloon
    WAR = "war"  # Waray
    WUU = "wuu"  # Wu Chinese
    XAL = "xal"  # Kalmyk
    XMF = "xmf"  # Mingrelian
    YI = "yi"  # Yiddish
    YO = "yo"  # Yoruba
    YUE = "yue"  # Cantonese
    ZH = "zh"  # Chinese
    UNKNOWN = "unknown"  # Unknown


def detect_language(text: str, low_memory: bool = True) -> Language:
    """Detect the language of ``text`` and map it onto ``Language``.

    Args:
        text: The text to classify. Line breaks are replaced with spaces
            before classification, because the fastText ``predict`` call
            behind ``ftlangdetect.detect`` handles one line at a time and
            raises ``ValueError`` when the input contains a newline.
        low_memory: Forwarded unchanged to ``ftlangdetect.detect``.

    Returns:
        The ``Language`` member whose value equals the detected code.
        ``Language.UNKNOWN`` is returned when ``text`` is empty or
        whitespace-only, or when the detected code has no matching member.
    """
    # Callers should not have to know about the single-line restriction of
    # the underlying model, so normalise line breaks here.
    single_line = text.replace("\n", " ")

    # Nothing to classify: skip the model call instead of returning a guess.
    if not single_line.strip():
        return Language.UNKNOWN

    prediction = detect(text=single_line, low_memory=low_memory)
    try:
        return Language(prediction["lang"])
    except ValueError:
        # The detector produced a code that Language does not enumerate.
        return Language.UNKNOWN
a/core/requirements-dev.lock b/core/requirements-dev.lock index 58faa3eaaf4c..6e9b845e021a 100644 --- a/core/requirements-dev.lock +++ b/core/requirements-dev.lock @@ -73,6 +73,10 @@ faiss-cpu==1.9.0 # via quivr-core fastavro==1.9.7 # via cohere +fasttext==0.9.3 + # via fasttext-langdetect +fasttext-langdetect==1.0.5 + # via quivr-core filelock==3.16.1 # via huggingface-hub # via transformers @@ -211,6 +215,7 @@ nodeenv==1.9.1 # via pre-commit numpy==1.26.4 # via faiss-cpu + # via fasttext # via langchain # via langchain-community # via pandas @@ -262,6 +267,8 @@ pure-eval==0.2.3 # via stack-data py-cpuinfo==9.0.0 # via pytest-benchmark +pybind11==2.13.6 + # via fasttext pycodestyle==2.12.1 # via flake8 pycryptodome==3.21.0 @@ -321,6 +328,7 @@ regex==2024.9.11 # via transformers requests==2.32.3 # via cohere + # via fasttext-langdetect # via huggingface-hub # via langchain # via langchain-community @@ -338,6 +346,8 @@ safetensors==0.4.5 # via transformers sentencepiece==0.2.0 # via transformers +setuptools==75.6.0 + # via fasttext six==1.16.0 # via asttokens # via python-dateutil diff --git a/core/requirements.lock b/core/requirements.lock index 22ae9b50df12..ac6ee7332841 100644 --- a/core/requirements.lock +++ b/core/requirements.lock @@ -51,6 +51,10 @@ faiss-cpu==1.9.0 # via quivr-core fastavro==1.9.7 # via cohere +fasttext==0.9.3 + # via fasttext-langdetect +fasttext-langdetect==1.0.5 + # via quivr-core filelock==3.16.1 # via huggingface-hub # via transformers @@ -159,6 +163,7 @@ nats-py==2.9.0 # via megaparse-sdk numpy==1.26.4 # via faiss-cpu + # via fasttext # via langchain # via langchain-community # via pandas @@ -185,6 +190,8 @@ protobuf==5.28.2 # via transformers psutil==6.1.0 # via megaparse-sdk +pybind11==2.13.6 + # via fasttext pycryptodome==3.21.0 # via megaparse-sdk pydantic==2.9.2 @@ -227,6 +234,7 @@ regex==2024.9.11 # via transformers requests==2.32.3 # via cohere + # via fasttext-langdetect # via huggingface-hub # via langchain # via 
langchain-community @@ -243,6 +251,8 @@ safetensors==0.4.5 # via transformers sentencepiece==0.2.0 # via transformers +setuptools==75.6.0 + # via fasttext six==1.16.0 # via python-dateutil sniffio==1.3.1