Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: language detection after chunking #3532

Merged
merged 5 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"markupsafe>=2.1.5",
"megaparse-sdk>=0.1.9",
"langchain-mistralai>=0.2.3",
"fasttext-langdetect>=1.0.5",
chloedia marked this conversation as resolved.
Show resolved Hide resolved
]
readme = "README.md"
requires-python = ">= 3.11"
Expand Down
11 changes: 6 additions & 5 deletions core/quivr_core/brain/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@
from langchain_core.embeddings import Embeddings
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.vectorstores import VectorStore
from quivr_core.rag.entities.models import ParsedRAGResponse
from langchain_openai import OpenAIEmbeddings
from quivr_core.rag.quivr_rag import QuivrQARAG
from rich.console import Console
from rich.panel import Panel

Expand All @@ -24,16 +22,18 @@
LocalStorageConfig,
TransparentStorageConfig,
)
from quivr_core.rag.entities.chat import ChatHistory
from quivr_core.rag.entities.config import RetrievalConfig
from quivr_core.files.file import load_qfile
from quivr_core.llm import LLMEndpoint
from quivr_core.processor.registry import get_processor_class
from quivr_core.rag.entities.chat import ChatHistory
from quivr_core.rag.entities.config import RetrievalConfig
from quivr_core.rag.entities.models import (
ParsedRAGChunkResponse,
ParsedRAGResponse,
QuivrKnowledge,
SearchResult,
)
from quivr_core.processor.registry import get_processor_class
from quivr_core.rag.quivr_rag import QuivrQARAG
from quivr_core.rag.quivr_rag_langgraph import QuivrQARAGLangGraph
from quivr_core.storage.local_storage import LocalStorage, TransparentStorage
from quivr_core.storage.storage_base import StorageBase
Expand Down Expand Up @@ -567,6 +567,7 @@ async def aask(
Returns:
ParsedRAGResponse: The generated answer.
"""
# question_language = detect_language(question) -- Commented until we use it
full_answer = ""

async for response in self.ask_streaming(
Expand Down
180 changes: 180 additions & 0 deletions core/quivr_core/language/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
from enum import Enum


class Language(str, Enum):
    """String-valued enumeration of language codes.

    Each member's value is a short lowercase language code and the trailing
    comment gives the language's English name. Because the class also
    derives from ``str``, members compare equal to their raw code
    (``Language.EN == "en"``) and a member can be looked up from a code with
    ``Language("en")``; looking up a code that is not listed raises
    ``ValueError``.

    Members are declared in alphabetical order of their code, which is also
    the order in which the enum iterates.
    """

    AF = "af"  # Afrikaans
    ALS = "als"  # Alemannic
    AM = "am"  # Amharic
    AN = "an"  # Aragonese
    AR = "ar"  # Arabic
    ARZ = "arz"  # Egyptian Arabic
    AS = "as"  # Assamese
    AST = "ast"  # Asturian
    AV = "av"  # Avaric
    AZ = "az"  # Azerbaijani
    AZB = "azb"  # South Azerbaijani
    BA = "ba"  # Bashkir
    BAR = "bar"  # Bavarian
    BCL = "bcl"  # Central Bikol
    BE = "be"  # Belarusian
    BG = "bg"  # Bulgarian
    BH = "bh"  # Bihari
    BN = "bn"  # Bengali
    BO = "bo"  # Tibetan
    BPY = "bpy"  # Bishnupriya Manipuri
    BR = "br"  # Breton
    BS = "bs"  # Bosnian
    BXR = "bxr"  # Buryat
    CA = "ca"  # Catalan
    CBK = "cbk"  # Chavacano
    CE = "ce"  # Chechen
    CEB = "ceb"  # Cebuano
    CKB = "ckb"  # Central Kurdish
    CO = "co"  # Corsican
    CS = "cs"  # Czech
    CV = "cv"  # Chuvash
    CY = "cy"  # Welsh
    DA = "da"  # Danish
    DE = "de"  # German
    DIQ = "diq"  # Zazaki
    DSB = "dsb"  # Lower Sorbian
    DTY = "dty"  # Doteli
    DV = "dv"  # Dhivehi
    EL = "el"  # Greek
    EML = "eml"  # Emilian-Romagnol
    EN = "en"  # English
    EO = "eo"  # Esperanto
    ES = "es"  # Spanish
    ET = "et"  # Estonian
    EU = "eu"  # Basque
    FA = "fa"  # Persian
    FI = "fi"  # Finnish
    FR = "fr"  # French
    FRR = "frr"  # North Frisian
    FY = "fy"  # Western Frisian
    GA = "ga"  # Irish
    GD = "gd"  # Scottish Gaelic
    GL = "gl"  # Galician
    GN = "gn"  # Guarani
    GOM = "gom"  # Goan Konkani
    GU = "gu"  # Gujarati
    GV = "gv"  # Manx
    HE = "he"  # Hebrew
    HI = "hi"  # Hindi
    HIF = "hif"  # Fiji Hindi
    HR = "hr"  # Croatian
    HSB = "hsb"  # Upper Sorbian
    HT = "ht"  # Haitian Creole
    HU = "hu"  # Hungarian
    HY = "hy"  # Armenian
    IA = "ia"  # Interlingua
    ID = "id"  # Indonesian
    IE = "ie"  # Interlingue
    ILO = "ilo"  # Iloko
    IO = "io"  # Ido
    IS = "is"  # Icelandic
    IT = "it"  # Italian
    JA = "ja"  # Japanese
    JBO = "jbo"  # Lojban
    JV = "jv"  # Javanese
    KA = "ka"  # Georgian
    KK = "kk"  # Kazakh
    KM = "km"  # Khmer
    KN = "kn"  # Kannada
    KO = "ko"  # Korean
    KRC = "krc"  # Karachay-Balkar
    KU = "ku"  # Kurdish
    KV = "kv"  # Komi
    KW = "kw"  # Cornish
    KY = "ky"  # Kyrgyz
    LA = "la"  # Latin
    LB = "lb"  # Luxembourgish
    LEZ = "lez"  # Lezghian
    LI = "li"  # Limburgish
    LMO = "lmo"  # Lombard
    LO = "lo"  # Lao
    LRC = "lrc"  # Northern Luri
    LT = "lt"  # Lithuanian
    LV = "lv"  # Latvian
    MAI = "mai"  # Maithili
    MG = "mg"  # Malagasy
    MHR = "mhr"  # Eastern Mari
    MIN = "min"  # Minangkabau
    MK = "mk"  # Macedonian
    ML = "ml"  # Malayalam
    MN = "mn"  # Mongolian
    MR = "mr"  # Marathi
    MRJ = "mrj"  # Western Mari
    MS = "ms"  # Malay
    MT = "mt"  # Maltese
    MWL = "mwl"  # Mirandese
    MY = "my"  # Burmese
    MYV = "myv"  # Erzya
    MZN = "mzn"  # Mazanderani
    NAH = "nah"  # Nahuatl
    NAP = "nap"  # Neapolitan
    NDS = "nds"  # Low German
    NE = "ne"  # Nepali
    NEW = "new"  # Newari
    NL = "nl"  # Dutch
    NN = "nn"  # Norwegian Nynorsk
    NO = "no"  # Norwegian
    OC = "oc"  # Occitan
    OR = "or"  # Odia
    OS = "os"  # Ossetian
    PA = "pa"  # Punjabi
    PAM = "pam"  # Pampanga
    PFL = "pfl"  # Palatine German
    PL = "pl"  # Polish
    PMS = "pms"  # Piedmontese
    PNB = "pnb"  # Western Punjabi
    PS = "ps"  # Pashto
    PT = "pt"  # Portuguese
    QU = "qu"  # Quechua
    RM = "rm"  # Romansh
    RO = "ro"  # Romanian
    RU = "ru"  # Russian
    RUE = "rue"  # Rusyn
    SA = "sa"  # Sanskrit
    SAH = "sah"  # Yakut
    SC = "sc"  # Sardinian
    SCN = "scn"  # Sicilian
    SCO = "sco"  # Scots
    SD = "sd"  # Sindhi
    SH = "sh"  # Serbo-Croatian
    SI = "si"  # Sinhala
    SK = "sk"  # Slovak
    SL = "sl"  # Slovenian
    SO = "so"  # Somali
    SQ = "sq"  # Albanian
    SR = "sr"  # Serbian
    SU = "su"  # Sundanese
    SV = "sv"  # Swedish
    SW = "sw"  # Swahili
    TA = "ta"  # Tamil
    TE = "te"  # Telugu
    TG = "tg"  # Tajik
    TH = "th"  # Thai
    TK = "tk"  # Turkmen
    TL = "tl"  # Tagalog
    TR = "tr"  # Turkish
    TT = "tt"  # Tatar
    TYV = "tyv"  # Tuvan
    UG = "ug"  # Uyghur
    UK = "uk"  # Ukrainian
    UR = "ur"  # Urdu
    UZ = "uz"  # Uzbek
    VEC = "vec"  # Venetian
    VEP = "vep"  # Veps
    VI = "vi"  # Vietnamese
    VLS = "vls"  # West Flemish
    VO = "vo"  # Volapük
    WA = "wa"  # Walloon
    WAR = "war"  # Waray
    WUU = "wuu"  # Wu Chinese
    XAL = "xal"  # Kalmyk
    XMF = "xmf"  # Mingrelian
    YI = "yi"  # Yiddish
    YO = "yo"  # Yoruba
    YUE = "yue"  # Cantonese
    ZH = "zh"  # Chinese
7 changes: 7 additions & 0 deletions core/quivr_core/language/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from ftlangdetect import detect
from quivr_core.language.models import Language


def detect_language(text: str, low_memory: bool = True) -> Language:
    """Detect the language of ``text`` and return it as a ``Language`` member.

    Args:
        text: The text to classify. Line breaks are replaced by spaces before
            detection, so multi-line input is accepted.
        low_memory: Forwarded unchanged to ``ftlangdetect.detect``.

    Returns:
        The ``Language`` member whose value equals the ``"lang"`` code
        reported by the detector.

    Raises:
        ValueError: If the detector reports a code that has no matching
            ``Language`` member.
    """
    # The underlying fastText predictor works on a single line and rejects
    # input containing "\n"; flatten the text here so callers do not each
    # have to remember to do it. Already-flattened input is left unchanged.
    single_line_text = text.replace("\n", " ")
    detected_lang = detect(text=single_line_text, low_memory=low_memory)
    return Language(detected_lang["lang"])
chloedia marked this conversation as resolved.
Show resolved Hide resolved
5 changes: 5 additions & 0 deletions core/quivr_core/processor/processor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from langchain_core.documents import Document

from quivr_core.files.file import FileExtension, QuivrFile
from quivr_core.language.utils import detect_language

logger = logging.getLogger("quivr_core")

Expand Down Expand Up @@ -44,6 +45,10 @@ async def process_file(self, file: QuivrFile) -> list[Document]:
doc.metadata = {
"chunk_index": idx,
"quivr_core_version": qvr_version,
"language": detect_language(
text=doc.page_content.replace("\\n", " ").replace("\n", " "),
low_memory=True,
).value,
**file.metadata,
**doc.metadata,
**self.processor_metadata,
Expand Down
Loading