def get_supported_encodings():
    """Return the text encodings to try, in priority order.

    The list comes from the ``TEXT_FILE_ENCODINGS`` environment variable
    (comma-separated codec names) and defaults to ``utf-8`` when the
    variable is unset.

    Each entry is stripped of surrounding whitespace. Blank entries, names
    Python has no codec for, and aliases of a codec that is already listed
    (``latin1`` and ``iso-8859-1`` resolve to the same codec) are dropped,
    so one bad or repeated entry cannot break or slow down every read.

    Order matters: ``latin1`` maps every possible byte, so it never fails
    and anything listed after it is never reached. Put it last, for
    example ``utf-8,cp1252,latin1``.

    Returns:
        A non-empty list of codec names, spelled as in the environment;
        ``["utf-8"]`` when nothing usable is configured.
    """
    import codecs
    import logging
    import os

    logger = logging.getLogger(__name__)

    encodings = []
    seen_codecs = set()
    for entry in os.getenv("TEXT_FILE_ENCODINGS", "utf-8").split(","):
        name = entry.strip()
        if not name:
            # Produced by a trailing comma or an empty variable.
            continue
        try:
            canonical = codecs.lookup(name).name
        except LookupError:
            logger.warning(
                "Ignoring unknown encoding %r in TEXT_FILE_ENCODINGS", name
            )
            continue
        if canonical in seen_codecs:
            continue
        seen_codecs.add(canonical)
        encodings.append(name)

    return encodings or ["utf-8"]


def try_read_file_with_encodings(file_path: str) -> str:
    """Read a text file, trying each configured encoding in turn.

    The file is read from disk once as bytes and decoded with every
    encoding returned by ``get_supported_encodings()`` until one succeeds.
    If none does, the bytes are decoded as latin-1, which accepts any
    input, so a readable file always yields a string.

    Line endings are normalised to ``\\n``, matching what opening the file
    in text mode returns.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import logging

    logger = logging.getLogger(__name__)

    with open(file_path, "rb") as f:
        raw = f.read()

    for encoding in get_supported_encodings():
        try:
            text = raw.decode(encoding)
            break
        except (UnicodeError, LookupError):
            # LookupError covers names that resolve to a non-text codec.
            logger.warning(
                "Failed to decode %s with encoding %s", file_path, encoding
            )
    else:
        # latin-1 maps all 256 byte values, so this cannot raise.
        text = raw.decode("latin-1")

    # Binary reads skip universal-newline translation; apply it here.
    return text.replace("\r\n", "\n").replace("\r", "\n")