diff --git a/api/constants/__init__.py b/api/constants/__init__.py index 4500ef4306fc2a..b5dfd9cb1836f5 100644 --- a/api/constants/__init__.py +++ b/api/constants/__init__.py @@ -15,7 +15,7 @@ if dify_config.ETL_TYPE == "Unstructured": DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"] - DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub")) + DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub")) if dify_config.UNSTRUCTURED_API_URL: DOCUMENT_EXTENSIONS.append("ppt") DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index c0d8c6409982e6..dab5c94df3ada9 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -107,8 +107,10 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str: return _extract_text_from_plain_text(file_content) case "application/pdf": return _extract_text_from_pdf(file_content) - case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword": + case "application/msword": return _extract_text_from_doc(file_content) + case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": + return _extract_text_from_docx(file_content) case "text/csv": return _extract_text_from_csv(file_content) case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel": @@ -144,6 +146,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) return _extract_text_from_pdf(file_content) - case ".doc" | ".docx": + case ".doc": return _extract_text_from_doc(file_content) + case ".docx": + return _extract_text_from_docx(file_content) case ".csv": return _extract_text_from_csv(file_content) case ".xls" | ".xlsx": @@ -203,7 +207,35 @@ def 
_extract_text_from_pdf(file_content: bytes) -> str: def _extract_text_from_doc(file_content: bytes) -> str: """ - Extract text from a DOC/DOCX file. + Extract text from a DOC file. + """ + from unstructured.partition.api import partition_via_api + + if not (dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY): + raise TextExtractionError("UNSTRUCTURED_API_URL and UNSTRUCTURED_API_KEY must be set") + + try: + with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file: + temp_file.write(file_content) + # temp_file is closed (and flushed) here; always remove it, even if the API call raises + try: + with open(temp_file.name, "rb") as file: + elements = partition_via_api( + file=file, + metadata_filename=temp_file.name, + api_url=dify_config.UNSTRUCTURED_API_URL, + api_key=dify_config.UNSTRUCTURED_API_KEY, + ) + finally: + os.unlink(temp_file.name) + return "\n".join([getattr(element, "text", "") for element in elements]) + except Exception as e: + raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e + + +def _extract_text_from_docx(file_content: bytes) -> str: + """ + Extract text from a DOCX file. 
For now support only paragraph and table add more if needed """ try: @@ -255,13 +287,13 @@ def _extract_text_from_doc(file_content: bytes) -> str: text.append(markdown_table) except Exception as e: - logger.warning(f"Failed to extract table from DOC/DOCX: {e}") + logger.warning(f"Failed to extract table from DOCX: {e}") continue return "\n".join(text) except Exception as e: - raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e + raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e def _download_file_content(file: File) -> bytes: @@ -329,14 +361,31 @@ def _extract_text_from_excel(file_content: bytes) -> str: def _extract_text_from_ppt(file_content: bytes) -> str: + from unstructured.partition.api import partition_via_api from unstructured.partition.ppt import partition_ppt try: - with io.BytesIO(file_content) as file: - elements = partition_ppt(file=file) + if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: + with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file: + temp_file.write(file_content) + # temp_file is closed (and flushed) here; always remove it, even if the API call raises + try: + with open(temp_file.name, "rb") as file: + elements = partition_via_api( + file=file, + metadata_filename=temp_file.name, + api_url=dify_config.UNSTRUCTURED_API_URL, + api_key=dify_config.UNSTRUCTURED_API_KEY, + ) + finally: + os.unlink(temp_file.name) + else: + with io.BytesIO(file_content) as file: + elements = partition_ppt(file=file) return "\n".join([getattr(element, "text", "") for element in elements]) + except Exception as e: raise TextExtractionError(f"Failed to extract text from PPT: {str(e)}") from e def _extract_text_from_pptx(file_content: bytes) -> str: diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py index 1a550ec5309aa3..5dfdfc0ebdac2f 100644 --- 
a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py +++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py @@ -8,7 +8,7 @@ from core.workflow.entities.node_entities import NodeRunResult from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData from core.workflow.nodes.document_extractor.node import ( - _extract_text_from_doc, + _extract_text_from_docx, _extract_text_from_pdf, _extract_text_from_plain_text, ) @@ -120,7 +120,7 @@ def test_run_extract_text( monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract) elif mime_type.startswith("application/vnd.openxmlformats"): mock_docx_extract = Mock(return_value=expected_text[0]) - monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract) + monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_docx", mock_docx_extract) result = document_extractor_node._run() @@ -163,14 +163,14 @@ def test_extract_text_from_pdf(mock_pdf_document): @patch("docx.Document") -def test_extract_text_from_doc(mock_document): +def test_extract_text_from_docx(mock_document): mock_paragraph1 = Mock() mock_paragraph1.text = "Paragraph 1" mock_paragraph2 = Mock() mock_paragraph2.text = "Paragraph 2" mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2] - text = _extract_text_from_doc(b"PK\x03\x04") + text = _extract_text_from_docx(b"PK\x03\x04") assert text == "Paragraph 1\nParagraph 2" diff --git a/web/app/components/base/prompt-editor/constants.tsx b/web/app/components/base/prompt-editor/constants.tsx index c78b2fc50a4fa8..1288e1539e1c1f 100644 --- a/web/app/components/base/prompt-editor/constants.tsx +++ b/web/app/components/base/prompt-editor/constants.tsx @@ -52,7 +52,7 @@ export const getInputVars = (text: string): ValueSelector[] => { export const FILE_EXTS: Record = { [SupportUploadFileTypes.image]: 
['JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG'], - [SupportUploadFileTypes.document]: ['TXT', 'MD', 'MDX', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB'], + [SupportUploadFileTypes.document]: ['TXT', 'MD', 'MDX', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOC', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB'], [SupportUploadFileTypes.audio]: ['MP3', 'M4A', 'WAV', 'WEBM', 'AMR', 'MPGA'], [SupportUploadFileTypes.video]: ['MP4', 'MOV', 'MPEG', 'MPGA'], }