From dfdda7f27fd3d424931e6705b2d74c7fdf9fddfd Mon Sep 17 00:00:00 2001 From: Pat Patterson Date: Thu, 21 Nov 2024 10:40:15 -0800 Subject: [PATCH] fix(filetype): handle missing libmagic library --- CHANGELOG.md | 2 ++ test_unstructured/file_utils/test_filetype.py | 9 +++++- unstructured/file_utils/filetype.py | 28 ++++++++++++++----- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff86e84aa2..ae9f7b27b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,9 @@ ### Features ### Fixes + - **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output +- **Fixed ImportError when `libmagic` library is not installed** File type detection now correctly falls back to using `filetype` if the `magic` module cannot be imported. ## 0.16.5 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 933882f9e2..1eb29319d5 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -25,6 +25,7 @@ _ZipFileDifferentiator, detect_filetype, is_json_processable, + LIBMAGIC_AVAILABLE ) from unstructured.file_utils.model import FileType @@ -298,6 +299,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec assert file_type is expected_value +@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available") @pytest.mark.parametrize( ("expected_value", "file_name"), [ @@ -466,7 +468,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( detect_filetype(file=f) assert "WARNING" in caplog.text - assert "libmagic is unavailable but assists in filetype detection. Please cons" in caplog.text + assert "magic module is installed but libmagic is unavailable. Please cons" in caplog.text # ================================================================================================ @@ -632,10 +634,12 @@ def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified( detect_filetype() +@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available") def test_it_detects_EMPTY_from_file_path_to_empty_file(): assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY +@pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available") def test_it_detects_EMPTY_from_empty_file_like_object(): with open(example_doc_path("empty.txt"), "rb") as f: assert detect_filetype(file=f) == FileType.EMPTY @@ -859,6 +863,7 @@ def it_knows_whether_it_is_a_zipfile(self, file_name: str, expected_value: bool) # -- .mime_type --------------------------------------------- + @pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available") def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_path(self): ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) assert ctx.mime_type == "text/plain" @@ -878,6 +883,7 @@ def but_it_warns_to_install_libmagic_when_the_filetype_lib_cannot_detect_the_MIM assert "libmagic is unavailable" in caplog.text assert "consider installing libmagic" in caplog.text + @pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available") def it_provides_the_MIME_type_detected_by_libmagic_from_a_file_like_object(self): with open(example_doc_path("norwich-city.txt"), "rb") as f: ctx = _FileTypeDetectionContext(file=f) @@ -1094,6 +1100,7 @@ class Describe_TextFileDifferentiator: # -- .applies() --------------------------------------------- + @pytest.mark.skipif(not LIBMAGIC_AVAILABLE, reason="Skipping this test since libmagic is not available") def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): """The constructor determines whether this differentiator is applicable. diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index d109cd7384..3ff4f2475e 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -51,7 +51,17 @@ from unstructured.partition.common.metadata import set_element_hierarchy from unstructured.utils import get_call_args_applying_defaults, lazyproperty -LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) +# Is the magic *module* available? +MAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) + +# Is the libmagic *library* also available? +LIBMAGIC_AVAILABLE = False +if MAGIC_AVAILABLE: + try: + import magic + LIBMAGIC_AVAILABLE = True + except ImportError: + pass def detect_filetype( @@ -359,8 +369,6 @@ def mime_type(self) -> str | None: file_path = self.file_path if LIBMAGIC_AVAILABLE: - import magic - mime_type = ( magic.from_file(file_path, mime=True) if file_path @@ -371,10 +379,16 @@ def mime_type(self) -> str | None: mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head) if mime_type is None: - logger.warning( - "libmagic is unavailable but assists in filetype detection. Please consider" - " installing libmagic for better results." - ) + if MAGIC_AVAILABLE: + logger.warning( + "The magic module is installed but libmagic is unavailable. Please consider" + " installing libmagic for better filetype detection results." + ) + else: + logger.warning( + "The magic module is unavailable but assists in filetype detection. Please consider" + " installing magic for better results." + ) return None return mime_type.lower()