Add support for different dict formats

Fixes #7. Fixes #29. Now, the following third-party dictionaries are imported and displayed correctly: - [Terminology](https://agiletortoise.com/terminology/mac/) - [Littré](https://www.competencemac.com/Bureautique-Dictionnaires-en-francais_a1737.html)
mr-pennyworth · Jun 21, 2024 · cdb670b · cdb670b
1 parent 8231bc6
commit cdb670b
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 14 deletions.
diff --git a/info.plist b/info.plist
@@ -974,7 +974,7 @@ Subsequent searches should be snappy
 	<key>variablesdontexport</key>
 	<array/>
 	<key>version</key>
-	<string>0.2.11</string>
+	<string>0.2.12</string>
 	<key>webaddress</key>
 	<string>https://github.com/mr-pennyworth/alfred-better-dictionaries</string>
 </dict>

diff --git a/pyapp/appledict.py b/pyapp/appledict.py
@@ -33,12 +33,41 @@ def _sections(self) -> typing.Iterable[bytes]:
             # number of bytes in the Body.data file
             limit = 0x40 + read_int(f)
 
-            # TODO: for 'HeapDataCompressionType': 2 (in Info.plist),
-            #  we need to skip to byte number 96, but not if it is 1.
-            f.seek(0x60)
+            # There seem to be two distinct formats of the header.
+            # One where the body begins at 0x60, and another where
+            # the body begins at 0x44.
+            #
+            # Look at these two examples:
+            # "New Oxford American Dictionary.dictionary" that comes
+            # pre-installed on macOS
+            # ❯ xxd Body.data | head
+            #    ... skipped ...
+            # 00000030: 0000 0000 0000 0000 0000 0000 0000 0000
+            # 00000040: d39b 8001 0000 0000 ffff ffff 2000 0000
+            # 00000050: 0000 0000 fa02 0000 ffff ffff ffff ffff
+            # 00000060: 6880 0000 6480 0000 0959 0400 78da ecbd
+            #
+            # "Littré.dictionary" found at
+            # https://www.competencemac.com/Bureautique-Dictionnaires-en-francais_a1737.html
+            # ❯ xxd Body.data | head
+            #    ... skipped ...
+            # 00000030: 0000 0000 0000 0000 0000 0000 0000 0000
+            # 00000040: 46a2 af02 9103 0000 8d03 0000 3906 0000
+            # 00000050: 789c 6d55 db6e db46 107d 8ebf 62a0 1725
+            # 00000060: a84c d67d 5469 0272 8c02 058a a040 9abc
+            #
+            # Based on the above two examples, and looking at the function
+            # guessFileOffsetLimit in the pyglossary project,
+            # (https://github.com/ilius/pyglossary/blob/b41161d3f38a7e6523d315f4b8555083ef196e71/pyglossary/plugins/appledict_bin/appledict_file_tools.py#L58)
+            # looking for 0000 0000 ffff ffff at 0x44 seems to be a
+            # reliable enough (?) way to distinguish between these two
+            if (read_int(f), read_int(f)) == (0, -1):
+                f.seek(0x60)
+            else:
+                f.seek(0x44)
 
             while f.tell() < limit:
-                # a Body.data file can contain multiple sections with the format:
+                # Body.data file can contain multiple sections with the format:
                 # [section_size      (4 bytes (not including itself)),
                 #  ???               (4 bytes), (no idea what these are!)
                 #  decompressed_size (4 bytes),
@@ -60,12 +89,17 @@ def definitions(self) -> typing.Iterable[str]:
            class="entry">
         """
         for section in self._sections():
-            # each decompressed chunk contains multiple definitions
-            # each definition is of the format:
-            # [defn_size (4 bytes (not including itself)),
-            #  XML defn  (defn_size bytes)]
-            section_size = len(section)
-            section = io.BytesIO(section)  # convert to typing.BinaryIO
-            while section.tell() < section_size:
-                defn_size = read_int(section)
-                yield section.read(defn_size).decode("utf-8")
+            # each decompressed chunk:
+            # 1) either contains one single definition,
+            # 2) or multiple definitions of the format:
+            #    [defn_size (4 bytes (not including itself)),
+            #     XML defn  (defn_size bytes)]
+            opening_tag = b"<d:entry"
+            if section[: len(opening_tag)] == opening_tag:
+                yield section.decode("utf-8")
+            else:
+                section_size = len(section)
+                section = io.BytesIO(section)  # convert to typing.BinaryIO
+                while section.tell() < section_size:
+                    defn_size = read_int(section)
+                    yield section.read(defn_size).decode("utf-8")