Skip to content

Commit

Permalink
Add support for different dict formats
Browse files Browse the repository at this point in the history
Fixes #7
Fixes #29

Now, the following third-party dictionaries are
imported and displayed correctly:
 - [Terminology](https://agiletortoise.com/terminology/mac/)
 - [Littré](https://www.competencemac.com/Bureautique-Dictionnaires-en-francais_a1737.html)
  • Loading branch information
mr-pennyworth committed Jun 21, 2024
1 parent 8231bc6 commit cdb670b
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 14 deletions.
2 changes: 1 addition & 1 deletion info.plist
Original file line number Diff line number Diff line change
Expand Up @@ -974,7 +974,7 @@ Subsequent searches should be snappy
<key>variablesdontexport</key>
<array/>
<key>version</key>
<string>0.2.11</string>
<string>0.2.12</string>
<key>webaddress</key>
<string>https://github.com/mr-pennyworth/alfred-better-dictionaries</string>
</dict>
Expand Down
60 changes: 47 additions & 13 deletions pyapp/appledict.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,41 @@ def _sections(self) -> typing.Iterable[bytes]:
# number of bytes in the Body.data file
limit = 0x40 + read_int(f)

# TODO: for 'HeapDataCompressionType': 2 (in Info.plist),
# we need to skip to byte number 96, but not if it is 1.
f.seek(0x60)
# There seem to be two distinct formats of the header.
# One where the body begins at 0x60, and another where
# the body begins at 0x44.
#
# Look at these two examples:
# "New Oxford American Dictionary.dictionary" that comes
# pre-installed on macOS
# ❯ xxd Body.data | head
# ... skipped ...
# 00000030: 0000 0000 0000 0000 0000 0000 0000 0000
# 00000040: d39b 8001 0000 0000 ffff ffff 2000 0000
# 00000050: 0000 0000 fa02 0000 ffff ffff ffff ffff
# 00000060: 6880 0000 6480 0000 0959 0400 78da ecbd
#
# "Littré.dictionary" found at
# https://www.competencemac.com/Bureautique-Dictionnaires-en-francais_a1737.html
# ❯ xxd Body.data | head
# ... skipped ...
# 00000030: 0000 0000 0000 0000 0000 0000 0000 0000
# 00000040: 46a2 af02 9103 0000 8d03 0000 3906 0000
# 00000050: 789c 6d55 db6e db46 107d 8ebf 62a0 1725
# 00000060: a84c d67d 5469 0272 8c02 058a a040 9abc
#
# Based on the above two examples, and looking at the function
# guessFileOffsetLimit in the pyglossary project,
# (https://github.com/ilius/pyglossary/blob/b41161d3f38a7e6523d315f4b8555083ef196e71/pyglossary/plugins/appledict_bin/appledict_file_tools.py#L58)
# looking for 0000 0000 ffff ffff at 0x44 seems to be a
# reliable enough (?) way to distinguish between these two
if (read_int(f), read_int(f)) == (0, -1):
f.seek(0x60)
else:
f.seek(0x44)

while f.tell() < limit:
# a Body.data file can contain multiple sections with the format:
# Body.data file can contain multiple sections with the format:
# [section_size (4 bytes (not including itself)),
# ??? (4 bytes), (no idea what these are!)
# decompressed_size (4 bytes),
Expand All @@ -60,12 +89,17 @@ def definitions(self) -> typing.Iterable[str]:
class="entry">
"""
for section in self._sections():
# each decompressed chunk contains multiple definitions
# each definition is of the format:
# [defn_size (4 bytes (not including itself)),
# XML defn (defn_size bytes)]
section_size = len(section)
section = io.BytesIO(section) # convert to typing.BinaryIO
while section.tell() < section_size:
defn_size = read_int(section)
yield section.read(defn_size).decode("utf-8")
# each decompressed chunk:
# 1) either contains one single definition,
# 2) or multiple definitions of the format:
# [defn_size (4 bytes (not including itself)),
# XML defn (defn_size bytes)]
opening_tag = b"<d:entry"
if section[: len(opening_tag)] == opening_tag:
yield section.decode("utf-8")
else:
section_size = len(section)
section = io.BytesIO(section) # convert to typing.BinaryIO
while section.tell() < section_size:
defn_size = read_int(section)
yield section.read(defn_size).decode("utf-8")

0 comments on commit cdb670b

Please sign in to comment.