Skip to content

Commit

Permalink
Add FileReference.from_file(), including checksum computation
Browse files Browse the repository at this point in the history
  • Loading branch information
mbollmann committed Jan 20, 2025
1 parent 6dd7806 commit 41b376c
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 2 deletions.
1 change: 1 addition & 0 deletions python/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Bibkeys can now be generated and updated, guaranteeing uniqueness.
- Collections, Volumes, Papers, and Events can now be newly created with functions on their respective parent objects.
- Event creation currently has some unintuitive behaviour due to the existence of implicit event creation and linking; see docs.
- FileReferences can now be instantiated from files, and functions for checksum computation have been added.

### Changed

Expand Down
59 changes: 58 additions & 1 deletion python/acl_anthology/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
from attrs import define, field, validators as v, Factory
from lxml import etree
from lxml.builder import E
from os import PathLike
from pathlib import Path
from typing import cast, ClassVar, Optional
from zlib import crc32

if sys.version_info >= (3, 11):
from typing import Self
Expand All @@ -29,6 +32,32 @@
from .utils.xml import xsd_boolean


def compute_checksum(value: bytes) -> str:
    """Return the CRC32 checksum of a byte string in hexadecimal form.

    Parameters:
        value: The bytes to compute the checksum of.

    Returns:
        The CRC32 value, zero-padded to eight lowercase hex characters.
    """
    # Mask to 32 bits so the value is always rendered as an unsigned number.
    return format(crc32(value) & 0xFFFFFFFF, "08x")


def compute_checksum_from_file(path: PathLike[str]) -> str:
    """Compute the checksum of a file.

    The file is read in fixed-size chunks and fed incrementally into CRC32,
    so that large files do not need to be loaded into memory all at once.

    Parameters:
        path: The path to a file.

    Returns:
        The CRC32 checksum of the file's contents as an eight-character,
        hex-formatted string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1 << 16  # 64 KiB per read
    checksum = 0  # CRC32 starting value; an empty file yields "00000000"
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            # Passing the previous value continues the running checksum.
            checksum = crc32(chunk, checksum)
    return f"{checksum & 0xFFFFFFFF:08x}"


@define
class FileReference:
"""Base class for all references to local or remote files in the XML data.
Expand Down Expand Up @@ -59,9 +88,37 @@ def url(self) -> str:
return self.name
return cast(str, config[self.template_field]).format(self.name)

@classmethod
def from_file(cls, filename: PathLike[str]) -> Self:
    """Create a file reference for a file on disk.

    The checksum is computed from the file's contents, and the reference
    name is derived from the filename.  Whether the file's suffix is kept
    in the name depends on the template string configured for this class:
    if the template already ends in the file's suffix, the suffix is
    dropped from the name; otherwise the full filename is used.  For
    example, calling this on
    [PDFReference][acl_anthology.files.PDFReference] with a filename
    ending in `".pdf"`, while
    [`config.pdf_location_template`][acl_anthology.config.DefaultConfig]
    ends in `".pdf"`, stores the name _without_ the `".pdf"` suffix.

    Parameters:
        filename: The path to the file.

    Returns:
        A file reference for the given file.

    Raises:
        FileNotFoundError: If filename does not point to an existing file.
    """
    path = Path(filename)
    if not path.is_file():
        # NOTE: the coverage report flagged this line as not covered by tests.
        raise FileNotFoundError(f"Not a file: {filename}")

    template = cast(str, config[cls.template_field])
    # Avoid duplicating the suffix when the template already supplies it.
    name = path.stem if template.endswith(path.suffix) else path.name

    return cls(name=name, checksum=compute_checksum_from_file(path))

@classmethod
def from_xml(cls, elem: etree._Element) -> Self:
    """Instantiate a new file reference from a corresponding XML element.

    The element's text becomes the reference name; its `hash` attribute
    becomes the checksum, or `None` if the attribute is missing or empty.
    """
    hash_value = elem.get("hash")
    if hash_value:
        return cls(name=str(elem.text), checksum=str(hash_value))
    return cls(name=str(elem.text), checksum=None)

Expand Down
8 changes: 8 additions & 0 deletions python/docs/api/files.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# files

::: acl_anthology.files
options:
members:
- FileReference

::: acl_anthology.files
options:
filters:
- "!^FileReference$"
Binary file added python/tests/J16-4001.pdf
Binary file not shown.
21 changes: 20 additions & 1 deletion python/tests/files_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
from lxml import etree

from acl_anthology import config
from acl_anthology.files import PDFReference, VideoReference, PapersWithCodeReference
from acl_anthology.files import (
AttachmentReference,
PDFReference,
VideoReference,
PapersWithCodeReference,
)


test_cases_pdf = (
Expand Down Expand Up @@ -198,3 +203,17 @@ def test_reference_cant_change_template_field():
assert isinstance(ref.template_field, str)
with pytest.raises(AttributeError):
ref.template_field = "foo"


def test_pdfreference_from_file():
    """PDFReference.from_file drops the .pdf suffix and computes the checksum."""
    pdf_path = "tests/J16-4001.pdf"  # must exist
    reference = PDFReference.from_file(pdf_path)
    # The name is stored without the .pdf suffix.
    assert reference.name == "J16-4001"
    assert reference.checksum == "f9f4f558"


def test_attachmentreference_from_file():
    """AttachmentReference.from_file keeps the .pdf suffix in the name."""
    pdf_path = "tests/J16-4001.pdf"  # must exist
    reference = AttachmentReference.from_file(pdf_path)
    # The name is stored WITH the .pdf suffix.
    assert reference.name == "J16-4001.pdf"
    assert reference.checksum == "f9f4f558"

0 comments on commit 41b376c

Please sign in to comment.