Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: recreate document id if certain attributes are changed #8694

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion haystack/dataclasses/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ def __eq__(self, other):
return False
return self.to_dict() == other.to_dict()

def __setattr__(self, name, value):
    """
    Assign an attribute and refresh the document id when a hashed field changes.

    The id is recomputed with `_create_id()` only when both conditions hold:
    the attribute was already reachable on the instance before this assignment
    (or it is "meta"), and the attribute is one of the fields that feed the id.
    Only assignments are intercepted here; mutating a value in place
    (e.g. `doc.meta["key"] = ...`) does not pass through this hook.

    :param name: Name of the attribute being assigned.
    :param value: New value for the attribute.
    """
    # Must be evaluated BEFORE the assignment below, otherwise it is always true.
    # NOTE(review): hasattr() also sees class-level defaults, so this may be true
    # during __init__ as well - confirm against the field definitions.
    previously_set = hasattr(self, name) or name == "meta"

    super().__setattr__(name, value)

    if not previously_set:
        return

    id_relevant_fields = {"content", "dataframe", "blob", "meta", "embedding", "sparse_embedding"}
    if name in id_relevant_fields:
        # Write the id through object.__setattr__ so this hook is not re-entered.
        object.__setattr__(self, "id", self._create_id())

def __post_init__(self):
"""
Generate the ID based on the init parameters.
Expand All @@ -122,7 +132,7 @@ def _create_id(self):
dataframe = self.dataframe.to_json() if self.dataframe is not None else None
blob = self.blob.data if self.blob is not None else None
mime_type = self.blob.mime_type if self.blob is not None else None
meta = self.meta or {}
meta = self.meta if hasattr(self, "meta") and self.meta else {}
embedding = self.embedding if self.embedding is not None else None
sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"
Expand Down
4 changes: 4 additions & 0 deletions releasenotes/notes/document-id-update-27d2356a791fc86e.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
Document IDs are now regenerated when a document's content, dataframe, blob, meta, embedding, or sparse embedding is reassigned after initialization. Previously, the ID stayed unchanged when, for example, metadata was added to an already initialized document.
28 changes: 28 additions & 0 deletions test/dataclasses/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,34 @@ def test_basic_equality_id():
assert doc1 != doc2


@pytest.mark.parametrize(
    "attribute, new_value",
    [
        ("content", "new content"),
        ("dataframe", pd.DataFrame([10, 20, 30])),
        ("blob", ByteStream(b"some bytes", mime_type="application/pdf")),
        ("meta", {"date": "10-10-2023", "type": "article"}),
        ("embedding", [0.1, 0.2, 0.3]),
        ("sparse_embedding", SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3])),
    ],
)
def test_id_is_updated_on_attribute_change(attribute, new_value):
    """Reassigning any attribute that feeds the id must produce a different id."""
    document = Document(content="test text")
    original_id = document.id

    # One parametrized case per attribute: content, dataframe, blob, meta,
    # embedding and sparse_embedding.
    setattr(document, attribute, new_value)

    assert document.id != original_id


def test_id_is_not_updated_on_score_change():
    """Assigning `score` must leave the document id untouched."""
    document = Document(content="test text")
    original_id = document.id

    # The score is assigned after creation; the id is expected to stay stable.
    document.score = 1.0

    assert document.id == original_id


def test_to_dict():
doc = Document()
assert doc.to_dict() == {
Expand Down
Loading