Skip to content

Commit

Permalink
created release 0.0.6
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikoletos-K committed Jun 1, 2023
1 parent 3c140d6 commit 4f7f5f8
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 8 deletions.
17 changes: 10 additions & 7 deletions pyjedai/datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,18 @@ def __init__(
"Dataset 1 must contain column names if attributes_1 is empty.")
else:
self.attributes_1: list = attributes_1

if dataset_2 is not None:
if dataset_2.columns.values.tolist():
self.attributes_2 = dataset_2.columns.values.tolist()
if self.id_column_name_2 in self.attributes_2:
self.attributes_2.remove(self.id_column_name_1)

if attributes_2 is None:
if dataset_2.columns.values.tolist():
self.attributes_2 = dataset_2.columns.values.tolist()
if self.id_column_name_2 in self.attributes_2:
self.attributes_2.remove(self.id_column_name_1)
else:
raise AttributeError("Dataset 2 must contain column names if attributes_2 is empty.")
else:
raise AttributeError("Dataset 2 must contain column names if attributes_2 is empty.")
else:
self.attributes_2: list = attributes_2
self.attributes_2: list = attributes_2

# Ground truth data
if ground_truth is not None:
Expand Down
18 changes: 17 additions & 1 deletion pyjedai/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,4 +525,20 @@ def _configuration(self) -> dict:
}

def stats(self) -> None:
    """Placeholder for statistics reporting; currently does nothing.

    Returns
    -------
    None
    """
    return None

def export_pairs(self, filename: str, with_similarity: bool = True) -> None:
    """Write the matched pairs to a text file, one pair per line.

    Every edge yielded by ``self.pairs.edges(data='weight')`` is written as
    ``id_1, id_2`` or, when ``with_similarity`` is true, as
    ``id_1, id_2, weight``. Node ids smaller than
    ``self.data.dataset_limit`` are translated through
    ``self.data._ids_mapping_1``; every other id is translated through
    ``self.data._ids_mapping_2``.

    Parameters
    ----------
    filename : str
        Path of the output file. An existing file is overwritten.
    with_similarity : bool, optional
        When True (default) the edge weight is appended as a third column.

    Raises
    ------
    AttributeError
        If ``self.pairs`` is None.
    """
    if self.pairs is None:
        raise AttributeError("Pairs have not been initialized yet. "
                             "Please run the method `run` first.")

    data = self.data
    limit = data.dataset_limit  # loop-invariant, read once

    def _original_id(node):
        # Ids below the limit are looked up in the first mapping,
        # the remaining ones in the second mapping.
        if node < limit:
            return data._ids_mapping_1[node]
        return data._ids_mapping_2[node]

    # Explicit encoding so the output does not depend on the platform
    # default; the context manager closes the file, so no manual close().
    with open(filename, 'w', encoding='utf-8') as f:
        for e1, e2, similarity in self.pairs.edges(data='weight'):
            id_1 = _original_id(e1)
            id_2 = _original_id(e2)
            if with_similarity:
                f.write(f"{id_1}, {id_2}, {similarity}\n")
            else:
                f.write(f"{id_1}, {id_2}\n")

5 changes: 5 additions & 0 deletions pyjedai/vector_based_blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def build_blocks(self,

self._si = SubsetIndexer(None, self.data)

# print(data.attributes_1, data.attributes_2)
self._entities_d1 = data.dataset_1[attributes_1 if attributes_1 else data.attributes_1] \
.apply(" ".join, axis=1) \
.apply(self._tokenize_entity) \
Expand Down Expand Up @@ -292,7 +293,11 @@ def _transform_entities_to_word_embeddings(self, entities, model, tokenizer) ->
return_attention_mask = True,
max_length=self.max_word_embeddings_size,
padding='max_length')

encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()} # Move input tensors to GPU

with torch.no_grad():
encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()} # Move input tensors to GPU
output = model(**encoded_input)
vector = output.last_hidden_state[:, 0, :]

Expand Down

0 comments on commit 4f7f5f8

Please sign in to comment.