created release 0.0.6

AI-team-UoA · Jun 1, 2023 · 4f7f5f8 · 4f7f5f8
1 parent 3c140d6
commit 4f7f5f8
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 8 deletions.
diff --git a/pyjedai/datamodel.py b/pyjedai/datamodel.py
@@ -125,15 +125,18 @@ def __init__(
                     "Dataset 1 must contain column names if attributes_1 is empty.")
         else:
             self.attributes_1: list = attributes_1
+
         if dataset_2 is not None:
-            if dataset_2.columns.values.tolist():
-                self.attributes_2 = dataset_2.columns.values.tolist()
-                if self.id_column_name_2 in self.attributes_2:
-                    self.attributes_2.remove(self.id_column_name_1)
+
+            if attributes_2 is None:
+                if dataset_2.columns.values.tolist():
+                    self.attributes_2 = dataset_2.columns.values.tolist()
+                    if self.id_column_name_2 in self.attributes_2:
+                        self.attributes_2.remove(self.id_column_name_1)
+                else:
+                    raise AttributeError("Dataset 2 must contain column names if attributes_2 is empty.")
             else:
-                raise AttributeError("Dataset 2 must contain column names if attributes_2 is empty.")
-        else:
-            self.attributes_2: list = attributes_2
+                self.attributes_2: list = attributes_2
 
         # Ground truth data
         if ground_truth is not None:

diff --git a/pyjedai/matching.py b/pyjedai/matching.py
@@ -525,4 +525,20 @@ def _configuration(self) -> dict:
         }
 
     def stats(self) -> None:
-        pass
+        pass
+
+    def export_pairs(self, filename: str, with_similarity: bool = True) -> None:
+        """Write each edge of `self.pairs` to `filename` as an "id1, id2[, weight]" text line.
+
+        The weight is written only if `with_similarity`; ids below `self.data.dataset_limit` map
+        via `_ids_mapping_1`, others via `_ids_mapping_2`. Raises AttributeError if pairs is None.
+        """
+        if self.pairs is None:
+            raise AttributeError("Pairs have not been initialized yet. " +
+                                 "Please run the method `run` first.")
+        with open(filename, 'w') as f:  # 'w' overwrites; the with-block closes f, no close() needed
+            for n1, n2, similarity in self.pairs.edges(data='weight'):
+                e1 = self.data._ids_mapping_1[n1] if n1 < self.data.dataset_limit else self.data._ids_mapping_2[n1]
+                e2 = self.data._ids_mapping_1[n2] if n2 < self.data.dataset_limit else self.data._ids_mapping_2[n2]
+                f.write(f"{e1}, {e2}, {similarity}\n" if with_similarity else f"{e1}, {e2}\n")
+
diff --git a/pyjedai/vector_based_blocking.py b/pyjedai/vector_based_blocking.py
@@ -145,6 +145,7 @@ def build_blocks(self,
 
         self._si = SubsetIndexer(None, self.data)
 
+        # print(data.attributes_1, data.attributes_2)
         self._entities_d1 = data.dataset_1[attributes_1 if attributes_1 else data.attributes_1] \
                             .apply(" ".join, axis=1) \
                             .apply(self._tokenize_entity) \
@@ -292,7 +293,11 @@ def _transform_entities_to_word_embeddings(self, entities, model, tokenizer) ->
                                         return_attention_mask = True,
                                         max_length=self.max_word_embeddings_size,
                                         padding='max_length')
+
+            encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()}  # Move input tensors to GPU
+
             with torch.no_grad():
+                encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()}  # Move input tensors to GPU
                 output = model(**encoded_input)
                 vector = output.last_hidden_state[:, 0, :]