From 64309e7158021f83c6d94374e2b4a899d1393ff1 Mon Sep 17 00:00:00 2001
From: future-xy <fy38607203@163.com>
Date: Thu, 10 Feb 2022 12:18:24 +0000
Subject: [PATCH 1/3] fix #219: fix a bug in randomization

---
 dlrm_data_pytorch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dlrm_data_pytorch.py b/dlrm_data_pytorch.py
index 852c5771..f874e936 100644
--- a/dlrm_data_pytorch.py
+++ b/dlrm_data_pytorch.py
@@ -220,9 +220,9 @@ def __init__(
                     indices = np.random.permutation(indices)
                     print("Randomized indices...")
 
-                X_int[indices] = X_int
-                X_cat[indices] = X_cat
-                y[indices] = y
+                self.X_int = X_int[indices]
+                self.X_cat = X_cat[indices]
+                self.y = y[indices]
 
             else:
                 indices = np.array_split(indices, self.offset_per_file[1:-1])

From 1481fae3f08aa79586a2630299787b06b446907d Mon Sep 17 00:00:00 2001
From: future-xy <fy38607203@163.com>
Date: Thu, 10 Feb 2022 12:27:58 +0000
Subject: [PATCH 2/3] fix #219: optimize performance 1. convert lists to
 np.ndarray before torch.tensor 2. index ndarrays instead of looping

---
 dlrm_data_pytorch.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/dlrm_data_pytorch.py b/dlrm_data_pytorch.py
index f874e936..b0011195 100644
--- a/dlrm_data_pytorch.py
+++ b/dlrm_data_pytorch.py
@@ -246,17 +246,17 @@ def __init__(
 
                 # create training, validation, and test sets
                 if split == 'train':
-                    self.X_int = [X_int[i] for i in train_indices]
-                    self.X_cat = [X_cat[i] for i in train_indices]
-                    self.y = [y[i] for i in train_indices]
+                    self.X_int = X_int[train_indices]
+                    self.X_cat = X_cat[train_indices]
+                    self.y = y[train_indices]
                 elif split == 'val':
-                    self.X_int = [X_int[i] for i in val_indices]
-                    self.X_cat = [X_cat[i] for i in val_indices]
-                    self.y = [y[i] for i in val_indices]
+                    self.X_int = X_int[val_indices]
+                    self.X_cat = X_cat[val_indices]
+                    self.y = y[val_indices]
                 elif split == 'test':
-                    self.X_int = [X_int[i] for i in test_indices]
-                    self.X_cat = [X_cat[i] for i in test_indices]
-                    self.y = [y[i] for i in test_indices]
+                    self.X_int = X_int[test_indices]
+                    self.X_cat = X_cat[test_indices]
+                    self.y = y[test_indices]
 
             print("Split data according to indices...")
 
@@ -328,9 +328,9 @@ def __len__(self):
 def collate_wrapper_criteo_offset(list_of_tuples):
     # where each tuple is (X_int, X_cat, y)
     transposed_data = list(zip(*list_of_tuples))
-    X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1)
-    X_cat = torch.tensor(transposed_data[1], dtype=torch.long)
-    T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1)
+    X_int = torch.log(torch.tensor(np.array(transposed_data[0]), dtype=torch.float) + 1)
+    X_cat = torch.tensor(np.array(transposed_data[1]), dtype=torch.long)
+    T = torch.tensor(np.array(transposed_data[2]), dtype=torch.float32).view(-1, 1)
 
     batchSize = X_cat.shape[0]
     featureCnt = X_cat.shape[1]
@@ -399,9 +399,9 @@ def diff(tensor):
 def collate_wrapper_criteo_length(list_of_tuples):
     # where each tuple is (X_int, X_cat, y)
     transposed_data = list(zip(*list_of_tuples))
-    X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1)
-    X_cat = torch.tensor(transposed_data[1], dtype=torch.long)
-    T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1)
+    X_int = torch.log(torch.tensor(np.array(transposed_data[0]), dtype=torch.float) + 1)
+    X_cat = torch.tensor(np.array(transposed_data[1]), dtype=torch.long)
+    T = torch.tensor(np.array(transposed_data[2]), dtype=torch.float32).view(-1, 1)
 
     batchSize = X_cat.shape[0]
     featureCnt = X_cat.shape[1]

From aae60757350c467c63893229e149223b6996400e Mon Sep 17 00:00:00 2001
From: future-xy <fuyao3860@gmail.com>
Date: Fri, 6 May 2022 15:13:34 +0100
Subject: [PATCH 3/3] fix the best acc bug: return best_acc_test from inference

---
 dlrm_s_pytorch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
index ec3394b4..373e0c48 100644
--- a/dlrm_s_pytorch.py
+++ b/dlrm_s_pytorch.py
@@ -887,7 +887,7 @@ def inference(
             ),
             flush=True,
         )
-    return model_metrics_dict, is_best
+    return model_metrics_dict, is_best, best_acc_test
 
 
 def run():
@@ -1658,7 +1658,7 @@ def run():
                         print(
                             "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
                         )
-                        model_metrics_dict, is_best = inference(
+                        model_metrics_dict, is_best, best_acc_test = inference(
                             args,
                             dlrm,
                             best_acc_test,