From 5f227ff834698f6127660dbaec187fc2d7af318c Mon Sep 17 00:00:00 2001
From: akharche
Date: Mon, 11 Jan 2021 13:52:12 +0300
Subject: [PATCH] Improve performance of data processing

Convert the hex-encoded categorical features with np.fromiter instead of
np.array(list(map(...))), avoiding an intermediate Python list per row, and
collect the unique categorical values once per column with np.unique after
the read loop instead of updating the dictionaries row by row.
---
 data_utils.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index bf76dfff..17bfea2d 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -1003,20 +1003,18 @@ def process_one_file(
                 y[i] = target
                 X_int[i] = np.array(line[1:14], dtype=np.int32)
                 if max_ind_range > 0:
-                    X_cat[i] = np.array(
-                        list(map(lambda x: int(x, 16) % max_ind_range, line[14:])),
+                    X_cat[i] = np.fromiter(
+                        map(lambda x: int(x, 16) % max_ind_range, line[14:]),
                         dtype=np.int32
                     )
                 else:
-                    X_cat[i] = np.array(
-                        list(map(lambda x: int(x, 16), line[14:])),
+                    X_cat[i] = np.fromiter(
+                        map(lambda x: int(x, 16), line[14:]),
                         dtype=np.int32
                     )
 
                 # count uniques
                 if dataset_multiprocessing:
-                    for j in range(26):
-                        convertDicts_day[j][X_cat[i][j]] = 1
                     # debug prints
                     if float(i)/num_data_in_split*100 > percent+1:
                         percent = int(float(i)/num_data_in_split*100)
@@ -1033,8 +1031,6 @@ def process_one_file(
                             end="\n",
                         )
                 else:
-                    for j in range(26):
-                        convertDicts[j][X_cat[i][j]] = 1
                     # debug prints
                     print(
                         "Load %d/%d Split: %d Label True: %d Stored: %d"
@@ -1049,6 +1045,16 @@ def process_one_file(
                     )
                 i += 1
 
+            if dataset_multiprocessing:
+                for j in range(26):
+                    unique_cats = np.unique(X_cat[:, j])
+                    for category in unique_cats:
+                        convertDicts_day[j][category] = 1
+            else:
+                for j in range(26):
+                    unique_cats = np.unique(X_cat[:, j])
+                    for category in unique_cats:
+                        convertDicts[j][category] = 1
             # store num_data_in_split samples or extras at the end of file
             # count uniques
             # X_cat_t = np.transpose(X_cat)
@@ -1146,7 +1152,7 @@ def process_one_file(
         if not path.exists(dict_file_j):
             np.savez_compressed(
                 dict_file_j,
-                unique=np.array(list(convertDicts[j]), dtype=np.int32)
+                unique=np.fromiter(convertDicts[j].keys(), dtype=np.int32)
             )
         counts[j] = len(convertDicts[j])
     # store (uniques and) counts
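
Note (not part of the patch): a minimal standalone sketch of the two optimizations the diff applies, shown on a tiny synthetic batch. Building each row with np.fromiter skips the intermediate list that np.array(list(map(...))) creates, and the unique categorical values are collected once per column with np.unique after the read loop rather than per row. All names, sizes, and sample data below are invented for illustration and are not taken from data_utils.py.

# Illustration only; assumes a made-up batch of hex-encoded categorical fields.
import numpy as np

NUM_CAT_FEATURES = 26          # the Criteo format has 26 categorical fields
MAX_IND_RANGE = 10000          # assumed cap, analogous to max_ind_range
rows = [["a3f", "1b"] * 13,    # fake hex-encoded categorical values
        ["07", "ffe"] * 13]

X_cat = np.zeros((len(rows), NUM_CAT_FEATURES), dtype=np.int32)
for i, line in enumerate(rows):
    # np.fromiter consumes the generator directly, so no intermediate
    # Python list is materialized (unlike np.array(list(map(...)))).
    X_cat[i] = np.fromiter(
        (int(x, 16) % MAX_IND_RANGE for x in line),
        dtype=np.int32,
        count=NUM_CAT_FEATURES,
    )

# Count unique categories once per column after the loop, instead of
# inserting a dict key for every row while reading the file.
convertDicts = [{} for _ in range(NUM_CAT_FEATURES)]
for j in range(NUM_CAT_FEATURES):
    for category in np.unique(X_cat[:, j]):
        convertDicts[j][category] = 1

print([len(d) for d in convertDicts])  # unique-category count per column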