Commit: Optimized function
jzsmoreno committed Dec 30, 2024
1 parent 11a531e commit a6a5378

Showing 2 changed files with 45 additions and 96 deletions.
dataframe_partial_join/dataframe_partial_join/main.py (117 changes: 36 additions & 81 deletions)

@@ -2,139 +2,94 @@

 import numpy as np
 import pandas as pd
+from IPython.display import clear_output


-# a function that returns the key used in the matching process
+# Optimized function to build the key
 def get_key(key):
-    x = str(key[0])
-    for i in range(1, len(key)):
-        x += str(key[i])
-    return x
+    return "".join(map(str, key))


-# a function that builds the key
+# Optimized function to handle NaN efficiently
 def cat_key(x):
-    if pd.isna(x):
-        return ""
-    else:
-        return x
+    return "" if pd.isna(x) else str(x)


-# a function that applies a mask to the variables used to construct the key
+# Optimized function to apply mask and construct keys
 def get_cat_key(df):
-    return np.array(df.applymap(cat_key))
+    return df.apply(lambda col: col.map(cat_key)).values


-# the function returns all the keys calculated above
+# Optimized function to return all keys
 def get_all_keys(df, _list):
     keys_ = get_cat_key(df[_list])
-
-    all_keys = []
-
-    for i in range(keys_.shape[0]):
-        all_keys.append(get_key(keys_[i, :]))
-
-    return all_keys
+    return ["".join(map(str, row)) for row in keys_]


-# a function returning all previously calculated keys and codes
+# Optimized function to get all keys
 def get_keys(df, _list):
     return get_all_keys(df, _list)


-# auxiliary function for console cleaning
-def clearConsole():
-    command = "clear"
-    if os.name in ("nt", "dos"):  # If the machine is running on Windows, use cls
-        command = "cls"
-    os.system(command)
+# Auxiliary function to clear console
+def clear_console():
+    clear_output(wait=True)
+    os.system("cls" if os.name in ("nt", "dos") else "clear")


-# auxiliary function to rename columns after each match
+# Optimized function to rename columns after matching
 def rename_cols(df):
     """
     Operates on a dataframe resulting from a join.
     Identifies cases in which similar columns with different information
     were renamed, and consolidates them.
     params:
         df (DataFrame) : the dataframe on which to operate
     returns:
         df (DataFrame) : the same dataframe with the columns consolidated
     example:
         df_1 = df_1.merge(df_2, how="left")
         df_1 = rename_cols(df_1)
     """
-    cols = []
-    for i in df.columns:
-        cols.append(i.replace("_x", ""))
-        cols.append(i.replace("_y", ""))
-
-    cols = [*set(cols)]
-
-    for i in cols:
-        try:
-            df[i + "_x"] = df[i + "_x"].fillna(df[i + "_y"])
-            df = df.drop(columns=[i + "_y"])
-            df.rename(columns={i + "_x": i}, inplace=True)
-        except:
-            None
-
+    for col in df.columns:
+        if "_x" in col:
+            base_col = col.replace("_x", "")
+            if base_col + "_y" in df.columns:
+                df[base_col] = df[base_col + "_x"].fillna(df[base_col + "_y"])
+                df.drop([base_col + "_x", base_col + "_y"], axis=1, inplace=True)
+                df.rename(columns={base_col + "_x": base_col}, inplace=True)
     return df


 ####################################################################################


+# Optimized like filter function
 def like_filter(df, filters):
-    mask = []
-    for i in filters:
-        try:
-            mask.append(df.filter(like=i).columns[0])
-        except:
-            None
-    return mask
+    return [col for i in filters for col in df.filter(like=i).columns]


+# Optimized match-making function with condition for dropping NaN filters
 def make_match(df1, df2, subset, key, dropna_filters):
     df1 = df1.drop_duplicates(subset=key)
-    df3 = df2[df2[dropna_filters].isnull().any(axis=1)].copy()
-    df3 = df3.merge(df1, how="left", on=key)
+    df3 = df2[df2[dropna_filters].isnull().any(axis=1)].merge(df1, how="left", on=key)
     df3 = rename_cols(df3)

     if len(df3) == len(df2):
         df2 = df2.combine_first(df3)
     else:
         df2 = df2.merge(df1, how="left", on=key)
         df2 = rename_cols(df2)
     del df3

-    if subset != None:
+    if subset:
         df2 = df2.drop_duplicates(subset=subset)
     else:
         df2 = df2.drop_duplicates()

     return df2


+# Optimized function to return concatenated dataframe
 def return_df(list_df_):
-    df_ = pd.DataFrame()
-    for df in list_df_:
-        df = rename_cols(df)
-        df_ = pd.concat([df, df_], ignore_index=True)
-    return df_
+    return pd.concat(list_df_, ignore_index=True)


-# a main function that performs the piecewise (chunks) matching process
+# Optimized partial merge function with reduced print overhead
 def partial_merge(df1, df2, keys_to, n=None, dropna_filters=[], subset=None):
-    if n != None:
+    if n:
         list_df_ = [df1[i : i + n] for i in range(0, df1.shape[0], n)]
         count = 0
         for j in keys_to:
             k = 0
             progress = 0
-            for df in list_df_:
+            for k, df in enumerate(list_df_):
                 df["key"] = get_keys(df, j)
                 df2["key"] = get_keys(df2, j)
                 print("Progress : ", "{:.2%}".format(count / len(keys_to)))
@@ -147,7 +102,7 @@ def partial_merge(df1, df2, keys_to, n=None, dropna_filters=[], subset=None):
                 )
                 print("Partial Progress : ", "{:.2%}".format(progress / len(list_df_)))
                 list_df_[k] = make_match(df2, df, subset, ["key"], dropna_filters)
-                clearConsole()
+                clear_console()
                 progress += 1
                 k += 1
             count += 1
@@ -167,7 +122,7 @@ def partial_merge(df1, df2, keys_to, n=None, dropna_filters=[], subset=None):
                 "{:,}".format(len(df2)),
             )
             df1 = make_match(df2, df1, subset, ["key"], dropna_filters)
-            clearConsole()
+            clear_console()
             count += 1
     df1 = df1.drop(columns=["key"])
     return df1
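
Note: rename_cols consolidates the _x/_y column pairs that pandas creates when a merge brings in overlapping column names. A minimal sketch with hypothetical frames (the rename call in the new implementation is a no-op once both suffixed columns are dropped, so it is omitted here):

import pandas as pd

# rename_cols as introduced in this commit, minus the dead rename call.
def rename_cols(df):
    for col in df.columns:
        if "_x" in col:
            base_col = col.replace("_x", "")
            if base_col + "_y" in df.columns:
                # Prefer the _x value, fall back to _y, then drop both suffixed columns.
                df[base_col] = df[base_col + "_x"].fillna(df[base_col + "_y"])
                df.drop([base_col + "_x", base_col + "_y"], axis=1, inplace=True)
    return df

left = pd.DataFrame({"key": ["a", "b"], "score": [1.0, None]})   # hypothetical
right = pd.DataFrame({"key": ["a", "b"], "score": [None, 2.0]})  # hypothetical
merged = rename_cols(left.merge(right, how="left", on="key"))
print(merged)  # one consolidated 'score' column: [1.0, 2.0]
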
dataframe_partial_join/example_of_use.ipynb (24 changes: 9 additions & 15 deletions)

@@ -266,12 +266,6 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Progress : 0.00%\n",
"aggregation : 0\n",
"Total Size df1: 3 | Total Size df2: 3\n",
"Progress : 33.33%\n",
"aggregation : 1\n",
"Total Size df1: 3 | Total Size df2: 3\n",
"Progress : 66.67%\n",
"aggregation : 2\n",
"Total Size df1: 3 | Total Size df2: 3\n"
@@ -309,50 +303,50 @@
      "  <tr style=\"text-align: right;\">\n",
      "    <th></th>\n",
      "    <th>id</th>\n",
+     "    <th>performance</th>\n",
      "    <th>calories</th>\n",
      "    <th>duration</th>\n",
      "    <th>height</th>\n",
      "    <th>weight</th>\n",
-     "    <th>performance</th>\n",
      "  </tr>\n",
      " </thead>\n",
      " <tbody>\n",
      "  <tr>\n",
      "    <th>0</th>\n",
      "    <td>0</td>\n",
+     "    <td>1.0</td>\n",
      "    <td>420.0</td>\n",
      "    <td>50.0</td>\n",
      "    <td>39.0</td>\n",
      "    <td>110.0</td>\n",
-     "    <td>1.0</td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <th>1</th>\n",
      "    <td>1</td>\n",
+     "    <td>3.0</td>\n",
      "    <td>-1.0</td>\n",
      "    <td>40.0</td>\n",
      "    <td>35.0</td>\n",
      "    <td>132.0</td>\n",
-     "    <td>3.0</td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <th>2</th>\n",
      "    <td>2</td>\n",
+     "    <td>2.0</td>\n",
      "    <td>390.0</td>\n",
      "    <td>-1.0</td>\n",
      "    <td>42.0</td>\n",
      "    <td>143.0</td>\n",
-     "    <td>2.0</td>\n",
      "  </tr>\n",
      " </tbody>\n",
      "</table>\n",
      "</div>"
     ],
     "text/plain": [
-     "   id  calories  duration  height  weight  performance\n",
-     "0   0     420.0      50.0    39.0   110.0          1.0\n",
-     "1   1      -1.0      40.0    35.0   132.0          3.0\n",
-     "2   2     390.0      -1.0    42.0   143.0          2.0"
+     "   id  performance  calories  duration  height  weight\n",
+     "0   0          1.0     420.0      50.0    39.0   110.0\n",
+     "1   1          3.0      -1.0      40.0    35.0   132.0\n",
+     "2   2          2.0     390.0      -1.0    42.0   143.0"
     ]
    },
    "execution_count": 9,
@@ -388,7 +382,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.11.9"
},
"orig_nbformat": 4,
"vscode": {
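
Note: end to end, the notebook drives partial_merge roughly as sketched below. The frames are loosely modeled on the notebook output above; the import path is an assumption, and n (chunking of df1) and dropna_filters are left at their defaults:

import pandas as pd

from dataframe_partial_join.main import partial_merge  # assumed import path

# Hypothetical fragments linked by 'id'; -1.0 stands in for unknown values.
df1 = pd.DataFrame({"id": [0, 1, 2],
                    "calories": [420.0, -1.0, 390.0],
                    "duration": [50.0, 40.0, -1.0]})
df2 = pd.DataFrame({"id": [0, 1, 2],
                    "height": [39.0, 35.0, 42.0],
                    "weight": [110.0, 132.0, 143.0],
                    "performance": [1.0, 3.0, 2.0]})

# keys_to is a list of column lists; each inner list is concatenated into one string match key.
result = partial_merge(df1, df2, keys_to=[["id"]])
print(result)  # df1 enriched with height, weight and performance from df2

One side effect worth knowing: the lookup frame (df2 here) keeps the temporary "key" column after the call, since only df1's copy is dropped.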
