Commit: Optimized function
jzsmoreno committed Dec 30, 2024
1 parent 11a531e commit a6a5378

Showing 2 changed files with 45 additions and 96 deletions.
dataframe_partial_join/dataframe_partial_join/main.py (117 changes: 36 additions & 81 deletions)

@@ -2,139 +2,94 @@

 import numpy as np
 import pandas as pd
+from IPython.display import clear_output


-# a function that returns the key used in the matching process
+# Optimized function to build the key
 def get_key(key):
-    x = str(key[0])
-    for i in range(1, len(key)):
-        x += str(key[i])
-    return x
+    return "".join(map(str, key))


-# a function that builds the key
+# Optimized function to handle NaN efficiently
 def cat_key(x):
-    if pd.isna(x):
-        return ""
-    else:
-        return x
+    return "" if pd.isna(x) else str(x)


-# a function that applies a mask to the variables used to construct the key
+# Optimized function to apply mask and construct keys
 def get_cat_key(df):
-    return np.array(df.applymap(cat_key))
+    return df.apply(lambda col: col.map(cat_key)).values


-# the function returns all the keys calculated above
+# Optimized function to return all keys
 def get_all_keys(df, _list):
     keys_ = get_cat_key(df[_list])
-
-    all_keys = []
-
-    for i in range(keys_.shape[0]):
-        all_keys.append(get_key(keys_[i, :]))
-
-    return all_keys
+    return ["".join(map(str, row)) for row in keys_]


-# a function returning all previously calculated keys and codes
+# Optimized function to get all keys
 def get_keys(df, _list):
     return get_all_keys(df, _list)


-# auxiliary function for console cleaning
-def clearConsole():
-    command = "clear"
-    if os.name in ("nt", "dos"):  # If the machine is running on Windows, use cls
-        command = "cls"
-    os.system(command)
+# Auxiliary function to clear console
+def clear_console():
+    clear_output(wait=True)
+    os.system("cls" if os.name in ("nt", "dos") else "clear")


-# auxiliary function to rename columns after each match
+# Optimized function to rename columns after matching
 def rename_cols(df):
     """
     Operates on a dataframe resulting from a join.
     Identifies cases in which similar columns with different information
     were renamed, and consolidates them.
     params:
         df (DataFrame) : the dataframe on which to operate
     returns:
         df (DataFrame) : the same dataframe with the columns consolidated
     example:
         df_1 = df_1.merge(df_2, how="left")
         df_1 = rename_cols(df_1)
     """
-    cols = []
-    for i in df.columns:
-        cols.append(i.replace("_x", ""))
-        cols.append(i.replace("_y", ""))
-
-    cols = [*set(cols)]
-
-    for i in cols:
-        try:
-            df[i + "_x"] = df[i + "_x"].fillna(df[i + "_y"])
-            df = df.drop(columns=[i + "_y"])
-            df.rename(columns={i + "_x": i}, inplace=True)
-        except:
-            None
-
+    for col in df.columns:
+        if "_x" in col:
+            base_col = col.replace("_x", "")
+            if base_col + "_y" in df.columns:
+                df[base_col] = df[base_col + "_x"].fillna(df[base_col + "_y"])
+                df.drop([base_col + "_x", base_col + "_y"], axis=1, inplace=True)
+                df.rename(columns={base_col + "_x": base_col}, inplace=True)
     return df


 ####################################################################################


+# Optimized like filter function
 def like_filter(df, filters):
-    mask = []
-    for i in filters:
-        try:
-            mask.append(df.filter(like=i).columns[0])
-        except:
-            None
-    return mask
+    return [col for i in filters for col in df.filter(like=i).columns]


+# Optimized match-making function with condition for dropping NaN filters
 def make_match(df1, df2, subset, key, dropna_filters):
     df1 = df1.drop_duplicates(subset=key)
-    df3 = df2[df2[dropna_filters].isnull().any(axis=1)].copy()
-    df3 = df3.merge(df1, how="left", on=key)
+    df3 = df2[df2[dropna_filters].isnull().any(axis=1)].merge(df1, how="left", on=key)
     df3 = rename_cols(df3)

     if len(df3) == len(df2):
         df2 = df2.combine_first(df3)
     else:
         df2 = df2.merge(df1, how="left", on=key)
         df2 = rename_cols(df2)
     del df3

-    if subset != None:
+    if subset:
         df2 = df2.drop_duplicates(subset=subset)
     else:
         df2 = df2.drop_duplicates()

     return df2


+# Optimized function to return concatenated dataframe
 def return_df(list_df_):
-    df_ = pd.DataFrame()
-    for df in list_df_:
-        df = rename_cols(df)
-        df_ = pd.concat([df, df_], ignore_index=True)
-    return df_
+    return pd.concat(list_df_, ignore_index=True)


-# a main function that performs the piecewise (chunks) matching process
+# Optimized partial merge function with reduced print overhead
 def partial_merge(df1, df2, keys_to, n=None, dropna_filters=[], subset=None):
-    if n != None:
+    if n:
         list_df_ = [df1[i : i + n] for i in range(0, df1.shape[0], n)]
         count = 0
         for j in keys_to:
             k = 0
             progress = 0
-            for df in list_df_:
+            for k, df in enumerate(list_df_):
                 df["key"] = get_keys(df, j)
                 df2["key"] = get_keys(df2, j)
                 print("Progress : ", "{:.2%}".format(count / len(keys_to)))
@@ -147,7 +102,7 @@ def partial_merge(df1, df2, keys_to, n=None, dropna_filters=[], subset=None):
                 )
                 print("Partial Progress : ", "{:.2%}".format(progress / len(list_df_)))
                 list_df_[k] = make_match(df2, df, subset, ["key"], dropna_filters)
-                clearConsole()
+                clear_console()
                 progress += 1
                 k += 1
             count += 1
@@ -167,7 +122,7 @@ def partial_merge(df1, df2, keys_to, n=None, dropna_filters=[], subset=None):
                 "{:,}".format(len(df2)),
             )
             df1 = make_match(df2, df1, subset, ["key"], dropna_filters)
-            clearConsole()
+            clear_console()
             count += 1
     df1 = df1.drop(columns=["key"])
     return df1
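
Note: rename_cols consolidates the _x/_y column pairs that pandas creates when a merge brings in overlapping column names. A minimal sketch with hypothetical frames (the rename call in the new implementation is a no-op once both suffixed columns are dropped, so it is omitted here):

import pandas as pd

# rename_cols as introduced in this commit, minus the dead rename call.
def rename_cols(df):
    for col in df.columns:
        if "_x" in col:
            base_col = col.replace("_x", "")
            if base_col + "_y" in df.columns:
                # Prefer the _x value, fall back to _y, then drop both suffixed columns.
                df[base_col] = df[base_col + "_x"].fillna(df[base_col + "_y"])
                df.drop([base_col + "_x", base_col + "_y"], axis=1, inplace=True)
    return df

left = pd.DataFrame({"key": ["a", "b"], "score": [1.0, None]})   # hypothetical
right = pd.DataFrame({"key": ["a", "b"], "score": [None, 2.0]})  # hypothetical
merged = rename_cols(left.merge(right, how="left", on="key"))
print(merged)  # one consolidated 'score' column: [1.0, 2.0]
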
dataframe_partial_join/example_of_use.ipynb (24 changes: 9 additions & 15 deletions)

@@ -266,12 +266,6 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Progress : 0.00%\n",
"aggregation : 0\n",
"Total Size df1: 3 | Total Size df2: 3\n",
"Progress : 33.33%\n",
"aggregation : 1\n",
"Total Size df1: 3 | Total Size df2: 3\n",
"Progress : 66.67%\n",
"aggregation : 2\n",
"Total Size df1: 3 | Total Size df2: 3\n"
@@ -309,50 +303,50 @@
      "  <tr style=\"text-align: right;\">\n",
      "    <th></th>\n",
      "    <th>id</th>\n",
+     "    <th>performance</th>\n",
      "    <th>calories</th>\n",
      "    <th>duration</th>\n",
      "    <th>height</th>\n",
      "    <th>weight</th>\n",
-     "    <th>performance</th>\n",
      "  </tr>\n",
      " </thead>\n",
      " <tbody>\n",
      "  <tr>\n",
      "    <th>0</th>\n",
      "    <td>0</td>\n",
+     "    <td>1.0</td>\n",
      "    <td>420.0</td>\n",
      "    <td>50.0</td>\n",
      "    <td>39.0</td>\n",
      "    <td>110.0</td>\n",
-     "    <td>1.0</td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <th>1</th>\n",
      "    <td>1</td>\n",
+     "    <td>3.0</td>\n",
      "    <td>-1.0</td>\n",
      "    <td>40.0</td>\n",
      "    <td>35.0</td>\n",
      "    <td>132.0</td>\n",
-     "    <td>3.0</td>\n",
      "  </tr>\n",
      "  <tr>\n",
      "    <th>2</th>\n",
      "    <td>2</td>\n",
+     "    <td>2.0</td>\n",
      "    <td>390.0</td>\n",
      "    <td>-1.0</td>\n",
      "    <td>42.0</td>\n",
      "    <td>143.0</td>\n",
-     "    <td>2.0</td>\n",
      "  </tr>\n",
      " </tbody>\n",
      "</table>\n",
      "</div>"
     ],
     "text/plain": [
-     "   id  calories  duration  height  weight  performance\n",
-     "0   0     420.0      50.0    39.0   110.0          1.0\n",
-     "1   1      -1.0      40.0    35.0   132.0          3.0\n",
-     "2   2     390.0      -1.0    42.0   143.0          2.0"
+     "   id  performance  calories  duration  height  weight\n",
+     "0   0          1.0     420.0      50.0    39.0   110.0\n",
+     "1   1          3.0      -1.0      40.0    35.0   132.0\n",
+     "2   2          2.0     390.0      -1.0    42.0   143.0"
     ]
    },
    "execution_count": 9,
@@ -388,7 +382,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.11.9"
},
"orig_nbformat": 4,
"vscode": {
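
Note: end to end, the notebook drives partial_merge roughly as sketched below. The frames are loosely modeled on the notebook output above; the import path is an assumption, and n (chunking of df1) and dropna_filters are left at their defaults:

import pandas as pd

from dataframe_partial_join.main import partial_merge  # assumed import path

# Hypothetical fragments linked by 'id'; -1.0 stands in for unknown values.
df1 = pd.DataFrame({"id": [0, 1, 2],
                    "calories": [420.0, -1.0, 390.0],
                    "duration": [50.0, 40.0, -1.0]})
df2 = pd.DataFrame({"id": [0, 1, 2],
                    "height": [39.0, 35.0, 42.0],
                    "weight": [110.0, 132.0, 143.0],
                    "performance": [1.0, 3.0, 2.0]})

# keys_to is a list of column lists; each inner list is concatenated into one string match key.
result = partial_merge(df1, df2, keys_to=[["id"]])
print(result)  # df1 enriched with height, weight and performance from df2

One side effect worth knowing: the lookup frame (df2 here) keeps the temporary "key" column after the call, since only df1's copy is dropped.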
