Skip to content

Commit

Permalink
#7; add utils.py and visualize.py (ColumnTypePlotter) to src
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi8893 committed Jul 2, 2021
1 parent d216469 commit e16ee26
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
24 changes: 24 additions & 0 deletions mlops/mpg-pred-end-to-end-ml/src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import re
import pandas as pd

def remove_extra_whitespace(s):
    '''
    collapse every run of whitespace in a string to a single space,
    after trimming leading and trailing whitespace
    '''
    trimmed = s.strip()
    collapsed = re.sub(r'\s+', r' ', trimmed)
    return collapsed

# TODO: Have option to lemmatize
def standardize_headers(headers):
    '''
    return a list of standardized headers

    each header has its extra whitespace removed, its remaining
    spaces replaced by underscores, and is converted to lowercase
    '''
    return [
        remove_extra_whitespace(header).replace(' ', '_').lower()
        for header in headers
    ]


def assure_index(df):
    '''
    replace the index of df, in place, with a RangeIndex running
    from 0 to the number of rows (step 1) and return the same object
    '''
    n_rows = len(df)
    df.index = pd.RangeIndex(n_rows)
    return df
40 changes: 40 additions & 0 deletions mlops/mpg-pred-end-to-end-ml/src/visualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import matplotlib.pyplot as plt
import numpy as np

# TODO: Add infer_column_types method
class ColumnTypePlotter:
    '''
    Plot the columns of a dataframe grouped by column type.

    Supported kinds are continuous (one histogram per column) and
    ordinal / nominal (one bar chart of index-sorted value counts per
    column).

    column_types maps a kind name to the sequence of column names of
    that kind. The kind passed to plot() is looked up in this mapping
    as-is, so it must match the key used there. Both the original
    spelling 'continous' and 'continuous' select the histogram plotter.
    '''

    def __init__(self, column_types):
        '''Store the mapping from kind name to column names.'''

        self.column_types = column_types

    def plot(self, data, kind, figsize=(12, 4)):
        '''
        Draw one subplot per column of the given kind, side by side.

        data: object indexable by a list of column names and by a
            single column name (presumably a pandas DataFrame)
        kind: key into column_types; also used as the figure title
        figsize: figure size forwarded to plt.subplots

        Returns (fig, axn) where axn is always a 1-D array of axes,
        including when there is only one column.

        Raises KeyError if kind is not a key of column_types, and
        ValueError if kind has no columns or no plotter exists for it.
        '''
        cols = list(self.column_types[kind])
        if not cols:
            raise ValueError(
                'no columns registered for kind {!r}'.format(kind)
            )

        # Resolve the plotter before creating the figure so that an
        # unsupported kind does not leave an empty figure behind.
        plotter_func = self._get_plotter_func(kind)

        df = data[cols]

        # squeeze=False always yields a 2-D array of axes, which avoids
        # special-casing the single-column layout.
        fig, axn = plt.subplots(
            nrows=1, ncols=len(cols), figsize=figsize, squeeze=False
        )
        axn = axn.ravel()

        for col, ax in zip(cols, axn):
            plotter_func(col, df, ax)
            ax.set_title(col, fontweight='bold')

        fig.suptitle(kind, fontweight='bold', fontsize=20)
        # Leave room at the top for the figure-level title.
        fig.subplots_adjust(top=0.8)

        return fig, axn

    @staticmethod
    def _get_plotter_func(kind):
        '''
        Return a function (col, data, ax) that draws column col of data
        on ax in the way appropriate for kind.

        Raises ValueError for an unsupported kind.
        '''

        def plot_histogram(col, data, ax):
            return ax.hist(data[col])

        def plot_value_counts(col, data, ax):
            counts = data[col].value_counts().sort_index()
            return counts.plot(kind='bar', ax=ax)

        # 'continous' is the spelling this class has always used; it is
        # kept so existing callers and column_types keys keep working.
        if kind in ('continous', 'continuous'):
            return plot_histogram
        if kind in ('ordinal', 'nominal'):
            return plot_value_counts

        raise ValueError(
            'unsupported kind {!r}; expected one of '
            "'continous', 'continuous', 'ordinal', 'nominal'".format(kind)
        )

0 comments on commit e16ee26

Please sign in to comment.