-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#7; add utils.py and visualize.py (ColumnTypePlotter) to src
- Loading branch information
Showing
2 changed files
with
64 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import re | ||
import pandas as pd | ||
|
||
def remove_extra_whitespace(s):
    '''Collapse every run of whitespace in ``s`` to one space and trim the ends.

    Parameters
    ----------
    s : str
        The text to normalise.

    Returns
    -------
    str
        ``s`` with leading/trailing whitespace removed and each internal run
        of whitespace (spaces, tabs, newlines, ...) replaced by a single space.
    '''
    # str.split() with no separator splits on runs of whitespace and discards
    # empty leading/trailing pieces, so joining the pieces with one space
    # both trims and collapses in a single pass.
    return ' '.join(s.split())
|
||
# TODO: Have option to lemmatize | ||
def standardize_headers(headers):
    '''Standardize column headers to lower-case, underscore-separated names.

    Each header has its extra whitespace removed (via
    ``remove_extra_whitespace``), its remaining spaces replaced by
    underscores, and is then lower-cased.

    Parameters
    ----------
    headers : iterable of str
        The raw header names.

    Returns
    -------
    list of str
        The standardized headers, in the same order as the input.
    '''
    return [
        remove_extra_whitespace(header).replace(' ', '_').lower()
        for header in headers
    ]
|
||
|
||
def assure_index(df):
    '''Give ``df`` a ``RangeIndex`` running from 0 to ``len(df) - 1`` (step 1).

    The index is replaced in place: the frame passed in is modified, and the
    same object is returned for convenient chaining.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame whose index should be reset.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now indexed by ``RangeIndex(0, len(df))``.
    '''
    df.index = pd.RangeIndex(start=0, stop=len(df))
    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
|
||
# TODO: Add infer_column_types method | ||
class ColumnTypePlotter:
    '''Plot the columns of a dataframe grouped by column type.

    ``column_types`` maps a kind name to the columns of that kind. Two
    plotting styles are available:

    * ``'continous'`` (legacy spelling, kept for existing callers) or
      ``'continuous'``: one histogram per column;
    * ``'ordinal'`` or ``'nominal'``: one bar chart of value counts per
      column, ordered by value.
    '''

    # Both spellings are accepted so mappings keyed by the historical
    # misspelling keep working.
    _HISTOGRAM_KINDS = ('continous', 'continuous')
    _BAR_KINDS = ('ordinal', 'nominal')

    def __init__(self, column_types):
        '''Store the mapping from kind name to the columns of that kind.'''
        self.column_types = column_types

    def plot(self, data, kind, figsize=(12, 4)):
        '''Draw one subplot per column registered under ``kind``.

        Parameters
        ----------
        data : pandas.DataFrame
            The frame holding the columns to plot.
        kind : str
            Key into ``column_types``; also selects the plotting style.
        figsize : tuple, optional
            Figure size passed through to ``plt.subplots``.

        Returns
        -------
        (fig, axn)
            The figure and a 1-D array of its axes, one per column.

        Raises
        ------
        KeyError
            If ``kind`` is not a key of ``column_types``.
        ValueError
            If ``kind`` has no plotting style or no columns are registered
            for it.
        '''
        cols = self.column_types[kind]
        # Resolve the plotting function before creating the figure so an
        # unsupported kind does not leave an empty figure behind.
        plotter_func = self._get_plotter_func(kind)
        # len() rather than truthiness: cols may be an array-like whose
        # truth value is ambiguous.
        if len(cols) == 0:
            raise ValueError('no columns registered for kind %r' % (kind,))

        df = data[cols]
        # squeeze=False always yields a 2-D array of axes, so the
        # single-column case needs no special handling.
        fig, axes = plt.subplots(
            nrows=1, ncols=len(cols), figsize=figsize, squeeze=False
        )
        axn = axes.ravel()

        for col, ax in zip(cols, axn):
            plotter_func(col, df, ax)
            ax.set_title(col, fontweight='bold')

        fig.suptitle(kind, fontweight='bold', fontsize=20)
        # Leave room for the suptitle; adjust this figure explicitly rather
        # than whatever pyplot considers the current one.
        fig.subplots_adjust(top=0.8)

        return fig, axn

    @staticmethod
    def _plot_histogram(col, data, ax):
        '''Draw a histogram of ``data[col]`` on ``ax``.'''
        # Drop missing values first: a NaN in the input makes the histogram
        # range non-finite. value_counts() already ignores NaN by default,
        # so this keeps both plotting styles consistent.
        return ax.hist(data[col].dropna())

    @staticmethod
    def _plot_value_counts(col, data, ax):
        '''Draw a bar chart of the value counts of ``data[col]`` on ``ax``.'''
        return data[col].value_counts().sort_index().plot(kind='bar', ax=ax)

    @staticmethod
    def _get_plotter_func(kind):
        '''Return the ``(col, data, ax)`` plotting callable for ``kind``.

        Raises
        ------
        ValueError
            If ``kind`` is not one of the supported kinds.
        '''
        if kind in ColumnTypePlotter._HISTOGRAM_KINDS:
            return ColumnTypePlotter._plot_histogram
        if kind in ColumnTypePlotter._BAR_KINDS:
            return ColumnTypePlotter._plot_value_counts
        raise ValueError(
            'unsupported kind %r; expected one of %r'
            % (kind, ColumnTypePlotter._HISTOGRAM_KINDS + ColumnTypePlotter._BAR_KINDS)
        )