From 2aa79197748e3a9b33d753869b53051580d73cba Mon Sep 17 00:00:00 2001 From: Elton Law Date: Sat, 13 Jul 2019 01:59:51 -0400 Subject: [PATCH] [#73] fix linting issues --- .gitignore | 1 + .pylintrc | 4 ++-- CONTRIBUTING.md | 4 +++- impyute/__init__.py | 7 ++++-- impyute/dataset/__init__.py | 4 +--- impyute/dataset/base.py | 13 +++-------- impyute/deletion/__init__.py | 4 +--- impyute/imputation/__init__.py | 4 +--- impyute/imputation/cs/__init__.py | 7 ++---- impyute/imputation/cs/buck_iterative.py | 27 ++++++++-------------- impyute/imputation/cs/central_tendency.py | 8 +++---- impyute/imputation/cs/em.py | 4 +--- impyute/imputation/cs/fast_knn.py | 24 ++++++++++--------- impyute/imputation/cs/random.py | 4 +--- impyute/imputation/ts/__init__.py | 6 ++--- impyute/imputation/ts/locf.py | 1 - impyute/imputation/ts/moving_window.py | 7 ++---- impyute/util/__init__.py | 7 ++---- impyute/util/checks.py | 1 - impyute/util/compare.py | 4 +--- impyute/util/count_missing.py | 1 - impyute/util/describe.py | 1 - impyute/util/errors.py | 8 +++---- impyute/util/find_null.py | 1 - impyute/util/inverse_distance_weighting.py | 3 ++- impyute/util/mcar_test.py | 19 --------------- impyute/util/preprocess.py | 9 ++++---- 27 files changed, 63 insertions(+), 120 deletions(-) delete mode 100644 impyute/util/mcar_test.py diff --git a/.gitignore b/.gitignore index 381ee3f..24a686f 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ build/ dist/ impyute.egg-info/ *.swo +syntastic diff --git a/.pylintrc b/.pylintrc index 4bcb4a1..2becba0 100644 --- a/.pylintrc +++ b/.pylintrc @@ -193,7 +193,7 @@ max-nested-blocks=5 [FORMAT] # Maximum number of characters on a single line. -max-line-length=79 +max-line-length=120 # Regexp for a line that is allowed to be longer than the limit. ignore-long-lines=^\s*(# )??$ @@ -247,7 +247,7 @@ ignore-comments=yes ignore-docstrings=yes # Ignore imports when computing similarities. 
-ignore-imports=no +ignore-imports=yes [SPELLING] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6a0de44..31bf45c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,10 +15,12 @@ This project was built for Python 3.5 and 3.6. (If you would like to work on ext Using [Sphinx's autodoc](http://www.sphinx-doc.org/en/stable/ext/autodoc.html) module, docstrings are used as the documentation. Make sure that all docstrings are formatted according to the [NumPy/SciPy Docstring Standard](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard) -Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need pylint installed: [install pylint](https://www.pylint.org/#install) +Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance (mostly) with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need pylint installed: [install pylint](https://www.pylint.org/#install). I recommend [integrating it with your editor](https://docs.pylint.org/en/1.6.0/ide-integration.html) or you can call it from bash with: $ pylint --rcfile=.pylintrc impyute/ +Fix all warnings raised, if you feel that the warning isn't justified/serves no purpose, then feel free to [disable the specific message](http://pylint.pycqa.org/en/latest/user_guide/message-control.html) for whatever blocks are causing it. + To run unit tests you will need [Docker](https://docs.docker.com/install/). The unit testing framework used is the built-in one, [`unittest`](https://docs.python.org/3.6/library/unittest.html). Put unit tests in the `test` directory in root. The testing environment works like this: 1) Build a docker image with multiple python versions 2) Run the container with pytest for each python version. 
$ make test diff --git a/impyute/__init__.py b/impyute/__init__.py index a0bb349..4b34f6a 100644 --- a/impyute/__init__.py +++ b/impyute/__init__.py @@ -1,5 +1,8 @@ -""" -Library of missing data imputations +""" impyute: Data imputations library to preprocess datasets with missing data + +impyute.imputations.cs: Imputations on cross sectional data +impyute.imputations.ts: Imputations on time series data +impyute.deletion: Deletion type missing data handling """ # pylint: disable=wrong-import-position diff --git a/impyute/dataset/__init__.py b/impyute/dataset/__init__.py index 1c17f18..96d5959 100644 --- a/impyute/dataset/__init__.py +++ b/impyute/dataset/__init__.py @@ -1,6 +1,4 @@ -""" -Real-world/mock datasets and missingness corruptors to experiment with. -""" +""" Real-world/mock datasets and missingness corruptors to experiment with. """ from .base import randu from .base import randn from .base import test_data diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py index 830b5d9..f492a8e 100644 --- a/impyute/dataset/base.py +++ b/impyute/dataset/base.py @@ -1,14 +1,8 @@ -""" impyute.dataset.base - -Load/generate data - -""" +""" Shared functions to load/generate data """ import numpy as np from impyute.dataset.corrupt import Corruptor - -def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", - thr=0.2, dtype="int"): +def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"): """ Return randomly generated dataset of numbers with uniformly distributed values between bound[0] and bound[1] @@ -40,8 +34,7 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", return raw_data -def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, - dtype="float"): +def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"): """ Return randomly generated dataset of numbers with normally distributed values with given mu and sigma.
diff --git a/impyute/deletion/__init__.py b/impyute/deletion/__init__.py index 1ff05b4..381a622 100644 --- a/impyute/deletion/__init__.py +++ b/impyute/deletion/__init__.py @@ -1,6 +1,4 @@ -""" -Missing data approaches that delete values. -""" +""" Missing data approaches that delete values. """ from .complete_case import complete_case diff --git a/impyute/imputation/__init__.py b/impyute/imputation/__init__.py index e5dc14c..e7581aa 100644 --- a/impyute/imputation/__init__.py +++ b/impyute/imputation/__init__.py @@ -1,5 +1,3 @@ -""" -Imputations for cross-sectional and time-series data. -""" +""" Imputations for cross-sectional and time-series data. """ __all__ = ["cs", "ts"] diff --git a/impyute/imputation/cs/__init__.py b/impyute/imputation/cs/__init__.py index a17ec85..5b7e302 100644 --- a/impyute/imputation/cs/__init__.py +++ b/impyute/imputation/cs/__init__.py @@ -1,6 +1,4 @@ -""" -Imputations for cross-sectional data. -""" +""" Imputations for cross-sectional data. """ from .random import random from .central_tendency import mean @@ -10,5 +8,4 @@ from .em import em from .fast_knn import fast_knn -__all__ = ["random", "mean", "mode", - "median", "buck_iterative", "em", "fast_knn"] +__all__ = ["random", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"] diff --git a/impyute/imputation/cs/buck_iterative.py b/impyute/imputation/cs/buck_iterative.py index e01bc85..d255832 100644 --- a/impyute/imputation/cs/buck_iterative.py +++ b/impyute/imputation/cs/buck_iterative.py @@ -5,23 +5,19 @@ from impyute.util import checks from impyute.util import preprocess # pylint: disable=too-many-locals -# pylint:disable=invalid-name -# pylint:disable=unused-argument @preprocess @checks -def buck_iterative(data, **kwargs): - """Multivariate Imputation by Chained Equations +def buck_iterative(data): + """ Iterative variant of buck's method - Reference: - Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate - Imputation by Chained Equations in R. 
Journal of Statistical Software, - 45(3). doi:10.18637/jss.v045.i03 + - Variable to regress on is chosen at random. + - EM type infinite regression loop stops after change in prediction from + previous prediction < 10% for all columns with missing values - Implementation follows the main idea from the paper above. Differs in - decision of which variable to regress on (here, I choose it at random). - Also differs in stopping criterion (here the model stops after change in - prediction from previous prediction is less than 10%). + A Method of Estimation of Missing Values in Multivariate Data Suitable for + use with an Electronic Computer S. F. Buck Journal of the Royal Statistical + Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306 Parameters ---------- @@ -41,7 +37,7 @@ def buck_iterative(data, **kwargs): null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv] temp = [] - cols_missing = set([y for _, y, _ in null_xyv]) + cols_missing = {y for _, y, _ in null_xyv} # Step 1: Simple Imputation, these are just placeholders for x_i, y_i, value in null_xyv: @@ -84,8 +80,5 @@ def buck_iterative(data, **kwargs): delta = (new_value-value)/0.01 else: delta = (new_value-value)/value - if abs(delta) < 0.1: - converged[i] = True - else: - converged[i] = False + converged[i] = abs(delta) < 0.1 return data diff --git a/impyute/imputation/cs/central_tendency.py b/impyute/imputation/cs/central_tendency.py index fd7d978..53fdf2e 100644 --- a/impyute/imputation/cs/central_tendency.py +++ b/impyute/imputation/cs/central_tendency.py @@ -3,12 +3,10 @@ from impyute.util import find_null from impyute.util import checks from impyute.util import preprocess -# pylint:disable=unused-argument -# pylint:disable=invalid-name @preprocess @checks -def mean(data, **kwargs): +def mean(data): """ Substitute missing values with the mean of that column. 
Parameters @@ -31,7 +29,7 @@ def mean(data, **kwargs): @preprocess @checks -def median(data, **kwargs): +def median(data): """ Substitute missing values with the median of that column(middle). Parameters @@ -58,7 +56,7 @@ def median(data, **kwargs): @preprocess @checks -def mode(data, **kwargs): +def mode(data): """ Substitute missing values with the mode of that column(most frequent). In the case that there is a tie (there are multiple, most frequent values) diff --git a/impyute/imputation/cs/em.py b/impyute/imputation/cs/em.py index 8a778b6..4eb28e1 100644 --- a/impyute/imputation/cs/em.py +++ b/impyute/imputation/cs/em.py @@ -3,12 +3,10 @@ from impyute.util import find_null from impyute.util import preprocess from impyute.util import checks -# pylint:disable=invalid-name -# pylint:disable=unused-argument @preprocess @checks -def em(data, loops=50, **kwargs): +def em(data, loops=50): """ Imputes given data using expectation maximization. E-step: Calculates the expected complete data log likelihood ratio. 
diff --git a/impyute/imputation/cs/fast_knn.py b/impyute/imputation/cs/fast_knn.py index 3471175..217658d 100644 --- a/impyute/imputation/cs/fast_knn.py +++ b/impyute/imputation/cs/fast_knn.py @@ -1,18 +1,16 @@ """ impyute.imputation.cs.knn """ import numpy as np +from scipy.spatial import KDTree from impyute.util import find_null from impyute.util import checks from impyute.util import preprocess from impyute.util import inverse_distance_weighting as util_idw from impyute.imputation.cs import mean -from scipy.spatial import KDTree -# pylint: disable=invalid-name -# pylint:disable=unused-argument +# pylint: disable=too-many-arguments @preprocess @checks -def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, - idw=util_idw.shepards): +def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, idw=util_idw.shepards): """ Impute using a variant of the nearest neighbours approach Basic idea: Impute array with a basic mean impute and then use the resulting complete @@ -31,14 +29,16 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, k: int, optional Parameter used for method querying the KDTree class object. Number of neighbours used in the KNN query. Refer to the docs for - [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). + [`scipy.spatial.KDTree.query`] + (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). eps: nonnegative float, optional Parameter used for method querying the KDTree class object. From the SciPy docs: "Return approximate nearest neighbors; the kth returned value is guaranteed to be no further than (1+eps) times the distance to the real kth nearest neighbor". Refer to the docs for - [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). 
+ [`scipy.spatial.KDTree.query`] + (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). p : float, 1<=p<=infinity, optional Parameter used for method querying the KDTree class object. Straight from the @@ -46,7 +46,8 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, sum-of-absolute-values Manhattan distance 2 is the usual Euclidean distance infinity is the maximum-coordinate-difference distance". Refer to the docs for - [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). + [`scipy.spatial.KDTree.query`] + (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). distance_upper_bound : nonnegative float, optional Parameter used for method querying the KDTree class object. Straight @@ -54,7 +55,8 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, is used to prune tree searches, so if you are doing a series of nearest-neighbor queries, it may help to supply the distance to the nearest neighbor of the most recent point." Refer to the docs for - [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). + [`scipy.spatial.KDTree.query`] + (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). leafsize: int, optional Parameter used for construction of the `KDTree` class object. Straight from @@ -65,8 +67,8 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, idw: fn, optional Function that takes one argument, a list of distances, and returns weighted percentages. 
You can define a custom - one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using functools.partial, - for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)` + one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using + functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)` Returns ------- diff --git a/impyute/imputation/cs/random.py b/impyute/imputation/cs/random.py index 7d850b0..96653ee 100644 --- a/impyute/imputation/cs/random.py +++ b/impyute/imputation/cs/random.py @@ -3,12 +3,10 @@ from impyute.util import find_null from impyute.util import preprocess from impyute.util import checks -# pylint:disable=invalid-name -# pylint:disable=unused-argument @preprocess @checks -def random(data, **kwargs): +def random(data): """ Fill missing values in with a randomly selected value from the same column. diff --git a/impyute/imputation/ts/__init__.py b/impyute/imputation/ts/__init__.py index b91066b..5680abf 100644 --- a/impyute/imputation/ts/__init__.py +++ b/impyute/imputation/ts/__init__.py @@ -1,8 +1,6 @@ -""" -Imputations for time-series data. -""" +""" Imputations for time-series data. 
""" from .locf import locf from .moving_window import moving_window -__all__ = ["locf", "moving_window"] # , "dsae"] +__all__ = ["locf", "moving_window"] diff --git a/impyute/imputation/ts/locf.py b/impyute/imputation/ts/locf.py index d1ce55a..42521cf 100644 --- a/impyute/imputation/ts/locf.py +++ b/impyute/imputation/ts/locf.py @@ -4,7 +4,6 @@ from impyute.util import checks from impyute.util import preprocess - @preprocess @checks def locf(data, axis=0): diff --git a/impyute/imputation/ts/moving_window.py b/impyute/imputation/ts/moving_window.py index 0baed5c..10a3c56 100644 --- a/impyute/imputation/ts/moving_window.py +++ b/impyute/imputation/ts/moving_window.py @@ -3,13 +3,11 @@ from impyute.util import find_null from impyute.util import checks from impyute.util import preprocess -# pylint: disable=invalid-name -# pylint:disable=unused-argument +# pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition @preprocess @checks -def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, - inplace=False, **kwargs): +def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, inplace=False): """ Interpolate the missing values based on nearby values. For example, with an array like this: @@ -42,7 +40,6 @@ def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, You can also do something like take 1.5x the max of previous values in the window: moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1) - Parameters ---------- diff --git a/impyute/util/__init__.py b/impyute/util/__init__.py index 883b183..fd38ce2 100644 --- a/impyute/util/__init__.py +++ b/impyute/util/__init__.py @@ -1,10 +1,7 @@ -""" -Diagnostic tools to find information about data. -""" +""" Diagnostic tools to find information about data. 
""" from .find_null import find_null from .describe import describe -# from .mcar_test import mcar_test from .count_missing import count_missing from .errors import BadInputError from .checks import checks @@ -13,5 +10,5 @@ from . import inverse_distance_weighting __all__ = ["find_null", "describe", "count_missing", - "checks", "compare", "BadInputError", "preprocess" + "checks", "compare", "BadInputError", "preprocess", "inverse_distance_weighting"] diff --git a/impyute/util/checks.py b/impyute/util/checks.py index f4173bc..6dbbaa7 100644 --- a/impyute/util/checks.py +++ b/impyute/util/checks.py @@ -3,7 +3,6 @@ import numpy as np from impyute.util import find_null from impyute.util import BadInputError -# pylint:disable=invalid-name def checks(fn): """ Main check function to ensure input is correctly formatted diff --git a/impyute/util/compare.py b/impyute/util/compare.py index 2286b86..3e70b2b 100644 --- a/impyute/util/compare.py +++ b/impyute/util/compare.py @@ -2,9 +2,7 @@ import importlib from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score -# pylint: disable=too-many-locals -# pylint: disable=dangerous-default-value - +# pylint: disable=too-many-locals, dangerous-default-value def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None): """ diff --git a/impyute/util/count_missing.py b/impyute/util/count_missing.py index 1f92d36..c5717de 100644 --- a/impyute/util/count_missing.py +++ b/impyute/util/count_missing.py @@ -2,7 +2,6 @@ import numpy as np from impyute.util import find_null - def count_missing(data): """ Calculate the total percentage of missing values and also the percentage in each column. 
diff --git a/impyute/util/describe.py b/impyute/util/describe.py index 374d869..c87c717 100644 --- a/impyute/util/describe.py +++ b/impyute/util/describe.py @@ -1,7 +1,6 @@ """ impyute.util.describe """ from impyute.util import find_null - def describe(data): # verbose=True): """ Print input/output multiple times diff --git a/impyute/util/errors.py b/impyute/util/errors.py index 91d3f78..7431283 100644 --- a/impyute/util/errors.py +++ b/impyute/util/errors.py @@ -1,7 +1,5 @@ """ impyute.util.errors """ -class BadInputError(Exception): - def __init__(self, value): - self.value = value - def __str__(self): - return self.value +class BadInputError(Exception): + "Error thrown when input args don't match spec" + pass diff --git a/impyute/util/find_null.py b/impyute/util/find_null.py index 5db594d..478c12d 100644 --- a/impyute/util/find_null.py +++ b/impyute/util/find_null.py @@ -1,7 +1,6 @@ """ impyute.util.find_null """ import numpy as np - def find_null(data): """ Finds the indices of all missing values. 
diff --git a/impyute/util/inverse_distance_weighting.py b/impyute/util/inverse_distance_weighting.py index 6956787..a5a8fa1 100644 --- a/impyute/util/inverse_distance_weighting.py +++ b/impyute/util/inverse_distance_weighting.py @@ -1,5 +1,6 @@ """ impyute.util.inverse_distance_weighting """ -import numpy as np +import numpy as np + def shepards(distances, power=2): """ Basic inverse distance weighting function diff --git a/impyute/util/mcar_test.py b/impyute/util/mcar_test.py deleted file mode 100644 index 80eee54..0000000 --- a/impyute/util/mcar_test.py +++ /dev/null @@ -1,19 +0,0 @@ -import numpy as np - - -def mcar_test(data): - """ Implementation of Little's MCAR Test - - Parameters - ---------- - np.ndarray - - Returns - ------ - Boolean; - """ - for ii, datapoint in enumerate(data.T): - datapoint = datapoint[~np.isnan(datapoint)] - datapoint.mean() - - return True diff --git a/impyute/util/preprocess.py b/impyute/util/preprocess.py index b2bd1e0..8d2986d 100644 --- a/impyute/util/preprocess.py +++ b/impyute/util/preprocess.py @@ -1,8 +1,8 @@ """ impyute.util.preprocess """ from functools import wraps -# pylint:disable=invalid-name -# TODO:Some hacky ass code to handle python2 not having `ModuleNotFoundError` +# Hacky way to handle python2 not having `ModuleNotFoundError` +# pylint: disable=redefined-builtin, missing-docstring try: raise ModuleNotFoundError except NameError: @@ -10,7 +10,7 @@ class ModuleNotFoundError(Exception): pass except ModuleNotFoundError: pass - +# pylint: enable=redefined-builtin, missing-docstring def preprocess(fn): """ Base preprocess function for commonly used preprocessing @@ -51,7 +51,6 @@ def wrapper(*args, **kwargs): if pd_DataFrame and isinstance(args[0], pd_DataFrame): args[0] = args[0].as_matrix() return pd_DataFrame(fn(*args, **kwargs)) - else: - return fn(*args, **kwargs) + return fn(*args, **kwargs) return wrapper