Merge pull request #74 from eltonlaw/GH-73

[#73] fix linting issues
eltonlaw · Jul 13, 2019 · 2c27fe8 · 2c27fe8
2 parents 075a9b2 + 2aa7919
commit 2c27fe8
Show file tree

Hide file tree

Showing 27 changed files with 63 additions and 120 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,4 @@ build/
 dist/
 impyute.egg-info/
 *.swo
+syntastic
diff --git a/.pylintrc b/.pylintrc
@@ -193,7 +193,7 @@ max-nested-blocks=5
 [FORMAT]
 
 # Maximum number of characters on a single line.
-max-line-length=79
+max-line-length=120
 
 # Regexp for a line that is allowed to be longer than the limit.
 ignore-long-lines=^\s*(# )?<?https?://\S+>?$
@@ -247,7 +247,7 @@ ignore-comments=yes
 ignore-docstrings=yes
 
 # Ignore imports when computing similarities.
-ignore-imports=no
+ignore-imports=yes
 
 
 [SPELLING]

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -15,10 +15,12 @@ This project was built for Python 3.5 and 3.6. (If you would like to work on ext
 
 Using [Sphinx's autodoc](http://www.sphinx-doc.org/en/stable/ext/autodoc.html) module, docstrings are used as the documentation. Make sure that all docstrings are formatted according to the [NumPy/SciPy Docstring Standard](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard)
 
-Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need pylint installed: [install pylint](https://www.pylint.org/#install)
+Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance (mostly) with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need pylint installed: [install pylint](https://www.pylint.org/#install). I recommend [integrating it with your editor](https://docs.pylint.org/en/1.6.0/ide-integration.html) or you can call it from bash with: 
 
     $ pylint --rcfile=.pylintrc impyute/
 
+Fix all warnings raised. If you feel that a warning isn't justified or serves no purpose, feel free to [disable the specific message](http://pylint.pycqa.org/en/latest/user_guide/message-control.html) for whatever blocks are causing it.
+
 To run unit tests you will need [Docker](https://docs.docker.com/install/). The unit testing framework used is the built-in one, [`unittest`](https://docs.python.org/3.6/library/unittest.html). Put unit tests in the `test` directory in root. The testing environment works like this: 1) Build a docker image with multiple python versions 2) Run the container with pytest for each python version.
 
     $ make test

diff --git a/impyute/__init__.py b/impyute/__init__.py
@@ -1,5 +1,8 @@
-"""
-Library of missing data imputations
+""" impyute: Data imputations library to preprocess datasets with missing data
+
+impyute.imputations.cs:   Imputations on cross sectional data
+impyute.imputations.ts:   Imputations on time series data
+impyute.deletion:         Deletion type missing data handling
 """
 # pylint: disable=wrong-import-position
 

diff --git a/impyute/dataset/__init__.py b/impyute/dataset/__init__.py
@@ -1,6 +1,4 @@
-"""
-Real-world/mock datasets and missingness corruptors to experiment with.
-"""
+""" Real-world/mock datasets and missingness corruptors to experiment with.  """
 from .base import randu
 from .base import randn
 from .base import test_data

diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py
@@ -1,14 +1,8 @@
-""" impyute.dataset.base
-
-Load/generate data
-
-"""
+""" Shared functions to load/generate data """
 import numpy as np
 from impyute.dataset.corrupt import Corruptor
 
-
-def randu(bound=(0, 10), shape=(5, 5), missingness="mcar",
-                   thr=0.2, dtype="int"):
+def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
     """ Return randomly generated dataset of numbers with uniformly
     distributed values between bound[0] and bound[1]
 
@@ -40,8 +34,7 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar",
     return raw_data
 
 
-def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2,
-                  dtype="float"):
+def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
     """ Return randomly generated dataset of numbers with normally
     distributed values with given mean and sigma.
 

diff --git a/impyute/deletion/__init__.py b/impyute/deletion/__init__.py
@@ -1,6 +1,4 @@
-"""
-Missing data approaches that delete values.
-"""
+""" Missing data approaches that delete values.  """
 
 from .complete_case import complete_case
 

diff --git a/impyute/imputation/__init__.py b/impyute/imputation/__init__.py
@@ -1,5 +1,3 @@
-"""
-Imputations for cross-sectional and time-series data.
-"""
+""" Imputations for cross-sectional and time-series data.  """
 
 __all__ = ["cs", "ts"]
diff --git a/impyute/imputation/cs/__init__.py b/impyute/imputation/cs/__init__.py
@@ -1,6 +1,4 @@
-"""
-Imputations for cross-sectional data.
-"""
+""" Imputations for cross-sectional data.  """
 
 from .random import random
 from .central_tendency import mean
@@ -10,5 +8,4 @@
 from .em import em
 from .fast_knn import fast_knn
 
-__all__ = ["random", "mean", "mode",
-           "median", "buck_iterative", "em", "fast_knn"]
+__all__ = ["random", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
diff --git a/impyute/imputation/cs/buck_iterative.py b/impyute/imputation/cs/buck_iterative.py
@@ -5,23 +5,19 @@
 from impyute.util import checks
 from impyute.util import preprocess
 # pylint: disable=too-many-locals
-# pylint:disable=invalid-name
-# pylint:disable=unused-argument
 
 @preprocess
 @checks
-def buck_iterative(data, **kwargs):
-    """Multivariate Imputation by Chained Equations
+def buck_iterative(data):
+    """ Iterative variant of Buck's method
 
-    Reference:
-        Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
-        Imputation by Chained Equations in R. Journal of Statistical Software,
-        45(3). doi:10.18637/jss.v045.i03
+    - Variable to regress on is chosen at random.
+    - EM type infinite regression loop stops after change in prediction from
+      previous prediction < 10% for all columns with missing values
 
-    Implementation follows the main idea from the paper above. Differs in
-    decision of which variable to regress on (here, I choose it at random).
-    Also differs in stopping criterion (here the model stops after change in
-    prediction from previous prediction is less than 10%).
+    A Method of Estimation of Missing Values in Multivariate Data Suitable for
+    use with an Electronic Computer S. F. Buck Journal of the Royal Statistical
+    Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306
 
     Parameters
     ----------
@@ -41,7 +37,7 @@ def buck_iterative(data, **kwargs):
 
     null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
     temp = []
-    cols_missing = set([y for _, y, _ in null_xyv])
+    cols_missing = {y for _, y, _ in null_xyv}
 
     # Step 1: Simple Imputation, these are just placeholders
     for x_i, y_i, value in null_xyv:
@@ -84,8 +80,5 @@ def buck_iterative(data, **kwargs):
                     delta = (new_value-value)/0.01
                 else:
                     delta = (new_value-value)/value
-                if abs(delta) < 0.1:
-                    converged[i] = True
-                else:
-                    converged[i] = False
+                converged[i] = abs(delta) < 0.1
     return data
diff --git a/impyute/imputation/cs/central_tendency.py b/impyute/imputation/cs/central_tendency.py
@@ -3,12 +3,10 @@
 from impyute.util import find_null
 from impyute.util import checks
 from impyute.util import preprocess
-# pylint:disable=unused-argument
-# pylint:disable=invalid-name
 
 @preprocess
 @checks
-def mean(data, **kwargs):
+def mean(data):
     """ Substitute missing values with the mean of that column.
 
     Parameters
@@ -31,7 +29,7 @@ def mean(data, **kwargs):
 
 @preprocess
 @checks
-def median(data, **kwargs):
+def median(data):
     """ Substitute missing values with the median of that column(middle).
 
     Parameters
@@ -58,7 +56,7 @@ def median(data, **kwargs):
 
 @preprocess
 @checks
-def mode(data, **kwargs):
+def mode(data):
     """ Substitute missing values with the mode of that column(most frequent).
 
     In the case that there is a tie (there are multiple, most frequent values)

diff --git a/impyute/imputation/cs/em.py b/impyute/imputation/cs/em.py
@@ -3,12 +3,10 @@
 from impyute.util import find_null
 from impyute.util import preprocess
 from impyute.util import checks
-# pylint:disable=invalid-name
-# pylint:disable=unused-argument
 
 @preprocess
 @checks
-def em(data, loops=50, **kwargs):
+def em(data, loops=50):
     """ Imputes given data using expectation maximization.
 
     E-step: Calculates the expected complete data log likelihood ratio.

diff --git a/impyute/imputation/cs/fast_knn.py b/impyute/imputation/cs/fast_knn.py
@@ -1,18 +1,16 @@
 """ impyute.imputation.cs.knn """
 import numpy as np
+from scipy.spatial import KDTree
 from impyute.util import find_null
 from impyute.util import checks
 from impyute.util import preprocess
 from impyute.util import inverse_distance_weighting as util_idw
 from impyute.imputation.cs import mean
-from scipy.spatial import KDTree
-# pylint: disable=invalid-name
-# pylint:disable=unused-argument
+# pylint: disable=too-many-arguments
 
 @preprocess
 @checks
-def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
-             idw=util_idw.shepards):
+def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, idw=util_idw.shepards):
     """ Impute using a variant of the nearest neighbours approach
 
     Basic idea: Impute array with a basic mean impute and then use the resulting complete
@@ -31,30 +29,34 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
     k: int, optional
         Parameter used for method querying the KDTree class object. Number of
         neighbours used in the KNN query. Refer to the docs for
-        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
 
     eps: nonnegative float, optional
         Parameter used for method querying the KDTree class object. From the
         SciPy docs: "Return approximate nearest neighbors; the kth returned
         value is guaranteed to be no further than (1+eps) times the distance to
         the real kth nearest neighbor". Refer to the docs for
-        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
 
     p : float, 1<=p<=infinity, optional
         Parameter used for method querying the KDTree class object. Straight from the
         SciPy docs: "Which Minkowski p-norm to use. 1 is the
         sum-of-absolute-values Manhattan distance 2 is the usual Euclidean
         distance infinity is the maximum-coordinate-difference distance". Refer to
         the docs for
-        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
 
     distance_upper_bound : nonnegative float, optional
         Parameter used for method querying the KDTree class object. Straight
         from the SciPy docs: "Return only neighbors within this distance. This
         is used to prune tree searches, so if you are doing a series of
         nearest-neighbor queries, it may help to supply the distance to the
         nearest neighbor of the most recent point." Refer to the docs for
-        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
 
     leafsize: int, optional
         Parameter used for construction of the `KDTree` class object. Straight from
@@ -65,8 +67,8 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
 
     idw: fn, optional
         Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom
-        one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using functools.partial,
-        for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
+        one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be customized using
+        functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
 
     Returns
     -------

diff --git a/impyute/imputation/cs/random.py b/impyute/imputation/cs/random.py
@@ -3,12 +3,10 @@
 from impyute.util import find_null
 from impyute.util import preprocess
 from impyute.util import checks
-# pylint:disable=invalid-name
-# pylint:disable=unused-argument
 
 @preprocess
 @checks
-def random(data, **kwargs):
+def random(data):
     """ Fill missing values in with a randomly selected value from the same
     column.
 

diff --git a/impyute/imputation/ts/__init__.py b/impyute/imputation/ts/__init__.py
@@ -1,8 +1,6 @@
-"""
-Imputations for time-series data.
-"""
+""" Imputations for time-series data.  """
 
 from .locf import locf
 from .moving_window import moving_window
 
-__all__ = ["locf", "moving_window"]  # , "dsae"]
+__all__ = ["locf", "moving_window"]
diff --git a/impyute/imputation/ts/locf.py b/impyute/imputation/ts/locf.py
@@ -4,7 +4,6 @@
 from impyute.util import checks
 from impyute.util import preprocess
 
-
 @preprocess
 @checks
 def locf(data, axis=0):

diff --git a/impyute/imputation/ts/moving_window.py b/impyute/imputation/ts/moving_window.py
@@ -3,13 +3,11 @@
 from impyute.util import find_null
 from impyute.util import checks
 from impyute.util import preprocess
-# pylint: disable=invalid-name
-# pylint:disable=unused-argument
+# pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition
 
 @preprocess
 @checks
-def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
-        inplace=False, **kwargs):
+def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, inplace=False):
     """ Interpolate the missing values based on nearby values.
 
     For example, with an array like this:
@@ -42,7 +40,6 @@ def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
     You can also do something like take 1.5x the max of previous values in the window:
 
         moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)
-    
 
     Parameters
     ----------

diff --git a/impyute/util/__init__.py b/impyute/util/__init__.py
@@ -1,10 +1,7 @@
-"""
-Diagnostic tools to find information about data.
-"""
+""" Diagnostic tools to find information about data.  """
 
 from .find_null import find_null
 from .describe import describe
-# from .mcar_test import mcar_test
 from .count_missing import count_missing
 from .errors import BadInputError
 from .checks import checks
@@ -13,5 +10,5 @@
 from . import inverse_distance_weighting
 
 __all__ = ["find_null", "describe", "count_missing",
-           "checks", "compare", "BadInputError", "preprocess"
+           "checks", "compare", "BadInputError", "preprocess",
            "inverse_distance_weighting"]
diff --git a/impyute/util/checks.py b/impyute/util/checks.py
@@ -3,7 +3,6 @@
 import numpy as np
 from impyute.util import find_null
 from impyute.util import BadInputError
-# pylint:disable=invalid-name
 
 def checks(fn):
     """ Main check function to ensure input is correctly formatted

diff --git a/impyute/util/compare.py b/impyute/util/compare.py
@@ -2,9 +2,7 @@
 import importlib
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
-# pylint: disable=too-many-locals
-# pylint: disable=dangerous-default-value
-
+# pylint: disable=too-many-locals, dangerous-default-value
 
 def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
     """

diff --git a/impyute/util/count_missing.py b/impyute/util/count_missing.py
@@ -2,7 +2,6 @@
 import numpy as np
 from impyute.util import find_null
 
-
 def count_missing(data):
     """ Calculate the total percentage of missing values and also the
     percentage in each column.

diff --git a/impyute/util/describe.py b/impyute/util/describe.py
@@ -1,7 +1,6 @@
 """ impyute.util.describe """
 from impyute.util import find_null
 
-
 def describe(data): # verbose=True):
     """ Print input/output multiple times
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,3 +12,4 @@ build/ @@
     dist/
     impyute.egg-info/
     *.swo
+    syntastic