Skip to content

Commit

Permalink
Merge pull request #74 from eltonlaw/GH-73
Browse files Browse the repository at this point in the history
[#73] fix linting issues
  • Loading branch information
eltonlaw authored Jul 13, 2019
2 parents 075a9b2 + 2aa7919 commit 2c27fe8
Show file tree
Hide file tree
Showing 27 changed files with 63 additions and 120 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ build/
dist/
impyute.egg-info/
*.swo
syntastic
4 changes: 2 additions & 2 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ max-nested-blocks=5
[FORMAT]

# Maximum number of characters on a single line.
max-line-length=79
max-line-length=120

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
Expand Down Expand Up @@ -247,7 +247,7 @@ ignore-comments=yes
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no
ignore-imports=yes


[SPELLING]
Expand Down
4 changes: 3 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ This project was built for Python 3.5 and 3.6. (If you would like to work on ext

Using [Sphinx's autodoc](http://www.sphinx-doc.org/en/stable/ext/autodoc.html) module, docstrings are used as the documentation. Make sure that all docstrings are formatted according to the [NumPy/SciPy Docstring Standard](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard)

Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need pylint installed: [install pylint](https://www.pylint.org/#install)
Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance (mostly) with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need pylint installed: [install pylint](https://www.pylint.org/#install). I recommend [integrating it with your editor](https://docs.pylint.org/en/1.6.0/ide-integration.html) or you can call it from bash with:

$ pylint --rcfile=.pylintrc impyute/

Fix all warnings raised. If you feel that a warning isn't justified or serves no purpose, feel free to [disable the specific message](http://pylint.pycqa.org/en/latest/user_guide/message-control.html) for whatever blocks are causing it.

To run unit tests you will need [Docker](https://docs.docker.com/install/). The unit testing framework used is the built-in one, [`unittest`](https://docs.python.org/3.6/library/unittest.html). Put unit tests in the `test` directory in root. The testing environment works like this: 1) Build a docker image with multiple python versions 2) Run the container with pytest for each python version.

$ make test
Expand Down
7 changes: 5 additions & 2 deletions impyute/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""
Library of missing data imputations
""" impyute: Data imputations library to preprocess datasets with missing data
impyute.imputation.cs: Imputations on cross sectional data
impyute.imputation.ts: Imputations on time series data
impyute.deletion: Deletion type missing data handling
"""
# pylint: disable=wrong-import-position

Expand Down
4 changes: 1 addition & 3 deletions impyute/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
Real-world/mock datasets and missingness corruptors to experiment with.
"""
""" Real-world/mock datasets and missingness corruptors to experiment with. """
from .base import randu
from .base import randn
from .base import test_data
Expand Down
13 changes: 3 additions & 10 deletions impyute/dataset/base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
""" impyute.dataset.base
Load/generate data
"""
""" Shared functions to load/generate data """
import numpy as np
from impyute.dataset.corrupt import Corruptor


def randu(bound=(0, 10), shape=(5, 5), missingness="mcar",
thr=0.2, dtype="int"):
def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
""" Return randomly generated dataset of numbers with uniformly
distributed values between bound[0] and bound[1]
Expand Down Expand Up @@ -40,8 +34,7 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar",
return raw_data


def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2,
dtype="float"):
def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
""" Return randomly generated dataset of numbers with normally
distributed values with the given mean and sigma.
Expand Down
4 changes: 1 addition & 3 deletions impyute/deletion/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
Missing data approaches that delete values.
"""
""" Missing data approaches that delete values. """

from .complete_case import complete_case

Expand Down
4 changes: 1 addition & 3 deletions impyute/imputation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
"""
Imputations for cross-sectional and time-series data.
"""
""" Imputations for cross-sectional and time-series data. """

__all__ = ["cs", "ts"]
7 changes: 2 additions & 5 deletions impyute/imputation/cs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
Imputations for cross-sectional data.
"""
""" Imputations for cross-sectional data. """

from .random import random
from .central_tendency import mean
Expand All @@ -10,5 +8,4 @@
from .em import em
from .fast_knn import fast_knn

__all__ = ["random", "mean", "mode",
"median", "buck_iterative", "em", "fast_knn"]
__all__ = ["random", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
27 changes: 10 additions & 17 deletions impyute/imputation/cs/buck_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,19 @@
from impyute.util import checks
from impyute.util import preprocess
# pylint: disable=too-many-locals
# pylint:disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def buck_iterative(data, **kwargs):
"""Multivariate Imputation by Chained Equations
def buck_iterative(data):
""" Iterative variant of buck's method
Reference:
Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
Imputation by Chained Equations in R. Journal of Statistical Software,
45(3). doi:10.18637/jss.v045.i03
- Variable to regress on is chosen at random.
- EM type infinite regression loop stops after change in prediction from
previous prediction < 10% for all columns with missing values
Implementation follows the main idea from the paper above. Differs in
decision of which variable to regress on (here, I choose it at random).
Also differs in stopping criterion (here the model stops after change in
prediction from previous prediction is less than 10%).
Buck, S. F. (1960). A Method of Estimation of Missing Values in Multivariate
Data Suitable for use with an Electronic Computer. Journal of the Royal
Statistical Society. Series B (Methodological), Vol. 22, No. 2, pp. 302-306.
Parameters
----------
Expand All @@ -41,7 +37,7 @@ def buck_iterative(data, **kwargs):

null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
temp = []
cols_missing = set([y for _, y, _ in null_xyv])
cols_missing = {y for _, y, _ in null_xyv}

# Step 1: Simple Imputation, these are just placeholders
for x_i, y_i, value in null_xyv:
Expand Down Expand Up @@ -84,8 +80,5 @@ def buck_iterative(data, **kwargs):
delta = (new_value-value)/0.01
else:
delta = (new_value-value)/value
if abs(delta) < 0.1:
converged[i] = True
else:
converged[i] = False
converged[i] = abs(delta) < 0.1
return data
8 changes: 3 additions & 5 deletions impyute/imputation/cs/central_tendency.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
# pylint:disable=unused-argument
# pylint:disable=invalid-name

@preprocess
@checks
def mean(data, **kwargs):
def mean(data):
""" Substitute missing values with the mean of that column.
Parameters
Expand All @@ -31,7 +29,7 @@ def mean(data, **kwargs):

@preprocess
@checks
def median(data, **kwargs):
def median(data):
""" Substitute missing values with the median of that column(middle).
Parameters
Expand All @@ -58,7 +56,7 @@ def median(data, **kwargs):

@preprocess
@checks
def mode(data, **kwargs):
def mode(data):
""" Substitute missing values with the mode of that column(most frequent).
In the case that there is a tie (there are multiple, most frequent values)
Expand Down
4 changes: 1 addition & 3 deletions impyute/imputation/cs/em.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from impyute.util import find_null
from impyute.util import preprocess
from impyute.util import checks
# pylint:disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def em(data, loops=50, **kwargs):
def em(data, loops=50):
""" Imputes given data using expectation maximization.
E-step: Calculates the expected complete data log likelihood ratio.
Expand Down
24 changes: 13 additions & 11 deletions impyute/imputation/cs/fast_knn.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
""" impyute.imputation.cs.fast_knn """
import numpy as np
from scipy.spatial import KDTree
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
from impyute.util import inverse_distance_weighting as util_idw
from impyute.imputation.cs import mean
from scipy.spatial import KDTree
# pylint: disable=invalid-name
# pylint:disable=unused-argument
# pylint: disable=too-many-arguments

@preprocess
@checks
def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
idw=util_idw.shepards):
def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, idw=util_idw.shepards):
""" Impute using a variant of the nearest neighbours approach
Basic idea: Impute array with a basic mean impute and then use the resulting complete
Expand All @@ -31,30 +29,34 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
k: int, optional
Parameter used for method querying the KDTree class object. Number of
neighbours used in the KNN query. Refer to the docs for
[`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
[`scipy.spatial.KDTree.query`]
(https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
eps: nonnegative float, optional
Parameter used for method querying the KDTree class object. From the
SciPy docs: "Return approximate nearest neighbors; the kth returned
value is guaranteed to be no further than (1+eps) times the distance to
the real kth nearest neighbor". Refer to the docs for
[`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
[`scipy.spatial.KDTree.query`]
(https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
p : float, 1<=p<=infinity, optional
Parameter used for method querying the KDTree class object. Straight from the
SciPy docs: "Which Minkowski p-norm to use. 1 is the
sum-of-absolute-values Manhattan distance 2 is the usual Euclidean
distance infinity is the maximum-coordinate-difference distance". Refer to
the docs for
[`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
[`scipy.spatial.KDTree.query`]
(https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
distance_upper_bound : nonnegative float, optional
Parameter used for method querying the KDTree class object. Straight
from the SciPy docs: "Return only neighbors within this distance. This
is used to prune tree searches, so if you are doing a series of
nearest-neighbor queries, it may help to supply the distance to the
nearest neighbor of the most recent point." Refer to the docs for
[`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
[`scipy.spatial.KDTree.query`]
(https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
leafsize: int, optional
Parameter used for construction of the `KDTree` class object. Straight from
Expand All @@ -65,8 +67,8 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
idw: fn, optional
Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom
one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using functools.partial,
for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
one or bootstrap from functions defined in `impy.util.inverse_distance_weighting`, which can be customized
using functools.partial, for example:
`functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
Returns
-------
Expand Down
4 changes: 1 addition & 3 deletions impyute/imputation/cs/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from impyute.util import find_null
from impyute.util import preprocess
from impyute.util import checks
# pylint:disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def random(data, **kwargs):
def random(data):
""" Fill missing values in with a randomly selected value from the same
column.
Expand Down
6 changes: 2 additions & 4 deletions impyute/imputation/ts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
"""
Imputations for time-series data.
"""
""" Imputations for time-series data. """

from .locf import locf
from .moving_window import moving_window

__all__ = ["locf", "moving_window"] # , "dsae"]
__all__ = ["locf", "moving_window"]
1 change: 0 additions & 1 deletion impyute/imputation/ts/locf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from impyute.util import checks
from impyute.util import preprocess


@preprocess
@checks
def locf(data, axis=0):
Expand Down
7 changes: 2 additions & 5 deletions impyute/imputation/ts/moving_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
# pylint: disable=invalid-name
# pylint:disable=unused-argument
# pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition

@preprocess
@checks
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
inplace=False, **kwargs):
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, inplace=False):
""" Interpolate the missing values based on nearby values.
For example, with an array like this:
Expand Down Expand Up @@ -42,7 +40,6 @@ def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
You can also do something like take 1.5x the max of previous values in the window:
moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)
Parameters
----------
Expand Down
7 changes: 2 additions & 5 deletions impyute/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
"""
Diagnostic tools to find information about data.
"""
""" Diagnostic tools to find information about data. """

from .find_null import find_null
from .describe import describe
# from .mcar_test import mcar_test
from .count_missing import count_missing
from .errors import BadInputError
from .checks import checks
Expand All @@ -13,5 +10,5 @@
from . import inverse_distance_weighting

__all__ = ["find_null", "describe", "count_missing",
"checks", "compare", "BadInputError", "preprocess"
"checks", "compare", "BadInputError", "preprocess",
"inverse_distance_weighting"]
1 change: 0 additions & 1 deletion impyute/util/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import numpy as np
from impyute.util import find_null
from impyute.util import BadInputError
# pylint:disable=invalid-name

def checks(fn):
""" Main check function to ensure input is correctly formatted
Expand Down
4 changes: 1 addition & 3 deletions impyute/util/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
import importlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# pylint: disable=too-many-locals
# pylint: disable=dangerous-default-value

# pylint: disable=too-many-locals, dangerous-default-value

def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
"""
Expand Down
1 change: 0 additions & 1 deletion impyute/util/count_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import numpy as np
from impyute.util import find_null


def count_missing(data):
""" Calculate the total percentage of missing values and also the
percentage in each column.
Expand Down
1 change: 0 additions & 1 deletion impyute/util/describe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
""" impyute.util.describe """
from impyute.util import find_null


def describe(data): # verbose=True):
""" Print input/output multiple times
Expand Down
Loading

0 comments on commit 2c27fe8

Please sign in to comment.