Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP isolate convergence warning in LogReg #225

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ celer.egg-info
# build
build


# cache
.pytest_cache
__pycache__
Expand Down
25 changes: 25 additions & 0 deletions celer/tests/conv_warning/inspect_conv_warn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""For debugging purposes."""

import numpy as np
from numpy.linalg import norm
from sklearn.utils.estimator_checks import check_estimator

from celer.dropin_sklearn import LogisticRegression
from celer.utils.testing import build_dataset


# Fix the NumPy seed before building the dataset so runs are repeatable.
np.random.seed(1409)
X, y = build_dataset(n_samples=30, n_features=60, sparse_X=True)
y = np.sign(y)  # keep only the sign of the targets

# C is set to 20 / alpha_max, with alpha_max = ||X^T y||_inf / 2.
alpha_max = norm(X.T.dot(y), ord=np.inf) / 2
C = 20. / alpha_max

tol = 1e-4
clf1 = LogisticRegression(C=C, tol=tol, verbose=0)

# Index of the first scikit-learn check to run; earlier checks are skipped.
# NOTE(review): presumably the first check that triggers the convergence
# warning -- confirm before relying on this value.
FIRST_CHECK_INDEX = 37

# With generate_only=True the (estimator, check) pairs are collected here
# and run one by one in the loop below.
checks = list(check_estimator(clf1, generate_only=True))

# The loop variable is named `check` so that it does not rebind the imported
# `check_estimator` function.
for estimator, check in checks[FIRST_CHECK_INDEX:]:
    check(estimator)
Binary file added celer/tests/conv_warning/logs/dumped_data_.pkl
Binary file not shown.
Binary file not shown.
48 changes: 48 additions & 0 deletions celer/tests/conv_warning/reproduce_warning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Note:

Use 'celer/tests/conv_warning/logs/dumped_data_.pkl'
as `dumped_data_path` to run the script from a terminal
(the default path is relative to the current working directory).
"""

import pickle
import numpy as np
from numpy.linalg import norm
from celer import LogisticRegression
from sklearn.linear_model import LogisticRegression as sk_LR


# Read the pickled dictionary of recorded inputs.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on the dump shipped alongside this script.
dumped_data_path = './logs/dumped_data_.pkl'
with open(dumped_data_path, 'rb') as pkl_file:
    DICT_DATA = pickle.load(pkl_file)

# Inputs stored under the 'check_fit_idempotent' entry.
data = DICT_DATA['check_fit_idempotent']
# X: data centered around 100
X, y, C = (data[key] for key in ("X", "y", "C"))

Xty = X.T @ y
C_min = 2 / norm(Xty, ord=np.inf)

# C is very high relative to C_min (higher is more difficult):
print(f"C / Cmin {C / C_min:.2e}")

# Fitting with the stored C is what produces the warning:
warning_model = LogisticRegression(C=C, verbose=0)
clf = warning_model.fit(X, y)

# With a lower C no warning is raised:
low_C = 50 * C_min
LogisticRegression(C=low_C, verbose=1).fit(X, y)

###############################################################################
# The regular "celer" solver converges but takes >= 10_000 epochs:
celer_model = LogisticRegression(C=C, solver="celer", verbose=1, tol=1e-10)
clf = celer_model.fit(X, y)

# liblinear does not seem to (original author's note):
liblinear_model = sk_LR(
    C=C,
    fit_intercept=False,
    penalty="l1",
    solver="liblinear",
    max_iter=10000,
    tol=1e-8,
)
clf_sk = liblinear_model.fit(X, y)


# Finally, centering the columns yields convergence in very few iterations.
X_c = X - X.mean(axis=0)

# NOTE(review): this recomputed C_min is not used by the fit below.
Xcty = X_c.T @ y
C_min = 2 / np.max(np.abs(Xcty))
centered_model = LogisticRegression(
    C=C, verbose=2, solver="celer-pn", tol=1e-10)
clf = centered_model.fit(X_c, y)