From b28ec85f42db42492089b25d9227e4ce31ac48ff Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Tue, 4 May 2021 10:56:07 +0200 Subject: [PATCH 1/9] update years in the license --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index c072093..55e04a9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2020 Malte S. Kurz +Copyright (c) 2020-2021 Malte S. Kurz Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 78f605554a3f64d4cbb5e25349f4c860e0877672 Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Fri, 21 May 2021 16:53:23 +0200 Subject: [PATCH 2/9] add a unit test run with the dev version of DoubleML --- .github/workflows/pytest.yml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index d9279bf..80becb6 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -19,14 +19,29 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.6', '3.7', '3.8', '3.9'] + config: + - {python-version: '3.6', doubleml-version: 'release'} + - {python-version: '3.7', doubleml-version: 'release'} + - {python-version: '3.8', doubleml-version: 'release'} + - {python-version: '3.8', doubleml-version: 'dev'} + - {python-version: '3.9', doubleml-version: 'release'} steps: - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.config.python-version }} uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.config.python-version }} + - uses: actions/checkout@v2 + if: matrix.config.doubleml-version == 'dev' + with: + repository: DoubleML/doubleml-for-py + path: doubleml-for-py + - name: DoubleML dev version + if: matrix.config.doubleml-version == 'dev' + run: | + cd doubleml-for-py + pip install --editable . - name: Install dependencies run: | python -m pip install --upgrade pip From 2889a208ca28f077043058c2bd0e4ddb43d34c77 Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Fri, 4 Feb 2022 09:25:02 +0100 Subject: [PATCH 3/9] update the citation info to JMLR --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f4c48eb..d0080c7 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ DoubleML-Serverless is an extension for serverless cloud computing of the Python DoubleML is available via PyPI [https://pypi.org/project/DoubleML](https://pypi.org/project/DoubleML) and on GitHub [https://github.com/DoubleML/doubleml-for-py](https://github.com/DoubleML/doubleml-for-py). The Python package DoubleML was introduced in "DoubleML - An Object-Oriented Implementation of Double Machine Learning in Python" -([Bach et al., 2021](https://arxiv.org/abs/2104.03220)) +([Bach et al., 2022](https://www.jmlr.org/papers/v23/21-0862.html)) and a detailed documentation \& user guide for the package is available at [https://docs.doubleml.org](https://docs.doubleml.org). @@ -149,9 +149,10 @@ Bibtex-entry: ## References -Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M. (2021). -DoubleML - An Object-Oriented Implementation of Double Machine Learning in Python. -arXiv:[2104.03220](https://arxiv.org/abs/2104.03220). +Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M. (2022), DoubleML - An +Object-Oriented Implementation of Double Machine Learning in Python, +Journal of Machine Learning Research, 23(53): 1-6, +[https://www.jmlr.org/papers/v23/21-0862.html](https://www.jmlr.org/papers/v23/21-0862.html). Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018). Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. From f9833f6a7268b2cbfca1cd578e4beb04d11fe864 Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Tue, 14 Jun 2022 08:31:30 +0200 Subject: [PATCH 4/9] install xgboost which is used in the unit tests of DoubleML --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 80becb6..b0abdbd 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -45,7 +45,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install pytest + python -m pip install pytest xgboost pip install -r requirements.txt pip install . - name: Test with pytest From 55de8b6d51a9f94baba82921ebb4dd80895c859e Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Tue, 14 Jun 2022 08:48:24 +0200 Subject: [PATCH 5/9] only run the serverless tests --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index b0abdbd..88d41eb 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -50,4 +50,4 @@ jobs: pip install . - name: Test with pytest run: | - pytest + pytest doubleml_serverless/ From e368ad8dd9347547d5455455bdf9be50924c196c Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Wed, 15 Jun 2022 09:11:54 +0200 Subject: [PATCH 6/9] use kwargs only to call the initializer of super classes --- .../double_ml_iivm_aws_lambda.py | 30 +++++++++---------- .../double_ml_irm_aws_lambda.py | 26 ++++++++-------- .../double_ml_pliv_aws_lambda.py | 26 ++++++++-------- .../double_ml_plr_aws_lambda.py | 24 +++++++-------- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/doubleml_serverless/double_ml_iivm_aws_lambda.py b/doubleml_serverless/double_ml_iivm_aws_lambda.py index dc08893..25ee143 100644 --- a/doubleml_serverless/double_ml_iivm_aws_lambda.py +++ b/doubleml_serverless/double_ml_iivm_aws_lambda.py @@ -26,22 +26,22 @@ def __init__(self, draw_sample_splitting=True, apply_cross_fitting=True): DoubleMLIIVM.__init__(self, - obj_dml_data, - ml_g, - ml_m, - ml_r, - n_folds, - n_rep, - score, - subgroups, - dml_procedure, - trimming_rule, - trimming_threshold, - draw_sample_splitting, - apply_cross_fitting) + obj_dml_data=obj_dml_data, + ml_g=ml_g, + ml_m=ml_m, + ml_r=ml_r, + n_folds=n_folds, + n_rep=n_rep, + score=score, + subgroups=subgroups, + dml_procedure=dml_procedure, + trimming_rule=trimming_rule, + trimming_threshold=trimming_threshold, + draw_sample_splitting=draw_sample_splitting, + apply_cross_fitting=apply_cross_fitting) DoubleMLLambda.__init__(self, - lambda_function_name, - aws_region) + lambda_function_name=lambda_function_name, + aws_region=aws_region) def _ml_nuisance_aws_lambda(self, cv_params): assert self._dml_data.n_treat == 1 diff --git a/doubleml_serverless/double_ml_irm_aws_lambda.py b/doubleml_serverless/double_ml_irm_aws_lambda.py index 3041611..5b7e4f8 100644 --- a/doubleml_serverless/double_ml_irm_aws_lambda.py +++ b/doubleml_serverless/double_ml_irm_aws_lambda.py @@ -24,20 +24,20 @@ def __init__(self, draw_sample_splitting=True, apply_cross_fitting=True): DoubleMLIRM.__init__(self, - obj_dml_data, - ml_g, - ml_m, - n_folds, - n_rep, - score, - dml_procedure, - trimming_rule, - trimming_threshold, - draw_sample_splitting, - apply_cross_fitting) + obj_dml_data=obj_dml_data, + ml_g=ml_g, + ml_m=ml_m, + n_folds=n_folds, + n_rep=n_rep, + score=score, + dml_procedure=dml_procedure, + trimming_rule=trimming_rule, + trimming_threshold=trimming_threshold, + draw_sample_splitting=draw_sample_splitting, + apply_cross_fitting=apply_cross_fitting) DoubleMLLambda.__init__(self, - lambda_function_name, - aws_region) + lambda_function_name=lambda_function_name, + aws_region=aws_region) def _ml_nuisance_aws_lambda(self, cv_params): assert self._dml_data.n_treat == 1 diff --git a/doubleml_serverless/double_ml_pliv_aws_lambda.py b/doubleml_serverless/double_ml_pliv_aws_lambda.py index 6fee671..e63a9b3 100644 --- a/doubleml_serverless/double_ml_pliv_aws_lambda.py +++ b/doubleml_serverless/double_ml_pliv_aws_lambda.py @@ -11,7 +11,7 @@ def __init__(self, lambda_function_name, aws_region, obj_dml_data, - ml_g, + ml_l, ml_m, ml_r, n_folds=5, @@ -21,19 +21,19 @@ def __init__(self, draw_sample_splitting=True, apply_cross_fitting=True): DoubleMLPLIV.__init__(self, - obj_dml_data, - ml_g, - ml_m, - ml_r, - n_folds, - n_rep, - score, - dml_procedure, - draw_sample_splitting, - apply_cross_fitting) + obj_dml_data=obj_dml_data, + ml_l=ml_l, + ml_m=ml_m, + ml_r=ml_r, + n_folds=n_folds, + n_rep=n_rep, + score=score, + dml_procedure=dml_procedure, + draw_sample_splitting=draw_sample_splitting, + apply_cross_fitting=apply_cross_fitting) DoubleMLLambda.__init__(self, - lambda_function_name, - aws_region) + lambda_function_name=lambda_function_name, + aws_region=aws_region) def _ml_nuisance_aws_lambda(self, cv_params): assert self._dml_data.n_treat == 1 diff --git a/doubleml_serverless/double_ml_plr_aws_lambda.py b/doubleml_serverless/double_ml_plr_aws_lambda.py index a1677ad..af636d1 100644 --- a/doubleml_serverless/double_ml_plr_aws_lambda.py +++ b/doubleml_serverless/double_ml_plr_aws_lambda.py @@ -11,7 +11,7 @@ def __init__(self, lambda_function_name, aws_region, obj_dml_data, - ml_g, + ml_l, ml_m, n_folds=5, n_rep=1, @@ -20,18 +20,18 @@ def __init__(self, draw_sample_splitting=True, apply_cross_fitting=True): DoubleMLPLR.__init__(self, - obj_dml_data, - ml_g, - ml_m, - n_folds, - n_rep, - score, - dml_procedure, - draw_sample_splitting, - apply_cross_fitting) + obj_dml_data=obj_dml_data, + ml_l=ml_l, + ml_m=ml_m, + n_folds=n_folds, + n_rep=n_rep, + score=score, + dml_procedure=dml_procedure, + draw_sample_splitting=draw_sample_splitting, + apply_cross_fitting=apply_cross_fitting) DoubleMLLambda.__init__(self, - lambda_function_name, - aws_region) + lambda_function_name=lambda_function_name, + aws_region=aws_region) def _ml_nuisance_aws_lambda(self, cv_params): assert self._dml_data.n_treat == 1 From 1f97643eaafb4f74387ef5267531d1ccebec4239 Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Wed, 15 Jun 2022 09:13:31 +0200 Subject: [PATCH 7/9] remove unused imports --- doubleml_serverless/double_ml_irm_aws_lambda.py | 1 - doubleml_serverless/double_ml_plr_aws_lambda.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doubleml_serverless/double_ml_irm_aws_lambda.py b/doubleml_serverless/double_ml_irm_aws_lambda.py index 5b7e4f8..e3cf121 100644 --- a/doubleml_serverless/double_ml_irm_aws_lambda.py +++ b/doubleml_serverless/double_ml_irm_aws_lambda.py @@ -1,5 +1,4 @@ from doubleml import DoubleMLIRM -import numpy as np from sklearn.utils import check_X_y from ._helper import _get_cond_smpls diff --git a/doubleml_serverless/double_ml_plr_aws_lambda.py b/doubleml_serverless/double_ml_plr_aws_lambda.py index af636d1..603811c 100644 --- a/doubleml_serverless/double_ml_plr_aws_lambda.py +++ b/doubleml_serverless/double_ml_plr_aws_lambda.py @@ -1,9 +1,8 @@ from doubleml import DoubleMLPLR -import numpy as np from sklearn.utils import check_X_y from .double_ml_aws_lambda import DoubleMLLambda -from ._helper import _attach_learner, _attach_smpls, _extract_preds +from ._helper import _attach_learner, _attach_smpls class DoubleMLPLRServerless(DoubleMLPLR, DoubleMLLambda): From a3868df90395ad165fca8f9a137084a512710cf6 Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Wed, 15 Jun 2022 09:22:50 +0200 Subject: [PATCH 8/9] rename nuisance parts; deactivate IV-type score; #7 --- .../double_ml_pliv_aws_lambda.py | 11 +++++----- .../double_ml_plr_aws_lambda.py | 11 +++++----- doubleml_serverless/tests/test_pliv.py | 20 ++++++++--------- doubleml_serverless/tests/test_plr.py | 22 +++++++++---------- 4 files changed, 33 insertions(+), 31 deletions(-) diff --git a/doubleml_serverless/double_ml_pliv_aws_lambda.py b/doubleml_serverless/double_ml_pliv_aws_lambda.py index e63a9b3..abb1689 100644 --- a/doubleml_serverless/double_ml_pliv_aws_lambda.py +++ b/doubleml_serverless/double_ml_pliv_aws_lambda.py @@ -47,12 +47,12 @@ def _ml_nuisance_aws_lambda(self, cv_params): payload = self._dml_data.get_payload() - payload_ml_g = payload.copy() + payload_ml_l = payload.copy() payload_ml_m = payload.copy() payload_ml_r = payload.copy() - _attach_learner(payload_ml_g, - 'ml_g', self.learner['ml_g'], + _attach_learner(payload_ml_l, + 'ml_l', self.learner['ml_l'], self._dml_data.y_col, self._dml_data.x_cols) _attach_learner(payload_ml_m, @@ -63,7 +63,7 @@ def _ml_nuisance_aws_lambda(self, cv_params): 'ml_r', self.learner['ml_r'], self._dml_data.d_cols[0], self._dml_data.x_cols) - payloads = _attach_smpls([payload_ml_g, payload_ml_m, payload_ml_r], + payloads = _attach_smpls([payload_ml_l, payload_ml_m, payload_ml_r], [self.smpls, self.smpls, self.smpls], self.n_folds, self.n_rep, @@ -80,9 +80,10 @@ def _ml_nuisance_aws_lambda(self, cv_params): # compute score elements self._psi_a[:, i_rep, self._i_treat], self._psi_b[:, i_rep, self._i_treat] = \ self._score_elements(y, z, d, - preds['ml_g'][:, i_rep], + preds['ml_l'][:, i_rep], preds['ml_m'][:, i_rep], preds['ml_r'][:, i_rep], + None, self.smpls[i_rep]) return diff --git a/doubleml_serverless/double_ml_plr_aws_lambda.py b/doubleml_serverless/double_ml_plr_aws_lambda.py index 603811c..7f83ac1 100644 --- a/doubleml_serverless/double_ml_plr_aws_lambda.py +++ b/doubleml_serverless/double_ml_plr_aws_lambda.py @@ -41,18 +41,18 @@ def _ml_nuisance_aws_lambda(self, cv_params): payload = self._dml_data.get_payload() - payload_ml_g = payload.copy() + payload_ml_l = payload.copy() payload_ml_m = payload.copy() - _attach_learner(payload_ml_g, - 'ml_g', self.learner['ml_g'], + _attach_learner(payload_ml_l, + 'ml_l', self.learner['ml_l'], self._dml_data.y_col, self._dml_data.x_cols) _attach_learner(payload_ml_m, 'ml_m', self.learner['ml_m'], self._dml_data.d_cols[0], self._dml_data.x_cols) - payloads = _attach_smpls([payload_ml_g, payload_ml_m], + payloads = _attach_smpls([payload_ml_l, payload_ml_m], [self.smpls, self.smpls], self.n_folds, self.n_rep, @@ -69,8 +69,9 @@ def _ml_nuisance_aws_lambda(self, cv_params): # compute score elements self._psi_a[:, i_rep, self._i_treat], self._psi_b[:, i_rep, self._i_treat] = \ self._score_elements(y, d, - preds['ml_g'][:, i_rep], + preds['ml_l'][:, i_rep], preds['ml_m'][:, i_rep], + None, self.smpls[i_rep]) return diff --git a/doubleml_serverless/tests/test_pliv.py b/doubleml_serverless/tests/test_pliv.py index b5c9e83..2adb897 100644 --- a/doubleml_serverless/tests/test_pliv.py +++ b/doubleml_serverless/tests/test_pliv.py @@ -58,7 +58,7 @@ def dml_pliv_fixture(generate_data_pliv, idx, learner, score, dml_procedure): x_cols = data.columns[data.columns.str.startswith('X')].tolist() # Set machine learning methods for m & g - ml_g = clone(learner) + ml_l = clone(learner) ml_m = clone(learner) ml_r = clone(learner) @@ -66,8 +66,8 @@ def dml_pliv_fixture(generate_data_pliv, idx, learner, score, dml_procedure): dml_data_json = dml_lambda.DoubleMLDataJson(data, 'y', ['d'], x_cols, 'Z1') dml_pliv_lambda = DoubleMLPLIVServerlessLocal('local', 'local', dml_data_json, - ml_g, ml_m, ml_r, - n_folds, + ml_l, ml_m, ml_r, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) @@ -76,8 +76,8 @@ def dml_pliv_fixture(generate_data_pliv, idx, learner, score, dml_procedure): np.random.seed(3141) dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1') dml_pliv = dml.DoubleMLPLIV(dml_data, - ml_g, ml_m, ml_r, - n_folds, + ml_l, ml_m, ml_r, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) @@ -140,7 +140,7 @@ def dml_pliv_scaling_fixture(generate_data_pliv, idx, learner, score, dml_proced x_cols = data.columns[data.columns.str.startswith('X')].tolist() # Set machine learning methods for m & g - ml_g = clone(learner) + ml_l = clone(learner) ml_m = clone(learner) ml_r = clone(learner) @@ -149,8 +149,8 @@ def dml_pliv_scaling_fixture(generate_data_pliv, idx, learner, score, dml_proced np.random.seed(3141) dml_pliv_folds = DoubleMLPLIVServerlessLocal('local', 'local', dml_data_json, - ml_g, ml_m, ml_r, - n_folds, + ml_l, ml_m, ml_r, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) @@ -159,8 +159,8 @@ def dml_pliv_scaling_fixture(generate_data_pliv, idx, learner, score, dml_proced np.random.seed(3141) dml_pliv_reps = DoubleMLPLIVServerlessLocal('local', 'local', dml_data_json, - ml_g, ml_m, ml_r, - n_folds, + ml_l, ml_m, ml_r, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) diff --git a/doubleml_serverless/tests/test_plr.py b/doubleml_serverless/tests/test_plr.py index 13e281b..ef44251 100644 --- a/doubleml_serverless/tests/test_plr.py +++ b/doubleml_serverless/tests/test_plr.py @@ -32,7 +32,7 @@ def learner(request): @pytest.fixture(scope='module', - params=['IV-type', 'partialling out']) + params=['partialling out']) def score(request): return request.param @@ -58,15 +58,15 @@ def dml_plr_fixture(generate_data_plr, idx, learner, score, dml_procedure): x_cols = data.columns[data.columns.str.startswith('X')].tolist() # Set machine learning methods for m & g - ml_g = clone(learner) + ml_l = clone(learner) ml_m = clone(learner) np.random.seed(3141) dml_data_json = dml_lambda.DoubleMLDataJson(data, 'y', ['d'], x_cols) dml_plr_lambda = DoubleMLPLRServerlessLocal('local', 'local', dml_data_json, - ml_g, ml_m, - n_folds, + ml_l, ml_m, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) @@ -75,8 +75,8 @@ def dml_plr_fixture(generate_data_plr, idx, learner, score, dml_procedure): np.random.seed(3141) dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) dml_plr = dml.DoubleMLPLR(dml_data, - ml_g, ml_m, - n_folds, + ml_l, ml_m, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) @@ -139,7 +139,7 @@ def dml_plr_scaling_fixture(generate_data_plr, idx, learner, score, dml_procedur x_cols = data.columns[data.columns.str.startswith('X')].tolist() # Set machine learning methods for m & g - ml_g = clone(learner) + ml_l = clone(learner) ml_m = clone(learner) dml_data_json = dml_lambda.DoubleMLDataJson(data, 'y', ['d'], x_cols) @@ -147,8 +147,8 @@ def dml_plr_scaling_fixture(generate_data_plr, idx, learner, score, dml_procedur np.random.seed(3141) dml_plr_folds = DoubleMLPLRServerlessLocal('local', 'local', dml_data_json, - ml_g, ml_m, - n_folds, + ml_l, ml_m, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) @@ -157,8 +157,8 @@ def dml_plr_scaling_fixture(generate_data_plr, idx, learner, score, dml_procedur np.random.seed(3141) dml_plr_reps = DoubleMLPLRServerlessLocal('local', 'local', dml_data_json, - ml_g, ml_m, - n_folds, + ml_l, ml_m, + n_folds=n_folds, score=score, dml_procedure=dml_procedure) From fdc7db95f8015abebaf2ad412735c4365c779671 Mon Sep 17 00:00:00 2001 From: "Malte S. Kurz" Date: Wed, 15 Jun 2022 09:25:08 +0200 Subject: [PATCH 9/9] minimum required DoubleML version due to API changes --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4855307..d991e54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -DoubleML>=0.2.2 +DoubleML>=0.5.0 joblib numpy pandas diff --git a/setup.py b/setup.py index f7f02e3..8105c30 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ url='https://github.com/DoubleML/doubleml-serverless', packages=find_packages(exclude=['aws_lambda_app*']), install_requires=[ - 'DoubleML>=0.2.2', + 'DoubleML>=0.5.0', 'joblib', 'numpy', 'pandas',