diff --git a/.circleci/config.yml b/.circleci/config.yml index eedc286a5a5f2..f58bbb0a42487 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -89,22 +89,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. + # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..7a432556c1299 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" @@ -14,7 +14,7 @@ def main(ctx): # Nightly jobs always run if env.get("CIRRUS_CRON", "") == "nightly": - return fs.read(arm_wheel_yaml) + return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml) # Get commit message for event. We can not use `git` here because there is # no command line access in starlark. 
Thus we need to query the GitHub API @@ -26,10 +26,12 @@ def main(ctx): response = http.get(url).json() commit_msg = response["message"] - if "[skip ci]" in commit_msg: - return [] + jobs_to_run = "" if "[cd build]" in commit_msg or "[cd build cirrus]" in commit_msg: - return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml) + jobs_to_run += fs.read(arm_wheel_yaml) + + if "[cirrus arm]" in commit_msg: + jobs_to_run += fs.read(arm_tests_yaml) - return fs.read(arm_tests_yaml) + return jobs_to_run diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. 
+ diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 468d3282903f2..8b4f39461b8da 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.1 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.1 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index b43f29ffa4f7f..4300db6c5e208 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -103,6 +103,18 @@ jobs: python: 311 platform_id: macosx_x86_64 + # MacOS arm64 + # The latest Python version is built and tested on CirrusCI + - os: macos-latest + python: 38 + platform_id: macosx_arm64 + - os: macos-latest + python: 39 + platform_id: macosx_arm64 + - os: macos-latest + python: 310 + platform_id: macosx_arm64 + steps: - name: Checkout scikit-learn uses: actions/checkout@v3 @@ -178,31 +190,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index f4601a15655a5..cfc13d4997b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ 
build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST @@ -99,6 +100,9 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +sklearn/neighbors/_ball_tree.pyx +sklearn/neighbors/_binary_tree.pxi +sklearn/neighbors/_kd_tree.pyx # Default JupyterLite content jupyterlite_contents diff --git a/Makefile b/Makefile index e2ae6aa75ca94..4e685872a4c61 100644 --- a/Makefile +++ b/Makefile @@ -62,3 +62,6 @@ doc-noplot: inplace code-analysis: build_tools/linting.sh + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 80de41a8890a1..4d1b135400c3e 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,40 @@ .. |PytestMinVersion| replace:: 7.1.2 .. |PlotlyMinVersion| replace:: 5.14.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +================= +Scikit-learn-tree +================= -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +``scikit-learn-tree`` is an alias of scikit-learn. It is a maintained fork of scikit-learn, which advances the tree submodule while staying in line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. It is currently maintained by a team of volunteers. -Website: https://scikit-learn.org +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. + +Why a fork? +----------- +Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize +and improve the extensibility of the code are currently unsupported, or may take a long time. +At the same time, there is demand for advanced tree models that also leverage the robustness of scikit-learn. + +However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package +altogether is undesirable because it results in a tree codebase that is inherently different +and not compatible with ``scikit-learn``. For example, `quantile-forests `_, +and `EconML `_ do this, and their current tree submodules +cannot take advantage of improvements made in upstream ``scikit-learn``. + +An example of seamless integration would be `scikit-survival `_, which +only needs to implement a subclass of the Cython ``Criterion`` object in their code to enable survival trees. + +Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop +a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule +and can also be synced with upstream changes in ``scikit-learn``. This enables the fork to always +take advantage of improvements made in ``scikit-learn`` main upstream, while providing a customizable +tree API.
Installation ------------ @@ -65,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) @@ -73,132 +93,193 @@ scikit-learn requires: - joblib (>= |JoblibMinVersion|) - threadpoolctl (>= |ThreadpoolctlMinVersion|) -======= +============================ +Installing scikit-learn-tree +============================ + +Scikit-learn-tree is a maintained fork of scikit-learn, which extends the +tree submodule in a few ways documented in `fork_changelog`_. -**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.** -scikit-learn 1.0 and later require Python 3.7 or newer. -scikit-learn 1.1 and later require Python 3.8 or newer. +We release versions of scikit-learn-tree in an analogous fashion to +scikit-learn main. Due to limited maintenance resources, we only release on PyPI +and therefore recommend installing with ``pip``. -Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and -classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|). -For running the examples Matplotlib >= |MatplotlibMinVersion| is required. -A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples -require pandas >= |PandasMinVersion|, some examples require seaborn >= -|SeabornMinVersion| and plotly >= |PlotlyMinVersion|. +There are different ways to install scikit-learn-tree: -User installation -~~~~~~~~~~~~~~~~~ + * Install the latest official release `install_fork_release`_. This + is the best approach for most users. It will provide a stable version, + and pre-built packages are available for most platforms. + + * Build the package from source `install_source`_. This is best for users who want the + latest-and-greatest features and aren't afraid of running + brand-new code. This is also needed for users who wish to contribute to the + project. -If you already have a working installation of numpy and scipy, -the easiest way to install scikit-learn is using ``pip``:: +.. _install_fork_release: - pip install -U scikit-learn +Installing the latest release +----------------------------- +We release wheels for common distributions, so the package is installable via pip. -or ``conda``:: + pip install scikit-learn-tree - conda install -c conda-forge scikit-learn +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then +can be used as a stand-in for any package that relies on the public API of ``sklearn``. -The documentation includes more detailed `installation instructions `_. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -Changelog ---------- +.. _install_source: -See the `changelog `__ -for a history of notable changes to scikit-learn. +Building from source +-------------------- +If you are a developer interested in helping maintain the fork or in adding new +features, the instructions for building from source are exactly the same +as those for scikit-learn main, so please refer to `scikit-learn documentation `_ +for instructions on building from source.
+ +=========== Development ----------- -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, especially to help maintain the fork. +Any contributions that keep our fork better in line with scikit-learn upstream, +or that improve the tree submodule in any way, will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn -- Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`.
- -Help and Support ---------------- - -Documentation -~~~~~~~~~~~~~ - -- HTML documentation (stable release): https://scikit-learn.org -- HTML documentation (development version): https://scikit-learn.org/dev/ -- FAQ: https://scikit-learn.org/stable/faq.html - -Communication -~~~~~~~~~~~~~ - -- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn -- Gitter: https://gitter.im/scikit-learn/scikit-learn -- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos -- Blog: https://blog.scikit-learn.org -- Calendar: https://blog.scikit-learn.org/calendar/ -- Twitter: https://twitter.com/scikit_learn -- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn -- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions -- Website: https://scikit-learn.org -- LinkedIn: https://www.linkedin.com/company/scikit-learn -- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists -- Facebook: https://www.facebook.com/scikitlearnofficial/ -- Instagram: https://www.instagram.com/scikitlearnofficial/ -- TikTok: https://www.tiktok.com/@scikit.learn - -Citation -~~~~~~~~ - -If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn +.. _fork_changelog: + +Major Changes of the Fork +------------------------- + +The purpose of this page is to illustrate some of the main features that +``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes +an understanding of the core package ``scikit-learn`` and of decision tree +models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``. + +Scikit-learn-tree operates as a stand-in for upstream ``scikit-learn``. +It is used in packages exactly the same way and will support all features +in the corresponding version of ``scikit-learn``. For example, if you +are interested in features of ``scikit-learn`` v1.2.2 for the ``NearestNeighbors`` algorithm, +and ``scikit-learn-tree`` has a v1.2.2 release, then it will have +all those features. + +The breaking API changes are restricted to the ``tree`` submodule +and related Forest ensemble models. See below for a detailed list of breaking changes. + +See: https://scikit-learn.org/ for documentation on scikit-learn main. + +Our Philosophy +-------------- +Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes +as possible, such that incorporating upstream changes into the fork requires minimal effort. + +Candidate changes and PRs accepted into the fork are those that: + +- improve compatibility with upstream ``scikit-learn`` main +- enable improved extensibility of tree models + +Decision tree generalizations +----------------------------- + +``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_ +decision tree model (classifier and regressor), which has a few fundamental limitations +that prevent 3rd parties from utilizing the existing class without forking a large +amount of copy/pasted Python and Cython code. We highlight those limitations here +and then describe how we address each of them. + +Cython Internal Private API: + +Note that the Cython API for scikit-learn is still not a publicly supported API, so it may +change without warning. + +- leaf and split nodes: These nodes are treated the same way and there is no internal + API for setting them differently.
Quantile trees and causal trees inherently generalize + how leaf nodes are set. +- Criterion class: The criterion class currently assumes a supervised learning interface. + - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria. +- Splitter class: The splitter class currently assumes a supervised learning interface and + does not provide a way of generalizing the way split candidates are proposed. + - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and ``Splitter.node_split`` function. For example, this enables oblique splits to be considered. +- Tree class: The tree class currently assumes a supervised learning interface and does not + provide a way of generalizing the type of tree. + - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are trivially implementable as an extension now. +- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various + stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions + may be extended. For example, in causal trees, one may want the splitter to also account for + a minimal degree of heterogeneity (i.e. variance) in its child nodes. + +Python API: + +- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y`` + parameter is required to be passed in, which is not necessary for general tree-based models. + For example, an unsupervised tree may pass in ``y=None``. + - Our fix: We fix this API, so the ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined. +- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter`` + and ``Tree`` Cython classes used: The current codebase requires users to define custom + criterion and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents + users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper. + Moreover, the ``Tree`` class is not customizable. + - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them. +- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning + features into a histogram, which is the basis of "LightGBM" and "HistGradientBoostingClassifier", is a computational + trick that can both significantly increase runtime efficiency and help prevent overfitting in trees, since + the sorting in "BestSplitter" is done on bins rather than the continuous feature values. This would enable + random forests and their variants to scale to millions of samples. + - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient; there are several improvements to be made. See below, and the usage sketch after this list.
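A minimal usage sketch of the ``max_bins`` keyword described in the last bullet above. ``max_bins`` is an addition of this fork (not part of upstream scikit-learn), and the value ``255`` below is purely illustrative::

    # Assumes scikit-learn-tree is installed, so that ``sklearn`` resolves to the fork.
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=50_000, n_features=20, random_state=0)

    # max_bins=None (the default) keeps the upstream behavior with no binning;
    # an integer value bins each feature before the split search at every node.
    forest = RandomForestClassifier(n_estimators=100, max_bins=255, random_state=0)
    forest.fit(X, y)
    print(forest.score(X, y))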
+ +Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_ +and `sklearn.ensemble.RandomForestClassifier `_, all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned into e.g. 255 bins). This would not only save RAM, since ``uint8`` storage of millions +of samples would result in many GB saved, but also improve runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may or may not eventually be merged into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models.
+ +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6c3511319e4eb..464096fb69c29 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -171,7 +171,6 @@ jobs: DISTRIB: 'conda' LOCK_FILE: './build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock' COVERAGE: 'true' - SHOW_SHORT_SUMMARY: 'true' SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '42' # default global random seed # Check compilation with Ubuntu 22.04 LTS (Jammy Jellyfish) and scipy from conda-forge diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml index af776c4c62f14..b00ca66c378ca 100644 --- a/build_tools/azure/posix-docker.yml +++ b/build_tools/azure/posix-docker.yml @@ -22,7 +22,6 @@ jobs: # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' - SHOW_SHORT_SUMMARY: 'false' CREATE_ISSUE_ON_TRACKER: 'true' CCACHE_DIR: $(Pipeline.Workspace)/ccache CCACHE_COMPRESS: '1' diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 2ee03daafd288..35e5165d22c83 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -22,7 +22,6 @@ jobs: PYTEST_XDIST_VERSION: 'latest' COVERAGE: 'true' CREATE_ISSUE_ON_TRACKER: 'true' - SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 98ac2e797b73c..5117473ea6366 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -49,7 +49,7 @@ if [[ "$COVERAGE" == "true" ]]; then fi if [[ -n "$CHECK_WARNINGS" ]]; then - TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::numpy.VisibleDeprecationWarning" + TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::sklearn.utils.fixes.VisibleDeprecationWarning" # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib # removes its usage @@ -75,10 +75,6 @@ if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n$XDIST_WORKERS" fi -if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then - TEST_CMD="$TEST_CMD -ra" -fi - if [[ -n "$SELECTED_TESTS" ]]; then TEST_CMD="$TEST_CMD -k $SELECTED_TESTS" diff --git a/build_tools/cirrus/arm_tests.yml b/build_tools/cirrus/arm_tests.yml index a6e5919ecc32f..d1ac551a749e3 100644 --- a/build_tools/cirrus/arm_tests.yml +++ b/build_tools/cirrus/arm_tests.yml @@ -17,4 +17,10 @@ linux_aarch64_test_task: folder: /root/.conda/pkgs fingerprint_script: cat build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock - test_script: bash build_tools/cirrus/build_test_arm.sh + test_script: | + bash build_tools/cirrus/build_test_arm.sh + # On success, this script is run updating the issue. 
+ bash build_tools/cirrus/update_tracking_issue.sh true + + on_failure: + update_tracker_script: bash build_tools/cirrus/update_tracking_issue.sh false diff --git a/build_tools/cirrus/arm_wheel.yml b/build_tools/cirrus/arm_wheel.yml index a7023867e1109..5616108315fba 100644 --- a/build_tools/cirrus/arm_wheel.yml +++ b/build_tools/cirrus/arm_wheel.yml @@ -16,12 +16,8 @@ macos_arm64_wheel_task: # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires. BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f] matrix: - - env: - CIBW_BUILD: cp38-macosx_arm64 - - env: - CIBW_BUILD: cp39-macosx_arm64 - - env: - CIBW_BUILD: cp310-macosx_arm64 + # Only the latest Python version is built and tested on CirrusCI, the other + # macos arm64 builds are on GitHub Actions - env: CIBW_BUILD: cp311-macosx_arm64 @@ -60,12 +56,16 @@ linux_arm64_wheel_task: # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires. BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f] matrix: + # Only the latest Python version is tested - env: CIBW_BUILD: cp38-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp39-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp310-manylinux_aarch64 + CIBW_TEST_SKIP: "*_aarch64" - env: CIBW_BUILD: cp311-manylinux_aarch64 diff --git a/build_tools/cirrus/build_test_arm.sh b/build_tools/cirrus/build_test_arm.sh index 4eeef6ec2dc0c..dfe048da47a7f 100755 --- a/build_tools/cirrus/build_test_arm.sh +++ b/build_tools/cirrus/build_test_arm.sh @@ -25,7 +25,7 @@ setup_ccache() { MAMBAFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" # Install Mambaforge -wget $MAMBAFORGE_URL -O mambaforge.sh +curl -L $MAMBAFORGE_URL -o mambaforge.sh MAMBAFORGE_PATH=$HOME/mambaforge bash ./mambaforge.sh -b -p $MAMBAFORGE_PATH export PATH=$MAMBAFORGE_PATH/bin:$PATH diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. # In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index 4854cc7936aca..35c382bd7f5ab 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -556,15 +556,15 @@ def check_conda_version(): # Avoid issues with glibc (https://github.com/conda/conda-lock/issues/292) # or osx (https://github.com/conda/conda-lock/issues/408) virtual package. # The glibc one has been fixed in conda 23.1.0 and the osx has been fixed - # in main and will be fixed when conda >= 23.6 is released. + # in conda 23.7.0. 
conda_info_output = execute_command(["conda", "info", "--json"]) conda_info = json.loads(conda_info_output) conda_version = Version(conda_info["conda_version"]) - if Version("22.9.0") < conda_version < Version("23.6"): + if Version("22.9.0") < conda_version < Version("23.7"): raise RuntimeError( - f"conda version should be <= 22.9.0 or >= 23.6 got: {conda_version}" + f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}" ) diff --git a/doc/conf.py b/doc/conf.py index db69cfedd48a3..7c52a20014d1b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -124,7 +124,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index fc1ef95dbced0..6aecc524a9a30 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -542,6 +542,7 @@ message, the following actions are taken. [pypy] Build & test with PyPy [pyodide] Build & test with Pyodide [azure parallel] Run Azure CI jobs in parallel + [cirrus arm] Run Cirrus CI ARM test [float32] Run float32 tests by setting `SKLEARN_RUN_FLOAT32_TESTS=1`. See :ref:`environment_variable` for more details [doc skip] Docs are not built [doc quick] Docs built, but excludes example gallery plots diff --git a/doc/glossary.rst b/doc/glossary.rst index 36afcd9483684..1dbb7e630c449 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -205,6 +205,29 @@ General Concepts exceptional behaviours on the estimator using semantic :term:`estimator tags`. + cross-fitting + cross fitting + A resampling method that iteratively partitions data into mutually + exclusive subsets to fit two stages. During the first stage, the + mutually exclusive subsets enable predictions or transformations to be + computed on data not seen during training. The computed data is then + used in the second stage. The objective is to avoid having any + overfitting in the first stage introduce bias into the input data + distribution of the second stage. + For examples of its use, see: :class:`~preprocessing.TargetEncoder`, + :class:`~ensemble.StackingClassifier`, + :class:`~ensemble.StackingRegressor` and + :class:`~calibration.CalibratedClassifierCV`. + + cross-validation + cross validation + A resampling method that iteratively partitions data into mutually + exclusive 'train' and 'test' subsets so model performance can be + evaluated on unseen data. This conserves data as avoids the need to hold + out a 'validation' dataset and accounts for variability as multiple + rounds of cross validation are generally performed. + See :ref:`User Guide ` for more details. + deprecation We use deprecation to slowly violate our :term:`backwards compatibility` assurances, usually to: diff --git a/doc/install.rst b/doc/install.rst index bf2832bf72f24..263e83cdc31a5 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -61,7 +61,7 @@ Installing the latest release >Install python3 and python3-pip using the package manager of the Linux Distribution.Install conda using the Anaconda or miniconda - installers or the miniforge installers + installers or the miniforge installers (no administrator permission required for any of those). @@ -279,14 +279,14 @@ and in the `main`, `conda-forge` and `intel` conda channels: conda install scikit-learn-intelex -This package has an Intel optimized version of many estimators. 
Whenever -an alternative implementation doesn't exist, scikit-learn implementation -is used as a fallback. Those optimized solvers come from the oneDAL -C++ library and are optimized for the x86_64 architecture, and are +This package has an Intel optimized version of many estimators. Whenever +an alternative implementation doesn't exist, scikit-learn implementation +is used as a fallback. Those optimized solvers come from the oneDAL +C++ library and are optimized for the x86_64 architecture, and are optimized for multi-core Intel CPUs. Note that those solvers are not enabled by default, please refer to the -`scikit-learn-intelex `_ +`scikit-learn-intelex `_ documentation for more details on usage scenarios. Direct export example: .. prompt:: bash $ diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 635395fd07c43..741ebbf240a6d 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -83,17 +83,26 @@ the tensors directly:: >>> X_trans.device.type 'cuda' -.. _array_api_estimators: +.. _array_api_supported: -Estimators with support for `Array API`-compatible inputs -========================================================= +Support for `Array API`-compatible inputs +========================================= + +Estimators and other tools in scikit-learn that support Array API compatible inputs. + +Estimators +---------- - :class:`decomposition.PCA` (with `svd_solver="full"`, `svd_solver="randomized"` and `power_iteration_normalizer="QR"`) - :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`) -Coverage for more estimators is expected to grow over time. Please follow the -dedicated `meta-issue on GitHub +Tools +----- + +- :func:`model_selection.train_test_split` + +Coverage is expected to grow over time. Please follow the dedicated `meta-issue on GitHub `_ to track progress. Common estimator checks diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index faba9a76ab94c..f277c32675c3f 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -66,10 +66,8 @@ it takes a variable number of estimators and returns a pipeline, filling in the names automatically:: >>> from sklearn.pipeline import make_pipeline - >>> from sklearn.naive_bayes import MultinomialNB - >>> from sklearn.preprocessing import Binarizer - >>> make_pipeline(Binarizer(), MultinomialNB()) - Pipeline(steps=[('binarizer', Binarizer()), ('multinomialnb', MultinomialNB())]) + >>> make_pipeline(PCA(), SVC()) + Pipeline(steps=[('pca', PCA()), ('svc', SVC())]) Accessing steps ............... diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 6158e000cb727..8afa467982736 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -102,6 +102,7 @@ where the number of samples is very small. .. image:: ../images/grid_search_cross_validation.png :width: 500px :height: 300px + :alt: A depiction of a 5 fold cross validation on a training set, while holding out a test set. :align: center Computing cross-validated metrics diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c3ea63bc6e944..36eed98da0f6b 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -10,12 +10,12 @@ Ensembles: Gradient boosting, random forests, bagging, voting, stacking base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. 
-Two very famous examples of ensemble methods are `gradient-boosted trees -`_ and `random forests `_. +Two very famous examples of ensemble methods are :ref:`gradient-boosted trees +` and :ref:`random forests `. More generally, ensemble models can be applied to any base learner beyond trees, in averaging methods such as :ref:`Bagging methods `, -`model stacking `_, or `Voting `_, or in +:ref:`model stacking `, or :ref:`Voting `, or in boosting, as :ref:`AdaBoost `. .. contents:: diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 43356763d69c3..aa9184a2bedc5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -37,7 +37,7 @@ solves a problem of the form: :align: center :scale: 50% -:class:`LinearRegression` will take in its ``fit`` method arrays X, y +:class:`LinearRegression` will take in its ``fit`` method arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: @@ -114,7 +114,7 @@ of shrinkage and thus the coefficients become more robust to collinearity. As with other linear models, :class:`Ridge` will take in its ``fit`` method -arrays X, y and will store the coefficients :math:`w` of the linear model in +arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: >>> from sklearn import linear_model @@ -889,12 +889,16 @@ the probability of the positive class :math:`P(y_i=1|X_i)` as .. math:: \hat{p}(X_i) = \operatorname{expit}(X_i w + w_0) = \frac{1}{1 + \exp(-X_i w - w_0)}. + As an optimization problem, binary class logistic regression with regularization term :math:`r(w)` minimizes the following cost function: -.. math:: \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w). - +.. math:: + :name: regularized-logistic-loss + + \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w). + We currently provide four choices for the regularization term :math:`r(w)` via the `penalty` argument: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index d11287e7c29b1..d3a7df74e6348 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -188,13 +188,9 @@ distance can be supplied to compute the weights. .. |classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 - -.. |classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_002.png - :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 + :scale: 75 -.. centered:: |classification_1| |classification_2| +.. centered:: |classification_1| .. topic:: Examples: diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 1d7ad07f7023c..82fecf0c4e9f1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -910,16 +910,16 @@ For continuous targets, the formulation is similar to binary classification: where :math:`L_i` is the set of observations with category :math:`i` and :math:`n_i` is the number of observations with category :math:`i`. 
-:meth:`~TargetEncoder.fit_transform` internally relies on a cross fitting +:meth:`~TargetEncoder.fit_transform` internally relies on a :term:`cross fitting` scheme to prevent target information from leaking into the train-time representation, especially for non-informative high-cardinality categorical variables, and help prevent the downstream model from overfitting spurious correlations. Note that as a result, `fit(X, y).transform(X)` does not equal `fit_transform(X, y)`. In :meth:`~TargetEncoder.fit_transform`, the training -data is split into *k* folds (determined by the `cv` parameter) and encodes each -fold using the encodings trained on the other *k-1* folds. The following diagram -shows the cross fitting scheme in :meth:`~TargetEncoder.fit_transform` with -the default `cv=5`: +data is split into *k* folds (determined by the `cv` parameter) and each fold is +encoded using the encodings learnt using the other *k-1* folds. The following +diagram shows the :term:`cross fitting` scheme in +:meth:`~TargetEncoder.fit_transform` with the default `cv=5`: .. image:: ../images/target_encoder_cross_validation.svg :width: 600 @@ -929,10 +929,10 @@ the default `cv=5`: the whole training set. This is never used in :meth:`~TargetEncoder.fit_transform` but is saved to the attribute `encodings_`, for use when :meth:`~TargetEncoder.transform` is called. Note that the encodings -learned for each fold during the cross fitting scheme are not saved to an -attribute. +learned for each fold during the :term:`cross fitting` scheme are not saved to +an attribute. -The :meth:`~TargetEncoder.fit` method does **not** use any cross fitting +The :meth:`~TargetEncoder.fit` method does **not** use any :term:`cross fitting` schemes and learns one encoding on the entire training set, which is used to encode categories in :meth:`~TargetEncoder.transform`. This encoding is the same as the 'full data' diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 7e886366aebae..0ac34cdcb6a10 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -60,14 +60,19 @@ capable of performing binary and multi-class classification on a dataset. :align: center -:class:`SVC` and :class:`NuSVC` are similar methods, but accept -slightly different sets of parameters and have different mathematical -formulations (see section :ref:`svm_mathematical_formulation`). On the -other hand, :class:`LinearSVC` is another (faster) implementation of Support -Vector Classification for the case of a linear kernel. Note that -:class:`LinearSVC` does not accept parameter ``kernel``, as this is -assumed to be linear. It also lacks some of the attributes of -:class:`SVC` and :class:`NuSVC`, like ``support_``. +:class:`SVC` and :class:`NuSVC` are similar methods, but accept slightly +different sets of parameters and have different mathematical formulations (see +section :ref:`svm_mathematical_formulation`). On the other hand, +:class:`LinearSVC` is another (faster) implementation of Support Vector +Classification for the case of a linear kernel. It also +lacks some of the attributes of :class:`SVC` and :class:`NuSVC`, like +`support_`. :class:`LinearSVC` uses `squared_hinge` loss and due to its +implementation in `liblinear` it also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. 
The +classification results and score can therefore differ from the other two +classifiers. As other classifiers, :class:`SVC`, :class:`NuSVC` and :class:`LinearSVC` take as input two arrays: an array `X` of shape @@ -314,10 +319,15 @@ target. There are three different implementations of Support Vector Regression: :class:`SVR`, :class:`NuSVR` and :class:`LinearSVR`. :class:`LinearSVR` -provides a faster implementation than :class:`SVR` but only considers -the linear kernel, while :class:`NuSVR` implements a slightly different -formulation than :class:`SVR` and :class:`LinearSVR`. See -:ref:`svm_implementation_details` for further details. +provides a faster implementation than :class:`SVR` but only considers the +linear kernel, while :class:`NuSVR` implements a slightly different formulation +than :class:`SVR` and :class:`LinearSVR`. Due to its implementation in +`liblinear` :class:`LinearSVR` also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. See :ref:`svm_implementation_details` for further details. As with classification classes, the fit method will take as argument vectors X, y, only that in this case y is expected to have diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 7f4be9f6240a8..7ae039e64a49a 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -27,8 +27,8 @@ Some advantages of decision trees are: - Requires little data preparation. Other techniques often require data normalization, dummy variables need to be created and blank values to - be removed. Note however that this module does not support missing - values. + be removed. Some tree and algorithm combinations support + :ref:`missing values `. - The cost of using the tree (i.e., predicting data) is logarithmic in the number of data points used to train the tree. @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -671,11 +673,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The run time cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. 
The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space complexity: storing an OT pickled on disk also takes more room, since OTs +at every node need to store an additional vector of feature indices and a vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters is the same. Therefore, refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations``, are different or specific to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also potentially + lets the user sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, therefore improving runtime + and storage complexity. + +Finally, when asking the question of when to use OTs vs DTs, we recommend +always trying both models using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 9cc70ad89ffff..10304a7070be0 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -21,9 +21,6 @@ enhance the functionality of scikit-learn's estimators. **Data formats** -- `Fast svmlight / libsvm file loader `_ - Fast and memory-efficient svmlight / libsvm file loader for Python. - - `sklearn_pandas `_ bridge for scikit-learn pipelines and pandas data frame with dedicated transformers. @@ -64,19 +61,20 @@ enhance the functionality of scikit-learn's estimators. It incorporates multiple modeling libraries under one API, and the objects that EvalML creates use an sklearn-compatible API.
-**Experimentation frameworks** +**Experimentation and model registry frameworks** + +- `MLFlow `_ MLflow is an open source platform to manage the ML + lifecycle, including experimentation, reproducibility, deployment, and a central + model registry. - `Neptune `_ Metadata store for MLOps, - built for teams that run a lot of experiments.‌ It gives you a single + built for teams that run a lot of experiments. It gives you a single place to log, store, display, organize, compare, and query all your model building metadata. - `Sacred `_ Tool to help you configure, organize, log and reproduce experiments -- `REP `_ Environment for conducting data-driven - research in a consistent and reproducible way - - `Scikit-Learn Laboratory `_ A command-line wrapper around scikit-learn that makes it easy to run machine learning @@ -91,10 +89,7 @@ enhance the functionality of scikit-learn's estimators. debugging/inspecting machine learning models and explaining their predictions. -- `mlxtend `_ Includes model visualization - utilities. - -- `sklearn-evaluation `_ +- `sklearn-evaluation `_ Machine learning model evaluation made easy: plots, tables, HTML reports, experiment tracking and Jupyter notebook analysis. Visual analysis, model selection, evaluation and diagnostics. @@ -140,7 +135,15 @@ enhance the functionality of scikit-learn's estimators. - `treelite `_ Compiles tree-based ensemble models into C code for minimizing prediction latency. - + +- `micromlgen `_ + MicroML brings Machine Learning algorithms to microcontrollers. + Supports several scikit-learn classifiers by transpiling them to C code. + +- `emlearn `_ + Implements scikit-learn estimators in C99 for embedded devices and microcontrollers. + Supports several classifier, regression and outlier detection models. + **Model throughput** - `Intel(R) Extension for scikit-learn `_ @@ -161,12 +164,40 @@ project. The following are projects providing interfaces similar to scikit-learn for additional learning algorithms, infrastructures and tasks. -**Structured learning** +**Time series and forecasting** + +- `Darts `_ Darts is a Python library for + user-friendly forecasting and anomaly detection on time series. It contains a variety + of models, from classics such as ARIMA to deep neural networks. The forecasting + models can all be used in the same way, using fit() and predict() functions, similar + to scikit-learn. + +- `sktime `_ A scikit-learn compatible + toolbox for machine learning with time series including time series + classification/regression and (supervised/panel) forecasting. + +- `skforecast `_ A python library + that eases using scikit-learn regressors as multi-step forecasters. It also works + with any regressor compatible with the scikit-learn API. + +- `tslearn `_ A machine learning library for + time series that offers tools for pre-processing and feature extraction as well as + dedicated models for clustering, classification and regression. -- `tslearn `_ A machine learning library for time series - that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression. +**Gradient (tree) boosting** -- `sktime `_ A scikit-learn compatible toolbox for machine learning with time series including time series classification/regression and (supervised/panel) forecasting. +Note scikit-learn own modern gradient boosting estimators +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. 
+ +- `XGBoost `_ XGBoost is an optimized distributed + gradient boosting library designed to be highly efficient, flexible and portable. + +- `LightGBM `_ LightGBM is a gradient boosting + framework that uses tree based learning algorithms. It is designed to be distributed + and efficient. + +**Structured learning** - `HMMLearn `_ Implementation of hidden markov models that was previously part of scikit-learn. @@ -182,21 +213,9 @@ and tasks. (`CRFsuite `_ wrapper with sklearn-like API). -- `skforecast `_ A python library - that eases using scikit-learn regressors as multi-step forecasters. It also works - with any regressor compatible with the scikit-learn API. **Deep neural networks etc.** -- `nolearn `_ A number of wrappers and - abstractions around existing neural network libraries - -- `Keras `_ High-level API for - TensorFlow with a scikit-learn inspired API. - -- `lasagne `_ A lightweight library to - build and train neural networks in Theano. - - `skorch `_ A scikit-learn compatible neural network library that wraps PyTorch. @@ -219,9 +238,6 @@ and tasks. **Other regression and classification** -- `xgboost `_ Optimised gradient boosted decision - tree library. - - `ML-Ensemble `_ Generalized ensemble learning (stacking, blending, subsemble, deep ensembles, etc.). @@ -232,10 +248,6 @@ and tasks. - `py-earth `_ Multivariate adaptive regression splines -- `Kernel Regression `_ - Implementation of Nadaraya-Watson kernel regression with automatic bandwidth - selection - - `gplearn `_ Genetic Programming for symbolic regression tasks. @@ -245,8 +257,6 @@ and tasks. - `seglearn `_ Time series and sequence learning using sliding window segmentation. -- `libOPF `_ Optimal path forest classifier - - `fastFM `_ Fast factorization machine implementation compatible with scikit-learn @@ -266,6 +276,7 @@ and tasks. - `hdbscan `_ HDBSCAN and Robust Single Linkage clustering algorithms for robust variable density clustering. + As of scikit-learn version 1.3.0, there is :class:`~sklearn.cluster.HDBSCAN`. - `spherecluster `_ Spherical K-means and mixture of von Mises Fisher clustering routines for data on the @@ -276,6 +287,8 @@ and tasks. - `categorical-encoding `_ A library of sklearn compatible categorical variable encoders. + As of scikit-learn version 1.3.0, there is + :class:`~sklearn.preprocessing.TargetEncoder`. - `imbalanced-learn `_ Various @@ -331,9 +344,6 @@ Recommendation Engine packages - `OpenRec `_ TensorFlow-based neural-network inspired recommendation algorithms. -- `Spotlight `_ Pytorch-based - implementation of deep recommender models. - - `Surprise Lib `_ Library for explicit feedback datasets. @@ -355,9 +365,6 @@ Domain specific packages - `AstroML `_ Machine learning for astronomy. -- `MSMBuilder `_ Machine learning for protein - conformational dynamics time series. 
- Translations of scikit-learn documentation ------------------------------------------ diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 0a8822cdcd848..40ac5e25ea698 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -661,13 +661,19 @@ div.sk-sidebar-global-toc ul ul { div.sk-page-content h1 { background-color: #cde8ef; padding: 0.5rem; - margin-top: calc(max(2.5rem, 1vh)); + margin-top: calc(max(1rem, 1vh)); border-radius: 0 1rem; text-align: center; font-size: 2rem; word-wrap: break-word; } +/* General sibling selector: does not apply to first h1, to avoid gap in + * top of page */ +div.sk-page-content ~ h1 { + margin-top: calc(max(2.5rem, 1vh)); +} + div.sk-page-content h2 { padding: 0.5rem; background-color: #BED4EB; diff --git a/doc/tutorial/machine_learning_map/pyparsing.py b/doc/tutorial/machine_learning_map/pyparsing.py index 0418cf2b51528..88d00e138d02c 100644 --- a/doc/tutorial/machine_learning_map/pyparsing.py +++ b/doc/tutorial/machine_learning_map/pyparsing.py @@ -21,7 +21,7 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -# flake8: noqa +# ruff: noqa __doc__ = \ """ diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index fea27b0c1c1a4..da2f5e8796db8 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -392,7 +392,7 @@ Changelog - |Efficiency| :class:`decomposition.NMF` with `solver="mu"` fitted on sparse input matrices now uses batching to avoid briefly allocating an array with size - (#non-zero elements, n_components). :pr:`15257` by `Mart Willocx `_. + (#non-zero elements, n_components). :pr:`15257` by :user:`Mart Willocx `. - |Enhancement| :func:`decomposition.dict_learning` and :func:`decomposition.dict_learning_online` now accept `method_max_iter` and diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 8d39ca2fed143..dc955f7aa0f51 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -9,9 +9,22 @@ Version 1.3.1 **In development** +Changes impacting all modules +----------------------------- + +- |Fix| The `set_output` API correctly works with list input. :pr:`27044` by + `Thomas Fan`_. + Changelog --------- +:mod:`sklearn.impute` +..................... + +- |Fix| :class:`impute.KNNImputer` now correctly adds a missing indicator column in + ``transform`` when ``add_indicator`` is set to ``True`` and missing values are observed + during ``fit``. :pr:`26600` by :user:`Shreesha Kumar Bhat `. + :mod:`sklearn.neighbors` ........................ @@ -23,6 +36,22 @@ Changelog :attr:`sklearn.neighbors.KDTree.valid_metrics` as public class attributes. :pr:`26754` by :user:`Julien Jerphanion `. +- |Fix| :class:`sklearn.model_selection.HalvingRandomSearchCV` no longer raises + when the input to the `param_distributions` parameter is a list of dicts. + :pr:`26893` by :user:`Stefanie Senger `. + +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.LabelEncoder` correctly accepts `y` as a keyword + argument. :pr:`26940` by `Thomas Fan`_. + +:mod:`sklearn.tree` +................... + +- |Fix| :func:`tree.plot_tree` now accepts `class_names=True` as documented. + :pr:`26903` by :user:`Thomas Roehr <2maz>` + .. 
_changes_1_3: Version 1.3.0 @@ -596,6 +625,13 @@ Changelog `n_targets`, which is used to decide the number of outputs when sampling from the prior distributions. :pr:`23099` by :user:`Zhehao Liu `. +:mod:`sklearn.mixture` +...................... + +- |Efficiency| :class:`GaussianMixture` is more efficient now and will bypass unnecessary + initialization if the weights, means, and precisions are given by users. + :pr:`26021` by :user:`Jiawei Zhang `. + :mod:`sklearn.model_selection` .............................. diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index c2b7d19404af9..e168f1d667607 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -19,6 +19,11 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. +- |Fix| The initialization of :class:`mixture.GaussianMixture` from user-provided + `precisions_init` for `covariance_type` of `full` or `tied` was not correct, + and has been fixed. + :pr:`26416` by :user:`Yang Tao `. + Changes impacting all modules ----------------------------- @@ -61,6 +66,27 @@ Changelog - |Enhancement| :func:`base.clone` now supports `dict` as input and creates a copy. :pr:`26786` by `Adrin Jalali`_. +- |API|:func:`~utils.metadata_routing.process_routing` now has a different + signature. The first two (the object and the method) are positional only, + and all metadata are passed as keyword arguments. :pr:`26909` by `Adrin + Jalali`_. + +:mod:`sklearn.cluster` +............................ + +- |API| : `kdtree` and `balltree` values are now deprecated and are renamed as + `kd_tree` and `ball_tree` respectively for the `algorithm` parameter of + :class:`cluster.HDBSCAN` ensuring consistency in naming convention. + `kdtree` and `balltree` values will be removed in 1.6. + :pr:`26744` by :user:`Shreesha Kumar Bhat `. + +:mod:`sklearn.cross_decomposition` +.................................. + +- |Fix| :class:`cross_decomposition.PLSRegression` now automatically ravels the output + of `predict` if fitted with one dimensional `y`. + :pr:`26602` by :user:`Yao Xiao `. + :mod:`sklearn.decomposition` ............................ @@ -80,6 +106,12 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |MajorFeature| :class:`ensemble.RandomForestClassifier` and + :class:`ensemble.RandomForestRegressor` support missing values when + the criterion is `gini`, `entropy`, or `log_loss`, + for classification or `squared_error`, `friedman_mse`, or `poisson` + for regression. :pr:`26391` by `Thomas Fan`_. + - |Feature| :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints, @@ -88,6 +120,11 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. +- |Efficiency| Improves runtime and memory usage for + :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` when trained on sparse data. + :pr:`26957` by `Thomas Fan`_. + :mod:`sklearn.feature_selection` ................................ @@ -120,6 +157,37 @@ Changelog object in the parameter grid if it's an estimator. :pr:`26786` by `Adrin Jalali`_. +- |Feature| :func:`~model_selection.cross_validate`, + :func:`~model_selection.cross_val_score`, and + :func:`~model_selection.cross_val_predict` now support metadata routing. 
The + metadata are routed to the estimator's `fit`, the scorer, and the CV + splitter's `split`. The metadata is accepted via the new `params` parameter. + `fit_params` is deprecated and will be removed in version 1.6. `groups` + parameter is also not accepted as a separate argument when metadata routing + is enabled and should be passed via the `params` parameter. :pr:`26896` by + `Adrin Jalali`_. + +:mod:`sklearn.neighbors` +........................ + +- |Fix| Neighbors based estimators now correctly work when `metric="minkowski"` and the + metric parameter `p` is in the range `0 < p < 1`, regardless of the `dtype` of `X`. + :pr:`26760` by :user:`Shreesha Kumar Bhat `. + +:mod:`sklearn.preprocessing` +............................ + +- |Efficiency| :class:`preprocessing.OrdinalEncoder` avoids calculating + missing indices twice to improve efficiency. + :pr:`27017` by :user:`Xuefeng Xu `. + +- |Fix| :class:`preprocessing.OneHotEncoder` shows a more informative error message + when `sparse_output=True` and the output is configured to be pandas. + :pr:`26931` by `Thomas Fan`_. + +- |Enhancement| :func:`sklearn.model_selection.train_test_split` now supports + Array API compatible inputs. :pr:`26855` by `Tim Head`_. + :mod:`sklearn.tree` ................... @@ -131,9 +199,37 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. + +:mod:`sklearn.neighbors` +........................ + +- |API| :class:`neighbors.KNeighborsRegressor` now accepts + :class:`metric.DistanceMetric` objects directly via the `metric` keyword + argument allowing for the use of accelerated third-party + :class:`metric.DistanceMetric` objects. + :pr:`26267` by :user:`Meekail Zain ` + +:mod:`sklearn.metrics` +...................... + +- |Efficiency| Computing pairwise distances via :class:`metrics.DistanceMetric` + for CSR × CSR, Dense × CSR, and CSR × Dense datasets is now 1.5x faster. + :pr:`26765` by :user:`Meekail Zain ` + +- |Efficiency| Computing distances via :class:`metrics.DistanceMetric` + for CSR × CSR, Dense × CSR, and CSR × Dense now uses ~50% less memory, + and outputs distances in the same dtype as the provided data. + :pr:`27006` by :user:`Meekail Zain ` + :mod:`sklearn.utils` .................... +- |Enhancement| :func:`sklearn.utils.estimator_html_repr` dynamically adapts + diagram colors based on the browser's `prefers-color-scheme`, providing + improved adaptability to dark mode environments. + :pr:`26862` by :user:`Andrew Goh Yisheng <9y5>`, `Thomas Fan`_, `Adrin + Jalali`_. + - |Enhancement| :class:`~utils.metadata_routing.MetadataRequest` and :class:`~utils.metadata_routing.MetadataRouter` now have a ``consumes`` method which can be used to check whether a given set of parameters would be consumed. 
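
A minimal sketch of the new ``params`` argument described in the
:func:`~model_selection.cross_validate` entry above, assuming metadata routing has
been enabled via ``set_config``; the dataset and the all-ones weights are
illustrative::

    import numpy as np
    import sklearn
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    # Metadata routing must be enabled explicitly for `params` to be routed.
    sklearn.set_config(enable_metadata_routing=True)

    X, y = make_classification(random_state=0)
    sample_weight = np.ones(len(y))

    # Request that `sample_weight` be routed to the estimator's `fit` method.
    est = LogisticRegression().set_fit_request(sample_weight=True)
    cv_results = cross_validate(est, X, y, params={"sample_weight": sample_weight})
    print(cv_results["test_score"])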
diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 75164cff8b492..8d7eb7c63c81a 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -58,13 +58,15 @@ classifiers = [ KNeighborsClassifier(3), - SVC(kernel="linear", C=0.025), - SVC(gamma=2, C=1), - GaussianProcessClassifier(1.0 * RBF(1.0)), - DecisionTreeClassifier(max_depth=5), - RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), - MLPClassifier(alpha=1, max_iter=1000), - AdaBoostClassifier(), + SVC(kernel="linear", C=0.025, random_state=42), + SVC(gamma=2, C=1, random_state=42), + GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42), + DecisionTreeClassifier(max_depth=5, random_state=42), + RandomForestClassifier( + max_depth=5, n_estimators=10, max_features=1, random_state=42 + ), + MLPClassifier(alpha=1, max_iter=1000, random_state=42), + AdaBoostClassifier(random_state=42), GaussianNB(), QuadraticDiscriminantAnalysis(), ] diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py index 7eab9a3437d65..0dde24116065d 100644 --- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py +++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py @@ -12,7 +12,7 @@ trees according to each estimator: - `n_estimators` controls the number of trees in the forest. It's a fixed number. -- `max_iter` is the the maximum number of iterations in a gradient boosting +- `max_iter` is the maximum number of iterations in a gradient boosting based model. The number of iterations corresponds to the number of trees for regression and binary classification problems. Furthermore, the actual number of trees required by the model depends on the stopping criteria. @@ -210,7 +210,7 @@ # models uniformly dominate the Random Forest models in the "test score vs # training speed trade-off" (the HGBDT curve should be on the top left of the RF # curve, without ever crossing). The "test score vs prediction speed" trade-off -# can also be more disputed but it's most often favorable to HGBDT. It's always +# can also be more disputed, but it's most often favorable to HGBDT. It's always # a good idea to check both kinds of model (with hyper-parameter tuning) and # compare their performance on your specific problem to determine which model is # the best fit but **HGBT almost always offers a more favorable speed-accuracy diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py index 688c2b4ba8079..f008d8d6e8b68 100644 --- a/examples/feature_selection/plot_select_from_model_diabetes.py +++ b/examples/feature_selection/plot_select_from_model_diabetes.py @@ -122,9 +122,6 @@ print(f"Done in {toc_bwd - tic_bwd:.3f}s") # %% -# Discussion -# ---------- -# # Interestingly, forward and backward selection have selected the same set of # features. In general, this isn't the case and the two methods would lead to # different results. @@ -145,3 +142,54 @@ # attribute. The forward SFS is faster than the backward SFS because it only # needs to perform `n_features_to_select = 2` iterations, while the backward # SFS needs to perform `n_features - n_features_to_select = 8` iterations. 
+# +# Using negative tolerance values +# ------------------------------- +# +# :class:`~sklearn.feature_selection.SequentialFeatureSelector` can be used +# to remove features present in the dataset and return a +# smaller subset of the original features with `direction="backward"` +# and a negative value of `tol`. +# +# We begin by loading the Breast Cancer dataset, consisting of 30 different +# features and 569 samples. +import numpy as np + +from sklearn.datasets import load_breast_cancer + +breast_cancer_data = load_breast_cancer() +X, y = breast_cancer_data.data, breast_cancer_data.target +feature_names = np.array(breast_cancer_data.feature_names) +print(breast_cancer_data.DESCR) + +# %% +# We will make use of the :class:`~sklearn.linear_model.LogisticRegression` +# estimator with :class:`~sklearn.feature_selection.SequentialFeatureSelector` +# to perform the feature selection. +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +for tol in [-1e-2, -1e-3, -1e-4]: + start = time() + feature_selector = SequentialFeatureSelector( + LogisticRegression(), + n_features_to_select="auto", + direction="backward", + scoring="roc_auc", + tol=tol, + n_jobs=2, + ) + model = make_pipeline(StandardScaler(), feature_selector, LogisticRegression()) + model.fit(X, y) + end = time() + print(f"\ntol: {tol}") + print(f"Features selected: {feature_names[model[1].get_support()]}") + print(f"ROC AUC score: {roc_auc_score(y, model.predict_proba(X)[:, 1]):.3f}") + print(f"Done in {end - start:.3f}s") + +# %% +# We can see that the number of features selected tend to increase as negative +# values of `tol` approach to zero. The time taken for feature selection also +# decreases as the values of `tol` come closer to zero. 
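
A tiny sketch of the ``n_estimators`` vs. ``max_iter`` distinction quoted in the
forest/HGBT comparison example above; the dataset and the settings are illustrative::

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor

    X, y = make_regression(n_samples=500, noise=10.0, random_state=0)

    # A forest always builds exactly `n_estimators` trees.
    rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # A boosting model builds at most `max_iter` trees and may stop earlier when
    # early stopping is triggered on its internal validation set.
    hgbt = HistGradientBoostingRegressor(
        max_iter=100, early_stopping=True, random_state=0
    ).fit(X, y)

    print(len(rf.estimators_), hgbt.n_iter_)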
diff --git a/examples/miscellaneous/plot_metadata_routing.py b/examples/miscellaneous/plot_metadata_routing.py index 350cd865d972e..9984bb6183348 100644 --- a/examples/miscellaneous/plot_metadata_routing.py +++ b/examples/miscellaneous/plot_metadata_routing.py @@ -447,7 +447,7 @@ def get_metadata_routing(self): return router def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) X_transformed = self.transformer_.transform(X, **params.transformer.transform) @@ -458,7 +458,7 @@ def fit(self, X, y, **fit_params): return self def predict(self, X, **predict_params): - params = process_routing(self, "predict", predict_params) + params = process_routing(self, "predict", **predict_params) X_transformed = self.transformer_.transform(X, **params.transformer.transform) return self.classifier_.predict(X_transformed, **params.classifier.predict) @@ -543,7 +543,7 @@ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) def get_metadata_routing(self): @@ -572,7 +572,7 @@ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y, sample_weight=None, **fit_params): - params = process_routing(self, "fit", fit_params, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) check_metadata(self, sample_weight=sample_weight) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py index 4ed23862ae455..43c45558054cf 100644 --- a/examples/neighbors/plot_classification.py +++ b/examples/neighbors/plot_classification.py @@ -3,61 +3,92 @@ Nearest Neighbors Classification ================================ -Sample usage of Nearest Neighbors classification. -It will plot the decision boundaries for each class. - +This example shows how to use :class:`~sklearn.neighbors.KNeighborsClassifier`. +We train such a classifier on the iris dataset and observe the difference of the +decision boundary obtained with regards to the parameter `weights`. """ -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.colors import ListedColormap +# %% +# Load the data +# ------------- +# +# In this example, we use the iris dataset. We split the data into a train and test +# dataset. +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split -from sklearn import datasets, neighbors -from sklearn.inspection import DecisionBoundaryDisplay +iris = load_iris(as_frame=True) +X = iris.data[["sepal length (cm)", "sepal width (cm)"]] +y = iris.target +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) -n_neighbors = 15 +# %% +# K-nearest neighbors classifier +# ------------------------------ +# +# We want to use a k-nearest neighbors classifier considering a neighborhood of 11 data +# points. Since our k-nearest neighbors model uses euclidean distance to find the +# nearest neighbors, it is therefore important to scale the data beforehand. Refer to +# the example entitled +# :ref:`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py` for more +# detailed information. 
+# +# Thus, we use a :class:`~sklearn.pipeline.Pipeline` to chain a scaler before to use +# our classifier. +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler -# import some data to play with -iris = datasets.load_iris() +clf = Pipeline( + steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=11))] +) -# we only take the first two features. We could avoid this ugly -# slicing by using a two-dim dataset -X = iris.data[:, :2] -y = iris.target +# %% +# Decision boundary +# ----------------- +# +# Now, we fit two classifiers with different values of the parameter +# `weights`. We plot the decision boundary of each classifier as well as the original +# dataset to observe the difference. +import matplotlib.pyplot as plt -# Create color maps -cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"]) -cmap_bold = ["darkorange", "c", "darkblue"] +from sklearn.inspection import DecisionBoundaryDisplay -for weights in ["uniform", "distance"]: - # we create an instance of Neighbours Classifier and fit the data. - clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) - clf.fit(X, y) +_, axs = plt.subplots(ncols=2, figsize=(12, 5)) - _, ax = plt.subplots() - DecisionBoundaryDisplay.from_estimator( +for ax, weights in zip(axs, ("uniform", "distance")): + clf.set_params(knn__weights=weights).fit(X_train, y_train) + disp = DecisionBoundaryDisplay.from_estimator( clf, - X, - cmap=cmap_light, - ax=ax, + X_test, response_method="predict", plot_method="pcolormesh", xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], shading="auto", + alpha=0.5, + ax=ax, ) - - # Plot also the training points - sns.scatterplot( - x=X[:, 0], - y=X[:, 1], - hue=iris.target_names[y], - palette=cmap_bold, - alpha=1.0, - edgecolor="black", + scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors="k") + disp.ax_.legend( + scatter.legend_elements()[0], + iris.target_names, + loc="lower left", + title="Classes", ) - plt.title( - "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights) + _ = disp.ax_.set_title( + f"3-Class classification\n(k={clf[-1].n_neighbors}, weights={weights!r})" ) plt.show() + +# %% +# Conclusion +# ---------- +# +# We observe that the parameter `weights` has an impact on the decision boundary. When +# `weights="unifom"` all nearest neighbors will have the same impact on the decision. +# Whereas when `weights="distance"` the weight given to each neighbor is proportional +# to the inverse of the distance from that neighbor to the query point. +# +# In some cases, taking the distance into account might improve the model. diff --git a/examples/preprocessing/plot_target_encoder_cross_val.py b/examples/preprocessing/plot_target_encoder_cross_val.py index f4ff643d8b48e..7244a1bf61cd6 100644 --- a/examples/preprocessing/plot_target_encoder_cross_val.py +++ b/examples/preprocessing/plot_target_encoder_cross_val.py @@ -6,21 +6,26 @@ .. currentmodule:: sklearn.preprocessing The :class:`TargetEncoder` replaces each category of a categorical feature with -the mean of the target variable for that category. This method is useful +the shrunk mean of the target variable for that category. This method is useful in cases where there is a strong relationship between the categorical feature and the target. 
To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses -an internal cross fitting scheme to encode the training data to be used by a -downstream model. In this example, we demonstrate the importance of the cross fitting -procedure to prevent overfitting. +an internal :term:`cross fitting` scheme to encode the training data to be used +by a downstream model. This scheme involves splitting the data into *k* folds +and encoding each fold using the encodings learnt using the other *k-1* folds. +In this example, we demonstrate the importance of the cross +fitting procedure to prevent overfitting. """ # %% # Create Synthetic Dataset # ======================== -# For this example, we build a dataset with three categorical features: an informative -# feature with medium cardinality, an uninformative feature with medium cardinality, -# and an uninformative feature with high cardinality. First, we generate the informative -# feature: +# For this example, we build a dataset with three categorical features: +# +# * an informative feature with medium cardinality ("informative") +# * an uninformative feature with medium cardinality ("shuffled") +# * an uninformative feature with high cardinality ("near_unique") +# +# First, we generate the informative feature: import numpy as np from sklearn.preprocessing import KBinsDiscretizer @@ -33,12 +38,16 @@ n_categories = 100 kbins = KBinsDiscretizer( - n_bins=n_categories, encode="ordinal", strategy="uniform", random_state=rng + n_bins=n_categories, + encode="ordinal", + strategy="uniform", + random_state=rng, + subsample=None, ) X_informative = kbins.fit_transform((y + noise).reshape(-1, 1)) -# Remove the linear relationship between y and the bin index by permuting the values of -# X_informative +# Remove the linear relationship between y and the bin index by permuting the +# values of X_informative: permuted_categories = rng.permutation(n_categories) X_informative = permuted_categories[X_informative.astype(np.int32)] @@ -48,13 +57,13 @@ X_shuffled = rng.permutation(X_informative) # %% -# The uninformative feature with high cardinality is generated so that is independent of -# the target variable. We will show that target encoding without cross fitting will -# cause catastrophic overfitting for the downstream regressor. These high cardinality -# features are basically unique identifiers for samples which should generally be -# removed from machine learning dataset. In this example, we generate them to show how -# :class:`TargetEncoder`'s default cross fitting behavior mitigates the overfitting -# issue automatically. +# The uninformative feature with high cardinality is generated so that it is +# independent of the target variable. We will show that target encoding without +# :term:`cross fitting` will cause catastrophic overfitting for the downstream +# regressor. These high cardinality features are basically unique identifiers +# for samples which should generally be removed from machine learning datasets. +# In this example, we generate them to show how :class:`TargetEncoder`'s default +# :term:`cross fitting` behavior mitigates the overfitting issue automatically. X_near_unique_categories = rng.choice( int(0.9 * n_samples), size=n_samples, replace=True ).reshape(-1, 1) @@ -79,9 +88,10 @@ # ========================== # In this section, we train a ridge regressor on the dataset with and without # encoding and explore the influence of target encoder with and without the -# internal cross fitting. 
First, we see the Ridge model trained on the -# raw features will have low performance, because the order of the informative -# feature is not informative: +# internal :term:`cross fitting`. First, we see the Ridge model trained on the +# raw features will have low performance. This is because we permuted the order +# of the informative feature meaning `X_informative` is not informative when +# raw: import sklearn from sklearn.linear_model import Ridge @@ -96,15 +106,15 @@ # %% # Next, we create a pipeline with the target encoder and ridge model. The pipeline -# uses :meth:`TargetEncoder.fit_transform` which uses cross fitting. We see that -# the model fits the data well and generalizes to the test set: +# uses :meth:`TargetEncoder.fit_transform` which uses :term:`cross fitting`. We +# see that the model fits the data well and generalizes to the test set: from sklearn.pipeline import make_pipeline from sklearn.preprocessing import TargetEncoder -model_with_cv = make_pipeline(TargetEncoder(random_state=0), ridge) -model_with_cv.fit(X_train, y_train) -print("Model with CV on training set: ", model_with_cv.score(X_train, y_train)) -print("Model with CV on test set: ", model_with_cv.score(X_test, y_test)) +model_with_cf = make_pipeline(TargetEncoder(random_state=0), ridge) +model_with_cf.fit(X_train, y_train) +print("Model with CF on train set: ", model_with_cf.score(X_train, y_train)) +print("Model with CF on test set: ", model_with_cf.score(X_test, y_test)) # %% # The coefficients of the linear model shows that most of the weight is on the @@ -114,49 +124,68 @@ plt.rcParams["figure.constrained_layout.use"] = True -coefs_cv = pd.Series( - model_with_cv[-1].coef_, index=model_with_cv[-1].feature_names_in_ +coefs_cf = pd.Series( + model_with_cf[-1].coef_, index=model_with_cf[-1].feature_names_in_ ).sort_values() -_ = coefs_cv.plot(kind="barh") +ax = coefs_cf.plot(kind="barh") +_ = ax.set( + title="Target encoded with cross fitting", + xlabel="Ridge coefficient", + ylabel="Feature", +) # %% -# While :meth:`TargetEncoder.fit_transform` uses an internal cross fitting scheme, -# :meth:`TargetEncoder.transform` itself does not perform any cross fitting. -# It uses the aggregation of the complete training set to transform the categorical -# features. Thus, we can use :meth:`TargetEncoder.fit` followed by -# :meth:`TargetEncoder.transform` to disable the cross fitting. This encoding -# is then passed to the ridge model. +# While :meth:`TargetEncoder.fit_transform` uses an internal +# :term:`cross fitting` scheme to learn encodings for the training set, +# :meth:`TargetEncoder.transform` itself does not. +# It uses the complete training set to learn encodings and to transform the +# categorical features. Thus, we can use :meth:`TargetEncoder.fit` followed by +# :meth:`TargetEncoder.transform` to disable the :term:`cross fitting`. This +# encoding is then passed to the ridge model. 
target_encoder = TargetEncoder(random_state=0) target_encoder.fit(X_train, y_train) -X_train_no_cv_encoding = target_encoder.transform(X_train) -X_test_no_cv_encoding = target_encoder.transform(X_test) +X_train_no_cf_encoding = target_encoder.transform(X_train) +X_test_no_cf_encoding = target_encoder.transform(X_test) -model_no_cv = ridge.fit(X_train_no_cv_encoding, y_train) +model_no_cf = ridge.fit(X_train_no_cf_encoding, y_train) # %% -# We evaluate the model on the non-cross validated encoding and see that it overfits: +# We evaluate the model that did not use :term:`cross fitting` when encoding and +# see that it overfits: print( - "Model without CV on training set: ", - model_no_cv.score(X_train_no_cv_encoding, y_train), + "Model without CF on training set: ", + model_no_cf.score(X_train_no_cf_encoding, y_train), ) print( - "Model without CV on test set: ", model_no_cv.score(X_test_no_cv_encoding, y_test) + "Model without CF on test set: ", + model_no_cf.score( + X_test_no_cf_encoding, + y_test, + ), ) # %% -# The ridge model overfits, because it assigns more weight to the extremely high -# cardinality feature relative to the informative feature. -coefs_no_cv = pd.Series( - model_no_cv.coef_, index=model_no_cv.feature_names_in_ +# The ridge model overfits because it assigns much more weight to the +# uninformative extremely high cardinality ("near_unique") and medium +# cardinality ("shuffled") features than when the model used +# :term:`cross fitting` to encode the features. +coefs_no_cf = pd.Series( + model_no_cf.coef_, index=model_no_cf.feature_names_in_ ).sort_values() -_ = coefs_no_cv.plot(kind="barh") +ax = coefs_no_cf.plot(kind="barh") +_ = ax.set( + title="Target encoded without cross fitting", + xlabel="Ridge coefficient", + ylabel="Feature", +) # %% # Conclusion # ========== -# This example demonstrates the importance of :class:`TargetEncoder`'s internal cross -# fitting. It is important to use :meth:`TargetEncoder.fit_transform` to encode -# training data before passing it to a machine learning model. When a -# :class:`TargetEncoder` is a part of a :class:`~sklearn.pipeline.Pipeline` and the -# pipeline is fitted, the pipeline will correctly call -# :meth:`TargetEncoder.fit_transform` and pass the encoding along. +# This example demonstrates the importance of :class:`TargetEncoder`'s internal +# :term:`cross fitting`. It is important to use +# :meth:`TargetEncoder.fit_transform` to encode training data before passing it +# to a machine learning model. When a :class:`TargetEncoder` is a part of a +# :class:`~sklearn.pipeline.Pipeline` and the pipeline is fitted, the pipeline +# will correctly call :meth:`TargetEncoder.fit_transform` and use +# :term:`cross fitting` when encoding the training data. 
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index 7c6836632e3f0..d7ae7465a590b 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================== Release Highlights for scikit-learn 0.23 diff --git a/examples/release_highlights/plot_release_highlights_0_24_0.py b/examples/release_highlights/plot_release_highlights_0_24_0.py index a55b4aabc7994..29082c1a078f4 100644 --- a/examples/release_highlights/plot_release_highlights_0_24_0.py +++ b/examples/release_highlights/plot_release_highlights_0_24_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================== Release Highlights for scikit-learn 0.24 diff --git a/examples/release_highlights/plot_release_highlights_1_0_0.py b/examples/release_highlights/plot_release_highlights_1_0_0.py index 383612e611688..7ac09dd193c0f 100644 --- a/examples/release_highlights/plot_release_highlights_1_0_0.py +++ b/examples/release_highlights/plot_release_highlights_1_0_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.0 diff --git a/examples/release_highlights/plot_release_highlights_1_1_0.py b/examples/release_highlights/plot_release_highlights_1_1_0.py index f6432cf15037c..b3058a7e0aa27 100644 --- a/examples/release_highlights/plot_release_highlights_1_1_0.py +++ b/examples/release_highlights/plot_release_highlights_1_1_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.1 diff --git a/examples/release_highlights/plot_release_highlights_1_2_0.py b/examples/release_highlights/plot_release_highlights_1_2_0.py index 8165c3bc4eed0..695e74cfcdd64 100644 --- a/examples/release_highlights/plot_release_highlights_1_2_0.py +++ b/examples/release_highlights/plot_release_highlights_1_2_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.2 diff --git a/examples/release_highlights/plot_release_highlights_1_3_0.py b/examples/release_highlights/plot_release_highlights_1_3_0.py index 8fa1ea057ac91..5ce2617cd08aa 100644 --- a/examples/release_highlights/plot_release_highlights_1_3_0.py +++ b/examples/release_highlights/plot_release_highlights_1_3_0.py @@ -1,4 +1,4 @@ -# flake8: noqa +# ruff: noqa """ ======================================= Release Highlights for scikit-learn 1.3 diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index b3d834da5d067..99b9e6b18b109 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. 
""" # %% diff --git a/min_dependency_substitutions.rst b/min_dependency_substitutions.rst new file mode 100644 index 0000000000000..575b003b15a32 --- /dev/null +++ b/min_dependency_substitutions.rst @@ -0,0 +1,28 @@ +.. |NumpyMinVersion| replace:: 1.17.3 +.. |ScipyMinVersion| replace:: 1.5.0 +.. |JoblibMinVersion| replace:: 1.1.1 +.. |ThreadpoolctlMinVersion| replace:: 2.0.0 +.. |CythonMinVersion| replace:: 0.29.33 +.. |MatplotlibMinVersion| replace:: 3.1.3 +.. |Scikit-imageMinVersion| replace:: 0.16.2 +.. |PandasMinVersion| replace:: 1.0.5 +.. |SeabornMinVersion| replace:: 0.9.0 +.. |Memory_profilerMinVersion| replace:: 0.57.0 +.. |PytestMinVersion| replace:: 7.1.2 +.. |Pytest-covMinVersion| replace:: 2.9.0 +.. |RuffMinVersion| replace:: 0.0.272 +.. |BlackMinVersion| replace:: 23.3.0 +.. |MypyMinVersion| replace:: 1.3 +.. |PyamgMinVersion| replace:: 4.0.0 +.. |PolarsMinVersion| replace:: 0.18.2 +.. |PyarrowMinVersion| replace:: 12.0.0 +.. |SphinxMinVersion| replace:: 6.0.0 +.. |Sphinx-copybuttonMinVersion| replace:: 0.5.2 +.. |Sphinx-galleryMinVersion| replace:: 0.10.1 +.. |NumpydocMinVersion| replace:: 1.2.0 +.. |PillowMinVersion| replace:: 7.1.2 +.. |PoochMinVersion| replace:: 1.6.0 +.. |Sphinx-promptMinVersion| replace:: 1.3.0 +.. |Sphinxext-opengraphMinVersion| replace:: 0.4.2 +.. |PlotlyMinVersion| replace:: 5.14.0 +.. |Conda-lockMinVersion| replace:: 2.1.1 diff --git a/min_dependency_table.rst b/min_dependency_table.rst new file mode 100644 index 0000000000000..3a223a0fef797 --- /dev/null +++ b/min_dependency_table.rst @@ -0,0 +1,32 @@ +======================= =================== ==================================== +Dependency Minimum Version Purpose +======================= =================== ==================================== +numpy 1.17.3 build, install +scipy 1.5.0 build, install +joblib 1.1.1 install +threadpoolctl 2.0.0 install +cython 0.29.33 build +matplotlib 3.1.3 benchmark, docs, examples, tests +scikit-image 0.16.2 docs, examples, tests +pandas 1.0.5 benchmark, docs, examples, tests +seaborn 0.9.0 docs, examples +memory_profiler 0.57.0 benchmark, docs +pytest 7.1.2 tests +pytest-cov 2.9.0 tests +ruff 0.0.272 tests +black 23.3.0 tests +mypy 1.3 tests +pyamg 4.0.0 tests +polars 0.18.2 tests +pyarrow 12.0.0 tests +sphinx 6.0.0 docs +sphinx-copybutton 0.5.2 docs +sphinx-gallery 0.10.1 docs +numpydoc 1.2.0 docs, tests +Pillow 7.1.2 docs +pooch 1.6.0 docs, examples, tests +sphinx-prompt 1.3.0 docs +sphinxext-opengraph 0.4.2 docs +plotly 5.14.0 docs, examples +conda-lock 2.1.1 maintenance +======================= =================== ==================================== diff --git a/pyproject.toml b/pyproject.toml index efd72adf44392..c98ed2130189f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ exclude=[ # + E501 (line too long) because keeping it < 88 in cython # often makes code less readable. ignore = [ - # check ignored by default in flake8. Meaning unclear. + # multiple spaces/tab after comma 'E24', # space before : (needed for how black formats slicing) 'E203', diff --git a/setup.cfg b/setup.cfg index d91a27344c575..b7705781dbb7d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,6 @@ addopts = # correctly on the CI when running `pytest --pyargs sklearn` from the # source folder. 
-p sklearn.tests.random_seed - -rN filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning @@ -54,6 +53,9 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx + sklearn/neighbors/_ball_tree.pyx + sklearn/neighbors/_binary_tree.pxi + sklearn/neighbors/_kd_tree.pyx [codespell] diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 5af738f5f841f..5c008944ec05c --- a/setup.py +++ b/setup.py @@ -29,19 +29,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -169,11 +169,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -225,10 +225,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -306,10 +306,11 @@ def check_package_status(package, min_version): }, ], "neighbors": [ - {"sources": ["_ball_tree.pyx"], "include_np": True}, - {"sources": ["_kd_tree.pyx"], "include_np": True}, + {"sources": ["_binary_tree.pxi.tp"], "include_np": True}, + {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, + {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -377,9 +378,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, 
"optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, @@ -499,13 +515,18 @@ def configure_extension_modules(): # `source` is a Tempita file tempita_sources.append(source) - # Do not include pxd files that were generated by tempita - if os.path.splitext(new_source_path)[-1] == ".pxd": - continue - sources.append(new_source_path) + # Only include source files that are pyx files + if os.path.splitext(new_source_path)[-1] == ".pyx": + sources.append(new_source_path) gen_from_templates(tempita_sources) + # Do not progress if we only have a tempita file which we don't + # want to include like the .pxi.tp extension. In such a case + # sources would be empty. + if not sources: + continue + # By convention, our extensions always use the name of the first source source_name = os.path.splitext(os.path.basename(sources[0]))[0] if submodule: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index f3b61da0915d5..11cb0e42c47f6 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -113,7 +113,7 @@ class BaseLoss: Indicates whether n_classes > 2 is allowed. """ - # For decision trees: + # For gradient boosted decision trees: # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to # predict a Newton-Raphson step (see grower._finalize_leaf()). But for @@ -122,8 +122,8 @@ class BaseLoss: # procedure. See the original paper Greedy Function Approximation: A # Gradient Boosting Machine by Friedman # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. - need_update_leaves_values = False differentiable = True + need_update_leaves_values = False is_multiclass = False def __init__(self, closs, link, n_classes=None): @@ -543,6 +543,10 @@ class AbsoluteError(BaseLoss): For a given sample x_i, the absolute error is defined as:: loss(x_i) = |y_true_i - raw_prediction_i| + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. """ differentiable = False @@ -585,6 +589,10 @@ class PinballLoss(BaseLoss): Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. 
+ Additional Attributes --------------------- quantile : float diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 432ca9e25b152..8d9a964aea172 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -378,10 +378,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", + self, + "fit", sample_weight=sample_weight, - other_params=fit_params, + **fit_params, ) else: # sample_weight checks @@ -450,7 +450,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): cv=cv, method=method_name, n_jobs=self.n_jobs, - fit_params=routed_params.estimator.fit, + params=routed_params.estimator.fit, ) predictions = _compute_predictions( pred_method, method_name, X, n_classes @@ -1186,7 +1186,7 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): f"(Positive class: {self.pos_label})" if self.pos_label is not None else "" ) - line_kwargs = {} + line_kwargs = {"marker": "s", "linestyle": "-"} if name is not None: line_kwargs["label"] = name line_kwargs.update(**kwargs) @@ -1195,9 +1195,7 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): existing_ref_line = ref_line_label in self.ax_.get_legend_handles_labels()[1] if ref_line and not existing_ref_line: self.ax_.plot([0, 1], [0, 1], "k:", label=ref_line_label) - self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[ - 0 - ] + self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, **line_kwargs)[0] # We always have to show the legend for at least the reference line self.ax_.legend(loc="lower right") diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 7280bc31423ae..4dd09c9531c44 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -22,6 +22,8 @@ from ._dbscan_inner import dbscan_inner +# This function is not validated using validate_params because +# it's just a factory for DBSCAN. def dbscan( X, eps=0.5, @@ -172,6 +174,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density. + The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can + occur when the `eps` param is large and `min_samples` is low. + Read more in the :ref:`User Guide `. Parameters @@ -184,8 +189,11 @@ class DBSCAN(ClusterMixin, BaseEstimator): and distance function. min_samples : int, default=5 - The number of samples (or total weight) in a neighborhood for a point - to be considered as a core point. This includes the point itself. + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. If + `min_samples` is set to a higher value, DBSCAN will find denser clusters, + whereas if it is set to a lower value, the found clusters will be more + sparse. metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 57de8962250b1..f8a37c52f55dc 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -462,12 +462,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): A distance scaling parameter as used in robust single linkage. See [3]_ for more information. 
- algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" + algorithm : {"auto", "brute", "kd_tree", "ball_tree"}, default="auto" Exactly which algorithm to use for computing core distances; By default this is set to `"auto"` which attempts to use a :class:`~sklearn.neighbors.KDTree` tree if possible, otherwise it uses - a :class:`~sklearn.neighbors.BallTree` tree. Both `"KDTree"` and - `"BallTree"` algorithms use the + a :class:`~sklearn.neighbors.BallTree` tree. Both `"kd_tree"` and + `"ball_tree"` algorithms use the :class:`~sklearn.neighbors.NearestNeighbors` estimator. If the `X` passed during `fit` is sparse or `metric` is invalid for @@ -475,6 +475,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator): :class:`~sklearn.neighbors.BallTree`, then it resolves to use the `"brute"` algorithm. + .. deprecated:: 1.4 + The `'kdtree'` option was deprecated in version 1.4, + and will be renamed to `'kd_tree'` in 1.6. + + .. deprecated:: 1.4 + The `'balltree'` option was deprecated in version 1.4, + and will be renamed to `'ball_tree'` in 1.6. + leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries when a KDTree or a BallTree are used as core-distance algorithms. A large @@ -625,15 +633,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): "metric": [StrOptions(FAST_METRICS | {"precomputed"}), callable], "metric_params": [dict, None], "alpha": [Interval(Real, left=0, right=None, closed="neither")], + # TODO(1.6): Remove "kdtree" and "balltree" option "algorithm": [ StrOptions( - { - "auto", - "brute", - "kdtree", - "balltree", - } - ) + {"auto", "brute", "kd_tree", "ball_tree", "kdtree", "balltree"}, + deprecated={"kdtree", "balltree"}, + ), ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "n_jobs": [Integral, None], @@ -759,6 +764,31 @@ def fit(self, X, y=None): f"min_samples ({self._min_samples}) must be at most the number of" f" samples in X ({X.shape[0]})" ) + + # TODO(1.6): Remove + if self.algorithm == "kdtree": + warn( + ( + "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed" + " to'kd_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='kd_tree'`." + ), + FutureWarning, + ) + self.algorithm = "kd_tree" + + # TODO(1.6): Remove + if self.algorithm == "balltree": + warn( + ( + "`algorithm='balltree'`has been deprecated in 1.4 and will be" + " renamed to'ball_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='ball_tree'`." + ), + FutureWarning, + ) + self.algorithm = "ball_tree" + mst_func = None kwargs = dict( X=X, @@ -768,12 +798,14 @@ def fit(self, X, y=None): n_jobs=self.n_jobs, **self._metric_params, ) - if self.algorithm == "kdtree" and self.metric not in KDTree.valid_metrics: + if self.algorithm == "kd_tree" and self.metric not in KDTree.valid_metrics: raise ValueError( f"{self.metric} is not a valid metric for a KDTree-based algorithm." " Please select a different metric." ) - elif self.algorithm == "balltree" and self.metric not in BallTree.valid_metrics: + elif ( + self.algorithm == "ball_tree" and self.metric not in BallTree.valid_metrics + ): raise ValueError( f"{self.metric} is not a valid metric for a BallTree-based algorithm." " Please select a different metric." 
@@ -790,11 +822,11 @@ def fit(self, X, y=None): if self.algorithm == "brute": mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.algorithm == "kdtree": + elif self.algorithm == "kd_tree": mst_func = _hdbscan_prims kwargs["algo"] = "kd_tree" kwargs["leaf_size"] = self.leaf_size - elif self.algorithm == "balltree": + else: mst_func = _hdbscan_prims kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index c0c281ce31475..63087e75185dc 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -28,8 +28,8 @@ X = StandardScaler().fit_transform(X) ALGORITHMS = [ - "kdtree", - "balltree", + "kd_tree", + "ball_tree", "brute", "auto", ] @@ -149,8 +149,8 @@ def test_hdbscan_algorithms(algo, metric): return ALGOS_TREES = { - "kdtree": KDTree, - "balltree": BallTree, + "kd_tree": KDTree, + "ball_tree": BallTree, } metric_params = { "mahalanobis": {"V": np.eye(X.shape[1])}, @@ -287,22 +287,37 @@ def test_hdbscan_precomputed_non_brute(tree): def test_hdbscan_sparse(): """ Tests that HDBSCAN works correctly when passing sparse feature data. + Evaluates correctness by comparing against the same data passed as a dense + array. """ - sparse_X = sparse.csr_matrix(X) - labels = HDBSCAN().fit(sparse_X).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) + dense_labels = HDBSCAN().fit(X).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 - sparse_X_nan = sparse_X.copy() - sparse_X_nan[0, 0] = np.nan - labels = HDBSCAN().fit(sparse_X_nan).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) - assert n_clusters == 3 + _X_sparse = sparse.csr_matrix(X) + X_sparse = _X_sparse.copy() + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + # Compare that the sparse and dense non-precomputed routines return the same labels + # where the 0th observation contains the outlier. + for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): + X_dense = X.copy() + X_dense[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(X_dense).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) + assert n_clusters == 3 + assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + + X_sparse = _X_sparse.copy() + X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) msg = "Sparse data matrices only support algorithm `brute`." 
with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) @pytest.mark.parametrize("algorithm", ALGORITHMS) @@ -353,7 +368,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): cluster_selection_epsilon=0.18, cluster_selection_method="eom", allow_single_cluster=True, - algorithm="kdtree", + algorithm="kd_tree", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 @@ -418,16 +433,16 @@ def test_hdbscan_tree_invalid_metric(): # Callables are not supported for either with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kdtree", metric=metric_callable).fit(X) + HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="balltree", metric=metric_callable).fit(X) + HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) # The set of valid metrics for KDTree at the time of writing this test is a # strict subset of those supported in BallTree metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) if len(metrics_not_kd) > 0: with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kdtree", metric=metrics_not_kd[0]).fit(X) + HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) def test_hdbscan_too_many_min_samples(): @@ -531,3 +546,23 @@ def test_labelling_thresholding(): # and the largest value is exactly MAX_LAMBDA. num_noise = condensed_tree["value"] < MAX_LAMBDA assert sum(num_noise) == sum(labels == -1) + + +# TODO(1.6): Remove +def test_hdbscan_warning_on_deprecated_algorithm_name(): + # Test that warning message is shown when algorithm='kdtree' + msg = ( + "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed" + " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`." + ) + with pytest.warns(FutureWarning, match=msg): + HDBSCAN(algorithm="kdtree").fit(X) + + # Test that warning message is shown when algorithm='balltree' + msg = ( + "`algorithm='balltree'`has been deprecated in 1.4 and will be renamed" + " to'ball_tree'`in 1.6. To keep the past behaviour, set" + " `algorithm='ball_tree'`." + ) + with pytest.warns(FutureWarning, match=msg): + HDBSCAN(algorithm="balltree").fit(X) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index f1fc90af11d82..822a13064bb08 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -238,7 +238,10 @@ def fit(self, X, Y): Y, input_name="Y", dtype=np.float64, copy=self.copy, ensure_2d=False ) if Y.ndim == 1: + self._predict_1d = True Y = Y.reshape(-1, 1) + else: + self._predict_1d = False n = X.shape[0] p = X.shape[1] @@ -469,8 +472,8 @@ def predict(self, X, copy=True): # Normalize X -= self._x_mean X /= self._x_std - Ypred = X @ self.coef_.T - return Ypred + self.intercept_ + Ypred = X @ self.coef_.T + self.intercept_ + return Ypred.ravel() if self._predict_1d else Ypred def fit_transform(self, X, y=None): """Learn and apply the dimension reduction on the train data. 
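
A short usage sketch of the renamed ``algorithm`` options for
:class:`~sklearn.cluster.HDBSCAN` introduced above; the old spellings keep working
until 1.6 but raise a ``FutureWarning``. The toy data is illustrative::

    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

    # New spellings are "kd_tree" and "ball_tree"; "kdtree" and "balltree" are
    # deprecated and will be removed in 1.6.
    labels = HDBSCAN(min_cluster_size=10, algorithm="kd_tree").fit(X).labels_
    print(sorted(set(labels)))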
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index fcdd927efb389..b8b5cbaa0f275 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -12,7 +12,9 @@ _svd_flip_1d, ) from sklearn.datasets import load_linnerud, make_regression +from sklearn.ensemble import VotingRegressor from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression from sklearn.utils import check_random_state from sklearn.utils.extmath import svd_flip @@ -621,3 +623,24 @@ def test_pls_set_output(Klass): assert isinstance(y_trans, np.ndarray) assert isinstance(X_trans, pd.DataFrame) assert_array_equal(X_trans.columns, est.get_feature_names_out()) + + +def test_pls_regression_fit_1d_y(): + """Check that when fitting with 1d `y`, prediction should also be 1d. + + Non-regression test for Issue #26549. + """ + X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + y = np.array([2, 6, 12, 20, 30, 42]) + expected = y.copy() + + plsr = PLSRegression().fit(X, y) + y_pred = plsr.predict(X) + assert y_pred.shape == expected.shape + + # Check that it works in VotingRegressor + lr = LinearRegression().fit(X, y) + vr = VotingRegressor([("lr", lr), ("plsr", plsr)]) + y_pred = vr.fit(X, y).predict(X) + assert y_pred.shape == expected.shape + assert_allclose(y_pred, expected) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index a29545b1941a5..7348044e0d8fa 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -43,13 +43,14 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -57,9 +58,24 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import _safe_tags +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, +) + from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -68,17 +84,6 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils.multiclass import 
check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, -) -from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", @@ -159,6 +164,7 @@ def _parallel_build_trees( verbose=0, class_weight=None, n_samples_bootstrap=None, + missing_values_in_feature_mask=None, ): """ Private function used to fit a single tree in parallel.""" @@ -185,9 +191,21 @@ def _parallel_build_trees( elif class_weight == "balanced_subsample": curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) - tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) + tree._fit( + X, + y, + sample_weight=curr_sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) else: - tree.fit(X, y, sample_weight=sample_weight, check_input=False) + tree._fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) return tree @@ -213,6 +231,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -231,6 +254,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -247,6 +272,8 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -266,6 +293,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -345,9 +381,26 @@ def fit(self, X, y, sample_weight=None): # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") + X, y = self._validate_data( - X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, ) + # _compute_missing_values_in_feature_mask checks if X has missing values and + # will raise an error if the underlying tree base estimator can't handle missing + # values. Only the criterion is required to determine if the tree supports + # missing values. 
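+        # The resulting mask is forwarded to each tree's private `_fit` through
+        # `_parallel_build_trees`, so missing-value support only needs to be
+        # determined once for the whole forest.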
+ estimator = type(self.estimator)(criterion=self.criterion) + missing_values_in_feature_mask = ( + estimator._compute_missing_values_in_feature_mask( + X, estimator_name=self.__class__.__name__ + ) + ) + if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -424,6 +477,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -469,6 +554,7 @@ def fit(self, X, y, sample_weight=None): verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, ) for i, t in enumerate(trees) ) @@ -596,7 +682,18 @@ def _validate_X_predict(self, X): """ Validate X whenever one tries to predict, apply, predict_proba.""" check_is_fitted(self) - X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) + if self.estimators_[0]._support_missing_values(X): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = self._validate_data( + X, + dtype=DTYPE, + accept_sparse="csr", + reset=False, + force_all_finite=force_all_finite, + ) if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): raise ValueError("No support for np.int64 index based sparse matrices") return X @@ -636,6 +733,180 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. 
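+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data to bin.
+        is_training_data : bool
+            Whether ``X`` is training data. If True, the ``_bin_mapper`` is
+            fitted on ``X``; otherwise the already-fitted mapper is applied.
+
+        Returns
+        -------
+        X_binned : ndarray of shape (n_samples, n_features)
+            The binned data. Validation data is converted to a C-contiguous
+            array, since prediction is faster with that layout.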
+ """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] 
= self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + + def _more_tags(self): + # Only the criterion is required to determine if the tree supports + # missing values + estimator = type(self.estimator)(criterion=self.criterion) + return {"allow_nan": _safe_tags(estimator, key="allow_nan")} + def _accumulate_prediction(predict, X, out, lock): """ @@ -653,6 +924,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
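The two public helpers added above (`predict_quantiles` and
`get_leaf_node_samples`) only work when the forest was fitted with
`store_leaf_values=True`, since they pool the training samples stored in the
leaves that each query point reaches across all trees. A minimal usage sketch
under that assumption; the dataset and parameter values are illustrative, and
`store_leaf_values` plus both methods are the additions shown above:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X, y = make_regression(n_samples=200, n_features=4, random_state=0)
    est = RandomForestRegressor(
        n_estimators=50, store_leaf_values=True, random_state=0
    ).fit(X, y)

    # One row per query point, one column per requested quantile.
    y_q = est.predict_quantiles(X[:3], quantiles=[0.1, 0.5, 0.9])
    print(y_q.shape)                         # (3, 3)

    # One array per query point, holding the training samples of the reached
    # leaves; the second dimension is n_outputs_ (1 here).
    leaves = est.get_leaf_node_samples(X[:3])
    print(len(leaves), leaves[0].shape[1])   # 3 1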
@@ -677,6 +959,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -691,6 +975,8 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -864,6 +1150,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -945,6 +1239,8 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -958,6 +1254,8 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -983,6 +1281,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1273,6 +1579,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1432,6 +1748,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1448,6 +1766,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1458,6 +1777,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1649,6 +1970,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. 
+ monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -1792,6 +2124,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1808,6 +2142,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1817,6 +2152,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2016,6 +2353,16 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2164,6 +2511,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2180,6 +2529,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2190,6 +2540,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2377,6 +2729,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2505,6 +2867,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2521,6 +2885,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2530,6 +2895,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2653,6 +3020,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`gradient_boosting_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
+ Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2756,6 +3126,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2770,6 +3141,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2778,6 +3150,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c39e330d63536..f1e7b7d6e063a 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -136,6 +136,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") _parameter_constraints.pop("monotonic_cst") @@ -242,13 +243,14 @@ def _fit_stage( # no inplace multiplication! sample_weight = sample_weight * sample_mask.astype(np.float64) - X = X_csr if X_csr is not None else X + X = X_csc if X_csc is not None else X tree.fit(X, residual, sample_weight=sample_weight, check_input=False) # update tree leaves + X_for_tree_update = X_csr if X_csr is not None else X loss.update_terminal_regions( tree.tree_, - X, + X_for_tree_update, y, residual, raw_predictions, @@ -433,16 +435,18 @@ def fit(self, X, y, sample_weight=None, monitor=None): if self.n_iter_no_change is not None: stratify = y if is_classifier(self) else None - X, X_val, y, y_val, sample_weight, sample_weight_val = train_test_split( - X, - y, - sample_weight, - random_state=self.random_state, - test_size=self.validation_fraction, - stratify=stratify, + X_train, X_val, y_train, y_val, sample_weight_train, sample_weight_val = ( + train_test_split( + X, + y, + sample_weight, + random_state=self.random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) ) if is_classifier(self): - if self._n_classes != np.unique(y).shape[0]: + if self._n_classes != np.unique(y_train).shape[0]: # We choose to error here. The problem is that the init # estimator would be trained on y, which has some missing # classes now, so its predictions would not have the @@ -453,6 +457,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): "seed." 
) else: + X_train, y_train, sample_weight_train = X, y, sample_weight X_val = y_val = sample_weight_val = None if not self._is_initialized(): @@ -462,19 +467,21 @@ def fit(self, X, y, sample_weight=None, monitor=None): # fit initial model and initialize raw predictions if self.init_ == "zero": raw_predictions = np.zeros( - shape=(X.shape[0], self._loss.K), dtype=np.float64 + shape=(X_train.shape[0], self._loss.K), dtype=np.float64 ) else: # XXX clean this once we have a support_sample_weight tag if sample_weight_is_none: - self.init_.fit(X, y) + self.init_.fit(X_train, y_train) else: msg = ( "The initial estimator {} does not support sample " "weights.".format(self.init_.__class__.__name__) ) try: - self.init_.fit(X, y, sample_weight=sample_weight) + self.init_.fit( + X_train, y_train, sample_weight=sample_weight_train + ) except TypeError as e: if "unexpected keyword argument 'sample_weight'" in str(e): # regular estimator without SW support @@ -492,7 +499,9 @@ def fit(self, X, y, sample_weight=None, monitor=None): else: # regular estimator whose input checking failed raise - raw_predictions = self._loss.get_init_raw_predictions(X, self.init_) + raw_predictions = self._loss.get_init_raw_predictions( + X_train, self.init_ + ) begin_at_stage = 0 @@ -512,22 +521,22 @@ def fit(self, X, y, sample_weight=None, monitor=None): # The requirements of _raw_predict # are more constrained than fit. It accepts only CSR # matrices. Finite values have already been checked in _validate_data. - X = check_array( - X, + X_train = check_array( + X_train, dtype=DTYPE, order="C", accept_sparse="csr", force_all_finite=False, ) - raw_predictions = self._raw_predict(X) + raw_predictions = self._raw_predict(X_train) self._resize_state() # fit the boosting stages n_stages = self._fit_stages( - X, - y, + X_train, + y_train, raw_predictions, - sample_weight, + sample_weight_train, self._rng, X_val, y_val, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5d030d3add5bb..c3af930654b73 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -59,13 +59,23 @@ def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): Update equals: loss.fit_intercept_only(y_true - raw_prediction) - This is only applied if loss.need_update_leaves_values is True. + This is only applied if loss.differentiable is False. Note: It only works, if the loss is a function of the residual, as is the case for AbsoluteError and PinballLoss. Otherwise, one would need to get the minimum of loss(y_true, raw_prediction + x) in x. A few examples: - AbsoluteError: median(y_true - raw_prediction). - PinballLoss: quantile(y_true - raw_prediction). - See also notes about need_update_leaves_values in BaseLoss. + + More background: + For the standard gradient descent method according to "Greedy Function + Approximation: A Gradient Boosting Machine" by Friedman, all loss functions but the + squared loss need a line search step. BaseHistGradientBoosting, however, implements + a so called Newton boosting where the trees are fitted to a 2nd order + approximations of the loss in terms of gradients and hessians. In this case, the + line search step is only necessary if the loss is not smooth, i.e. not + differentiable, which renders the 2nd order approximation invalid. 
In fact, + non-smooth losses arbitrarily set hessians to 1 and effectively use the standard + gradient descent method with line search. """ # TODO: Ideally this should be computed in parallel over the leaves using something # similar to _update_raw_predictions(), but this requires a cython version of @@ -699,7 +709,7 @@ def fit(self, X, y, sample_weight=None): acc_find_split_time += grower.total_find_split_time acc_compute_hist_time += grower.total_compute_hist_time - if self._loss.need_update_leaves_values: + if not self._loss.differentiable: _update_leaves_values( loss=self._loss, grower=grower, diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 2129e4d9a0134..539d97fbf345e 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -254,7 +254,7 @@ def fit(self, X, y, sample_weight=None): cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, - fit_params=fit_params, + params=fit_params, verbose=self.verbose, ) for est, meth in zip(all_estimators, self.stack_method_) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 15d2999b5ef4d..efc5d7d5ee5a4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -115,6 +115,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. 
Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1809,3 +1923,202 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. 
+ X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ + + +@pytest.mark.parametrize( + "make_data, Forest", + [ + (datasets.make_regression, RandomForestRegressor), + (datasets.make_classification, RandomForestClassifier), + ], +) +def test_missing_values_is_resilient(make_data, Forest): + """Check that forest can deal with missing values and has decent performance.""" + + rng = np.random.RandomState(0) + n_samples, n_features = 1000, 10 + X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) + + # Create dataset with missing values + X_missing = X.copy() + X_missing[rng.choice([False, True], size=X.shape, p=[0.95, 0.05])] = np.nan + assert np.isnan(X_missing).any() + + X_missing_train, X_missing_test, y_train, y_test = train_test_split( + X_missing, y, random_state=0 + ) + + # Train forest with missing values + forest_with_missing = Forest(random_state=rng, n_estimators=50) + forest_with_missing.fit(X_missing_train, y_train) + score_with_missing = forest_with_missing.score(X_missing_test, y_test) + + # Train forest without missing values + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + forest = Forest(random_state=rng, n_estimators=50) + forest.fit(X_train, y_train) + score_without_missing = forest.score(X_test, y_test) + + # Score is still 80 percent of the forest's score that had no missing values + assert score_with_missing >= 0.80 * score_without_missing + + +@pytest.mark.parametrize("Forest", [RandomForestClassifier, RandomForestRegressor]) +def test_missing_value_is_predictive(Forest): + """Check that the forest learns when missing values are only present for + a predictive feature.""" + rng = np.random.RandomState(0) + n_samples = 300 + + X_non_predictive = rng.standard_normal(size=(n_samples, 10)) + y = rng.randint(0, high=2, size=n_samples) + + # Create a predictive feature using `y` and with some noise + X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) + y_mask = y.astype(bool) + y_mask[X_random_mask] = ~y_mask[X_random_mask] + + predictive_feature = rng.standard_normal(size=n_samples) + predictive_feature[y_mask] = np.nan + assert np.isnan(predictive_feature).any() + + X_predictive = X_non_predictive.copy() + X_predictive[:, 5] = predictive_feature + + ( + X_predictive_train, + X_predictive_test, + X_non_predictive_train, + X_non_predictive_test, + y_train, + y_test, + ) = train_test_split(X_predictive, X_non_predictive, y, random_state=0) + forest_predictive = Forest(random_state=0).fit(X_predictive_train, y_train) + forest_non_predictive = Forest(random_state=0).fit(X_non_predictive_train, 
y_train) + + predictive_test_score = forest_predictive.score(X_predictive_test, y_test) + + assert predictive_test_score >= 0.75 + assert predictive_test_score >= forest_non_predictive.score( + X_non_predictive_test, y_test + ) + + +def test_non_supported_criterion_raises_error_with_missing_values(): + """Raise error for unsupported criterion when there are missing values.""" + X = np.array([[0, 1, 2], [np.nan, 0, 2.0]]) + y = [0.5, 1.0] + + forest = RandomForestRegressor(criterion="absolute_error") + + msg = "RandomForestRegressor does not accept missing values" + with pytest.raises(ValueError, match=msg): + forest.fit(X, y) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 11cf083992653..b3dffa5494b0d 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -33,7 +33,12 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): X_train, y_train, lambda estimator, features: _score( - estimator, X_test[:, features], y_test, scorer + # TODO(SLEP6): pass score_params here + estimator, + X_test[:, features], + y_test, + scorer, + score_params=None, ), ).scores_ diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index db0da278b39ef..e36b49f262b2d 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -282,7 +282,12 @@ def transform(self, X): Xc[:, ~valid_mask] = 0 else: Xc = X[:, valid_mask] - return Xc + + # Even if there are no missing values in X, we still concatenate Xc + # with the missing value indicator matrix, X_indicator. + # This is to ensure that the output maintains consistency in terms + # of columns, regardless of whether missing values exist in X or not. + return super()._concatenate_indicator(Xc, X_indicator) row_missing_idx = np.flatnonzero(mask.any(axis=1)) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index aad7eb12a0a92..be2fa6e4d1736 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -181,3 +181,39 @@ def test_keep_empty_features(imputer, keep_empty_features): assert X_imputed.shape == X.shape else: assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("missing_value_test", [np.nan, 1]) +def test_imputation_adds_missing_indicator_if_add_indicator_is_true( + imputer, missing_value_test +): + """Check that missing indicator always exists when add_indicator=True. + + Non-regression test for gh-26590. + """ + X_train = np.array([[0, np.NaN], [1, 2]]) + + # Test data where missing_value_test variable can be set to np.NaN or 1. 
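+    # When it is set to 1 there are no missing entries at transform time, yet
+    # the indicator column must still be appended (all zeros in that case).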
+ X_test = np.array([[0, missing_value_test], [1, 2]]) + + imputer.set_params(add_indicator=True) + imputer.fit(X_train) + + X_test_imputed_with_indicator = imputer.transform(X_test) + assert X_test_imputed_with_indicator.shape == (2, 3) + + imputer.set_params(add_indicator=False) + imputer.fit(X_train) + X_test_imputed_without_indicator = imputer.transform(X_test) + assert X_test_imputed_without_indicator.shape == (2, 2) + + assert_allclose( + X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator + ) + if np.isnan(missing_value_test): + expected_missing_indicator = [1, 0] + else: + expected_missing_indicator = [0, 0] + + assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 1a9bc7216a0b5..a76fd98940ad4 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -856,8 +856,9 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): in 1.4. Use `None` instead. dual : bool, default=False - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. tol : float, default=1e-4 @@ -1474,8 +1475,9 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima ``cv`` default value if None changed from 3-fold to 5-fold. dual : bool, default=False - Dual or primal formulation. Dual formulation is only implemented for - l2 penalty with liblinear solver. Prefer dual=False when + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. 
penalty : {'l1', 'l2', 'elasticnet'}, default='l2' @@ -1857,10 +1859,10 @@ def fit(self, X, y, sample_weight=None, **params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", + self, + "fit", sample_weight=sample_weight, - other_params=params, + **params, ) else: routed_params = Bunch() @@ -2148,10 +2150,10 @@ def score(self, X, y, sample_weight=None, **score_params): scoring = self._get_scorer() if _routing_enabled(): routed_params = process_routing( - obj=self, - method="score", + self, + "score", sample_weight=sample_weight, - other_params=score_params, + **score_params, ) else: routed_params = Bunch() diff --git a/sklearn/metrics/_dist_metrics.pxd.tp b/sklearn/metrics/_dist_metrics.pxd.tp index 60b8da3ecfa46..313225088c776 100644 --- a/sklearn/metrics/_dist_metrics.pxd.tp +++ b/sklearn/metrics/_dist_metrics.pxd.tp @@ -71,26 +71,26 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef object func cdef object kwargs - cdef float64_t dist( + cdef {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil - cdef float64_t rdist( + cdef {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -98,12 +98,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): const intp_t size, ) except -1 nogil - cdef float64_t rdist_csr( + cdef {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -114,39 +114,39 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 cdef int cdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t[:] x2_indptr, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil - cdef float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil - cdef float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil {{endfor}} diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp 
index bc54e51a7511a..6b5ea300f038b 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -65,6 +65,118 @@ def get_valid_metric_ids(L): if (val.__name__ in L) or (val in L)] cdef class DistanceMetric: + """Uniform interface for fast distance metric functions. + + The `DistanceMetric` class provides a convenient way to compute pairwise distances + between samples. It supports various distance metrics, such as Euclidean distance, + Manhattan distance, and more. + + The `pairwise` method can be used to compute pairwise distances between samples in + the input arrays. It returns a distance matrix representing the distances between + all pairs of samples. + + The :meth:`get_metric` method allows you to retrieve a specific metric using its + string identifier. + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[1, 2], [3, 4], [5, 6]] + >>> Y = [[7, 8], [9, 10]] + >>> dist.pairwise(X,Y) + array([[7.81..., 10.63...] + [5.65..., 8.48...] + [1.41..., 4.24...]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ @classmethod def get_metric(cls, metric, dtype=np.float64, **kwargs): """Get the given distance metric from the string identifier. @@ -74,11 +186,24 @@ cdef class DistanceMetric: Parameters ---------- metric : str or class name - The distance metric to use + The string identifier or class name of the desired distance metric. + See the documentation of the `DistanceMetric` class for a list of + available metrics. + dtype : {np.float32, np.float64}, default=np.float64 - The dtype of the data on which the metric will be applied + The data type of the input on which the metric will be applied. + This affects the precision of the computed distances. + By default, it is set to `np.float64`. + **kwargs - additional arguments will be passed to the requested metric + Additional keyword arguments that will be passed to the requested metric. + These arguments can be used to customize the behavior of the specific + metric. + + Returns + ------- + metric_obj : instance of the requested metric + An instance of the requested distance metric class. 
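+
+        Examples
+        --------
+        >>> from sklearn.metrics import DistanceMetric
+        >>> dist = DistanceMetric.get_metric("manhattan")
+        >>> dist.pairwise([[0, 0], [1, 1]])
+        array([[0., 2.],
+               [2., 0.]])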
""" if dtype == np.float32: specialized_class = DistanceMetric32 @@ -332,7 +457,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return - cdef float64_t dist( + cdef {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -344,7 +469,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return -999 - cdef float64_t rdist( + cdef {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -364,7 +489,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist( self, const {{INPUT_DTYPE_t}}[:, ::1] X, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1: """Compute the pairwise distances between points in X""" cdef intp_t i1, i2 @@ -379,7 +504,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): self, const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1: """Compute the cross-pairwise distances between arrays X and Y""" cdef intp_t i1, i2 @@ -390,12 +515,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) return 0 - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -420,12 +545,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): 2. An alternative signature would be: - cdef float64_t dist_csr( + cdef {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, ) except -1 nogil: Where callers would use slicing on the original CSR data and indices @@ -456,12 +581,12 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): """ return -999 - cdef float64_t rdist_csr( + cdef {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -500,10 +625,10 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int pdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil: """Pairwise distances between rows in CSR matrix X. 
@@ -523,9 +648,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): x2_end = x1_indptr[i2 + 1] D[i1, i2] = D[i2, i1] = self.dist_csr( x1_data, - x1_indices, + &x1_indices[0], x1_data, - x1_indices, + &x1_indices[0], x1_start, x1_end, x2_start, @@ -537,13 +662,13 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef int cdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const int32_t[:] x1_indptr, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t[:] x2_indptr, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, const intp_t size, - float64_t[:, ::1] D, + {{INPUT_DTYPE_t}}[:, ::1] D, ) except -1 nogil: """Compute the cross-pairwise distances between arrays X and Y represented in the CSR format.""" @@ -562,9 +687,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): D[i1, i2] = self.dist_csr( x1_data, - x1_indices, + &x1_indices[0], x2_data, - x2_indices, + &x2_indices[0], x1_start, x1_end, x2_start, @@ -573,11 +698,11 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): ) return 0 - cdef float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: """Convert the rank-preserving surrogate distance to the distance""" return rdist - cdef float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: """Convert the distance to the rank-preserving surrogate distance""" return dist @@ -624,33 +749,33 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): def _pairwise_dense_dense(self, X, Y): cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr cdef const {{INPUT_DTYPE_t}}[:, ::1] Yarr - cdef float64_t[:, ::1] Darr + cdef {{INPUT_DTYPE_t}}[:, ::1] Darr Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Xarr) if X is Y: - Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype=np.float64, order='C') + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') self.pdist(Xarr, Darr) else: Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') self._validate_data(Yarr) - Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype=np.float64, order='C') + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') self.cdist(Xarr, Yarr, Darr) return np.asarray(Darr) def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): cdef: intp_t n_X, n_features - const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const {{INPUT_DTYPE_t}}[::1] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr intp_t n_Y - const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices - const int32_t[:] Y_indptr + const {{INPUT_DTYPE_t}}[::1] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr - float64_t[:, ::1] Darr + {{INPUT_DTYPE_t}}[:, ::1] Darr X_csr = X.tocsr() n_X, n_features = X_csr.shape @@ -658,7 +783,7 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): X_indices = np.asarray(X_csr.indices, dtype=np.int32) X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) if X is Y: - Darr = np.empty((n_X, n_X), dtype=np.float64, order='C') + Darr = np.empty((n_X, n_X), dtype={{INPUT_DTYPE}}, order='C') self.pdist_csr( x1_data=&X_data[0], x1_indices=X_indices, @@ -673,7 +798,7 @@ cdef class 
DistanceMetric{{name_suffix}}(DistanceMetric): Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) - Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') self.cdist_csr( x1_data=&X_data[0], x1_indices=X_indices, @@ -690,13 +815,13 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): cdef: intp_t n_X = X.shape[0] intp_t n_features = X.shape[1] - const {{INPUT_DTYPE_t}}[:] X_data = np.asarray( + const {{INPUT_DTYPE_t}}[::1] X_data = np.asarray( X.data, dtype={{INPUT_DTYPE}}, ) - const int32_t[:] X_indices = np.asarray( + const int32_t[::1] X_indices = np.asarray( X.indices, dtype=np.int32, ) - const int32_t[:] X_indptr = np.asarray( + const int32_t[::1] X_indptr = np.asarray( X.indptr, dtype=np.int32, ) @@ -704,11 +829,11 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Y, dtype={{INPUT_DTYPE}}, order="C", ) intp_t n_Y = Y_data.shape[0] - const int32_t[:] Y_indices = ( + const int32_t[::1] Y_indices = ( np.arange(n_features, dtype=np.int32) ) - float64_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 intp_t x1_start, x1_end @@ -735,9 +860,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Darr[i1, i2] = self.dist_csr( x1_data=&X_data[0], - x1_indices=X_indices, + x1_indices=&X_indices[0], x2_data=x2_data, - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x1_start=x1_start, x1_end=x1_end, x2_start=0, @@ -758,22 +883,22 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): const {{INPUT_DTYPE_t}}[:, ::1] X_data = np.asarray( X, dtype={{INPUT_DTYPE}}, order="C", ) - const int32_t[:] X_indices = np.arange( + const int32_t[::1] X_indices = np.arange( n_features, dtype=np.int32, ) intp_t n_Y = Y.shape[0] - const {{INPUT_DTYPE_t}}[:] Y_data = np.asarray( + const {{INPUT_DTYPE_t}}[::1] Y_data = np.asarray( Y.data, dtype={{INPUT_DTYPE}}, ) - const int32_t[:] Y_indices = np.asarray( + const int32_t[::1] Y_indices = np.asarray( Y.indices, dtype=np.int32, ) - const int32_t[:] Y_indptr = np.asarray( + const int32_t[::1] Y_indptr = np.asarray( Y.indptr, dtype=np.int32, ) - float64_t[:, ::1] Darr = np.empty((n_X, n_Y), dtype=np.float64, order='C') + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') intp_t i1, i2 {{INPUT_DTYPE_t}} * x1_data @@ -801,9 +926,9 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): Darr[i1, i2] = self.dist_csr( x1_data=x1_data, - x1_indices=X_indices, + x1_indices=&X_indices[0], x2_data=&Y_data[0], - x2_indices=Y_indices, + x2_indices=&Y_indices[0], x1_start=0, x1_end=n_features, x2_start=x2_start, @@ -867,24 +992,24 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 2 - cdef inline float64_t dist(self, + cdef inline {{INPUT_DTYPE_t}} dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return euclidean_dist{{name_suffix}}(x1, x2, size) - cdef inline float64_t rdist(self, + cdef inline {{INPUT_DTYPE_t}} rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return euclidean_rdist{{name_suffix}}(x1, x2, size) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) 
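The recurring `np.float64` to `{{INPUT_DTYPE}}` edits above make the returned distance matrix follow the input dtype instead of always being float64. A hedged check of that behaviour, assuming the 32-bit specialization is selected through the `dtype` argument of `get_metric` shown at the start of this file's diff:

```python
import numpy as np
from sklearn.metrics import DistanceMetric

X32 = np.random.RandomState(0).random_sample((6, 3)).astype(np.float32)

euclidean32 = DistanceMetric.get_metric("euclidean", dtype=np.float32)
D = euclidean32.pairwise(X32)
# Upstream this matrix is float64; with the hunks above it is expected to
# stay float32 and roughly halve the memory of the result.
print(D.dtype)
```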
- cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -893,12 +1018,12 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -945,12 +1070,12 @@ cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -988,7 +1113,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1001,7 +1126,7 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (tmp * tmp / self.vec[j]) return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1009,10 +1134,10 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return sqrt(self.rdist(x1, x2, size)) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -1021,12 +1146,12 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1074,12 +1199,12 @@ cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): i1 = i1 + 1 return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1111,7 +1236,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = 1 - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const 
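The `*_csr` kernels above are the code path taken when `pairwise` receives CSR input; a small sketch (public API, assuming this version's sparse support in `DistanceMetric.pairwise`):

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import DistanceMetric

X = np.array([[0.0, 1.0, 0.0],
              [2.0, 0.0, 3.0]])
X_csr = csr_matrix(X)

euclidean = DistanceMetric.get_metric("euclidean")
# Dense and CSR inputs are expected to produce the same distances.
print(np.allclose(euclidean.pairwise(X), euclidean.pairwise(X_csr)))
```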
{{INPUT_DTYPE_t}}* x2, @@ -1123,12 +1248,12 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1141,7 +1266,7 @@ cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): intp_t i1 = x1_start intp_t i2 = x2_start - float64_t d = 0.0 + {{INPUT_DTYPE_t}} d = 0.0 while i1 < x1_end and i2 < x2_end: ix1 = x1_indices[i1] @@ -1194,7 +1319,7 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def __init__(self): self.p = INF{{name_suffix}} - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1207,12 +1332,12 @@ cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1271,19 +1396,27 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): Parameters ---------- - p : int + p : float The order of the p-norm of the difference (see above). + + .. versionchanged:: 1.4.0 + Minkowski distance allows `p` to be `0 < p < 1`. w : (N,) array-like (optional) The weight vector. - Minkowski Distance requires p >= 1 and finite. For p = infinity, - use ChebyshevDistance. + Minkowski Distance requires p > 0 and finite. + When :math:`p \in (0,1)`, it isn't a true metric but is permissible when + the triangular inequality isn't necessary. + For p = infinity, use ChebyshevDistance. Note that for p=1, ManhattanDistance is more efficient, and for p=2, EuclideanDistance is more efficient. + """ def __init__(self, p, w=None): - if p < 1: - raise ValueError("p must be greater than 1") + if p <= 0: + raise ValueError("p must be greater than 0") elif np.isinf(p): raise ValueError("MinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") @@ -1307,7 +1440,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): f"the number of features ({X.shape[1]}). " f"Currently len(w)={self.size}.") - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1324,7 +1457,7 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += (pow(fabs(x1[j] - x2[j]), self.p)) return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1332,10 +1465,10 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return pow(self.rdist(x1, x2, size), 1. / self.p) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return pow(rdist, 1.
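With the relaxed validation above (`p > 0` rather than `p >= 1`), fractional orders become usable. A short sketch of the intended behaviour; as the new docstring notes, the result for `p < 1` is not a true metric because the triangle inequality can fail:

```python
import numpy as np
from sklearn.metrics import DistanceMetric

X = np.array([[0.0, 0.0],
              [1.0, 1.0]])

minkowski = DistanceMetric.get_metric("minkowski", p=0.5)
# rdist = |1|**0.5 + |1|**0.5 = 2, and dist = rdist**(1/p) = 2**2 = 4
print(minkowski.pairwise(X))  # [[0., 4.], [4., 0.]]
```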
/ self.p) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return pow(dist, self.p) def rdist_to_dist(self, rdist): @@ -1344,12 +1477,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** self.p - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1424,12 +1557,12 @@ cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1496,7 +1629,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline float64_t rdist( + cdef inline {{INPUT_DTYPE_t}} rdist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1516,7 +1649,7 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += tmp * self.buffer[i] return d - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1524,10 +1657,10 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return sqrt(self.rdist(x1, x2, size)) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return sqrt(rdist) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: return dist * dist def rdist_to_dist(self, rdist): @@ -1536,12 +1669,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): def dist_to_rdist(self, dist): return dist ** 2 - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1590,12 +1723,12 @@ cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1627,7 +1760,7 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. 
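The Mahalanobis hunks above only change the declared return dtype; the metric itself is still parameterised by the (inverse) covariance supplied at construction time, as in the test suite touched later in this diff:

```python
import numpy as np
from sklearn.metrics import DistanceMetric

rng = np.random.RandomState(0)
X = rng.random_sample((20, 3))
VI = np.linalg.inv(np.cov(X, rowvar=False))  # inverse covariance matrix

mahalanobis = DistanceMetric.get_metric("mahalanobis", VI=VI)
print(mahalanobis.pairwise(X).shape)  # (20, 20)
```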
math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1641,12 +1774,12 @@ cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return float(n_unequal) / size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1702,7 +1835,7 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1716,12 +1849,12 @@ cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): d += fabs(x1[j] - x2[j]) / denom return d - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1777,7 +1910,7 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): .. math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1793,12 +1926,12 @@ cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): else: return 0.0 - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1857,7 +1990,7 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1877,12 +2010,12 @@ cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): return 0 return (nnz - n_eq) * 1.0 / nnz - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -1946,7 +2079,7 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / N """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -1960,12 +2093,12 @@ cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq * 1. 
/ size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2021,7 +2154,7 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2036,12 +2169,12 @@ cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return n_neq / (2.0 * n_tt + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2102,7 +2235,7 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 1 - N_TT / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2117,12 +2250,12 @@ cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (n_neq - n_tt + size) * 1.0 / (n_neq + size) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2181,7 +2314,7 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2195,12 +2328,12 @@ cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2258,7 +2391,7 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N - N_TT) / N """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2272,12 +2405,12 @@ cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return (size - n_tt) * 1. 
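The boolean metrics in this stretch of the diff (jaccard, matching, dice, kulsinski, rogerstanimoto, russellrao, ...) all treat any non-zero entry as True; they differ only in the denominator of the mismatch ratio. A quick comparison of two of them, following the formulas quoted in the docstrings above:

```python
import numpy as np
from sklearn.metrics import DistanceMetric

# Non-zero entries are interpreted as True by the boolean metrics.
X = np.array([[1.0, 0.0, 1.0, 1.0, 0.0],
              [1.0, 1.0, 0.0, 1.0, 0.0]])

jaccard = DistanceMetric.get_metric("jaccard")
matching = DistanceMetric.get_metric("matching")
print(jaccard.pairwise(X)[0, 1])   # 2 mismatches / 4 positions with any True = 0.5
print(matching.pairwise(X)[0, 1])  # 2 mismatches / 5 features = 0.4
```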
/ size - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2328,7 +2461,7 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2342,12 +2475,12 @@ cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_neq += (tf1 != tf2) return (2.0 * n_neq) / (size + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2405,7 +2538,7 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) """ - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2420,12 +2553,12 @@ cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): n_tt += (tf1 and tf2) return n_neq / (0.5 * n_tt + n_neq) - cdef inline float64_t dist_csr( + cdef inline {{INPUT_DTYPE_t}} dist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2494,7 +2627,7 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline float64_t rdist(self, + cdef inline {{INPUT_DTYPE_t}} rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, @@ -2503,17 +2636,17 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): cdef float64_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline float64_t dist(self, + cdef inline {{INPUT_DTYPE_t}} dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, intp_t size, ) except -1 nogil: return 2 * asin(sqrt(self.rdist(x1, x2, size))) - cdef inline float64_t _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: return 2 * asin(sqrt(rdist)) - cdef inline float64_t _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: cdef float64_t tmp = sin(0.5 * dist) return tmp * tmp @@ -2524,17 +2657,17 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): tmp = np.sin(0.5 * dist) return tmp * tmp - cdef inline float64_t dist_csr( - self, - const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, - const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, - const int32_t x1_start, - const int32_t x1_end, - const int32_t x2_start, - const int32_t x2_end, - const intp_t 
size, + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, ) except -1 nogil: return 2 * asin(sqrt(self.rdist_csr( x1_data, @@ -2548,12 +2681,12 @@ cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): size, ))) - cdef inline float64_t rdist_csr( + cdef inline {{INPUT_DTYPE_t}} rdist_csr( self, const {{INPUT_DTYPE_t}}* x1_data, - const int32_t[:] x1_indices, + const int32_t* x1_indices, const {{INPUT_DTYPE_t}}* x2_data, - const int32_t[:] x2_indices, + const int32_t* x2_indices, const int32_t x1_start, const int32_t x1_end, const int32_t x2_start, @@ -2640,7 +2773,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline float64_t dist( + cdef inline {{INPUT_DTYPE_t}} dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, @@ -2648,7 +2781,7 @@ cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): ) except -1 nogil: return self._dist(x1, x2, size) - cdef inline float64_t _dist( + cdef inline {{INPUT_DTYPE_t}} _dist( self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp index 7edc64c59a050..dd66299223efe 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -36,7 +36,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): X, Y, intp_t k, - str metric="euclidean", + metric="euclidean", chunk_size=None, dict metric_kwargs=None, str strategy=None, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp index 3d0ea84b0091d..f9719f6959dfc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -8,13 +8,7 @@ from ...utils._typedefs cimport intp_t, float64_t import numpy as np from scipy.sparse import issparse from sklearn.utils.fixes import threadpool_limits - -cpdef enum WeightingStrategy: - uniform = 0 - # TODO: Implement the following options, most likely in - # `weighted_histogram_mode` - distance = 1 - callable = 2 +from ._classmode cimport WeightingStrategy {{for name_suffix in ["32", "64"]}} from ._argkmin cimport ArgKmin{{name_suffix}} @@ -25,8 +19,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): {{name_suffix}}bit implementation of ArgKminClassMode. 
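Stepping back to the Haversine specialization a few hunks above: it only accepts two features per row, interpreted as (latitude, longitude) in radians, and returns the great-circle distance on the unit sphere. For example:

```python
import numpy as np
from sklearn.metrics import DistanceMetric

paris = np.radians([[48.8566, 2.3522]])
new_york = np.radians([[40.7128, -74.0060]])

haversine = DistanceMetric.get_metric("haversine")
# Multiply the unit-sphere distance by the Earth radius (~6371 km).
print(haversine.pairwise(paris, new_york) * 6371)  # roughly 5840 km
```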
""" cdef: - const intp_t[:] class_membership, - const intp_t[:] unique_labels + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels float64_t[:, :] class_scores cpp_map[intp_t, intp_t] labels_to_index WeightingStrategy weight_type @@ -38,14 +32,14 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): Y, intp_t k, weights, - class_membership, - unique_labels, + Y_labels, + unique_Y_labels, str metric="euclidean", chunk_size=None, dict metric_kwargs=None, str strategy=None, ): - """Compute the argkmin reduction with class_membership. + """Compute the argkmin reduction with Y_labels. This classmethod is responsible for introspecting the arguments values to dispatch to the most appropriate implementation of @@ -66,8 +60,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): chunk_size=chunk_size, strategy=strategy, weights=weights, - class_membership=class_membership, - unique_labels=unique_labels, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, ) # Limit the number of threads in second level of nested parallelism for BLAS @@ -83,8 +77,8 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): def __init__( self, DatasetsPair{{name_suffix}} datasets_pair, - const intp_t[:] class_membership, - const intp_t[:] unique_labels, + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels, chunk_size=None, strategy=None, intp_t k=1, @@ -103,15 +97,15 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): self.weight_type = WeightingStrategy.distance else: self.weight_type = WeightingStrategy.callable - self.class_membership = class_membership + self.Y_labels = Y_labels - self.unique_labels = unique_labels + self.unique_Y_labels = unique_Y_labels cdef intp_t idx, neighbor_class_idx # Map from set of unique labels to their indices in `class_scores` # Buffer used in building a histogram for one-pass weighted mode self.class_scores = np.zeros( - (self.n_samples_X, unique_labels.shape[0]), dtype=np.float64, + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, ) def _finalize_results(self): @@ -142,7 +136,7 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): if use_distance_weighting: score_incr = 1 / distances[neighbor_rank] neighbor_idx = indices[neighbor_rank] - neighbor_class_idx = self.class_membership[neighbor_idx] + neighbor_class_idx = self.Y_labels[neighbor_idx] self.class_scores[sample_index][neighbor_class_idx] += score_incr return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd b/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd new file mode 100644 index 0000000000000..65db044d668e8 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd @@ -0,0 +1,5 @@ +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options in weighted_histogram_mode + distance = 1 + callable = 2 diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index fc56a59cab16f..1e57b3291a8f4 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -38,22 +38,22 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const 
int32_t[::1] X_indices + const int32_t[::1] X_indptr const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices - const int32_t[:] Y_indptr + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:] X_data - const int32_t[:] X_indices - const int32_t[:] X_indptr + const int32_t[::1] X_indices + const int32_t[::1] X_indptr const {{INPUT_DTYPE_t}}[:] Y_data - const int32_t[:] Y_indices + const int32_t[::1] Y_indices intp_t n_Y diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 40a9a45e8b8e1..2c3ca44047145 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -1,3 +1,5 @@ +import copy + {{py: implementation_specific_values = [ @@ -53,7 +55,7 @@ cdef class DatasetsPair{{name_suffix}}: cls, X, Y, - str metric="euclidean", + metric="euclidean", dict metric_kwargs=None, ) -> DatasetsPair{{name_suffix}}: """Return the DatasetsPair implementation for the given arguments. @@ -70,7 +72,7 @@ cdef class DatasetsPair{{name_suffix}}: If provided as a ndarray, it must be C-contiguous. If provided as a sparse matrix, it must be in CSR format. - metric : str, default='euclidean' + metric : str or DistanceMetric object, default='euclidean' The distance metric to compute between rows of X and Y. The default metric is a fast implementation of the Euclidean metric. For a list of available metrics, see the documentation @@ -84,12 +86,17 @@ cdef class DatasetsPair{{name_suffix}}: datasets_pair: DatasetsPair{{name_suffix}} The suited DatasetsPair{{name_suffix}} implementation. """ - # Y_norm_squared might be propagated down to DatasetsPairs - # via metrics_kwargs when the Euclidean specialisations - # can't be used. To prevent Y_norm_squared to be passed + # X_norm_squared and Y_norm_squared might be propagated + # down to DatasetsPairs via metrics_kwargs when the Euclidean + # specialisations can't be used. + # To prevent X_norm_squared and Y_norm_squared to be passed # down to DistanceMetrics (whose constructors would raise - # a RuntimeError), we pop it here. + # a RuntimeError), we pop them here. 
if metric_kwargs is not None: + # Copying metric_kwargs not to pop "X_norm_squared" + # and "Y_norm_squared" where they are used + metric_kwargs = copy.copy(metric_kwargs) + metric_kwargs.pop("X_norm_squared", None) metric_kwargs.pop("Y_norm_squared", None) cdef: {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( @@ -231,9 +238,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=&self.Y_data[0], - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -245,9 +252,9 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], x2_data=&self.Y_data[0], - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=self.Y_indptr[j], @@ -324,11 +331,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: return self.distance_metric.rdist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, @@ -341,11 +348,11 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): return self.distance_metric.dist_csr( x1_data=&self.X_data[0], - x1_indices=self.X_indices, + x1_indices=&self.X_indices[0], # Increment the data pointer such that x2_start=0 is aligned with the # j-th row x2_data=&self.Y_data[0] + j * self.n_features, - x2_indices=self.Y_indices, + x2_indices=&self.Y_indices[0], x1_start=self.X_indptr[i], x1_end=self.X_indptr[i + 1], x2_start=0, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 796f15ab6fca0..e23da467d723a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -5,7 +5,11 @@ from scipy.sparse import issparse from ... import get_config -from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING64 +from .._dist_metrics import ( + BOOL_METRICS, + METRIC_MAPPING64, + DistanceMetric, +) from ._argkmin import ( ArgKmin32, ArgKmin64, @@ -117,7 +121,7 @@ def is_valid_sparse_matrix(X): and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) and X.dtype == Y.dtype and X.dtype in (np.float32, np.float64) - and metric in cls.valid_metrics() + and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric)) ) return is_usable @@ -456,7 +460,7 @@ def is_usable_for(cls, X, Y, metric) -> bool: The input array to be labelled. Y : ndarray of shape (n_samples_Y, n_features) - The input array whose labels are provided through the `labels` + The input array whose labels are provided through the `Y_labels` parameter. 
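A hedged sketch of what the `is_usable_for` change above enables: a `DistanceMetric` instance, not just a string name, can now reach the pairwise-distances reduction. The module path below is the private one touched by this diff and may move between releases:

```python
import numpy as np
from sklearn.metrics import DistanceMetric
from sklearn.metrics._pairwise_distances_reduction import ArgKmin

rng = np.random.RandomState(0)
X = rng.random_sample((20, 3))
Y = rng.random_sample((30, 3))

manhattan = DistanceMetric.get_metric("manhattan")
print(ArgKmin.is_usable_for(X, Y, metric=manhattan))  # True once instances are accepted
indices = ArgKmin.compute(X=X, Y=Y, k=2, metric=manhattan)
print(indices.shape)  # (20, 2)
```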
metric : str, default='euclidean' @@ -484,8 +488,8 @@ def compute( Y, k, weights, - labels, - unique_labels, + Y_labels, + unique_Y_labels, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -499,23 +503,23 @@ def compute( The input array to be labelled. Y : ndarray of shape (n_samples_Y, n_features) - The input array whose labels are provided through the `labels` - parameter. + The input array whose class membership are provided through the + `Y_labels` parameter. k : int The number of nearest neighbors to consider. weights : ndarray - The weights applied over the `labels` of `Y` when computing the + The weights applied over the `Y_labels` of `Y` when computing the weighted mode of the labels. - class_membership : ndarray + Y_labels : ndarray An array containing the index of the class membership of the associated samples in `Y`. This is used in labeling `X`. - unique_classes : ndarray + unique_Y_labels : ndarray An array containing all unique indices contained in the - corresponding `class_membership` array. + corresponding `Y_labels` array. metric : str, default='euclidean' The distance metric to use. For a list of available metrics, see @@ -587,8 +591,8 @@ def compute( Y=Y, k=k, weights=weights, - class_membership=np.array(labels, dtype=np.intp), - unique_labels=np.array(unique_labels, dtype=np.intp), + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), metric=metric, chunk_size=chunk_size, metric_kwargs=metric_kwargs, @@ -601,8 +605,8 @@ def compute( Y=Y, k=k, weights=weights, - class_membership=np.array(labels, dtype=np.intp), - unique_labels=np.array(unique_labels, dtype=np.intp), + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), metric=metric, chunk_size=chunk_size, metric_kwargs=metric_kwargs, diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index aee1615c55630..302831366aa54 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -124,7 +124,7 @@ def __call__(self, estimator, *args, **kwargs): cached_call = partial(_cached_call, cache) if _routing_enabled(): - routed_params = process_routing(self, "score", kwargs) + routed_params = process_routing(self, "score", **kwargs) else: # they all get the same args, and they all get them all routed_params = Bunch( @@ -293,6 +293,13 @@ def set_score_request(self, **kwargs): Arguments should be of the form ``param_name=alias``, and `alias` can be one of ``{True, False, None, str}``. """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." 
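The guard added to `set_score_request` above means the method now refuses to run unless metadata routing has been switched on; a minimal sketch of both sides of that behaviour:

```python
from sklearn import set_config
from sklearn.metrics import get_scorer

scorer = get_scorer("accuracy")
try:
    scorer.set_score_request(sample_weight=True)
except RuntimeError as exc:
    print(exc)  # points at sklearn.set_config(enable_metadata_routing=True)

set_config(enable_metadata_routing=True)
scorer.set_score_request(sample_weight=True)  # now records the request
set_config(enable_metadata_routing=False)
```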
+ ) + self._warn_overlap( message=( "You are setting metadata request for parameters which are " diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a05a532ecb3f2..cfcb08a312443 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -159,10 +159,10 @@ def test_classification_report_dictionary_output(): for metric in expected_report[key]: assert_almost_equal(expected_report[key][metric], report[key][metric]) - assert type(expected_report["setosa"]["precision"]) == float - assert type(expected_report["macro avg"]["precision"]) == float - assert type(expected_report["setosa"]["support"]) == int - assert type(expected_report["macro avg"]["support"]) == int + assert isinstance(expected_report["setosa"]["precision"], float) + assert isinstance(expected_report["macro avg"]["precision"], float) + assert isinstance(expected_report["setosa"]["support"], int) + assert isinstance(expected_report["macro avg"]["support"], int) def test_classification_report_output_dict_empty_input(): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 16aa5c569b161..7d44b988b9161 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -15,6 +15,7 @@ ) from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose, create_memmap_backed_data +from sklearn.utils.fixes import parse_version, sp_version def dist_func(x1, x2, p): @@ -42,18 +43,17 @@ def dist_func(x1, x2, p): V = rng.random_sample((d, d)) VI = np.dot(V, V.T) - METRICS_DEFAULT_PARAMS = [ ("euclidean", {}), ("cityblock", {}), - ("minkowski", dict(p=(1, 1.5, 2, 3))), + ("minkowski", dict(p=(0.5, 1, 1.5, 2, 3))), ("chebyshev", {}), ("seuclidean", dict(V=(rng.random_sample(d),))), ("mahalanobis", dict(VI=(VI,))), ("hamming", {}), ("canberra", {}), ("braycurtis", {}), - ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), + ("minkowski", dict(p=(0.5, 1, 1.5, 3), w=(rng.random_sample(d),))), ] @@ -76,6 +76,13 @@ def test_cdist(metric_param_grid, X, Y): # with scipy rtol_dict = {"rtol": 1e-6} + # TODO: Remove when scipy minimum version >= 1.7.0 + # scipy supports 0= 1.7.0 + if metric == "minkowski": + p = kwargs["p"] + if sp_version < parse_version("1.7.0") and p < 1: + pytest.skip("scipy does not support 0= 1.7.0 + # scipy supports 0= 1.7.0 + if metric == "minkowski": + p = kwargs["p"] + if sp_version < parse_version("1.7.0") and p < 1: + pytest.skip("scipy does not support 0>> print(scores['train_r2']) [0.28009951 0.3908844 0.22784907] """ - X, y, groups = indexable(X, y, groups) + params = _check_params_groups_deprecation(fit_params, params, groups) + + X, y = indexable(X, y) cv = check_cv(cv, y, classifier=is_classifier(estimator)) @@ -298,7 +359,62 @@ def cross_validate( else: scorers = _check_multimetric_scoring(estimator, scoring) - indices = cv.split(X, y, groups) + if _routing_enabled(): + # `cross_validate` will create a `_MultiMetricScorer` if `scoring` is a + # dict at a later stage. We need the same object for the purpose of + # routing. However, creating it here and passing it around would create + # a much larger diff since the dict is used in many places. + if isinstance(scorers, dict): + _scorer = _MultimetricScorer( + scorers=scorers, raise_exc=(error_score == "raise") + ) + else: + _scorer = scorers + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. 
For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=_scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=( + f"{sorted(e.unrequested_params.keys())} are passed to cross" + " validation but are not explicitly requested or unrequested. See" + " the Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) if return_indices: # materialize the indices since we need to store them in the returned dict indices = list(indices) @@ -311,12 +427,13 @@ def cross_validate( clone(estimator), X, y, - scorers, - train, - test, - verbose, - None, - fit_params, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, @@ -436,6 +553,7 @@ def _warn_or_raise_about_fit_failures(results, error_score): "n_jobs": [Integral, None], "verbose": ["verbose"], "fit_params": [dict, None], + "params": [dict, None], "pre_dispatch": [Integral, str, None], "error_score": [StrOptions({"raise"}), Real], }, @@ -452,6 +570,7 @@ def cross_val_score( n_jobs=None, verbose=0, fit_params=None, + params=None, pre_dispatch="2*n_jobs", error_score=np.nan, ): @@ -477,6 +596,13 @@ def cross_val_score( train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + scoring : str or callable, default=None A str (see model evaluation documentation) or a scorer callable object / function with signature @@ -521,6 +647,16 @@ def cross_val_score( fit_params : dict, default=None Parameters to pass to the fit method of the estimator. + .. deprecated:: 1.4 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. 
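A sketch of the routing plumbing added to `cross_validate` above, assuming scikit-learn >= 1.4 with metadata routing enabled: `sample_weight` passed through `params` is forwarded both to the estimator's `fit` and to the scorer once each side requests it:

```python
import numpy as np
from sklearn import set_config
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_validate

set_config(enable_metadata_routing=True)

X, y = make_classification(random_state=0)
sample_weight = np.random.RandomState(0).uniform(size=len(y))

estimator = LogisticRegression().set_fit_request(sample_weight=True)
scorer = make_scorer(accuracy_score).set_score_request(sample_weight=True)

results = cross_validate(
    estimator, X, y, scoring=scorer, params={"sample_weight": sample_weight}
)
print(results["test_score"])

set_config(enable_metadata_routing=False)
```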
Reducing this number can be useful to avoid an @@ -585,6 +721,7 @@ def cross_val_score( n_jobs=n_jobs, verbose=verbose, fit_params=fit_params, + params=params, pre_dispatch=pre_dispatch, error_score=error_score, ) @@ -595,12 +732,14 @@ def _fit_and_score( estimator, X, y, + *, scorer, train, test, verbose, parameters, fit_params, + score_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, @@ -654,6 +793,9 @@ def _fit_and_score( fit_params : dict or None Parameters that will be passed to ``estimator.fit``. + score_params : dict or None + Parameters that will be passed to the scorer. + return_train_score : bool, default=False Compute and return score on training set. @@ -724,6 +866,9 @@ def _fit_and_score( # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) if parameters is not None: # here we clone the parameters, since sometimes the parameters @@ -764,10 +909,14 @@ def _fit_and_score( result["fit_error"] = None fit_time = time.time() - start_time - test_scores = _score(estimator, X_test, y_test, scorer, error_score) + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) score_time = time.time() - start_time - fit_time if return_train_score: - train_scores = _score(estimator, X_train, y_train, scorer, error_score) + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) if verbose > 1: total_time = score_time + fit_time @@ -809,7 +958,7 @@ def _fit_and_score( return result -def _score(estimator, X_test, y_test, scorer, error_score="raise"): +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): """Compute the score(s) of an estimator on a given test set. Will return a dict of floats if `scorer` is a dict, otherwise a single @@ -819,11 +968,13 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): # will cache method calls if needed. scorer() returns a dict scorer = _MultimetricScorer(scorers=scorer, raise_exc=(error_score == "raise")) + score_params = {} if score_params is None else score_params + try: if y_test is None: - scores = scorer(estimator, X_test) + scores = scorer(estimator, X_test, **score_params) else: - scores = scorer(estimator, X_test, y_test) + scores = scorer(estimator, X_test, y_test, **score_params) except Exception: if isinstance(scorer, _MultimetricScorer): # If `_MultimetricScorer` raises exception, the `error_score` @@ -891,6 +1042,7 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): "n_jobs": [Integral, None], "verbose": ["verbose"], "fit_params": [dict, None], + "params": [dict, None], "pre_dispatch": [Integral, str, None], "method": [ StrOptions( @@ -915,6 +1067,7 @@ def cross_val_predict( n_jobs=None, verbose=0, fit_params=None, + params=None, pre_dispatch="2*n_jobs", method="predict", ): @@ -950,6 +1103,13 @@ def cross_val_predict( train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. 
When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -983,6 +1143,16 @@ def cross_val_predict( fit_params : dict, default=None Parameters to pass to the fit method of the estimator. + .. deprecated:: 1.4 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + pre_dispatch : int or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1042,10 +1212,50 @@ def cross_val_predict( >>> lasso = linear_model.Lasso() >>> y_pred = cross_val_predict(lasso, X, y, cv=3) """ - X, y, groups = indexable(X, y, groups) + params = _check_params_groups_deprecation(fit_params, params, groups) + X, y = indexable(X, y) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=( + f"{sorted(e.unrequested_params.keys())} are passed to cross" + " validation but are not explicitly requested or unrequested. See" + " the Metadata Routing User guide" + " for more" + " information." + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) cv = check_cv(cv, y, classifier=is_classifier(estimator)) - splits = list(cv.split(X, y, groups)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) test_indices = np.concatenate([test for _, test in splits]) if not _check_is_permutation(test_indices, _num_samples(X)): @@ -1073,7 +1283,13 @@ def cross_val_predict( parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) predictions = parallel( delayed(_fit_and_predict)( - clone(estimator), X, y, train, test, verbose, fit_params, method + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, ) for train, test in splits ) @@ -1103,7 +1319,7 @@ def cross_val_predict( return predictions[inv_test_indices] -def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): """Fit estimator and predict values for a given dataset split. Read more in the :ref:`User Guide `. 
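The `params={'groups': ...}` pattern documented above replaces the old `groups=` argument once routing is enabled; group-aware splitters request `groups` by default, so routing forwards it to their `split`. A sketch with `cross_val_score`:

```python
import numpy as np
from sklearn import set_config
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score

set_config(enable_metadata_routing=True)

X, y = make_classification(n_samples=60, random_state=0)
groups = np.repeat(np.arange(6), 10)

scores = cross_val_score(
    LogisticRegression(),
    X,
    y,
    cv=GroupKFold(n_splits=3),
    params={"groups": groups},
)
print(scores)

set_config(enable_metadata_routing=False)
```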
@@ -1129,9 +1345,6 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method): test : array-like of shape (n_test_samples,) Indices of test samples. - verbose : int - The verbosity level. - fit_params : dict or None Parameters that will be passed to ``estimator.fit``. @@ -1707,7 +1920,6 @@ def learning_curve( test, train_sizes_abs, scorer, - verbose, return_times, error_score=error_score, fit_params=fit_params, @@ -1726,12 +1938,14 @@ def learning_curve( clone(estimator), X, y, - scorer, - train, - test, - verbose, + scorer=scorer, + train=train, + test=test, + verbose=verbose, parameters=None, fit_params=fit_params, + # TODO(SLEP6): support score params here + score_params=None, return_train_score=True, error_score=error_score, return_times=return_times, @@ -1833,7 +2047,6 @@ def _incremental_fit_estimator( test, train_sizes, scorer, - verbose, return_times, error_score, fit_params, @@ -1863,9 +2076,27 @@ def _incremental_fit_estimator( start_score = time.time() - test_scores.append(_score(estimator, X_test, y_test, scorer, error_score)) - train_scores.append(_score(estimator, X_train, y_train, scorer, error_score)) - + # TODO(SLEP6): support score params in the following two calls + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=None, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=None, + error_score=error_score, + ) + ) score_time = time.time() - start_score score_times.append(score_time) @@ -2025,12 +2256,14 @@ def validation_curve( clone(estimator), X, y, - scorer, - train, - test, - verbose, + scorer=scorer, + train=train, + test=test, + verbose=verbose, parameters={param_name: v}, fit_params=fit_params, + # TODO(SLEP6): support score params here + score_params=None, return_train_score=True, error_score=error_score, ) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6ea52049f3ced..04c3f1f156fab 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -900,18 +900,16 @@ def check_cv_results_array_types(search, param_keys, score_keys): assert cv_results["rank_test_%s" % key].dtype == np.int32 -def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand): +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): # Test the search.cv_results_ contains all the required results - assert_array_equal( - sorted(cv_results.keys()), sorted(param_keys + score_keys + ("params",)) - ) + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) def test_grid_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_grid_points = 6 params = [ dict( @@ -949,9 +947,7 @@ def test_grid_search_cv_results(): ) n_candidates = n_grid_points - search = GridSearchCV( - SVC(), cv=n_splits, param_grid=params, return_train_score=True - ) + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) search.fit(X, y) cv_results = search.cv_results_ # Check if score and timing are reasonable @@ -967,17 +963,20 @@ def test_grid_search_cv_results(): check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) # Check masking cv_results = search.cv_results_ - n_candidates = 
len(search.cv_results_["params"]) - assert all( + + poly_results = [ ( cv_results["param_C"].mask[i] and cv_results["param_gamma"].mask[i] and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" - ) - assert all( + if cv_results["param_kernel"][i] == "poly" + ] + assert all(poly_results) + assert len(poly_results) == 2 + + rbf_results = [ ( not cv_results["param_C"].mask[i] and not cv_results["param_gamma"].mask[i] @@ -985,13 +984,14 @@ def test_grid_search_cv_results(): ) for i in range(n_candidates) if cv_results["param_kernel"][i] == "rbf" - ) + ] + assert all(rbf_results) + assert len(rbf_results) == 4 def test_random_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_splits = 3 n_search_iter = 30 params = [ @@ -1016,12 +1016,12 @@ def test_random_search_cv_results(): "mean_score_time", "std_score_time", ) - n_cand = n_search_iter + n_candidates = n_search_iter search = RandomizedSearchCV( SVC(), n_iter=n_search_iter, - cv=n_splits, + cv=3, param_distributions=params, return_train_score=True, ) @@ -1029,8 +1029,7 @@ def test_random_search_cv_results(): cv_results = search.cv_results_ # Check results structure check_cv_results_array_types(search, param_keys, score_keys) - check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) - n_candidates = len(search.cv_results_["params"]) + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) assert all( ( cv_results["param_C"].mask[i] @@ -1038,7 +1037,7 @@ def test_random_search_cv_results(): and not cv_results["param_degree"].mask[i] ) for i in range(n_candidates) - if cv_results["param_kernel"][i] == "linear" + if cv_results["param_kernel"][i] == "poly" ) assert all( ( diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index d92f624441541..151498205dd39 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -14,6 +14,7 @@ ) from scipy.special import comb +from sklearn import config_context from sklearn.datasets import load_digits, make_classification from sklearn.dummy import DummyClassifier from sklearn.model_selection import ( @@ -43,7 +44,15 @@ _yields_constant_splits, ) from sklearn.svm import SVC -from sklearn.tests.test_metadata_routing import assert_request_is_empty +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) from sklearn.utils._mocking import MockDataFrame from sklearn.utils._testing import ( assert_allclose, @@ -51,6 +60,9 @@ assert_array_equal, ignore_warnings, ) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) from sklearn.utils.validation import _num_samples NO_GROUP_SPLITTERS = [ @@ -1259,6 +1271,70 @@ def test_train_test_split_default_test_size(train_size, exp_train, exp_test): assert len(X_test) == exp_test +@pytest.mark.parametrize( + "array_namepsace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split(shuffle, stratify, array_namepsace, device, dtype): + xp, device, dtype = 
_array_api_for_tests(array_namepsace, device, dtype) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. + assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + def test_train_test_split(): X = np.arange(100).reshape((10, 10)) X_s = coo_matrix(X) @@ -1808,7 +1884,7 @@ def test_nested_cv(): error_score="raise", ) cross_val_score( - gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={"groups": groups} + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} ) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index d4cc09ee01044..6c89f89afa684 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from scipy.stats import norm, randint +from scipy.stats import expon, norm, randint from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier @@ -23,7 +23,11 @@ _SubsampleMetaSplitter, _top_k, ) -from sklearn.svm import LinearSVC +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC class FastClassifier(DummyClassifier): @@ -777,3 +781,68 @@ def test_select_best_index(SearchCV): # we expect the index of 'i' best_index = SearchCV._select_best_index(None, None, results) assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. 
+ """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + check_cv_results_array_types(search, param_keys, score_keys) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index ba9f66ab240e4..c944b06b30860 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -73,6 +73,13 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder, scale from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) from sklearn.utils import shuffle from sklearn.utils._mocking import CheckingClassifier, MockDataFrame from sklearn.utils._testing import ( @@ -706,7 +713,7 @@ def assert_fit_params(clf): "dummy_obj": DUMMY_OBJ, "callback": assert_fit_params, } - cross_val_score(clf, X, y, fit_params=fit_params) + cross_val_score(clf, X, y, params=fit_params) def test_cross_val_score_score_func(): @@ -1160,7 +1167,7 @@ def test_cross_val_score_sparse_fit_params(): X, y = iris.data, iris.target clf = MockClassifier() fit_params = {"sparse_sample_weight": coo_matrix(np.eye(X.shape[0]))} - a = cross_val_score(clf, X, y, fit_params=fit_params, cv=3) + a = cross_val_score(clf, X, y, params=fit_params, cv=3) assert_array_equal(a, np.ones(3)) @@ -2082,12 +2089,23 @@ def test_fit_and_score_failing(): failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) # dummy X data X = np.arange(1, 10) - fit_and_score_args = [failing_clf, X, None, dict(), None, None, 0, None, None] + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=None, + test=None, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) # passing error score to trigger the warning message - fit_and_score_kwargs = {"error_score": "raise"} + fit_and_score_args["error_score"] = "raise" # check if exception was raised, with default error_score='raise' with pytest.raises(ValueError, match="Failing classifier failed as 
required"): - _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) + _fit_and_score(**fit_and_score_args) assert failing_clf.score() == 0.0 # FailingClassifier coverage @@ -2097,14 +2115,21 @@ def test_fit_and_score_working(): clf = SVC(kernel="linear", random_state=0) train, test = next(ShuffleSplit().split(X)) # Test return_parameters option - fit_and_score_args = [clf, X, y, dict(), train, test, 0] - fit_and_score_kwargs = { - "parameters": {"max_iter": 100, "tol": 0.1}, - "fit_params": None, - "return_parameters": True, - } - result = _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) - assert result["parameters"] == fit_and_score_kwargs["parameters"] + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] class DataDependentFailingClassifier(BaseEstimator): @@ -2315,13 +2340,22 @@ def test_fit_and_score_verbosity( train, test = next(ShuffleSplit().split(X)) # test print without train score - fit_and_score_args = [clf, X, y, scorer, train, test, verbose, None, None] - fit_and_score_kwargs = { - "return_train_score": train_score, - "split_progress": split_prg, - "candidate_progress": cdt_prg, - } - _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) out, _ = capsys.readouterr() outlines = out.split("\n") if len(outlines) > 2: @@ -2336,9 +2370,15 @@ def test_score(): def two_params_scorer(estimator, X_test): return None - fit_and_score_args = [None, None, None, two_params_scorer] with pytest.raises(ValueError, match=error_message): - _score(*fit_and_score_args, error_score=np.nan) + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) def test_callable_multimetric_confusion_matrix_cross_validate(): @@ -2391,3 +2431,149 @@ def test_cross_validate_return_indices(global_random_seed): for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)): assert_array_equal(train_indices[split_idx], expected_train_idx) assert_array_equal(test_indices[split_idx], expected_test_idx) + + +# Tests for metadata routing in cross_val* +# ======================================== + + +# TODO(1.6): remove this test in 1.6 +def test_cross_validate_fit_param_deprecation(): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + cross_validate(estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + cross_validate( + estimator=ConsumingClassifier(), X=X, y=y, fit_params={}, params={} + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_groups_with_routing_validation(cv_method): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. 
+ """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + cv_method( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_passed_unrequested_metadata(cv_method): + """Check that we raise an error when passing metadata that is not + requested.""" + err_msg = re.escape("['metadata'] are passed to cross validation") + with pytest.raises(ValueError, match=err_msg): + cv_method( + estimator=ConsumingClassifier(), + X=X, + y=y, + params=dict(metadata=[]), + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "cv_method", [cross_validate, cross_val_score, cross_val_predict] +) +def test_cross_validate_routing(cv_method): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + extra_params = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + # cross_val_score doesn't support multiple scorers + cross_val_score: dict(scoring=scorer), + # cross_val_predict doesn't need a scorer + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if cv_method is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + cv_method( + estimator, + X=X, + y=y, + cv=splitter, + **extra_params[cv_method], + params=params, + ) + + if cv_method is not cross_val_predict: + # cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8bd71924f954b..a75f41307b758 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -163,10 +163,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para if _routing_enabled(): routed_params = 
process_routing( - obj=self, - method="partial_fit", - other_params=partial_fit_params, + self, + "partial_fit", sample_weight=sample_weight, + **partial_fit_params, ) else: if sample_weight is not None and not has_fit_parameter( @@ -249,10 +249,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): if _routing_enabled(): routed_params = process_routing( - obj=self, - method="fit", - other_params=fit_params, + self, + "fit", sample_weight=sample_weight, + **fit_params, ) else: if sample_weight is not None and not has_fit_parameter( @@ -706,9 +706,7 @@ def fit(self, X, Y, **fit_params): del Y_pred_chain if _routing_enabled(): - routed_params = process_routing( - obj=self, method="fit", other_params=fit_params - ) + routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch(estimator=Bunch(fit=fit_params)) diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx deleted file mode 100644 index d9b933cb43c66..0000000000000 --- a/sklearn/neighbors/_ball_tree.pyx +++ /dev/null @@ -1,195 +0,0 @@ -# Author: Jake Vanderplas -# License: BSD 3 clause - -__all__ = ['BallTree'] - -DOC_DICT = {'BinaryTree': 'BallTree', 'binary_tree': 'ball_tree'} - -VALID_METRICS = [ - 'BrayCurtisDistance64', - 'CanberraDistance64', - 'ChebyshevDistance64', - 'DiceDistance64', - 'EuclideanDistance64', - 'HammingDistance64', - 'HaversineDistance64', - 'JaccardDistance64', - 'MahalanobisDistance64', - 'ManhattanDistance64', - 'MinkowskiDistance64', - 'PyFuncDistance64', - 'RogersTanimotoDistance64', - 'RussellRaoDistance64', - 'SEuclideanDistance64', - 'SokalMichenerDistance64', - 'SokalSneathDistance64', - 'WMinkowskiDistance64', -] - -include "_binary_tree.pxi" - -# Inherit BallTree from BinaryTree -cdef class BallTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) - pass - - -# ---------------------------------------------------------------------- -# The functions below specialized the Binary Tree as a Ball Tree -# -# Note that these functions use the concept of "reduced distance". -# The reduced distance, defined for some metrics, is a quantity which -# is more efficient to compute than the distance, but preserves the -# relative rankings of the true distance. For example, the reduced -# distance for the Euclidean metric is the squared-euclidean distance. -# For some metrics, the reduced distance is simply the distance. 
- -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: - """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype=np.float64) - return 0 - - -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: - """Initialize the node for the dataset stored in tree.data""" - cdef intp_t n_features = tree.data.shape[1] - cdef intp_t n_points = idx_end - idx_start - - cdef intp_t i, j - cdef float64_t radius - cdef float64_t *this_pt - - cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data = &tree.data[0, 0] - cdef float64_t* centroid = &tree.node_bounds[0, i_node, 0] - - cdef bint with_sample_weight = tree.sample_weight is not None - cdef float64_t* sample_weight - cdef float64_t sum_weight_node - if with_sample_weight: - sample_weight = &tree.sample_weight[0] - - # determine Node centroid - for j in range(n_features): - centroid[j] = 0 - - if with_sample_weight: - sum_weight_node = 0 - for i in range(idx_start, idx_end): - sum_weight_node += sample_weight[idx_array[i]] - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] * sample_weight[idx_array[i]] - - for j in range(n_features): - centroid[j] /= sum_weight_node - else: - for i in range(idx_start, idx_end): - this_pt = data + n_features * idx_array[i] - for j from 0 <= j < n_features: - centroid[j] += this_pt[j] - - for j in range(n_features): - centroid[j] /= n_points - - # determine Node radius - radius = 0 - for i in range(idx_start, idx_end): - radius = fmax(radius, - tree.rdist(centroid, - data + n_features * idx_array[i], - n_features)) - - node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) - node_data[i_node].idx_start = idx_start - node_data[i_node].idx_end = idx_end - return 0 - - -cdef inline float64_t min_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return fmax(0, dist_pt - tree.node_data[i_node].radius) - - -cdef inline float64_t max_dist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - return dist_pt + tree.node_data[i_node].radius - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: - """Compute the minimum and maximum distance between a point and a node""" - cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], - tree.data.shape[1]) - cdef float64_t rad = tree.node_data[i_node].radius - min_dist[0] = fmax(0, dist_pt - rad) - max_dist[0] = dist_pt + rad - return 0 - - -cdef inline float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: - """Compute the minimum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(min_dist(tree, i_node, pt)) - else: - return tree.dist_metric._dist_to_rdist(min_dist(tree, i_node, pt)) - - -cdef inline float64_t max_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1: - """Compute the maximum reduced-distance between a point and a node""" - if tree.euclidean: - return euclidean_dist_to_rdist64(max_dist(tree, i_node, pt)) 
- else: - return tree.dist_metric._dist_to_rdist(max_dist(tree, i_node, pt)) - - -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return fmax(0, (dist_pt - tree1.node_data[i_node1].radius - - tree2.node_data[i_node2].radius)) - - -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum distance between two nodes""" - cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], - &tree1.node_bounds[0, i_node1, 0], - tree1.data.shape[1]) - return (dist_pt + tree1.node_data[i_node1].radius - + tree2.node_data[i_node2].radius) - - -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the minimum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(min_dist_dual(tree1, i_node1, - tree2, i_node2)) - - -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: - """compute the maximum reduced distance between two nodes""" - if tree1.euclidean: - return euclidean_dist_to_rdist64(max_dist_dual(tree1, i_node1, - tree2, i_node2)) - else: - return tree1.dist_metric._dist_to_rdist(max_dist_dual(tree1, i_node1, - tree2, i_node2)) diff --git a/sklearn/neighbors/_ball_tree.pyx.tp b/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000..92b26714e5d9f --- /dev/null +++ b/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,284 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# Author: Jake Vanderplas +# License: BSD 3 clause + +}} + + +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + 
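For orientation, the Tempita-style `{{for name_suffix, ...}}` blocks above are expanded at build time into one concrete definition per floating-point dtype. A rough sketch of what the generated `_ball_tree.pyx` would contain for the `'64'` entry (illustrative only, inferred by substituting the template values; not part of the patch itself):

    DOC_DICT64 = {
        'BinaryTree': 'BallTree64',
        'binary_tree': 'ball_tree64',
    }

    # ... VALID_METRICS64 lists the corresponding *Distance64 metric names ...

    # Inherit BallTree64 from BinaryTree64
    cdef class BallTree64(BinaryTree64):
        __doc__ = CLASS_DOC.format(**DOC_DICT64)
        pass

The `'32'` entry expands the same way with `float32` types, which is how the single template yields both the 32-bit and 64-bit trees exported in `__all__`.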
+#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 
nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index dcff18e10fa48..519db9bead3d3 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -19,7 +19,7 @@ from ..base import BaseEstimator, MultiOutputMixin, is_classifier from ..exceptions import DataConversionWarning, EfficiencyWarning -from ..metrics import 
pairwise_distances_chunked +from ..metrics import DistanceMetric, pairwise_distances_chunked from ..metrics._pairwise_distances_reduction import ( ArgKmin, RadiusNeighbors, @@ -414,7 +414,11 @@ def _check_algorithm_metric(self): if self.algorithm == "auto": if self.metric == "precomputed": alg_check = "brute" - elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): alg_check = "ball_tree" else: alg_check = "brute" @@ -430,7 +434,9 @@ def _check_algorithm_metric(self): "in very poor performance." % self.metric ) - elif self.metric not in VALID_METRICS[alg_check]: + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): raise ValueError( "Metric '%s' not valid. Use " "sorted(sklearn.neighbors.VALID_METRICS['%s']) " @@ -563,9 +569,11 @@ def _fit(self, X, y=None): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE[ - "brute" - ] and not callable(self.effective_metric_): + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): raise ValueError( "Metric '%s' not valid for sparse input. " "Use sorted(sklearn.neighbors." diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi.tp similarity index 90% rename from sklearn/neighbors/_binary_tree.pxi rename to sklearn/neighbors/_binary_tree.pxi.tp index b60ea3a0a6d70..6322f809f7eb9 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -1,14 +1,32 @@ -#!python +{{py: +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] # KD Tree and Ball Tree # ===================== # # Author: Jake Vanderplas , 2012-2013 +# Omar Salman +# # License: BSD # -# This file is meant to be a literal include in a pyx file. -# See ball_tree.pyx and kd_tree.pyx +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +}} + + +# KD Tree and Ball Tree +# ===================== # # The routines here are the core algorithms of the KDTree and BallTree # structures. 
If Cython supported polymorphism, we would be able to @@ -143,6 +161,7 @@ # """Compute the maximum distance between two nodes""" cimport numpy as cnp +from cython cimport floating from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax from libc.stdlib cimport calloc, malloc, free @@ -154,15 +173,19 @@ import warnings from ..metrics._dist_metrics cimport ( DistanceMetric, DistanceMetric64, + DistanceMetric32, euclidean_dist64, + euclidean_dist32, euclidean_rdist64, + euclidean_rdist32, euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, ) from ._partition_nodes cimport partition_node_indices from ..utils import check_array -from ..utils._typedefs cimport float64_t, intp_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t from ..utils._heap cimport heap_push from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort @@ -500,8 +523,9 @@ def kernel_norm(h, d, kernel, return_log=False): else: return np.exp(result) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} -cdef class NeighborsHeap: +cdef class NeighborsHeap{{name_suffix}}: """A max-heap structure to keep track of distances/indices of neighbors This implements an efficient pre-allocated set of fixed-size heaps @@ -516,19 +540,19 @@ cdef class NeighborsHeap: n_nbrs : int the size of each heap. """ - cdef float64_t[:, ::1] distances + cdef {{INPUT_DTYPE_t}}[:, ::1] distances cdef intp_t[:, ::1] indices def __cinit__(self): # One-element arrays are used as placeholders to prevent # any problem due to potential access to those attributes # (e.g. assigning to NULL or a to value in another segment). - self.distances = np.zeros((1, 1), dtype=np.float64, order='C') + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') self.indices = np.zeros((1, 1), dtype=np.intp, order='C') def __init__(self, n_pts, n_nbrs): self.distances = np.full( - (n_pts, n_nbrs), np.inf, dtype=np.float64, order='C' + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' ) self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') @@ -571,14 +595,16 @@ cdef class NeighborsHeap: ) return 0 -# ------------------------------------------------------------ +{{endfor}} + +#------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of # j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) -cdef intp_t find_node_split_dim(float64_t* data, - intp_t* node_indices, - intp_t n_features, - intp_t n_points) except -1: +cdef intp_t find_node_split_dim(const floating* data, + intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: """Find the dimension with the largest spread. 
Parameters @@ -764,29 +790,31 @@ def newObj(obj): return obj.__new__(obj) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + ###################################################################### -# define the reverse mapping of VALID_METRICS +# define the reverse mapping of VALID_METRICS{{name_suffix}} from sklearn.metrics._dist_metrics import get_valid_metric_ids -VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) ###################################################################### # Binary Tree class -cdef class BinaryTree: +cdef class BinaryTree{{name_suffix}}: - cdef readonly const float64_t[:, ::1] data - cdef readonly const float64_t[::1] sample_weight + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight cdef public float64_t sum_weight cdef public const intp_t[::1] idx_array cdef public const NodeData_t[::1] node_data - cdef public const float64_t[:, :, ::1] node_bounds + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds cdef intp_t leaf_size cdef intp_t n_levels cdef intp_t n_nodes - cdef DistanceMetric64 dist_metric + cdef DistanceMetric{{name_suffix}} dist_metric cdef int euclidean # variables to keep track of building & querying stats @@ -795,7 +823,7 @@ cdef class BinaryTree: cdef int n_splits cdef int n_calls - valid_metrics = VALID_METRIC_IDS + valid_metrics = VALID_METRIC_IDS{{name_suffix}} # Use cinit to initialize all arrays to empty: this will prevent memory # errors and seg-faults in rare cases where __init__ is not called @@ -803,11 +831,11 @@ cdef class BinaryTree: # any problem due to potential access to this attribute # (e.g. assigning to NULL or a to value in another segment). 
def __cinit__(self): - self.data = np.empty((1, 1), dtype=np.float64, order='C') - self.sample_weight = np.empty(1, dtype=np.float64, order='C') + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') self.idx_array = np.empty(1, dtype=np.intp, order='C') self.node_data = np.empty(1, dtype=NodeData, order='C') - self.node_bounds = np.empty((1, 1, 1), dtype=np.float64) + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) self.leaf_size = 0 self.n_levels = 0 @@ -823,7 +851,7 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): # validate data - self.data = check_array(data, dtype=np.float64, order='C') + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') if self.data.size == 0: raise ValueError("X is an empty array") @@ -834,15 +862,15 @@ cdef class BinaryTree: raise ValueError("leaf_size must be greater than or equal to 1") self.leaf_size = leaf_size - self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance64') + == 'EuclideanDistance{{name_suffix}}') metric = self.dist_metric.__class__.__name__ - if metric not in VALID_METRICS: + if metric not in VALID_METRICS{{name_suffix}}: raise ValueError('metric {metric} is not valid for ' '{BinaryTree}'.format(metric=metric, - **DOC_DICT)) + **DOC_DICT{{name_suffix}})) self.dist_metric._validate_data(self.data) # determine number of levels in the tree, and from this @@ -859,7 +887,7 @@ cdef class BinaryTree: self._update_sample_weight(n_samples, sample_weight) # Allocate tree-specific data - allocate_data(self, self.n_nodes, n_features) + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) self._recursive_build( node_data=self.node_data.base, i_node=0, @@ -870,7 +898,7 @@ cdef class BinaryTree: def _update_sample_weight(self, n_samples, sample_weight): if sample_weight is not None: self.sample_weight = np.asarray( - sample_weight, dtype=np.float64, order='C') + sample_weight, dtype={{INPUT_DTYPE}}, order='C') self.sum_weight = np.sum(self.sample_weight) else: self.sample_weight = None @@ -982,17 +1010,17 @@ cdef class BinaryTree: self.node_bounds.base, ) - cdef inline float64_t dist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t dist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the distance between arrays x1 and x2""" self.n_calls += 1 if self.euclidean: - return euclidean_dist64(x1, x2, size) + return euclidean_dist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.dist(x1, x2, size) - cdef inline float64_t rdist(self, float64_t* x1, float64_t* x2, - intp_t size) except -1 nogil: + cdef inline float64_t rdist(self, {{INPUT_DTYPE_t}}* x1, {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: """Compute the reduced distance between arrays x1 and x2. 
The reduced distance, defined for some metrics, is a quantity which @@ -1002,7 +1030,7 @@ cdef class BinaryTree: """ self.n_calls += 1 if self.euclidean: - return euclidean_rdist64(x1, x2, size) + return euclidean_rdist{{name_suffix}}(x1, x2, size) else: return self.dist_metric.rdist(x1, x2, size) @@ -1023,10 +1051,10 @@ cdef class BinaryTree: cdef intp_t n_points = idx_end - idx_start cdef intp_t n_mid = n_points / 2 cdef intp_t* idx_array = &self.idx_array[idx_start] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # initialize node data - init_node(self, node_data, i_node, idx_start, idx_end) + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) if 2 * i_node + 1 >= self.n_nodes: node_data[i_node].is_leaf = True @@ -1103,7 +1131,7 @@ cdef class BinaryTree: corresponding point. """ # XXX: we should allow X to be a pre-built tree. - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " @@ -1115,13 +1143,13 @@ cdef class BinaryTree: # flatten X, and save original shape information np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef const float64_t[:, ::1] Xarr = np_Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr cdef float64_t reduced_dist_LB cdef intp_t i - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt # initialize heap for neighbors - cdef NeighborsHeap heap = NeighborsHeap(Xarr.shape[0], k) + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) # node heap for breadth-first queries cdef NodeHeap nodeheap @@ -1141,7 +1169,7 @@ cdef class BinaryTree: if breadth_first: self._query_dual_breadthfirst(other, heap, nodeheap) else: - reduced_dist_LB = min_rdist_dual(self, 0, other, 0) + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) bounds = np.full(other.node_data.shape[0], np.inf) self._query_dual_depthfirst(0, other, 0, bounds, heap, reduced_dist_LB) @@ -1155,7 +1183,7 @@ cdef class BinaryTree: else: with nogil: for i in range(Xarr.shape[0]): - reduced_dist_LB = min_rdist(self, 0, pt) + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) self._query_single_depthfirst(0, pt, i, heap, reduced_dist_LB) pt += Xarr.shape[1] @@ -1233,20 +1261,20 @@ cdef class BinaryTree: cdef intp_t i, count_i = 0 cdef intp_t n_features = self.data.shape[1] - cdef float64_t[::1] dist_arr_i + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i cdef intp_t[::1] idx_arr_i, counts - cdef float64_t* pt + cdef {{INPUT_DTYPE_t}}* pt cdef intp_t** indices = NULL - cdef float64_t** distances = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") - cdef const float64_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1265,7 +1293,7 @@ cdef class BinaryTree: if indices == NULL: raise MemoryError() if return_distance: - distances = calloc(Xarr.shape[0], sizeof(float64_t*)) + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) if distances == NULL: free(indices) raise MemoryError() @@ -1273,7 +1301,7 @@ cdef class BinaryTree: 
np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) idx_arr_i = np_idx_arr - np_dist_arr = np.zeros(self.data.shape[0], dtype=np.float64) + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) dist_arr_i = np_dist_arr counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) @@ -1306,11 +1334,11 @@ cdef class BinaryTree: if return_distance: # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() - distances[i] = malloc(counts[i] * sizeof(float64_t)) + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) if distances[i] == NULL: memory_error = True break - memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof(float64_t)) + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) try: if memory_error: @@ -1333,7 +1361,7 @@ cdef class BinaryTree: # make a new numpy array that wraps the existing data # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 - distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_DOUBLE, distances[i]) + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) # make sure the data will be freed when the numpy array is garbage collected PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) # make sure the data is not freed twice @@ -1445,18 +1473,18 @@ cdef class BinaryTree: cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != n_features: raise ValueError("query data dimension must " "match training data dimension") Xarr_np = X.reshape((-1, n_features)) - cdef float64_t[:, ::1] Xarr = Xarr_np + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np - log_density_arr = np.zeros(Xarr.shape[0], dtype=np.float64) - cdef float64_t[::1] log_density = log_density_arr + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] cdef NodeHeap nodeheap if breadth_first: @@ -1481,7 +1509,7 @@ cdef class BinaryTree: pt += n_features else: for i in range(Xarr.shape[0]): - min_max_dist(self, 0, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) # compute max & min bounds on density within top node log_min_bound = (log(self.sum_weight) + compute_log_kernel(dist_UB, @@ -1539,14 +1567,14 @@ cdef class BinaryTree: cdef intp_t i # validate X and prepare for query - X = check_array(X, dtype=np.float64, order='C') + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") np_Xarr = X.reshape((-1, self.data.shape[1])) - cdef float64_t[:, ::1] Xarr = np_Xarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr # prepare r for query r = np.asarray(r, dtype=np.float64, order='C') @@ -1561,7 +1589,7 @@ cdef class BinaryTree: count = np.zeros(r.shape[0], dtype=np.intp) cdef intp_t[::1] carr = count - cdef float64_t* pt = &Xarr[0, 0] + cdef {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] if dualtree: other = self.__class__(Xarr, metric=self.dist_metric, @@ -1576,17 +1604,21 @@ cdef class BinaryTree: return count - cdef int _query_single_depthfirst(self, intp_t i_node, - float64_t* pt, intp_t i_pt, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1 nogil: + cdef int 
_query_single_depthfirst( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: """Recursive Single-tree k-neighbors query, depth-first approach""" cdef NodeData_t node_info = self.node_data[i_node] cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 cdef intp_t i, i1, i2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # ------------------------------------------------------------ # Case 1: query point is outside node radius: @@ -1611,8 +1643,8 @@ cdef class BinaryTree: self.n_splits += 1 i1 = 2 * i_node + 1 i2 = i1 + 1 - reduced_dist_LB_1 = min_rdist(self, i1, pt) - reduced_dist_LB_2 = min_rdist(self, i2, pt) + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) # recursively query subnodes if reduced_dist_LB_1 <= reduced_dist_LB_2: @@ -1627,19 +1659,22 @@ cdef class BinaryTree: reduced_dist_LB_1) return 0 - cdef int _query_single_breadthfirst(self, float64_t* pt, - intp_t i_pt, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_single_breadthfirst( + self, + {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive single-tree k-neighbors query, breadth-first search""" cdef intp_t i, i_node cdef float64_t dist_pt, reduced_dist_LB cdef NodeData_t* node_data = &self.node_data[0] - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] # Set up the node heap and push the head node onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist(self, 0, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) @@ -1672,15 +1707,19 @@ cdef class BinaryTree: self.n_splits += 1 for i in range(2 * i_node + 1, 2 * i_node + 3): nodeheap_item.i1 = i - nodeheap_item.val = min_rdist(self, i, pt) + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) nodeheap.push(nodeheap_item) return 0 - cdef int _query_dual_depthfirst(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t[::1] bounds, - NeighborsHeap heap, - float64_t reduced_dist_LB) except -1: + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: """Recursive dual-tree k-neighbors query, depth-first""" # note that the array `bounds` is maintained such that # bounds[i] is the largest distance among any of the @@ -1688,8 +1727,8 @@ cdef class BinaryTree: cdef NodeData_t node_info1 = self.node_data[i_node1] cdef NodeData_t node_info2 = other.node_data[i_node2] - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 @@ -1740,9 +1779,9 @@ cdef class BinaryTree: # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf and node_info2.radius > node_info1.radius): - reduced_dist_LB1 = min_rdist_dual(self, i_node1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 1) - reduced_dist_LB2 = min_rdist_dual(self, i_node1, + reduced_dist_LB2 = 
min_rdist_dual{{name_suffix}}(self, i_node1, other, 2 * i_node2 + 2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1760,9 +1799,9 @@ cdef class BinaryTree: # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: - reduced_dist_LB1 = min_rdist_dual(self, 2 * i_node1 + 1, + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, other, i_node2) - reduced_dist_LB2 = min_rdist_dual(self, 2 * i_node1 + 2, + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, other, i_node2) if reduced_dist_LB1 < reduced_dist_LB2: @@ -1777,9 +1816,12 @@ cdef class BinaryTree: bounds, heap, reduced_dist_LB1) return 0 - cdef int _query_dual_breadthfirst(self, BinaryTree other, - NeighborsHeap heap, - NodeHeap nodeheap) except -1: + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: """Non-recursive dual-tree k-neighbors query, breadth-first""" cdef intp_t i, i1, i2, i_node1, i_node2, i_pt cdef float64_t dist_pt, reduced_dist_LB @@ -1787,13 +1829,13 @@ cdef class BinaryTree: cdef NodeData_t* node_data1 = &self.node_data[0] cdef NodeData_t* node_data2 = &other.node_data[0] cdef NodeData_t node_info1, node_info2 - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t n_features = self.data.shape[1] # Set up the node heap and push the head nodes onto it cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_rdist_dual(self, 0, other, 0) + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) nodeheap_item.i1 = 0 nodeheap_item.i2 = 0 nodeheap.push(nodeheap_item) @@ -1845,7 +1887,7 @@ cdef class BinaryTree: nodeheap_item.i1 = i_node1 for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): nodeheap_item.i2 = i2 - nodeheap_item.val = min_rdist_dual(self, i_node1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, other, i2) nodeheap.push(nodeheap_item) @@ -1856,21 +1898,24 @@ cdef class BinaryTree: nodeheap_item.i2 = i_node2 for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): nodeheap_item.i1 = i1 - nodeheap_item.val = min_rdist_dual(self, i1, + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, other, i_node2) nodeheap.push(nodeheap_item) return 0 - cdef intp_t _query_radius_single(self, - intp_t i_node, - float64_t* pt, float64_t r, - intp_t* indices, - float64_t* distances, - intp_t count, - int count_only, - int return_distance) noexcept nogil: + cdef intp_t _query_radius_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: """recursive single-tree radius query, depth-first""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -1879,7 +1924,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Case 1: all node points are outside distance r. 
@@ -1937,13 +1982,17 @@ cdef class BinaryTree: return count - cdef float64_t _kde_single_breadthfirst(self, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - NodeHeap nodeheap, - float64_t* node_log_min_bounds, - float64_t* node_log_bound_spreads): + cdef float64_t _kde_single_breadthfirst( + self, {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): """non-recursive single-tree kernel density estimation""" # For the given point, node_log_min_bounds and node_log_bound_spreads # will encode the current bounds on the density between the point @@ -1957,9 +2006,9 @@ cdef class BinaryTree: cdef float64_t global_log_min_bound, global_log_bound_spread cdef float64_t global_log_max_bound - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight if with_sample_weight: sample_weight = &self.sample_weight[0] cdef intp_t* idx_array = &self.idx_array[0] @@ -1981,13 +2030,13 @@ cdef class BinaryTree: # push the top node to the heap cdef NodeHeapData_t nodeheap_item - nodeheap_item.val = min_dist(self, 0, pt) + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) nodeheap_item.i1 = 0 nodeheap.push(nodeheap_item) - global_log_min_bound = log(N) + compute_log_kernel(max_dist(self, - 0, pt), - h, kernel) + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, h, kernel) global_log_bound_spread = logsubexp(global_log_max_bound, @@ -2056,8 +2105,8 @@ cdef class BinaryTree: N1 = node_data[i1].idx_end - node_data[i1].idx_start N2 = node_data[i2].idx_end - node_data[i2].idx_start - min_max_dist(self, i1, pt, &dist_LB_1, &dist_UB_1) - min_max_dist(self, i2, pt, &dist_LB_2, &dist_UB_2) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) node_log_min_bounds[i1] = (log(N1) + compute_log_kernel(dist_UB_1, @@ -2102,14 +2151,19 @@ cdef class BinaryTree: global_log_bound_spread - log(2)) cdef int _kde_single_depthfirst( - self, intp_t i_node, float64_t* pt, - KernelType kernel, float64_t h, - float64_t log_knorm, - float64_t log_atol, float64_t log_rtol, - float64_t local_log_min_bound, - float64_t local_log_bound_spread, - float64_t* global_log_min_bound, - float64_t* global_log_bound_spread) except -1: + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: """recursive single-tree kernel density estimate, depth-first""" # For the given point, local_min_bound and local_max_bound give the # minimum and maximum density for the current node, while @@ -2119,10 +2173,10 @@ cdef class BinaryTree: cdef intp_t i, i1, i2, iw, start, end cdef float64_t N1, N2 - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef NodeData_t* node_data = &self.node_data[0] cdef bint with_sample_weight = self.sample_weight is not None - cdef float64_t* 
sample_weight + cdef {{INPUT_DTYPE_t}}* sample_weight cdef float64_t log_weight if with_sample_weight: sample_weight = &self.sample_weight[0] @@ -2194,7 +2248,7 @@ cdef class BinaryTree: N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) - min_max_dist(self, i1, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, kernel) child1_log_bound_spread = logsubexp(log(N1) + @@ -2202,7 +2256,7 @@ cdef class BinaryTree: kernel), child1_log_min_bound) - min_max_dist(self, i2, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, kernel) child2_log_bound_spread = logsubexp(log(N2) + @@ -2238,11 +2292,17 @@ cdef class BinaryTree: global_log_bound_spread) return 0 - cdef int _two_point_single(self, intp_t i_node, float64_t* pt, float64_t* r, - intp_t* count, intp_t i_min, - intp_t i_max) except -1: + cdef int _two_point_single( + self, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive single-tree two-point correlation function query""" - cdef float64_t* data = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data = &self.data[0, 0] cdef intp_t* idx_array = &self.idx_array[0] cdef intp_t n_features = self.data.shape[1] cdef NodeData_t node_info = self.node_data[i_node] @@ -2251,7 +2311,7 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2287,13 +2347,19 @@ cdef class BinaryTree: count, i_min, i_max) return 0 - cdef int _two_point_dual(self, intp_t i_node1, - BinaryTree other, intp_t i_node2, - float64_t* r, intp_t* count, - intp_t i_min, intp_t i_max) except -1: + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: """recursive dual-tree two-point correlation function query""" - cdef float64_t* data1 = &self.data[0, 0] - cdef float64_t* data2 = &other.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] cdef intp_t* idx_array1 = &self.idx_array[0] cdef intp_t* idx_array2 = &other.idx_array[0] cdef NodeData_t node_info1 = self.node_data[i_node1] @@ -2305,8 +2371,8 @@ cdef class BinaryTree: cdef float64_t reduced_r cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 - dist_LB = min_dist_dual(self, i_node1, other, i_node2) - dist_UB = max_dist_dual(self, i_node1, other, i_node2) + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) # ------------------------------------------------------------ # Go through bounds and check for cuts @@ -2359,21 +2425,11 @@ cdef class BinaryTree: r, count, i_min, i_max) return 0 +{{endfor}} ###################################################################### # Python functions for benchmarking and testing C implementations -def load_heap(float64_t[:, ::1] X, intp_t k): - """test fully loading the heap""" - assert k <= X.shape[1] - cdef NeighborsHeap heap = 
NeighborsHeap(X.shape[0], k) - cdef intp_t i, j - for i in range(X.shape[0]): - for j in range(X.shape[1]): - heap._push(i, X[i, j], j) - return heap.get_arrays() - - def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): """In-place simultaneous sort the given row of the arrays @@ -2412,10 +2468,12 @@ def nodeheap_sort(float64_t[::1] vals): return np.asarray(vals_sorted), np.asarray(indices) -cdef inline float64_t _total_node_weight(NodeData_t* node_data, - float64_t* sample_weight, - intp_t* idx_array, - intp_t i_node): +cdef inline float64_t _total_node_weight( + NodeData_t* node_data, + const floating* sample_weight, + intp_t* idx_array, + intp_t i_node, +): cdef intp_t i cdef float64_t N = 0.0 for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index a9b78d6e499c9..0f5bd1439f81c 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -329,8 +329,8 @@ def predict_proba(self, X): self._fit_X, k=self.n_neighbors, weights=self.weights, - labels=self._y, - unique_labels=self.classes_, + Y_labels=self._y, + unique_Y_labels=self.classes_, metric=metric, metric_kwargs=metric_kwargs, # `strategy="parallel_on_X"` has in practice be shown diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx.tp similarity index 65% rename from sklearn/neighbors/_kd_tree.pyx rename to sklearn/neighbors/_kd_tree.pyx.tp index f5cd2617be147..1006ec2a8398c 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx.tp @@ -1,22 +1,52 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD -__all__ = ['KDTree'] +}} + -DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] -VALID_METRICS = ['EuclideanDistance64', 'ManhattanDistance64', - 'ChebyshevDistance64', 'MinkowskiDistance64'] +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} include "_binary_tree.pxi" -# Inherit KDTree from BinaryTree -cdef class KDTree(BinaryTree): - __doc__ = CLASS_DOC.format(**DOC_DICT) +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) pass +{{endfor}} + # ---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a KD Tree @@ -28,27 +58,36 @@ cdef class KDTree(BinaryTree): # distance for the Euclidean metric is the squared-euclidean distance. # For some metrics, the reduced distance is simply the distance. 
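# A small NumPy illustration, not taken from the patch, of the "reduced
# distance" mentioned in the comment above: for the Euclidean metric the
# reduced distance is the squared distance, which preserves the ordering of
# true distances, so the tree code can compare and prune nodes without taking
# square roots. All names below are illustrative.
import numpy as np

pt = np.array([0.0, 0.0])
candidates = np.array([[3.0, 4.0], [1.0, 1.0], [0.5, 0.2]])

rdist = ((candidates - pt) ** 2).sum(axis=1)  # reduced (squared) distance
dist = np.sqrt(rdist)                         # true Euclidean distance

# Ranking neighbours by rdist gives the same order as ranking by dist.
assert (np.argsort(rdist) == np.argsort(dist)).all()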
+{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} -cdef int allocate_data(BinaryTree tree, intp_t n_nodes, - intp_t n_features) except -1: +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: """Allocate arrays needed for the KD Tree""" - tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype=np.float64) + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) return 0 -cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, - intp_t idx_start, intp_t idx_end) except -1: +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: """Initialize the node for the dataset stored in tree.data""" cdef intp_t n_features = tree.data.shape[1] cdef intp_t i, j cdef float64_t rad = 0 - cdef float64_t* lower_bounds = &tree.node_bounds[0, i_node, 0] - cdef float64_t* upper_bounds = &tree.node_bounds[1, i_node, 0] - cdef float64_t* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] cdef intp_t* idx_array = &tree.idx_array[0] - cdef float64_t* data_row + cdef {{INPUT_DTYPE_t}}* data_row # determine Node bounds for j in range(n_features): @@ -81,8 +120,11 @@ cdef int init_node(BinaryTree tree, NodeData_t[::1] node_data, intp_t i_node, return 0 -cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, - float64_t* pt) except -1 nogil: +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: """Compute the minimum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] cdef float64_t d, d_lo, d_hi, rdist=0.0 @@ -105,16 +147,26 @@ cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, return rdist -cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the minimum distance between a point and a node""" if tree.dist_metric.p == INF: - return min_rdist(tree, i_node, pt) + return min_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(min_rdist(tree, i_node, pt), 1. / tree.dist_metric.p) + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) -cdef float64_t max_rdist(BinaryTree tree, - intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum reduced-distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -134,16 +186,28 @@ cdef float64_t max_rdist(BinaryTree tree, return rdist -cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt) except -1: +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, +) except -1: """Compute the maximum distance between a point and a node""" if tree.dist_metric.p == INF: - return max_rdist(tree, i_node, pt) + return max_rdist{{name_suffix}}(tree, i_node, pt) else: - return pow(max_rdist(tree, i_node, pt), 1. 
/ tree.dist_metric.p) - - -cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, - float64_t* min_dist, float64_t* max_dist) except -1 nogil: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: """Compute the minimum and maximum distance between a point and a node""" cdef intp_t n_features = tree.data.shape[1] @@ -177,8 +241,12 @@ cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, return 0 -cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -208,15 +276,24 @@ cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the minimum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(min_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) -cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum reduced distance between two nodes""" cdef intp_t n_features = tree1.data.shape[1] @@ -240,8 +317,20 @@ cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, return rdist -cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, - BinaryTree tree2, intp_t i_node2) except -1: +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: """Compute the maximum distance between two nodes""" - return tree1.dist_metric._rdist_to_dist(max_rdist_dual(tree1, i_node1, - tree2, i_node2)) + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 927fde873ee58..c6a0d4bb975c2 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,7 +1,8 @@ +from cython cimport floating from ..utils._typedefs cimport float64_t, intp_t cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx index d293b765ea279..011b024fccb14 100644 --- a/sklearn/neighbors/_partition_nodes.pyx +++ b/sklearn/neighbors/_partition_nodes.pyx @@ 
-16,6 +16,8 @@ # - https://en.cppreference.com/w/cpp/algorithm/nth_element. # - https://github.com/scikit-learn/scikit-learn/pull/11103 # - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + cdef extern from *: """ @@ -63,7 +65,7 @@ cdef extern from *: cdef int partition_node_indices( - float64_t *data, + floating *data, intp_t *node_indices, intp_t split_dim, intp_t split_index, diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index b9b7f4030d02c..2897c1ce409e8 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -15,6 +15,7 @@ import numpy as np from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric from ..utils._param_validation import StrOptions from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights @@ -71,7 +72,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : str or callable, default='minkowski' + metric : str, DistanceMetric object or callable, default='minkowski' Metric to use for distance computation. Default is "minkowski", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance @@ -89,6 +90,9 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string. + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + metric_params : dict, default=None Additional keyword arguments for the metric function. 
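# A hedged usage sketch, not part of the patch, of the behaviour documented
# above: passing a DistanceMetric object directly as `metric` to
# KNeighborsRegressor. Whether a given estimator / dtype combination accepts a
# DistanceMetric instance depends on this branch (the tests below skip the
# 32-bit tree cases), so treat this purely as an illustration.
import numpy as np
from sklearn.metrics import DistanceMetric
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X.sum(axis=1)

euclidean = DistanceMetric.get_metric("euclidean")
reg = KNeighborsRegressor(n_neighbors=3, metric=euclidean).fit(X, y)
print(reg.predict(X[:2]))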
@@ -164,6 +168,7 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): **NeighborsBase._parameter_constraints, "weights": [StrOptions({"uniform", "distance"}), callable, None], } + _parameter_constraints["metric"].append(DistanceMetric) _parameter_constraints.pop("radius") def __init__( diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index efca4e491ce01..5263f201f320b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -2,9 +2,9 @@ import numpy as np import pytest -from numpy.testing import assert_array_almost_equal +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal -from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 from sklearn.utils import check_random_state from sklearn.utils._testing import _convert_container from sklearn.utils.validation import check_array @@ -15,6 +15,13 @@ DIMENSION = 3 +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] BOOLEAN_METRICS = [ @@ -26,6 +33,11 @@ "sokalsneath", ] +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + def brute_force_neighbors(X, Y, k, metric, **kwargs): from sklearn.metrics import DistanceMetric @@ -37,9 +49,14 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + @pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) @pytest.mark.parametrize("array_type", ["list", "array"]) -def test_ball_tree_query_metrics(metric, array_type): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): rng = check_random_state(0) if metric in BOOLEAN_METRICS: X = rng.random_sample((40, 10)).round(0) @@ -52,31 +69,36 @@ def test_ball_tree_query_metrics(metric, array_type): k = 5 - bt = BallTree(X, leaf_size=1, metric=metric) + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) dist1, ind1 = bt.query(Y, k) dist2, ind2 = brute_force_neighbors(X, Y, k, metric) assert_array_almost_equal(dist1, dist2) -def test_query_haversine(): +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): rng = check_random_state(0) X = 2 * np.pi * rng.random_sample((40, 2)) - bt = BallTree(X, leaf_size=1, metric="haversine") + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") dist1, ind1 = bt.query(X, k=5) dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") - assert_array_almost_equal(dist1, dist2) + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) assert_array_almost_equal(ind1, ind2) -def test_array_object_type(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - BallTree(X) + BallTreeImplementation(X) -def test_bad_pyfunc_metric(): +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): def wrong_returned_value(x, 
y): return "1" @@ -86,8 +108,93 @@ def one_arg_func(x): X = np.ones((5, 2)) msg = "Custom distance function must accept two vectors and return a float." with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_returned_value) + BallTreeImplementation(X, metric=wrong_returned_value) msg = "takes 1 positional argument but 2 were given" with pytest.raises(TypeError, match=msg): - BallTree(X, metric=one_arg_func) + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + 
+ X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 1aee28cc36bd0..749601baaf66f 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,30 +1,100 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_equal -from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree from sklearn.utils.parallel import Parallel, delayed DIMENSION = 3 METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] -def test_array_object_type(): + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): """Check that we do not accept object dtype array.""" X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): - KDTree(X) + BinarySearchTree(X) -def test_kdtree_picklable_with_joblib(): +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): """Make sure that KDTree queries work when joblib memmaps. Non-regression test for #21685 and #21228.""" rng = np.random.RandomState(0) X = rng.random_sample((10, 3)) - tree = KDTree(X, leaf_size=2) + tree = BinarySearchTree(X, leaf_size=2) # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that # use to raise "ValueError: buffer source array is read-only" in a previous # version of the Cython code. Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 405ac3a6d0847..35fc210bea7f3 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -24,6 +24,9 @@ ) from sklearn.base import clone from sklearn.exceptions import DataConversionWarning, EfficiencyWarning, NotFittedError +from sklearn.metrics._dist_metrics import ( + DistanceMetric, +) from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( @@ -69,6 +72,7 @@ COMMON_VALID_METRICS = sorted( set.intersection(*map(set, neighbors.VALID_METRICS.values())) ) # type: ignore + P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -76,6 +80,25 @@ neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) neighbors.radius_neighbors_graph = ignore_warnings(neighbors.radius_neighbors_graph) +# A list containing metrics where the string specifies the use of the +# DistanceMetric object directly (as resolved in _parse_metric) +DISTANCE_METRIC_OBJS = ["DM_euclidean"] + + +def _parse_metric(metric: str, dtype=None): + """ + Helper function for properly building a type-specialized DistanceMetric instances. 
+ + Constructs a type-specialized DistanceMetric instance from a string + beginning with "DM_" while allowing a pass-through for other metric-specifying + strings. This is necessary since we wish to parameterize dtype independent of + metric, yet DistanceMetric requires it for construction. + + """ + if metric[:3] == "DM_": + return DistanceMetric.get_metric(metric[3:], dtype=dtype) + return metric + def _generate_test_params_for(metric: str, n_features: int): """Return list of DistanceMetric kwargs for tests.""" @@ -129,7 +152,7 @@ def _weight_func(dist): ], ) @pytest.mark.parametrize("query_is_train", [False, True]) -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) # type: ignore # noqa def test_unsupervised_kneighbors( global_dtype, n_samples, @@ -143,6 +166,8 @@ def test_unsupervised_kneighbors( # on their common metrics, with and without returning # distances + metric = _parse_metric(metric, global_dtype) + # Redefining the rng locally to use the same generated X local_rng = np.random.RandomState(0) X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -157,6 +182,12 @@ def test_unsupervised_kneighbors( results = [] for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, metric=metric ) @@ -206,7 +237,7 @@ def test_unsupervised_kneighbors( (1000, 5, 100), ], ) -@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) # type: ignore # noqa @pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( "NeighborsMixinSubclass", @@ -230,6 +261,19 @@ def test_neigh_predictions_algorithm_agnosticity( # The different algorithms must return identical predictions results # on their common metrics. + metric = _parse_metric(metric, global_dtype) + if isinstance(metric, DistanceMetric): + if "Classifier" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " classifiers." + ) + if "Radius" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " radius-neighbor estimators." + ) + # Redefining the rng locally to use the same generated X local_rng = np.random.RandomState(0) X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -244,6 +288,12 @@ def test_neigh_predictions_algorithm_agnosticity( ) for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." 
+ ) neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) neigh.fit(X, y) @@ -985,15 +1035,26 @@ def test_query_equidistant_kth_nn(algorithm): @pytest.mark.parametrize( ["algorithm", "metric"], - [ - ("ball_tree", "euclidean"), - ("kd_tree", "euclidean"), + list( + product( + ("kd_tree", "ball_tree", "brute"), + ("euclidean", *DISTANCE_METRIC_OBJS), + ) + ) + + [ ("brute", "euclidean"), ("brute", "precomputed"), ], ) def test_radius_neighbors_sort_results(algorithm, metric): # Test radius_neighbors[_graph] output when sort_result is True + + metric = _parse_metric(metric, np.float64) + if isinstance(metric, DistanceMetric): + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for radius-neighbor" + " estimators." + ) n_samples = 10 rng = np.random.RandomState(42) X = rng.random_sample((n_samples, 4)) @@ -1560,11 +1621,14 @@ def test_nearest_neighbors_validate_params(): neighbors.VALID_METRICS["brute"] ) - set(["pyfunc", *BOOL_METRICS]) - ), + ) + + DISTANCE_METRIC_OBJS, ) def test_neighbors_metrics( global_dtype, metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 ): + metric = _parse_metric(metric, global_dtype) + # Test computing the neighbors for various metrics algorithms = ["brute", "ball_tree", "kd_tree"] X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) @@ -1574,12 +1638,21 @@ def test_neighbors_metrics( for metric_params in metric_params_list: # Some metric (e.g. Weighted minkowski) are not supported by KDTree - exclude_kd_tree = metric not in neighbors.VALID_METRICS["kd_tree"] or ( - "minkowski" in metric and "w" in metric_params + exclude_kd_tree = ( + False + if isinstance(metric, DistanceMetric) + else metric not in neighbors.VALID_METRICS["kd_tree"] + or ("minkowski" in metric and "w" in metric_params) ) results = {} p = metric_params.pop("p", 2) for algorithm in algorithms: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, @@ -1684,10 +1757,14 @@ def custom_metric(x1, x2): assert_allclose(dist1, dist2) -@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +@pytest.mark.parametrize( + "metric", neighbors.VALID_METRICS["brute"] + DISTANCE_METRIC_OBJS +) def test_valid_brute_metric_for_auto_algorithm( global_dtype, metric, n_samples=20, n_features=12 ): + metric = _parse_metric(metric, global_dtype) + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) Xcsr = csr_matrix(X) @@ -2207,3 +2284,22 @@ def test_predict_dataframe(): knn = neighbors.KNeighborsClassifier(n_neighbors=2).fit(X, y) knn.predict(X) + + +def test_nearest_neighbours_works_with_p_less_than_1(): + """Check that NearestNeighbors works with :math:`p \\in (0,1)` when `algorithm` + is `"auto"` or `"brute"` regardless of the dtype of X. 
+ + Non-regression test for issue #26548 + """ + X = np.array([[1.0, 0.0], [0.0, 0.0], [0.0, 1.0]]) + neigh = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric_params={"p": 0.5} + ) + neigh.fit(X) + + y = neigh.radius_neighbors(X[0].reshape(1, -1), radius=4, return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + y = neigh.kneighbors(X[0].reshape(1, -1), return_distance=False) + assert_allclose(y[0], [0, 1, 2]) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index 590e72ab785d2..4d8bac12f7423 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -13,7 +13,7 @@ kernel_norm, ) from sklearn.neighbors._ball_tree import ( - NeighborsHeap as NeighborsHeapBT, + NeighborsHeap64 as NeighborsHeapBT, ) from sklearn.neighbors._ball_tree import ( nodeheap_sort as nodeheap_sort_bt, @@ -25,7 +25,7 @@ KDTree, ) from sklearn.neighbors._kd_tree import ( - NeighborsHeap as NeighborsHeapKDT, + NeighborsHeap64 as NeighborsHeapKDT, ) from sklearn.neighbors._kd_tree import ( nodeheap_sort as nodeheap_sort_kdt, diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index d6ad0001ad257..d85196e879b45 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -334,9 +334,7 @@ def _log_message(self, step_idx): def _check_method_params(self, method, props, **kwargs): if _routing_enabled(): - routed_params = process_routing( - self, method=method, other_params=props, **kwargs - ) + routed_params = process_routing(self, method, **props, **kwargs) return routed_params else: fit_params_steps = Bunch( @@ -586,7 +584,7 @@ def predict(self, X, **params): return self.steps[-1][1].predict(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict", other_params=params) + routed_params = process_routing(self, "predict", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) @@ -706,7 +704,7 @@ def predict_proba(self, X, **params): return self.steps[-1][1].predict_proba(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict_proba", other_params=params) + routed_params = process_routing(self, "predict_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_proba( @@ -747,7 +745,7 @@ def decision_function(self, X, **params): # not branching here since params is only available if # enable_metadata_routing=True - routed_params = process_routing(self, "decision_function", other_params=params) + routed_params = process_routing(self, "decision_function", **params) Xt = X for _, name, transform in self._iter(with_final=False): @@ -833,7 +831,7 @@ def predict_log_proba(self, X, **params): return self.steps[-1][1].predict_log_proba(Xt, **params) # metadata routing enabled - routed_params = process_routing(self, "predict_log_proba", other_params=params) + routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( @@ -882,7 +880,7 @@ def transform(self, X, **params): # not branching here since params is only available if # enable_metadata_routing=True - routed_params = process_routing(self, 
"transform", other_params=params) + routed_params = process_routing(self, "transform", **params) Xt = X for _, name, transform in self._iter(): Xt = transform.transform(Xt, **routed_params[name].transform) @@ -925,7 +923,7 @@ def inverse_transform(self, Xt, **params): # we don't have to branch here, since params is only non-empty if # enable_metadata_routing=True. - routed_params = process_routing(self, "inverse_transform", other_params=params) + routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter())) for _, name, transform in reverse_iter: Xt = transform.inverse_transform( @@ -981,7 +979,7 @@ def score(self, X, y=None, sample_weight=None, **params): # metadata routing is enabled. routed_params = process_routing( - self, "score", sample_weight=sample_weight, other_params=params + self, "score", sample_weight=sample_weight, **params ) Xt = X @@ -1108,7 +1106,7 @@ def get_metadata_routing(self): router = MetadataRouter(owner=self.__class__.__name__) # first we add all steps except the last one - for _, name, trans in self._iter(with_final=False): + for _, name, trans in self._iter(with_final=False, filter_passthrough=True): method_mapping = MethodMapping() # fit, fit_predict, and fit_transform call fit_transform if it # exists, or else fit and transform @@ -1142,7 +1140,7 @@ def get_metadata_routing(self): router.add(method_mapping=method_mapping, **{name: trans}) final_name, final_est = self.steps[-1] - if not final_est: + if final_est is None or final_est == "passthrough": return router # then we add the last step diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0a0447de95cd8..2c4ea4af450f2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -14,6 +14,7 @@ from ..utils._encode import _check_unknown, _encode, _get_counts, _unique from ..utils._mask import _get_mask from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config from ..utils.validation import _check_feature_names_in, check_is_fitted __all__ = ["OneHotEncoder", "OrdinalEncoder"] @@ -176,11 +177,11 @@ def _transform( warn_on_unknown=False, ignore_category_indices=None, ): - self._check_feature_names(X, reset=False) - self._check_n_features(X, reset=False) X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite ) + self._check_feature_names(X, reset=False) + self._check_n_features(X, reset=False) X_int = np.zeros((n_samples, n_features), dtype=int) X_mask = np.ones((n_samples, n_features), dtype=bool) @@ -437,7 +438,7 @@ def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) def _more_tags(self): - return {"X_types": ["categorical"]} + return {"X_types": ["2darray", "categorical"], "allow_nan": True} class OneHotEncoder(_BaseEncoder): @@ -1008,6 +1009,14 @@ def transform(self, X): returned. """ check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output == "pandas" and self.sparse_output: + raise ValueError( + "Pandas output does not support sparse data. Set sparse_output=False to" + " output pandas DataFrames or disable pandas output via" + ' `ohe.set_output(transform="default").' 
+ ) + # validation of X happens in _check_X called by _transform warn_on_unknown = self.drop is not None and self.handle_unknown in { "ignore", @@ -1499,15 +1508,11 @@ def fit(self, X, y=None): if infrequent is not None: cardinalities[feature_idx] -= len(infrequent) - # stores the missing indices per category - self._missing_indices = {} + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value for cat_idx, categories_for_idx in enumerate(self.categories_): - for i, cat in enumerate(categories_for_idx): + for cat in categories_for_idx: if is_scalar_nan(cat): - self._missing_indices[cat_idx] = i - - # missing values are not considered part of the cardinality - # when considering unknown categories or encoded_missing_value cardinalities[cat_idx] -= 1 continue diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index d621c6c410153..3008710d3c3dc 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -31,7 +31,7 @@ ] -class LabelEncoder(TransformerMixin, BaseEstimator): +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Encode target labels with value between 0 and n_classes-1. This transformer should be used to encode target values, *i.e.* `y`, and @@ -56,8 +56,8 @@ class LabelEncoder(TransformerMixin, BaseEstimator): -------- `LabelEncoder` can be used to normalize labels. - >>> from sklearn import preprocessing - >>> le = preprocessing.LabelEncoder() + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder() >>> le.classes_ @@ -70,7 +70,7 @@ class LabelEncoder(TransformerMixin, BaseEstimator): It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. - >>> le = preprocessing.LabelEncoder() + >>> le = LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) @@ -165,7 +165,7 @@ def _more_tags(self): return {"X_types": ["1dlabels"]} -class LabelBinarizer(TransformerMixin, BaseEstimator): +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Binarize labels in a one-vs-all fashion. Several regression and binary classification algorithms are @@ -221,8 +221,8 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): Examples -------- - >>> from sklearn import preprocessing - >>> lb = preprocessing.LabelBinarizer() + >>> from sklearn.preprocessing import LabelBinarizer + >>> lb = LabelBinarizer() >>> lb.fit([1, 2, 6, 4, 2]) LabelBinarizer() >>> lb.classes_ @@ -233,7 +233,7 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): Binary targets transform to a column vector - >>> lb = preprocessing.LabelBinarizer() + >>> lb = LabelBinarizer() >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) array([[1], [0], @@ -685,7 +685,7 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): raise ValueError("{0} format is not supported".format(output_type)) -class MultiLabelBinarizer(TransformerMixin, BaseEstimator): +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): """Transform between iterable of iterables and a multilabel format. 
Although a list of sets or tuples is a very intuitive format for multilabel diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index ea2f3b202bac4..81afeb6a8bd43 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -27,8 +27,8 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide ` for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. .. versionadded:: 1.3 @@ -68,7 +68,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): If `"auto"`, then `smooth` is set to an empirical Bayes estimate. cv : int, default=5 - Determines the number of folds in the cross fitting strategy used in + Determines the number of folds in the :term:`cross fitting` strategy used in :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used and for continuous targets, `KFold` is used. @@ -204,8 +204,8 @@ def fit_transform(self, X, y): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide `. for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. Parameters ---------- @@ -260,8 +260,8 @@ def transform(self, X): .. note:: `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a - cross fitting scheme is used in `fit_transform` for encoding. See the - :ref:`User Guide `. for details. + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. Parameters ---------- @@ -273,14 +273,14 @@ def transform(self, X): X_trans : ndarray of shape (n_samples, n_features) Transformed input. """ - X_ordinal, X_valid = self._transform( + X_ordinal, X_known_mask = self._transform( X, handle_unknown="ignore", force_all_finite="allow-nan" ) X_out = np.empty_like(X_ordinal, dtype=np.float64) self._transform_X_ordinal( X_out, X_ordinal, - ~X_valid, + ~X_known_mask, slice(None), self.encodings_, self.target_mean_, @@ -299,8 +299,9 @@ def _fit_encodings_all(self, X, y): inferred_type_of_target = type_of_target(y, input_name="y") if inferred_type_of_target not in accepted_target_types: raise ValueError( - f"Target type was inferred to be {inferred_type_of_target!r}. Only" - f" {accepted_target_types} are supported." + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." ) self.target_type_ = inferred_type_of_target else: @@ -343,4 +344,13 @@ def _transform_X_ordinal( X_out[X_unknown_mask[:, f_idx], f_idx] = y_mean def _more_tags(self): - return {"requires_y": True} + return { + "requires_y": True, + # TargetEncoder is a special case where a transformer uses `y` but + # only accept binary classification and regression targets. For the + # purpose of common tests we use `binary_only` tag to eliminate the + # multiclass tests. TODO: remove this special case when multiclass + # support is added to TargetEncoder. 
xref: + # https://github.com/scikit-learn/scikit-learn/pull/26674 + "binary_only": True, + } diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index ca809dd513cf3..9ba041c90f5de 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1588,6 +1588,26 @@ def test_ohe_drop_first_explicit_categories(handle_unknown): assert_allclose(X_trans, X_expected) +def test_ohe_more_informative_error_message(): + """Raise informative error message when pandas output and sparse_output=True.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"]) + + ohe = OneHotEncoder(sparse_output=True) + ohe.set_output(transform="pandas") + + msg = ( + "Pandas output does not support sparse data. Set " + "sparse_output=False to output pandas DataFrames or disable pandas output" + ) + with pytest.raises(ValueError, match=msg): + ohe.fit_transform(df) + + ohe.fit(df) + with pytest.raises(ValueError, match=msg): + ohe.transform(df) + + def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index fa19171503a1d..7c4bb01535dca 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -6,7 +6,6 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer -from sklearn.utils import _safe_indexing from sklearn.utils._testing import ( _convert_container, assert_allclose_dense_sparse, @@ -196,9 +195,7 @@ def test_function_transformer_raise_error_with_mixed_dtype(X_type): data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype) def func(X): - return np.array( - [mapping[_safe_indexing(X, i)] for i in range(X.size)], dtype=object - ) + return np.array([mapping[X[i]] for i in range(X.size)], dtype=object) def inverse_func(X): return _convert_container( diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 7d413063968e4..633a386c75951 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -672,3 +672,17 @@ def test_nan_label_encoder(): y_trans = le.transform([np.nan]) assert_array_equal(y_trans, [2]) + + +@pytest.mark.parametrize( + "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()] +) +def test_label_encoders_do_not_have_set_output(encoder): + """Check that label encoders do not define set_output and work with y as a kwarg. + + Non-regression test for #26854. 
+ """ + assert not hasattr(encoder, "set_output") + y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) + y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) + assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 2fc5e04b5df83..eb126ec77e526 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -825,7 +825,7 @@ def predict(self, X): def _check_proba(self): if not self.probability: raise AttributeError( - "predict_proba is not available when probability=False" + "predict_proba is not available when probability=False" ) if self._impl not in ("c_svc", "nu_svc"): raise AttributeError("predict_proba only implemented for SVC and NuSVC") @@ -835,7 +835,7 @@ def _check_proba(self): def predict_proba(self, X): """Compute probabilities of possible outcomes for samples in X. - The model need to have probability information computed at training + The model needs to have probability information computed at training time: fit with attribute `probability` set to True. Parameters @@ -1095,18 +1095,26 @@ def _fit_liblinear( Target vector relative to X C : float - Inverse of cross-validation parameter. Lower the C, the more + Inverse of cross-validation parameter. The lower the C, the higher the penalization. fit_intercept : bool - Whether or not to fit the intercept, that is to add a intercept - term to the decision function. + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: ``[x_1, ..., x_n, 1]``, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float - LibLinear internally penalizes the intercept and this term is subject - to regularization just like the other terms of the feature vector. - In order to avoid this, one should increase the intercept_scaling. - such that the feature vector becomes [x, intercept_scaling]. + Liblinear internally penalizes the intercept, treating it like any + other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 7a54c02201ccb..dfa48b4937147 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -49,6 +49,10 @@ class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): penalties and loss functions and should scale better to large numbers of samples. + The main differences between :class:`~sklearn.svm.LinearSVC` and + :class:`~sklearn.svm.SVC` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme. 
@@ -99,20 +103,26 @@ class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): will be ignored. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. data is expected to be already centered). + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float, default=1.0 - When self.fit_intercept is True, instance vector x becomes - ``[x, self.intercept_scaling]``, - i.e. a "synthetic" feature with constant value equals to - intercept_scaling is appended to the instance vector. - The intercept becomes intercept_scaling * synthetic feature weight - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. + When `fit_intercept` is True, the instance vector x becomes ``[x_1, + ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a + constant value equal to `intercept_scaling` is appended to the instance + vector. The intercept becomes intercept_scaling * synthetic feature + weight. Note that liblinear internally penalizes the intercept, + treating it like any other term in the feature vector. To reduce the + impact of the regularization on the intercept, the `intercept_scaling` + parameter can be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. class_weight : dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for @@ -362,6 +372,10 @@ class LinearSVR(RegressorMixin, LinearModel): penalties and loss functions and should scale better to large numbers of samples. + The main differences between :class:`~sklearn.svm.LinearSVR` and + :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + This class supports both dense and sparse input. Read more in the :ref:`User Guide `. @@ -389,20 +403,26 @@ class LinearSVR(RegressorMixin, LinearModel): loss ('squared_epsilon_insensitive') is the L2 loss. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (i.e. data is expected to be already centered). + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). intercept_scaling : float, default=1.0 - When self.fit_intercept is True, instance vector x becomes - [x, self.intercept_scaling], - i.e. a "synthetic" feature with constant value equals to - intercept_scaling is appended to the instance vector. 
- The intercept becomes intercept_scaling * synthetic feature weight - Note! the synthetic feature weight is subject to l1/l2 regularization - as all other features. - To lessen the effect of regularization on synthetic feature weight - (and therefore on the intercept) intercept_scaling has to be increased. + When `fit_intercept` is True, the instance vector x becomes `[x_1, ..., + x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant + value equal to `intercept_scaling` is appended to the instance vector. + The intercept becomes intercept_scaling * synthetic feature weight. + Note that liblinear internally penalizes the intercept, treating it + like any other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. dual : "auto" or bool, default=True Select the algorithm to either solve the dual or primal @@ -462,8 +482,8 @@ class LinearSVR(RegressorMixin, LinearModel): same library as this class (liblinear). SVR : Implementation of Support Vector Machine regression using libsvm: - the kernel can be non-linear but its SMO algorithm does not - scale to large number of samples as LinearSVC does. + the kernel can be non-linear but its SMO algorithm does not scale to + large number of samples as :class:`~sklearn.svm.LinearSVR` does. sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost function as LinearSVR @@ -774,7 +794,7 @@ class SVC(BaseSVC): Indices of support vectors. support_vectors_ : ndarray of shape (n_SV, n_features) - Support vectors. + Support vectors. An empty array if kernel is precomputed. n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py new file mode 100644 index 0000000000000..59166e6687369 --- /dev/null +++ b/sklearn/tests/metadata_routing_common.py @@ -0,0 +1,407 @@ +from functools import partial + +import numpy as np + +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.metrics._scorer import _PredictScorer, mean_squared_error +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin +from sklearn.utils._metadata_requests import ( + SIMPLE_METHODS, +) +from sklearn.utils.metadata_routing import ( + MetadataRouter, + process_routing, +) + + +def record_metadata(obj, method, record_default=True, **kwargs): + """Utility function to store passed metadata to a method. + + If record_default is False, kwargs whose values are "default" are skipped. + This is so that checks on keyword arguments whose default was not changed + are skipped. 
+ + """ + if not hasattr(obj, "_records"): + obj._records = {} + if not record_default: + kwargs = { + key: val + for key, val in kwargs.items() + if not isinstance(val, str) or (val != "default") + } + obj._records[method] = kwargs + + +def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values. + + """ + records = getattr(obj, "_records", dict()).get(method, dict()) + assert set(kwargs.keys()) == set(records.keys()) + for key, value in kwargs.items(): + recorded_value = records[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() + else: + assert recorded_value is value + + +record_metadata_not_default = partial(record_metadata, record_default=False) + + +def assert_request_is_empty(metadata_request, exclude=None): + """Check if a metadata request dict is empty. + + One can exclude a method or a list of methods from the check using the + ``exclude`` parameter. + """ + if isinstance(metadata_request, MetadataRouter): + for _, route_mapping in metadata_request: + assert_request_is_empty(route_mapping.router) + return + + exclude = [] if exclude is None else exclude + for method in SIMPLE_METHODS: + if method in exclude: + continue + mmr = getattr(metadata_request, method) + props = [ + prop + for prop, alias in mmr.requests.items() + if isinstance(alias, str) or alias is not None + ] + assert not len(props) + + +def assert_request_equal(request, dictionary): + for method, requests in dictionary.items(): + mmr = getattr(request, method) + assert mmr.requests == requests + + empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] + for method in empty_methods: + assert not len(getattr(request, method).requests) + + +class _Registry(list): + # This list is used to get a reference to the sub-estimators, which are not + # necessarily stored on the metaestimator. We need to override __deepcopy__ + # because the sub-estimators are probably cloned, which would result in a + # new copy of the list, but we need copy and deep copy both to return the + # same instance. + def __deepcopy__(self, memo): + return self + + def __copy__(self): + return self + + +class ConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. 
+ + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "partial_fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def predict(self, X, sample_weight="default", metadata="default"): + pass # pragma: no cover + + # when needed, uncomment the implementation + # if self.registry is not None: + # self.registry.append(self) + + # record_metadata_not_default( + # self, "predict", sample_weight=sample_weight, metadata=metadata + # ) + # return np.zeros(shape=(len(X),)) + + +class NonConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y): + if self.registry is not None: + self.registry.append(self) + + self.classes_ = [0, 1] + return self + + def predict(self, X): + return np.ones(len(X)) # pragma: no cover + + +class ConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "partial_fit", sample_weight=sample_weight, metadata=metadata + ) + self.classes_ = [0, 1] + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + self.classes_ = [0, 1] + return self + + def predict(self, X, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "predict", sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + + def predict_proba(self, X, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "predict_proba", sample_weight=sample_weight, metadata=metadata + ) + return np.asarray([[0.0, 1.0]] * len(X)) + + def predict_log_proba(self, X, sample_weight="default", metadata="default"): + pass # pragma: no cover + + # when needed, uncomment the implementation + # if self.registry is not None: + # self.registry.append(self) + + # record_metadata_not_default( + # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata + # ) + # return np.zeros(shape=(len(X), 2)) + + +class ConsumingTransformer(TransformerMixin, BaseEstimator): + """A transformer which accepts metadata on fit and transform. 
+ + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight=None, metadata=None): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, "fit", sample_weight=sample_weight, metadata=metadata + ) + return self + + def transform(self, X, sample_weight=None): + record_metadata(self, "transform", sample_weight=sample_weight) + return X + + +class ConsumingScorer(_PredictScorer): + def __init__(self, registry=None): + super().__init__(score_func=mean_squared_error, sign=1, kwargs={}) + self.registry = registry + + def _score(self, method_caller, clf, X, y, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "score", **kwargs) + + sample_weight = kwargs.get("sample_weight", None) + return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) + + +class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, "split", groups=groups, metadata=metadata) + + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices, train_indices + yield train_indices, test_indices + + def get_n_splits(self, X=None, y=None, groups=None): + pass # pragma: no cover + + def _iter_test_indices(self, X=None, y=None, groups=None): + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices + yield train_indices + + +class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is only a router.""" + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + + def get_metadata_routing(self): + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, method_mapping="one-to-one" + ) + return router + + +class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is also a consumer.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **fit_params): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, "fit", sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def predict(self, X, **predict_params): + params = process_routing(self, "predict", **predict_params) + return self.estimator_.predict(X, **params.estimator.predict) + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add(estimator=self.estimator, method_mapping="one-to-one") 
+ ) + return router + + +class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """A meta-estimator which also consumes sample_weight itself in ``fit``.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, "fit", sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add(estimator=self.estimator, method_mapping="fit") + ) + return router + + +class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): + """A simple meta-transformer.""" + + def __init__(self, transformer): + self.transformer = transformer + + def fit(self, X, y=None, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) + return self + + def transform(self, X, y=None, **transform_params): + params = process_routing(self, "transform", **transform_params) + return self.transformer_.transform(X, **params.transformer.transform) + + def get_metadata_routing(self): + return MetadataRouter(owner=self.__class__.__name__).add( + transformer=self.transformer, method_mapping="one-to-one" + ) diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3fc6a9c337f47..50b6f912667ba 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -13,13 +13,23 @@ from sklearn import config_context from sklearn.base import ( BaseEstimator, - ClassifierMixin, - MetaEstimatorMixin, - RegressorMixin, - TransformerMixin, clone, ) from sklearn.linear_model import LinearRegression +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + ConsumingTransformer, + MetaRegressor, + MetaTransformer, + NonConsumingClassifier, + WeightedMetaClassifier, + WeightedMetaRegressor, + _Registry, + assert_request_equal, + assert_request_is_empty, + check_recorded_metadata, +) from sklearn.utils import metadata_routing from sklearn.utils._metadata_requests import ( COMPOSITE_METHODS, @@ -56,209 +66,6 @@ def enable_slep006(): yield -def assert_request_is_empty(metadata_request, exclude=None): - """Check if a metadata request dict is empty. - - One can exclude a method or a list of methods from the check using the - ``exclude`` parameter. 
- """ - if isinstance(metadata_request, MetadataRouter): - for _, route_mapping in metadata_request: - assert_request_is_empty(route_mapping.router) - return - - exclude = [] if exclude is None else exclude - for method in SIMPLE_METHODS: - if method in exclude: - continue - mmr = getattr(metadata_request, method) - props = [ - prop - for prop, alias in mmr.requests.items() - if isinstance(alias, str) or alias is not None - ] - assert not len(props) - - -def assert_request_equal(request, dictionary): - for method, requests in dictionary.items(): - mmr = getattr(request, method) - assert mmr.requests == requests - - empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] - for method in empty_methods: - assert not len(getattr(request, method).requests) - - -def record_metadata(obj, method, record_default=True, **kwargs): - """Utility function to store passed metadata to a method. - - If record_default is False, kwargs whose values are "default" are skipped. - This is so that checks on keyword arguments whose default was not changed - are skipped. - - """ - if not hasattr(obj, "_records"): - obj._records = {} - if not record_default: - kwargs = { - key: val - for key, val in kwargs.items() - if not isinstance(val, str) or (val != "default") - } - obj._records[method] = kwargs - - -def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): - """Check whether the expected metadata is passed to the object's method. - - Parameters - ---------- - split_params : tuple, default=empty - specifies any parameters which are to be checked as being a subset - of the original values. - - """ - records = getattr(obj, "_records", dict()).get(method, dict()) - assert set(kwargs.keys()) == set(records.keys()) - for key, value in kwargs.items(): - recorded_value = records[key] - # The following condition is used to check for any specified parameters - # being a subset of the original values - if key in split_params and recorded_value is not None: - assert np.isin(recorded_value, value).all() - else: - assert recorded_value is value - - -class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): - """A meta-regressor which is only a router.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, **fit_params): - params = process_routing(self, "fit", fit_params) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - - def get_metadata_routing(self): - router = MetadataRouter(owner=self.__class__.__name__).add( - estimator=self.estimator, method_mapping="one-to-one" - ) - return router - - -class RegressorMetadata(RegressorMixin, BaseEstimator): - """A regressor consuming a metadata.""" - - def fit(self, X, y, sample_weight=None): - record_metadata(self, "fit", sample_weight=sample_weight) - return self - - def predict(self, X): - return np.zeros(shape=(len(X))) - - -class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): - """A meta-regressor which is also a consumer.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, sample_weight=None, **fit_params): - record_metadata(self, "fit", sample_weight=sample_weight) - params = process_routing(self, "fit", fit_params, sample_weight=sample_weight) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - return self - - def predict(self, X, **predict_params): - params = process_routing(self, "predict", predict_params) - return self.estimator_.predict(X, 
**params.estimator.predict) - - def get_metadata_routing(self): - router = ( - MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) - .add(estimator=self.estimator, method_mapping="one-to-one") - ) - return router - - -class ClassifierNoMetadata(ClassifierMixin, BaseEstimator): - """An estimator which accepts no metadata on any method.""" - - def fit(self, X, y): - return self - - def predict(self, X): - return np.ones(len(X)) # pragma: no cover - - -class ClassifierFitMetadata(ClassifierMixin, BaseEstimator): - """An estimator accepting two metadata in its ``fit`` method.""" - - def fit(self, X, y, sample_weight=None, brand=None): - record_metadata(self, "fit", sample_weight=sample_weight, brand=brand) - return self - - def predict(self, X): - return np.ones(len(X)) # pragma: no cover - - -class SimpleMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): - """A meta-estimator which also consumes sample_weight itself in ``fit``.""" - - def __init__(self, estimator): - self.estimator = estimator - - def fit(self, X, y, sample_weight=None, **kwargs): - record_metadata(self, "fit", sample_weight=sample_weight) - params = process_routing(self, "fit", kwargs, sample_weight=sample_weight) - self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) - return self - - def get_metadata_routing(self): - router = ( - MetadataRouter(owner=self.__class__.__name__) - .add_self_request(self) - .add(estimator=self.estimator, method_mapping="fit") - ) - return router - - -class TransformerMetadata(TransformerMixin, BaseEstimator): - """A transformer which accepts metadata on fit and transform.""" - - def fit(self, X, y=None, brand=None, sample_weight=None): - record_metadata(self, "fit", brand=brand, sample_weight=sample_weight) - return self - - def transform(self, X, sample_weight=None): - record_metadata(self, "transform", sample_weight=sample_weight) - return X - - -class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): - """A simple meta-transformer.""" - - def __init__(self, transformer): - self.transformer = transformer - - def fit(self, X, y=None, **fit_params): - params = process_routing(self, "fit", fit_params) - self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) - return self - - def transform(self, X, y=None, **transform_params): - params = process_routing(self, "transform", transform_params) - return self.transformer_.transform(X, **params.transformer.transform) - - def get_metadata_routing(self): - return MetadataRouter(owner=self.__class__.__name__).add( - transformer=self.transformer, method_mapping="one-to-one" - ) - - class SimplePipeline(BaseEstimator): """A very simple pipeline, assuming the last step is always a predictor.""" @@ -267,7 +74,7 @@ def __init__(self, steps): def fit(self, X, y, **fit_params): self.steps_ = [] - params = process_routing(self, "fit", fit_params) + params = process_routing(self, "fit", **fit_params) X_transformed = X for i, step in enumerate(self.steps[:-1]): transformer = clone(step).fit( @@ -286,7 +93,7 @@ def fit(self, X, y, **fit_params): def predict(self, X, **predict_params): check_is_fitted(self) X_transformed = X - params = process_routing(self, "predict", predict_params) + params = process_routing(self, "predict", **predict_params) for i, step in enumerate(self.steps_[:-1]): X_transformed = step.transform(X, **params.get(f"step_{i}").transform) @@ -334,10 +141,27 @@ def test_assert_request_is_empty(): assert_request_is_empty( MetadataRouter(owner="test") 
.add_self_request(WeightedMetaRegressor(estimator=None)) - .add(method_mapping="fit", estimator=RegressorMetadata()) + .add(method_mapping="fit", estimator=ConsumingRegressor()) ) +@pytest.mark.parametrize( + "estimator", + [ + ConsumingClassifier(registry=_Registry()), + ConsumingRegressor(registry=_Registry()), + ConsumingTransformer(registry=_Registry()), + NonConsumingClassifier(registry=_Registry()), + WeightedMetaClassifier(estimator=ConsumingClassifier(), registry=_Registry()), + WeightedMetaRegressor(estimator=ConsumingRegressor(), registry=_Registry()), + ], +) +def test_estimator_puts_self_in_registry(estimator): + """Check that an estimator puts itself in the registry upon fit.""" + estimator.fit(X, y) + assert estimator in estimator.registry + + @pytest.mark.parametrize( "val, res", [ @@ -383,90 +207,90 @@ class OddEstimator(BaseEstimator): assert odd_request.fit.requests == {"sample_weight": True} # check other test estimators - assert not len(get_routing_for_object(ClassifierNoMetadata()).fit.requests) - assert_request_is_empty(ClassifierNoMetadata().get_metadata_routing()) + assert not len(get_routing_for_object(NonConsumingClassifier()).fit.requests) + assert_request_is_empty(NonConsumingClassifier().get_metadata_routing()) - trs_request = get_routing_for_object(TransformerMetadata()) + trs_request = get_routing_for_object(ConsumingTransformer()) assert trs_request.fit.requests == { "sample_weight": None, - "brand": None, + "metadata": None, } assert trs_request.transform.requests == { "sample_weight": None, } assert_request_is_empty(trs_request) - est_request = get_routing_for_object(ClassifierFitMetadata()) + est_request = get_routing_for_object(ConsumingClassifier()) assert est_request.fit.requests == { "sample_weight": None, - "brand": None, + "metadata": None, } assert_request_is_empty(est_request) def test_process_routing_invalid_method(): with pytest.raises(TypeError, match="Can only route and process input"): - process_routing(ClassifierFitMetadata(), "invalid_method", {}) + process_routing(ConsumingClassifier(), "invalid_method", **{}) def test_process_routing_invalid_object(): class InvalidObject: pass - with pytest.raises(AttributeError, match="has not implemented the routing"): - process_routing(InvalidObject(), "fit", {}) + with pytest.raises(AttributeError, match="either implement the routing method"): + process_routing(InvalidObject(), "fit", **{}) def test_simple_metadata_routing(): # Tests that metadata is properly routed # The underlying estimator doesn't accept or request metadata - clf = SimpleMetaClassifier(estimator=ClassifierNoMetadata()) + clf = WeightedMetaClassifier(estimator=NonConsumingClassifier()) clf.fit(X, y) # Meta-estimator consumes sample_weight, but doesn't forward it to the underlying # estimator - clf = SimpleMetaClassifier(estimator=ClassifierNoMetadata()) + clf = WeightedMetaClassifier(estimator=NonConsumingClassifier()) clf.fit(X, y, sample_weight=my_weights) # If the estimator accepts the metadata but doesn't explicitly say it doesn't # need it, there's an error - clf = SimpleMetaClassifier(estimator=ClassifierFitMetadata()) + clf = WeightedMetaClassifier(estimator=ConsumingClassifier()) err_message = ( "[sample_weight] are passed but are not explicitly set as requested or" - " not for ClassifierFitMetadata.fit" + " not for ConsumingClassifier.fit" ) with pytest.raises(ValueError, match=re.escape(err_message)): clf.fit(X, y, sample_weight=my_weights) # Explicitly saying the estimator doesn't need it, makes the error go away, - # 
because in this case `SimpleMetaClassifier` consumes `sample_weight`. If + # because in this case `WeightedMetaClassifier` consumes `sample_weight`. If # there was no consumer of sample_weight, passing it would result in an # error. - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request(sample_weight=False) + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request(sample_weight=False) ) - # this doesn't raise since SimpleMetaClassifier itself is a consumer, + # this doesn't raise since WeightedMetaClassifier itself is a consumer, # and passing metadata to the consumer directly is fine regardless of its # metadata_request values. clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=None, brand=None) + check_recorded_metadata(clf.estimator_, "fit") # Requesting a metadata will make the meta-estimator forward it correctly - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request(sample_weight=True) + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request(sample_weight=True) ) clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights, brand=None) + check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) # And requesting it with an alias - clf = SimpleMetaClassifier( - estimator=ClassifierFitMetadata().set_fit_request( + clf = WeightedMetaClassifier( + estimator=ConsumingClassifier().set_fit_request( sample_weight="alternative_weight" ) ) clf.fit(X, y, alternative_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights, brand=None) + check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) def test_nested_routing(): @@ -474,23 +298,23 @@ def test_nested_routing(): pipeline = SimplePipeline( [ MetaTransformer( - transformer=TransformerMetadata() - .set_fit_request(brand=True, sample_weight=False) + transformer=ConsumingTransformer() + .set_fit_request(metadata=True, sample_weight=False) .set_transform_request(sample_weight=True) ), WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request( - sample_weight="inner_weights" - ) + estimator=ConsumingRegressor() + .set_fit_request(sample_weight="inner_weights", metadata=False) + .set_predict_request(sample_weight=False) ).set_fit_request(sample_weight="outer_weights"), ] ) w1, w2, w3 = [1], [2], [3] pipeline.fit( - X, y, brand=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 + X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "fit", brand=my_groups, sample_weight=None + pipeline.steps_[0].transformer_, "fit", metadata=my_groups, sample_weight=None ) check_recorded_metadata( pipeline.steps_[0].transformer_, "transform", sample_weight=w1 @@ -509,12 +333,12 @@ def test_nested_routing_conflict(): pipeline = SimplePipeline( [ MetaTransformer( - transformer=TransformerMetadata() - .set_fit_request(brand=True, sample_weight=False) + transformer=ConsumingTransformer() + .set_fit_request(metadata=True, sample_weight=False) .set_transform_request(sample_weight=True) ), WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight=True) + estimator=ConsumingRegressor().set_fit_request(sample_weight=True) ).set_fit_request(sample_weight="outer_weights"), ] ) @@ -530,13 +354,13 @@ def test_nested_routing_conflict(): ) ), ): - pipeline.fit(X, y, 
brand=my_groups, sample_weight=w1, outer_weights=w2) + pipeline.fit(X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2) def test_invalid_metadata(): # check that passing wrong metadata raises an error trs = MetaTransformer( - transformer=TransformerMetadata().set_transform_request(sample_weight=True) + transformer=ConsumingTransformer().set_transform_request(sample_weight=True) ) with pytest.raises( TypeError, @@ -546,7 +370,7 @@ def test_invalid_metadata(): # passing a metadata which is not requested by any estimator should also raise trs = MetaTransformer( - transformer=TransformerMetadata().set_transform_request(sample_weight=False) + transformer=ConsumingTransformer().set_transform_request(sample_weight=False) ) with pytest.raises( TypeError, @@ -751,14 +575,14 @@ def test_metadata_router_consumes_method(): cases = [ ( WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight=True) + estimator=ConsumingRegressor().set_fit_request(sample_weight=True) ), {"sample_weight"}, {"sample_weight"}, ), ( WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request( + estimator=ConsumingRegressor().set_fit_request( sample_weight="my_weights" ) ), @@ -784,13 +608,13 @@ class WeightedMetaRegressorWarn(WeightedMetaRegressor): def test_estimator_warnings(): - class RegressorMetadataWarn(RegressorMetadata): + class ConsumingRegressorWarn(ConsumingRegressor): __metadata_request__fit = {"sample_weight": metadata_routing.WARN} with pytest.warns( UserWarning, match="Support for .* has recently been added to this class" ): - MetaRegressor(estimator=RegressorMetadataWarn()).fit( + MetaRegressor(estimator=ConsumingRegressorWarn()).fit( X, y, sample_weight=my_weights ) @@ -811,12 +635,14 @@ class RegressorMetadataWarn(RegressorMetadata): (MethodMapping.from_str("score"), "[{'callee': 'score', 'caller': 'score'}]"), ( MetadataRouter(owner="test").add( - method_mapping="predict", estimator=RegressorMetadata() + method_mapping="predict", estimator=ConsumingRegressor() ), ( - "{'estimator': {'mapping': [{'callee': 'predict', 'caller': " - "'predict'}], 'router': {'fit': {'sample_weight': None}, " - "'score': {'sample_weight': None}}}}" + "{'estimator': {'mapping': [{'callee': 'predict', 'caller':" + " 'predict'}], 'router': {'fit': {'sample_weight': None, 'metadata':" + " None}, 'partial_fit': {'sample_weight': None, 'metadata': None}," + " 'predict': {'sample_weight': None, 'metadata': None}, 'score':" + " {'sample_weight': None}}}}" ), ), ], @@ -857,7 +683,7 @@ def test_string_representations(obj, string): "Given `obj` is neither a `MetadataRequest` nor does it implement", ), ( - ClassifierFitMetadata(), + ConsumingClassifier(), "set_fit_request", {"invalid": True}, TypeError, @@ -900,14 +726,14 @@ def test_metadatarouter_add_self_request(): assert router._self_request is not request # one can add an estimator as self - est = RegressorMetadata().set_fit_request(sample_weight="my_weights") + est = ConsumingRegressor().set_fit_request(sample_weight="my_weights") router = MetadataRouter(owner="test").add_self_request(obj=est) assert str(router._self_request) == str(est.get_metadata_routing()) assert router._self_request is not est.get_metadata_routing() # adding a consumer+router as self should only add the consumer part est = WeightedMetaRegressor( - estimator=RegressorMetadata().set_fit_request(sample_weight="nested_weights") + estimator=ConsumingRegressor().set_fit_request(sample_weight="nested_weights") ) router = 
MetadataRouter(owner="test").add_self_request(obj=est) # _get_metadata_request() returns the consumer part of the requests @@ -923,25 +749,27 @@ def test_metadata_routing_add(): # adding one with a string `method_mapping` router = MetadataRouter(owner="test").add( method_mapping="fit", - est=RegressorMetadata().set_fit_request(sample_weight="weights"), + est=ConsumingRegressor().set_fit_request(sample_weight="weights"), ) assert ( str(router) - == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], " - "'router': {'fit': {'sample_weight': 'weights'}, 'score': " - "{'sample_weight': None}}}}" + == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], 'router': {'fit':" + " {'sample_weight': 'weights', 'metadata': None}, 'partial_fit':" + " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" + " None, 'metadata': None}, 'score': {'sample_weight': None}}}}" ) # adding one with an instance of MethodMapping router = MetadataRouter(owner="test").add( method_mapping=MethodMapping().add(callee="score", caller="fit"), - est=RegressorMetadata().set_score_request(sample_weight=True), + est=ConsumingRegressor().set_score_request(sample_weight=True), ) assert ( str(router) - == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], " - "'router': {'fit': {'sample_weight': None}, 'score': " - "{'sample_weight': True}}}}" + == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], 'router':" + " {'fit': {'sample_weight': None, 'metadata': None}, 'partial_fit':" + " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" + " None, 'metadata': None}, 'score': {'sample_weight': True}}}}" ) @@ -949,13 +777,13 @@ def test_metadata_routing_get_param_names(): router = ( MetadataRouter(owner="test") .add_self_request( - WeightedMetaRegressor(estimator=RegressorMetadata()).set_fit_request( + WeightedMetaRegressor(estimator=ConsumingRegressor()).set_fit_request( sample_weight="self_weights" ) ) .add( method_mapping="fit", - trs=TransformerMetadata().set_fit_request( + trs=ConsumingTransformer().set_fit_request( sample_weight="transform_weights" ), ) @@ -963,24 +791,23 @@ def test_metadata_routing_get_param_names(): assert ( str(router) - == "{'$self_request': {'fit': {'sample_weight': 'self_weights'}, 'score': " - "{'sample_weight': None}}, 'trs': {'mapping': [{'callee': 'fit', " - "'caller': 'fit'}], 'router': {'fit': {'brand': None, " - "'sample_weight': 'transform_weights'}, 'transform': " - "{'sample_weight': None}}}}" + == "{'$self_request': {'fit': {'sample_weight': 'self_weights'}, 'score':" + " {'sample_weight': None}}, 'trs': {'mapping': [{'callee': 'fit', 'caller':" + " 'fit'}], 'router': {'fit': {'sample_weight': 'transform_weights'," + " 'metadata': None}, 'transform': {'sample_weight': None}}}}" ) assert router._get_param_names( method="fit", return_alias=True, ignore_self_request=False - ) == {"transform_weights", "brand", "self_weights"} + ) == {"transform_weights", "metadata", "self_weights"} # return_alias=False will return original names for "self" assert router._get_param_names( method="fit", return_alias=False, ignore_self_request=False - ) == {"sample_weight", "brand", "transform_weights"} + ) == {"sample_weight", "metadata", "transform_weights"} # ignoring self would remove "sample_weight" assert router._get_param_names( method="fit", return_alias=False, ignore_self_request=True - ) == {"brand", "transform_weights"} + ) == {"metadata", "transform_weights"} # return_alias is ignored when ignore_self_request=True assert 
router._get_param_names( method="fit", return_alias=True, ignore_self_request=True @@ -1138,9 +965,9 @@ def test_no_feature_flag_raises_error(): """Test that when feature flag disabled, set_{method}_requests raises.""" with config_context(enable_metadata_routing=False): with pytest.raises(RuntimeError, match="This method is only available"): - ClassifierFitMetadata().set_fit_request(sample_weight=True) + ConsumingClassifier().set_fit_request(sample_weight=True) def test_none_metadata_passed(): """Test that passing None as metadata when not requested doesn't raise""" - MetaRegressor(estimator=RegressorMetadata()).fit(X, y, sample_weight=None) + MetaRegressor(estimator=ConsumingRegressor()).fit(X, y, sample_weight=None) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 768a57c61dc52..4a548fe9f067f 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -1,28 +1,27 @@ import copy import re -from functools import partial import numpy as np import pytest from sklearn import config_context -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.calibration import CalibratedClassifierCV from sklearn.exceptions import UnsetMetadataPassedError from sklearn.linear_model import LogisticRegressionCV -from sklearn.metrics._scorer import _BaseScorer -from sklearn.model_selection import BaseCrossValidator -from sklearn.model_selection._split import GroupsConsumerMixin from sklearn.multioutput import ( ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain, ) -from sklearn.tests.test_metadata_routing import ( +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + ConsumingScorer, + ConsumingSplitter, + _Registry, assert_request_is_empty, check_recorded_metadata, - record_metadata, ) from sklearn.utils.metadata_routing import MetadataRouter @@ -43,179 +42,6 @@ def enable_slep006(): yield -record_metadata_not_default = partial(record_metadata, record_default=False) - - -class _Registry(list): - # This list is used to get a reference to the sub-estimators, which are not - # necessarily stored on the metaestimator. We need to override __deepcopy__ - # because the sub-estimators are probably cloned, which would result in a - # new copy of the list, but we need copy and deep copy both to return the - # same instance. - def __deepcopy__(self, memo): - return self - - def __copy__(self): - return self - - -class ConsumingRegressor(RegressorMixin, BaseEstimator): - """A regressor consuming metadata. - - Parameters - ---------- - registry : list, default=None - If a list, the estimator will append itself to the list in order to have - a reference to the estimator later on. Since that reference is not - required in all tests, registration can be skipped by leaving this value - as None. 
- - """ - - def __init__(self, registry=None): - self.registry = registry - - def partial_fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata - ) - return self - - def fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata - ) - return self - - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) - - -class ConsumingClassifier(ClassifierMixin, BaseEstimator): - """A classifier consuming metadata. - - Parameters - ---------- - registry : list, default=None - If a list, the estimator will append itself to the list in order to have - a reference to the estimator later on. Since that reference is not - required in all tests, registration can be skipped by leaving this value - as None. - - """ - - def __init__(self, registry=None): - self.registry = registry - - def partial_fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata - ) - self.classes_ = [0, 1] - return self - - def fit(self, X, y, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata - ) - self.classes_ = [0, 1] - return self - - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) - - def predict_proba(self, X, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata - ) - return np.asarray([[0.0, 1.0]] * len(X)) - - def predict_log_proba(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover - - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - - # record_metadata_not_default( - # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X), 2)) - - -class ConsumingScorer(_BaseScorer): - def __init__(self, registry=None): - super().__init__(score_func="test", sign=1, kwargs={}) - self.registry = registry - - def __call__( - self, estimator, X, y_true, sample_weight="default", metadata="default" - ): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default( - self, "score", sample_weight=sample_weight, metadata=metadata - ) - - return 0.0 - - -class ConsumingSplitter(BaseCrossValidator, GroupsConsumerMixin): - def __init__(self, registry=None): - self.registry = registry - - 
def split(self, X, y=None, groups="default"): - if self.registry is not None: - self.registry.append(self) - - record_metadata_not_default(self, "split", groups=groups) - - split_index = len(X) - 10 - train_indices = range(0, split_index) - test_indices = range(split_index, len(X)) - yield test_indices, train_indices - - def get_n_splits(self, X=None, y=None, groups=None): - pass # pragma: no cover - - def _iter_test_indices(self, X=None, y=None, groups=None): - pass # pragma: no cover - - METAESTIMATORS: list = [ { "metaestimator": MultiOutputRegressor, @@ -279,7 +105,7 @@ def _iter_test_indices(self, X=None, y=None, groups=None): # ids used for pytest fixture METAESTIMATOR_IDS = [str(row["metaestimator"].__name__) for row in METAESTIMATORS] -CV_SCORERS = [ +CV_SCORERS: list = [ { "cv_estimator": LogisticRegressionCV, "scorer_name": "scoring", @@ -287,7 +113,7 @@ def _iter_test_indices(self, X=None, y=None, groups=None): }, ] -CV_SPLITTERS = [ +CV_SPLITTERS: list = [ { "cv_estimator": LogisticRegressionCV, "splitter_name": "cv", @@ -295,6 +121,10 @@ def _iter_test_indices(self, X=None, y=None, groups=None): } ] +# IDs used by pytest to get meaningful verbose messages when running the tests +CV_SCORER_IDS = [x["cv_estimator"].__name__ for x in CV_SCORERS] +CV_SPLITTER_IDS = [x["cv_estimator"].__name__ for x in CV_SPLITTERS] + def test_registry_copy(): # test that _Registry is not copied into a new instance. @@ -390,7 +220,7 @@ def set_request(estimator, method_name): check_recorded_metadata(estimator, method_name, **kwargs) -@pytest.mark.parametrize("cv_scorer", CV_SCORERS) +@pytest.mark.parametrize("cv_scorer", CV_SCORERS, ids=CV_SCORER_IDS) def test_metadata_is_routed_correctly_to_scorer(cv_scorer): """Test that any requested metadata is correctly routed to the underlying scorers in CV estimators. @@ -406,6 +236,8 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): instance = cls(**{scorer_name: scorer}) method = getattr(instance, method_name) kwargs = {"sample_weight": sample_weight} + if "fit" not in method_name: # instance needs to be fitted first + instance.fit(X, y) method(X, y, **kwargs) for _scorer in registry: check_recorded_metadata( @@ -416,7 +248,7 @@ def test_metadata_is_routed_correctly_to_scorer(cv_scorer): ) -@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS) +@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS, ids=CV_SPLITTER_IDS) def test_metadata_is_routed_correctly_to_splitter(cv_splitter): """Test that any requested metadata is correctly routed to the underlying splitters in CV estimators. 
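The consuming test helpers moved above are easiest to review with a small usage sketch (illustrative only; not part of the patch). It assumes the new `sklearn/tests/metadata_routing_common.py` module introduced in this diff is importable, and the toy arrays and metadata value below are made up.

    import numpy as np
    from sklearn.tests.metadata_routing_common import (
        ConsumingRegressor,
        _Registry,
        check_recorded_metadata,
    )

    X = np.array([[1.0], [2.0], [3.0]])
    y = np.array([1.0, 2.0, 3.0])
    weights = np.array([1.0, 1.0, 2.0])
    metadata = "some-metadata"

    registry = _Registry()
    est = ConsumingRegressor(registry=registry)
    # The consumer records whatever metadata reaches fit on est._records and
    # appends itself to the registry so tests can retrieve sub-estimators later.
    est.fit(X, y, sample_weight=weights, metadata=metadata)
    check_recorded_metadata(est, "fit", sample_weight=weights, metadata=metadata)
    assert est in registry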
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c4e565e13aae1..793e5793aec3f 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -17,7 +17,11 @@ from sklearn.datasets import load_iris from sklearn.decomposition import PCA, TruncatedSVD from sklearn.dummy import DummyRegressor -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomTreesEmbedding, +) from sklearn.exceptions import NotFittedError from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectKBest, f_classif @@ -27,7 +31,7 @@ from sklearn.model_selection import train_test_split from sklearn.neighbors import LocalOutlierFactor from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.svm import SVC from sklearn.utils._metadata_requests import COMPOSITE_METHODS, METHODS from sklearn.utils._testing import ( @@ -1828,5 +1832,26 @@ def test_routing_passed_metadata_not_supported(method): getattr(pipe, method)([[1]], sample_weight=[1], prop="a") +@pytest.mark.usefixtures("enable_slep006") +def test_pipeline_with_estimator_with_len(): + """Test that pipeline works with estimators that have a `__len__` method.""" + pipe = Pipeline( + [("trs", RandomTreesEmbedding()), ("estimator", RandomForestClassifier())] + ) + pipe.fit([[1]], [1]) + pipe.predict([[1]]) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize("last_step", [None, "passthrough"]) +def test_pipeline_with_no_last_step(last_step): + """Test that the pipeline works when there is no last step. + + It should just ignore and pass through the data on transform. + """ + pipe = Pipeline([("trs", FunctionTransformer()), ("estimator", last_step)]) + assert pipe.fit([[1]], [1]).transform([[1], [2], [3]]) == [[1], [2], [3]] + + # End of routing tests # ==================== diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a9f367f0b21d3..26267a1355f6f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -24,7 +24,7 @@ import numpy as np from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -33,18 +33,19 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_sample_weight, assert_all_finite, check_is_fitted, ) + from .
import _criterion, _splitter, _tree -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import ( BestFirstTreeBuilder, DepthFirstTreeBuilder, @@ -122,6 +123,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": ["boolean"], "monotonic_cst": ["array-like", None], } @@ -141,6 +143,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): self.criterion = criterion @@ -155,6 +158,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values self.monotonic_cst = monotonic_cst def get_depth(self): @@ -189,7 +193,7 @@ def _support_missing_values(self, X): and self.monotonic_cst is None ) - def _compute_missing_values_in_feature_mask(self, X): + def _compute_missing_values_in_feature_mask(self, X, estimator_name=None): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -199,13 +203,17 @@ def _compute_missing_values_in_feature_mask(self, X): X : array-like of shape (n_samples, n_features), dtype=DOUBLE Input data. + estimator_name : str or None, default=None + Name to use when raising an error. Defaults to the class name. + Returns ------- missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. If missing values are not supported or there are no missing values, return None. """ - common_kwargs = dict(estimator_name=self.__class__.__name__, input_name="X") + estimator_name = estimator_name or self.__class__.__name__ + common_kwargs = dict(estimator_name=estimator_name, input_name="X") if not self._support_missing_values(X): assert_all_finite(X, **common_kwargs) @@ -246,9 +254,12 @@ def _fit( dtype=DTYPE, accept_sparse="csc", force_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) @@ -261,7 +272,7 @@ def _fit( "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -275,45 +286,56 @@ def _fit( # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. 
- y = np.reshape(y, (-1, 1)) + y = np.atleast_1d(y) + expanded_class_weight = None - self.n_outputs_ = y.shape[1] + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) - if is_classification: - check_classification_targets(y) - y = np.copy(y) + self.n_outputs_ = y.shape[1] - self.classes_ = [] - self.n_classes_ = [] + if is_classification: + check_classification_targets(y) + y = np.copy(y) - if self.class_weight is not None: - y_original = np.copy(y) + self.classes_ = [] + self.n_classes_ = [] - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + if self.class_weight is not None: + y_original = np.copy(y) + + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -369,16 +391,10 @@ def _fit( max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -390,10 +406,65 @@ def _fit( else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. 
+ min_samples_split : float + Minimum number of samples required to split an internal node. + max_depth : int + The maximum depth of the tree. + random_state : int + Random seed. + """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -406,7 +477,6 @@ def _fit( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter if self.monotonic_cst is None: monotonic_cst = None else: @@ -446,7 +516,7 @@ def _fit( # *positive class*, all signs must be flipped. monotonic_cst *= -1 - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -475,6 +545,7 @@ def _fit( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -485,8 +556,8 @@ def _fit( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): @@ -495,8 +566,6 @@ def _fit( self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -545,6 +614,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of the leaves that each sample falls into, with + # shape (n_samples, n_outputs, max_n_classes) proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -571,6 +643,134 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the tree to. + check_input : bool, default=True + Allow to bypass several input checking. + + Returns + ------- + leaf_nodes_samples : a list of array-like of length (n_samples,) + Each query sample is represented by the training samples that + reached its leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'nearest'.
Can be any keyword + argument accepted by :func:`~np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -845,6 +1045,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. 
- 1: monotonic increase @@ -956,7 +1166,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -975,6 +1188,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -991,6 +1205,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) @@ -1023,7 +1238,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, @@ -1242,6 +1456,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1338,7 +1562,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } @@ -1356,6 +1580,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1370,6 +1595,7 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @@ -1585,6 +1811,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. 
- 1: monotonic increase @@ -1704,6 +1940,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1719,6 +1956,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @@ -1850,6 +2088,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1949,6 +2197,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1963,5 +2212,6 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index b765d324bebb9..690f4d0c54c64 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,33 +4,33 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _criterion.pyx for implementation details. cimport numpy as cnp -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer +from libcpp.vector cimport vector -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. 
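Illustrative note (not part of the patch): the estimator-level changes above add a small public API around stored leaf samples. Below is a minimal usage sketch, assuming this fork is installed in place of scikit-learn; the toy data, tree depth, and quantiles are arbitrary and only meant to show the calling convention documented in the docstrings above.

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = X[:, 0] + 0.1 * rng.normal(size=200)

    # store_leaf_values=True makes the fitted tree keep the training targets
    # that fall into each leaf; both methods below rely on it.
    reg = DecisionTreeRegressor(max_depth=3, store_leaf_values=True, random_state=0)
    reg.fit(X, y)

    # one array of shape (n_leaf_node_samples, n_outputs) per query point
    leaf_samples = reg.get_leaf_node_samples(X[:5])
    print([s.shape for s in leaf_samples])

    # per-sample quantiles computed over the training targets in each leaf
    q = reg.predict_quantiles(X[:5], quantiles=[0.1, 0.5, 0.9])
    print(q.shape)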
+from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer + + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y cdef SIZE_t start # samples[start:pos] are the samples in the left node cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end - cdef SIZE_t n_missing # Number of missing values for the feature being evaluated - cdef bint missing_go_to_left # Whether missing values go to the left node cdef SIZE_t n_outputs # Number of outputs cdef SIZE_t n_samples # Number of samples @@ -41,21 +41,11 @@ cdef class Criterion: cdef double weighted_n_right # Weighted number of samples in the right node cdef double weighted_n_missing # Weighted number of samples that are missing + # Core methods that criterion class _must_ implement. # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil - cdef void init_sum_missing(self) - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -69,13 +59,6 @@ cdef class Criterion: self, double* dest ) noexcept nogil - cdef void clip_node_value( - self, - double* dest, - double lower_bound, - double upper_bound - ) noexcept nogil - cdef double middle_value(self) noexcept nogil cdef double impurity_improvement( self, double impurity_parent, @@ -83,6 +66,35 @@ cdef class Criterion: double impurity_right ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef SIZE_t n_missing # Number of missing values for the feature being evaluated + cdef bint missing_go_to_left # Whether missing values go to the left node + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef void init_sum_missing(self) + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -97,6 +109,13 @@ cdef class Criterion: double sum_left, double sum_right, ) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil + cdef double middle_value(self) noexcept nogil cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ed8a12065554e..f47feb9c9f59d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx 
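Illustrative note (not part of the patch): the new `node_samples` hook declared in the .pxd above copies the target rows of the samples sitting in the current node into a C++ vector of vectors, which the tree builders later stash per leaf when `store_leaf_values` is enabled. A rough pure-Python analogue of what the default implementation further down in _criterion.pyx does; the function name and data here are only for illustration.

    import numpy as np

    def node_samples(y, sample_indices, start, end):
        # mirrors dest[i].push_back(y[j, k]) for every output k of every
        # sample j that falls in the current node's [start, end) window
        return [list(y[j]) for j in sample_indices[start:end]]

    y = np.array([[1.0], [2.0], [3.0], [4.0]])
    sample_indices = np.array([2, 0, 3, 1])
    print(node_samples(y, sample_indices, start=1, end=3))  # [[1.0], [4.0]]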
@@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -9,30 +12,47 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause -from libc.string cimport memcpy -from libc.string cimport memset -from libc.math cimport fabs, INFINITY +from libc.math cimport INFINITY, fabs +from libc.string cimport memcpy, memset import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.special.cython_special cimport xlogy -from ._utils cimport log -from ._utils cimport WeightedMedianCalculator +from ._utils cimport WeightedMedianCalculator, log + # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. +cdef class BaseCriterion: + """This is an abstract interface for criterion. + + For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -40,53 +60,6 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - - Parameters - ---------- - n_missing: SIZE_t - Number of missing values for specific feature. - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. 
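Illustrative note (not part of the patch): the refactor splits the old single `init` call into `init`, done once per tree build with the full data, and `set_sample_pointers`, done once per node to move the [start, end) window and refresh node statistics. Below is a schematic pure-Python analogue of that calling convention; it is not the Cython class, and the toy statistics are deliberately simplified.

    import numpy as np

    class ToyRegressionCriterion:
        def init(self, y, sample_weight, weighted_n_samples, sample_indices):
            # once per tree build: keep references and global totals only
            self.y = y
            self.sample_weight = sample_weight
            self.weighted_n_samples = weighted_n_samples
            self.sample_indices = sample_indices

        def set_sample_pointers(self, start, end):
            # once per node: move the window and recompute node statistics
            self.start, self.end = start, end
            idx = self.sample_indices[start:end]
            if self.sample_weight is not None:
                w = self.sample_weight[idx]
            else:
                w = np.ones(len(idx))
            self.weighted_n_node_samples = w.sum()
            self.sum_total = (w[:, None] * self.y[idx]).sum(axis=0)

    y = np.array([[1.0], [2.0], [3.0], [4.0]])
    crit = ToyRegressionCriterion()
    crit.init(y, None, 4.0, np.arange(4))
    crit.set_sample_pointers(0, 2)  # node covering the first two samples
    print(crit.weighted_n_node_samples, crit.sum_total)  # 2.0 [3.]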
@@ -157,16 +130,6 @@ cdef class Criterion: """ pass - cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: - pass - - cdef double middle_value(self) noexcept nogil: - """Compute the middle value of a split for monotonicity constraints - - This method is implemented in ClassificationCriterion and RegressionCriterion. - """ - pass - cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. @@ -221,6 +184,90 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: + """Initialize sum_missing if there are missing values. + + This method assumes that caller placed the missing samples in + self.sample_indices[-n_missing:] + + Parameters + ---------- + n_missing: SIZE_t + Number of missing values for specific feature. + """ + pass + + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + pass + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. 
+ """ + pass + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -254,6 +301,33 @@ cdef class Criterion: cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil: + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. + """ + cdef SIZE_t i, j, k + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i].push_back(self.y[j, k]) + + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, @@ -352,15 +426,10 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -375,18 +444,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -399,12 +474,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -415,7 +490,6 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -695,13 +769,10 @@ cdef class Gini(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. 
The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ @@ -819,7 +890,6 @@ cdef class RegressionCriterion(Criterion): evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ @@ -831,7 +901,6 @@ cdef class RegressionCriterion(Criterion): ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -862,23 +931,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -887,14 +962,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -906,7 +981,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -1074,7 +1148,6 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. - MSE = var_left + var_right """ @@ -1222,26 +1295,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. 
cdef void** left_child = self.left_child_ptr @@ -1252,10 +1329,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -1270,7 +1347,6 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" @@ -1561,6 +1637,7 @@ cdef class Poisson(RegressionCriterion): Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 02f8ea81404c7..b43ce1712709d 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,9 +17,15 @@ import numpy as np -from ..base import is_classifier -from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params -from ..utils.validation import check_array, check_is_fitted +from sklearn.base import is_classifier +from sklearn.utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from sklearn.utils.validation import check_array, check_is_fitted + from . import DecisionTreeClassifier, DecisionTreeRegressor, _criterion, _tree from ._reingold_tilford import Tree, buchheim @@ -79,7 +85,7 @@ def __repr__(self): "decision_tree": [DecisionTreeClassifier, DecisionTreeRegressor], "max_depth": [Interval(Integral, 0, None, closed="left"), None], "feature_names": [list, None], - "class_names": [list, None], + "class_names": ["array-like", "boolean", None], "label": [StrOptions({"all", "root", "none"})], "filled": ["boolean"], "impurity": ["boolean"], @@ -134,7 +140,7 @@ def plot_tree( Names of each of the features. If None, generic names will be used ("x[0]", "x[1]", ...). - class_names : list of str or bool, default=None + class_names : array-like of str or True, default=None Names of each of the target classes in ascending numerical order. Only relevant for classification and not supported for multi-output. If ``True``, shows a symbolic representation of the class name. diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 2547e14b324df..7da118347414a 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,19 +4,23 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. 
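Illustrative note (not part of the patch): with the `class_names` constraint in _export.py relaxed from a plain list to any array-like of str (or a boolean), `plot_tree` also accepts, for example, a NumPy array of names. A small sketch, assuming matplotlib is installed; the iris data is only used as an example.

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier, plot_tree

    iris = load_iris()
    clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(iris.data, iris.target)

    # class_names may be any array-like of str (or True for symbolic names)
    plot_tree(clf, feature_names=iris.feature_names, class_names=np.asarray(iris.target_names))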
cimport numpy as cnp -from ._criterion cimport Criterion +from libcpp.vector cimport vector + +from ._criterion cimport BaseCriterion, Criterion +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef struct SplitRecord: # Data to track sample split @@ -33,14 +37,15 @@ cdef struct SplitRecord: unsigned char missing_go_to_left # Controls if missing values go to the left node. SIZE_t n_missing # Number of missing values for the feature being split on -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -59,14 +64,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y - # Monotonicity constraints for each feature. - # The encoding is as follows: - # -1: monotonic decrease - # 0: no constraint - # +1: monotonic increase - cdef const cnp.int8_t[:] monotonic_cst - cdef bint with_monotonic_cst cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -86,21 +83,12 @@ cdef class Splitter: # This allows optimization with depth-based tree building. # Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node @@ -109,9 +97,47 @@ cdef class Splitter: double lower_bound, double upper_bound, ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil + cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil - cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y - cdef double node_impurity(self) noexcept nogil + # Monotonicity constraints for each feature. 
+ # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const cnp.int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + const unsigned char[::1] missing_values_in_feature_mask, + ) except -1 + + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil + + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5c30ba315a90a..f2d0a4dfde0f2 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -8,26 +11,26 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause +from cython cimport final +from libc.math cimport isnan +from libc.stdlib cimport qsort +from libc.string cimport memcpy cimport numpy as cnp from ._criterion cimport Criterion -from libc.stdlib cimport qsort -from libc.string cimport memcpy -from libc.math cimport isnan -from cython cimport final - import numpy as np from scipy.sparse import issparse -from ._utils cimport log -from ._utils cimport rand_int -from ._utils cimport rand_uniform -from ._utils cimport RAND_R_MAX +from ._utils cimport RAND_R_MAX, log, rand_int, rand_uniform + cdef double INFINITY = np.inf @@ -48,13 +51,96 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.missing_go_to_left = False self.n_missing = 0 -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
+ + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + + Parameters + ---------- + impurity : double + The impurity of the current node. + split : SplitRecord pointer + A pointer to a memory-allocated SplitRecord object which will be filled with the + split chosen. + n_constant_features : SIZE_t pointer + A pointer to a memory-allocated SIZE_t object which will be filled with the + number of constant features. Optional to use. + lower_bound : double + The lower bound of the monotonic constraint if used. + upper_bound : double + The upper bound of the monotonic constraint if used. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. + + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__( self, Criterion criterion, @@ -63,6 +149,7 @@ cdef class Splitter: double min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + *argv ): """ Parameters @@ -90,7 +177,6 @@ cdef class Splitter: Monotonicity constraints """ - self.criterion = criterion self.n_samples = 0 @@ -103,12 +189,6 @@ cdef class Splitter: self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -149,7 +229,6 @@ cdef class Splitter: has_missing : bool At least one missing values is in X. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -187,8 +266,21 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -211,37 +303,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split( - self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, - ) except -1 nogil: - - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. 
- """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -252,11 +318,62 @@ cdef class Splitter: self.criterion.clip_node_value(dest, lower_bound, upper_bound) + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef SIZE_t end_non_missing = self.end - n_missing + cdef SIZE_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - self.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - self.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -275,6 +392,7 @@ cdef inline void shift_missing_values_to_left_if_required( samples[i], samples[current_end] = samples[current_end], samples[i] best.pos += best.n_missing + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. The alternative would have been to use inheritance-based polymorphism @@ -412,7 +530,6 @@ cdef inline int node_split_best( if has_missing: criterion.init_missing(n_missing) # Evaluate all splits - # If there are missing values, then we search twice for the most optimal split. # The first search will have all the missing values going to the right node. # The second search will have all the missing values going to the left node. 
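Illustrative note (not part of the patch): `check_presplit_conditions` and `check_postsplit_conditions` above factor the min_samples_leaf and min_weight_leaf rejections out of the split-search loops, so a third-party splitter can layer extra stopping rules on top without touching node_split_best or node_split_random. A schematic pure-Python sketch of that hook pattern; the classes and the lopsidedness rule are hypothetical, not part of this patch.

    class ToySplitter:
        # default rejection rules, mirroring the Cython hooks (return True = reject)
        def __init__(self, min_samples_leaf=1, min_weight_leaf=0.0):
            self.min_samples_leaf = min_samples_leaf
            self.min_weight_leaf = min_weight_leaf

        def check_presplit_conditions(self, n_left, n_right):
            # cheap test on the candidate split, before updating the criterion
            return n_left < self.min_samples_leaf or n_right < self.min_samples_leaf

        def check_postsplit_conditions(self, weighted_n_left, weighted_n_right):
            # test on criterion statistics, after the split has been evaluated
            return (weighted_n_left < self.min_weight_leaf
                    or weighted_n_right < self.min_weight_leaf)

    class MinBalanceSplitter(ToySplitter):
        # hypothetical subclass adding a custom stopping rule on top
        def check_presplit_conditions(self, n_left, n_right):
            if super().check_presplit_conditions(n_left, n_right):
                return True
            return min(n_left, n_right) / max(n_left, n_right) < 0.1  # too lopsided

    splitter = MinBalanceSplitter(min_samples_leaf=5)
    print(splitter.check_presplit_conditions(4, 200))   # True: left leaf too small
    print(splitter.check_presplit_conditions(10, 200))  # True: split too lopsided
    print(splitter.check_presplit_conditions(50, 200))  # False: split accepted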
@@ -433,18 +550,30 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing + current_split.pos = p + + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue - current_split.pos = p criterion.update(current_split.pos) # Reject if monotonicity constraints are not satisfied @@ -460,8 +589,7 @@ cdef inline int node_split_best( continue # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -691,8 +819,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split @@ -788,8 +914,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split, 0, 0) == 1: continue # Evaluate split @@ -799,8 +924,19 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: + continue + + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): continue # Reject if monotonicity constraints are not satisfied @@ -1501,12 +1637,12 @@ cdef class BestSplitter(Splitter): ) cdef int node_split( - self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound ) except -1 nogil: return node_split_best( self, diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index b99f44c0472a2..dedd820c41e0f 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -11,7 +11,10 @@ # See _tree.pyx for details. 
import numpy as np + cimport numpy as cnp +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -19,8 +22,8 @@ ctypedef cnp.npy_intp SIZE_t # Type for indices and counters ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer -from ._splitter cimport Splitter -from ._splitter cimport SplitRecord +from ._splitter cimport SplitRecord, Splitter + cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -35,40 +38,33 @@ cdef struct Node: unsigned char missing_go_to_left # Whether features have missing values -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -80,6 +76,58 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. 
The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -101,6 +149,8 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef unsigned char store_leaf_values # Whether to store leaf values + cpdef build( self, Tree tree, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 24c698e4fc2b2..492b5219fa18e 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -13,21 +16,21 @@ # License: BSD 3 clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - -from libc.stdlib cimport free -from libc.string cimport memcpy -from libc.string cimport memset -from libc.stdint cimport INTPTR_MAX +from cython.operator cimport dereference as deref from libc.math cimport isnan -from libcpp.vector cimport vector -from libcpp.algorithm cimport pop_heap -from libcpp.algorithm cimport push_heap +from libc.stdint cimport INTPTR_MAX +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy, memset from libcpp cimport bool +from libcpp.algorithm cimport pop_heap, push_heap +from libcpp.vector cimport vector import struct import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.sparse import issparse @@ -36,6 +39,7 @@ from scipy.sparse import csr_matrix from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, int nd, cnp.npy_intp* dims, @@ -153,15 +157,23 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf 
self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -204,9 +216,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef double lower_bound cdef double upper_bound @@ -266,11 +280,16 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( impurity, - &split, + split_ptr, &n_constant_features, lower_bound, upper_bound ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -278,10 +297,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -351,6 +369,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "lower_bound": left_child_min, "upper_bound": left_child_max, }) + elif self.store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -360,6 +381,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -406,10 +431,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -417,6 +449,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -492,6 +525,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if self.store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -600,6 +636,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -623,11 +661,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( impurity, - &split, + split_ptr, &n_constant_features, lower_bound, upper_bound ) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -637,9 +679,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + split_ptr, impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: return -1 @@ -673,6 +714,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.impurity_left = impurity res.impurity_right = impurity + free(split_ptr) return 0 @@ -680,194 +722,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. + Downstream classes must implement """ - # Wrap for outside world. 
- # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - @property - def n_classes(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - @property - def children_left(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - @property - def children_right(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - @property - def n_leaves(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - @property - def feature(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - @property - def threshold(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - @property - def impurity(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - @property - def n_node_samples(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - @property - def weighted_n_node_samples(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - @property - def missing_go_to_left(self): - return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] - - @property - def value(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % 
self.capacity) - - memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -879,7 +742,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -910,15 +776,93 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil: + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
""" cdef SIZE_t node_id = self.node_count @@ -939,29 +883,19 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -995,9 +929,10 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: @@ -1065,7 +1000,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -1114,6 +1048,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -1125,7 +1062,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1254,13 +1193,12 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) @@ -1268,13 +1206,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] + self._compute_feature_importances( + importances, node) - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) node += 1 for i in range(self.n_features): @@ -1290,44 +1224,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. - The array keeps a reference to this Tree, which manages the underlying - memory. + Wrapped in a private function to allow subclassing that + computes feature importances. 
""" - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1436,6 +1353,282 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. 
+ """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] + + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + 
value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. 
+ """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1920,6 +2113,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1936,8 +2131,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4b953af2d9b2b..61ba8af197c2e 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -9,8 +9,10 @@ # See _utils.pyx for details. cimport numpy as cnp + +from sklearn.neighbors._quad_tree cimport Cell + from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..02dc7cf426efc 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly @@ -7,16 +10,17 @@ # # License: BSD 3 clause -from libc.stdlib cimport free -from libc.stdlib cimport realloc -from libc.math cimport log as ln from libc.math cimport isnan +from libc.math cimport log as ln +from libc.stdlib cimport free, realloc import numpy as np + cimport numpy as cnp + cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 034ee5fc39917..ccca6d60ed48b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -882,7 +882,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -901,6 +901,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -915,14 +916,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for 
attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2414,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points @@ -2626,3 +2640,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) 
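# (editor's note, not part of the patch) In the held-out checks around this
# point, every requested quantile is expected to match ``est.predict`` because
# the unconstrained tree fits pure leaves: each leaf stores identical target
# values, so the 0.1/0.5/0.9 quantiles of a leaf's stored samples all collapse
# to the single value that ``predict`` returns.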
+ assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 87173032a3bd3..f8b4d2042223c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -188,7 +188,7 @@ def _array_indexing(array, key, key_dtype, axis): key = np.asarray(key) if isinstance(key, tuple): key = list(key) - return array[key] if axis == 0 else array[:, key] + return array[key, ...] if axis == 0 else array[:, key] def _pandas_indexing(X, key, key_dtype, axis): diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index ca0d9fcaf1509..ed16ce767a0cd 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -186,6 +186,9 @@ def __init__(self, array_namespace): def __getattr__(self, name): return getattr(self._namespace, name) + def __eq__(self, other): + return self._namespace == other._namespace + def take(self, X, indices, *, axis=0): # When array_api supports `take` we can use this directly # https://github.com/data-apis/array-api/issues/177 diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 5affa4616be01..fb3912b27dbfe 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -177,7 +177,7 @@ def _unique_python(values, *, return_inverse, return_counts): except TypeError: types = sorted(t.__qualname__ for t in set(type(v) for v in values)) raise TypeError( - "Encoders require their input to be uniformly " + "Encoders require their input argument must be uniformly " f"strings or numbers. 
Got {types}" ) ret = (uniques,) diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py index e9b95666cdd32..207096823cae6 100644 --- a/sklearn/utils/_estimator_html_repr.py +++ b/sklearn/utils/_estimator_html_repr.py @@ -190,13 +190,35 @@ def _write_estimator_html( _STYLE = """ #$id { - color: black; + --sklearn-color-text: black; + --sklearn-color-line: gray; + --sklearn-color-background: white; + --sklearn-color-background-box: #f0f8ff; + --sklearn-color-border-box: black; + --sklearn-color-icon: #696969; + --sklearn-color-active: #d4ebff; + --sklearn-color-highlight: #d4ebff; + + @media (prefers-color-scheme: dark) { + --sklearn-color-text: white; + --sklearn-color-line: gray; + --sklearn-color-background: #111; + --sklearn-color-background-box: #424242; + --sklearn-color-border-box: white; + --sklearn-color-icon: #878787; + --sklearn-color-active: #616161; + --sklearn-color-highlight: #616161; + } +} + +#$id { + color: var(--sklearn-color-text); } #$id pre{ padding: 0; } #$id div.sk-toggleable { - background-color: white; + background-color: var(--sklearn-color-background); } #$id label.sk-toggleable__label { cursor: pointer; @@ -211,26 +233,26 @@ def _write_estimator_html( content: "▸"; float: left; margin-right: 0.25em; - color: #696969; + color: var(--sklearn-color-icon); } #$id label.sk-toggleable__label-arrow:hover:before { - color: black; + color: var(--sklearn-color-text); } #$id div.sk-estimator:hover label.sk-toggleable__label-arrow:before { - color: black; + color: var(--sklearn-color-text); } #$id div.sk-toggleable__content { max-height: 0; max-width: 0; overflow: hidden; text-align: left; - background-color: #f0f8ff; + background-color: var(--sklearn-color-background-box); } #$id div.sk-toggleable__content pre { margin: 0.2em; - color: black; + color: var(--sklearn-color-text); border-radius: 0.25em; - background-color: #f0f8ff; + background-color: var(--sklearn-color-background-box); } #$id input.sk-toggleable__control:checked~div.sk-toggleable__content { max-height: 200px; @@ -241,10 +263,10 @@ def _write_estimator_html( content: "▾"; } #$id div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-active); } #$id div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-active); } #$id input.sk-hidden--visually { border: 0; @@ -259,28 +281,28 @@ def _write_estimator_html( } #$id div.sk-estimator { font-family: monospace; - background-color: #f0f8ff; - border: 1px dotted black; + background-color: var(--sklearn-color-background-box); + border: 1px dotted var(--sklearn-color-border-box); border-radius: 0.25em; box-sizing: border-box; margin-bottom: 0.5em; } #$id div.sk-estimator:hover { - background-color: #d4ebff; + background-color: var(--sklearn-color-highlight); } #$id div.sk-parallel-item::after { content: ""; width: 100%; - border-bottom: 1px solid gray; + border-bottom: 1px solid var(--sklearn-color-line); flex-grow: 1; } #$id div.sk-label:hover label.sk-toggleable__label { - background-color: #d4ebff; + background-color: var(--sklearn-color-highlight); } #$id div.sk-serial::before { content: ""; position: absolute; - border-left: 1px solid gray; + border-left: 1px solid var(--sklearn-color-line); box-sizing: border-box; top: 0; bottom: 0; @@ -291,7 +313,7 @@ def _write_estimator_html( display: flex; flex-direction: column; align-items: center; - 
background-color: white; + background-color: var(--sklearn-color-background); padding-right: 0.2em; padding-left: 0.2em; position: relative; @@ -304,13 +326,13 @@ def _write_estimator_html( display: flex; align-items: stretch; justify-content: center; - background-color: white; + background-color: var(--sklearn-color-background); position: relative; } #$id div.sk-item::before, #$id div.sk-parallel-item::before { content: ""; position: absolute; - border-left: 1px solid gray; + border-left: 1px solid var(--sklearn-color-line); box-sizing: border-box; top: 0; bottom: 0; @@ -322,7 +344,7 @@ def _write_estimator_html( flex-direction: column; z-index: 1; position: relative; - background-color: white; + background-color: var(--sklearn-color-background); } #$id div.sk-parallel-item:first-child::after { align-self: flex-end; @@ -336,11 +358,11 @@ def _write_estimator_html( width: 0; } #$id div.sk-dashed-wrapped { - border: 1px dashed gray; + border: 1px dashed var(--sklearn-color-line); margin: 0 0.4em 0.5em 0.4em; box-sizing: border-box; padding-bottom: 0.4em; - background-color: white; + background-color: var(--sklearn-color-background); } #$id div.sk-label label { font-family: monospace; diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 17d8e37510e48..1a9c07438b17a 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -80,7 +80,7 @@ import inspect from collections import namedtuple from copy import deepcopy -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union from warnings import warn from .. import get_config @@ -89,6 +89,9 @@ # Only the following methods are supported in the routing mechanism. Adding new # methods at the moment involves monkeypatching this list. +# Note that if this list is changed or monkeypatched, the corresponding method +# needs to be added under a TYPE_CHECKING condition like the one done here in +# _MetadataRequester SIMPLE_METHODS = [ "fit", "partial_fit", @@ -1251,6 +1254,27 @@ class _MetadataRequester: .. versionadded:: 1.3 """ + if TYPE_CHECKING: # pragma: no cover + # This code is never run in runtime, but it's here for type checking. + # Type checkers fail to understand that the `set_{method}_request` + # methods are dynamically generated, and they complain that they are + # not defined. We define them here to make type checkers happy. + # During type checking analyzers assume this to be True. + # The following list of defined methods mirrors the list of methods + # in SIMPLE_METHODS. + # fmt: off + def set_fit_request(self, **kwargs): pass + def set_partial_fit_request(self, **kwargs): pass + def set_predict_request(self, **kwargs): pass + def set_predict_proba_request(self, **kwargs): pass + def set_predict_log_proba_request(self, **kwargs): pass + def set_decision_function_request(self, **kwargs): pass + def set_score_request(self, **kwargs): pass + def set_split_request(self, **kwargs): pass + def set_transform_request(self, **kwargs): pass + def set_inverse_transform_request(self, **kwargs): pass + # fmt: on + def __init_subclass__(cls, **kwargs): """Set the ``set_{method}_request`` methods. @@ -1412,7 +1436,11 @@ def get_metadata_routing(self): # given metadata. This is to minimize the boilerplate required in routers. -def process_routing(obj, method, other_params, **kwargs): +# Here the first two arguments are positional only which makes everything +# passed as keyword argument a metadata. 
The first two args also have an `_` +# prefix to reduce the chances of name collisions with the passed metadata, and +# since they're positional only, users will never type those underscores. +def process_routing(_obj, _method, /, **kwargs): """Validate and route input parameters. This function is used inside a router's method, e.g. :term:`fit`, @@ -1420,26 +1448,21 @@ def process_routing(obj, method, other_params, **kwargs): Assuming this signature: ``fit(self, X, y, sample_weight=None, **fit_params)``, a call to this function would be: - ``process_routing(self, fit_params, sample_weight=sample_weight)``. + ``process_routing(self, sample_weight=sample_weight, **fit_params)``. .. versionadded:: 1.3 Parameters ---------- - obj : object + _obj : object An object implementing ``get_metadata_routing``. Typically a meta-estimator. - method : str + _method : str The name of the router's method in which this function is called. - other_params : dict - A dictionary of extra parameters passed to the router's method, - e.g. ``**fit_params`` passed to a meta-estimator's :term:`fit`. - **kwargs : dict - Parameters explicitly accepted and included in the router's method - signature. + Metadata to be routed. Returns ------- @@ -1449,27 +1472,20 @@ def process_routing(obj, method, other_params, **kwargs): corresponding methods or corresponding child objects. The object names are those defined in `obj.get_metadata_routing()`. """ - if not hasattr(obj, "get_metadata_routing"): + if not (hasattr(_obj, "get_metadata_routing") or isinstance(_obj, MetadataRouter)): raise AttributeError( - f"This {repr(obj.__class__.__name__)} has not implemented the routing" - " method `get_metadata_routing`." + f"The given object ({repr(_obj.__class__.__name__)}) needs to either" + " implement the routing method `get_metadata_routing` or be a" + " `MetadataRouter` instance." ) - if method not in METHODS: + if _method not in METHODS: raise TypeError( f"Can only route and process input on these methods: {METHODS}, " - f"while the passed method is: {method}." + f"while the passed method is: {_method}." ) - # We take the extra params (**fit_params) which is passed as `other_params` - # and add the explicitly passed parameters (passed as **kwargs) to it. This - # is equivalent to a code such as this in a router: - # if sample_weight is not None: - # fit_params["sample_weight"] = sample_weight - all_params = other_params if other_params is not None else dict() - all_params.update(kwargs) - - request_routing = get_routing_for_object(obj) - request_routing.validate_metadata(params=all_params, method=method) - routed_params = request_routing.route_params(params=all_params, caller=method) + request_routing = get_routing_for_object(_obj) + request_routing.validate_metadata(params=kwargs, method=_method) + routed_params = request_routing.route_params(params=kwargs, caller=_method) return routed_params diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 9eee7c370e341..bb289535c45ec 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -5,6 +5,7 @@ from .._config import get_config from . 
import check_pandas_support from ._available_if import available_if +from .validation import _is_pandas_df def _wrap_in_pandas_container( @@ -125,9 +126,10 @@ def _wrap_data_with_container(method, data_to_wrap, original_input, estimator): return data_to_wrap # dense_config == "pandas" + index = original_input.index if _is_pandas_df(original_input) else None return _wrap_in_pandas_container( data_to_wrap=data_to_wrap, - index=getattr(original_input, "index", None), + index=index, columns=estimator.get_feature_names_out, ) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 8b54df9f25b72..bf558ff7e6dd4 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -38,6 +38,7 @@ assert_array_almost_equal, assert_array_equal, assert_array_less, + assert_no_warnings, ) import sklearn @@ -65,6 +66,7 @@ "assert_approx_equal", "assert_allclose", "assert_run_python_script", + "assert_no_warnings", "SkipTest", ] @@ -80,32 +82,6 @@ assert_raises_regexp = assert_raises_regex -# To remove when we support numpy 1.7 -def assert_no_warnings(func, *args, **kw): - """ - Parameters - ---------- - func - *args - **kw - """ - # very important to avoid uncontrolled state propagation - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - result = func(*args, **kw) - if hasattr(np, "FutureWarning"): - # Filter out numpy-specific warnings in numpy >= 1.9 - w = [e for e in w if e.category is not np.VisibleDeprecationWarning] - - if len(w) > 0: - raise AssertionError( - "Got warnings when calling %s: [%s]" - % (func.__name__, ", ".join(str(warning) for warning in w)) - ) - return result - - def ignore_warnings(obj=None, category=Warning): """Context manager and decorator to ignore warnings. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e77197e24a69e..53ae056b4d2f7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1350,7 +1350,10 @@ def check_dtype_object(name, estimator_orig): if "string" not in tags["X_types"]: X[0, 0] = {"foo": "bar"} - msg = "argument must be a string.* number" + # This error is raised by: + # - `np.asarray` in `check_array` + # - `_unique_python` for encoders + msg = "argument must be .* string.* number" with raises(TypeError, match=msg): estimator.fit(X, y) else: @@ -3542,7 +3545,6 @@ def _enforce_estimator_tags_y(estimator, y): # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. 
y += 1 + abs(y.min()) - # Estimators with a `binary_only` tag only accept up to two unique y values if _safe_tags(estimator, key="binary_only") and y.size > 0: y = np.where(y == y.flat[0], y, y.flat[0] + 1) # Estimators in mono_output_task_error raise ValueError if y is of 1-D @@ -3562,7 +3564,8 @@ def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): if _safe_tags(estimator, key="requires_positive_X"): X = X - X.min() if "categorical" in _safe_tags(estimator, key="X_types"): - X = (X - X.min()).astype(np.int32) + dtype = np.float64 if _safe_tags(estimator, key="allow_nan") else np.int32 + X = np.round((X - X.min())).astype(dtype) if estimator.__class__.__name__ == "SkewedChi2Sampler": # SkewedChi2Sampler requires X > -skewdness in transform @@ -4584,7 +4587,7 @@ def check_set_output_transform_pandas(name, transformer_orig): outputs_pandas = _output_from_fit_transform(transformer_pandas, name, X, df, y) except ValueError as e: # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e + assert "Pandas output does not support sparse data." in str(e), e return for case in outputs_default: @@ -4630,7 +4633,7 @@ def check_global_output_transform_pandas(name, transformer_orig): ) except ValueError as e: # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e + assert "Pandas output does not support sparse data." in str(e), e return for case in outputs_default: diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 2202a1daaf90a..d33b638358157 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -158,3 +158,11 @@ def _contents(data_module): ) else: return resources.contents(data_module) + + +# For +1.25 NumPy versions exceptions and warnings are being moved +# to a dedicated submodule. 
+if np_version >= parse_version("1.25.0"): + from numpy.exceptions import VisibleDeprecationWarning +else: + from numpy import VisibleDeprecationWarning # type: ignore # noqa diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 892d77c7e01e5..1f46f6400df98 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -14,6 +14,7 @@ from scipy.sparse import issparse from ..utils._array_api import get_namespace +from ..utils.fixes import VisibleDeprecationWarning from .validation import _assert_all_finite, check_array @@ -161,10 +162,10 @@ def is_multilabel(y): ensure_min_features=0, ) with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) try: y = check_array(y, dtype=None, **check_y_kwargs) - except (np.VisibleDeprecationWarning, ValueError) as e: + except (VisibleDeprecationWarning, ValueError) as e: if str(e).startswith("Complex data not supported"): raise @@ -324,11 +325,11 @@ def type_of_target(y, input_name=""): ) with warnings.catch_warnings(): - warnings.simplefilter("error", np.VisibleDeprecationWarning) + warnings.simplefilter("error", VisibleDeprecationWarning) if not issparse(y): try: y = check_array(y, dtype=None, **check_y_kwargs) - except (np.VisibleDeprecationWarning, ValueError) as e: + except (VisibleDeprecationWarning, ValueError) as e: if str(e).startswith("Complex data not supported"): raise diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py index e4327dcbc2c46..bbe44ac8974fa 100644 --- a/sklearn/utils/tests/test_estimator_html_repr.py +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -197,6 +197,9 @@ def test_estimator_html_repr_pipeline(): assert f"" in html_output assert f"
<pre>{html.escape(str(est))}</pre>
" in html_output + # verify that prefers-color-scheme is implemented + assert "prefers-color-scheme" in html_output + @pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) def test_stacking_classifier(final_estimator): diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index a4aaa8f21b6b7..ec48c4a012574 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -12,7 +12,7 @@ # Ignore flake8 (lots of line too long issues) -# flake8: noqa +# ruff: noqa # Constructors excerpted to test pprinting diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 403a5db63ec54..d1722a1553f9c 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -315,3 +315,32 @@ def test_set_output_named_tuple_out(): assert isinstance(X_trans, Output) assert_array_equal(X_trans.X, X) assert_array_equal(X_trans.Y, 2 * X) + + +class EstimatorWithListInput(_SetOutputMixin): + def fit(self, X, y=None): + assert isinstance(X, list) + self.n_features_in_ = len(X[0]) + return self + + def transform(self, X, y=None): + return X + + def get_feature_names_out(self, input_features=None): + return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object) + + +def test_set_output_list_input(): + """Check set_output for list input. + + Non-regression test for #27037. + """ + pd = pytest.importorskip("pandas") + + X = [[0, 1, 2, 3], [4, 5, 6, 7]] + est = EstimatorWithListInput() + est.set_output(transform="pandas") + + X_out = est.fit(X).transform(X) + assert isinstance(X_out, pd.DataFrame) + assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])