diff --git a/.github/workflows/push_doc.yml b/.github/workflows/push_doc.yml new file mode 100644 index 000000000..ff3e9d5ea --- /dev/null +++ b/.github/workflows/push_doc.yml @@ -0,0 +1,54 @@ +name: Build documentation + +on: + push: + branches: + - main + - test-ci* + pull_request: + branches: + - '**' + +permissions: + contents: write + +jobs: + push_doc: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - run: pip install .[doc] + + - name: Sphinx build + run: sphinx-build doc build + + - name: Save the PR number + env: + GITHUB_PULL_REQUEST_NUMBER: ${{github.event.number}} + run: | + echo "Storing PR number ${{github.event.number}} to 'pull_request_number' file" + echo ${{github.event.number}} > pull_request_number + + - name: Upload doc preview + # The publication of the preview itself happens in pr-doc-preview.yml + if: ${{ github.event_name == 'pull_request' }} + uses: actions/upload-artifact@v3 + with: + name: doc-preview + path: | + ./build + pull_request_number + + - name: Deploy + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + uses: peaceiris/actions-gh-pages@v3 + with: + publish_branch: gh-pages + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./build + commit_message: "[ci skip] ${{ github.event.head_commit.message }}" diff --git a/.gitignore b/.gitignore index fdeb0af25..12a0c436c 100644 --- a/.gitignore +++ b/.gitignore @@ -164,6 +164,10 @@ cython_debug/ .DS_Store *.datamander +# Sphinx documentation +doc/_build/ +doc/auto_examples/ + # Visual studio code .vscode diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 000000000..d4bb2cbb9 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . 
+BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/_static/css/custom.css b/doc/_static/css/custom.css new file mode 100644 index 000000000..d75a5b900 --- /dev/null +++ b/doc/_static/css/custom.css @@ -0,0 +1,43 @@ +div.docutils.container.index-features { + width: 450px; +} + +div.docutils.container.index-box { + background-color: var(--pst-color-white-highlight); + border-radius: 10px; + color: black; + padding: 5px; + margin-left: 15px; + float: right; + width: 400px; +} + +/* Need to be able to have absolute positioning for the div inside */ +.bd-main .bd-content .bd-article-container .bd-article { + position: relative; +} + +@media (min-width: 792px) { + div.docutils.container.index-features { + position: absolute; + top: 10px; + right: 0px; + } +} + +@media (min-width: 1092px) { + div.docutils.container.index-box { + margin-left: 30px; + margin-right: -15px; + } +} + +@media (max-width: 892px) { + div.docutils.container.index-box { + max-width: 500px; + width: unset; + float: unset; + margin-left: auto; + margin-right: auto; + } +} diff --git a/doc/_static/empty_placeholder_to_avoid_sphinx_warning b/doc/_static/empty_placeholder_to_avoid_sphinx_warning new file mode 100644 index 000000000..e69de29bb diff --git a/doc/_static/seer_extract.png b/doc/_static/seer_extract.png new file mode 100644 index 000000000..131a5e153 Binary files /dev/null and b/doc/_static/seer_extract.png differ diff --git a/doc/_static/seer_home.png b/doc/_static/seer_home.png new file mode 100644 index 000000000..d770d6b95 Binary files /dev/null and b/doc/_static/seer_home.png differ diff --git 
a/doc/_templates/class.rst b/doc/_templates/class.rst new file mode 100644 index 000000000..bf72932a4 --- /dev/null +++ b/doc/_templates/class.rst @@ -0,0 +1,19 @@ +:mod:`{{module}}`.{{objname}} +{{ underline }}============== + +.. rst-class:: side_comment + + Usage examples at the bottom of this page. + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :inherited-members: + + {% block methods %} + + {% endblock %} + +.. raw:: html + +
diff --git a/doc/_templates/function.rst b/doc/_templates/function.rst new file mode 100644 index 000000000..d8c9bd480 --- /dev/null +++ b/doc/_templates/function.rst @@ -0,0 +1,10 @@ +:mod:`{{module}}`.{{objname}} +{{ underline }}==================== + +.. currentmodule:: {{ module }} + +.. autofunction:: {{ objname }} + +.. raw:: html + +
diff --git a/doc/api.rst b/doc/api.rst new file mode 100644 index 000000000..f76e8c85e --- /dev/null +++ b/doc/api.rst @@ -0,0 +1,39 @@ +API +=== + +.. currentmodule:: skore + +This page lists all the public functions and classes of the ``skore`` +package. + + +The ``skore`` UI +---------------- + +These classes are meant for ``skore``'s user interface. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: + + Project + +.. autosummary:: + :toctree: generated/ + :template: function.rst + :nosignatures: + + load + +The ``skore`` machine learning programming assistant +---------------------------------------------------- + +These functions and classes enhance ``scikit-learn``'s ones. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + :nosignatures: + + cross_validate diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 000000000..e0f115bd0 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,61 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "skore" +copyright = "2024, Probabl team" +author = "Probabl team" +release = "0.1.1" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx_gallery.gen_gallery", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "numpydoc", + "sphinx_design", + "myst_parser", # added by sylvaincom +] +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ["_templates"] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "pydata_sphinx_theme" +html_static_path = ["_static"] + +html_css_files = [ + "css/custom.css", +] +html_js_files = [] + +# sphinx_gallery options +sphinx_gallery_conf = { + "examples_dirs": "../examples", # path to example scripts + "gallery_dirs": "auto_examples", # path to gallery generated output +} + +# intersphinx configuration +intersphinx_mapping = { + "sklearn": ("https://scikit-learn.org/stable/", None), +} + +numpydoc_show_class_members = False + +html_title = "skore" + +html_theme_options = { + "announcement": ( + "https://raw.githubusercontent.com/soda-inria/hazardous/main/doc/announcement.html" + ), +} diff --git a/doc/generated/skore.Project.rst b/doc/generated/skore.Project.rst new file mode 100644 index 000000000..693ec0b43 --- /dev/null +++ b/doc/generated/skore.Project.rst @@ -0,0 +1,19 @@ +:mod:`skore`.Project +=========================== + +.. rst-class:: side_comment + + Usage examples at the bottom of this page. + +.. currentmodule:: skore + +.. autoclass:: Project + :inherited-members: + + + + + +.. raw:: html + +
diff --git a/doc/generated/skore.cross_validate.rst b/doc/generated/skore.cross_validate.rst new file mode 100644 index 000000000..3e20db320 --- /dev/null +++ b/doc/generated/skore.cross_validate.rst @@ -0,0 +1,10 @@ +:mod:`skore`.cross_validate +======================================== + +.. currentmodule:: skore + +.. autofunction:: cross_validate + +.. raw:: html + +
diff --git a/doc/generated/skore.load.rst b/doc/generated/skore.load.rst new file mode 100644 index 000000000..c7f823087 --- /dev/null +++ b/doc/generated/skore.load.rst @@ -0,0 +1,10 @@ +:mod:`skore`.load +============================== + +.. currentmodule:: skore + +.. autofunction:: load + +.. raw:: html + +
diff --git a/doc/getting_started.rst b/doc/getting_started.rst new file mode 100644 index 000000000..9f965e986 --- /dev/null +++ b/doc/getting_started.rst @@ -0,0 +1,41 @@ +.. _getting_started: + +Getting started +=============== + +``skore`` UI +------------ + +.. currentmodule:: skore + +From your shell, initialize a skore project, here named ``project.skore``, that +will be in your current working directory: + +.. code:: console + + python -m skore create "project.skore" + +Then, from your Python code (in the same directory), load the project and store +an integer for example: + +.. code-block:: python + + from skore import load + project = load("project.skore") + project.put("my_int", 3) + +Finally, from your shell (in the same directory), start the UI locally: + +.. code:: console + + python -m skore launch project.skore + +This will automatically open a browser at the UI's location: + +#. On the top left, create a new ``View``. +#. From the ``Elements`` section on the bottom left, you can add stored items to this view, either by double-clicking on them or by doing drag-and-drop. + +For more features, please look into :ref:`auto_examples`. + +.. image:: https://raw.githubusercontent.com/sylvaincom/sylvaincom.github.io/master/files/probabl/skore/2024_10_14_skore_demo.gif + :alt: Getting started with ``skore`` demo diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 000000000..96a609ebb --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,39 @@ +.. You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to ``skore`` +==================== + +With skore, data scientists can: + +#. Track and visualize their ML/DS results. +#. Get assistance when developing their ML/DS projects. + + - Scikit-learn compatible :func:`~skore.cross_validate` provides insights and checks on cross-validation. 
+ +These are only the initial features: skore is a work in progress and aims to be +an end-to-end library for data scientists. +Stay tuned! +Feedback is welcome: please feel free to join our `Discord <http://discord.probabl.ai>`_ or `create an issue <https://github.com/probabl-ai/skore/issues>`_. + +We are a product team working at `Probabl <https://probabl.ai>`_ and our motto is #OwnYourDataScience. + +- License: BSD +- GitHub repository: https://github.com/probabl-ai/skore +- Discord: http://discord.probabl.ai +- Status: under development, API is subject to change. + +.. image:: https://raw.githubusercontent.com/sylvaincom/sylvaincom.github.io/master/files/probabl/skore/2024_10_14_skore_demo.gif + :alt: Getting started with skore demo + +.. currentmodule:: skore + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + install + getting_started + auto_examples/index + user_guide + api diff --git a/doc/install.rst b/doc/install.rst new file mode 100644 index 000000000..df5a450e7 --- /dev/null +++ b/doc/install.rst @@ -0,0 +1,21 @@ +.. _install: + +Install +======= + +.. currentmodule:: skore + +First of all, we recommend using a `virtual environment (venv) <https://docs.python.org/3/library/venv.html>`_. +You need ``python>=3.9``. + +Then, you can install ``skore`` by using ``pip``: + +.. code:: console + + pip install -U skore + +You can check ``skore``'s latest version on `PyPI <https://pypi.org/project/skore/>`_. + +🚨 For Windows users, the encoding must be set to +`UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_: +see `PYTHONUTF8 <https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUTF8>`_. diff --git a/doc/sg_execution_times.rst b/doc/sg_execution_times.rst new file mode 100644 index 000000000..8356d7c17 --- /dev/null +++ b/doc/sg_execution_times.rst @@ -0,0 +1,43 @@ + +:orphan: + +.. _sphx_glr_sg_execution_times: + + +Computation times +================= +**00:03.753** total execution time for 3 files **from all galleries**: + +.. container:: + + .. raw:: html + + + + + + + + .. 
list-table:: + :header-rows: 1 + :class: table table-striped sg-datatable + + * - Example + - Time + - Mem (MB) + * - :ref:`sphx_glr_auto_examples_plot_01_getting_started.py` (``../examples/plot_01_getting_started.py``) + - 00:01.402 + - 0.0 + * - :ref:`sphx_glr_auto_examples_plot_03_cross_validate.py` (``../examples/plot_03_cross_validate.py``) + - 00:01.324 + - 0.0 + * - :ref:`sphx_glr_auto_examples_plot_02_basic_usage.py` (``../examples/plot_02_basic_usage.py``) + - 00:01.027 + - 0.0 diff --git a/doc/user_guide.rst b/doc/user_guide.rst new file mode 100644 index 000000000..5b7c16c0b --- /dev/null +++ b/doc/user_guide.rst @@ -0,0 +1,20 @@ +.. _user_guide: + +User guide +========== + +.. currentmodule:: skore + +*The User Guide is currently work in progress and will be released soon, thank +you for your patience!* + +For now, please look into: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + install + getting_started + auto_examples/index + api diff --git a/doc/user_guide/getting_started.rst b/doc/user_guide/getting_started.rst new file mode 100644 index 000000000..90d824ef1 --- /dev/null +++ b/doc/user_guide/getting_started.rst @@ -0,0 +1,107 @@ +.. _getting_started: + +Getting started with ``skore`` +============================== + +This example builds on top of the :ref:`getting_started` guide. + +``skore`` UI +------------ + +This section provides a quick start to the ``skore`` UI, an open-source package that aims to enable data scientists to: + +#. Store objects of different types from their Python code: python lists, ``scikit-learn`` fitted pipelines, ``plotly`` figures, and more. +#. Track and visualize these stored objects on a user-friendly dashboard. +#. Export the dashboard to a HTML file. + +Initialize a Project and launch the UI +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +From your shell, initialize a skore project, here named ``project``, that will be +in your current working directory: + +.. 
code:: console + + python -m skore create "project" + +This will create a ``skore`` project directory named ``project`` in the current +directory. + +From your shell (in the same directory), start the UI locally: + +.. code:: console + + python -m skore launch "project" + +This will automatically open a browser at the UI's location. + +Now that the project file exists, we can load it in our notebook so that we can +read from and write to it: + +.. code-block:: python + + from skore import load + + project = load("project") + +Storing some items +------------------ + +Storing an integer: + +.. code-block:: python + + project.put("my_int", 3) + +Here, the name of my stored item is ``my_int`` and the integer value is 3. + +For a ``pandas`` data frame: + +.. code-block:: python + + import numpy as np + import pandas as pd + + my_df = pd.DataFrame(np.random.randn(3, 3)) + + project.put("my_df", my_df) + + +For a ``matplotlib`` figure: + +.. code-block:: python + + import matplotlib.pyplot as plt + + x = [0, 1, 2, 3, 4, 5] + fig, ax = plt.subplots(figsize=(5, 3), layout="constrained") + _ = ax.plot(x) + + project.put("my_figure", fig) + +For a ``scikit-learn`` fitted pipeline: + +.. code-block:: python + + from sklearn.datasets import load_diabetes + from sklearn.linear_model import Lasso + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + + diabetes = load_diabetes() + X = diabetes.data[:150] + y = diabetes.target[:150] + my_pipeline = Pipeline( + [("standard_scaler", StandardScaler()), ("lasso", Lasso(alpha=2))] + ) + my_pipeline.fit(X, y) + + project.put("my_fitted_pipeline", my_pipeline) + +Back to the dashboard +^^^^^^^^^^^^^^^^^^^^^ +#. On the top left, create a new ``View``. +#. From the ``Elements`` section on the bottom left, you can add stored items to this view, either by double-clicking on them or by doing drag-and-drop. + +.. 
image:: https://raw.githubusercontent.com/sylvaincom/sylvaincom.github.io/master/files/probabl/skore/2024_10_14_skore_demo.gif + :alt: Getting started with ``skore`` demo diff --git a/examples/00_getting_started.ipynb b/examples/00_getting_started.ipynb deleted file mode 100644 index 762f58b95..000000000 --- a/examples/00_getting_started.ipynb +++ /dev/null @@ -1,201 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# Getting started with `skore`\n", - "\n", - "This guide provides a quick start to `skore`, an open-source package that aims at enable data scientist to:\n", - "1. Store objects of different types from their Python code: python lists, `scikit-learn` fitted pipelines, `plotly` figures, and more.\n", - "2. **Track** and **visualize** these stored objects on a user-friendly dashboard.\n", - "3. Export the dashboard to a HTML file." - ] - }, - { - "cell_type": "markdown", - "id": "1", - "metadata": {}, - "source": [ - "## Initialize a Project and launch the UI\n", - "\n", - "From your shell, initialize a `skore` project, here named `project`, that will be in your current working directory:\n", - "```bash\n", - "python -m skore create \"project\"\n", - "```\n", - "This will create a skore project directory named `project.skore` in the current directory.\n", - "\n", - "From your shell (in the same directory), start the UI locally:\n", - "```bash\n", - "python -m skore launch \"project\"\n", - "```\n", - "This will automatically open a browser at the UI's location.\n", - "\n", - "Now that the project file exists, we can load it in our notebook so that we can read from and write to it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from skore import load\n", - "\n", - "project = load(\"project\")" - ] - }, - { - "cell_type": "markdown", - "id": "3", - "metadata": {}, - "source": [ - "## Storing some items" - ] - }, - { - 
"cell_type": "markdown", - "id": "4", - "metadata": {}, - "source": [ - "Storing an integer:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_int\", 3)" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": {}, - "source": [ - "Here, the name of my stored item is `my_int` and the integer value is 3." - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "For a `pandas` data frame:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "my_df = pd.DataFrame(np.random.randn(3, 3))\n", - "project.put(\"my_df\", my_df)" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": {}, - "source": [ - "for a `matplotlib` figure:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "x = [0, 1, 2, 3, 4, 5]\n", - "fig, ax = plt.subplots(figsize=(5, 3), layout=\"constrained\")\n", - "ax.plot(x)\n", - "project.put(\"my_figure\", fig)" - ] - }, - { - "cell_type": "markdown", - "id": "11", - "metadata": {}, - "source": [ - "For a `scikit-learn` fitted pipeline:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_diabetes\n", - "from sklearn.linear_model import Lasso\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "diabetes = load_diabetes()\n", - "X = diabetes.data[:150]\n", - "y = diabetes.target[:150]\n", - "my_pipeline = Pipeline(\n", - " [(\"standard_scaler\", StandardScaler()), (\"lasso\", Lasso(alpha=2))]\n", - ")\n", - "my_pipeline.fit(X, y)\n", - "project.put(\"my_fitted_pipeline\", my_pipeline)" 
- ] - }, - { - "cell_type": "markdown", - "id": "13", - "metadata": {}, - "source": [ - "## Back to the dashboard\n", - "\n", - "1. On the top left, create a new `View`.\n", - "2. From the `Items` section on the bottom left, you can add stored items to this view, either by double-cliking on them or by doing drag-and-drop." - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,py:percent" - }, - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/00_getting_started.py b/examples/00_getting_started.py deleted file mode 100644 index 92401bdae..000000000 --- a/examples/00_getting_started.py +++ /dev/null @@ -1,104 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.16.2 -# kernelspec: -# display_name: .venv -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Getting started with `skore` -# -# This guide provides a quick start to `skore`, an open-source package that aims at enable data scientist to: -# 1. Store objects of different types from their Python code: python lists, `scikit-learn` fitted pipelines, `plotly` figures, and more. -# 2. **Track** and **visualize** these stored objects on a user-friendly dashboard. -# 3. Export the dashboard to a HTML file. 
- -# %% [markdown] -# ## Initialize a Project and launch the UI -# -# From your shell, initialize a `skore` project, here named `project`, that will be in your current working directory: -# ```bash -# python -m skore create "project" -# ``` -# This will create a skore project directory named `project.skore` in the current directory. -# -# From your shell (in the same directory), start the UI locally: -# ```bash -# python -m skore launch "project" -# ``` -# This will automatically open a browser at the UI's location. -# -# Now that the project file exists, we can load it in our notebook so that we can read from and write to it: - -# %% -from skore import load - -project = load("project") - -# %% [markdown] -# ## Storing some items - -# %% [markdown] -# Storing an integer: - -# %% -project.put("my_int", 3) - -# %% [markdown] -# Here, the name of my stored item is `my_int` and the integer value is 3. - -# %% [markdown] -# For a `pandas` data frame: - -# %% -import numpy as np -import pandas as pd - -my_df = pd.DataFrame(np.random.randn(3, 3)) -project.put("my_df", my_df) - -# %% [markdown] -# for a `matplotlib` figure: - -# %% -import matplotlib.pyplot as plt - -x = [0, 1, 2, 3, 4, 5] -fig, ax = plt.subplots(figsize=(5, 3), layout="constrained") -ax.plot(x) -project.put("my_figure", fig) - -# %% [markdown] -# For a `scikit-learn` fitted pipeline: - -# %% -from sklearn.datasets import load_diabetes -from sklearn.linear_model import Lasso -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler - -diabetes = load_diabetes() -X = diabetes.data[:150] -y = diabetes.target[:150] -my_pipeline = Pipeline( - [("standard_scaler", StandardScaler()), ("lasso", Lasso(alpha=2))] -) -my_pipeline.fit(X, y) -project.put("my_fitted_pipeline", my_pipeline) - -# %% [markdown] -# ## Back to the dashboard -# -# 1. On the top left, create a new `View`. -# 2. 
From the `Items` section on the bottom left, you can add stored items to this view, either by double-cliking on them or by doing drag-and-drop. - -# %% [markdown] -# diff --git a/examples/01_basic_usage.ipynb b/examples/01_basic_usage.ipynb deleted file mode 100644 index d3ec4deda..000000000 --- a/examples/01_basic_usage.ipynb +++ /dev/null @@ -1,817 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# Basic usage of `skore`" - ] - }, - { - "cell_type": "markdown", - "id": "1", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "This guide is to illustrate some of the main features that `skore` currently provides. `skore` an open-source package that aims at enable data scientist to:\n", - "1. Store objects of different types from their Python code: python lists and dictionaries, `numpy` arrays, `pandas` dataframes, `scikit-learn` fitted pipelines, `matplotlib` / `plotly` / `altair` figures, and more.\n", - "2. **Track** and **visualize** these stored objects on a user-friendly dashboard.\n", - "3. Export the dashboard to a HTML file.\n", - "\n", - "This notebook stores some items that have been used to generated a `skore` report available at [this link](https://sylvaincom.github.io/files/probabl/skore/01_basic_usage.html)." 
- ] - }, - { - "cell_type": "markdown", - "id": "2", - "metadata": {}, - "source": [ - "Imports:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "import altair as alt\n", - "import io\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import plotly.express as px\n", - "import PIL\n", - "\n", - "from sklearn.datasets import load_diabetes\n", - "from sklearn.linear_model import Lasso\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "from skore import load\n", - "from skore.item import MediaItem" - ] - }, - { - "cell_type": "markdown", - "id": "4", - "metadata": {}, - "source": [ - "## Initialize and use a Project\n", - "\n", - "From your shell, initialize a `skore` project, here named `project`, that will be in your current working directory:\n", - "```bash\n", - "python -m skore create \"project\"\n", - "```\n", - "This will create a skore project directory named `project.skore` in the current directory.\n", - "\n", - "Now that you have created the `project.skore` folder (even though nothing has yet been stored), you can launch the UI.\n", - "\n", - "From your shell (in the same directory), start the UI locally:\n", - "```bash\n", - "python -m skore launch project\n", - "```\n", - "This will automatically open a browser at the UI's location.\n", - "\n", - "---\n", - "**NOTE**: If you already had a `project.skore` directory from a previous run -- you can check for that using your shell:\n", - "```bash\n", - "ls\n", - "```\n", - "and if you no longer need it, we recommend deleting this folder using your shell:\n", - "```bash\n", - "rm -r project.skore\n", - "```\n", - "This deletion needs to be done before the cells above: before initializing the store and before launching the UI!\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "id": "5", - "metadata": {}, - 
"source": [ - "Now that the project file exists, we can load it in our notebook so that we can read from and write to it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "project = load(\"project\")" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "### Storing an integer" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "Now, let us store our first object, for example an integer:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_int\", 3)" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "Here, the name of my object is `my_int` and the integer value is 3.\n", - "\n", - "You can read it from the Project:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "project.get(\"my_int\")" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "Careful; like in a traditional Python dictionary, the `put` method will *overwrite* past data if you use a key which already exists!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_int\", 30_000)" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": {}, - "source": [ - "Let us check the updated value:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": {}, - "outputs": [], - "source": [ - "project.get(\"my_int\")" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": {}, - "source": [ - "By using the `delete_item` method, you can also delete an object so that your `skore` UI does not become cluttered:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_int_2\", 10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [ - "project.delete_item(\"my_int_2\")" - ] - }, - { - "cell_type": "markdown", - "id": "19", - "metadata": {}, - "source": [ - "You can use `project.list_item_keys` to display all the keys in your project:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [ - "project.list_item_keys()" - ] - }, - { - "cell_type": "markdown", - "id": "21", - "metadata": {}, - "source": [ - "### Storing a string" - ] - }, - { - "cell_type": "markdown", - "id": "22", - "metadata": {}, - "source": [ - "We just stored a integer, now let us store some text using strings!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_string\", \"Hello world!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24", - "metadata": {}, - "outputs": [], - "source": [ - "project.get(\"my_string\")" - ] - }, - { - "cell_type": "markdown", - "id": "25", - "metadata": {}, - "source": [ - "`project.get` infers the type of the inserted object by default. For example, strings are assumed to be in Markdown format. Hence, you can customize the display of your text:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\n", - " \"my_string_2\",\n", - " (\n", - " \"\"\"Hello world!, **bold**, *italic*, `code`\n", - "\n", - "```python\n", - "def my_func(x):\n", - " return x+2\n", - "```\n", - "\"\"\"\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "27", - "metadata": {}, - "source": [ - "Moreover, you can also explicitly tell `skore` the media type of an object, for example in HTML:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28", - "metadata": {}, - "outputs": [], - "source": [ - "# Note we use `put_item` instead of `put`\n", - "project.put_item(\n", - " \"my_string_3\",\n", - " MediaItem.factory(\n", - " \"

Title

bold, italic, etc.

\", media_type=\"text/html\"\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "29", - "metadata": {}, - "source": [ - "Note that the media type is only used for the UI, and not in this notebook at hand:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30", - "metadata": {}, - "outputs": [], - "source": [ - "project.get(\"my_string_3\")" - ] - }, - { - "cell_type": "markdown", - "id": "31", - "metadata": {}, - "source": [ - "You can also conveniently use Python f-strings:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32", - "metadata": {}, - "outputs": [], - "source": [ - "x = 2\n", - "y = [1, 2, 3, 4]\n", - "project.put(\"my_string_4\", f\"The value of `x` is {x} and the value of `y` is {y}.\")" - ] - }, - { - "cell_type": "markdown", - "id": "33", - "metadata": {}, - "source": [ - "### Storing many kinds of data" - ] - }, - { - "cell_type": "markdown", - "id": "34", - "metadata": {}, - "source": [ - "Python list:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35", - "metadata": {}, - "outputs": [], - "source": [ - "my_list = [1, 2, 3, 4]\n", - "project.put(\"my_list\", my_list)" - ] - }, - { - "cell_type": "markdown", - "id": "36", - "metadata": {}, - "source": [ - "Python dictionary:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37", - "metadata": {}, - "outputs": [], - "source": [ - "my_dict = {\n", - " \"company\": \"probabl\",\n", - " \"year\": 2023,\n", - "}\n", - "project.put(\"my_dict\", my_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "38", - "metadata": {}, - "source": [ - "NumPy array:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39", - "metadata": {}, - "outputs": [], - "source": [ - "my_arr = np.random.randn(3, 3)\n", - "project.put(\"my_arr\", my_arr)" - ] - }, - { - "cell_type": "markdown", - "id": "40", - "metadata": {}, - "source": [ - "Pandas data frame:" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "41", - "metadata": {}, - "outputs": [], - "source": [ - "my_df = pd.DataFrame(np.random.randn(3, 3))\n", - "project.put(\"my_df\", my_df)" - ] - }, - { - "cell_type": "markdown", - "id": "42", - "metadata": {}, - "source": [ - "### Data visualization\n", - "\n", - "Note that, in the dashboard, the interactivity of plots is supported, for example for `altair` and `plotly`." - ] - }, - { - "cell_type": "markdown", - "id": "43", - "metadata": {}, - "source": [ - "Matplotlib figures:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44", - "metadata": {}, - "outputs": [], - "source": [ - "x = np.linspace(0, 2, 100)\n", - "\n", - "fig, ax = plt.subplots(figsize=(5, 2.7), layout=\"constrained\")\n", - "ax.plot(x, x, label=\"linear\")\n", - "ax.plot(x, x**2, label=\"quadratic\")\n", - "ax.plot(x, x**3, label=\"cubic\")\n", - "ax.set_xlabel(\"x label\")\n", - "ax.set_ylabel(\"y label\")\n", - "ax.set_title(\"Simple Plot\")\n", - "ax.legend()\n", - "plt.show()\n", - "\n", - "project.put(\"my_figure\", fig)" - ] - }, - { - "cell_type": "markdown", - "id": "45", - "metadata": {}, - "source": [ - "Altair charts:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46", - "metadata": {}, - "outputs": [], - "source": [ - "num_points = 100\n", - "df_plot = pd.DataFrame(\n", - " {\"x\": np.random.randn(num_points), \"y\": np.random.randn(num_points)}\n", - ")\n", - "\n", - "my_altair_chart = (\n", - " alt.Chart(df_plot)\n", - " .mark_circle()\n", - " .encode(x=\"x\", y=\"y\", tooltip=[\"x\", \"y\"])\n", - " .interactive()\n", - " .properties(title=\"My title\")\n", - ")\n", - "my_altair_chart.show()\n", - "\n", - "project.put(\"my_altair_chart\", my_altair_chart)" - ] - }, - { - "cell_type": "markdown", - "id": "47", - "metadata": {}, - "source": [ - "Plotly figures:\n", - "\n", - "> NOTE: Some users reported the following error when running the Plotly cells:\n", - "> ```\n", - "> ValueError: 
Mime type rendering requires nbformat>=4.2.0 but it is not installed\n", - "> ```\n", - "> This is a Plotly issue which is documented [here](https://github.com/plotly/plotly.py/issues/3285); to solve it, we recommend installing nbformat in your environment, e.g. with\n", - "> ```sh\n", - "> pip install --upgrade nbformat\n", - "> ```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48", - "metadata": {}, - "outputs": [], - "source": [ - "df = px.data.iris()\n", - "fig = px.scatter(df, x=df.sepal_length, y=df.sepal_width, color=df.species, size=df.petal_length)\n", - "fig.show()\n", - "project.put(\"my_plotly_fig\", fig)" - ] - }, - { - "cell_type": "markdown", - "id": "49", - "metadata": {}, - "source": [ - "Animated plotly figures:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50", - "metadata": {}, - "outputs": [], - "source": [ - "df = px.data.gapminder()\n", - "my_anim_plotly_fig = px.scatter(df, x=\"gdpPercap\", y=\"lifeExp\", animation_frame=\"year\", animation_group=\"country\",\n", - " size=\"pop\", color=\"continent\", hover_name=\"country\",\n", - " log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])\n", - "my_anim_plotly_fig.show()\n", - "project.put(\"my_anim_plotly_fig\", my_anim_plotly_fig)" - ] - }, - { - "cell_type": "markdown", - "id": "51", - "metadata": {}, - "source": [ - "PIL images:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52", - "metadata": {}, - "outputs": [], - "source": [ - "my_pil_image = PIL.Image.new(\"RGB\", (100, 100), color=\"red\")\n", - "with io.BytesIO() as output:\n", - " my_pil_image.save(output, format=\"png\")\n", - "\n", - "project.put(\"my_pil_image\", my_pil_image)" - ] - }, - { - "cell_type": "markdown", - "id": "53", - "metadata": {}, - "source": [ - "### Scikit-learn models and pipelines\n", - "\n", - "As `skore` is developed by :probabl., the spin-off of scikit-learn, `skore` treats scikit-learn models and pipelines as 
first-class citizens.\n", - "\n", - "First of all, you can store a scikit-learn model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54", - "metadata": {}, - "outputs": [], - "source": [ - "my_model = Lasso(alpha=2)\n", - "project.put(\"my_model\", my_model)" - ] - }, - { - "cell_type": "markdown", - "id": "55", - "metadata": {}, - "source": [ - "You can also store scikit-learn pipelines:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56", - "metadata": {}, - "outputs": [], - "source": [ - "my_pipeline = Pipeline(\n", - " [(\"standard_scaler\", StandardScaler()), (\"lasso\", Lasso(alpha=2))]\n", - ")\n", - "project.put(\"my_pipeline\", my_pipeline)" - ] - }, - { - "cell_type": "markdown", - "id": "57", - "metadata": {}, - "source": [ - "Moreover, you can store fitted scikit-learn pipelines:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58", - "metadata": {}, - "outputs": [], - "source": [ - "diabetes = load_diabetes()\n", - "X = diabetes.data[:150]\n", - "y = diabetes.target[:150]\n", - "my_pipeline.fit(X, y)\n", - "\n", - "project.put(\"my_fitted_pipeline\", my_pipeline)" - ] - }, - { - "cell_type": "markdown", - "id": "59aaa", - "metadata": {}, - "source": [ - "---\n", - "## Cross-validation with skore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58aaaa", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import datasets, linear_model\n", - "from skore.cross_validate import cross_validate\n", - "diabetes = datasets.load_diabetes()\n", - "X = diabetes.data[:150]\n", - "y = diabetes.target[:150]\n", - "lasso = linear_model.Lasso()\n", - "\n", - "cv_results = cross_validate(lasso, X, y, cv=3, project=project)" - ] - }, - { - "cell_type": "markdown", - "id": "59", - "metadata": {}, - "source": [ - "_Stay tuned for some new features!_" - ] - }, - { - "cell_type": "markdown", - "id": "60", - "metadata": {}, - "source": [ - "---\n", - "## 
Manipulating the skore UI\n", - "\n", - "The following is just some `skore` strings that we generate in order to provide more context on the obtained report." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\n", - " \"my_comment_1\",\n", - " \"

Welcome to skore!

skore allows data scientists to create tracking and visualizations from their Python code. This HTML document is actually a skore report generated using the 01_basic_usage.ipynb example notebook then exported (into HTML)!

\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\n", - " \"my_comment_2\",\n", - " \"

Integers

\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_comment_3\", \"

Strings

\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\n", - " \"my_comment_4\",\n", - " \"

Many kinds of data

\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\n", - " \"my_comment_5\",\n", - " \"

Plots

\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66", - "metadata": {}, - "outputs": [], - "source": [ - "project.put(\"my_comment_6\", \"

Scikit-learn models and pipelines

\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,py:percent" - }, - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/01_basic_usage.py b/examples/01_basic_usage.py deleted file mode 100644 index 058020a4b..000000000 --- a/examples/01_basic_usage.py +++ /dev/null @@ -1,384 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.16.2 -# kernelspec: -# display_name: .venv -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Basic usage of `skore` - -# %% [markdown] -# ## Introduction -# -# This guide is to illustrate some of the main features that `skore` currently provides. `skore` an open-source package that aims at enable data scientist to: -# 1. Store objects of different types from their Python code: python lists and dictionaries, `numpy` arrays, `pandas` dataframes, `scikit-learn` fitted pipelines, `matplotlib` / `plotly` / `altair` figures, and more. -# 2. **Track** and **visualize** these stored objects on a user-friendly dashboard. -# 3. Export the dashboard to a HTML file. -# -# This notebook stores some items that have been used to generated a `skore` report available at [this link](https://sylvaincom.github.io/files/probabl/skore/01_basic_usage.html). 
- -# %% [markdown] -# Imports: - -# %% -import altair as alt -import io -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import plotly.express as px -import PIL - -from sklearn.datasets import load_diabetes -from sklearn.linear_model import Lasso -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler - -from skore import load -from skore.item import MediaItem - -# %% [markdown] -# ## Initialize and use a Project -# -# From your shell, initialize a `skore` project, here named `project`, that will be in your current working directory: -# ```bash -# python -m skore create "project" -# ``` -# This will create a skore project directory named `project.skore` in the current directory. -# -# Now that you have created the `project.skore` folder (even though nothing has yet been stored), you can launch the UI. -# -# From your shell (in the same directory), start the UI locally: -# ```bash -# python -m skore launch project -# ``` -# This will automatically open a browser at the UI's location. -# -# --- -# **NOTE**: If you already had a `project.skore` directory from a previous run -- you can check for that using your shell: -# ```bash -# ls -# ``` -# and if you no longer need it, we recommend deleting this folder using your shell: -# ```bash -# rm -r project.skore -# ``` -# This deletion needs to be done before the cells above: before initializing the store and before launching the UI! -# -# --- - -# %% [markdown] -# Now that the project file exists, we can load it in our notebook so that we can read from and write to it: - -# %% -project = load("project") - -# %% [markdown] -# ### Storing an integer - -# %% [markdown] -# Now, let us store our first object, for example an integer: - -# %% -project.put("my_int", 3) - -# %% [markdown] -# Here, the name of my object is `my_int` and the integer value is 3. 
-# -# You can read it from the Project: - -# %% -project.get("my_int") - -# %% [markdown] -# Careful; like in a traditional Python dictionary, the `put` method will *overwrite* past data if you use a key which already exists! - -# %% -project.put("my_int", 30_000) - -# %% [markdown] -# Let us check the updated value: - -# %% -project.get("my_int") - -# %% [markdown] -# By using the `delete_item` method, you can also delete an object so that your `skore` UI does not become cluttered: - -# %% -project.put("my_int_2", 10) - -# %% -project.delete_item("my_int_2") - -# %% [markdown] -# You can use `project.list_item_keys` to display all the keys in your project: - -# %% -project.list_item_keys() - -# %% [markdown] -# ### Storing a string - -# %% [markdown] -# We just stored a integer, now let us store some text using strings! - -# %% -project.put("my_string", "Hello world!") - -# %% -project.get("my_string") - -# %% [markdown] -# `project.get` infers the type of the inserted object by default. For example, strings are assumed to be in Markdown format. Hence, you can customize the display of your text: - -# %% -project.put( - "my_string_2", - ( - """Hello world!, **bold**, *italic*, `code` - -```python -def my_func(x): - return x+2 -``` -""" - ), -) - -# %% [markdown] -# Moreover, you can also explicitly tell `skore` the media type of an object, for example in HTML: - -# %% -# Note we use `put_item` instead of `put` -project.put_item( - "my_string_3", - MediaItem.factory( - "

Title

bold, italic, etc.

", media_type="text/html" - ), -) - -# %% [markdown] -# Note that the media type is only used for the UI, and not in this notebook at hand: - -# %% -project.get("my_string_3") - -# %% [markdown] -# You can also conveniently use Python f-strings: - -# %% -x = 2 -y = [1, 2, 3, 4] -project.put("my_string_4", f"The value of `x` is {x} and the value of `y` is {y}.") - -# %% [markdown] -# ### Storing many kinds of data - -# %% [markdown] -# Python list: - -# %% -my_list = [1, 2, 3, 4] -project.put("my_list", my_list) - -# %% [markdown] -# Python dictionary: - -# %% -my_dict = { - "company": "probabl", - "year": 2023, -} -project.put("my_dict", my_dict) - -# %% [markdown] -# NumPy array: - -# %% -my_arr = np.random.randn(3, 3) -project.put("my_arr", my_arr) - -# %% [markdown] -# Pandas data frame: - -# %% -my_df = pd.DataFrame(np.random.randn(3, 3)) -project.put("my_df", my_df) - -# %% [markdown] -# ### Data visualization -# -# Note that, in the dashboard, the interactivity of plots is supported, for example for `altair` and `plotly`. 
- -# %% [markdown] -# Matplotlib figures: - -# %% -x = np.linspace(0, 2, 100) - -fig, ax = plt.subplots(figsize=(5, 2.7), layout="constrained") -ax.plot(x, x, label="linear") -ax.plot(x, x**2, label="quadratic") -ax.plot(x, x**3, label="cubic") -ax.set_xlabel("x label") -ax.set_ylabel("y label") -ax.set_title("Simple Plot") -ax.legend() -plt.show() - -project.put("my_figure", fig) - -# %% [markdown] -# Altair charts: - -# %% -num_points = 100 -df_plot = pd.DataFrame( - {"x": np.random.randn(num_points), "y": np.random.randn(num_points)} -) - -my_altair_chart = ( - alt.Chart(df_plot) - .mark_circle() - .encode(x="x", y="y", tooltip=["x", "y"]) - .interactive() - .properties(title="My title") -) -my_altair_chart.show() - -project.put("my_altair_chart", my_altair_chart) - -# %% [markdown] -# Plotly figures: -# -# > NOTE: Some users reported the following error when running the Plotly cells: -# > ``` -# > ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed -# > ``` -# > This is a Plotly issue which is documented [here](https://github.com/plotly/plotly.py/issues/3285); to solve it, we recommend installing nbformat in your environment, e.g. 
with -# > ```sh -# > pip install --upgrade nbformat -# > ``` - -# %% -df = px.data.iris() -fig = px.scatter(df, x=df.sepal_length, y=df.sepal_width, color=df.species, size=df.petal_length) -fig.show() -project.put("my_plotly_fig", fig) - -# %% [markdown] -# Animated plotly figures: - -# %% -df = px.data.gapminder() -my_anim_plotly_fig = px.scatter(df, x="gdpPercap", y="lifeExp", animation_frame="year", animation_group="country", - size="pop", color="continent", hover_name="country", - log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90]) -my_anim_plotly_fig.show() -project.put("my_anim_plotly_fig", my_anim_plotly_fig) - -# %% [markdown] -# PIL images: - -# %% -my_pil_image = PIL.Image.new("RGB", (100, 100), color="red") -with io.BytesIO() as output: - my_pil_image.save(output, format="png") - -project.put("my_pil_image", my_pil_image) - -# %% [markdown] -# ### Scikit-learn models and pipelines -# -# As `skore` is developed by :probabl., the spin-off of scikit-learn, `skore` treats scikit-learn models and pipelines as first-class citizens. 
-# -# First of all, you can store a scikit-learn model: - -# %% -my_model = Lasso(alpha=2) -project.put("my_model", my_model) - -# %% [markdown] -# You can also store scikit-learn pipelines: - -# %% -my_pipeline = Pipeline( - [("standard_scaler", StandardScaler()), ("lasso", Lasso(alpha=2))] -) -project.put("my_pipeline", my_pipeline) - -# %% [markdown] -# Moreover, you can store fitted scikit-learn pipelines: - -# %% -diabetes = load_diabetes() -X = diabetes.data[:150] -y = diabetes.target[:150] -my_pipeline.fit(X, y) - -project.put("my_fitted_pipeline", my_pipeline) - -# %% [markdown] -# --- -# ## Cross-validation with skore - -# %% -from sklearn import datasets, linear_model -from skore.cross_validate import cross_validate -diabetes = datasets.load_diabetes() -X = diabetes.data[:150] -y = diabetes.target[:150] -lasso = linear_model.Lasso() - -cv_results = cross_validate(lasso, X, y, cv=3, project=project) - -# %% [markdown] -# _Stay tuned for some new features!_ - -# %% [markdown] -# --- -# ## Manipulating the skore UI -# -# The following is just some `skore` strings that we generate in order to provide more context on the obtained report. - -# %% -project.put( - "my_comment_1", - "

Welcome to skore!

skore allows data scientists to create tracking and visualizations from their Python code. This HTML document is actually a skore report generated using the 01_basic_usage.ipynb example notebook then exported (into HTML)!

", -) - -# %% -project.put( - "my_comment_2", - "

Integers

", -) - -# %% -project.put("my_comment_3", "

Strings

") - -# %% -project.put( - "my_comment_4", - "

Many kinds of data

", -) - -# %% -project.put( - "my_comment_5", - "

Plots

", -) - -# %% -project.put("my_comment_6", "

Scikit-learn models and pipelines

") - -# %% diff --git a/examples/README.rst b/examples/README.rst new file mode 100644 index 000000000..06e3b58fa --- /dev/null +++ b/examples/README.rst @@ -0,0 +1,6 @@ +.. _auto_examples: + +Examples +======== + +Below is a gallery of examples on how to use ``skore``. diff --git a/examples/README.txt b/examples/README.txt deleted file mode 100644 index cda0d581f..000000000 --- a/examples/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -Examples -======== - -This is the gallery of examples that showcase how skore can be used. diff --git a/examples/plot_01_getting_started.py b/examples/plot_01_getting_started.py new file mode 100644 index 000000000..ebfb42e17 --- /dev/null +++ b/examples/plot_01_getting_started.py @@ -0,0 +1,124 @@ +""" +================================= +1) Getting started with ``skore`` +================================= + +This example runs the :ref:`getting_started` guide. + +``skore`` UI +------------ + +This section provides a quick start to the ``skore`` UI, an open-source package that aims to enable data scientists to: + +#. Store objects of different types from their Python code: python lists, ``scikit-learn`` fitted pipelines, ``plotly`` figures, and more. +#. Track and visualize these stored objects on a user-friendly dashboard. +#. Export the dashboard to a HTML file. + +Initialize a Project and launch the UI +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +From your shell, initialize a skore project, here named ``my_project_gs``, that +will be in your current working directory: +""" + +# %% +import subprocess + +# remove the project if it already exists +subprocess.run("rm -rf my_project_gs.skore".split()) + +# create the project +subprocess.run("python3 -m skore create my_project_gs".split()) + +# %% +# This will create a ``skore`` project directory named ``my_project_gs`` in the +# current directory. +# +# From your shell (in the same directory), start the UI locally: +# +# .. 
code:: console +# +# python -m skore launch "my_project_gs" +# +# This will automatically open a browser at the UI's location. +# +# Now that the project file exists, we can load it in our notebook so that we can +# read from and write to it: + +# %% +from skore import load + +my_project_gs = load("my_project_gs.skore") + +# %% +# Storing some items +# ^^^^^^^^^^^^^^^^^^ +# +# Storing an integer: + +# %% +my_project_gs.put("my_int", 3) + +# %% +# Here, the name of my stored item is ``my_int`` and the integer value is 3. + +# %% +my_project_gs.get("my_int") + +# %% +# For a ``pandas`` data frame: + +# %% +import numpy as np +import pandas as pd + +my_df = pd.DataFrame(np.random.randn(3, 3)) + +my_project_gs.put("my_df", my_df) + +# %% +my_project_gs.get("my_df") + +# %% +# For a ``matplotlib`` figure: + +# %% +import matplotlib.pyplot as plt + +x = [0, 1, 2, 3, 4, 5] +fig, ax = plt.subplots(figsize=(5, 3), layout="constrained") +_ = ax.plot(x) + +my_project_gs.put("my_figure", fig) + +# %% +# For a ``scikit-learn`` fitted pipeline: + +# %% +from sklearn.datasets import load_diabetes +from sklearn.linear_model import Lasso +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +diabetes = load_diabetes() +X = diabetes.data[:150] +y = diabetes.target[:150] +my_pipeline = Pipeline( + [("standard_scaler", StandardScaler()), ("lasso", Lasso(alpha=2))] +) +my_pipeline.fit(X, y) + +my_project_gs.put("my_fitted_pipeline", my_pipeline) + +# %% +my_project_gs.get("my_fitted_pipeline") + +# %% +# Back to the dashboard +# ^^^^^^^^^^^^^^^^^^^^^ +# +# #. On the top left, create a new ``View``. +# #. From the ``Elements`` section on the bottom left, you can add stored items to this view, either by double-cliking on them or by doing drag-and-drop. +# +# .. 
image:: https://raw.githubusercontent.com/sylvaincom/sylvaincom.github.io/master/files/probabl/skore/2024_10_14_skore_demo.gif +# :alt: Getting started with ``skore`` demo diff --git a/examples/plot_02_basic_usage.py b/examples/plot_02_basic_usage.py new file mode 100644 index 000000000..d507b1506 --- /dev/null +++ b/examples/plot_02_basic_usage.py @@ -0,0 +1,352 @@ +""" +=========================== +2) Basic usage of ``skore`` +=========================== + +This example complements the `Getting started with skore` example. +""" + +# %% +# ``skore`` UI +# ------------ + +# %% +import altair as alt +import io +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import plotly.express as px +import PIL + +from sklearn.datasets import load_diabetes +from sklearn.linear_model import Lasso +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from skore import load +from skore.item import MediaItem + +# %% +# Initialize a Project and launch the UI +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# %% +import subprocess + +# remove the project if it already exists +subprocess.run("rm -rf my_project_bu.skore".split()) + +# create the project +subprocess.run("python3 -m skore create my_project_bu".split()) + + +# %% +from skore import load + +my_project_gs = load("my_project_gs.skore") + + +# %% +# Storing an integer +# ^^^^^^^^^^^^^^^^^^ +# +# Now, let us store our first object, for example an integer: + +# %% +my_project_gs.put("my_int", 3) + +# %% +# Here, the name of my object is ``my_int`` and the integer value is 3. +# +# You can read it from the project: + +# %% +my_project_gs.get("my_int") + +# %% +# Careful; like in a traditional Python dictionary, the `put` method will *overwrite* past data if you use a key which already exists! 
+ +# %% +my_project_gs.put("my_int", 30_000) + +# %% +# Let us check the updated value: + +# %% +my_project_gs.get("my_int") + +# %% +# By using the `delete_item` method, you can also delete an object so that your `skore` UI does not become cluttered: + +# %% +my_project_gs.put("my_int_2", 10) + +# %% +my_project_gs.delete_item("my_int_2") + +# %% +# You can use `my_project_gs.list_item_keys` to display all the keys in your project: + +# %% +my_project_gs.list_item_keys() + +# %% +# ### Storing a string + +# %% +# We just stored a integer, now let us store some text using strings! + +# %% +my_project_gs.put("my_string", "Hello world!") + +# %% +my_project_gs.get("my_string") + +# %% +# ``my_project_gs.get`` infers the type of the inserted object by default. For example, strings are assumed to be in Markdown format. Hence, you can customize the display of your text: + +# %% +my_project_gs.put( + "my_string_2", + ( + """Hello world!, **bold**, *italic*, `code` + +```python +def my_func(x): + return x+2 +``` +""" + ), +) + +# %% +# Moreover, you can also explicitly tell `skore` the media type of an object, for example in HTML: + +# %% +# Note: we use ``put_item`` instead of ``put``: +my_project_gs.put_item( + "my_string_3", + MediaItem.factory( + "

Title

bold, italic, etc.

", media_type="text/html" + ), +) + +# %% +# Note that the media type is only used for the UI, and not in this notebook at hand: + +# %% +my_project_gs.get("my_string_3") + +# %% +# You can also conveniently use Python f-strings: + +# %% +x = 2 +y = [1, 2, 3, 4] +my_project_gs.put( + "my_string_4", f"The value of `x` is {x} and the value of `y` is {y}." +) + +# %% +# Storing many kinds of data +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# %% +# Python list: + +# %% +my_list = [1, 2, 3, 4] +my_project_gs.put("my_list", my_list) + +# %% +# Python dictionary: + +# %% +my_dict = { + "company": "probabl", + "year": 2023, +} +my_project_gs.put("my_dict", my_dict) + +# %% +# ``numpy`` array: + +# %% +my_arr = np.random.randn(3, 3) +my_project_gs.put("my_arr", my_arr) + +# %% +# ``pandas`` data frame: + +# %% +my_df = pd.DataFrame(np.random.randn(3, 3)) +my_project_gs.put("my_df", my_df) + +# %% +# Data visualization +# ^^^^^^^^^^^^^^^^^^ +# +# Note that, in the dashboard, the interactivity of plots is supported, for example for ``altair`` and ``plotly``. 
+ +# %% +# ``matplotlib`` figures: + +# %% +x = np.linspace(0, 2, 100) + +fig, ax = plt.subplots(figsize=(5, 2.7), layout="constrained") +ax.plot(x, x, label="linear") +ax.plot(x, x**2, label="quadratic") +ax.plot(x, x**3, label="cubic") +ax.set_xlabel("x label") +ax.set_ylabel("y label") +ax.set_title("Simple Plot") +ax.legend() +plt.show() + +my_project_gs.put("my_figure", fig) + +# %% +# ``altair`` charts: + +# %% +num_points = 100 +df_plot = pd.DataFrame( + {"x": np.random.randn(num_points), "y": np.random.randn(num_points)} +) + +my_altair_chart = ( + alt.Chart(df_plot) + .mark_circle() + .encode(x="x", y="y", tooltip=["x", "y"]) + .interactive() + .properties(title="My title") +) +my_altair_chart.show() + +my_project_gs.put("my_altair_chart", my_altair_chart) + +# %% +# Plotly figures: +# +# NOTE: Some users reported the following error when running the Plotly cells: +# ``` +# ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed +# ``` +# This is a Plotly issue which is documented `here `_; to solve it, we recommend installing nbformat in your environment, e.g. with +# +# .. 
code:: console +# +# pip install --upgrade nbformat + +# %% +df = px.data.iris() +fig = px.scatter( + df, x=df.sepal_length, y=df.sepal_width, color=df.species, size=df.petal_length +) +fig.show() +my_project_gs.put("my_plotly_fig", fig) + +# %% +# Animated ``plotly`` figures: + +# %% +df = px.data.gapminder() +my_anim_plotly_fig = px.scatter( + df, + x="gdpPercap", + y="lifeExp", + animation_frame="year", + animation_group="country", + size="pop", + color="continent", + hover_name="country", + log_x=True, + size_max=55, + range_x=[100, 100000], + range_y=[25, 90], +) +my_anim_plotly_fig.show() +my_project_gs.put("my_anim_plotly_fig", my_anim_plotly_fig) + +# %% +# PIL images: + +# %% +my_pil_image = PIL.Image.new("RGB", (100, 100), color="red") +with io.BytesIO() as output: + my_pil_image.save(output, format="png") + +my_project_gs.put("my_pil_image", my_pil_image) + +# %% +# Scikit-learn models and pipelines +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# As ``skore`` is developed by `Probabl `_, the spin-off of scikit-learn, ``skore`` treats scikit-learn models and pipelines as first-class citizens. +# +# First of all, you can store a scikit-learn model: + +# %% +my_model = Lasso(alpha=2) +my_project_gs.put("my_model", my_model) + +# %% +# You can also store ``scikit-learn`` pipelines: + +# %% +my_pipeline = Pipeline( + [("standard_scaler", StandardScaler()), ("lasso", Lasso(alpha=2))] +) +my_project_gs.put("my_pipeline", my_pipeline) + +# %% +# Moreover, you can store fitted ``scikit-learn`` pipelines: + +# %% +diabetes = load_diabetes() +X = diabetes.data[:150] +y = diabetes.target[:150] +my_pipeline.fit(X, y) + +my_project_gs.put("my_fitted_pipeline", my_pipeline) + +# %% +# *Stay tuned for some new features!* + +# %% +# Manipulating the skore UI +# ^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# The following is just some ``skore`` strings that we generate in order to provide more context on the obtained report. + +# %% +my_project_gs.put( + "my_comment_1", + "

Welcome to skore!

skore allows data scientists to create tracking and visualizations from their Python code. This HTML document is actually a skore report generated using the 01_basic_usage.ipynb example notebook then exported (into HTML)!

", +) + +# %% +my_project_gs.put( + "my_comment_2", + "

Integers

", +) + +# %% +my_project_gs.put("my_comment_3", "

Strings

") + +# %% +my_project_gs.put( + "my_comment_4", + "

Many kinds of data

", +) + +# %% +my_project_gs.put( + "my_comment_5", + "

Plots

", +) + +# %% +my_project_gs.put("my_comment_6", "

Scikit-learn models and pipelines

") diff --git a/examples/plot_03_cross_validate.py b/examples/plot_03_cross_validate.py new file mode 100644 index 000000000..d4bf45a11 --- /dev/null +++ b/examples/plot_03_cross_validate.py @@ -0,0 +1,35 @@ +""" +=================================== +3) Using ``skore``'s cross validate +=================================== + +This example illustrates the use of :func:`~skore.cross_validate`. +""" + +# %% +import subprocess + +# remove the project if it already exists +subprocess.run("rm -rf my_project_cv.skore".split()) + +# create the project +subprocess.run("python3 -m skore create my_project_cv".split()) + + +# %% +from skore import load + +my_project_gs = load("my_project_cv.skore") + +# %% +from sklearn import datasets, linear_model +from skore.cross_validate import cross_validate + +diabetes = datasets.load_diabetes() +X = diabetes.data[:150] +y = diabetes.target[:150] +lasso = linear_model.Lasso() + +cv_results = cross_validate(lasso, X, y, cv=3, project=my_project_gs) + +my_project_gs.get_item("cross_validation").plot diff --git a/examples/skrub_demo.ipynb b/examples/skrub_demo.ipynb deleted file mode 100644 index 2b7620454..000000000 --- a/examples/skrub_demo.ipynb +++ /dev/null @@ -1,453 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "0", - "metadata": {}, - "outputs": [], - "source": [ - "# ruff: noqa\n", - "import base64\n", - "from pathlib import Path\n", - "from time import time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "import altair as alt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "from matplotlib import pyplot as plt\n", - "from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor\n", - "from sklearn.inspection import permutation_importance\n", - "from sklearn.linear_model import RidgeCV\n", - "from sklearn.metrics import mean_absolute_error, 
mean_squared_error, r2_score\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.utils import Bunch\n", - "from skrub import TableReport, tabular_learner\n", - "from skrub.datasets import fetch_employee_salaries\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from skore import load\n", - "from skore.item import MediaItem" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "DIR_MANDER = \"datamander\"\n", - "PATH_PROJECT = Path(\"skrub_demo\")\n", - "N_SEEDS = 5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a project at path './skrub_demo.skore'\n", - "!python -m skore create skrub_demo" - ] - }, - { - "cell_type": "markdown", - "id": "5", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "Launch the web UI with `python -m skore launch skrub_demo`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "def init_ridge():\n", - " return tabular_learner(RidgeCV())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "def init_rf():\n", - " return tabular_learner(RandomForestRegressor(n_jobs=4))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "def init_gb():\n", - " return tabular_learner(HistGradientBoostingRegressor())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "INIT_MODEL_FUNC = {\n", - " \"ridge\": init_ridge,\n", - " \"rf\": init_rf,\n", - " \"gb\": init_gb,\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - 
"metadata": {}, - "outputs": [], - "source": [ - "def evaluate_models(model_names):\n", - " results = []\n", - " for model_name in model_names:\n", - " print(f\"{' Evaluating ' + model_name + ' ':=^50}\")\n", - " results.append(evaluate_seeds(model_name))\n", - "\n", - " project = load(PATH_PROJECT)\n", - " project.put_item(\n", - " \"skrub_report\",\n", - " MediaItem.factory(plot_skrub_report(), media_type=\"text/html\"),\n", - " )\n", - "\n", - " project.put(\"target_distribution\", plot_y_distribution())\n", - " project.put(\"Metrics\", plot_table_metrics(results))\n", - " project.put(\"R2 vs fit time\", plot_r2_vs_fit_time(results))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate_seeds(model_name):\n", - " path_model = PATH_PROJECT / model_name\n", - "\n", - " seed_scores = []\n", - " for random_state in tqdm(range(N_SEEDS)):\n", - " bunch = get_data(random_state)\n", - " model = INIT_MODEL_FUNC[model_name]()\n", - "\n", - " tic = time()\n", - " model.fit(bunch.X_train, bunch.y_train)\n", - " fit_time = time() - tic\n", - "\n", - " scores = evaluate(model, bunch)\n", - " scores.update(\n", - " {\n", - " \"random_state\": random_state,\n", - " \"model_name\": model_name,\n", - " \"fit_time\": fit_time,\n", - " }\n", - " )\n", - "\n", - " path_seed = path_model / f\"random_state{random_state}\"\n", - "\n", - " project = load(PATH_PROJECT)\n", - " project.put(path_seed / \"scores\", scores) # scores is a dict\n", - " project.put_item(\n", - " path_seed / \"model_repr\",\n", - " MediaItem.factory(plot_model_repr(model), media_type=\"text/html\"),\n", - " )\n", - " project.put(\n", - " path_seed / \"feature importance\", plot_feature_importance(model, bunch)\n", - " )\n", - " seed_scores.append(scores)\n", - "\n", - " agg_scores = aggregate_seeds_results(seed_scores)\n", - " project.put(path_model / \"agg_scores\", agg_scores)\n", - "\n", - " return agg_scores" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, bunch):\n", - " y_pred = model.predict(bunch.X_test)\n", - " y_test = bunch[\"y_test\"]\n", - "\n", - " r2 = r2_score(y_test, y_pred)\n", - " mae = mean_absolute_error(y_test, y_pred)\n", - " mse = mean_squared_error(y_test, y_pred)\n", - "\n", - " scores = {\n", - " \"y_pred\": y_pred.tolist(),\n", - " \"r2\": r2,\n", - " \"mae\": mae,\n", - " \"mse\": mse,\n", - " }\n", - "\n", - " return scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "def aggregate_seeds_results(scores):\n", - " agg_score = dict()\n", - " for metric in [\"r2\", \"mae\", \"mse\", \"fit_time\"]:\n", - " score_seeds = [score[metric] for score in scores]\n", - " agg_score.update(\n", - " {\n", - " f\"mean_{metric}\": np.mean(score_seeds),\n", - " f\"std_{metric}\": np.std(score_seeds),\n", - " }\n", - " )\n", - "\n", - " agg_score[\"model_name\"] = scores[0][\"model_name\"]\n", - "\n", - " return agg_score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], - "source": [ - "def get_data(random_state, split=True):\n", - " dataset = fetch_employee_salaries()\n", - " X, y = dataset.X, dataset.y\n", - " if split:\n", - " X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, random_state=random_state\n", - " )\n", - " return Bunch(\n", - " X_train=X_train,\n", - " y_train=y_train,\n", - " X_test=X_test,\n", - " y_test=y_test,\n", - " )\n", - " else:\n", - " return Bunch(X=X, y=y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_table_metrics(results):\n", - " df = pd.DataFrame(results)\n", - " rename = {\n", - " \"r2\": \"R2 (↑)\",\n", - " \"mse\": \"MSE (↓)\",\n", - " \"mae\": \"MAE (↓)\",\n", - " \"fit_time\": \"Fit 
time (↓)\",\n", - " }\n", - "\n", - " for metric in [\"r2\", \"mae\", \"mse\", \"fit_time\"]:\n", - " mean_key, std_key = f\"mean_{metric}\", f\"std_{metric}\"\n", - " df[rename[metric]] = (\n", - " df[mean_key].round(4).astype(str) + \" ± \" + df[std_key].round(4).astype(str)\n", - " )\n", - " df = df.drop([mean_key, std_key], axis=1)\n", - "\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.figure\n", - "\n", - "\n", - "def plot_r2_vs_fit_time(results) -> matplotlib.figure.Figure:\n", - " df = pd.DataFrame(results)\n", - "\n", - " model_names = df[\"model_name\"].tolist()\n", - " palette = dict(\n", - " zip(\n", - " list(model_names),\n", - " sns.color_palette(\"colorblind\", n_colors=len(model_names)),\n", - " )\n", - " )\n", - "\n", - " fig, ax = plt.subplots(figsize=(8, 5), dpi=100)\n", - " c = \"black\"\n", - " plt.errorbar(\n", - " x=df[\"mean_fit_time\"],\n", - " y=df[\"mean_r2\"],\n", - " yerr=df[\"std_r2\"],\n", - " fmt=\"none\",\n", - " c=c,\n", - " capsize=2,\n", - " )\n", - " plt.errorbar(\n", - " x=df[\"mean_fit_time\"],\n", - " xerr=df[\"std_fit_time\"],\n", - " y=df[\"mean_r2\"],\n", - " fmt=\"none\",\n", - " c=c,\n", - " capsize=2,\n", - " )\n", - " ax = sns.scatterplot(\n", - " df,\n", - " x=\"mean_fit_time\",\n", - " y=\"mean_r2\",\n", - " hue=\"model_name\",\n", - " s=200,\n", - " palette=palette,\n", - " zorder=10,\n", - " alpha=1,\n", - " )\n", - "\n", - " ax.grid()\n", - " sns.move_legend(ax, \"upper left\", bbox_to_anchor=(1, 1))\n", - " # plt.tight_layout()\n", - "\n", - " return fig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_skrub_report():\n", - " bunch = get_data(random_state=0, split=False)\n", - " df = pd.concat([bunch.X, bunch.y], axis=1)\n", - " return TableReport(df).html()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_feature_importance(model, bunch) -> alt.Chart:\n", - " importances = permutation_importance(model, bunch.X_test, bunch.y_test, n_jobs=4)\n", - "\n", - " feature_imp = pd.DataFrame(\n", - " importances[\"importances\"].T, columns=bunch.X_train.columns\n", - " ).melt() # Convert the dataframe to a long format\n", - "\n", - " return (\n", - " alt.Chart(feature_imp)\n", - " .mark_boxplot(extent=\"min-max\")\n", - " .encode(\n", - " alt.X(\"value:Q\").scale(domain=[0, 1]),\n", - " alt.Y(\"variable:N\"),\n", - " )\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_y_distribution() -> alt.Chart:\n", - " bunch = get_data(random_state=0, split=False)\n", - " df = pd.concat([bunch.X, bunch.y], axis=1)\n", - " N = min(1000, df.shape[0])\n", - " df = df.sample(N)\n", - "\n", - " # alt.data_transformers.enable(\"vegafusion\")\n", - "\n", - " return (\n", - " alt.Chart(df)\n", - " .mark_bar()\n", - " .encode(\n", - " x=alt.X(\"current_annual_salary:Q\", bin=alt.Bin(maxbins=30)),\n", - " y=\"count()\",\n", - " color=\"gender:N\",\n", - " )\n", - " .properties(width=600, height=400)\n", - " .interactive()\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_model_repr(model) -> str:\n", - " return model._repr_html_()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": {}, - "outputs": [], - "source": [ - "if __name__ == \"__main__\":\n", - " evaluate_models(model_names=list(INIT_MODEL_FUNC))" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "formats": "ipynb,py:percent", - "main_language": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/skrub_demo.py b/examples/skrub_demo.py deleted file 
mode 100644 index d13632411..000000000 --- a/examples/skrub_demo.py +++ /dev/null @@ -1,313 +0,0 @@ -# --- -# jupyter: -# jupytext: -# cell_metadata_filter: -all -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.16.1 -# --- - -# %% -# ruff: noqa -import base64 -from pathlib import Path -from time import time - -# %% -import altair as alt -import numpy as np -import pandas as pd -import seaborn as sns -from matplotlib import pyplot as plt -from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor -from sklearn.inspection import permutation_importance -from sklearn.linear_model import RidgeCV -from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score -from sklearn.model_selection import train_test_split -from sklearn.utils import Bunch -from skrub import TableReport, tabular_learner -from skrub.datasets import fetch_employee_salaries -from tqdm import tqdm - -# %% -from skore import load -from skore.item import MediaItem - -# %% -DIR_MANDER = "datamander" -PATH_PROJECT = Path("skrub_demo") -N_SEEDS = 5 - -# %% -# Create a project at path './skrub_demo.skore' -# !python -m skore create skrub_demo - -# %% [markdown] -# Launch the web UI with `python -m skore launch skrub_demo` - - -# %% -def init_ridge(): - return tabular_learner(RidgeCV()) - - -# %% -def init_rf(): - return tabular_learner(RandomForestRegressor(n_jobs=4)) - - -# %% -def init_gb(): - return tabular_learner(HistGradientBoostingRegressor()) - - -# %% -INIT_MODEL_FUNC = { - "ridge": init_ridge, - "rf": init_rf, - "gb": init_gb, -} - - -# %% -def evaluate_models(model_names): - results = [] - for model_name in model_names: - print(f"{' Evaluating ' + model_name + ' ':=^50}") - results.append(evaluate_seeds(model_name)) - - project = load(PATH_PROJECT) - project.put_item( - "skrub_report", - MediaItem.factory(plot_skrub_report(), media_type="text/html"), - ) - - 
project.put("target_distribution", plot_y_distribution()) - project.put("Metrics", plot_table_metrics(results)) - project.put("R2 vs fit time", plot_r2_vs_fit_time(results)) - - -# %% -def evaluate_seeds(model_name): - path_model = PATH_PROJECT / model_name - - seed_scores = [] - for random_state in tqdm(range(N_SEEDS)): - bunch = get_data(random_state) - model = INIT_MODEL_FUNC[model_name]() - - tic = time() - model.fit(bunch.X_train, bunch.y_train) - fit_time = time() - tic - - scores = evaluate(model, bunch) - scores.update( - { - "random_state": random_state, - "model_name": model_name, - "fit_time": fit_time, - } - ) - - path_seed = path_model / f"random_state{random_state}" - - project = load(PATH_PROJECT) - project.put(path_seed / "scores", scores) # scores is a dict - project.put_item( - path_seed / "model_repr", - MediaItem.factory(plot_model_repr(model), media_type="text/html"), - ) - project.put( - path_seed / "feature importance", plot_feature_importance(model, bunch) - ) - seed_scores.append(scores) - - agg_scores = aggregate_seeds_results(seed_scores) - project.put(path_model / "agg_scores", agg_scores) - - return agg_scores - - -# %% -def evaluate(model, bunch): - y_pred = model.predict(bunch.X_test) - y_test = bunch["y_test"] - - r2 = r2_score(y_test, y_pred) - mae = mean_absolute_error(y_test, y_pred) - mse = mean_squared_error(y_test, y_pred) - - scores = { - "y_pred": y_pred.tolist(), - "r2": r2, - "mae": mae, - "mse": mse, - } - - return scores - - -# %% -def aggregate_seeds_results(scores): - agg_score = dict() - for metric in ["r2", "mae", "mse", "fit_time"]: - score_seeds = [score[metric] for score in scores] - agg_score.update( - { - f"mean_{metric}": np.mean(score_seeds), - f"std_{metric}": np.std(score_seeds), - } - ) - - agg_score["model_name"] = scores[0]["model_name"] - - return agg_score - - -# %% -def get_data(random_state, split=True): - dataset = fetch_employee_salaries() - X, y = dataset.X, dataset.y - if split: - X_train, X_test, 
y_train, y_test = train_test_split( - X, y, random_state=random_state - ) - return Bunch( - X_train=X_train, - y_train=y_train, - X_test=X_test, - y_test=y_test, - ) - else: - return Bunch(X=X, y=y) - - -# %% -def plot_table_metrics(results): - df = pd.DataFrame(results) - rename = { - "r2": "R2 (↑)", - "mse": "MSE (↓)", - "mae": "MAE (↓)", - "fit_time": "Fit time (↓)", - } - - for metric in ["r2", "mae", "mse", "fit_time"]: - mean_key, std_key = f"mean_{metric}", f"std_{metric}" - df[rename[metric]] = ( - df[mean_key].round(4).astype(str) + " ± " + df[std_key].round(4).astype(str) - ) - df = df.drop([mean_key, std_key], axis=1) - - return df - - -# %% -import matplotlib.figure - - -def plot_r2_vs_fit_time(results) -> matplotlib.figure.Figure: - df = pd.DataFrame(results) - - model_names = df["model_name"].tolist() - palette = dict( - zip( - list(model_names), - sns.color_palette("colorblind", n_colors=len(model_names)), - ) - ) - - fig, ax = plt.subplots(figsize=(8, 5), dpi=100) - c = "black" - plt.errorbar( - x=df["mean_fit_time"], - y=df["mean_r2"], - yerr=df["std_r2"], - fmt="none", - c=c, - capsize=2, - ) - plt.errorbar( - x=df["mean_fit_time"], - xerr=df["std_fit_time"], - y=df["mean_r2"], - fmt="none", - c=c, - capsize=2, - ) - ax = sns.scatterplot( - df, - x="mean_fit_time", - y="mean_r2", - hue="model_name", - s=200, - palette=palette, - zorder=10, - alpha=1, - ) - - ax.grid() - sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) - # plt.tight_layout() - - return fig - - -# %% -def plot_skrub_report(): - bunch = get_data(random_state=0, split=False) - df = pd.concat([bunch.X, bunch.y], axis=1) - return TableReport(df).html() - - -# %% -def plot_feature_importance(model, bunch) -> alt.Chart: - importances = permutation_importance(model, bunch.X_test, bunch.y_test, n_jobs=4) - - feature_imp = pd.DataFrame( - importances["importances"].T, columns=bunch.X_train.columns - ).melt() # Convert the dataframe to a long format - - return ( - 
alt.Chart(feature_imp) - .mark_boxplot(extent="min-max") - .encode( - alt.X("value:Q").scale(domain=[0, 1]), - alt.Y("variable:N"), - ) - ) - - -# %% -def plot_y_distribution() -> alt.Chart: - bunch = get_data(random_state=0, split=False) - df = pd.concat([bunch.X, bunch.y], axis=1) - N = min(1000, df.shape[0]) - df = df.sample(N) - - # alt.data_transformers.enable("vegafusion") - - return ( - alt.Chart(df) - .mark_bar() - .encode( - x=alt.X("current_annual_salary:Q", bin=alt.Bin(maxbins=30)), - y="count()", - color="gender:N", - ) - .properties(width=600, height=400) - .interactive() - ) - - -# %% -def plot_model_repr(model) -> str: - return model._repr_html_() - - -# %% -if __name__ == "__main__": - evaluate_models(model_names=list(INIT_MODEL_FUNC)) diff --git a/skore/pyproject.toml b/skore/pyproject.toml index f6d813d92..c745e078f 100644 --- a/skore/pyproject.toml +++ b/skore/pyproject.toml @@ -77,6 +77,16 @@ test = [ "scikit-learn", ] +doc = [ + "sphinx", + "pydata-sphinx-theme", + "sphinx-gallery", + "sphinx-design", + "matplotlib", + "scikit-learn", + "numpydoc", +] + [tool.pytest.ini_options] addopts = [ "--doctest-modules",