From 460e03db50f41b2100688109d8caa73930419626 Mon Sep 17 00:00:00 2001
From: Lorenzo Stella
Date: Fri, 8 Jul 2022 10:49:51 +0200
Subject: [PATCH] Backports for v0.10.1 (#2145)

* Docs: Rework `installation` section. (#2130)

* Fix DatasetCollection (#2135)

* Fix `PandasDataset` for Python 3.9 (#2141)

* Docs: Fix running tutorials for publishing docs. (#2138)

* Docs: Fix running tutorials for publishing docs.

* Update requirements.

* Docs: Make notebook templates. (#2122)

* Use of check_github_event.

* Fix issues with hyperparameter tuning tutorial (#2143)

* Apply black to notebooks. (#2144)

Co-authored-by: Jasper
Co-authored-by: rsnirwan
---
 .github/workflows/docs.yml                    |   3 -
 Justfile                                      |  24 +--
 docs/getting_started/install.md               | 159 ++++++++++++++++++
 docs/getting_started/install.rst              |  78 ---------
 docs/md2ipynb.py                              | 113 +++++++++----
 ...ut => howto_pytorch_lightning.md.template} |   0
 ...nput => hp_tuning_with_optuna.md.template} |   2 -
 ...md.input => trainer_callbacks.md.template} |   0
 ....md.input => pandasdataframes.md.template} |   0
 ... => synthetic_data_generation.md.template} |   0
 ...md.input => extended_tutorial.md.template} |   0
 ...input => quick_start_tutorial.md.template} |   0
 requirements/requirements-docs.txt            |   3 +
 src/gluonts/dataset/__init__.py               |   2 +-
 src/gluonts/dataset/pandas.py                 |   4 +-
 15 files changed, 255 insertions(+), 133 deletions(-)
 create mode 100644 docs/getting_started/install.md
 delete mode 100644 docs/getting_started/install.rst
 rename docs/tutorials/advanced_topics/{howto_pytorch_lightning.md.input => howto_pytorch_lightning.md.template} (100%)
 rename docs/tutorials/advanced_topics/{hp_tuning_with_optuna.md.input => hp_tuning_with_optuna.md.template} (99%)
 rename docs/tutorials/advanced_topics/{trainer_callbacks.md.input => trainer_callbacks.md.template} (100%)
 rename docs/tutorials/data_manipulation/{pandasdataframes.md.input => pandasdataframes.md.template} (100%)
 rename docs/tutorials/data_manipulation/{synthetic_data_generation.md.input => synthetic_data_generation.md.template} (100%)
 rename docs/tutorials/forecasting/{extended_tutorial.md.input => extended_tutorial.md.template} (100%)
 rename docs/tutorials/forecasting/{quick_start_tutorial.md.input => quick_start_tutorial.md.template} (100%)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 758e533620..956bf6f09d 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -6,9 +6,6 @@ defaults:
   run:
     shell: bash
 
-env:
-  SKIP_BUILD_NOTEBOOK: ${{!( github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'pr:docs-build-notebook'))}}
-
 jobs:
   docs-build:
     runs-on: ubuntu-latest
diff --git a/Justfile b/Justfile
index ff6c1bfed1..6d64f73e44 100644
--- a/Justfile
+++ b/Justfile
@@ -15,30 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.
 
-ROOTDIR := invocation_directory()
+ROOTDIR := justfile_directory()
 
 MD2IPYNB := ROOTDIR + "/docs/md2ipynb.py"
 
-skip_build_notebook := env_var_or_default("SKIP_BUILD_NOTEBOOK", "false")
-
-
-docs: release
+docs: compile_notebooks
     make -C docs html # SPHINXOPTS=-W
 
 clean:
     git clean -ff -d -x --exclude="{{ROOTDIR}}/tests/externaldata/*" --exclude="{{ROOTDIR}}/tests/data/*" --exclude="{{ROOTDIR}}/conda/"
 
-compile_notebooks:
-    if [ {{skip_build_notebook}} = "true" ] ; \
-    then \
-        find docs/tutorials/**/*.md.input | sed 'p;s/\.input//' | xargs -n2 cp; \
-    else \
-        python -m ipykernel install --user --name docsbuild; \
-        python {{MD2IPYNB}} --kernel docsbuild "docs/tutorials/**/*.md.input"; \
-    fi;
-
-dist_notebooks: compile_notebooks
-    cd docs/tutorials && \
-    find * -type d -prune | grep -v 'tests\|__pycache__' | xargs -t -n 1 -I{} zip --no-dir-entries -r {}.zip {} -x "*.md" -x "__pycache__" -x "*.pyc" -x "*.txt" -x "*.log" -x "*.params" -x "*.npz" -x "*.json"
+compile_notebooks mode="release":
+    python -m ipykernel install --user --name docsbuild
+    python {{MD2IPYNB}} --kernel docsbuild docs/tutorials/**/*.md.template --mode {{mode}}
 
-release: dist_notebooks
+release:
     python setup.py sdist
diff --git a/docs/getting_started/install.md b/docs/getting_started/install.md
new file mode 100644
index 0000000000..ed37e4b2e1
--- /dev/null
+++ b/docs/getting_started/install.md
@@ -0,0 +1,159 @@
+
+# Installation
+
+GluonTS is available from PyPI via:
+
+```sh
+pip install gluonts
+```
+
+```{attention}
+**GluonTS uses a minimal dependency model.**
+
+This means that, to use most models and features, additional dependencies need
+to be installed. See the next section for more information.
+
+```
+
+## Optional and Extra Dependencies
+
+Python has the notion of [extras](https://peps.python.org/pep-0508/#extras)
+-- dependencies that can be optionally installed to unlock certain features of
+a package.
+
+When installing a package, they are passed via ``[...]`` after the package
+name:
+
+```sh
+pip install some-package[extra-1,extra-2]
+```
+
+We make extensive use of optional dependencies in GluonTS to keep the number of
+required dependencies minimal. To still allow users to opt in to certain
+features, we expose many extra dependencies.
+
+For example, we offer support for reading and writing Arrow and Parquet based
+datasets using [Apache Arrow](https://arrow.apache.org/). However, it is a
+hefty dependency to require, especially if one has no need for it. Thus, we
+offer the ``arrow``-extra, which installs the required packages and can be
+enabled simply by using:
+
+```sh
+pip install gluonts[arrow]
+```
+
+### Models
+
+
+#### PyTorch
+
+Models written using [PyTorch](https://pytorch.org/) are available via the
+``gluonts.torch`` subpackage.
+
+In addition to PyTorch, we require [PyTorch Lightning](https://www.pytorchlightning.ai/)
+to be installed as well.
+
+Both required dependencies are included in the ``torch``-extra:
+
+```sh
+pip install gluonts[torch]
+```
+
+
+#### MXNet
+
+MXNet-based models require a version of ``mxnet`` to be installed.
+
+```{note}
+
+MXNet provides different packages for CPU and GPU usage. Please refer to its
+[documentation](https://mxnet.apache.org/versions/1.9.1/get_started?) to
+select the version that fits your use case.
+
+```
+
+The ``mxnet``-extra will install a CPU-only version:
+
+```sh
+pip install gluonts[mxnet]
+
+```
+
+
+#### 3rd Party
+
+##### R-Forecast
+
+GluonTS includes a thin wrapper for calling the ``R`` `forecast` package.
+
+In order to use it, you need to install [``R``](https://www.r-project.org/) and
+the `forecast` package:
+
+```sh
+R -e 'install.packages(c("forecast", "nnfor"), repos="https://cloud.r-project.org")'
+```
+
+In addition, we require ``rpy2`` to be installed:
+
+```sh
+pip install 'rpy2>=2.9.*,<3.*'
+```
+
+##### Prophet
+
+The [Prophet](https://facebook.github.io/prophet/) forecasting library is
+available via `gluonts.model.prophet` and requires the ``prophet`` package to
+be installed.
+
+The ``prophet``-extra also installs it:
+
+```sh
+pip install gluonts[prophet]
+```
+
+
+### Datasets
+
+#### JSON
+
+Since Python's built-in ``json`` package is known to be relatively slow, we use
+faster implementations if available: ``orjson`` (recommended) and ``ujson``.
+
+You can install ``orjson`` via:
+
+```sh
+pip install orjson
+```
+
+```{hint}
+GluonTS will emit a warning if neither ``orjson`` nor ``ujson`` is installed.
+There is no functional difference between the implementations, but
+especially when working with larger datasets, performance can be notably
+impacted when relying on the default ``json`` package.
+```
+
+#### Arrow
+
+GluonTS supports [Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) files
+using [``PyArrow``](https://arrow.apache.org/docs/python/index.html).
+
+Further, [Arrow's custom data formats](https://arrow.apache.org/docs/python/ipc.html)
+are also supported.
+
+To utilise these, either install the ``pyarrow`` package or use the
+``arrow``-extra:
+
+```sh
+pip install gluonts[arrow]
+```
+
+### Other
+
+#### Shell
+
+The ``shell`` module offers integration with Amazon SageMaker and is available
+through:
+
+```sh
+pip install gluonts[shell]
+```
diff --git a/docs/getting_started/install.rst b/docs/getting_started/install.rst
deleted file mode 100644
index bc119ebdc1..0000000000
--- a/docs/getting_started/install.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-Installation
-============
-
-GluonTS relies on the recent version of MXNet. The easiest way to install MXNet
-is through `pip `_. The following
-command installs the latest version of MXNet.
-
-.. code-block:: console
-
-    pip install --upgrade mxnet~=1.7
-
-.. note::
-
-    There are other pre-build MXNet packages that enable GPU supports and
-    accelerate CPU performance, please refer to `this page
-    `_ for details. Some
-    training scripts are recommended to run on GPUs, if you don't have a GPU
-    machine at hand, you may consider `running on AWS
-    `_.
-
-
-After installing MXNet, you can install the GluonTS toolkit by
-
-.. code-block:: console
-
-    pip install gluonts
-
-
-Install from Dev Branch
------------------------
-
-If you are interested in trying out features on dev branch that hasn't been released yet, you have
-the option of installing from dev branch directly.
-
-
-Install from GitHub
--------------------
-
-Use the following command to automatically download and install the current code on dev branch:
-
-.. code-block:: console
-
-    pip install git+https://github.com/awslabs/gluon-ts.git
-
-
-Install from Source Code
-------------------------
-
-You can also first check out the code locally using Git:
-
-.. code-block:: console
-
-    git clone https://github.com/awslabs/gluon-ts
-    cd gluon-ts
-
-then use the provided `setup.py` to install into site-packages:
-
-.. code-block:: console
-
-    python setup.py install
-
-
-.. note::
-
-    You may need to use `sudo` in case you run into permission denied error.
-
-
-Alternatively, you can set up the package with development mode, so that local changes are
-immediately reflected in the installed python package
-
-.. code-block:: console
-
-    python setup.py develop
-
-.. note::
-
-    The dev branch may rely on MXNet nightly builds which are available on PyPI,
-    please refer to `this page `_ for installation guide.
diff --git a/docs/md2ipynb.py b/docs/md2ipynb.py
index 8ee425eee1..0186dd5745 100644
--- a/docs/md2ipynb.py
+++ b/docs/md2ipynb.py
@@ -1,60 +1,115 @@
-import argparse
+import json
+import re
 import sys
 import time
+import os
 from itertools import chain
 from pathlib import Path
 
+import black
+import click
+import jinja2
 import nbformat
 import notedown
 from nbclient import NotebookClient
+from jinja2 import Environment
 
 
-def convert(path, kernel_name=None, timeout=40 * 60):
-    with path.open() as in_file:
-        notebook = notedown.MarkdownReader().read(in_file)
+env = Environment()
 
-    print(f"=== {path.name} ", end="")
-    sys.stdout.flush()
-    start = time.time()
 
+def check_github_event(default):
+    if "GITHUB_EVENT_PATH" not in os.environ:
+        return default
+
+    with open(os.environ["GITHUB_EVENT_PATH"]) as infile:
+        event = json.load(infile)
+
+    if "pull_request" in event:
+        for label in event["pull_request"]["labels"]:
+            if label["name"] == "pr:docs-build-notebook":
+                return default
+
+        return "skip"
+
+    return default
+
+
+def run_notebook(text, kernel_name, timeout) -> str:
+    notebook = notedown.MarkdownReader().reads(text)
+
+    kwargs = {}
+    if kernel_name is not None:
+        kwargs["kernel_name"] = kernel_name
 
     client = NotebookClient(
         notebook,
-        timeout=600,
-        kernel_name=kernel_name,
+        timeout=timeout,
         resources={"metadata": {"path": "."}},
+        **kwargs,
     )
 
     client.execute()
-    print(f"finished evaluation in {time.time() - start} sec")
 
     # need to add language info to for syntax highlight
     notebook["metadata"].update(language_info={"name": "python"})
 
+    return nbformat.writes(notebook)
+
+
+def black_cells(text):
+    CODE_RE = r"```py(?:thon)?\s*\n(.*?)```"
+
+    text = re.sub(r"^%", r"#%#", text, flags=re.M)
+
+    def apply_black(match):
+        code = match.group(1)
+
+        formatted = black.format_str(code, mode=black.Mode())
+
+        return "\n".join(["```", formatted.rstrip(), "```"])
+
+    formatted = re.sub(CODE_RE, apply_black, text, flags=re.S)
+    return re.sub(r"^#%#", r"%", formatted, flags=re.M)
+
+
+def convert(path, mode, kernel_name=None, timeout=40 * 60):
+    print(f"=== {path.name} ", end="")
+    sys.stdout.flush()
+
+    with path.open() as in_file:
+        template = env.from_string(in_file.read())
+
+    markdown = template.render(mode=mode)
+    markdown = black_cells(markdown)
+
+    if mode != "skip":
+        suffix = ".ipynb"
+        start = time.time()
+        output = run_notebook(markdown, kernel_name, timeout)
+        print(f"finished evaluation in {time.time() - start} sec")
+    else:
+        suffix = ".md"
+        print(f"convert to {suffix}")
+        output = markdown
+
     # XXX.md.input -> XXX.ipynb
     # `with_suffix` only operates on last suffix, so we need some more involved
     # logic.
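     # e.g. "quick_start_tutorial.md.template" -> stem "quick_start_tutorial",
     # which then receives the ".ipynb" (or ".md") suffix chosen above.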
     stem = path.name.split(".", 1)[0]
-    nbformat.write(notebook, path.with_name(stem).with_suffix(".ipynb"))
+    with path.with_name(stem).with_suffix(suffix).open("w") as outfile:
+        outfile.write(output)
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-k",
-        "--kernel",
-        dest="kernel_name",
-        default=None,
-        help="name of ipython kernel to use",
-    )
-    parser.add_argument(
-        "files", type=str, nargs="+", help="path to files to convert"
-    )
+@click.command()
+@click.argument("paths", type=click.Path(), nargs=-1)
+@click.option("--kernel", "-k", help="Name of iPython kernel to use.")
+@click.option("--mode", "-m", default="release")
+def cli(paths, kernel, mode):
+    mode = check_github_event(mode)
 
-    args = parser.parse_args()
+    for file in map(Path, paths):
+        convert(file, kernel_name=kernel, mode=mode)
 
-    here = Path(".")
-    files = list(chain.from_iterable(map(here.glob, args.files)))
 
-    for file in files:
-        convert(file, kernel_name=args.kernel_name)
+if __name__ == "__main__":
+    cli()
diff --git a/docs/tutorials/advanced_topics/howto_pytorch_lightning.md.input b/docs/tutorials/advanced_topics/howto_pytorch_lightning.md.template
similarity index 100%
rename from docs/tutorials/advanced_topics/howto_pytorch_lightning.md.input
rename to docs/tutorials/advanced_topics/howto_pytorch_lightning.md.template
diff --git a/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.input b/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template
similarity index 99%
rename from docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.input
rename to docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template
index 2b7c5d2b1e..8f1660b231 100644
--- a/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.input
+++ b/docs/tutorials/advanced_topics/hp_tuning_with_optuna.md.template
@@ -8,8 +8,6 @@ In this notebook we will see how to tune the hyperparameters of a GlutonTS model
 
 
 ```python
-import mxnet as mx
-from mxnet import gluon
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
diff --git a/docs/tutorials/advanced_topics/trainer_callbacks.md.input b/docs/tutorials/advanced_topics/trainer_callbacks.md.template
similarity index 100%
rename from docs/tutorials/advanced_topics/trainer_callbacks.md.input
rename to docs/tutorials/advanced_topics/trainer_callbacks.md.template
diff --git a/docs/tutorials/data_manipulation/pandasdataframes.md.input b/docs/tutorials/data_manipulation/pandasdataframes.md.template
similarity index 100%
rename from docs/tutorials/data_manipulation/pandasdataframes.md.input
rename to docs/tutorials/data_manipulation/pandasdataframes.md.template
diff --git a/docs/tutorials/data_manipulation/synthetic_data_generation.md.input b/docs/tutorials/data_manipulation/synthetic_data_generation.md.template
similarity index 100%
rename from docs/tutorials/data_manipulation/synthetic_data_generation.md.input
rename to docs/tutorials/data_manipulation/synthetic_data_generation.md.template
diff --git a/docs/tutorials/forecasting/extended_tutorial.md.input b/docs/tutorials/forecasting/extended_tutorial.md.template
similarity index 100%
rename from docs/tutorials/forecasting/extended_tutorial.md.input
rename to docs/tutorials/forecasting/extended_tutorial.md.template
diff --git a/docs/tutorials/forecasting/quick_start_tutorial.md.input b/docs/tutorials/forecasting/quick_start_tutorial.md.template
similarity index 100%
rename from docs/tutorials/forecasting/quick_start_tutorial.md.input
rename to docs/tutorials/forecasting/quick_start_tutorial.md.template
diff --git a/requirements/requirements-docs.txt b/requirements/requirements-docs.txt
index 8022ccc46b..4f3a32913d 100644
--- a/requirements/requirements-docs.txt
+++ b/requirements/requirements-docs.txt
@@ -11,3 +11,6 @@ optuna~=2.10
 furo==2022.6.4.1
 m2r2
 myst-parser
+click
+orjson
+black
diff --git a/src/gluonts/dataset/__init__.py b/src/gluonts/dataset/__init__.py
index 6be7faaa76..d514686512 100644
--- a/src/gluonts/dataset/__init__.py
+++ b/src/gluonts/dataset/__init__.py
@@ -34,7 +34,7 @@ def __len__(self) -> int:
 
 
 @dataclass
-class DatasetCollection(Dataset):
+class DatasetCollection:
     """Flattened access to a collection of datasets."""
 
     datasets: List[Dataset]
diff --git a/src/gluonts/dataset/pandas.py b/src/gluonts/dataset/pandas.py
index f13049932b..72ecbefc65 100644
--- a/src/gluonts/dataset/pandas.py
+++ b/src/gluonts/dataset/pandas.py
@@ -19,12 +19,12 @@
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
 from toolz import valmap
 
-from gluonts.dataset.common import Dataset, DataEntry, ProcessDataEntry
+from gluonts.dataset.common import DataEntry, ProcessDataEntry
 from gluonts.dataset.field_names import FieldName
 
 
 @dataclass
-class PandasDataset(Dataset):
+class PandasDataset:
     """
     A pandas.DataFrame-based dataset type.