diff --git a/databricks-iris/.gitignore b/databricks-iris/.gitignore new file mode 100644 index 00000000..89ff27bd --- /dev/null +++ b/databricks-iris/.gitignore @@ -0,0 +1,155 @@ +########################## +# KEDRO PROJECT + +# ignore all local configuration +conf/local/** +!conf/local/.gitkeep +.telemetry + +# ignore potentially sensitive credentials files +conf/**/*credentials* + +# ignore everything in the following folders +data/** +logs/** + +# except their sub-folders +!data/**/ +!logs/**/ + +# also keep all .gitkeep files +!.gitkeep + +# also keep the example dataset +!data/01_raw/iris.csv + + +########################## +# Common files + +# IntelliJ +.idea/ +*.iml +out/ +.idea_modules/ + +### macOS +*.DS_Store +.AppleDouble +.LSOverride +.Trashes + +# Vim +*~ +.*.swo +.*.swp + +# emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc + +# JIRA plugin +atlassian-ide-plugin.xml + +# C extensions +*.so + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/databricks-iris/README.md b/databricks-iris/README.md new file mode 100644 index 00000000..caaa2e81 --- /dev/null +++ b/databricks-iris/README.md @@ -0,0 +1,59 @@ +# The `databricks-iris` Kedro starter + +## Introduction + +The code in this repository demonstrates best practice when working with Kedro and PySpark on Databricks. It contains a Kedro starter template with some initial configuration and an example pipeline, it accompanies the documentation on [developing and deploying Kedro projects on Databricks](https://docs.kedro.org/en/stable/integrations/index.html#databricks-integration). + +This repository is a fork of the `pyspark-iris` starter that has been modified to run natively on Databricks. + +## Getting started + +The starter template can be used to start a new project using the [`starter` option](https://docs.kedro.org/en/stable/kedro_project_setup/starters.html) in `kedro new`: + +```bash +kedro new --starter=databricks-iris +``` + +## Features + +### Configuration for Databricks in `conf/base` + +This starter has a base configuration that allows it to run natively on Databricks. 
Directories to store data and logs still need to be manually created in the user's Databricks DBFS instance: + +```bash +/dbfs/FileStore/iris-databricks/data +/dbfs/FileStore/iris-databricks/logs +``` + +See the documentation on deploying a packaged Kedro project to Databricks for more information. + +### Databricks entry point + +The starter contains a script and an entry point (`databricks_run.py`) that enables a packaged project created with this starter to run on Databricks. See the documentation on deploying a packaged Kedro project to Databricks for more information. + +### Single configuration in `/conf/base/spark.yml` + +While Spark allows you to specify many different [configuration options](https://spark.apache.org/docs/latest/configuration.html), this starter uses `/conf/base/spark.yml` as a single configuration location. + +### `SparkSession` initialisation + +This Kedro starter contains the initialisation code for `SparkSession` in the `ProjectContext` and takes its configuration from `/conf/base/spark.yml`. Modify this code if you want to further customise your `SparkSession`, e.g. to use [YARN](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html). + +### Configures `MemoryDataSet` to work with Spark objects + +Out of the box, Kedro's `MemoryDataSet` works with Spark's `DataFrame`. However, it doesn't work with other Spark objects such as machine learning models unless you add further configuration. This Kedro starter demonstrates how to configure `MemoryDataSet` for Spark's machine learning model in the `catalog.yml`. + +> Note: The use of `MemoryDataSet` is encouraged to propagate Spark's `DataFrame` between nodes in the pipeline. A best practice is to delay triggering Spark actions for as long as needed to take advantage of Spark's lazy evaluation. + +### An example machine learning pipeline that uses only `PySpark` and `Kedro` + +![Iris Pipeline Visualisation](./images/iris_pipeline.png) + +This Kedro starter uses the simple and familiar [Iris dataset](https://www.kaggle.com/uciml/iris). It contains the code for an example machine learning pipeline that runs a 1-nearest neighbour classifier to classify an iris. +[Transcoding](https://kedro.readthedocs.io/en/stable/data/data_catalog.html#transcoding-datasets) is used to convert the Spark Dataframes into pandas DataFrames after splitting the data into training and testing sets. + +The pipeline includes: + +* A node to split the data into training dataset and testing dataset using a configurable ratio +* A node to run a simple 1-nearest neighbour classifier and make predictions +* A node to report the accuracy of the predictions performed by the model diff --git a/databricks-iris/cookiecutter.json b/databricks-iris/cookiecutter.json new file mode 100644 index 00000000..910abee6 --- /dev/null +++ b/databricks-iris/cookiecutter.json @@ -0,0 +1,6 @@ +{ + "project_name": "Iris", + "repo_name": "{{ cookiecutter.project_name.strip().replace(' ', '-').replace('_', '-').lower() }}", + "python_package": "{{ cookiecutter.project_name.strip().replace(' ', '_').replace('-', '_').lower() }}", + "kedro_version": "{{ cookiecutter.kedro_version }}" +} diff --git a/databricks-iris/credentials.yml b/databricks-iris/credentials.yml new file mode 100644 index 00000000..3cbffcd7 --- /dev/null +++ b/databricks-iris/credentials.yml @@ -0,0 +1,18 @@ +# Here you can define credentials for different data sets and environment. +# +# THIS FILE MUST BE PLACED IN `conf/local`. DO NOT PUSH THIS FILE TO GitHub. 
+# +# Example: +# +# dev_s3: +# client_kwargs: +# aws_access_key_id: token +# aws_secret_access_key: key +# +# prod_s3: +# aws_access_key_id: token +# aws_secret_access_key: key +# +# dev_sql: +# username: admin +# password: admin diff --git a/databricks-iris/images/iris_pipeline.png b/databricks-iris/images/iris_pipeline.png new file mode 100644 index 00000000..65e832cf Binary files /dev/null and b/databricks-iris/images/iris_pipeline.png differ diff --git a/databricks-iris/prompts.yml b/databricks-iris/prompts.yml new file mode 100644 index 00000000..7e4bf62f --- /dev/null +++ b/databricks-iris/prompts.yml @@ -0,0 +1,9 @@ +project_name: + title: "Project Name" + text: | + Please enter a human readable name for your new project. + Spaces, hyphens, and underscores are allowed. + regex_validator: "^[\\w -]{2,}$" + error_message: | + It must contain only alphanumeric symbols, spaces, underscores and hyphens and + be at least 2 characters long. diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/.gitignore b/databricks-iris/{{ cookiecutter.repo_name }}/.gitignore new file mode 100644 index 00000000..f2b2bd0e --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/.gitignore @@ -0,0 +1,152 @@ +########################## +# KEDRO PROJECT + +# ignore all local configuration +conf/local/** +!conf/local/.gitkeep + +# ignore potentially sensitive credentials files +conf/**/*credentials* + +# ignore everything in the following folders +data/** + +# except their sub-folders +!data/**/ + +# also keep all .gitkeep files +!.gitkeep + +# also keep the example dataset +!data/01_raw/*.csv + + +########################## +# Common files + +# IntelliJ +.idea/ +*.iml +out/ +.idea_modules/ + +### macOS +*.DS_Store +.AppleDouble +.LSOverride +.Trashes + +# Vim +*~ +.*.swo +.*.swp + +# emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc + +# JIRA plugin +atlassian-ide-plugin.xml + +# C extensions +*.so + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/README.md b/databricks-iris/{{ cookiecutter.repo_name }}/README.md new file mode 100644 index 00000000..5a07de4f --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/README.md @@ -0,0 +1,122 @@ +# {{ cookiecutter.project_name }} + +## Overview + +This is your new Kedro project, which was generated using `Kedro {{ cookiecutter.kedro_version }}`. + +Take a look at the [Kedro documentation](https://kedro.readthedocs.io) to get started. + +## Rules and guidelines + +In order to get the best out of the template: + +* Don't remove any lines from the `.gitignore` file we provide +* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/faq/faq.html#what-is-data-engineering-convention) +* Don't commit data to your repository +* Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/` + +## How to install dependencies + +Declare any dependencies in `src/requirements.txt` for `pip` installation and `src/environment.yml` for `conda` installation. + +To install them, run: + +``` +pip install -r src/requirements.txt +``` + +## How to run your Kedro pipeline + +You can run your Kedro project with: + +``` +kedro run +``` + +## How to test your Kedro project + +Have a look at the file `src/tests/test_run.py` for instructions on how to write your tests. You can run your tests as follows: + +``` +kedro test +``` + +To configure the coverage threshold, go to the `.coveragerc` file. + +## Project dependencies + +To generate or update the dependency requirements for your project: + +``` +kedro build-reqs +``` + +This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. + +After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. + +[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) + +## How to work with Kedro and notebooks + +> Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `catalog`, `context`, `pipelines` and `session`. +> +> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r src/requirements.txt` you will not need to take any extra steps before you use them. 
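As a purely illustrative sketch (not part of the generated template), this is how the pre-loaded `catalog`, `pipelines` and `session` variables might be used from a `kedro ipython` session; the dataset name `example_iris_data` comes from this starter's catalog, and `__default__` is the pipeline name Kedro registers by default.

```python
# Available automatically inside `kedro ipython` / `kedro jupyter`:
# `catalog`, `context`, `pipelines` and `session`.

# List every dataset registered in conf/base/catalog.yml.
print(catalog.list())

# Load the raw iris data as a Spark DataFrame and preview a few rows.
iris_df = catalog.load("example_iris_data")
iris_df.show(5)

# Inspect the nodes that make up the default pipeline.
print([node.name for node in pipelines["__default__"].nodes])

# Run the default pipeline through the interactive session.
session.run(pipeline_name="__default__")
```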
+
+### Jupyter
+To use Jupyter notebooks in your Kedro project, you need to install Jupyter:
+
+```
+pip install jupyter
+```
+
+After installing Jupyter, you can start a local notebook server:
+
+```
+kedro jupyter notebook
+```
+
+### JupyterLab
+To use JupyterLab, you need to install it:
+
+```
+pip install jupyterlab
+```
+
+You can also start JupyterLab:
+
+```
+kedro jupyter lab
+```
+
+### IPython
+And if you want to run an IPython session:
+
+```
+kedro ipython
+```
+
+### How to convert notebook cells to nodes in a Kedro project
+You can move notebook code over into a Kedro project structure using a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#release-5-0-0) and Kedro CLI commands.
+
+By adding the `node` tag to a cell and running the command below, the cell's source code will be copied over to a Python file within `src/<package_name>/nodes/`:
+
+```
+kedro jupyter convert <filepath_to_my_notebook>
+```
+> *Note:* The name of the Python file matches the name of the original notebook.
+
+Alternatively, you may want to transform all your notebooks in one go. Run the following command to convert all notebook files found in the project root directory and under any of its sub-folders:
+
+```
+kedro jupyter convert --all
+```
+
+### How to ignore notebook output cells in `git`
+To automatically strip out all output cell contents before committing to `git`, you can run `kedro activate-nbstripout`. This will add a hook in `.git/config` which will run `nbstripout` before anything is committed to `git`.
+
+> *Note:* Your output cells will be retained locally.
+
+## Package your Kedro project
+
+[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/tutorial/package_a_project.html)
diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/conf/README.md b/databricks-iris/{{ cookiecutter.repo_name }}/conf/README.md
new file mode 100644
index 00000000..a6a80a42
--- /dev/null
+++ b/databricks-iris/{{ cookiecutter.repo_name }}/conf/README.md
@@ -0,0 +1,26 @@
+# What is this for?
+
+This folder should be used to store configuration files used by Kedro or by separate tools.
+
+This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the [Instructions](#Instructions) section.
+
+## Local configuration
+
+The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys).
+
+> *Note:* Please do not check in any local configuration to version control.
+
+## Base configuration
+
+The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members.
+
+WARNING: Please do not put access credentials in the base configuration folder.
+
+## Instructions
+
+
+
+
+
+## Find out more
+You can find out more about configuration from the [user guide documentation](https://kedro.readthedocs.io/en/stable/user_guide/configuration.html).
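To make the `base`/`local` split concrete, here is a minimal, hypothetical sketch of reading configuration programmatically with the same `ConfigLoader` API this starter already uses in its hooks and tests; the `dev_s3` key refers to the commented-out example in `credentials.yml` and is shown only for illustration.

```python
from pathlib import Path

from kedro.config import ConfigLoader

# Point the loader at the project's conf/ folder; entries in conf/local/ override conf/base/.
config_loader = ConfigLoader(conf_source=str(Path.cwd() / "conf"), env="local")

# Collect every credentials file, e.g. conf/local/credentials.yml (never committed to git).
credentials = config_loader.get("credentials*", "credentials*/**")

# `dev_s3` is the commented-out example entry from credentials.yml.
print(credentials.get("dev_s3", {}))
```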
diff --git a/pyspark-iris/{{ cookiecutter.repo_name }}/conf/databricks/catalog.yml b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/catalog.yml similarity index 79% rename from pyspark-iris/{{ cookiecutter.repo_name }}/conf/databricks/catalog.yml rename to databricks-iris/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index 3307e014..061d16e3 100644 --- a/pyspark-iris/{{ cookiecutter.repo_name }}/conf/databricks/catalog.yml +++ b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -56,43 +56,43 @@ example_iris_data: # for all SparkDataSets. X_train@pyspark: type: spark.SparkDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/X_train.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/X_train.parquet save_args: mode: overwrite X_train@pandas: type: pandas.ParquetDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/X_train.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/X_train.parquet X_test@pyspark: type: spark.SparkDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/X_test.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/X_test.parquet save_args: mode: overwrite X_test@pandas: type: pandas.ParquetDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/X_test.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/X_test.parquet y_train@pyspark: type: spark.SparkDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/y_train.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/y_train.parquet save_args: mode: overwrite y_train@pandas: type: pandas.ParquetDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/y_train.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/y_train.parquet y_test@pyspark: type: spark.SparkDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/y_test.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/y_test.parquet save_args: mode: overwrite y_test@pandas: type: pandas.ParquetDataSet - filepath: /dbfs/root/projects/iris-databricks/data/02_intermediate/y_test.parquet + filepath: /dbfs/FileStore/iris-databricks/data/02_intermediate/y_test.parquet # This is an example how to use `MemoryDataSet` with Spark objects that aren't `DataFrame`'s. # In particular, the `assign` copy mode ensures that the `MemoryDataSet` will be assigned diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/logging.yml b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/logging.yml new file mode 100644 index 00000000..5b63e37e --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/logging.yml @@ -0,0 +1,32 @@ +version: 1 + +disable_existing_loggers: False + +formatters: + simple: + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +handlers: + info_file_handler: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: simple + filename: /dbfs/FileStore/iris-databricks/info.log + maxBytes: 10485760 # 10MB + backupCount: 20 + encoding: utf8 + delay: True + +loggers: + kedro: + level: INFO + + iris_databricks: + level: INFO + +root: + # File-based logging is disabled by default so that the starter works immediately on Databricks Repos. + # See https://docs.kedro.org/en/stable/logging/logging.html#disable-file-based-logging for more. 
+ # To enable file-based logging, change the below line to: + # handlers: [rich, info_file_handler] + handlers: [info_file_handler] diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/parameters.yml b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/parameters.yml new file mode 100644 index 00000000..48c3cd72 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/parameters.yml @@ -0,0 +1,3 @@ +train_fraction: 0.8 +random_state: 3 +target_column: species diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/spark.yml b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/spark.yml new file mode 100644 index 00000000..ab831b62 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/conf/base/spark.yml @@ -0,0 +1,8 @@ +# You can define spark specific configuration here. + +spark.driver.maxResultSize: 3g +spark.hadoop.fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem +spark.sql.execution.arrow.pyspark.enabled: true + +# https://docs.kedro.org/en/stable/integrations/pyspark_integration.html#tips-for-maximising-concurrency-using-threadrunner +spark.scheduler.mode: FAIR diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/conf/local/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/conf/local/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/01_raw/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/01_raw/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/01_raw/iris.csv b/databricks-iris/{{ cookiecutter.repo_name }}/data/01_raw/iris.csv new file mode 100644 index 00000000..ba0ebd24 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/data/01_raw/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,0.2,setosa +4.9,3.0,1.4,0.2,setosa +4.7,3.2,1.3,0.2,setosa +4.6,3.1,1.5,0.2,setosa +5.0,3.6,1.4,0.2,setosa +5.4,3.9,1.7,0.4,setosa +4.6,3.4,1.4,0.3,setosa +5.0,3.4,1.5,0.2,setosa +4.4,2.9,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.4,3.7,1.5,0.2,setosa +4.8,3.4,1.6,0.2,setosa +4.8,3.0,1.4,0.1,setosa +4.3,3.0,1.1,0.1,setosa +5.8,4.0,1.2,0.2,setosa +5.7,4.4,1.5,0.4,setosa +5.4,3.9,1.3,0.4,setosa +5.1,3.5,1.4,0.3,setosa +5.7,3.8,1.7,0.3,setosa +5.1,3.8,1.5,0.3,setosa +5.4,3.4,1.7,0.2,setosa +5.1,3.7,1.5,0.4,setosa +4.6,3.6,1.0,0.2,setosa +5.1,3.3,1.7,0.5,setosa +4.8,3.4,1.9,0.2,setosa +5.0,3.0,1.6,0.2,setosa +5.0,3.4,1.6,0.4,setosa +5.2,3.5,1.5,0.2,setosa +5.2,3.4,1.4,0.2,setosa +4.7,3.2,1.6,0.2,setosa +4.8,3.1,1.6,0.2,setosa +5.4,3.4,1.5,0.4,setosa +5.2,4.1,1.5,0.1,setosa +5.5,4.2,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.0,3.2,1.2,0.2,setosa +5.5,3.5,1.3,0.2,setosa +4.9,3.1,1.5,0.1,setosa +4.4,3.0,1.3,0.2,setosa +5.1,3.4,1.5,0.2,setosa +5.0,3.5,1.3,0.3,setosa +4.5,2.3,1.3,0.3,setosa +4.4,3.2,1.3,0.2,setosa +5.0,3.5,1.6,0.6,setosa +5.1,3.8,1.9,0.4,setosa +4.8,3.0,1.4,0.3,setosa +5.1,3.8,1.6,0.2,setosa +4.6,3.2,1.4,0.2,setosa +5.3,3.7,1.5,0.2,setosa +5.0,3.3,1.4,0.2,setosa +7.0,3.2,4.7,1.4,versicolor +6.4,3.2,4.5,1.5,versicolor +6.9,3.1,4.9,1.5,versicolor +5.5,2.3,4.0,1.3,versicolor +6.5,2.8,4.6,1.5,versicolor +5.7,2.8,4.5,1.3,versicolor +6.3,3.3,4.7,1.6,versicolor +4.9,2.4,3.3,1.0,versicolor +6.6,2.9,4.6,1.3,versicolor +5.2,2.7,3.9,1.4,versicolor +5.0,2.0,3.5,1.0,versicolor +5.9,3.0,4.2,1.5,versicolor +6.0,2.2,4.0,1.0,versicolor +6.1,2.9,4.7,1.4,versicolor +5.6,2.9,3.6,1.3,versicolor +6.7,3.1,4.4,1.4,versicolor 
+5.6,3.0,4.5,1.5,versicolor +5.8,2.7,4.1,1.0,versicolor +6.2,2.2,4.5,1.5,versicolor +5.6,2.5,3.9,1.1,versicolor +5.9,3.2,4.8,1.8,versicolor +6.1,2.8,4.0,1.3,versicolor +6.3,2.5,4.9,1.5,versicolor +6.1,2.8,4.7,1.2,versicolor +6.4,2.9,4.3,1.3,versicolor +6.6,3.0,4.4,1.4,versicolor +6.8,2.8,4.8,1.4,versicolor +6.7,3.0,5.0,1.7,versicolor +6.0,2.9,4.5,1.5,versicolor +5.7,2.6,3.5,1.0,versicolor +5.5,2.4,3.8,1.1,versicolor +5.5,2.4,3.7,1.0,versicolor +5.8,2.7,3.9,1.2,versicolor +6.0,2.7,5.1,1.6,versicolor +5.4,3.0,4.5,1.5,versicolor +6.0,3.4,4.5,1.6,versicolor +6.7,3.1,4.7,1.5,versicolor +6.3,2.3,4.4,1.3,versicolor +5.6,3.0,4.1,1.3,versicolor +5.5,2.5,4.0,1.3,versicolor +5.5,2.6,4.4,1.2,versicolor +6.1,3.0,4.6,1.4,versicolor +5.8,2.6,4.0,1.2,versicolor +5.0,2.3,3.3,1.0,versicolor +5.6,2.7,4.2,1.3,versicolor +5.7,3.0,4.2,1.2,versicolor +5.7,2.9,4.2,1.3,versicolor +6.2,2.9,4.3,1.3,versicolor +5.1,2.5,3.0,1.1,versicolor +5.7,2.8,4.1,1.3,versicolor +6.3,3.3,6.0,2.5,virginica +5.8,2.7,5.1,1.9,virginica +7.1,3.0,5.9,2.1,virginica +6.3,2.9,5.6,1.8,virginica +6.5,3.0,5.8,2.2,virginica +7.6,3.0,6.6,2.1,virginica +4.9,2.5,4.5,1.7,virginica +7.3,2.9,6.3,1.8,virginica +6.7,2.5,5.8,1.8,virginica +7.2,3.6,6.1,2.5,virginica +6.5,3.2,5.1,2.0,virginica +6.4,2.7,5.3,1.9,virginica +6.8,3.0,5.5,2.1,virginica +5.7,2.5,5.0,2.0,virginica +5.8,2.8,5.1,2.4,virginica +6.4,3.2,5.3,2.3,virginica +6.5,3.0,5.5,1.8,virginica +7.7,3.8,6.7,2.2,virginica +7.7,2.6,6.9,2.3,virginica +6.0,2.2,5.0,1.5,virginica +6.9,3.2,5.7,2.3,virginica +5.6,2.8,4.9,2.0,virginica +7.7,2.8,6.7,2.0,virginica +6.3,2.7,4.9,1.8,virginica +6.7,3.3,5.7,2.1,virginica +7.2,3.2,6.0,1.8,virginica +6.2,2.8,4.8,1.8,virginica +6.1,3.0,4.9,1.8,virginica +6.4,2.8,5.6,2.1,virginica +7.2,3.0,5.8,1.6,virginica +7.4,2.8,6.1,1.9,virginica +7.9,3.8,6.4,2.0,virginica +6.4,2.8,5.6,2.2,virginica +6.3,2.8,5.1,1.5,virginica +6.1,2.6,5.6,1.4,virginica +7.7,3.0,6.1,2.3,virginica +6.3,3.4,5.6,2.4,virginica +6.4,3.1,5.5,1.8,virginica +6.0,3.0,4.8,1.8,virginica +6.9,3.1,5.4,2.1,virginica +6.7,3.1,5.6,2.4,virginica +6.9,3.1,5.1,2.3,virginica +5.8,2.7,5.1,1.9,virginica +6.8,3.2,5.9,2.3,virginica +6.7,3.3,5.7,2.5,virginica +6.7,3.0,5.2,2.3,virginica +6.3,2.5,5.0,1.9,virginica +6.5,3.0,5.2,2.0,virginica +6.2,3.4,5.4,2.3,virginica +5.9,3.0,5.1,1.8,virginica diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/02_intermediate/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/02_intermediate/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/03_primary/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/03_primary/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/04_feature/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/04_feature/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/05_model_input/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/05_model_input/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/06_models/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/06_models/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/data/07_model_output/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/07_model_output/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git 
a/databricks-iris/{{ cookiecutter.repo_name }}/data/08_reporting/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/data/08_reporting/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/docs/source/conf.py b/databricks-iris/{{ cookiecutter.repo_name }}/docs/source/conf.py new file mode 100644 index 00000000..740fa356 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/docs/source/conf.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 + + +# {{ cookiecutter.python_package }} documentation build +# configuration file, created by sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import re + +from kedro.framework.cli.utils import find_stylesheets + +from {{ cookiecutter.python_package }} import __version__ as release + +# -- Project information ----------------------------------------------------- + +project = "{{ cookiecutter.python_package }}" +author = "Kedro" + +# The short X.Y version. +version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx.ext.doctest", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.mathjax", + "nbsphinx", + "sphinx_copybutton", + "myst_parser", +] + +# enable autosummary plugin (table of contents for modules/classes/class +# methods) +autosummary_generate = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = ["_build", "**.ipynb_checkpoints"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = {"collapse_navigation": False, "style_external_links": True} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +html_show_sourcelink = False + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "{{ cookiecutter.python_package }}doc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ( + master_doc, + "{{ cookiecutter.python_package }}.tex", + "{{ cookiecutter.python_package }} Documentation", + "Kedro", + "manual", + ) +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ( + master_doc, + "{{ cookiecutter.python_package }}", + "{{ cookiecutter.python_package }} Documentation", + [author], + 1, + ) +] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "{{ cookiecutter.python_package }}", + "{{ cookiecutter.python_package }} Documentation", + author, + "{{ cookiecutter.python_package }}", + "Project {{ cookiecutter.python_package }} codebase.", + "Nearest-Neighbour", + ) +] + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + +# -- Extension configuration ------------------------------------------------- + +# nbsphinx_prolog = """ +# see here for prolog/epilog details: +# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html +# """ + +# -- NBconvert kernel config ------------------------------------------------- +nbsphinx_kernel_name = "python3" + + +def remove_arrows_in_examples(lines): + for i, line in enumerate(lines): + lines[i] = line.replace(">>>", "") + + +def autodoc_process_docstring(app, what, name, obj, options, lines): + remove_arrows_in_examples(lines) + + +def skip(app, what, name, obj, skip, options): + if name == "__init__": + return False + return skip + + +def setup(app): + app.connect("autodoc-process-docstring", autodoc_process_docstring) + app.connect("autodoc-skip-member", skip) + # add Kedro stylesheets + for stylesheet in find_stylesheets(): + app.add_css_file(stylesheet) + # enable rendering RST tables in Markdown diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/docs/source/index.rst b/databricks-iris/{{ cookiecutter.repo_name }}/docs/source/index.rst new file mode 100644 index 00000000..06252d32 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/docs/source/index.rst @@ -0,0 +1,19 @@ +.. {{ cookiecutter.python_package }} documentation master file, created by sphinx-quickstart. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to project {{ cookiecutter.python_package }}'s API docs! +============================================= + +.. toctree:: + :maxdepth: 4 + + modules + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/notebooks/.gitkeep b/databricks-iris/{{ cookiecutter.repo_name }}/notebooks/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/pyproject.toml b/databricks-iris/{{ cookiecutter.repo_name }}/pyproject.toml new file mode 100644 index 00000000..7ae06368 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/pyproject.toml @@ -0,0 +1,17 @@ +[tool.kedro] +package_name = "{{ cookiecutter.python_package }}" +project_name = "{{ cookiecutter.project_name }}" +kedro_init_version = "{{ cookiecutter.kedro_version }}" + +[tool.isort] +profile = "black" + +[tool.pytest.ini_options] +addopts = """ +--cov-report term-missing \ +--cov src/{{ cookiecutter.python_package }} -ra""" + +[tool.coverage.report] +fail_under = 0 +show_missing = true +exclude_lines = ["pragma: no cover", "raise NotImplementedError"] diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/setup.cfg b/databricks-iris/{{ cookiecutter.repo_name }}/setup.cfg new file mode 100644 index 00000000..63ea6730 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/setup.cfg @@ -0,0 +1,3 @@ +[flake8] +max-line-length=88 +extend-ignore=E203 diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/requirements.txt b/databricks-iris/{{ cookiecutter.repo_name }}/src/requirements.txt new file mode 100644 index 00000000..9d822588 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -0,0 +1,16 @@ +black~=22.0 +flake8>=3.7.9, <4.0 +ipython>=7.31.1, <8.0; python_version < '3.8' +ipython~=8.10; python_version >= '3.8' +isort~=5.0 +jupyter~=1.0 +jupyterlab~=3.0 +kedro~={{ cookiecutter.kedro_version }} +kedro-datasets[spark.SparkDataSet, pandas.ParquetDataSet]~=1.0.0 
+kedro-telemetry~=0.2.0 +nbstripout~=0.4 +numpy~=1.21 +pytest-cov~=3.0 +pytest-mock>=1.7.1, <2.0 +pytest~=7.2 + diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/setup.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/setup.py new file mode 100644 index 00000000..ee21a4f4 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/setup.py @@ -0,0 +1,39 @@ +from setuptools import find_packages, setup + +entry_point = ( + "databricks_run = {{ cookiecutter.python_package }}.databricks_run:main" +) + + +# get the dependencies and installs +with open("requirements.txt", encoding="utf-8") as f: + # Make sure we strip all comments and options (e.g "--extra-index-url") + # that arise from a modified pip.conf file that configure global options + # when running kedro build-reqs + requires = [] + for line in f: + req = line.split("#", 1)[0].strip() + if req and not req.startswith("--"): + requires.append(req) + +setup( + name="{{ cookiecutter.python_package }}", + version="0.1", + packages=find_packages(exclude=["tests"]), + entry_points={"console_scripts": [entry_point]}, + install_requires=requires, + extras_require={ + "docs": [ + "docutils<0.18.0", + "sphinx~=3.4.3", + "sphinx_rtd_theme==0.5.1", + "nbsphinx==0.8.1", + "nbstripout~=0.4", + "myst-parser~=0.17.2", + "sphinx-autodoc-typehints==1.11.1", + "sphinx_copybutton==0.3.1", + "ipykernel>=5.3, <7.0", + "Jinja2<3.1.0", + ] + }, +) diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/__init__.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/test_pipeline.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/test_pipeline.py new file mode 100644 index 00000000..2cdf4269 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/test_pipeline.py @@ -0,0 +1,9 @@ +""" +This is a boilerplate test file +generated using Kedro {{ cookiecutter.kedro_version }}. +Please add your pipeline tests here. + +Kedro recommends using `pytest` framework, more info about it can be found +in the official documentation: +https://docs.pytest.org/en/latest/getting-started.html +""" diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/test_run.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/test_run.py new file mode 100644 index 00000000..059aeac8 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/tests/test_run.py @@ -0,0 +1,39 @@ +""" +This module contains an example test. + +Tests should be placed in ``src/tests``, in modules that mirror your +project's structure, and in files named test_*.py. They are simply functions +named ``test_*`` which test a unit of logic. + +To run the tests, run ``kedro test`` from the project root directory. 
+""" + +from pathlib import Path + +import pytest +from kedro.config import ConfigLoader +from kedro.framework.context import KedroContext +from kedro.framework.hooks import _create_hook_manager + + +@pytest.fixture +def config_loader(): + return ConfigLoader(conf_source=str(Path.cwd())) + + +@pytest.fixture +def project_context(config_loader): + return KedroContext( + package_name="{{ cookiecutter.python_package }}", + project_path=Path.cwd(), + config_loader=config_loader, + hook_manager=_create_hook_manager(), + ) + + +# The tests below are here for the demonstration purpose +# and should be replaced with the ones testing the project +# functionality +class TestProjectContext: + def test_project_path(self, project_context): + assert project_context.project_path == Path.cwd() diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/README.md b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/README.md new file mode 100644 index 00000000..01b31808 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/README.md @@ -0,0 +1,72 @@ +# Pipeline + +> *Note:* This is a `README.md` boilerplate generated using `Kedro {{ cookiecutter.kedro_version }}`. + +## Overview + +[Transcoding](https://kedro.readthedocs.io/en/stable/data/data_catalog.html#transcoding-datasets) is used to convert the Spark DataFrames into pandas DataFrames after splitting the data into training and testing sets. + +This pipeline: +1. splits the data into training dataset and testing dataset using a configurable ratio found in `conf/base/parameters.yml` +2. runs a simple 1-nearest neighbour model (`make_prediction` node) and makes prediction dataset +3. reports the model accuracy on a test set (`report_accuracy` node) + +## Pipeline inputs + +### `example_iris_data` + +| | | +| ---- | ------------------ | +| Type | `spark.SparkDataSet` | +| Description | Example iris data containing columns | + + +### `parameters` + +| | | +| ---- | ------------------ | +| Type | `dict` | +| Description | Project parameter dictionary that must contain the following keys: `train_fraction` (the ratio used to determine the train-test split), `random_state` (random generator to ensure train-test split is deterministic) and `target_column` (identify the target column in the dataset) | + + +## Pipeline intermediate outputs + +### `X_train` + +| | | +| ---- | ------------------ | +| Type | `pyspark.sql.DataFrame` | +| Description | DataFrame containing train set features | + +### `y_train` + +| | | +| ---- | ------------------ | +| Type | `pyspark.sql.DataFrame` | +| Description | Series containing train set target | + +### `X_test` + +| | | +| ---- | ------------------ | +| Type | `pyspark.sql.DataFrame` | +| Description | DataFrame containing test set features | + +### `y_test` + +| | | +| ---- | ------------------ | +| Type | `pyspark.sql.DataFrame` | +| Description | Series containing test set target | + +### `y_pred` + +| | | +| ---- | ------------------ | +| Type | `pandas.Series` | +| Description | Predictions from the 1-nearest neighbour model | + + +## Pipeline outputs + +### `None` \ No newline at end of file diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py new file mode 100644 index 00000000..177bba98 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name 
}}/src/{{ cookiecutter.python_package }}/__init__.py @@ -0,0 +1,4 @@ +"""{{ cookiecutter.project_name }} +""" + +__version__ = "0.1" diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/databricks_run.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/databricks_run.py new file mode 100644 index 00000000..fac518fb --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/databricks_run.py @@ -0,0 +1,29 @@ +import argparse +import logging + +from kedro.framework.project import configure_project +from kedro.framework.session import KedroSession + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--env", dest="env", type=str) + parser.add_argument("--conf-source", dest="conf_source", type=str) + parser.add_argument("--package-name", dest="package_name", type=str) + + args = parser.parse_args() + env = args.env + conf_source = args.conf_source + package_name = args.package_name + + # https://kb.databricks.com/notebooks/cmd-c-on-object-id-p0.html + logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR) + logging.getLogger("py4j.py4j.clientserver").setLevel(logging.ERROR) + + configure_project(package_name) + with KedroSession.create(env=env, conf_source=conf_source) as session: + session.run() + + +if __name__ == "__main__": + main() diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py new file mode 100644 index 00000000..39966779 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py @@ -0,0 +1,24 @@ +from kedro.framework.hooks import hook_impl +from pyspark import SparkConf +from pyspark.sql import SparkSession + + +class SparkHooks: + @hook_impl + def after_context_created(self, context) -> None: + """Initialises a SparkSession using the config + defined in project's conf folder. + """ + + # Load the spark configuration in spark.yaml using the config loader + parameters = context.config_loader.get("spark*", "spark*/**") + spark_conf = SparkConf().setAll(parameters.items()) + + # Initialise the spark session + spark_session_conf = ( + SparkSession.builder.appName(context.project_path.name) + .enableHiveSupport() + .config(conf=spark_conf) + ) + _spark_session = spark_session_conf.getOrCreate() + _spark_session.sparkContext.setLogLevel("WARN") diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/nodes.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/nodes.py new file mode 100644 index 00000000..365bd796 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/nodes.py @@ -0,0 +1,73 @@ +""" +This is a boilerplate pipeline +generated using Kedro {{ cookiecutter.kedro_version }} +""" + +import logging +from typing import Dict, Tuple + +import numpy as np +import pandas as pd +from pyspark.sql import DataFrame + + +def split_data(data: DataFrame, parameters: Dict) -> Tuple: + """Splits data into features and targets training and test sets. + + Args: + data: Data containing features and target. + parameters: Parameters defined in parameters.yml. + Returns: + Split data. 
+ """ + + # Split to training and testing data + data_train, data_test = data.randomSplit( + weights=[parameters["train_fraction"], 1 - parameters["train_fraction"]] + ) + + X_train = data_train.drop(parameters["target_column"]) + X_test = data_test.drop(parameters["target_column"]) + y_train = data_train.select(parameters["target_column"]) + y_test = data_test.select(parameters["target_column"]) + + return X_train, X_test, y_train, y_test + + +def make_predictions( + X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.DataFrame +) -> DataFrame: + """Uses 1-nearest neighbour classifier to create predictions. + + Args: + X_train: Training data of features. + y_train: Training data for target. + X_test: Test data for features. + + Returns: + y_pred: Prediction of the target variable. + """ + + X_train_numpy = X_train.to_numpy() + X_test_numpy = X_test.to_numpy() + + squared_distances = np.sum( + (X_train_numpy[:, None, :] - X_test_numpy[None, :, :]) ** 2, axis=-1 + ) + nearest_neighbour = squared_distances.argmin(axis=0) + y_pred = y_train.iloc[nearest_neighbour] + y_pred.index = X_test.index + + return y_pred + + +def report_accuracy(y_pred: pd.Series, y_test: pd.Series): + """Calculates and logs the accuracy. + + Args: + y_pred: Predicted target. + y_test: True target. + """ + accuracy = (y_pred == y_test).sum() / len(y_test) + logger = logging.getLogger(__name__) + logger.info("Model has an accuracy of %.3f on test data.", accuracy) diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline.py new file mode 100644 index 00000000..119c58fa --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline.py @@ -0,0 +1,38 @@ +""" +This is a boilerplate pipeline +generated using Kedro {{ cookiecutter.kedro_version }} +""" + +from kedro.pipeline import Pipeline, node, pipeline + +from .nodes import make_predictions, report_accuracy, split_data + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( + [ + node( + func=split_data, + inputs=["example_iris_data", "parameters"], + outputs=[ + "X_train@pyspark", + "X_test@pyspark", + "y_train@pyspark", + "y_test@pyspark", + ], + name="split", + ), + node( + func=make_predictions, + inputs=["X_train@pandas", "X_test@pandas", "y_train@pandas"], + outputs="y_pred", + name="make_predictions", + ), + node( + func=report_accuracy, + inputs=["y_pred", "y_test@pandas"], + outputs=None, + name="report_accuracy", + ), + ] + ) diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py new file mode 100644 index 00000000..2d4272e3 --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py @@ -0,0 +1,16 @@ +"""Project pipelines.""" +from typing import Dict + +from kedro.framework.project import find_pipelines +from kedro.pipeline import Pipeline + + +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. 
+ """ + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + return pipelines diff --git a/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py new file mode 100644 index 00000000..e5593b1c --- /dev/null +++ b/databricks-iris/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py @@ -0,0 +1,38 @@ +"""Project settings. There is no need to edit this file unless you want to change values +from the Kedro defaults. For further information, including these default values, see +https://kedro.readthedocs.io/en/stable/kedro_project_setup/settings.html.""" + +# Instantiated project hooks. +from {{cookiecutter.python_package}}.hooks import SparkHooks + +HOOKS = (SparkHooks(),) + +# Installed plugins for which to disable hook auto-registration. +# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) + +# Class that manages storing KedroSession data. +# from kedro.framework.session.shelvestore import ShelveStore +# SESSION_STORE_CLASS = ShelveStore +# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. +# SESSION_STORE_ARGS = { +# "path": "./sessions" +# } + +# Class that manages Kedro's library components. +# from kedro.framework.context import KedroContext +# CONTEXT_CLASS = KedroContext + +# Directory that holds configuration. +# CONF_SOURCE = "conf" + +# Class that manages how configuration is loaded. +# from kedro.config import TemplatedConfigLoader +# CONFIG_LOADER_CLASS = TemplatedConfigLoader +# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. +# CONFIG_LOADER_ARGS = { +# "globals_pattern": "*globals.yml", +# } + +# Class that manages the Data Catalog. +# from kedro.io import DataCatalog +# DATA_CATALOG_CLASS = DataCatalog