From 224d1b49b571e9c21c4dae455dd674f7e6e05463 Mon Sep 17 00:00:00 2001 From: Rajiv Sambasivan Date: Mon, 17 Feb 2020 16:54:27 +0530 Subject: [PATCH] Changed pip install from testpypi to pypi. --- .../Arangopipe_Feature_Example_ext1.ipynb | 74 +- examples/Arangopipe_Feature_Examples.ipynb | 1169 ++++++----------- examples/Arangopipe_Feature_ext2.ipynb | 66 +- ...opipe_with_TensorFlow_Beginner_Guide.ipynb | 154 ++- .../Reuse_Old_Arangopipe_Connection.ipynb | 48 +- 5 files changed, 561 insertions(+), 950 deletions(-) diff --git a/examples/Arangopipe_Feature_Example_ext1.ipynb b/examples/Arangopipe_Feature_Example_ext1.ipynb index 693be41..d491fc7 100644 --- a/examples/Arangopipe_Feature_Example_ext1.ipynb +++ b/examples/Arangopipe_Feature_Example_ext1.ipynb @@ -1,12 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, "cells": [ { "cell_type": "markdown", @@ -24,30 +16,30 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "!pip install python-arango\n", - "!pip install -i https://test.pypi.org/simple/ arangopipe\n", + "!pip install arangopipe==0.0.6.8.6\n", "!pip install pandas PyYAML==5.1.1 sklearn2\n", "!pip install jsonpickle\n", "!pip install seaborn\n", "!pip install dtreeviz\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "data_url = \"https://raw.githubusercontent.com/arangoml/arangopipe/arangopipe_examples/examples/data/cal_housing.csv\"\n", "df = pd.read_csv(data_url, error_bad_lines=False)\n", "df.head()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -69,7 +61,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from sklearn import linear_model\n", "df['medianHouseValue'] = 
df['medianHouseValue'].apply(np.log)\n", @@ -98,22 +92,20 @@ " \n", " if index % 100 == 0:\n", " print('Completed estimating %4d points in the dataset' % (index))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "Xm = df[preds].values\n", "Ym = df['medianHouseValue'].values\n", "clf_0 = linear_model.Lasso(alpha=0.001, max_iter = 10000)\n", "clf_0.fit(Xm, Ym)\n", "Yhat_m = clf_0.predict(Xm)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -124,15 +116,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "# see section 2.2 from https://www.stat.cmu.edu/~cshalizi/402/lectures/08-bootstrap/lecture-08.pdf\n", "# see https://web.engr.oregonstate.edu/~tgd/classes/534/slides/part9.pdf\n", "Expval_at_i = { i : np.mean(np.array(bootstrap_Yest[i])) for i in range(df.shape[0])}\n", "bias_at_i = {i : Expval_at_i[i] - Yhat_m[i] for i in range(df.shape[0])}\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -143,7 +135,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", @@ -152,9 +146,7 @@ "bias_values = [bias for (pt, bias) in bias_at_i.items()]\n", "sns.kdeplot(bias_values)\n", "plt.grid(True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -168,14 +160,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from sklearn.cluster import KMeans\n", "cluster_labels = KMeans(n_clusters=5, random_state=0).fit_predict(Xm)\n", "cluster_labels.shape" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -186,19 +178,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "df_bias = 
pd.DataFrame(Xm)\n", "df_bias['cluster'] = cluster_labels\n", "df_bias['bias'] = bias_values\n", "df_bias.groupby('cluster')['bias'].agg([np.mean, np.size])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", @@ -250,9 +244,15 @@ " \"tag\": \"Housing-Price-Hyperopt-Experiment\",\\\n", " \"project\": \"Housing Price Estimation Project\"}\n", "ap.log_run(run_info)" - ], - "execution_count": null, - "outputs": [] + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" } - ] + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/examples/Arangopipe_Feature_Examples.ipynb b/examples/Arangopipe_Feature_Examples.ipynb index 044cf59..470f5bf 100644 --- a/examples/Arangopipe_Feature_Examples.ipynb +++ b/examples/Arangopipe_Feature_Examples.ipynb @@ -1,778 +1,395 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Arangopipe_Feature_Examples.ipynb", - "provenance": [], - "collapsed_sections": [], - "authorship_tag": "ABX9TyNWcZ8yiLeseEDNo2yLQwMZ", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y0DGfxMg0kYm", - "colab_type": "text" - }, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3oU7MvQc0vn_", - "colab_type": "text" - }, - "source": [ - "# Installation Prerequisites\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "w-_hAxJgwjCX", - "colab_type": "code", - "outputId": "423803ae-fbdd-46a9-9403-08c89eed463d", - "colab": { - 
"base_uri": "https://localhost:8080/", - "height": 1000 - } - }, - "source": [ - "!pip install python-arango\n", - "!pip install -i https://test.pypi.org/simple/ arangopipe\n", - "!pip install pandas PyYAML==5.1.1 sklearn2\n", - "!pip install jsonpickle" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting python-arango\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f9/c3/1f8445ffc2505997da53ad5ce5e9f40875d561623d6bc84c259879e5cbcc/python-arango-5.2.1.tar.gz (78kB)\n", - "\r\u001b[K |████▏ | 10kB 17.3MB/s eta 0:00:01\r\u001b[K |████████▍ | 20kB 1.8MB/s eta 0:00:01\r\u001b[K |████████████▌ | 30kB 2.7MB/s eta 0:00:01\r\u001b[K |████████████████▊ | 40kB 1.8MB/s eta 0:00:01\r\u001b[K |████████████████████▉ | 51kB 2.2MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 61kB 2.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▏ | 71kB 3.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 81kB 2.6MB/s \n", - "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from python-arango) (2.21.0)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from python-arango) (1.12.0)\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->python-arango) (2.8)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->python-arango) (2019.11.28)\n", - "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->python-arango) (1.24.3)\n", - "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->python-arango) (3.0.4)\n", - "Building wheels for collected packages: python-arango\n", - " Building wheel for python-arango (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for python-arango: filename=python_arango-5.2.1-py2.py3-none-any.whl size=86479 sha256=f498d1a4d22711aff475b38989c01e0380a67078fa914b8c8b0522d3993a7101\n", - " Stored in directory: /root/.cache/pip/wheels/31/30/00/ef5ee59a25096d89fbb9e2526877f74c189eb6db50bbef9474\n", - "Successfully built python-arango\n", - "Installing collected packages: python-arango\n", - "Successfully installed python-arango-5.2.1\n", - "Looking in indexes: https://test.pypi.org/simple/\n", - "Collecting arangopipe\n", - " Downloading https://test-files.pythonhosted.org/packages/4e/a5/5c735a7b1893d5f61a647b3cefc569921526223f2ee44c52f154b54c896a/arangopipe-0.0.6.8.4-py3-none-any.whl\n", - "Installing collected packages: arangopipe\n", - "Successfully installed arangopipe-0.0.6.8.4\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (0.25.3)\n", - "Collecting PyYAML==5.1.1\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a3/65/837fefac7475963d1eccf4aa684c23b95aa6c1d033a2c5965ccb11e22623/PyYAML-5.1.1.tar.gz (274kB)\n", - "\u001b[K |████████████████████████████████| 276kB 2.9MB/s \n", - "\u001b[?25hCollecting sklearn2\n", - " Downloading https://files.pythonhosted.org/packages/4d/b3/1d0d7e771b96212fa19013726b123a209e1dc109e2802bd99b2576bf74ed/sklearn2-0.0.13-py2.py3-none-any.whl\n", - "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.6.1)\n", - "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.17.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2018.9)\n", - "Requirement already satisfied: pytest in /usr/local/lib/python3.6/dist-packages (from sklearn2) (3.6.4)\n", - "Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from sklearn2) (0.22.1)\n", - "Requirement already 
satisfied: statsmodels in /usr/local/lib/python3.6/dist-packages (from sklearn2) (0.10.2)\n", - "Requirement already satisfied: pydot in /usr/local/lib/python3.6/dist-packages (from sklearn2) (1.3.0)\n", - "Requirement already satisfied: patsy in /usr/local/lib/python3.6/dist-packages (from sklearn2) (0.5.1)\n", - "Collecting category-encoders\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)\n", - "\u001b[K |████████████████████████████████| 102kB 11.3MB/s \n", - "\u001b[?25hRequirement already satisfied: matplotlib>=1.5.1 in /usr/local/lib/python3.6/dist-packages (from sklearn2) (3.1.3)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas) (1.12.0)\n", - "Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.6/dist-packages (from pytest->sklearn2) (1.3.0)\n", - "Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from pytest->sklearn2) (1.8.1)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from pytest->sklearn2) (45.1.0)\n", - "Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from pytest->sklearn2) (8.2.0)\n", - "Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.6/dist-packages (from pytest->sklearn2) (0.7.1)\n", - "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from pytest->sklearn2) (19.3.0)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->sklearn2) (1.4.1)\n", - "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->sklearn2) (0.14.1)\n", - "Requirement already satisfied: pyparsing>=2.1.4 in 
/usr/local/lib/python3.6/dist-packages (from pydot->sklearn2) (2.4.6)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.5.1->sklearn2) (1.1.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.5.1->sklearn2) (0.10.0)\n", - "Building wheels for collected packages: PyYAML\n", - " Building wheel for PyYAML (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for PyYAML: filename=PyYAML-5.1.1-cp36-cp36m-linux_x86_64.whl size=44099 sha256=49138b4d1c252b4273be84d49b094df92083961d308b1630151c1026aab5d422\n", - " Stored in directory: /root/.cache/pip/wheels/16/27/a1/775c62ddea7bfa62324fd1f65847ed31c55dadb6051481ba3f\n", - "Successfully built PyYAML\n", - "Installing collected packages: PyYAML, category-encoders, sklearn2\n", - " Found existing installation: PyYAML 3.13\n", - " Uninstalling PyYAML-3.13:\n", - " Successfully uninstalled PyYAML-3.13\n", - "Successfully installed PyYAML-5.1.1 category-encoders-2.1.0 sklearn2-0.0.13\n", - "Collecting jsonpickle\n", - " Downloading https://files.pythonhosted.org/packages/07/07/c157520a3ebd166c8c24c6ae0ecae7c3968eb4653ff0e5af369bb82f004d/jsonpickle-1.2-py2.py3-none-any.whl\n", - "Installing collected packages: jsonpickle\n", - "Successfully installed jsonpickle-1.2\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HRAUp8m58vvB", - "colab_type": "text" - }, - "source": [ - "# Using Arangopipe" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yX2s2rKe87Hh", - "colab_type": "text" - }, - "source": [ - "The details interacting with Arangopipe to manage meta-data from machine learning project activity are illustrated in this section." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nMZAel3295XX", - "colab_type": "text" - }, - "source": [ - "## Creating a Project" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W0OCV0vaAE0T", - "colab_type": "text" - }, - "source": [ - "To use Arangopipe to track meta-data for projects, projects have to be registered with Arangopipe. For purposes of illustration, we will use the california housing dataset from UCI machine learning repository. Our project entails developing a regression model with this dataset. We will first register this project with Arangopipe as shown below.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Cs0cBDMCBRIm", - "colab_type": "code", - "outputId": "e41757a1-a083-4398-d505-fec3a25d72ee", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 125 - } - }, - "source": [ - "\n", - "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", - "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", - "from arangopipe.arangopipe_storage.arangopipe_config import ArangoPipeConfig\n", - "from arangopipe.arangopipe_storage.managed_service_conn_parameters import ManagedServiceConnParam\n", - "mdb_config = ArangoPipeConfig()\n", - "msc = ManagedServiceConnParam()\n", - "conn_params = { msc.DB_SERVICE_HOST : \"arangoml.arangodb.cloud\", \\\n", - " msc.DB_SERVICE_END_POINT : \"createDB\",\\\n", - " msc.DB_SERVICE_NAME : \"createDB\",\\\n", - " msc.DB_SERVICE_PORT : 8529,\\\n", - " msc.DB_CONN_PROTOCOL : 'https'}\n", - " \n", - "mdb_config = mdb_config.create_connection_config(conn_params)\n", - "admin = ArangoPipeAdmin(reuse_connection = False, config = mdb_config)\n", - "ap_config = admin.get_config()\n", - "ap = ArangoPipe(config = ap_config)\n", - "proj_info = {\"name\": \"Housing_Price_Estimation_Project\"}\n", - "proj_reg = admin.register_project(proj_info)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ 
- "API endpoint: https://arangoml.arangodb.cloud:8529/_db/_system/createDB/createDB\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", - " InsecureRequestWarning)\n" - ], - "name": "stderr" - }, - { - "output_type": "stream", - "text": [ - "Host Connection: https://arangoml.arangodb.cloud:8529\n" - ], - "name": "stdout" - }, - { - "output_type": "stream", - "text": [ - "2020-02-11 09:50:50,543 - arangopipe_logger - ERROR - The dataset by name: heart beat check was not found in Arangopipe!\n" - ], - "name": "stderr" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jadsDtQKDNXA", - "colab_type": "text" - }, - "source": [ - "## Model Building\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8hSnL6RRUtqr", - "colab_type": "text" - }, - "source": [ - "In this section, the details of capturing meta-data with Arangopipe as part of model building activity will be illustrated. Model selection is an important activity for data scientists. Data scientists consider many candidate models for a task. The best performing model is then chosen. This procedure is illustrated in the notebook illustrating the use of hyperopt to capture meta data from a hyper-parameter tuning experiment, (see [hyperopt.](https://github.com/arangoml/arangopipe/blob/master/arangopipe/tests/hyperopt/hyperopt_integration.ipynb)). We will use a simpler setting for this notebook. We will assume model selection has been completed and that a LASSO regression model is the best candidate for the task. Having made this decision, we capture information about the model and its parameters. This information is stored in Arangopipe. The details of performing these tasks are shown below. 
Before model building, we capture information related to the dataset and the features used to build the model.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qZi9oCD-X31q", - "colab_type": "text" - }, - "source": [ - "### Register Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8wJZveAFYZS1", - "colab_type": "text" - }, - "source": [ - "Read a copy of the dataset from the Arangopipe repository" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NDfYoP6uw8sZ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import pandas as pd\n", - "data_url = \"https://raw.githubusercontent.com/arangoml/arangopipe/arangopipe_examples/examples/data/cal_housing.csv\"\n", - "df = pd.read_csv(data_url, error_bad_lines=False)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "OU7n7gNa4yk9", - "colab_type": "code", - "outputId": "0b33ca8d-acf8-4c7e-efcb-5d0542558996", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 195 - } - }, - "source": [ - "df.head()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
latlonghousingMedAgetotalRoomstotalBedroomspopulationhouseholdsmedianIncomemedianHouseValue
0-122.2237.862170991106240111388.3014358500.0
1-122.2437.855214671904961777.2574352100.0
2-122.2537.855212742355582195.6431341300.0
3-122.2537.855216272805652593.8462342200.0
4-122.2537.85529192134131934.0368269700.0
\n", - "
" - ], - "text/plain": [ - " lat long housingMedAge ... households medianIncome medianHouseValue\n", - "0 -122.22 37.86 21 ... 1138 8.3014 358500.0\n", - "1 -122.24 37.85 52 ... 177 7.2574 352100.0\n", - "2 -122.25 37.85 52 ... 219 5.6431 341300.0\n", - "3 -122.25 37.85 52 ... 259 3.8462 342200.0\n", - "4 -122.25 37.85 52 ... 193 4.0368 269700.0\n", - "\n", - "[5 rows x 9 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 4 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yOIaobfWYl4j", - "colab_type": "text" - }, - "source": [ - "Register it with Arangopipe." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IiLwjBdlYR6B", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\n", - "ds_info = {\"name\" : \"california-housing-dataset\",\\\n", - " \"description\": \"This dataset lists median house prices in Califoria. Various house features are provided\",\\\n", - " \"source\": \"UCI ML Repository\" }\n", - "ds_reg = ap.register_dataset(ds_info)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A7amG0DxZFmF", - "colab_type": "text" - }, - "source": [ - "### Register Featureset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1X2w-4RUZXlw", - "colab_type": "text" - }, - "source": [ - "Register the features used to develop the model.\n", - "\n", - "\n", - "* Note that the response variable has been log transformed\n", - "* Note that when the featureset is registered, it is linked to the dataset\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HGIDoX--5ioC", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import numpy as np\n", - "df[\"medianHouseValue\"] = df[\"medianHouseValue\"].apply(lambda x: np.log(x))\n", - "featureset = df.dtypes.to_dict()\n", - "featureset = {k:str(featureset[k]) for k in featureset}\n", - "featureset[\"name\"] = \"log_transformed_median_house_value\"\n", - "fs_reg = 
ap.register_featureset(featureset, ds_reg[\"_key\"]) # note that the dataset and featureset are linked here." - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dwh2bEMliRx0", - "colab_type": "text" - }, - "source": [ - "### Develop a Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q8h4kn99fKJo", - "colab_type": "text" - }, - "source": [ - "Create test and training sets for the model building activity" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "w4U-UXKuZjym", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from sklearn.model_selection import train_test_split\n", - "preds = df.columns.to_list()\n", - "preds.remove('medianHouseValue')\n", - "X = df[preds].values\n", - "Y = df['medianHouseValue'].values\n", - "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xADFlc1Ifch_", - "colab_type": "text" - }, - "source": [ - "Fit a Lasso model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qprL7-vLhdqf", - "colab_type": "text" - }, - "source": [ - "A Lasso model is developed. Train and test performances are noted." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HjoeXxs3dr0V", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from sklearn import linear_model\n", - "from sklearn.metrics import mean_squared_error\n", - "clf = linear_model.Lasso(alpha=0.001)\n", - "clf.fit(X_train, y_train)\n", - "train_pred = clf.predict(X_train)\n", - "test_pred = clf.predict(X_test)\n", - "train_mse = mean_squared_error(train_pred, y_train)\n", - "test_mse = mean_squared_error(test_pred, y_test)\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hod1-yu_inrZ", - "colab_type": "text" - }, - "source": [ - "### Register the Model\n", - "* Note that project and model are linked\n", - "* The notebook associated withe the model can be retreived from github. This can be part of the meta-data associated with the model\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "tWws1OWJqQuK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import io\n", - "import requests\n", - "url = ('https://raw.githubusercontent.com/arangoml/arangopipe/master/examples/Arangopipe_Feature_Examples.ipynb')\n", - "nbjson = requests.get(url).text" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "vADXpiZwG1Qa", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "xu3M87jogVdd", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\n", - "model_info = {\"name\": \"Lasso Model for Housing Dataset\", \"task\": \"Regression\", 'notebook': nbjson}\n", - "model_reg = ap.register_model(model_info, project = \"Housing_Price_Estimation_Project\")\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "TytN77IarjvV" - }, - "source": [ - "## Log Model Building Activity" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "v4KZLX4koVLQ", - "colab_type": "text" - }, - "source": [ - "In this section the details of capturing a consolidated version of this model building activity. The execution of this notebook, or any ML activity, is captured by the 'Run' entity in the Arangopipe schema (see [schema](https://github.com/arangoml/arangopipe)). To record the execution, we need to create a unique identifier for it in ArangoDB. After generating a unique identifier, we capture the model parameters and model performance and then record the details of this experiment in Arangopipe. Each of these steps is shown below.\n", - "\n", - "Note: Model parameters are important metadata. Results from model building activity are converted to JSON for storage in ArangoDB. We are now getting ready to create a consolidated capture of the activities performed in this notebook in Arangopipe. To do so, we need to create a unique identifier for this activity. These are shown below.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XI1dMPNJr3lJ", - "colab_type": "text" - }, - "source": [ - "Note that capturing the 'Run' or execution of this cell captures information that links\n", - "\n", - "\n", - "1. The dataset used in this execution (ds_reg)\n", - "2. The featureset used in this execution (fs_reg)\n", - "3. The model parameters used in this execution (model_params)\n", - "4. 
The model performance that was observed in this execution (model perf)\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FHPfWuNQqafz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import uuid\n", - "import datetime\n", - "import jsonpickle\n", - "\n", - "ruuid = str(uuid.uuid4().int)\n", - "model_perf = {'training_mse': train_mse, 'test_mse': test_mse, 'run_id': ruuid, \"timestamp\": str(datetime.datetime.now())}\n", - "\n", - "mp = clf.get_params()\n", - "mp = jsonpickle.encode(mp)\n", - "model_params = {'run_id': ruuid, 'model_params': mp}\n", - "\n", - "run_info = {\"dataset\" : ds_reg[\"_key\"],\\\n", - " \"featureset\": fs_reg[\"_key\"],\\\n", - " \"run_id\": ruuid,\\\n", - " \"model\": model_reg[\"_key\"],\\\n", - " \"model-params\": model_params,\\\n", - " \"model-perf\": model_perf,\\\n", - " \"tag\": \"Housing-Price-Hyperopt-Experiment\",\\\n", - " \"project\": \"Housing Price Estimation Project\"}\n", - "ap.log_run(run_info)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_qgaio2bX9ef", - "colab_type": "text" - }, - "source": [ - "### Save the connection information to google drive so that this can used to connect to the instance that was used in this session," - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_lfXXLYpkUyi", - "colab_type": "code", - "outputId": "2f30464d-e5e7-473f-ea27-d1ea1f3f7746", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 125 - } - }, - "source": [ - "from google.colab import drive\n", - "drive.mount('/content/drive')\n", - "fp = '/content/drive/My Drive/saved_arangopipe_config.yaml'\n", - "mdb_config.export_cfg(fp)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Go to this URL in a browser: 
https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n", - "\n", - "Enter your authorization code:\n", - "··········\n", - "Mounted at /content/drive\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lpzDiY0yt7tX", - "colab_type": "text" - }, - "source": [ - "## Using Arangopipe with Common Tools in a Machine Learning Stack" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bXtu0bqz3GQT", - "colab_type": "text" - }, - "source": [ - "This notebook provides the details of working with Arangopipe to capture meta-data from a machine learning project activity. If you would like to see Arangopipe can be used with some common tools in a machine learning stack:\n", - "\n", - "\n", - "1. See [TFX](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/TFX) for the details of using Arangopipe with TFX\n", - "2. See [Pytorch](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/pytorch) for details of using Arangopipe with Pytorch.\n", - "3. See [Hyperopt](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/hyperopt) for details of using Arangopipe with Hyperopt\n", - "4. 
See [MLFlow](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/mlflow) for details of using Arangopipe with MLFlow.\n", - "\n" - ] - } - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Installation Prerequisites\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install python-arango\n", + "!pip install arangopipe==0.0.6.8.6\n", + "!pip install pandas PyYAML==5.1.1 sklearn2\n", + "!pip install jsonpickle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using Arangopipe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The details interacting with Arangopipe to manage meta-data from machine learning project activity are illustrated in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a Project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use Arangopipe to track meta-data for projects, projects have to be registered with Arangopipe. For purposes of illustration, we will use the california housing dataset from UCI machine learning repository. Our project entails developing a regression model with this dataset. 
We will first register this project with Arangopipe as shown below.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", + "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", + "from arangopipe.arangopipe_storage.arangopipe_config import ArangoPipeConfig\n", + "from arangopipe.arangopipe_storage.managed_service_conn_parameters import ManagedServiceConnParam\n", + "mdb_config = ArangoPipeConfig()\n", + "msc = ManagedServiceConnParam()\n", + "conn_params = { msc.DB_SERVICE_HOST : \"arangoml.arangodb.cloud\", \\\n", + " msc.DB_SERVICE_END_POINT : \"createDB\",\\\n", + " msc.DB_SERVICE_NAME : \"createDB\",\\\n", + " msc.DB_SERVICE_PORT : 8529,\\\n", + " msc.DB_CONN_PROTOCOL : 'https'}\n", + " \n", + "mdb_config = mdb_config.create_connection_config(conn_params)\n", + "admin = ArangoPipeAdmin(reuse_connection = False, config = mdb_config)\n", + "ap_config = admin.get_config()\n", + "ap = ArangoPipe(config = ap_config)\n", + "proj_info = {\"name\": \"Housing_Price_Estimation_Project\"}\n", + "proj_reg = admin.register_project(proj_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Building\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, the details of capturing meta-data with Arangopipe as part of model building activity will be illustrated. Model selection is an important activity for data scientists. Data scientists consider many candidate models for a task. The best performing model is then chosen. This procedure is illustrated in the notebook illustrating the use of hyperopt to capture meta data from a hyper-parameter tuning experiment, (see [hyperopt.](https://github.com/arangoml/arangopipe/blob/master/arangopipe/tests/hyperopt/hyperopt_integration.ipynb)). We will use a simpler setting for this notebook. 
We will assume model selection has been completed and that a LASSO regression model is the best candidate for the task. Having made this decision, we capture information about the model and its parameters. This information is stored in Arangopipe. The details of performing these tasks are shown below. Before model building, we capture information related to the dataset and the features used to build the model.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read a copy of the dataset from the Arangopipe repository" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "data_url = \"https://raw.githubusercontent.com/arangoml/arangopipe/arangopipe_examples/examples/data/cal_housing.csv\"\n", + "df = pd.read_csv(data_url, error_bad_lines=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Register it with Arangopipe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "ds_info = {\"name\" : \"california-housing-dataset\",\\\n", + " \"description\": \"This dataset lists median house prices in Califoria. 
Various house features are provided\",\\\n", + " \"source\": \"UCI ML Repository\" }\n", + "ds_reg = ap.register_dataset(ds_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register Featureset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Register the features used to develop the model.\n", + "\n", + "\n", + "* Note that the response variable has been log transformed\n", + "* Note that when the featureset is registered, it is linked to the dataset\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "df[\"medianHouseValue\"] = df[\"medianHouseValue\"].apply(lambda x: np.log(x))\n", + "featureset = df.dtypes.to_dict()\n", + "featureset = {k:str(featureset[k]) for k in featureset}\n", + "featureset[\"name\"] = \"log_transformed_median_house_value\"\n", + "fs_reg = ap.register_featureset(featureset, ds_reg[\"_key\"]) # note that the dataset and featureset are linked here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Develop a Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create test and training sets for the model building activity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "preds = df.columns.to_list()\n", + "preds.remove('medianHouseValue')\n", + "X = df[preds].values\n", + "Y = df['medianHouseValue'].values\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fit a Lasso model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Lasso model is developed. Train and test performances are noted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import linear_model\n", + "from sklearn.metrics import mean_squared_error\n", + "clf = linear_model.Lasso(alpha=0.001)\n", + "clf.fit(X_train, y_train)\n", + "train_pred = clf.predict(X_train)\n", + "test_pred = clf.predict(X_test)\n", + "train_mse = mean_squared_error(train_pred, y_train)\n", + "test_mse = mean_squared_error(test_pred, y_test)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register the Model\n", + "* Note that project and model are linked\n", + "* The notebook associated with the model can be retrieved from GitHub. This can be part of the meta-data associated with the model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import requests\n", + "url = ('https://raw.githubusercontent.com/arangoml/arangopipe/master/examples/Arangopipe_Feature_Examples.ipynb')\n", + "nbjson = requests.get(url).text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "model_info = {\"name\": \"Lasso Model for Housing Dataset\", \"task\": \"Regression\", 'notebook': nbjson}\n", + "model_reg = ap.register_model(model_info, project = \"Housing_Price_Estimation_Project\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Log Model Building Activity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, the details of capturing a consolidated version of this model building activity are illustrated. The execution of this notebook, or any ML activity, is captured by the 'Run' entity in the Arangopipe schema (see [schema](https://github.com/arangoml/arangopipe)). 
To record the execution, we need to create a unique identifier for it in ArangoDB. After generating a unique identifier, we capture the model parameters and model performance and then record the details of this experiment in Arangopipe. Each of these steps is shown below.\n", + "\n", + "Note: Model parameters are important metadata. Results from model building activity are converted to JSON for storage in ArangoDB. We are now getting ready to create a consolidated capture of the activities performed in this notebook in Arangopipe. To do so, we need to create a unique identifier for this activity. These are shown below.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that capturing the 'Run' or execution of this cell captures information that links\n", + "\n", + "\n", + "1. The dataset used in this execution (ds_reg)\n", + "2. The featureset used in this execution (fs_reg)\n", + "3. The model parameters used in this execution (model_params)\n", + "4. 
The model performance that was observed in this execution (model_perf)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import datetime\n", + "import jsonpickle\n", + "\n", + "ruuid = str(uuid.uuid4().int)\n", + "model_perf = {'training_mse': train_mse, 'test_mse': test_mse, 'run_id': ruuid, \"timestamp\": str(datetime.datetime.now())}\n", + "\n", + "mp = clf.get_params()\n", + "mp = jsonpickle.encode(mp)\n", + "model_params = {'run_id': ruuid, 'model_params': mp}\n", + "\n", + "run_info = {\"dataset\" : ds_reg[\"_key\"],\\\n", + " \"featureset\": fs_reg[\"_key\"],\\\n", + " \"run_id\": ruuid,\\\n", + " \"model\": model_reg[\"_key\"],\\\n", + " \"model-params\": model_params,\\\n", + " \"model-perf\": model_perf,\\\n", + " \"tag\": \"Housing-Price-Hyperopt-Experiment\",\\\n", + " \"project\": \"Housing Price Estimation Project\"}\n", + "ap.log_run(run_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the connection information to google drive so that it can be used to connect to the instance that was used in this session." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "fp = '/content/drive/My Drive/saved_arangopipe_config.yaml'\n", + "mdb_config.export_cfg(fp)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Arangopipe with Common Tools in a Machine Learning Stack" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook provides the details of working with Arangopipe to capture meta-data from a machine learning project activity. If you would like to see how Arangopipe can be used with some common tools in a machine learning stack:\n", + "\n", + "\n", + "1. 
See [TFX](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/TFX) for the details of using Arangopipe with TFX\n", + "2. See [Pytorch](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/pytorch) for details of using Arangopipe with Pytorch.\n", + "3. See [Hyperopt](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/hyperopt) for details of using Arangopipe with Hyperopt\n", + "4. See [MLFlow](https://github.com/arangoml/arangopipe/tree/master/arangopipe/tests/mlflow) for details of using Arangopipe with MLFlow.\n", + "\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/examples/Arangopipe_Feature_ext2.ipynb b/examples/Arangopipe_Feature_ext2.ipynb index 0f5a8f1..bc9a7a6 100644 --- a/examples/Arangopipe_Feature_ext2.ipynb +++ b/examples/Arangopipe_Feature_ext2.ipynb @@ -1,12 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, "cells": [ { "cell_type": "markdown", @@ -17,19 +9,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "!pip install python-arango\n", - "!pip install -i https://test.pypi.org/simple/ arangopipe\n", + "!pip install arangopipe==0.0.6.8.6\n", "!pip install pandas PyYAML==5.1.1 sklearn2\n", "!pip install jsonpickle" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -37,9 +31,7 @@ "df = pd.read_csv(data_url, error_bad_lines=False)\n", "df = df.sample(n = 500)\n", "df.head()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -62,31 +54,29 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "" - ] + "source": [] }, { "cell_type": "code", + 
"execution_count": null, "metadata": {}, + "outputs": [], "source": [ "\n", "req_cols = df.columns.tolist()\n", "req_cols.remove(\"medianHouseValue\")\n", "df = df[req_cols]\n", "df.dtypes" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "df[\"lat\"].describe()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -110,26 +100,26 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline \n", "df[\"lat\"].hist()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "\n", "df1 = df.query(\"lat <= -119\")\n", "df2 = df.query(\"lat > -119\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -140,16 +130,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from arangopipe.arangopipe_analytics.rf_dataset_shift_detector import RF_DatasetShiftDetector\n", "\n", "rfd = RF_DatasetShiftDetector()\n", "score = rfd.detect_dataset_shift(df1, df2)\n", "print (\"Detaset shift score : %2.2f\" % (score))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -165,5 +155,13 @@ "The API uses a classifier to discriminate between the datasets provided to it. The score reported by the API is the accuracy of the classifier to discriminate between the datasets. Values close to $0.5$ indicate that the classifier in not able to discriminate between the two datasets. This could be interpretted as a situation where no discernable shift has occured in the data since the last model deployment. Values close $1$ indicate that dataset shift is discernable and that we may need to revisit modeling. 
How dataset shift affects the performance of the deployed model is problem dependent. So the score must be assessed in the context of a particular application. An experiment to track the loss of model accuracy with the observed score could provide insights into a threshold score beyond which a model redevelopment is needed." ] } - ] + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/examples/Arangopipe_with_TensorFlow_Beginner_Guide.ipynb b/examples/Arangopipe_with_TensorFlow_Beginner_Guide.ipynb index fe3a5b7..ad30010 100644 --- a/examples/Arangopipe_with_TensorFlow_Beginner_Guide.ipynb +++ b/examples/Arangopipe_with_TensorFlow_Beginner_Guide.ipynb @@ -1,12 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, "cells": [ { "cell_type": "markdown", @@ -27,7 +19,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -40,9 +34,7 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -93,15 +85,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "!pip install python-arango\n", - "!pip install -i https://test.pypi.org/simple/ arangopipe\n", + "!pip install arangopipe==0.0.6.8.6\n", "!pip install pandas PyYAML==5.1.1 sklearn2\n", "!pip install json-tricks " - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -112,7 +104,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from __future__ import absolute_import, division, print_function, unicode_literals\n", "\n", @@ -124,9 +118,7 @@ " pass\n", "\n", "import tensorflow as tf" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -137,7 +129,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", @@ -152,22 +146,20 @@ " msc.DB_CONN_PROTOCOL : 'https'}\n", " \n", "mdb_config = mdb_config.create_connection_config(conn_params)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "%%capture\n", "admin = ArangoPipeAdmin(reuse_connection = False, config = mdb_config)\n", "ap_config = admin.get_config()\n", "ap = ArangoPipe(config = ap_config)\n", "# Error indicating \"heart beat check was not found\" is expected." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -178,15 +170,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "mnist = tf.keras.datasets.mnist\n", "\n", "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n", "x_train, x_test = x_train / 255.0, x_test / 255.0\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -199,7 +191,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "proj_info = {\"name\": \"MNIST Handwriting Analysis\"}\n", "proj_reg = admin.register_project(proj_info)\n", @@ -215,9 +209,7 @@ "model_info = {\"name\": \"Neural Network\",\\\n", " \"type\": \"Neural network with Linear layer, ReLU activation, Dropout Layer (20%) and Softmax output layer\"}\n", "model_reg = ap.register_model(model_info, project = \"MNIST Handwriting Analysis\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -228,7 +220,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "model = tf.keras.models.Sequential([\n", " tf.keras.layers.Flatten(input_shape=(28, 28)),\n", @@ -240,9 +234,7 @@ "model.compile(optimizer='adam',\n", " loss='sparse_categorical_crossentropy',\n", " metrics=['accuracy'])" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -253,7 +245,9 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "import uuid #used as run id\n", "from datetime import datetime\n", @@ -274,43 +268,43 @@ " \"accuracy\": str(accuracy),\n", " \"run_id\": str(ruuid),\n", " \"timestamp\": timestamp}\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "model_json = model.to_json()" - ], - "execution_count": null, - "outputs": [] + ] }, { 
"cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from json_tricks import dumps\n", "weights = model.get_weights()\n", "json_weights = dumps(weights)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "model_params['json_weights'] = json_weights\n", "model_params['model_json'] = model_json" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ " run_info = {\"dataset\" : ds_reg[\"_key\"],\\\n", " \"featureset\": fs_reg[\"_key\"],\\\n", @@ -323,9 +317,7 @@ " \"project\": \"MNIST Handwriting Analysis\"}\n", "\n", " ap.log_run(run_info)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -343,15 +335,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "admin = ArangoPipeAdmin() \n", "ap_config = admin.get_config()\n", "ap = ArangoPipe(config = ap_config)\n", "# Error indicating \"heart beat check was not found\" is expected." 
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -362,12 +354,12 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "ap.lookup_model(\"Neural Network\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -385,43 +377,43 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "saved_model_params = ap.lookup_modelparams(tag_value = \"MNIST_model_params_saved\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "saved_model = saved_model_params['model_json']\n", "saved_model_weights = saved_model_params['json_weights']" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "mdb_config.cfg" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from json_tricks import loads\n", "remat_weight = loads(saved_model_weights)\n", "reinitialized_model = tf.keras.models.model_from_json(saved_model)\n", "reinitialized_model.set_weights(remat_weight)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -432,32 +424,38 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "new_predictions = reinitialized_model.predict(x_test)\n", "old_predictions = model.predict(x_test)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "np.array_equal(new_predictions, old_predictions)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "metadata": {}, - "source": [ - "" - ], "execution_count": null, - "outputs": [] 
+ "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" } - ] + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/examples/Reuse_Old_Arangopipe_Connection.ipynb b/examples/Reuse_Old_Arangopipe_Connection.ipynb index 6bccda3..b9ffbba 100644 --- a/examples/Reuse_Old_Arangopipe_Connection.ipynb +++ b/examples/Reuse_Old_Arangopipe_Connection.ipynb @@ -1,12 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, "cells": [ { "cell_type": "markdown", @@ -17,19 +9,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "!pip install python-arango\n", - "!pip install -i https://test.pypi.org/simple/ arangopipe\n", + "!pip install arangopipe==0.0.6.8.6\n", "!pip install pandas PyYAML==5.1.1 sklearn2\n", "!pip install jsonpickle" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", @@ -40,22 +34,22 @@ "drive.mount('/content/drive')\n", "fp = '/content/drive/My Drive/saved_arangopipe_config.yaml'\n", "conn_params = mdb_config.create_config(fp)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "conn_params" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "mdb_config = mdb_config.create_connection_config(conn_params)\n", "admin = ArangoPipeAdmin(reuse_connection = True, config = mdb_config)\n", @@ -63,16 +57,20 @@ "ap = ArangoPipe(config = ap_config)\n", "proj_info = {\"name\": 
\"Housing_Price_Estimation_Project\"}\n", "proj_reg = admin.register_project(proj_info)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "" - ] + "source": [] } - ] + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 }