From a3be24d9a5daf14baa048bf3fbabfbd466f991f9 Mon Sep 17 00:00:00 2001 From: Fred Li <51424245+fredms@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:19:59 -0700 Subject: [PATCH] Enable automation test for featurestore samples (#2585) * Enable automation test for featurestore samples * Update setup resources * Fix typo and reformat the notebook * Add python package to cluster * Add import package * Run script sequentially * Put script run in the same process * Update to python output command * Replace output command in all notebooks during test * Update mounted path * Update the relative path * Restructure the file hierarchy * Regenerate feature store notebook workflow * Update relative path * List folder in current path * Update relative folder path * Update storage account name and user object id * Get signed in user id * Update user object id * Update user id * Enable notebook 1 first --- ...e_sample-test_featurestore_sdk_samples.yml | 80 +++++++++++++++++ .../featurestore_sdk_job.py | 26 ++++++ ... 
register with managed feature store.ipynb | 2 +- ...ialization and backfill feature data.ipynb | 6 +- ...ment and train models using features.ipynb | 2 +- ...erialization and run batch inference.ipynb | 2 +- .../featurestore_sample/setup-resources.sh | 36 ++++++++ .../test_featurestore_sdk_samples.ipynb | 88 +++++++++++++++++++ sdk/python/readme.py | 13 +++ 9 files changed, 249 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml create mode 100644 sdk/python/featurestore_sample/featurestore_sdk_job.py create mode 100644 sdk/python/featurestore_sample/setup-resources.sh create mode 100644 sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb diff --git a/.github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml b/.github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml new file mode 100644 index 0000000000..360b615937 --- /dev/null +++ b/.github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml @@ -0,0 +1,80 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-featurestore_sample-test_featurestore_sdk_samples +# This file is created by sdk/python/readme.py. +# Please do not edit directly. 
+on: + workflow_dispatch: + schedule: + - cron: "22 2/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/featurestore_sample/** + - .github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: pip install notebook reqs + run: pip install -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: setup feature-store resources + run: | + bash -x setup-resources.sh test_featurestore_sdk_samples.ipynb + working-directory: sdk/python/featurestore_sample + continue-on-error: true + - name: run featurestore_sample/test_featurestore_sdk_samples.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace 
}}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "test_featurestore_sdk_samples.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python test_featurestore_sdk_samples.ipynb test_featurestore_sdk_samples.output.ipynb + working-directory: sdk/python/featurestore_sample + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: test_featurestore_sdk_samples + path: sdk/python/featurestore_sample diff --git a/sdk/python/featurestore_sample/featurestore_sdk_job.py b/sdk/python/featurestore_sample/featurestore_sdk_job.py new file mode 100644 index 0000000000..b71c9a6ece --- /dev/null +++ b/sdk/python/featurestore_sample/featurestore_sdk_job.py @@ -0,0 +1,26 @@ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("AccessData").getOrCreate() + +print("=======Test Notebook 1============") +with open( + "notebooks/sdk_only/1. Develop a feature set and register with managed feature store.py" +) as f: + exec(f.read()) + +## Enable test for notebook 1 first +# print("=======Test Notebook 2============") +# with open( +# "notebooks/sdk_only/2. Enable materialization and backfill feature data.py" +# ) as f: +# exec(f.read()) + +# print("=======Test Notebook 3============") +# with open("notebooks/sdk_only/3. Experiment and train models using features.py") as f: +# exec(f.read()) + +# print("=======Test Notebook 4============") +# with open( +# "notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.py" +# ) as f: +# exec(f.read()) diff --git a/sdk/python/featurestore_sample/notebooks/sdk_only/1. Develop a feature set and register with managed feature store.ipynb b/sdk/python/featurestore_sample/notebooks/sdk_only/1. 
Develop a feature set and register with managed feature store.ipynb index 67623387e9..674c36064b 100644 --- a/sdk/python/featurestore_sample/notebooks/sdk_only/1. Develop a feature set and register with managed feature store.ipynb +++ b/sdk/python/featurestore_sample/notebooks/sdk_only/1. Develop a feature set and register with managed feature store.ipynb @@ -196,7 +196,7 @@ "\n", "# Please update your alias belpw (or any custom directory you uploaded the samples to).\n", "# You can find the name from the directory structure in the left nav\n", - "root_dir = \"./Users//featurestore_sample\"\n", + "root_dir = \"./Users//featurestore_sample\"\n", "\n", "if os.path.isdir(root_dir):\n", " print(\"The folder exists.\")\n", diff --git a/sdk/python/featurestore_sample/notebooks/sdk_only/2. Enable materialization and backfill feature data.ipynb b/sdk/python/featurestore_sample/notebooks/sdk_only/2. Enable materialization and backfill feature data.ipynb index b01948b992..b2cb282824 100644 --- a/sdk/python/featurestore_sample/notebooks/sdk_only/2. Enable materialization and backfill feature data.ipynb +++ b/sdk/python/featurestore_sample/notebooks/sdk_only/2. 
Enable materialization and backfill feature data.ipynb @@ -159,7 +159,7 @@ "\n", "# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n", "# You can find the name from the directory structure inm the left nav\n", - "root_dir = \"./Users//featurestore_sample\"\n", + "root_dir = \"./Users//featurestore_sample\"\n", "\n", "if os.path.isdir(root_dir):\n", " print(\"The folder exists.\")\n", @@ -419,7 +419,7 @@ "# storage\n", "storage_subscription_id = os.environ[\"AZUREML_ARM_SUBSCRIPTION\"]\n", "storage_resource_group_name = os.environ[\"AZUREML_ARM_RESOURCEGROUP\"]\n", - "storage_account_name = \"fstorestorage\"\n", + "storage_account_name = \"\"\n", "storage_location = ws_location\n", "storage_file_system_name = \"offlinestore\"" ] @@ -759,7 +759,7 @@ "outputs": [], "source": [ "# This utility function is created for ease of use in the docs tutorials. It uses standard azure API's. You can optionally inspect it `featurestore/setup/setup_storage_uai.py`\n", - "your_aad_objectid = \"\"\n", + "your_aad_objectid = \"\"\n", "\n", "grant_user_aad_storage_data_reader_role(\n", " AzureMLOnBehalfOfCredential(),\n", diff --git a/sdk/python/featurestore_sample/notebooks/sdk_only/3. Experiment and train models using features.ipynb b/sdk/python/featurestore_sample/notebooks/sdk_only/3. Experiment and train models using features.ipynb index 380d7bd6aa..f184ddba9d 100644 --- a/sdk/python/featurestore_sample/notebooks/sdk_only/3. Experiment and train models using features.ipynb +++ b/sdk/python/featurestore_sample/notebooks/sdk_only/3. 
Experiment and train models using features.ipynb @@ -175,7 +175,7 @@ "\n", "# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n", "# You can find the name from the directory structure inm the left nav\n", - "root_dir = \"./Users//featurestore_sample\"\n", + "root_dir = \"./Users//featurestore_sample\"\n", "\n", "if os.path.isdir(root_dir):\n", " print(\"The folder exists.\")\n", diff --git a/sdk/python/featurestore_sample/notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.ipynb b/sdk/python/featurestore_sample/notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.ipynb index e88f433d5a..f3b05b26a4 100644 --- a/sdk/python/featurestore_sample/notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.ipynb +++ b/sdk/python/featurestore_sample/notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.ipynb @@ -172,7 +172,7 @@ "\n", "# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n", "# You can find the name from the directory structure inm the left nav\n", - "root_dir = \"./Users//featurestore_sample\"\n", + "root_dir = \"./Users//featurestore_sample\"\n", "\n", "if os.path.isdir(root_dir):\n", " print(\"The folder exists.\")\n", diff --git a/sdk/python/featurestore_sample/setup-resources.sh b/sdk/python/featurestore_sample/setup-resources.sh new file mode 100644 index 0000000000..12363c0b4c --- /dev/null +++ b/sdk/python/featurestore_sample/setup-resources.sh @@ -0,0 +1,36 @@ +pip install --upgrade jupytext + +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +LOCATION=$(az ml workspace show --query location -o tsv) +RESOURCE_GROUP=$(az group show --query name -o tsv) +AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv) +OUTPUT_COMMAND="print" +FEATURE_STORAGE_ACCOUNT_NAME=${RESOURCE_GROUP}fs 
+USER_ID="36b5b70a-a2b2-45e6-a496-df3c2ffde085" + +# + +# +NOTEBOOK_1="notebooks/sdk_only/1. Develop a feature set and register with managed feature store" +NOTEBOOK_2="notebooks/sdk_only/2. Enable materialization and backfill feature data" +NOTEBOOK_3="notebooks/sdk_only/3. Experiment and train models using features" +NOTEBOOK_4="notebooks/sdk_only/4. Enable recurrent materialization and run batch inference" +jupytext --to py "${NOTEBOOK_1}.ipynb" +jupytext --to py "${NOTEBOOK_2}.ipynb" +jupytext --to py "${NOTEBOOK_3}.ipynb" +jupytext --to py "${NOTEBOOK_4}.ipynb" +# + +# +sed -i "s//$SUBSCRIPTION_ID/g; + s//$RESOURCE_GROUP/g; + s//$AML_WORKSPACE_NAME/g;" $1 + +# +sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/\/featurestore_sample/.\//g;" "${NOTEBOOK_1}.py" +sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/\/featurestore_sample/.\//g; + s//$FEATURE_STORAGE_ACCOUNT_NAME/g; + s//$USER_ID/g;" "${NOTEBOOK_2}.py" +sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/\/featurestore_sample/.\//g;" "${NOTEBOOK_3}.py" +sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/\/featurestore_sample/.\//g;" "${NOTEBOOK_4}.py" \ No newline at end of file diff --git a/sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb b/sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb new file mode 100644 index 0000000000..48ebda4498 --- /dev/null +++ b/sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use a serverless Spark compute" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should have an attached Synapse Spark pool available in your workspace. 
Please see documentation page: [Attach and manage a Synapse Spark pool in Azure Machine Learning (preview)](https://learn.microsoft.com/azure/machine-learning/how-to-manage-synapse-spark-pool) for more details.\n", + "\n", + "**Note** - To ensure successful execution of Spark job, the identity being used for the Spark job should be assigned **Contributor** and **Storage Blob Data Contributor** roles on the Azure storage account used for data input and output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient, spark, Input, Output\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.ai.ml.entities import Environment\n", + "\n", + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace = \"\"\n", + "ml_client = MLClient(\n", + " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark_job = spark(\n", + " display_name=\"featurestore_sample_test\",\n", + " code=\"./\",\n", + " entry={\"file\": \"featurestore_sdk_job.py\"},\n", + " driver_cores=1,\n", + " driver_memory=\"1g\",\n", + " executor_cores=1,\n", + " executor_memory=\"1g\",\n", + " executor_instances=1,\n", + " resources={\n", + " \"instance_type\": \"Standard_E8S_V3\",\n", + " \"runtime_version\": \"3.2.0\",\n", + " },\n", + " environment=Environment(conda_file=\"project/env/conda.yml\"),\n", + ")\n", + "\n", + "returned_spark_job = ml_client.jobs.create_or_update(spark_job)\n", + "\n", + "print(returned_spark_job.id)\n", + "# Wait until the job completes\n", + "ml_client.jobs.stream(returned_spark_job.name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "name": "python", + "version": "3.7.10" + }, + 
"orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "6aeff17a1aa7735c2f7cb3a6d691fe1b4d4c3b8d2d650f644ad0f24e1b8e3f3f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/readme.py b/sdk/python/readme.py index 70f0e7cac1..ef3ad304dc 100644 --- a/sdk/python/readme.py +++ b/sdk/python/readme.py @@ -197,6 +197,7 @@ def write_notebook_workflow( "assets-component" in classification ) is_spark_notebook_sample = ("jobs-spark" in classification) or ("_spark_" in name) + is_featurestore_sample = "featurestore_sample" in classification creds = "${{secrets.AZUREML_CREDENTIALS}}" # Duplicate name in working directory during checkout # https://github.com/actions/checkout/issues/739 @@ -278,6 +279,8 @@ def write_notebook_workflow( continue-on-error: true\n""" if is_spark_notebook_sample: workflow_yaml += get_spark_config_workflow(posix_folder, name) + if is_featurestore_sample: + workflow_yaml += get_featurestore_config_workflow(posix_folder, name) workflow_yaml += f""" - name: run {posix_notebook} run: | source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh"; @@ -468,6 +471,16 @@ def get_spark_config_workflow(folder_name, file_name): return workflow +def get_featurestore_config_workflow(folder_name, file_name): + workflow = f""" - name: setup feature-store resources + run: | + bash -x setup-resources.sh {file_name}.ipynb + working-directory: sdk/python/featurestore_sample + continue-on-error: true\n""" + + return workflow + + @contextlib.contextmanager def change_working_dir(path): """Context manager for changing the current working directory"""