Skip to content

Commit

Permalink
Enable automation test for featurestore samples (#2585)
Browse files Browse the repository at this point in the history
* Enable automation test for featurestore samples

* Update setup resources

* Fix typo and reformat the notebook

* Add python package to cluster

* Add import package

* Run script sequentially

* Put script run in the same process

* Update to python output command

* Replace output command in all notebooks during test

* Update mounted path

* Update the relative path

* Restructure the file hierarchy

* Regenerate feature store notebook workflow

* Update relative path

* List folder in current path

* Update relative folder path

* Update storage account name and user object id

* Get signed in user id

* Update user object id

* Update user id

* Enable notebook 1 first
  • Loading branch information
fredms authored Sep 7, 2023
1 parent 4c1c62f commit a3be24d
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: sdk-featurestore_sample-test_featurestore_sdk_samples
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:
  workflow_dispatch:
  schedule:
    # minute 22 at hours 2 and 14 UTC ("2/12" = start at 2, step every 12 hours)
    - cron: "22 2/12 * * *"
  pull_request:
    branches:
      - main
    paths:
      - sdk/python/featurestore_sample/**
      - .github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh
# Cancel any in-flight run for the same PR/branch when a newer run starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - name: check out repo
      uses: actions/checkout@v2
    - name: setup python
      uses: actions/setup-python@v2
      with:
        python-version: "3.8"
    - name: pip install notebook reqs
      run: pip install -r sdk/python/dev-requirements.txt
    - name: azure login
      uses: azure/login@v1
      with:
        creds: ${{secrets.AZUREML_CREDENTIALS}}
    - name: bootstrap resources
      run: |
          echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
          bash bootstrap.sh
      working-directory: infra/bootstrapping
      continue-on-error: false
    # The setup steps below are best-effort (continue-on-error: true); only
    # bootstrap and the notebook run step are allowed to fail the workflow.
    - name: setup SDK
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: sdk/python
      continue-on-error: true
    - name: setup-cli
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: cli
      continue-on-error: true
    - name: setup feature-store resources
      run: |
          bash -x setup-resources.sh test_featurestore_sdk_samples.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true
    - name: run featurestore_sample/test_featurestore_sdk_samples.ipynb
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "test_featurestore_sdk_samples.ipynb";
          [ -f "../../.azureml/config" ] && cat "../../.azureml/config";
          papermill -k python test_featurestore_sdk_samples.ipynb test_featurestore_sdk_samples.output.ipynb
      working-directory: sdk/python/featurestore_sample
    # Always publish the working folder so failed runs can be debugged from
    # the papermill output notebook.
    - name: upload notebook's working folder as an artifact
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: test_featurestore_sdk_samples
        path: sdk/python/featurestore_sample
26 changes: 26 additions & 0 deletions sdk/python/featurestore_sample/featurestore_sdk_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Driver script for the feature-store SDK samples, submitted as a Spark job.

setup-resources.sh converts each sample notebook to a plain ``.py`` script
(via jupytext) before this job runs; the scripts are then exec'd here
sequentially so that all notebooks execute in a single driver process.
"""
from pyspark.sql import SparkSession

# Create (or attach to) the Spark session up front.
# NOTE(review): the exec'd notebook code presumably picks up this `spark`
# global -- confirm before renaming or removing it.
spark = SparkSession.builder.appName("AccessData").getOrCreate()

print("=======Test Notebook 1============")
with open(
    "notebooks/sdk_only/1. Develop a feature set and register with managed feature store.py"
) as f:
    # exec of repo-local, trusted sample code only; never use this pattern
    # on untrusted input.
    exec(f.read())

## Enable test for notebook 1 first
# print("=======Test Notebook 2============")
# with open(
#     "notebooks/sdk_only/2. Enable materialization and backfill feature data.py"
# ) as f:
#     exec(f.read())

# print("=======Test Notebook 3============")
# with open("notebooks/sdk_only/3. Experiment and train models using features.py") as f:
#     exec(f.read())

# print("=======Test Notebook 4============")
# with open(
#     "notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.py"
# ) as f:
#     exec(f.read())
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@
"\n",
"# Please update your alias below (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down Expand Up @@ -419,7 +419,7 @@
"# storage\n",
"storage_subscription_id = os.environ[\"AZUREML_ARM_SUBSCRIPTION\"]\n",
"storage_resource_group_name = os.environ[\"AZUREML_ARM_RESOURCEGROUP\"]\n",
"storage_account_name = \"fstorestorage\"\n",
"storage_account_name = \"<FEATURE_STORAGE_ACCOUNT_NAME>\"\n",
"storage_location = ws_location\n",
"storage_file_system_name = \"offlinestore\""
]
Expand Down Expand Up @@ -759,7 +759,7 @@
"outputs": [],
"source": [
"# This utility function is created for ease of use in the docs tutorials. It uses standard azure API's. You can optionally inspect it `featurestore/setup/setup_storage_uai.py`\n",
"your_aad_objectid = \"<your_aad_objectId>\"\n",
"your_aad_objectid = \"<USER_AAD_OBJECTID>\"\n",
"\n",
"grant_user_aad_storage_data_reader_role(\n",
" AzureMLOnBehalfOfCredential(),\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
36 changes: 36 additions & 0 deletions sdk/python/featurestore_sample/setup-resources.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Set up the feature-store sample for the automation test.
# Usage: bash setup-resources.sh <driver-notebook>.ipynb
#   $1 - notebook whose <SUBSCRIPTION_ID>/<RESOURCE_GROUP>/<AML_WORKSPACE_NAME>
#        placeholders are replaced in place.
pip install --upgrade jupytext

# <create_variables>
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
# NOTE(review): LOCATION is assigned but not used below -- confirm whether it
# can be removed or is consumed by a sourced environment.
LOCATION=$(az ml workspace show --query location -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
# The notebooks call display(); substitute plain print so output shows up in
# the Spark job logs.
OUTPUT_COMMAND="print"
FEATURE_STORAGE_ACCOUNT_NAME=${RESOURCE_GROUP}fs
# NOTE(review): hard-coded AAD object id of the signed-in CI identity --
# confirm it matches the identity used by the test subscription.
USER_ID="36b5b70a-a2b2-45e6-a496-df3c2ffde085"

# </create_variables>

# <convert_notebook_to_py>
NOTEBOOK_1="notebooks/sdk_only/1. Develop a feature set and register with managed feature store"
NOTEBOOK_2="notebooks/sdk_only/2. Enable materialization and backfill feature data"
NOTEBOOK_3="notebooks/sdk_only/3. Experiment and train models using features"
NOTEBOOK_4="notebooks/sdk_only/4. Enable recurrent materialization and run batch inference"
jupytext --to py "${NOTEBOOK_1}.ipynb"
jupytext --to py "${NOTEBOOK_2}.ipynb"
jupytext --to py "${NOTEBOOK_3}.ipynb"
jupytext --to py "${NOTEBOOK_4}.ipynb"
# </convert_notebook_to_py>

#<replace_template_values>
# Fill in the workspace placeholders in the driver notebook ($1).
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
s/<AML_WORKSPACE_NAME>/$AML_WORKSPACE_NAME/g;" $1

#</replace_template_values>
# Rewrite the converted notebook scripts so they run in CI:
# display -> print, sample root dir -> ./, and (notebook 2 only) storage
# account + user object id placeholders -> real values.
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_1}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;
s/<FEATURE_STORAGE_ACCOUNT_NAME>/$FEATURE_STORAGE_ACCOUNT_NAME/g;
s/<USER_AAD_OBJECTID>/$USER_ID/g;" "${NOTEBOOK_2}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_3}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_4}.py"
88 changes: 88 additions & 0 deletions sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use a serverless Spark compute"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"You should have an attached Synapse Spark pool available in your workspace. Please see documentation page: [Attach and manage a Synapse Spark pool in Azure Machine Learning (preview)](https://learn.microsoft.com/azure/machine-learning/how-to-manage-synapse-spark-pool) for more details.\n",
"\n",
"**Note** - To ensure successful execution of Spark job, the identity being used for the Spark job should be assigned **Contributor** and **Storage Blob Data Contributor** roles on the Azure storage account used for data input and output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.ml import MLClient, spark, Input, Output\n",
"from azure.identity import DefaultAzureCredential\n",
"from azure.ai.ml.entities import Environment\n",
"\n",
"subscription_id = \"<SUBSCRIPTION_ID>\"\n",
"resource_group = \"<RESOURCE_GROUP>\"\n",
"workspace = \"<AML_WORKSPACE_NAME>\"\n",
"ml_client = MLClient(\n",
" DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spark_job = spark(\n",
" display_name=\"featurestore_sample_test\",\n",
" code=\"./\",\n",
" entry={\"file\": \"featurestore_sdk_job.py\"},\n",
" driver_cores=1,\n",
" driver_memory=\"1g\",\n",
" executor_cores=1,\n",
" executor_memory=\"1g\",\n",
" executor_instances=1,\n",
" resources={\n",
" \"instance_type\": \"Standard_E8S_V3\",\n",
" \"runtime_version\": \"3.2.0\",\n",
" },\n",
" environment=Environment(conda_file=\"project/env/conda.yml\"),\n",
")\n",
"\n",
"returned_spark_job = ml_client.jobs.create_or_update(spark_job)\n",
"\n",
"print(returned_spark_job.id)\n",
"# Wait until the job completes\n",
"ml_client.jobs.stream(returned_spark_job.name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK V2",
"language": "python",
"name": "python310-sdkv2"
},
"language_info": {
"name": "python",
"version": "3.7.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6aeff17a1aa7735c2f7cb3a6d691fe1b4d4c3b8d2d650f644ad0f24e1b8e3f3f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
13 changes: 13 additions & 0 deletions sdk/python/readme.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def write_notebook_workflow(
"assets-component" in classification
)
is_spark_notebook_sample = ("jobs-spark" in classification) or ("_spark_" in name)
is_featurestore_sample = "featurestore_sample" in classification
creds = "${{secrets.AZUREML_CREDENTIALS}}"
# Duplicate name in working directory during checkout
# https://github.com/actions/checkout/issues/739
Expand Down Expand Up @@ -278,6 +279,8 @@ def write_notebook_workflow(
continue-on-error: true\n"""
if is_spark_notebook_sample:
workflow_yaml += get_spark_config_workflow(posix_folder, name)
if is_featurestore_sample:
workflow_yaml += get_featurestore_config_workflow(posix_folder, name)
workflow_yaml += f""" - name: run {posix_notebook}
run: |
source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
Expand Down Expand Up @@ -468,6 +471,16 @@ def get_spark_config_workflow(folder_name, file_name):
return workflow


def get_featurestore_config_workflow(folder_name: str, file_name: str) -> str:
    """Return the workflow YAML step that provisions feature-store resources.

    The returned snippet is spliced into a generated GitHub Actions workflow
    and runs ``setup-resources.sh`` before the sample notebook executes.

    Args:
        folder_name: unused here (the working directory is fixed); kept for
            signature parity with get_spark_config_workflow.
        file_name: notebook name without extension, passed to
            setup-resources.sh as ``{file_name}.ipynb``.

    Returns:
        A YAML fragment (string) ending in a newline.
    """
    # The leading spaces inside the f-string are the YAML indentation of the
    # generated workflow's `steps:` list -- do not re-wrap or re-indent.
    workflow = f"""    - name: setup feature-store resources
      run: |
          bash -x setup-resources.sh {file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""

    return workflow


@contextlib.contextmanager
def change_working_dir(path):
"""Context manager for changing the current working directory"""
Expand Down

0 comments on commit a3be24d

Please sign in to comment.