Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
savitamittal1 authored Sep 7, 2023
2 parents ba8d7da + a3be24d commit 4d891e3
Show file tree
Hide file tree
Showing 17 changed files with 1,088 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: sdk-featurestore_sample-test_featurestore_sdk_samples
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:
  workflow_dispatch:
  schedule:
    - cron: "22 2/12 * * *"
  pull_request:
    branches:
      - main
    paths:
      - sdk/python/featurestore_sample/**
      - .github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh
concurrency:
  # One concurrent run per PR (or per ref otherwise); newer runs cancel older ones.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: setup python
        uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: pip install notebook reqs
        run: pip install -r sdk/python/dev-requirements.txt
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZUREML_CREDENTIALS}}
      - name: bootstrap resources
        run: |
          echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
          bash bootstrap.sh
        working-directory: infra/bootstrapping
        continue-on-error: false
      - name: setup SDK
        run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
        working-directory: sdk/python
        continue-on-error: true
      - name: setup-cli
        run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
        working-directory: cli
        continue-on-error: true
      - name: setup feature-store resources
        run: |
          bash -x setup-resources.sh test_featurestore_sdk_samples.ipynb
        working-directory: sdk/python/featurestore_sample
        continue-on-error: true
      - name: run featurestore_sample/test_featurestore_sdk_samples.ipynb
        run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "test_featurestore_sdk_samples.ipynb";
          # NOTE(review): the config is generated as config.json above; the previous
          # check looked for ".azureml/config" and therefore could never match.
          # (If this file is regenerated, the fix belongs in sdk/python/readme.py.)
          [ -f "../../.azureml/config.json" ] && cat "../../.azureml/config.json";
          papermill -k python test_featurestore_sdk_samples.ipynb test_featurestore_sdk_samples.output.ipynb
        working-directory: sdk/python/featurestore_sample
      - name: upload notebook's working folder as an artifact
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: test_featurestore_sdk_samples
          path: sdk/python/featurestore_sample
2 changes: 1 addition & 1 deletion cli/endpoints/online/model-1/environment/conda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ dependencies:
- scikit-learn=1.2.2
- scipy=1.10.1
- pip:
- azureml-defaults==1.49.0
- azureml-defaults==1.53.0
- inference-schema[numpy-support]==1.5.1
- joblib==1.2.0
2 changes: 1 addition & 1 deletion cli/endpoints/online/model-2/environment/conda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ dependencies:
- scikit-learn=1.2.2
- scipy=1.10.1
- pip:
- azureml-defaults==1.49.0
- azureml-defaults==1.53.0
- inference-schema[numpy-support]==1.5.1
- joblib==1.2.0
2 changes: 1 addition & 1 deletion infra/bootstrapping/init_environment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ let "REGISTRY_TOMORROW=10#$(date -d '+1 days' +'%m%d')"
export LOCATION="East US"
export PREFIX=aml
export SUFFIX=sdkv202
export APP_NAME="github-sp-amlsdkv2-gh"
export APP_NAME="github-sp-amlsdkv2-gh-2"
export timestamp=$(date +%s)
# export RESOURCE_GROUP_NAME=test-data-rg
# export WORKSPACE_NAME=${PREFIX}${SUFFIX}${DATE_ONLY}-ws
Expand Down
26 changes: 26 additions & 0 deletions sdk/python/featurestore_sample/featurestore_sdk_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Spark entry script that runs the feature-store sample notebooks as plain scripts.

The sample notebooks are converted to ``.py`` files beforehand (see
setup-resources.sh in this directory); this driver reads each script's text and
executes it in the current process so it runs against the Spark session below.
"""
from pyspark.sql import SparkSession

# The notebook code expects an active SparkSession to exist.
spark = SparkSession.builder.appName("AccessData").getOrCreate()

NOTEBOOK_1 = (
    "notebooks/sdk_only/1. Develop a feature set and register with managed feature store.py"
)

print("=======Test Notebook 1============")
with open(NOTEBOOK_1) as script:
    exec(script.read())

# Notebooks 2-4 stay disabled until notebook 1 is stable; re-enable them by
# uncommenting the corresponding blocks below.
# print("=======Test Notebook 2============")
# with open(
#     "notebooks/sdk_only/2. Enable materialization and backfill feature data.py"
# ) as script:
#     exec(script.read())

# print("=======Test Notebook 3============")
# with open("notebooks/sdk_only/3. Experiment and train models using features.py") as script:
#     exec(script.read())

# print("=======Test Notebook 4============")
# with open(
#     "notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.py"
# ) as script:
#     exec(script.read())
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@
"\n",
"# Please update your alias below (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down Expand Up @@ -419,7 +419,7 @@
"# storage\n",
"storage_subscription_id = os.environ[\"AZUREML_ARM_SUBSCRIPTION\"]\n",
"storage_resource_group_name = os.environ[\"AZUREML_ARM_RESOURCEGROUP\"]\n",
"storage_account_name = \"fstorestorage\"\n",
"storage_account_name = \"<FEATURE_STORAGE_ACCOUNT_NAME>\"\n",
"storage_location = ws_location\n",
"storage_file_system_name = \"offlinestore\""
]
Expand Down Expand Up @@ -759,7 +759,7 @@
"outputs": [],
"source": [
"# This utility function is created for ease of use in the docs tutorials. It uses standard azure API's. You can optionally inspect it `featurestore/setup/setup_storage_uai.py`\n",
"your_aad_objectid = \"<your_aad_objectId>\"\n",
"your_aad_objectid = \"<USER_AAD_OBJECTID>\"\n",
"\n",
"grant_user_aad_storage_data_reader_role(\n",
" AzureMLOnBehalfOfCredential(),\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
36 changes: 36 additions & 0 deletions sdk/python/featurestore_sample/setup-resources.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Prepare the feature-store SDK sample for CI:
#   1. convert the four sample notebooks to .py scripts with jupytext
#   2. substitute placeholder template values into the test notebook ($1)
#      and into the converted scripts.
# Usage: bash setup-resources.sh <test-notebook.ipynb>
#
# The "# <name> ... # </name>" marker pairs delimit snippet regions that are
# extracted into documentation — keep them matched.

pip install --upgrade jupytext

# <create_variables>
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
LOCATION=$(az ml workspace show --query location -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
OUTPUT_COMMAND="print"
FEATURE_STORAGE_ACCOUNT_NAME=${RESOURCE_GROUP}fs
# NOTE(review): hard-coded AAD object id of the CI identity — verify it stays
# valid when the service principal is rotated.
USER_ID="36b5b70a-a2b2-45e6-a496-df3c2ffde085"
# </create_variables>

# <convert_notebook_to_py>
NOTEBOOK_1="notebooks/sdk_only/1. Develop a feature set and register with managed feature store"
NOTEBOOK_2="notebooks/sdk_only/2. Enable materialization and backfill feature data"
NOTEBOOK_3="notebooks/sdk_only/3. Experiment and train models using features"
NOTEBOOK_4="notebooks/sdk_only/4. Enable recurrent materialization and run batch inference"
jupytext --to py "${NOTEBOOK_1}.ipynb"
jupytext --to py "${NOTEBOOK_2}.ipynb"
jupytext --to py "${NOTEBOOK_3}.ipynb"
jupytext --to py "${NOTEBOOK_4}.ipynb"
# </convert_notebook_to_py>

# <replace_template_values>
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
s/<AML_WORKSPACE_NAME>/$AML_WORKSPACE_NAME/g;" "$1"
# </replace_template_values>

# Replace display() with print (no notebook frontend in CI) and rebase the
# sample root dir from ./Users/<alias>/featurestore_sample to the CWD.
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_1}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;
s/<FEATURE_STORAGE_ACCOUNT_NAME>/$FEATURE_STORAGE_ACCOUNT_NAME/g;
s/<USER_AAD_OBJECTID>/$USER_ID/g;" "${NOTEBOOK_2}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_3}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_4}.py"
88 changes: 88 additions & 0 deletions sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use a serverless Spark compute"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"You should have an attached Synapse Spark pool available in your workspace. Please see documentation page: [Attach and manage a Synapse Spark pool in Azure Machine Learning (preview)](https://learn.microsoft.com/azure/machine-learning/how-to-manage-synapse-spark-pool) for more details.\n",
"\n",
"**Note** - To ensure successful execution of Spark job, the identity being used for the Spark job should be assigned **Contributor** and **Storage Blob Data Contributor** roles on the Azure storage account used for data input and output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.ml import MLClient, spark, Input, Output\n",
"from azure.identity import DefaultAzureCredential\n",
"from azure.ai.ml.entities import Environment\n",
"\n",
"subscription_id = \"<SUBSCRIPTION_ID>\"\n",
"resource_group = \"<RESOURCE_GROUP>\"\n",
"workspace = \"<AML_WORKSPACE_NAME>\"\n",
"ml_client = MLClient(\n",
" DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spark_job = spark(\n",
" display_name=\"featurestore_sample_test\",\n",
" code=\"./\",\n",
" entry={\"file\": \"featurestore_sdk_job.py\"},\n",
" driver_cores=1,\n",
" driver_memory=\"1g\",\n",
" executor_cores=1,\n",
" executor_memory=\"1g\",\n",
" executor_instances=1,\n",
" resources={\n",
" \"instance_type\": \"Standard_E8S_V3\",\n",
" \"runtime_version\": \"3.2.0\",\n",
" },\n",
" environment=Environment(conda_file=\"project/env/conda.yml\"),\n",
")\n",
"\n",
"returned_spark_job = ml_client.jobs.create_or_update(spark_job)\n",
"\n",
"print(returned_spark_job.id)\n",
"# Wait until the job completes\n",
"ml_client.jobs.stream(returned_spark_job.name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK V2",
"language": "python",
"name": "python310-sdkv2"
},
"language_info": {
"name": "python",
"version": "3.7.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6aeff17a1aa7735c2f7cb3a6d691fe1b4d4c3b8d2d650f644ad0f24e1b8e3f3f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@
"except:\n",
" workspace_ml_client = MLClient(\n",
" credential,\n",
" subscription_id=\"ed2cab61-14cc-4fb3-ac23-d72609214cfd\",\n",
" resource_group_name=\"training_rg\",\n",
" workspace_name=\"train-finetune-dev-workspace\",\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group_name=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" )\n",
"\n",
"# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Download a Hugging Face dataset and save each split as a JSON-lines file."""
# import library to parse command line arguments
import argparse
import os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
# add an argument to specify the dataset subset/configuration name
parser.add_argument(
    "--dataset_subset", type=str, default="split", help="dataset subset name"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="data",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# Create the download directory if it does not exist. exist_ok avoids the
# check-then-create race of the previous os.path.exists + os.makedirs pair.
os.makedirs(args.download_dir, exist_ok=True)


# import hugging face datasets library (kept after argument parsing, matching
# the original ordering, so bad CLI args fail before the heavy import)
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
    # NOTE(review): the original ended with a dangling "# print dataset features"
    # comment with no code — confirm whether printing dataset.features was intended.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading

0 comments on commit 4d891e3

Please sign in to comment.