Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
savitamittal1 authored Sep 7, 2023
2 parents ba8d7da + a3be24d commit 4d891e3
Show file tree
Hide file tree
Showing 17 changed files with 1,088 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: sdk-featurestore_sample-test_featurestore_sdk_samples
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:
  workflow_dispatch:
  schedule:
    - cron: "22 2/12 * * *"
  pull_request:
    branches:
      - main
    paths:
      - sdk/python/featurestore_sample/**
      - .github/workflows/sdk-featurestore_sample-test_featurestore_sdk_samples.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh
concurrency:
  # One concurrent run per PR (or per ref otherwise); newer runs cancel older ones.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: setup python
        uses: actions/setup-python@v2
        with:
          python-version: "3.8"
      - name: pip install notebook reqs
        run: pip install -r sdk/python/dev-requirements.txt
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZUREML_CREDENTIALS}}
      - name: bootstrap resources
        run: |
          echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
          bash bootstrap.sh
        working-directory: infra/bootstrapping
        continue-on-error: false
      - name: setup SDK
        run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
        working-directory: sdk/python
        continue-on-error: true
      - name: setup-cli
        run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
        working-directory: cli
        continue-on-error: true
      - name: setup feature-store resources
        run: |
          bash -x setup-resources.sh test_featurestore_sdk_samples.ipynb
        working-directory: sdk/python/featurestore_sample
        continue-on-error: true
      - name: run featurestore_sample/test_featurestore_sdk_samples.ipynb
        run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "test_featurestore_sdk_samples.ipynb";
          # NOTE(review): the config is generated as config.json above; the previous
          # check looked for ".azureml/config" and therefore could never match.
          # (If this file is regenerated, the fix belongs in sdk/python/readme.py.)
          [ -f "../../.azureml/config.json" ] && cat "../../.azureml/config.json";
          papermill -k python test_featurestore_sdk_samples.ipynb test_featurestore_sdk_samples.output.ipynb
        working-directory: sdk/python/featurestore_sample
      - name: upload notebook's working folder as an artifact
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: test_featurestore_sdk_samples
          path: sdk/python/featurestore_sample
2 changes: 1 addition & 1 deletion cli/endpoints/online/model-1/environment/conda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ dependencies:
- scikit-learn=1.2.2
- scipy=1.10.1
- pip:
- azureml-defaults==1.49.0
- azureml-defaults==1.53.0
- inference-schema[numpy-support]==1.5.1
- joblib==1.2.0
2 changes: 1 addition & 1 deletion cli/endpoints/online/model-2/environment/conda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ dependencies:
- scikit-learn=1.2.2
- scipy=1.10.1
- pip:
- azureml-defaults==1.49.0
- azureml-defaults==1.53.0
- inference-schema[numpy-support]==1.5.1
- joblib==1.2.0
2 changes: 1 addition & 1 deletion infra/bootstrapping/init_environment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ let "REGISTRY_TOMORROW=10#$(date -d '+1 days' +'%m%d')"
export LOCATION="East US"
export PREFIX=aml
export SUFFIX=sdkv202
export APP_NAME="github-sp-amlsdkv2-gh"
export APP_NAME="github-sp-amlsdkv2-gh-2"
export timestamp=$(date +%s)
# export RESOURCE_GROUP_NAME=test-data-rg
# export WORKSPACE_NAME=${PREFIX}${SUFFIX}${DATE_ONLY}-ws
Expand Down
26 changes: 26 additions & 0 deletions sdk/python/featurestore_sample/featurestore_sdk_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Spark entry script that runs the feature-store sample notebooks as plain scripts.

The sample notebooks are converted to ``.py`` files beforehand (see
setup-resources.sh in this directory); this driver reads each script's text and
executes it in the current process so it runs against the Spark session below.
"""
from pyspark.sql import SparkSession

# The notebook code expects an active SparkSession to exist.
spark = SparkSession.builder.appName("AccessData").getOrCreate()

NOTEBOOK_1 = (
    "notebooks/sdk_only/1. Develop a feature set and register with managed feature store.py"
)

print("=======Test Notebook 1============")
with open(NOTEBOOK_1) as script:
    exec(script.read())

# Notebooks 2-4 stay disabled until notebook 1 is stable; re-enable them by
# uncommenting the corresponding blocks below.
# print("=======Test Notebook 2============")
# with open(
#     "notebooks/sdk_only/2. Enable materialization and backfill feature data.py"
# ) as script:
#     exec(script.read())

# print("=======Test Notebook 3============")
# with open("notebooks/sdk_only/3. Experiment and train models using features.py") as script:
#     exec(script.read())

# print("=======Test Notebook 4============")
# with open(
#     "notebooks/sdk_only/4. Enable recurrent materialization and run batch inference.py"
# ) as script:
#     exec(script.read())
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@
"\n",
"# Please update your alias below (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down Expand Up @@ -419,7 +419,7 @@
"# storage\n",
"storage_subscription_id = os.environ[\"AZUREML_ARM_SUBSCRIPTION\"]\n",
"storage_resource_group_name = os.environ[\"AZUREML_ARM_RESOURCEGROUP\"]\n",
"storage_account_name = \"fstorestorage\"\n",
"storage_account_name = \"<FEATURE_STORAGE_ACCOUNT_NAME>\"\n",
"storage_location = ws_location\n",
"storage_file_system_name = \"offlinestore\""
]
Expand Down Expand Up @@ -759,7 +759,7 @@
"outputs": [],
"source": [
"# This utility function is created for ease of use in the docs tutorials. It uses standard azure API's. You can optionally inspect it `featurestore/setup/setup_storage_uai.py`\n",
"your_aad_objectid = \"<your_aad_objectId>\"\n",
"your_aad_objectid = \"<USER_AAD_OBJECTID>\"\n",
"\n",
"grant_user_aad_storage_data_reader_role(\n",
" AzureMLOnBehalfOfCredential(),\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@
"\n",
"# please update the dir to ./Users/{your-alias} (or any custom directory you uploaded the samples to).\n",
"# You can find the name from the directory structure in the left nav\n",
"root_dir = \"./Users/<your user alias>/featurestore_sample\"\n",
"root_dir = \"./Users/<your_user_alias>/featurestore_sample\"\n",
"\n",
"if os.path.isdir(root_dir):\n",
" print(\"The folder exists.\")\n",
Expand Down
36 changes: 36 additions & 0 deletions sdk/python/featurestore_sample/setup-resources.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Prepare the feature-store SDK sample for CI:
#   1. convert the four sample notebooks to .py scripts with jupytext
#   2. substitute placeholder template values into the test notebook ($1)
#      and into the converted scripts.
# Usage: bash setup-resources.sh <test-notebook.ipynb>
#
# The "# <name> ... # </name>" marker pairs delimit snippet regions that are
# extracted into documentation — keep them matched.

pip install --upgrade jupytext

# <create_variables>
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
LOCATION=$(az ml workspace show --query location -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
OUTPUT_COMMAND="print"
FEATURE_STORAGE_ACCOUNT_NAME=${RESOURCE_GROUP}fs
# NOTE(review): hard-coded AAD object id of the CI identity — verify it stays
# valid when the service principal is rotated.
USER_ID="36b5b70a-a2b2-45e6-a496-df3c2ffde085"
# </create_variables>

# <convert_notebook_to_py>
NOTEBOOK_1="notebooks/sdk_only/1. Develop a feature set and register with managed feature store"
NOTEBOOK_2="notebooks/sdk_only/2. Enable materialization and backfill feature data"
NOTEBOOK_3="notebooks/sdk_only/3. Experiment and train models using features"
NOTEBOOK_4="notebooks/sdk_only/4. Enable recurrent materialization and run batch inference"
jupytext --to py "${NOTEBOOK_1}.ipynb"
jupytext --to py "${NOTEBOOK_2}.ipynb"
jupytext --to py "${NOTEBOOK_3}.ipynb"
jupytext --to py "${NOTEBOOK_4}.ipynb"
# </convert_notebook_to_py>

# <replace_template_values>
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
s/<AML_WORKSPACE_NAME>/$AML_WORKSPACE_NAME/g;" "$1"
# </replace_template_values>

# Replace display() with print (no notebook frontend in CI) and rebase the
# sample root dir from ./Users/<alias>/featurestore_sample to the CWD.
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_1}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;
s/<FEATURE_STORAGE_ACCOUNT_NAME>/$FEATURE_STORAGE_ACCOUNT_NAME/g;
s/<USER_AAD_OBJECTID>/$USER_ID/g;" "${NOTEBOOK_2}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_3}.py"
sed -i "s/display/$OUTPUT_COMMAND/g;s/.\/Users\/<your_user_alias>\/featurestore_sample/.\//g;" "${NOTEBOOK_4}.py"
88 changes: 88 additions & 0 deletions sdk/python/featurestore_sample/test_featurestore_sdk_samples.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use a serverless Spark compute"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"You should have an attached Synapse Spark pool available in your workspace. Please see documentation page: [Attach and manage a Synapse Spark pool in Azure Machine Learning (preview)](https://learn.microsoft.com/azure/machine-learning/how-to-manage-synapse-spark-pool) for more details.\n",
"\n",
"**Note** - To ensure successful execution of Spark job, the identity being used for the Spark job should be assigned **Contributor** and **Storage Blob Data Contributor** roles on the Azure storage account used for data input and output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.ml import MLClient, spark, Input, Output\n",
"from azure.identity import DefaultAzureCredential\n",
"from azure.ai.ml.entities import Environment\n",
"\n",
"subscription_id = \"<SUBSCRIPTION_ID>\"\n",
"resource_group = \"<RESOURCE_GROUP>\"\n",
"workspace = \"<AML_WORKSPACE_NAME>\"\n",
"ml_client = MLClient(\n",
" DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spark_job = spark(\n",
" display_name=\"featurestore_sample_test\",\n",
" code=\"./\",\n",
" entry={\"file\": \"featurestore_sdk_job.py\"},\n",
" driver_cores=1,\n",
" driver_memory=\"1g\",\n",
" executor_cores=1,\n",
" executor_memory=\"1g\",\n",
" executor_instances=1,\n",
" resources={\n",
" \"instance_type\": \"Standard_E8S_V3\",\n",
" \"runtime_version\": \"3.2.0\",\n",
" },\n",
" environment=Environment(conda_file=\"project/env/conda.yml\"),\n",
")\n",
"\n",
"returned_spark_job = ml_client.jobs.create_or_update(spark_job)\n",
"\n",
"print(returned_spark_job.id)\n",
"# Wait until the job completes\n",
"ml_client.jobs.stream(returned_spark_job.name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK V2",
"language": "python",
"name": "python310-sdkv2"
},
"language_info": {
"name": "python",
"version": "3.7.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6aeff17a1aa7735c2f7cb3a6d691fe1b4d4c3b8d2d650f644ad0f24e1b8e3f3f"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@
"except:\n",
" workspace_ml_client = MLClient(\n",
" credential,\n",
" subscription_id=\"ed2cab61-14cc-4fb3-ac23-d72609214cfd\",\n",
" resource_group_name=\"training_rg\",\n",
" workspace_name=\"train-finetune-dev-workspace\",\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group_name=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" )\n",
"\n",
"# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml\"\n",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Download a Hugging Face dataset and save each split as a JSON-lines file."""
# import library to parse command line arguments
import argparse
import os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
# add an argument to specify the dataset subset/configuration name
parser.add_argument(
    "--dataset_subset", type=str, default="split", help="dataset subset name"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="data",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# Create the download directory if it does not exist. exist_ok avoids the
# check-then-create race of the previous os.path.exists + os.makedirs pair.
os.makedirs(args.download_dir, exist_ok=True)


# import hugging face datasets library (kept after argument parsing, matching
# the original ordering, so bad CLI args fail before the heavy import)
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
    # NOTE(review): the original ended with a dangling "# print dataset features"
    # comment with no code — confirm whether printing dataset.features was intended.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading

0 comments on commit 4d891e3

Please sign in to comment.