Skip to content

Commit

Permalink
Vvatsalya/fix cli oai v2 workflow (#2717)
Browse files Browse the repository at this point in the history
* setting location as ncus

* set in setup-cli step

* new init and setup script for oai v2

* correcting syntax for init sh

* fix init oai v2 script

* fix

* fix 1

* fix 2

* change training dataset name in cli oai v2 example
  • Loading branch information
vvatsalya authored Oct 9, 2023
1 parent 01b0410 commit 33d122b
Show file tree
Hide file tree
Showing 5 changed files with 383 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,31 @@ jobs:
# Job `build` (runs on ubuntu-latest). Step names visible below, in order:
# "check out repo", "azure login" (credentials read from the
# AZUREML_CREDENTIALS secret), "bootstrap resources", "setup-cli", "run job".
# The setup-cli step is marked `continue-on-error: true`; the bootstrap step
# is marked `continue-on-error: false`.
# NOTE(review): every step appears twice and both the old and the new script
# names are present (bootstrap.sh / bootstrap_oai_v2.sh, setup.sh /
# setup_oai_v2.sh, init_environment.sh / init_environment_oai_v2.sh). This
# looks like a diff view with removed and added lines interleaved and the
# indentation stripped -- confirm against the real workflow file before
# editing any of the lines below.
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
bash bootstrap_oai_v2.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/init_environment_oai_v2.sh";
bash setup_oai_v2.sh
working-directory: cli
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment_oai_v2.sh";
bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
[ -f "../../.azureml/config" ] && cat "../../.azureml/config";
bash -x ../dataset-create.sh
bash -x ../../../../run-job.sh openai_completions_finetune_pipeline_spec.yaml
working-directory: cli/foundation-models/azure_openai/oai-v2/openai_completions_finetune_pipeline
working-directory: cli/foundation-models/azure_openai/oai-v2/openai_completions_finetune_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ jobs:
type: pipeline
component: azureml://registries/azureml/components/openai_completions_finetune_pipeline/versions/0.0.9
inputs:
train_dataset:
train_dataset:
type: uri_folder
path: azureml:identity-train-chat:1
validation_dataset:
path: azureml:oai_sample_training_data:1
validation_dataset:
type: uri_folder
path: azureml:identity-train-chat:1
path: azureml:oai_sample_training_data:1
registered_model_name: ${{parent.inputs.registered_model_name}}
model: ${{parent.inputs.model}}
task_type: ${{parent.inputs.task_type}}
n_epochs: ${{parent.inputs.n_epochs}}
learning_rate_multiplier: ${{parent.inputs.learning_rate_multiplier}}
batch_size: ${{parent.inputs.batch_size}}
batch_size: ${{parent.inputs.batch_size}}
47 changes: 47 additions & 0 deletions cli/setup_oai_v2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

# Installs the Azure ML CLI extension and, when no resource group has been
# provided through RESOURCE_GROUP_NAME, points the az CLI defaults at the
# legacy resource group / workspace / location declared below.

### Installing a release candidate instead of the published extension:
###   * Point "$wheel_url" at the candidate wheel
###   * Uncomment the block below that is wrapped in {}
###   * Comment out the ml extension install inside <az_ml_install>

# {
#   wheel_url='https://azuremlsdktestpypi.blob.core.windows.net/wheels/sdk-cli-v2-public/ml-2.9.0-py3-none-any.whl'
#
#   az extension remove -n ml
#   if ! az extension add --yes --upgrade --source "$wheel_url"; then
#
#     echo "Error: Failed to install release candidate"
#     exit 1
#   fi
#   az version
#   unset wheel_url
# }


# <az_ml_install>
az extension add --name ml --yes
# </az_ml_install>

## Legacy defaults, kept for runs against the old subscription
# <set_variables>
GROUP="azureml-examples"
LOCATION="northcentralus"
WORKSPACE="main"
# </set_variables>

# Normalise RESOURCE_GROUP_NAME to an empty string when the caller did not
# set it; an empty value means `az configure` has not been run yet.
RESOURCE_GROUP_NAME="${RESOURCE_GROUP_NAME:-}"
if [[ -n "${RESOURCE_GROUP_NAME}" ]]; then
  echo "Workflows are using the new subscription."
else
  echo "No resource group name [RESOURCE_GROUP_NAME] specified, defaulting to ${GROUP}."
  # No bootstrap script has configured the CLI, so assume the run targets the
  # old subscription and fall back to the legacy defaults.

  # <az_configure_defaults>
  az configure --defaults group="${GROUP}" workspace="${WORKSPACE}" location="${LOCATION}"
  # </az_configure_defaults>
  echo "Default resource group set to $GROUP"
fi
166 changes: 166 additions & 0 deletions infra/bootstrapping/bootstrap_oai_v2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/bin/bash
# Strict mode: abort on the first failing command (errexit), on use of an
# unset variable (nounset) and on a failure in any pipeline stage (pipefail).
set -o errexit -o nounset -o pipefail
# set -o xtrace # For debugging

# Background reading on strict mode:
# https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/

### Usage: bash ./infra/bootstrapping/bootstrap_oai_v2.sh
### Bootstrapping script that creates the Resource group and Workspace.
### Assumes `az login` has been performed with sufficient permissions.

# Path of this script (falls back to $0 when BASH_SOURCE is unavailable) and
# the absolute directory that contains it.
SCRIPT_PATH="${BASH_SOURCE[0]:-$0}"
SCRIPT_DIR="$(cd "$(dirname "${SCRIPT_PATH}")" && pwd)"

###################
# REQUIRED ENVIRONMENT VARIABLES:
#
# RESOURCE_GROUP_NAME
# WORKSPACE_NAME
# LOCATION
# SUBSCRIPTION_ID
#
# NOTE(review): presumably init_environment_oai_v2.sh (sourced below) is what
# defines these; confirm, otherwise the caller must export them.
###############

# Open up permissions on the script directory so the helper scripts can be
# executed directly further down.
# NOTE(review): a recursive, world-writable 777 is much broader than the
# execute bit the helpers need; kept unchanged here, consider narrowing it.
if [[ -d "$SCRIPT_DIR" ]]; then
  sudo chmod -R 777 "$SCRIPT_DIR"
fi

# sdk_helpers.sh provides echo_title / echo_info, which the rest of this
# script calls unconditionally, so a missing helper file is fatal. Stop here
# with a clear message instead of failing later with "command not found".
if [[ -f "$SCRIPT_DIR"/sdk_helpers.sh ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR"/sdk_helpers.sh
else
  {
    echo "---------------------------------------------------------"
    echo -e "ERROR: sdk_helpers.sh not found."
    echo "---------------------------------------------------------"
  } >&2
  exit 1
fi

# A missing environment file is reported but not fatal at this point: the
# required variables may already be exported by the caller, and `nounset`
# aborts the script on first use of any that are not.
if [[ -f "$SCRIPT_DIR"/init_environment_oai_v2.sh ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR"/init_environment_oai_v2.sh
else
  {
    echo "---------------------------------------------------------"
    echo -e "ERROR: init_environment_oai_v2.sh not found."
    echo "---------------------------------------------------------"
  } >&2
fi

echo_title "Installing tools"
"${SCRIPT_DIR}/sdk_helpers.sh" install_tools

###################
# Dependency check: stop immediately when a required utility is missing.
###################

for required_tool in az sed; do
  "${SCRIPT_DIR}/sdk_helpers.sh" validate_tool "${required_tool}" || exit 1
done

# Log in to Azure only when there is no active session.
# The probe must be the `if` condition itself: this script runs with errexit,
# so a bare failing `az account show` would terminate the script before a
# separate `$?` check could ever trigger the `az login` fallback.
if ! az account show 1> /dev/null; then
  az login
fi

# Record the resource group, workspace and location as az CLI defaults for
# subsequent commands, then select the target subscription.
echo_title "RESOURCE_GROUP_NAME = \"${RESOURCE_GROUP_NAME}\" & LOCATION=\"${LOCATION}\" set as defaults. "
az configure --defaults group="${RESOURCE_GROUP_NAME}" workspace="${WORKSPACE_NAME}" location="${LOCATION}" # for subsequent commands.
az account set -s "${SUBSCRIPTION_ID}" || exit 1


# Full provisioning runs only when RUN_BOOTSTRAP is set to a non-empty value;
# otherwise only the dataset update is performed.
# RUN_BOOTSTRAP=1
if [[ -n "${RUN_BOOTSTRAP:-}" ]]; then

  echo_title "Ensuring Resource group"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_resourcegroup
  echo_title "Ensuring Workspace"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_ml_workspace "${WORKSPACE_NAME}"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_ml_workspace "mlw-mevnet"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_vnet "vnet-mevnet"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_subnet "vnet-mevnet" "snet-scoring"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_identity "uaimevnet"
  "$SCRIPT_DIR"/sdk_helpers.sh grant_permission_identity_on_acr "uaimevnet"

  echo_title "Ensuring Permissions on RG"
  "$SCRIPT_DIR"/sdk_helpers.sh grant_permission_app_id_on_rg "${APP_NAME}"

  echo_title "Ensuring Registry ${REGISTRY_NAME}"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_registry "${REGISTRY_NAME}"
  echo_title "Ensuring Registry of tomorrow ${REGISTRY_NAME_TOMORROW}"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_registry "${REGISTRY_NAME_TOMORROW}"

  echo_title "Ensuring CPU compute"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "cpu-cluster" 0 20 "Standard_DS3_v2"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "automl-cpu-cluster" 0 4 "Standard_DS3_v2"
  # Larger CPU cluster for Dask and Spark examples
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "cpu-cluster-lg" 0 4 "Standard_DS15_v2"

  echo_title "Ensuring GPU compute"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "gpu-cluster" 0 20 "STANDARD_NC6s_v3"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "automl-gpu-cluster" 0 4 "STANDARD_NC6s_v3"
  # v100 single GPU cluster for pytorch 2.0 based notebooks
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "gpu-v100-1GPU-cluster" 0 4 "Standard_NC6s_v3"
  # v100 GPU cluster for deepspeed cli examples
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "gpu-v100-cluster" 0 2 "Standard_ND40rs_v2"

  echo_title "Running prerequisites"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_prerequisites_in_workspace
  "$SCRIPT_DIR"/sdk_helpers.sh update_dataset
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_prerequisites_in_registry

  "$SCRIPT_DIR"/sdk_helpers.sh register_providers

  echo_title "Creating AKS clusters."
  configure_aks_cluster=(
    aks-cpu-is
    aks-cpu-ml
    aks-cpu-od
    aks-cpu-mc
    scoring-explain
  )
  # Create the clusters in parallel. errexit does not apply to background
  # jobs, so each PID is recorded and waited on individually; a bare `wait`
  # would discard the exit status of every job.
  aks_pids=()
  for aks_compute in "${configure_aks_cluster[@]}"; do
    (
      echo_info "Creating AKS cluster: '$aks_compute'"
      "$SCRIPT_DIR"/sdk_helpers.sh ensure_aks_compute "${aks_compute}" 1 3 "STANDARD_D3_V2"
    ) &
    aks_pids+=("$!")
  done
  # Wait for every job (not just up to the first failure) so no creation is
  # left running in the background, then fail if any of them failed.
  aks_failed=0
  for aks_pid in "${aks_pids[@]}"; do
    wait "${aks_pid}" || aks_failed=1
  done
  if (( aks_failed )); then
    echo "ERROR: one or more AKS clusters could not be created." >&2
    exit 1
  fi
  # Attaching is done sequentially, one cluster at a time.
  for aks_compute in "${configure_aks_cluster[@]}"; do
    (
      echo_info "Attaching AKS cluster: '$aks_compute'"
      "$SCRIPT_DIR"/sdk_helpers.sh install_k8s_extension "${aks_compute}" "managedClusters" "Microsoft.ContainerService/managedClusters"
      "$SCRIPT_DIR"/sdk_helpers.sh setup_compute "${aks_compute}" "${aks_compute}" "managedClusters" "azureml"
    )
  done
  echo_info ">>> Done creating AKS clusters"

  # Arc cluster configuration
  # The expansion is left unquoted to preserve existing behavior: an empty
  # ARC_CLUSTER_NAME yields an empty list rather than one empty element.
  # shellcheck disable=SC2206
  configure_arc_cluster=(
    ${ARC_CLUSTER_NAME}
  )
  for arc_compute in "${configure_arc_cluster[@]}"; do
    (
      echo_info "Creating amlarc cluster: '$arc_compute'"
      "$SCRIPT_DIR"/sdk_helpers.sh ensure_aks_compute "${arc_compute}" 1 3 "STANDARD_D3_V2"
      "$SCRIPT_DIR"/sdk_helpers.sh install_k8s_extension "${arc_compute}" "connectedClusters" "Microsoft.Kubernetes/connectedClusters"
      "$SCRIPT_DIR"/sdk_helpers.sh setup_compute "${arc_compute}-arc" "${ARC_COMPUTE_NAME}" "connectedClusters" "azureml"
      "$SCRIPT_DIR"/sdk_helpers.sh setup_instance_type_aml_arc "${arc_compute}"
    )
  done
  echo_info ">>> Done creating amlarc clusters"
  "$SCRIPT_DIR"/sdk_helpers.sh vmss_upgrade_policy_all_rg
  # echo_title "Copying data"
  # "$SCRIPT_DIR"/sdk_helpers.sh install_azcopy
  # "$SCRIPT_DIR"/sdk_helpers.sh copy_dataset

else
  "$SCRIPT_DIR"/sdk_helpers.sh update_dataset
  echo_info "Skipping Bootstrapping. Set the RUN_BOOTSTRAP environment variable to enable bootstrapping."
fi

echo_title "✅ Resource provisioning completed..."

Loading

0 comments on commit 33d122b

Please sign in to comment.