Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vvatsalya/fix cli oai v2 workflow #2717

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,31 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
bash bootstrap_oai_v2.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/init_environment_oai_v2.sh";
bash setup_oai_v2.sh
working-directory: cli
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment_oai_v2.sh";
bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
[ -f "../../.azureml/config" ] && cat "../../.azureml/config";
bash -x ../dataset-create.sh
bash -x ../../../../run-job.sh openai_completions_finetune_pipeline_spec.yaml
working-directory: cli/foundation-models/azure_openai/oai-v2/openai_completions_finetune_pipeline
working-directory: cli/foundation-models/azure_openai/oai-v2/openai_completions_finetune_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ jobs:
type: pipeline
component: azureml://registries/azureml/components/openai_completions_finetune_pipeline/versions/0.0.9
inputs:
train_dataset:
train_dataset:
type: uri_folder
path: azureml:identity-train-chat:1
validation_dataset:
path: azureml:oai_sample_training_data:1
validation_dataset:
type: uri_folder
path: azureml:identity-train-chat:1
path: azureml:oai_sample_training_data:1
registered_model_name: ${{parent.inputs.registered_model_name}}
model: ${{parent.inputs.model}}
task_type: ${{parent.inputs.task_type}}
n_epochs: ${{parent.inputs.n_epochs}}
learning_rate_multiplier: ${{parent.inputs.learning_rate_multiplier}}
batch_size: ${{parent.inputs.batch_size}}
batch_size: ${{parent.inputs.batch_size}}
47 changes: 47 additions & 0 deletions cli/setup_oai_v2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
#
# Installs the Azure CLI `ml` extension and, when RESOURCE_GROUP_NAME is not
# provided by the caller, configures the az CLI defaults (group, workspace,
# location) from the hard-coded values below.
#
# Environment:
#   RESOURCE_GROUP_NAME  optional; when non-empty the az defaults are left
#                        untouched (they are expected to be configured
#                        elsewhere — presumably by the bootstrap script).

### If installing a release candidate:
###  * Update the "$wheel_url"
###  * Uncomment the following block surrounded by {}
###  * Comment the ml extension install within <az_ml_install>

# {
#    wheel_url='https://azuremlsdktestpypi.blob.core.windows.net/wheels/sdk-cli-v2-public/ml-2.9.0-py3-none-any.whl'
#
#    az extension remove -n ml
#    if ! az extension add --yes --upgrade --source "$wheel_url"; then
#
#        echo "Error: Failed to install release candidate"
#        exit 1
#    fi
#    az version
#    unset wheel_url
# }


# <az_ml_install>
az extension add -n ml -y
# </az_ml_install>

## For backward compatibility - running on old subscription
# <set_variables>
GROUP="azureml-examples"
LOCATION="northcentralus"
WORKSPACE="main"
# </set_variables>

# If RESOURCE_GROUP_NAME is empty, the az configure is pending.
# The `:-` default keeps this safe even if the caller enabled `set -u`.
RESOURCE_GROUP_NAME=${RESOURCE_GROUP_NAME:-}
if [[ -z "$RESOURCE_GROUP_NAME" ]]; then
  echo "No resource group name [RESOURCE_GROUP_NAME] specified, defaulting to ${GROUP}."
  # Assume the run is on the old subscription without the bootstrap script,
  # so the az defaults have to be configured here.

  # <az_configure_defaults>
  az configure --defaults group="$GROUP" workspace="$WORKSPACE" location="$LOCATION"
  # </az_configure_defaults>
  echo "Default resource group set to $GROUP"
else
  echo "Workflows are using the new subscription."
fi
166 changes: 166 additions & 0 deletions infra/bootstrapping/bootstrap_oai_v2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/bin/bash
#
# Bootstrapping script that creates the Resource group and Workspace, plus the
# supporting resources listed below when RUN_BOOTSTRAP is set.
# This assumes you have performed az login and have sufficient permissions.
#
# Usage: bash ./infra/bootstrapping/bootstrap_oai_v2.sh
#
# REQUIRED ENVIRONMENT VARIABLES (may also be provided by
# init_environment_oai_v2.sh, which is sourced below when present):
#
#   RESOURCE_GROUP_NAME
#   WORKSPACE_NAME
#   LOCATION
#   SUBSCRIPTION_ID
#
# OPTIONAL:
#   RUN_BOOTSTRAP  when non-empty, run the full provisioning branch; otherwise
#                  only `update_dataset` is executed.
#
# The full provisioning branch additionally reads APP_NAME, REGISTRY_NAME,
# REGISTRY_NAME_TOMORROW, ARC_CLUSTER_NAME and ARC_COMPUTE_NAME.

# Strict mode: exit on error, on unset variables and on failures inside pipelines.
# https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -euo pipefail
# set -o xtrace # For debugging

# Location of this script; the helper scripts are resolved relative to it.
SCRIPT_PATH="${BASH_SOURCE[0]:-$0}"
SCRIPT_DIR="$( cd "$( dirname "${SCRIPT_PATH}" )" && pwd )"

# update directory with full permissions
# NOTE(review): 777 makes everything under the script directory world-writable;
# consider narrowing this (e.g. only adding the execute bit to the helpers).
if [ -d "$SCRIPT_DIR" ]; then
  sudo chmod -R 777 "$SCRIPT_DIR"
fi

# Load the helper functions. echo_title / echo_info used below are presumably
# defined there — if the file is missing, the first echo_title call aborts the
# script because of `set -e`.
if [ -f "$SCRIPT_DIR"/sdk_helpers.sh ]; then
  source "$SCRIPT_DIR"/sdk_helpers.sh;
else
  echo "---------------------------------------------------------"
  echo -e "ERROR: sdk_helpers.sh not found."
  echo "---------------------------------------------------------"
fi

# Load the environment definition for this workflow, when present.
if [ -f "$SCRIPT_DIR"/init_environment_oai_v2.sh ]; then
  source "$SCRIPT_DIR"/init_environment_oai_v2.sh;
else
  echo "---------------------------------------------------------"
  echo -e "ERROR: init_environment_oai_v2.sh not found."
  echo "---------------------------------------------------------"
fi

echo_title "Installing tools"
"$SCRIPT_DIR"/sdk_helpers.sh install_tools

###################
# validate dependencies if the required utilities are installed
###################

"$SCRIPT_DIR"/sdk_helpers.sh validate_tool az || exit 1
"$SCRIPT_DIR"/sdk_helpers.sh validate_tool sed || exit 1

# Login to azure using your credentials when no account is active.
# The check has to live in the `if` condition: with `set -e` a bare failing
# `az account show` would abort the script before `az login` could run.
if ! az account show 1> /dev/null; then
  az login
fi

echo_title "RESOURCE_GROUP_NAME = \"${RESOURCE_GROUP_NAME}\" & LOCATION=\"${LOCATION}\" set as defaults. "
az configure --defaults group="${RESOURCE_GROUP_NAME}" workspace="${WORKSPACE_NAME}" location="${LOCATION}"  # for subsequent commands.
az account set -s "${SUBSCRIPTION_ID}" || exit 1


# RUN_BOOTSTRAP=1
if [[ -n "${RUN_BOOTSTRAP:-}" ]]; then

  echo_title "Ensuring Resource group"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_resourcegroup
  echo_title "Ensuring Workspace"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_ml_workspace "${WORKSPACE_NAME}"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_ml_workspace "mlw-mevnet"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_vnet "vnet-mevnet"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_subnet "vnet-mevnet" "snet-scoring"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_identity "uaimevnet"
  "$SCRIPT_DIR"/sdk_helpers.sh grant_permission_identity_on_acr "uaimevnet"

  echo_title "Ensuring Permissions on RG"
  "$SCRIPT_DIR"/sdk_helpers.sh grant_permission_app_id_on_rg "${APP_NAME}"

  echo_title "Ensuring Registry ${REGISTRY_NAME}"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_registry "${REGISTRY_NAME}"
  echo_title "Ensuring Registry of tomorrow ${REGISTRY_NAME_TOMORROW}"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_registry "${REGISTRY_NAME_TOMORROW}"

  echo_title "Ensuring CPU compute"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "cpu-cluster" 0 20 "Standard_DS3_v2"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "automl-cpu-cluster" 0 4 "Standard_DS3_v2"
  # Larger CPU cluster for Dask and Spark examples
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "cpu-cluster-lg" 0 4 "Standard_DS15_v2"

  echo_title "Ensuring GPU compute"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "gpu-cluster" 0 20 "STANDARD_NC6s_v3"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "automl-gpu-cluster" 0 4 "STANDARD_NC6s_v3"
  # v100 single GPU cluster for pytorch 2.0 based notebooks
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "gpu-v100-1GPU-cluster" 0 4 "Standard_NC6s_v3"
  # v100 GPU cluster for deepspeed cli examples
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_aml_compute "gpu-v100-cluster" 0 2 "Standard_ND40rs_v2"

  echo_title "Running prerequisites"
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_prerequisites_in_workspace
  "$SCRIPT_DIR"/sdk_helpers.sh update_dataset
  "$SCRIPT_DIR"/sdk_helpers.sh ensure_prerequisites_in_registry

  "$SCRIPT_DIR"/sdk_helpers.sh register_providers

  echo_title "Creating AKS clusters."
  configure_aks_cluster=(
    aks-cpu-is
    aks-cpu-ml
    aks-cpu-od
    aks-cpu-mc
    scoring-explain
  )
  # Create all AKS clusters in parallel, one background subshell per cluster.
  for aks_compute in "${configure_aks_cluster[@]}"; do
    (
      echo_info "Creating AKS cluster: '$aks_compute'"
      "$SCRIPT_DIR"/sdk_helpers.sh ensure_aks_compute "${aks_compute}" 1 3 "STANDARD_D3_V2"
    ) &
  done
  # NOTE(review): a bare `wait` does not report the exit status of the
  # background jobs, so a failed cluster creation is not detected here.
  wait # until all AKS are created
  # Attach the clusters one after another.
  for aks_compute in "${configure_aks_cluster[@]}"; do
    (
      echo_info "Attaching AKS cluster: '$aks_compute'"
      "$SCRIPT_DIR"/sdk_helpers.sh install_k8s_extension "${aks_compute}" "managedClusters" "Microsoft.ContainerService/managedClusters"
      "$SCRIPT_DIR"/sdk_helpers.sh setup_compute "${aks_compute}" "${aks_compute}" "managedClusters" "azureml"
    )
  done
  echo_info ">>> Done creating AKS clusters"

  # Arc cluster configuration
  # ARC_CLUSTER_NAME is left unquoted as in the original: an empty value then
  # yields an empty list and the loop below is skipped.
  configure_arc_cluster=(
    ${ARC_CLUSTER_NAME}
  )
  for arc_compute in "${configure_arc_cluster[@]}"; do
    (
      echo_info "Creating amlarc cluster: '$arc_compute'"
      "$SCRIPT_DIR"/sdk_helpers.sh ensure_aks_compute "${arc_compute}" 1 3 "STANDARD_D3_V2"
      "$SCRIPT_DIR"/sdk_helpers.sh install_k8s_extension "${arc_compute}" "connectedClusters" "Microsoft.Kubernetes/connectedClusters"
      "$SCRIPT_DIR"/sdk_helpers.sh setup_compute "${arc_compute}-arc" "${ARC_COMPUTE_NAME}" "connectedClusters" "azureml"
      "$SCRIPT_DIR"/sdk_helpers.sh setup_instance_type_aml_arc "${arc_compute}"
    )
  done
  echo_info ">>> Done creating amlarc clusters"
  "$SCRIPT_DIR"/sdk_helpers.sh vmss_upgrade_policy_all_rg
  # echo_title "Copying data"
  # "$SCRIPT_DIR"/sdk_helpers.sh install_azcopy
  # "$SCRIPT_DIR"/sdk_helpers.sh copy_dataset

else
  "$SCRIPT_DIR"/sdk_helpers.sh update_dataset
  echo_info "Skipping Bootstrapping. Set the RUN_BOOTSTRAP environment variable to enable bootstrapping."
fi

echo_title "✅ Resource provisioning completed..."

Loading
Loading