Skip to content

Commit

Permalink
fix: Switch to Production ACR (#252)
Browse files Browse the repository at this point in the history
Switch from the existing AMR ACR to the Production ACR.

This PR also introduces a boolean input, "Run all e2e tests", which, if checked,
runs e2e on all models regardless of whether supported_models.yaml was
updated. This was necessary for importing models into the new prod ACR.
  • Loading branch information
ishaansehgal99 authored Feb 25, 2024
1 parent 37485fa commit cc53389
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 68 deletions.
33 changes: 22 additions & 11 deletions .github/e2e-preset-configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,67 +5,78 @@
"name": "falcon-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100
"node-osdisk-size": 100,
"OSS": true
},
{
"name": "falcon-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100
"node-osdisk-size": 100,
"OSS": true
},
{
"name": "falcon-40b",
"node-count": 1,
"node-vm-size": "Standard_NC96ads_A100_v4",
"node-osdisk-size": 400
"node-osdisk-size": 400,
"OSS": true
},
{
"name": "falcon-40b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC96ads_A100_v4",
"node-osdisk-size": 400
"node-osdisk-size": 400,
"OSS": true
},
{
"name": "mistral-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100
"node-osdisk-size": 100,
"OSS": true
},
{
"name": "mistral-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100
"node-osdisk-size": 100,
"OSS": true
},
{
"name": "phi-2",
"node-count": 1,
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 50
"node-osdisk-size": 50,
"OSS": true
},
{
"name": "llama-2-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100
"node-osdisk-size": 100,
"OSS": false
},
{
"name": "llama-2-7b-chat",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100
"node-osdisk-size": 100,
"OSS": false
},
{
"name": "llama-2-13b",
"node-count": 2,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 150
"node-osdisk-size": 150,
"OSS": false
},
{
"name": "llama-2-13b-chat",
"node-count": 2,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 150
"node-osdisk-size": 150,
"OSS": false
}
]
}
Expand Down
66 changes: 40 additions & 26 deletions .github/workflows/e2e-preset-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ on:
types:
- completed
workflow_dispatch:
inputs:
force-run-all:
type: boolean
default: false
description: "Test all models for E2E"

env:
GO_VERSION: "1.20"
Expand All @@ -30,13 +35,17 @@ jobs:
submodules: true
fetch-depth: 0

- name: Set FORCE_RUN_ALL Flag
run: echo "FORCE_RUN_ALL=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}" >> $GITHUB_ENV

# This script should output a JSON array of model names
- name: Determine Affected Models
id: affected_models
run: |
PR_BRANCH=${{ env.BRANCH_NAME }} \
FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \
python3 .github/workflows/kind-cluster/determine_models.py
- name: Print Determined Models
run: |
echo "Output from determine_models: ${{ steps.affected_models.outputs.matrix }}"
Expand Down Expand Up @@ -97,15 +106,9 @@ jobs:
with:
submodules: true
fetch-depth: 0

- name: Install Azure CLI latest
run: |
if ! which az > /dev/null; then
echo "Azure CLI not found. Installing..."
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
else
echo "Azure CLI already installed."
fi

- name: Set OSS Flag
run: echo "MODEL_IS_OSS=${{ matrix.model.OSS }}" >> $GITHUB_ENV

- name: 'Az CLI login'
uses: azure/[email protected]
Expand All @@ -114,16 +117,16 @@ jobs:
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
allow-no-subscriptions: true

- name: 'Set subscription'
- name: 'Set Prod Subscription'
run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}}

- name: 'Check if Image exists in Test ACR'
id: check_test_image
- name: 'Check if Image exists in Prod ACR'
id: check_prod_image
run: |
ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }}
IMAGE_NAME=${{ matrix.model.name }}
ACR_NAME=${{ secrets.PROD_ACR_USERNAME }}
IMAGE_NAME=unlisted/aks/kaito/kaito-${{ matrix.model.name }}
TAG=${{ matrix.model.tag }}
# Use '|| true' to prevent script from exiting with an error if the repository is not found
TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true)
Expand All @@ -138,11 +141,14 @@ jobs:
echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME."
fi
fi
- name: 'Check if Image exists in Prod ACR'
id: check_prod_image
- name: 'Set Test Subscription'
run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}}

- name: 'Check if Image exists in Test ACR'
id: check_test_image
run: |
ACR_NAME=${{ secrets.ACR_AMR_USERNAME }}
ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }}
IMAGE_NAME=${{ matrix.model.name }}
TAG=${{ matrix.model.tag }}
Expand Down Expand Up @@ -345,23 +351,31 @@ jobs:
fi
- name: Move from Test to Prod ACR
if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' && github.event_name == 'workflow_dispatch'
if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' && github.event_name == 'workflow_dispatch' && env.MODEL_IS_OSS == 'true'
run: |
# This should only run if:
# This should only run if:
# 1. All prior steps have succeeded (Given)
# 2. Image exists in test ACR repo but not Prod
# 3. Workflow was triggered manually (workflow_dispatch)
# 4. Image is OSS (MIT/Apache2.0)
az account set --subscription ${{secrets.PROD_ACR_SUB_ID}}
TEST_ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }}
PROD_ACR_NAME=${{ secrets.ACR_AMR_USERNAME }}
PROD_ACR_NAME=${{ secrets.PROD_ACR_USERNAME }}
IMAGE_NAME=${{ matrix.model.name }}
TAG=${{ matrix.model.tag }}
# Formulate the source image reference
SOURCE_IMAGE="$TEST_ACR_NAME.azurecr.io/$IMAGE_NAME:$TAG"
SOURCE_IMAGE=$IMAGE_NAME:$TAG
DEST_IMAGE=unlisted/aks/kaito/kaito-$IMAGE_NAME:$TAG
# Import the image from Test ACR to Prod ACR
az acr import --name $PROD_ACR_NAME --source $SOURCE_IMAGE --image $IMAGE_NAME:$TAG
az acr import \
--name $PROD_ACR_NAME \
--source $SOURCE_IMAGE \
--image $DEST_IMAGE \
--registry /subscriptions/${{secrets.AZURE_SUBSCRIPTION_ID}}/resourceGroups/${{secrets.TEST_ACR_RG}}/providers/Microsoft.ContainerRegistry/registries/$TEST_ACR_NAME
- name: Cleanup
if: always()
Expand Down
20 changes: 5 additions & 15 deletions .github/workflows/kaito-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ jobs:
echo "VERSION=${rand}" >> $GITHUB_ENV
echo "CLUSTER_NAME=kaito${rand}" >> $GITHUB_ENV
echo "RUN_LLAMA_13B=false" >> $GITHUB_ENV
echo "AI_MODELS_IMAGE_VERSION=0.0.3" >> $GITHUB_ENV
- uses: azure/[email protected]
with:
Expand Down Expand Up @@ -137,20 +136,13 @@ jobs:
REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io
VERSION: ${{ env.VERSION }}

- name: Login to Private Presets ACR
- name: Attach Private Presets ACR
uses: azure/[email protected]
with:
inlineScript: |
az acr login --name ${{ secrets.DOCKER_SERVER }} --expose-token
az aks update -n ${{ env.CLUSTER_NAME }} -g ${{ env.CLUSTER_NAME }} --attach-acr ${{secrets.ACR_AIMODELSREGISTRY}}
- name: Add Secret Credentials
run: |
kubectl create secret docker-registry ${{secrets.DOCKER_REGISTRY}} \
--docker-server=${{secrets.DOCKER_SERVER}} \
--docker-username=${{secrets.DOCKER_USERNAME}} \
--docker-password=${{secrets.DOCKER_PASSWORD}}
az aks update -n ${{ env.CLUSTER_NAME }} -g ${{ env.CLUSTER_NAME }} \
--attach-acr ${{ secrets.ACR_AMRT_USERNAME }}
- name: Log kaito-workspace
run: |
kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {}
Expand All @@ -161,9 +153,7 @@ jobs:
env:
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }}
AI_MODELS_REGISTRY: ${{secrets.DOCKER_SERVER}}
AI_MODELS_REGISTRY_SECRET: ${{ secrets.DOCKER_REGISTRY }}
AI_MODELS_IMAGE_VERSION: ${{ env.AI_MODELS_IMAGE_VERSION }}
AI_MODELS_REGISTRY: ${{secrets.ACR_AMRT_USERNAME}}.azurecr.io

- name: Cleanup e2e resources
if: ${{ always() }}
Expand Down
14 changes: 10 additions & 4 deletions .github/workflows/kind-cluster/determine_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,16 @@ def check_modified_models(pr_branch):
return modified_models

def main():
pr_branch = os.environ.get("PR_BRANCH", "main")
# Logic to determine affected models
# Example: affected_models = ['model1', 'model2', 'model3']
affected_models = check_modified_models(pr_branch)
pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main'
force_run_all = os.environ.get("FORCE_RUN_ALL", False) # If not specified default to False

affected_models = []
if force_run_all:
affected_models = [model['name'] for model in YAML_PR['models']]
else:
# Logic to determine affected models
# Example: affected_models = ['model1', 'model2', 'model3']
affected_models = check_modified_models(pr_branch)

# Convert the list of models into JSON matrix format
matrix = create_matrix(affected_models)
Expand Down
41 changes: 29 additions & 12 deletions test/e2e/preset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,28 +46,28 @@ func createFalconWorkspaceWithPresetPublicMode(numOfNode int) *kaitov1alpha1.Wor
return workspaceObj
}

func createLlama7BWorkspaceWithPresetPrivateMode(registry, registrySecret, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace {
func createLlama7BWorkspaceWithPresetPrivateMode(registry, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace {
workspaceObj := &kaitov1alpha1.Workspace{}
By("Creating a workspace CR with Llama 7B Chat preset private mode", func() {
uniqueID := fmt.Sprint("preset-", rand.Intn(1000))
workspaceObj = utils.GenerateWorkspaceManifest(uniqueID, namespaceName, fmt.Sprintf("%s/%s:%s", registry, PresetLlama2AChat, imageVersion),
numOfNode, "Standard_NC12s_v3", &metav1.LabelSelector{
MatchLabels: map[string]string{"kaito-workspace": "private-preset-e2e-test"},
}, nil, PresetLlama2AChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{registrySecret}, nil)
}, nil, PresetLlama2AChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{}, nil)

createAndValidateWorkspace(workspaceObj)
})
return workspaceObj
}

func createLlama13BWorkspaceWithPresetPrivateMode(registry, registrySecret, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace {
func createLlama13BWorkspaceWithPresetPrivateMode(registry, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace {
workspaceObj := &kaitov1alpha1.Workspace{}
By("Creating a workspace CR with Llama 13B Chat preset private mode", func() {
uniqueID := fmt.Sprint("preset-", rand.Intn(1000))
workspaceObj = utils.GenerateWorkspaceManifest(uniqueID, namespaceName, fmt.Sprintf("%s/%s:%s", registry, PresetLlama2BChat, imageVersion),
numOfNode, "Standard_NC12s_v3", &metav1.LabelSelector{
MatchLabels: map[string]string{"kaito-workspace": "private-preset-e2e-test"},
}, nil, PresetLlama2BChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{registrySecret}, nil)
}, nil, PresetLlama2BChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{}, nil)

createAndValidateWorkspace(workspaceObj)
})
Expand Down Expand Up @@ -305,11 +305,9 @@ func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error {

var runLlama13B bool
var aiModelsRegistry string
var aiModelsRegistrySecret string
var aiModelsImageVersion string
var modelInfo map[string]string

var _ = Describe("Workspace Preset", func() {

BeforeEach(func() {
var err error
runLlama13B, err = strconv.ParseBool(os.Getenv("RUN_LLAMA_13B"))
Expand All @@ -318,10 +316,21 @@ var _ = Describe("Workspace Preset", func() {
fmt.Print("Error: RUN_LLAMA_13B ENV Variable not set")
runLlama13B = false
}

aiModelsRegistry = utils.GetEnv("AI_MODELS_REGISTRY")
aiModelsRegistrySecret = utils.GetEnv("AI_MODELS_REGISTRY_SECRET")
aiModelsImageVersion = utils.GetEnv("AI_MODELS_IMAGE_VERSION")

// Load stable model versions
configs, err := utils.GetModelConfigInfo("/home/runner/work/kaito/kaito/presets/models/supported_models.yaml")
if err != nil {
fmt.Printf("Failed to load model configs: %v\n", err)
os.Exit(1)
}

modelInfo, err = utils.ExtractModelVersion(configs)
if err != nil {
fmt.Printf("Failed to extract stable model versions: %v\n", err)
os.Exit(1)
}
})

It("should create a workspace with preset public mode successfully", func() {
Expand All @@ -345,7 +354,11 @@ var _ = Describe("Workspace Preset", func() {

It("should create a llama 7b workspace with preset private mode successfully", func() {
numOfNode := 1
workspaceObj := createLlama7BWorkspaceWithPresetPrivateMode(aiModelsRegistry, aiModelsRegistrySecret, aiModelsImageVersion, numOfNode)
modelVersion, ok := modelInfo[PresetLlama2AChat]
if !ok {
Fail(fmt.Sprintf("Model version for %s not found", PresetLlama2AChat))
}
workspaceObj := createLlama7BWorkspaceWithPresetPrivateMode(aiModelsRegistry, modelVersion, numOfNode)

defer cleanupResources(workspaceObj)
time.Sleep(30 * time.Second)
Expand All @@ -367,7 +380,11 @@ var _ = Describe("Workspace Preset", func() {
Skip("Skipping llama 13b workspace test")
}
numOfNode := 2
workspaceObj := createLlama13BWorkspaceWithPresetPrivateMode(aiModelsRegistry, aiModelsRegistrySecret, aiModelsImageVersion, numOfNode)
modelVersion, ok := modelInfo[PresetLlama2BChat]
if !ok {
Fail(fmt.Sprintf("Model version for %s not found", PresetLlama2AChat))
}
workspaceObj := createLlama13BWorkspaceWithPresetPrivateMode(aiModelsRegistry, modelVersion, numOfNode)

defer cleanupResources(workspaceObj)

Expand Down
Loading

0 comments on commit cc53389

Please sign in to comment.