diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json index e3cbbb8b9..c0173f6f5 100644 --- a/.github/e2e-preset-configs.json +++ b/.github/e2e-preset-configs.json @@ -5,67 +5,78 @@ "name": "falcon-7b", "node-count": 1, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100 + "node-osdisk-size": 100, + "OSS": true }, { "name": "falcon-7b-instruct", "node-count": 1, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100 + "node-osdisk-size": 100, + "OSS": true }, { "name": "falcon-40b", "node-count": 1, "node-vm-size": "Standard_NC96ads_A100_v4", - "node-osdisk-size": 400 + "node-osdisk-size": 400, + "OSS": true }, { "name": "falcon-40b-instruct", "node-count": 1, "node-vm-size": "Standard_NC96ads_A100_v4", - "node-osdisk-size": 400 + "node-osdisk-size": 400, + "OSS": true }, { "name": "mistral-7b", "node-count": 1, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100 + "node-osdisk-size": 100, + "OSS": true }, { "name": "mistral-7b-instruct", "node-count": 1, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100 + "node-osdisk-size": 100, + "OSS": true }, { "name": "phi-2", "node-count": 1, "node-vm-size": "Standard_NC6s_v3", - "node-osdisk-size": 50 + "node-osdisk-size": 50, + "OSS": true }, { "name": "llama-2-7b", "node-count": 1, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100 + "node-osdisk-size": 100, + "OSS": false }, { "name": "llama-2-7b-chat", "node-count": 1, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100 + "node-osdisk-size": 100, + "OSS": false }, { "name": "llama-2-13b", "node-count": 2, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 150 + "node-osdisk-size": 150, + "OSS": false }, { "name": "llama-2-13b-chat", "node-count": 2, "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 150 + "node-osdisk-size": 150, + "OSS": false } ] } diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 
5573403a6..5e2a8c836 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -6,6 +6,11 @@ on: types: - completed workflow_dispatch: + inputs: + force-run-all: + type: boolean + default: false + description: "Test all models for E2E" env: GO_VERSION: "1.20" @@ -30,13 +35,17 @@ jobs: submodules: true fetch-depth: 0 + - name: Set FORCE_RUN_ALL Flag + run: echo "FORCE_RUN_ALL=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}" >> $GITHUB_ENV + # This script should output a JSON array of model names - name: Determine Affected Models id: affected_models run: | PR_BRANCH=${{ env.BRANCH_NAME }} \ + FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \ python3 .github/workflows/kind-cluster/determine_models.py - + - name: Print Determined Models run: | echo "Output from determine_models: ${{ steps.affected_models.outputs.matrix }}" @@ -97,15 +106,9 @@ jobs: with: submodules: true fetch-depth: 0 - - - name: Install Azure CLI latest - run: | - if ! which az > /dev/null; then - echo "Azure CLI not found. Installing..." - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - else - echo "Azure CLI already installed." 
- fi + + - name: Set OSS Flag + run: echo "MODEL_IS_OSS=${{ matrix.model.OSS }}" >> $GITHUB_ENV - name: 'Az CLI login' uses: azure/login@v1.6.1 @@ -114,16 +117,16 @@ jobs: tenant-id: ${{ secrets.AZURE_TENANT_ID }} allow-no-subscriptions: true - - name: 'Set subscription' + - name: 'Set Prod Subscription' run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}} - - - name: 'Check if Image exists in Test ACR' - id: check_test_image + + - name: 'Check if Image exists in Prod ACR' + id: check_prod_image run: | - ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} - IMAGE_NAME=${{ matrix.model.name }} + ACR_NAME=${{ secrets.PROD_ACR_USERNAME }} + IMAGE_NAME=unlisted/aks/kaito/kaito-${{ matrix.model.name }} TAG=${{ matrix.model.tag }} - + # Use '|| true' to prevent script from exiting with an error if the repository is not found TAGS=$(az acr repository show-tags -n $ACR_NAME --repository $IMAGE_NAME --output tsv || true) @@ -138,11 +141,14 @@ jobs: echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME." fi fi - - - name: 'Check if Image exists in Prod ACR' - id: check_prod_image + + - name: 'Set Test Subscription' + run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}} + + - name: 'Check if Image exists in Test ACR' + id: check_test_image run: | - ACR_NAME=${{ secrets.ACR_AMR_USERNAME }} + ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} IMAGE_NAME=${{ matrix.model.name }} TAG=${{ matrix.model.tag }} @@ -345,23 +351,31 @@ jobs: fi - name: Move from Test to Prod ACR - if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' && github.event_name == 'workflow_dispatch' + if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' && github.event_name == 'workflow_dispatch' && env.MODEL_IS_OSS == 'true' run: | - # This should only run if: + # This should only run if: # 1. All prior steps have succeeed (Given) # 2. 
Image exists in test ACR repo but not Prod # 3. Workflow was triggered manually (workflow_dispatch) + # 4. Image is OSS (MIT/Apache2.0) + + az account set --subscription ${{secrets.PROD_ACR_SUB_ID}} TEST_ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }} - PROD_ACR_NAME=${{ secrets.ACR_AMR_USERNAME }} + PROD_ACR_NAME=${{ secrets.PROD_ACR_USERNAME }} IMAGE_NAME=${{ matrix.model.name }} TAG=${{ matrix.model.tag }} # Formulate the source image reference - SOURCE_IMAGE="$TEST_ACR_NAME.azurecr.io/$IMAGE_NAME:$TAG" + SOURCE_IMAGE=$IMAGE_NAME:$TAG + DEST_IMAGE=unlisted/aks/kaito/kaito-$IMAGE_NAME:$TAG # Import the image from Test ACR to Prod ACR - az acr import --name $PROD_ACR_NAME --source $SOURCE_IMAGE --image $IMAGE_NAME:$TAG + az acr import \ + --name $PROD_ACR_NAME \ + --source $SOURCE_IMAGE \ + --image $DEST_IMAGE \ + --registry /subscriptions/${{secrets.AZURE_SUBSCRIPTION_ID}}/resourceGroups/${{secrets.TEST_ACR_RG}}/providers/Microsoft.ContainerRegistry/registries/$TEST_ACR_NAME - name: Cleanup if: always() diff --git a/.github/workflows/kaito-e2e.yaml b/.github/workflows/kaito-e2e.yaml index ab6303639..36c6e7c49 100644 --- a/.github/workflows/kaito-e2e.yaml +++ b/.github/workflows/kaito-e2e.yaml @@ -51,7 +51,6 @@ jobs: echo "VERSION=${rand}" >> $GITHUB_ENV echo "CLUSTER_NAME=kaito${rand}" >> $GITHUB_ENV echo "RUN_LLAMA_13B=false" >> $GITHUB_ENV - echo "AI_MODELS_IMAGE_VERSION=0.0.3" >> $GITHUB_ENV - uses: azure/login@v1.6.1 with: @@ -137,20 +136,13 @@ jobs: REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io VERSION: ${{ env.VERSION }} - - name: Login to Private Presets ACR + - name: Attach Private Presets ACR uses: azure/CLI@v1.0.9 with: inlineScript: | - az acr login --name ${{ secrets.DOCKER_SERVER }} --expose-token - az aks update -n ${{ env.CLUSTER_NAME }} -g ${{ env.CLUSTER_NAME }} --attach-acr ${{secrets.ACR_AIMODELSREGISTRY}} - - - name: Add Secret Credentials - run: | - kubectl create secret docker-registry ${{secrets.DOCKER_REGISTRY}} \ - 
--docker-server=${{secrets.DOCKER_SERVER}} \ - --docker-username=${{secrets.DOCKER_USERNAME}} \ - --docker-password=${{secrets.DOCKER_PASSWORD}} - + az aks update -n ${{ env.CLUSTER_NAME }} -g ${{ env.CLUSTER_NAME }} \ + --attach-acr ${{ secrets.ACR_AMRT_USERNAME }} + - name: Log kaito-workspace run: | kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {} @@ -161,9 +153,7 @@ jobs: env: AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} RUN_LLAMA_13B: ${{ env.RUN_LLAMA_13B }} - AI_MODELS_REGISTRY: ${{secrets.DOCKER_SERVER}} - AI_MODELS_REGISTRY_SECRET: ${{ secrets.DOCKER_REGISTRY }} - AI_MODELS_IMAGE_VERSION: ${{ env.AI_MODELS_IMAGE_VERSION }} + AI_MODELS_REGISTRY: ${{secrets.ACR_AMRT_USERNAME}}.azurecr.io - name: Cleanup e2e resources if: ${{ always() }} diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py index 537595055..402365441 100644 --- a/.github/workflows/kind-cluster/determine_models.py +++ b/.github/workflows/kind-cluster/determine_models.py @@ -116,10 +116,16 @@ def check_modified_models(pr_branch): return modified_models def main(): - pr_branch = os.environ.get("PR_BRANCH", "main") - # Logic to determine affected models - # Example: affected_models = ['model1', 'model2', 'model3'] - affected_models = check_modified_models(pr_branch) + pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main' + force_run_all = os.environ.get("FORCE_RUN_ALL", "false").strip().lower() == "true" # Env values are strings and "false" is truthy, so compare explicitly; unset defaults to False + + affected_models = [] + if force_run_all: + affected_models = [model['name'] for model in YAML_PR['models']] + else: + # Logic to determine affected models + # Example: affected_models = ['model1', 'model2', 'model3'] + affected_models = check_modified_models(pr_branch) # Convert the list of models into JSON matrix format matrix = create_matrix(affected_models) diff --git a/test/e2e/preset_test.go 
b/test/e2e/preset_test.go index 5cce84843..be6b5abe9 100644 --- a/test/e2e/preset_test.go +++ b/test/e2e/preset_test.go @@ -46,28 +46,28 @@ func createFalconWorkspaceWithPresetPublicMode(numOfNode int) *kaitov1alpha1.Wor return workspaceObj } -func createLlama7BWorkspaceWithPresetPrivateMode(registry, registrySecret, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace { +func createLlama7BWorkspaceWithPresetPrivateMode(registry, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace { workspaceObj := &kaitov1alpha1.Workspace{} By("Creating a workspace CR with Llama 7B Chat preset private mode", func() { uniqueID := fmt.Sprint("preset-", rand.Intn(1000)) workspaceObj = utils.GenerateWorkspaceManifest(uniqueID, namespaceName, fmt.Sprintf("%s/%s:%s", registry, PresetLlama2AChat, imageVersion), numOfNode, "Standard_NC12s_v3", &metav1.LabelSelector{ MatchLabels: map[string]string{"kaito-workspace": "private-preset-e2e-test"}, - }, nil, PresetLlama2AChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{registrySecret}, nil) + }, nil, PresetLlama2AChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{}, nil) createAndValidateWorkspace(workspaceObj) }) return workspaceObj } -func createLlama13BWorkspaceWithPresetPrivateMode(registry, registrySecret, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace { +func createLlama13BWorkspaceWithPresetPrivateMode(registry, imageVersion string, numOfNode int) *kaitov1alpha1.Workspace { workspaceObj := &kaitov1alpha1.Workspace{} By("Creating a workspace CR with Llama 13B Chat preset private mode", func() { uniqueID := fmt.Sprint("preset-", rand.Intn(1000)) workspaceObj = utils.GenerateWorkspaceManifest(uniqueID, namespaceName, fmt.Sprintf("%s/%s:%s", registry, PresetLlama2BChat, imageVersion), numOfNode, "Standard_NC12s_v3", &metav1.LabelSelector{ MatchLabels: map[string]string{"kaito-workspace": "private-preset-e2e-test"}, - }, nil, PresetLlama2BChat, kaitov1alpha1.ModelImageAccessModePrivate, 
[]string{registrySecret}, nil) + }, nil, PresetLlama2BChat, kaitov1alpha1.ModelImageAccessModePrivate, []string{}, nil) createAndValidateWorkspace(workspaceObj) }) @@ -305,11 +305,9 @@ func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error { var runLlama13B bool var aiModelsRegistry string -var aiModelsRegistrySecret string -var aiModelsImageVersion string +var modelInfo map[string]string var _ = Describe("Workspace Preset", func() { - BeforeEach(func() { var err error runLlama13B, err = strconv.ParseBool(os.Getenv("RUN_LLAMA_13B")) @@ -318,10 +316,21 @@ var _ = Describe("Workspace Preset", func() { fmt.Print("Error: RUN_LLAMA_13B ENV Variable not set") runLlama13B = false } - + aiModelsRegistry = utils.GetEnv("AI_MODELS_REGISTRY") - aiModelsRegistrySecret = utils.GetEnv("AI_MODELS_REGISTRY_SECRET") - aiModelsImageVersion = utils.GetEnv("AI_MODELS_IMAGE_VERSION") + + // Load stable model versions + configs, err := utils.GetModelConfigInfo("/home/runner/work/kaito/kaito/presets/models/supported_models.yaml") + if err != nil { + // Fail through Ginkgo instead of os.Exit so the failure is reported and cleanup still runs + Fail(fmt.Sprintf("Failed to load model configs: %v", err)) + } + + modelInfo, err = utils.ExtractModelVersion(configs) + if err != nil { + // Report through Ginkgo rather than terminating the whole test process + Fail(fmt.Sprintf("Failed to extract stable model versions: %v", err)) + } }) It("should create a workspace with preset public mode successfully", func() { @@ -345,7 +354,11 @@ var _ = Describe("Workspace Preset", func() { It("should create a llama 7b workspace with preset private mode successfully", func() { numOfNode := 1 - workspaceObj := createLlama7BWorkspaceWithPresetPrivateMode(aiModelsRegistry, aiModelsRegistrySecret, aiModelsImageVersion, numOfNode) + modelVersion, ok := modelInfo[PresetLlama2AChat] + if !ok { + Fail(fmt.Sprintf("Model version for %s not found", PresetLlama2AChat)) + } + workspaceObj := createLlama7BWorkspaceWithPresetPrivateMode(aiModelsRegistry, modelVersion, numOfNode) defer cleanupResources(workspaceObj) time.Sleep(30 * time.Second) @@ -367,7 +380,11 
@@ var _ = Describe("Workspace Preset", func() { Skip("Skipping llama 13b workspace test") } numOfNode := 2 - workspaceObj := createLlama13BWorkspaceWithPresetPrivateMode(aiModelsRegistry, aiModelsRegistrySecret, aiModelsImageVersion, numOfNode) + modelVersion, ok := modelInfo[PresetLlama2BChat] + if !ok { + Fail(fmt.Sprintf("Model version for %s not found", PresetLlama2BChat)) // name the preset that was actually looked up (13B, not 7B) + } + workspaceObj := createLlama13BWorkspaceWithPresetPrivateMode(aiModelsRegistry, modelVersion, numOfNode) defer cleanupResources(workspaceObj) diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go index 5963a82cb..3914f00eb 100644 --- a/test/e2e/utils/utils.go +++ b/test/e2e/utils/utils.go @@ -5,11 +5,13 @@ package utils import ( "fmt" + "io/ioutil" "os" "time" kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1" "github.com/samber/lo" + "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -34,6 +36,51 @@ func GetEnv(envVar string) string { return env } +func GetModelConfigInfo(configFilePath string) (map[string]interface{}, error) { + var data map[string]interface{} + + yamlData, err := ioutil.ReadFile(configFilePath) + if err != nil { + return nil, fmt.Errorf("error reading YAML file: %w", err) + } + + err = yaml.Unmarshal(yamlData, &data) + if err != nil { + return nil, fmt.Errorf("error unmarshalling YAML: %w", err) + } + + return data, nil +} + +func ExtractModelVersion(configs map[string]interface{}) (map[string]string, error) { + modelsInfo := make(map[string]string) + models, ok := configs["models"].([]interface{}) + if !ok { + return nil, fmt.Errorf("'models' key not found or is not a slice") + } + + for _, modelItem := range models { + model, ok := modelItem.(map[interface{}]interface{}) + if !ok { + return nil, fmt.Errorf("model item is not a map") + } + + modelName, ok := model["name"].(string) + if !ok { + return nil, fmt.Errorf("model name is not a string or not found") + } + + modelTag, ok := model["tag"].(string) // 
Using 'tag' as the version + if !ok { + return nil, fmt.Errorf("model version for %s is not a string or not found", modelName) + } + + modelsInfo[modelName] = modelTag + } + + return modelsInfo, nil +} + func GenerateWorkspaceManifest(name, namespace, imageName string, resourceCount int, instanceType string, labelSelector *metav1.LabelSelector, preferredNodes []string, presetName kaitov1alpha1.ModelName, inferenceMode kaitov1alpha1.ModelImageAccessMode, imagePullSecret []string,