diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
index acbfa98f2..a499d28d6 100644
--- a/.github/e2e-preset-configs.json
+++ b/.github/e2e-preset-configs.json
@@ -37,6 +37,12 @@
         "node-vm-size": "Standard_NC12s_v3",
         "node-osdisk-size": 100
       },
+      {
+        "name": "phi-2",
+        "node-count": 1,
+        "node-vm-size": "Standard_NC6s_v3",
+        "node-osdisk-size": 30
+      },
       {
         "name": "llama-2-7b",
         "node-count": 1,
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index 784c292bd..641ac2ab1 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -234,19 +234,25 @@ jobs:
           done
           echo "Service IP is $SERVICE_IP"
           echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
-
-      - name: Replace IP and Deploy Statefulset to K8s
+
+      - name: Get Resource Type
+        id: resource
+        run: |
+          RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment")
+          echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT
+
+      - name: Replace IP and Deploy Resource to K8s
         if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
         run: |
-          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml
-          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml
-          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml
-          kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml
-
-      - name: Wait for Statefulset to be ready
+          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
+          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
+          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
+          kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
+
+      - name: Wait for Resource to be ready
         if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
         run: |
-          kubectl rollout status statefulset/${{ matrix.model.name }}
+          kubectl rollout status ${{ steps.resource.outputs.RESOURCE_TYPE }}/${{ matrix.model.name }}

       - name: Test home endpoint
         if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
@@ -359,16 +365,22 @@ jobs:
       - name: Cleanup
         if: always()
         run: |
+          # Only proceed if RESOURCE_TYPE is set (else resource wasn't created)
+          if [ -n "${{ steps.resource.outputs.RESOURCE_TYPE }}" ]; then
+            # Use RESOURCE_TYPE from the previous step
+            RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }}
+
+            # Check and Delete K8s Resource (Deployment or StatefulSet)
+            if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
+              kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
+            fi
+          fi
+
           # Check and Delete K8s Service if it exists
           if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
             kubectl delete svc ${{ matrix.model.name }}
           fi

-          # Check and Delete K8s StatefulSet if it exists
-          if kubectl get statefulset ${{ matrix.model.name }} > /dev/null 2>&1; then
-            kubectl delete statefulset ${{ matrix.model.name }}
-          fi
-
           # Check and Delete AKS Nodepool if it exists
           if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then
             NODEPOOL_EXIST=$(az aks nodepool show \
diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
index cc41fb6f7..27f21ec46 100644
--- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
+++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -5,10 +5,9 @@
 spec:
   selector:
     app: falcon
-    statefulset.kubernetes.io/pod-name: falcon-40b-instruct-0
   ports:
-    - protocol: TCP
-      port: 80
-      targetPort: 5000
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
   type: LoadBalancer
   publishNotReadyAddresses: true
diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
deleted file mode 100644
index 57326ddf7..000000000
--- a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct-statefulset.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: falcon-40b-instruct
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: falcon
-  podManagementPolicy: Parallel
-  template:
-    metadata:
-      labels:
-        app: falcon
-    spec:
-      containers:
-      - name: falcon-container
-        image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE
-        command:
-        - /bin/sh
-        - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
-        resources:
-          requests:
-            nvidia.com/gpu: 4
-          limits:
-            nvidia.com/gpu: 4 # Requesting 4 GPUs
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 600 # 10 Min
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        volumeMounts:
-        - name: dshm
-          mountPath: /dev/shm
-      volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-      tolerations:
-      - effect: NoSchedule
-        key: sku
-        operator: Equal
-        value: gpu
-      - effect: NoSchedule
-        key: nvidia.com/gpu
-        operator: Exists
-      nodeSelector:
-        pool: n40binstruct
diff --git a/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
new file mode 100644
index 000000000..bd6280b9f
--- /dev/null
+++ b/presets/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: falcon-40b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 4 # Requesting 4 GPUs
+          limits:
+            nvidia.com/gpu: 4
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: n40binstruct
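
A note on the new "Get Resource Type" step above: the grep pipeline maps any model name containing "llama" to a StatefulSet and everything else to a Deployment. A minimal local sketch of that dispatch logic, using a few illustrative preset names:

  # llama presets keep their StatefulSets; all other presets are Deployments now
  for name in falcon-7b llama-2-7b-chat phi-2; do
    RESOURCE_TYPE=$(echo "$name" | grep -q "llama" && echo "statefulset" || echo "deployment")
    echo "$name -> $RESOURCE_TYPE"
  done
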
diff --git a/presets/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/test/manifests/falcon-40b/falcon-40b-service.yaml
index 599f70ca5..689361052 100644
--- a/presets/test/manifests/falcon-40b/falcon-40b-service.yaml
+++ b/presets/test/manifests/falcon-40b/falcon-40b-service.yaml
@@ -5,10 +5,9 @@
 spec:
   selector:
     app: falcon
-    statefulset.kubernetes.io/pod-name: falcon-40b-0
   ports:
-    - protocol: TCP
-      port: 80
-      targetPort: 5000
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
   type: LoadBalancer
   publishNotReadyAddresses: true
diff --git a/presets/test/manifests/falcon-40b/falcon-40b-statefulset.yaml b/presets/test/manifests/falcon-40b/falcon-40b-statefulset.yaml
deleted file mode 100644
index 7b77d20df..000000000
--- a/presets/test/manifests/falcon-40b/falcon-40b-statefulset.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: falcon-40b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: falcon
-  podManagementPolicy: Parallel
-  template:
-    metadata:
-      labels:
-        app: falcon
-    spec:
-      containers:
-      - name: falcon-container
-        image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
-        command:
-        - /bin/sh
-        - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
-        resources:
-          requests:
-            nvidia.com/gpu: 4
-          limits:
-            nvidia.com/gpu: 4 # Requesting 4 GPUs
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 600 # 10 Min
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        volumeMounts:
-        - name: dshm
-          mountPath: /dev/shm
-      volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-      tolerations:
-      - effect: NoSchedule
-        key: sku
-        operator: Equal
-        value: gpu
-      - effect: NoSchedule
-        key: nvidia.com/gpu
-        operator: Exists
-      nodeSelector:
-        pool: falcon40b
diff --git a/presets/test/manifests/falcon-40b/falcon-40b.yaml b/presets/test/manifests/falcon-40b/falcon-40b.yaml
new file mode 100644
index 000000000..a125d838d
--- /dev/null
+++ b/presets/test/manifests/falcon-40b/falcon-40b.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: falcon-40b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 4 # Requesting 4 GPUs
+          limits:
+            nvidia.com/gpu: 4
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: falcon40b
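
The renamed manifests (model-name.yaml instead of model-name-statefulset.yaml) are what the workflow's sed/apply sequence consumes. A hedged local dry run of that sequence, with placeholder values standing in for the workflow's matrix values and secrets (MODEL, TAG, and REPO below are assumptions, not repo values):

  MODEL=falcon-40b   # matrix.model.name
  TAG=latest         # matrix.model.tag (assumed)
  REPO=myregistry    # secrets.ACR_AMRT_USERNAME (assumed)
  MANIFEST=presets/test/manifests/$MODEL/$MODEL.yaml
  sed -i "s/TAG_HERE/$TAG/g; s/REPO_HERE/$REPO/g" "$MANIFEST"
  kubectl apply -f "$MANIFEST"
  kubectl rollout status deployment/$MODEL
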
diff --git a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
index e3c20e3b3..6acbe2405 100644
--- a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
+++ b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
@@ -5,10 +5,9 @@
 spec:
   selector:
     app: falcon
-    statefulset.kubernetes.io/pod-name: falcon-7b-instruct-0
   ports:
-    - protocol: TCP
-      port: 80
-      targetPort: 5000
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
   type: LoadBalancer
   publishNotReadyAddresses: true
diff --git a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-statefulset.yaml b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-statefulset.yaml
deleted file mode 100644
index 0387a092b..000000000
--- a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct-statefulset.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: falcon-7b-instruct
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: falcon
-  podManagementPolicy: Parallel
-  template:
-    metadata:
-      labels:
-        app: falcon
-    spec:
-      containers:
-      - name: falcon-container
-        image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE
-        command:
-        - /bin/sh
-        - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
-        resources:
-          requests:
-            nvidia.com/gpu: 2
-          limits:
-            nvidia.com/gpu: 2 # Requesting 2 GPUs
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 600 # 10 Min
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        volumeMounts:
-        - name: dshm
-          mountPath: /dev/shm
-      volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-      tolerations:
-      - effect: NoSchedule
-        key: sku
-        operator: Equal
-        value: gpu
-      - effect: NoSchedule
-        key: nvidia.com/gpu
-        operator: Exists
-      nodeSelector:
-        pool: on7binstruct
diff --git a/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
new file mode 100644
index 000000000..ed8913e76
--- /dev/null
+++ b/presets/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: falcon-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2 # Requesting 2 GPUs
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: on7binstruct
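
The probes in these manifests poll /healthz on port 5000 inside the container. One way to exercise that endpoint by hand with kubectl, assuming cluster access (the app=falcon selector matches the manifests above; the rest is illustrative):

  POD=$(kubectl get pod -l app=falcon -o jsonpath='{.items[0].metadata.name}')
  kubectl port-forward "$POD" 5000:5000 &
  PF_PID=$!
  sleep 2   # give port-forward a moment to bind
  curl -sf http://localhost:5000/healthz && echo healthy
  kill $PF_PID
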
diff --git a/presets/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/test/manifests/falcon-7b/falcon-7b-service.yaml
index 1f8fef330..acf56ba74 100644
--- a/presets/test/manifests/falcon-7b/falcon-7b-service.yaml
+++ b/presets/test/manifests/falcon-7b/falcon-7b-service.yaml
@@ -5,11 +5,9 @@
 spec:
   selector:
     app: falcon
-    statefulset.kubernetes.io/pod-name: falcon-7b-0
   ports:
-    - protocol: TCP
-      port: 80
-      targetPort: 5000
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
   type: LoadBalancer
   publishNotReadyAddresses: true
-
diff --git a/presets/test/manifests/falcon-7b/falcon-7b-statefulset.yaml b/presets/test/manifests/falcon-7b/falcon-7b-statefulset.yaml
deleted file mode 100644
index 317da8a40..000000000
--- a/presets/test/manifests/falcon-7b/falcon-7b-statefulset.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: falcon-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: falcon
-  podManagementPolicy: Parallel
-  template:
-    metadata:
-      labels:
-        app: falcon
-    spec:
-      containers:
-      - name: falcon-container
-        image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE
-        command:
-        - /bin/sh
-        - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
-        resources:
-          requests:
-            nvidia.com/gpu: 2
-          limits:
-            nvidia.com/gpu: 2 # Requesting 2 GPUs
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 600 # 10 Min
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /healthz
-            port: 5000
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        volumeMounts:
-        - name: dshm
-          mountPath: /dev/shm
-      volumes:
-      - name: dshm
-        emptyDir:
-          medium: Memory
-      tolerations:
-      - effect: NoSchedule
-        key: sku
-        operator: Equal
-        value: gpu
-      - effect: NoSchedule
-        key: nvidia.com/gpu
-        operator: Exists
-      nodeSelector:
-        pool: falcon7b
diff --git a/presets/test/manifests/falcon-7b/falcon-7b.yaml b/presets/test/manifests/falcon-7b/falcon-7b.yaml
new file mode 100644
index 000000000..2f1aff077
--- /dev/null
+++ b/presets/test/manifests/falcon-7b/falcon-7b.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: falcon-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2 # Requesting 2 GPUs
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: falcon7b
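
With the statefulset.kubernetes.io/pod-name selector dropped, the services now select on the app label alone and still route port 80 to the container's port 5000 through the LoadBalancer. A hedged smoke test once an external IP is assigned (this mirrors what the workflow's endpoint tests do; the jsonpath assumes an IP-based, not hostname-based, ingress):

  SERVICE_IP=$(kubectl get svc falcon-7b -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
  curl -s "http://$SERVICE_IP/healthz"
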
diff --git a/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat-statefulset.yaml b/presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
similarity index 100%
rename from presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat-statefulset.yaml
rename to presets/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
diff --git a/presets/test/manifests/llama-2-13b/llama-2-13b-statefulset.yaml b/presets/test/manifests/llama-2-13b/llama-2-13b.yaml
similarity index 100%
rename from presets/test/manifests/llama-2-13b/llama-2-13b-statefulset.yaml
rename to presets/test/manifests/llama-2-13b/llama-2-13b.yaml
diff --git a/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat-statefulset.yaml b/presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
similarity index 100%
rename from presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat-statefulset.yaml
rename to presets/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
diff --git a/presets/test/manifests/llama-2-7b/llama-2-7b-statefulset.yaml b/presets/test/manifests/llama-2-7b/llama-2-7b.yaml
similarity index 100%
rename from presets/test/manifests/llama-2-7b/llama-2-7b-statefulset.yaml
rename to presets/test/manifests/llama-2-7b/llama-2-7b.yaml
diff --git a/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
new file mode 100644
index 000000000..31b9206bc
--- /dev/null
+++ b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: mistral-7b-instruct
+spec:
+  selector:
+    app: mistral
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
diff --git a/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
new file mode 100644
index 000000000..cacfbd484
--- /dev/null
+++ b/presets/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mistral-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral
+  template:
+    metadata:
+      labels:
+        app: mistral
+    spec:
+      containers:
+      - name: mistral-instruct-container
+        image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2 # Requesting 2 GPUs
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: al7binstruct
diff --git a/presets/test/manifests/mistral-7b/mistral-7b-service.yaml b/presets/test/manifests/mistral-7b/mistral-7b-service.yaml
new file mode 100644
index 000000000..650422c7c
--- /dev/null
+++ b/presets/test/manifests/mistral-7b/mistral-7b-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: mistral-7b
+spec:
+  selector:
+    app: mistral
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
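
A quick way to confirm what the new label-only selectors resolve to, using mistral-7b as the example (purely illustrative commands):

  kubectl get pods -l app=mistral -o wide   # pods the selector matches
  kubectl get endpoints mistral-7b          # addresses the service will route to
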
diff --git a/presets/test/manifests/mistral-7b/mistral-7b.yaml b/presets/test/manifests/mistral-7b/mistral-7b.yaml
new file mode 100644
index 000000000..287d435a7
--- /dev/null
+++ b/presets/test/manifests/mistral-7b/mistral-7b.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mistral-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral
+  template:
+    metadata:
+      labels:
+        app: mistral
+    spec:
+      containers:
+      - name: mistral-container
+        image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2 # Requesting 2 GPUs
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: mistral7b
diff --git a/presets/test/manifests/phi-2/phi-2-service.yaml b/presets/test/manifests/phi-2/phi-2-service.yaml
new file mode 100644
index 000000000..b81036bd8
--- /dev/null
+++ b/presets/test/manifests/phi-2/phi-2-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: phi-2
+spec:
+  selector:
+    app: phi-2
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: 5000
+  type: LoadBalancer
+  publishNotReadyAddresses: true
diff --git a/presets/test/manifests/phi-2/phi-2.yaml b/presets/test/manifests/phi-2/phi-2.yaml
new file mode 100644
index 000000000..b250d6248
--- /dev/null
+++ b/presets/test/manifests/phi-2/phi-2.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: phi-2
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-2
+  template:
+    metadata:
+      labels:
+        app: phi-2
+    spec:
+      containers:
+      - name: phi-2-container
+        image: REPO_HERE.azurecr.io/phi-2:TAG_HERE
+        command:
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference-api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2 # Requesting 2 GPUs
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 5000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi2
diff --git a/presets/test/manifests/playground/python.yaml b/presets/test/manifests/playground/python.yaml
new file mode 100644
index 000000000..7cbcb876e
--- /dev/null
+++ b/presets/test/manifests/playground/python.yaml
@@ -0,0 +1,30 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: python-gpu-app
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: python-gpu-app
+  template:
+    metadata:
+      labels:
+        app: python-gpu-app
+    spec:
+      containers:
+      - name: python-gpu-container
+        image: python:3.10-slim # Replace with your Docker image
+        command: ["/bin/sh"]
+        args: ["-c", "sleep infinity"]
+        resources:
+          limits:
+            nvidia.com/gpu: 2 # Requesting 2 GPUs
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
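
A hypothetical smoke test for the playground deployment: verify that the device plugin actually exposed GPUs to the container. nvidia-smi is typically injected by the NVIDIA container runtime; if the base image or runtime doesn't provide it, the first command fails and the device-file listing is a fallback:

  kubectl exec deploy/python-gpu-app -- nvidia-smi -L || \
    kubectl exec deploy/python-gpu-app -- ls /dev/ | grep -i nvidia
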