diff --git a/.github/workflows/debug-metadata.yml b/.github/workflows/debug-metadata.yml
index e9e1bc9c..159917ad 100644
--- a/.github/workflows/debug-metadata.yml
+++ b/.github/workflows/debug-metadata.yml
@@ -1,61 +1,33 @@
 name: Debug Metadata
 
 on:
-  # push:
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
+  push:
 
 jobs:
-  integration-tests:
-    name: Run TGI Integration Tests
-    runs-on:
-      group: gcp-ct5lp-hightpu-8t
-
+  debug-dind:
+    name: Run TGI tests
+    runs-on: gcp-ct5lp-hightpu-8t-usc1-public-211
+    container:
+      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla@sha256:8f1dcd5b03f993e4da5c20d17c77aff6a5f22d5455f8eb042d2e4b16ac460526
+      options: ${{ vars.V5_LITEPOD_8_ENV }} --shm-size "16gb" --privileged --ipc host -v /mnt/hf_cache:/mnt/cache/
+    env:
+      PJRT_DEVICE: TPU
     steps:
-      - name: Install DNS utilities
+      - name: Install docker
         run: |
-          sudo apt-get update -y
-          sudo apt-get install -y dnsutils iputils-ping curl
-
-
-
-      - name: Debug metadata
+          apt-get update -y
+          apt-get install -y docker.io
+      # The Python snippet below uses escaped double quotes because it sits inside
+      # a single-quoted bash -c string; a bare ' would end that string early.
+      - name: Debug XLA (run container)
         run: |
-          echo "nslookup metadata.google.internal"
-          nslookup metadata.google.internal
-
-          echo "ping -c 3 metadata.google.internal"
-          ping -c 3 metadata.google.internal
-
-          echo "ping -c 3 169.254.169.254"
-          ping -c 3 169.254.169.254
-
-          echo "Get the IP address of metadata.google.internal using getent"
-          # Get the IP address of metadata.google.internal using getent
-          METADATA_IP=$(getent hosts metadata.google.internal | awk '{ print $1 }')
-          echo "Metadata server IP address: ${METADATA_IP}"
-
-          echo "Fetching instance metadata directory:"
-          curl -s http://metadata.google.internal/computeMetadata/v1/instance/ -H "Metadata-Flavor: Google"
-          echo
-
-          STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://metadata.google.internal/computeMetadata/v1/instance/image" -H "Metadata-Flavor: Google")
-          RESPONSE=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/image" -H "Metadata-Flavor: Google")
-          echo "Status code: $STATUS_CODE"
-          echo "Metadata response: $RESPONSE"
-          if [ "$STATUS_CODE" -ne 200 ]; then
-            echo "Failed to fetch instance image metadata"
-            exit 1
-          fi
-
-          STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/tpu-env" -H "Metadata-Flavor: Google")
-          RESPONSE=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/attributes/tpu-env" -H "Metadata-Flavor: Google")
-          echo "Status code: $STATUS_CODE"
-          echo "Metadata response: $RESPONSE"
-          if [ "$STATUS_CODE" -ne 200 ]; then
-            echo "Failed to fetch TPU environment metadata"
-            exit 1
-          fi
+          docker run \
+            --privileged \
+            --ipc host \
+            --shm-size=16G \
+            -v /mnt/hf_cache:/mnt/cache/ \
+            -e PJRT_DEVICE=TPU \
+            us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla@sha256:8f1dcd5b03f993e4da5c20d17c77aff6a5f22d5455f8eb042d2e4b16ac460526 \
+            bash -c '
+              python -c "import torch_xla.core.xla_model as xm; assert xm.xla_device().type == \"xla\", \"XLA device not available\""
+            '
diff --git a/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml b/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
index 5107b1fd..232725a1 100644
--- a/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
+++ b/.github/workflows/test-pytorch-xla-tpu-tgi-integration.yml
@@ -1,7 +1,7 @@
 name: Optimum TPU / Test TGI on TPU / Integration Tests
 
 on:
-  push:
+  # push:
   pull_request:
     branches: [ main ]
     paths: