Commit: feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44
Showing 263 changed files with 71,525 additions and 0 deletions.
38 changes: 38 additions & 0 deletions
feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44/NVIDIA_A10-24Q/README.md
@@ -0,0 +1,38 @@
```
=================
Benchmark results
=================
                          fail  n       perf  sem%   std%  peak_memory          score  weight
bert-fp16                    4  4        NaN   NaN    NaN        24000            NaN    0.00
bert-fp32                    4  4        NaN   NaN    NaN        23304            NaN    0.00
bert-tf32                    4  4        NaN   NaN    NaN        23304            NaN    0.00
bert-tf32-fp16               4  4        NaN   NaN    NaN        24000            NaN    3.00
bf16                         0  4      91.87  0.1%   1.4%         3098     183.777391    0.00
convnext_large-fp16          4  4        NaN   NaN    NaN        24394            NaN    0.00
convnext_large-fp32          4  4        NaN   NaN    NaN        24430            NaN    0.00
convnext_large-tf32          4  4        NaN   NaN    NaN        24430            NaN    0.00
convnext_large-tf32-fp16     4  4        NaN   NaN    NaN        24470            NaN    3.00
davit_large                  4  4        NaN   NaN    NaN        24438            NaN    1.00
davit_large-multi            2  2        NaN   NaN    NaN        24366            NaN    5.00
dlrm                         0  2  376081.29  0.1%   1.4%         5996  376081.290012    1.00
focalnet                     0  4     146.78  1.0%  15.0%        24468     293.712272    2.00
fp16                         0  4      92.92  0.1%   1.1%         3098     185.826273    0.00
fp32                         0  4      15.61  0.1%   1.4%         3476      31.219423    0.00
llama                        4  4        NaN   NaN    NaN           -1            NaN    1.00
reformer                     4  4        NaN   NaN    NaN        23556            NaN    1.00
regnet_y_128gf               4  4        NaN   NaN    NaN        24450            NaN    2.00
resnet152                    4  4        NaN   NaN    NaN        24458            NaN    1.00
resnet152-multi              2  2        NaN   NaN    NaN        24470            NaN    5.00
resnet50                     0  4     546.80  0.5%   8.1%         5838    1094.496142    1.00
rwkv                         4  4        NaN   NaN    NaN         3976            NaN    1.00
stargan                      4  4        NaN   NaN    NaN        24384            NaN    1.00
super-slomo                  4  4        NaN   NaN    NaN        24458            NaN    1.00
t5                           4  4        NaN   NaN    NaN        24098            NaN    2.00
tf32                         0  4      44.61  0.1%   1.0%         3476      89.225443    0.00
whisper                      4  4        NaN   NaN    NaN        23124            NaN    1.00

Scores
------
Failure rate: 74.51% (FAIL)
Score: 2.65
```
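
Both summary figures follow from the table itself: 76 of the 102 runs failed, and the weighted rows aggregate to 2.65. Below is a minimal sketch that reproduces both numbers. The aggregation rule, a weighted geometric mean in which a failed benchmark counts as a score of 1 (log contribution 0), is inferred from the figures above rather than taken from milabench's documentation.

```python
import math

# (fail, n, score, weight) for each row of the table above; None marks NaN.
table = {
    "bert-fp16":                (4, 4, None,          0.00),
    "bert-fp32":                (4, 4, None,          0.00),
    "bert-tf32":                (4, 4, None,          0.00),
    "bert-tf32-fp16":           (4, 4, None,          3.00),
    "bf16":                     (0, 4, 183.777391,    0.00),
    "convnext_large-fp16":      (4, 4, None,          0.00),
    "convnext_large-fp32":      (4, 4, None,          0.00),
    "convnext_large-tf32":      (4, 4, None,          0.00),
    "convnext_large-tf32-fp16": (4, 4, None,          3.00),
    "davit_large":              (4, 4, None,          1.00),
    "davit_large-multi":        (2, 2, None,          5.00),
    "dlrm":                     (0, 2, 376081.290012, 1.00),
    "focalnet":                 (0, 4, 293.712272,    2.00),
    "fp16":                     (0, 4, 185.826273,    0.00),
    "fp32":                     (0, 4, 31.219423,     0.00),
    "llama":                    (4, 4, None,          1.00),
    "reformer":                 (4, 4, None,          1.00),
    "regnet_y_128gf":           (4, 4, None,          2.00),
    "resnet152":                (4, 4, None,          1.00),
    "resnet152-multi":          (2, 2, None,          5.00),
    "resnet50":                 (0, 4, 1094.496142,   1.00),
    "rwkv":                     (4, 4, None,          1.00),
    "stargan":                  (4, 4, None,          1.00),
    "super-slomo":              (4, 4, None,          1.00),
    "t5":                       (4, 4, None,          2.00),
    "tf32":                     (0, 4, 89.225443,     0.00),
    "whisper":                  (4, 4, None,          1.00),
}

# Failure rate counts every run, including zero-weight rows: 76 / 102.
fails = sum(f for f, _, _, _ in table.values())
runs = sum(n for _, n, _, _ in table.values())
print(f"Failure rate: {100 * fails / runs:.2f}%")  # -> 74.51%

# Weighted geometric mean: a failed benchmark (score None) counts as 1,
# adding nothing to the weighted log-sum while still counting in the weights.
total_weight = sum(w for _, _, _, w in table.values())  # 32.0
log_sum = sum(w * math.log(s) for _, _, s, w in table.values() if s is not None)
print(f"Score: {math.exp(log_sum / total_weight):.2f}")  # -> 2.65
```

Counting weighted failures as 1 rather than 0 keeps the geometric mean defined; that choice is the one consistent with the 2.65 reported above, since only dlrm, focalnet, and resnet50 contribute non-zero weighted log terms (exp(31.20 / 32) ≈ 2.65).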
1 change: 1 addition & 0 deletions
feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44/NVIDIA_A10-24Q/badge.svg
38 changes: 38 additions & 0 deletions
...96149e569230d8b45f44/NVIDIA_A10-24Q/tuzazolu.2024-04-04_18:28:45.589863/bert-fp16.D0.data
@@ -0,0 +1,38 @@
{"event": "config", "data": {"system": {"arch": "cuda", "sshkey": null, "nodes": [{"aliaslist": [], "hostname": "decentoriole.eastus2.cloudapp.azure.com", "ip": "decentoriole.eastus2.cloudapp.azure.com", "ipaddrlist": ["::1", "127.0.0.1", "fe80::20d:3aff:fee5:4875%eth0", "00:0d:3a:e5:48:75", "10.0.1.4", "00:00:00:00:00:00"], "key": "/Users/satyaortiz-gagne/.ssh/covalent-azure-task-a10_x2-50899b94affdf1b88f99eb0f84c745cf/id_rsa.covalent.decentoriole.pem", "local": true, "main": true, "name": "manager", "user": "ubuntu"}], "cloud_profiles": {"azure__a100": {"location": "eastus2", "size": "Standard_NC24ads_A100_v4", "username": "ubuntu"}, "azure__a10_x2": {"location": "eastus2", "size": "Standard_NV72ads_A10_v5", "username": "ubuntu"}}, "self": {"aliaslist": [], "hostname": "decentoriole.eastus2.cloudapp.azure.com", "ip": "decentoriole.eastus2.cloudapp.azure.com", "ipaddrlist": ["::1", "127.0.0.1", "fe80::20d:3aff:fee5:4875%eth0", "00:0d:3a:e5:48:75", "10.0.1.4", "00:00:00:00:00:00"], "key": "/Users/satyaortiz-gagne/.ssh/covalent-azure-task-a10_x2-50899b94affdf1b88f99eb0f84c745cf/id_rsa.covalent.decentoriole.pem", "local": true, "main": true, "name": "manager", "user": "ubuntu"}}, "dirs": {"base": "/Users/satyaortiz-gagne/travail/mila/milabench", "venv": "/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch", "data": "/Users/satyaortiz-gagne/travail/mila/milabench/data", "runs": "/Users/satyaortiz-gagne/travail/mila/milabench/runs", "extra": "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf", "cache": "/Users/satyaortiz-gagne/travail/mila/milabench/cache"}, "group": "hf", "install_group": "torch", "install_variant": "cuda", "run_name": "tuzazolu.2024-04-04_18:28:45.589863", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/config", "config_file": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/config/standard.yaml", "hash": "019421f815bb7f0a373a2409d1d1f1d1", "definition": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface", "argv": {"--precision": "fp16", "--num-workers": 8, "--model": "Bert", "--batch-size": 32}, "plan": {"method": "per_gpu"}, "tags": ["huggingface", "language-modeling", "nlp", "precision-showcase", "transformer"], "weight": 0.0, "name": "bert-fp16", "tag": ["bert-fp16", "D0"], "device": "0", "devices": ["0"], "env": {"CUDA_VISIBLE_DEVICES": "0"}}, "pipe": null} | ||
{"event": "meta", "data": {"cpu": {"count": 72, "brand": "AMD EPYC 74F3 24-Core Processor"}, "os": {"sysname": "Linux", "nodename": "decentoriole", "release": "6.5.0-1017-azure", "version": "#17~22.04.1-Ubuntu SMP Sat Mar 9 04:50:38 UTC 2024", "machine": "x86_64"}, "accelerators": {"arch": "cuda", "gpus": {"GPU-76b68080-f2ad-11ee-aef8-8f495b42d5c8": {"device": "0", "product": "NVIDIA A10-24Q", "memory": {"used": 2409.8125, "total": 24512.0}, "utilization": {"compute": 0, "memory": 0.09831154128590078}, "temperature": null, "power": null, "selection_variable": "CUDA_VISIBLE_DEVICES"}, "GPU-774f1700-f2ad-11ee-8e49-e4b9b9bf87ea": {"device": "1", "product": "NVIDIA A10-24Q", "memory": {"used": 2409.8125, "total": 24512.0}, "utilization": {"compute": 0, "memory": 0.09831154128590078}, "temperature": null, "power": null, "selection_variable": "CUDA_VISIBLE_DEVICES"}}}, "date": 1712255816.478905, "milabench": {"tag": "paice-v1-11-g010135f", "commit": "010135f53e9664ae61b596149e569230d8b45f44", "date": "2024-04-03 00:41:57 -0400"}, "pytorch": {"torch": "2.1.0+cu118", "compiler": "GCC 9.3", "cpp": "C++ Version: 201703", "intel": "Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications", "mkl": "OpenMP 201511 (a.k.a. OpenMP 4.5)", "openmp": "OpenMP 201511 (a.k.a. OpenMP 4.5)", "lapack": "LAPACK is enabled (usually provided by MKL)", "nnpack": "NNPACK is enabled", "cpu": "CPU capability usage: AVX2", "build_settings": {"BLAS_INFO": "mkl", "BUILD_TYPE": "Release", "CUDA_VERSION": "11.8", "CUDNN_VERSION": "8.7.0", "CXX_COMPILER": "/opt/rh/devtoolset-9/root/usr/bin/c++", "CXX_FLAGS": "-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-aligned-allocation-unavailable -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow", "LAPACK_INFO": "mkl", "PERF_WITH_AVX": "1", "PERF_WITH_AVX2": "1", "PERF_WITH_AVX512": "1", "TORCH_DISABLE_GPU_ASSERTS": "ON", "TORCH_VERSION": "2.1.0", "USE_CUDA": "ON", "USE_CUDNN": "ON", "USE_EXCEPTION_PTR": "1", "USE_GFLAGS": "OFF", "USE_GLOG": "OFF", "USE_MKL": "ON", "USE_MKLDNN": "ON", "USE_MPI": "OFF", "USE_NCCL": "1", "USE_NNPACK": "ON", "USE_OPENMP": "ON", "USE_ROCM": "OFF"}}}, "pipe": null} | ||
{"event": "start", "data": {"command": ["voir", "--config", "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf/voirconf-bert-fp16.D0-fb5679c624c0e6290d39628373b49ebc.json", "-m", "bench", "--precision", "fp16", "--num-workers", "8", "--model", "Bert", "--batch-size", "32"], "time": 1712255818.8367445}, "pipe": null} | ||
{"event": "phase", "data": {"name": "init"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "parse_args"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "load_script"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "run_script"}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "train", "progress": [0, 100000]}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "train", "loss": 10.480685234069824}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "train", "progress": [1, 100000]}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "main", "gpudata": {"0": {"memory": [23999.8125, 24512.0], "load": 0.17, "temperature": null, "power": null}}}, "pipe": "data"} | ||
{"event": "error", "data": {"type": "OutOfMemoryError", "message": "CUDA out of memory. Tried to allocate 1.86 GiB. GPU 0 has a total capacty of 23.73 GiB of which 512.19 MiB is free. Including non-PyTorch memory, this process has 21.08 GiB memory in use. Of the allocated memory 19.58 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "finalize"}, "pipe": "data"} | ||
{"event": "line", "data": "Traceback (most recent call last):\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/bin/voir\", line 8, in <module>\n", "pipe": "stderr"} | ||
{"event": "line", "data": " sys.exit(main())\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/cli.py\", line 124, in main\n", "pipe": "stderr"} | ||
{"event": "line", "data": " ov(sys.argv[1:] if argv is None else argv)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/phase.py\", line 334, in __call__\n", "pipe": "stderr"} | ||
{"event": "line", "data": " self._run(*args, **kwargs)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/overseer.py\", line 242, in _run\n", "pipe": "stderr"} | ||
{"event": "line", "data": " set_value(func())\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/scriptutils.py\", line 37, in <lambda>\n", "pipe": "stderr"} | ||
{"event": "line", "data": " return lambda: exec(mainsection, glb, glb)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 156, in <module>\n", "pipe": "stderr"} | ||
{"event": "line", "data": " main()\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 152, in main\n", "pipe": "stderr"} | ||
{"event": "line", "data": " runner.train()\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 70, in train\n", "pipe": "stderr"} | ||
{"event": "line", "data": " self.step(data)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 59, in step\n", "pipe": "stderr"} | ||
{"event": "line", "data": " self.amp_scaler.scale(loss).backward()\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/torch/_tensor.py\", line 492, in backward\n", "pipe": "stderr"} | ||
{"event": "line", "data": " torch.autograd.backward(\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/torch/autograd/__init__.py\", line 251, in backward\n", "pipe": "stderr"} | ||
{"event": "line", "data": " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", "pipe": "stderr"} | ||
{"event": "line", "data": "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.86 GiB. GPU 0 has a total capacty of 23.73 GiB of which 512.19 MiB is free. Including non-PyTorch memory, this process has 21.08 GiB memory in use. Of the allocated memory 19.58 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n", "pipe": "stderr"} | ||
{"event": "end", "data": {"command": ["voir", "--config", "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf/voirconf-bert-fp16.D0-fb5679c624c0e6290d39628373b49ebc.json", "-m", "bench", "--precision", "fp16", "--num-workers", "8", "--model", "Bert", "--batch-size", "32"], "time": 1712255823.3500848, "return_code": 1}, "pipe": null} |
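
What killed the run is visible in the log: `backward()` requested 1.86 GiB with only 512.19 MiB free on the 24 GiB A10-24Q slice, while 1.22 GiB sat reserved but unallocated. The error text itself suggests capping `max_split_size_mb`; a sketch of that knob follows (the 128 MiB value is illustrative, not taken from this run):

```python
import os

# PYTORCH_CUDA_ALLOC_CONF must be set before CUDA is first initialized,
# i.e. before the first CUDA tensor is created (in practice, export it
# before launching the benchmark). Capping max_split_size_mb limits block
# splitting in PyTorch's caching allocator, reducing the fragmentation
# the OOM message points at. 128 is an illustrative value.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
```

Given that 19.58 GiB was already allocated by PyTorch when the 1.86 GiB request failed, lowering the batch size from the 32 used in this run is the more reliable fix on this GPU profile.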