Commit: feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44
Showing 263 changed files with 71,525 additions and 0 deletions.
38 changes: 38 additions & 0 deletions
feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44/NVIDIA_A10-24Q/README.md
@@ -0,0 +1,38 @@
```
=================
Benchmark results
=================
                          fail  n       perf  sem%   std%  peak_memory          score  weight
bert-fp16                    4  4        NaN   NaN    NaN        24000            NaN    0.00
bert-fp32                    4  4        NaN   NaN    NaN        23304            NaN    0.00
bert-tf32                    4  4        NaN   NaN    NaN        23304            NaN    0.00
bert-tf32-fp16               4  4        NaN   NaN    NaN        24000            NaN    3.00
bf16                         0  4      91.87  0.1%   1.4%         3098     183.777391    0.00
convnext_large-fp16          4  4        NaN   NaN    NaN        24394            NaN    0.00
convnext_large-fp32          4  4        NaN   NaN    NaN        24430            NaN    0.00
convnext_large-tf32          4  4        NaN   NaN    NaN        24430            NaN    0.00
convnext_large-tf32-fp16     4  4        NaN   NaN    NaN        24470            NaN    3.00
davit_large                  4  4        NaN   NaN    NaN        24438            NaN    1.00
davit_large-multi            2  2        NaN   NaN    NaN        24366            NaN    5.00
dlrm                         0  2  376081.29  0.1%   1.4%         5996  376081.290012    1.00
focalnet                     0  4     146.78  1.0%  15.0%        24468     293.712272    2.00
fp16                         0  4      92.92  0.1%   1.1%         3098     185.826273    0.00
fp32                         0  4      15.61  0.1%   1.4%         3476      31.219423    0.00
llama                        4  4        NaN   NaN    NaN           -1            NaN    1.00
reformer                     4  4        NaN   NaN    NaN        23556            NaN    1.00
regnet_y_128gf               4  4        NaN   NaN    NaN        24450            NaN    2.00
resnet152                    4  4        NaN   NaN    NaN        24458            NaN    1.00
resnet152-multi              2  2        NaN   NaN    NaN        24470            NaN    5.00
resnet50                     0  4     546.80  0.5%   8.1%         5838    1094.496142    1.00
rwkv                         4  4        NaN   NaN    NaN         3976            NaN    1.00
stargan                      4  4        NaN   NaN    NaN        24384            NaN    1.00
super-slomo                  4  4        NaN   NaN    NaN        24458            NaN    1.00
t5                           4  4        NaN   NaN    NaN        24098            NaN    2.00
tf32                         0  4      44.61  0.1%   1.0%         3476      89.225443    0.00
whisper                      4  4        NaN   NaN    NaN        23124            NaN    1.00

Scores
------
Failure rate: 74.51% (FAIL)
Score: 2.65
```
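
Both summary figures follow from the table itself: 76 of the 102 runs failed, and the weighted rows aggregate to 2.65. Below is a minimal sketch that reproduces both numbers. The aggregation rule, a weighted geometric mean in which a failed benchmark counts as a score of 1 (log contribution 0), is inferred from the figures above rather than taken from milabench's documentation.

```python
import math

# (fail, n, score, weight) for each row of the table above; None marks NaN.
table = {
    "bert-fp16":                (4, 4, None,          0.00),
    "bert-fp32":                (4, 4, None,          0.00),
    "bert-tf32":                (4, 4, None,          0.00),
    "bert-tf32-fp16":           (4, 4, None,          3.00),
    "bf16":                     (0, 4, 183.777391,    0.00),
    "convnext_large-fp16":      (4, 4, None,          0.00),
    "convnext_large-fp32":      (4, 4, None,          0.00),
    "convnext_large-tf32":      (4, 4, None,          0.00),
    "convnext_large-tf32-fp16": (4, 4, None,          3.00),
    "davit_large":              (4, 4, None,          1.00),
    "davit_large-multi":        (2, 2, None,          5.00),
    "dlrm":                     (0, 2, 376081.290012, 1.00),
    "focalnet":                 (0, 4, 293.712272,    2.00),
    "fp16":                     (0, 4, 185.826273,    0.00),
    "fp32":                     (0, 4, 31.219423,     0.00),
    "llama":                    (4, 4, None,          1.00),
    "reformer":                 (4, 4, None,          1.00),
    "regnet_y_128gf":           (4, 4, None,          2.00),
    "resnet152":                (4, 4, None,          1.00),
    "resnet152-multi":          (2, 2, None,          5.00),
    "resnet50":                 (0, 4, 1094.496142,   1.00),
    "rwkv":                     (4, 4, None,          1.00),
    "stargan":                  (4, 4, None,          1.00),
    "super-slomo":              (4, 4, None,          1.00),
    "t5":                       (4, 4, None,          2.00),
    "tf32":                     (0, 4, 89.225443,     0.00),
    "whisper":                  (4, 4, None,          1.00),
}

# Failure rate counts every run, including zero-weight rows: 76 / 102.
fails = sum(f for f, _, _, _ in table.values())
runs = sum(n for _, n, _, _ in table.values())
print(f"Failure rate: {100 * fails / runs:.2f}%")  # -> 74.51%

# Weighted geometric mean: a failed benchmark (score None) counts as 1,
# adding nothing to the weighted log-sum while still counting in the weights.
total_weight = sum(w for _, _, _, w in table.values())  # 32.0
log_sum = sum(w * math.log(s) for _, _, s, w in table.values() if s is not None)
print(f"Score: {math.exp(log_sum / total_weight):.2f}")  # -> 2.65
```

Counting weighted failures as 1 rather than 0 keeps the geometric mean defined; that choice is the one consistent with the 2.65 reported above, since only dlrm, focalnet, and resnet50 contribute non-zero weighted log terms (exp(31.20 / 32) ≈ 2.65).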
1 change: 1 addition & 0 deletions
feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44/NVIDIA_A10-24Q/badge.svg
38 changes: 38 additions & 0 deletions
...96149e569230d8b45f44/NVIDIA_A10-24Q/tuzazolu.2024-04-04_18:28:45.589863/bert-fp16.D0.data
@@ -0,0 +1,38 @@
{"event": "config", "data": {"system": {"arch": "cuda", "sshkey": null, "nodes": [{"aliaslist": [], "hostname": "decentoriole.eastus2.cloudapp.azure.com", "ip": "decentoriole.eastus2.cloudapp.azure.com", "ipaddrlist": ["::1", "127.0.0.1", "fe80::20d:3aff:fee5:4875%eth0", "00:0d:3a:e5:48:75", "10.0.1.4", "00:00:00:00:00:00"], "key": "/Users/satyaortiz-gagne/.ssh/covalent-azure-task-a10_x2-50899b94affdf1b88f99eb0f84c745cf/id_rsa.covalent.decentoriole.pem", "local": true, "main": true, "name": "manager", "user": "ubuntu"}], "cloud_profiles": {"azure__a100": {"location": "eastus2", "size": "Standard_NC24ads_A100_v4", "username": "ubuntu"}, "azure__a10_x2": {"location": "eastus2", "size": "Standard_NV72ads_A10_v5", "username": "ubuntu"}}, "self": {"aliaslist": [], "hostname": "decentoriole.eastus2.cloudapp.azure.com", "ip": "decentoriole.eastus2.cloudapp.azure.com", "ipaddrlist": ["::1", "127.0.0.1", "fe80::20d:3aff:fee5:4875%eth0", "00:0d:3a:e5:48:75", "10.0.1.4", "00:00:00:00:00:00"], "key": "/Users/satyaortiz-gagne/.ssh/covalent-azure-task-a10_x2-50899b94affdf1b88f99eb0f84c745cf/id_rsa.covalent.decentoriole.pem", "local": true, "main": true, "name": "manager", "user": "ubuntu"}}, "dirs": {"base": "/Users/satyaortiz-gagne/travail/mila/milabench", "venv": "/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch", "data": "/Users/satyaortiz-gagne/travail/mila/milabench/data", "runs": "/Users/satyaortiz-gagne/travail/mila/milabench/runs", "extra": "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf", "cache": "/Users/satyaortiz-gagne/travail/mila/milabench/cache"}, "group": "hf", "install_group": "torch", "install_variant": "cuda", "run_name": "tuzazolu.2024-04-04_18:28:45.589863", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/config", "config_file": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/config/standard.yaml", "hash": "019421f815bb7f0a373a2409d1d1f1d1", "definition": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface", "argv": {"--precision": "fp16", "--num-workers": 8, "--model": "Bert", "--batch-size": 32}, "plan": {"method": "per_gpu"}, "tags": ["huggingface", "language-modeling", "nlp", "precision-showcase", "transformer"], "weight": 0.0, "name": "bert-fp16", "tag": ["bert-fp16", "D0"], "device": "0", "devices": ["0"], "env": {"CUDA_VISIBLE_DEVICES": "0"}}, "pipe": null} | ||
{"event": "meta", "data": {"cpu": {"count": 72, "brand": "AMD EPYC 74F3 24-Core Processor"}, "os": {"sysname": "Linux", "nodename": "decentoriole", "release": "6.5.0-1017-azure", "version": "#17~22.04.1-Ubuntu SMP Sat Mar 9 04:50:38 UTC 2024", "machine": "x86_64"}, "accelerators": {"arch": "cuda", "gpus": {"GPU-76b68080-f2ad-11ee-aef8-8f495b42d5c8": {"device": "0", "product": "NVIDIA A10-24Q", "memory": {"used": 2409.8125, "total": 24512.0}, "utilization": {"compute": 0, "memory": 0.09831154128590078}, "temperature": null, "power": null, "selection_variable": "CUDA_VISIBLE_DEVICES"}, "GPU-774f1700-f2ad-11ee-8e49-e4b9b9bf87ea": {"device": "1", "product": "NVIDIA A10-24Q", "memory": {"used": 2409.8125, "total": 24512.0}, "utilization": {"compute": 0, "memory": 0.09831154128590078}, "temperature": null, "power": null, "selection_variable": "CUDA_VISIBLE_DEVICES"}}}, "date": 1712255816.478905, "milabench": {"tag": "paice-v1-11-g010135f", "commit": "010135f53e9664ae61b596149e569230d8b45f44", "date": "2024-04-03 00:41:57 -0400"}, "pytorch": {"torch": "2.1.0+cu118", "compiler": "GCC 9.3", "cpp": "C++ Version: 201703", "intel": "Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications", "mkl": "OpenMP 201511 (a.k.a. OpenMP 4.5)", "openmp": "OpenMP 201511 (a.k.a. OpenMP 4.5)", "lapack": "LAPACK is enabled (usually provided by MKL)", "nnpack": "NNPACK is enabled", "cpu": "CPU capability usage: AVX2", "build_settings": {"BLAS_INFO": "mkl", "BUILD_TYPE": "Release", "CUDA_VERSION": "11.8", "CUDNN_VERSION": "8.7.0", "CXX_COMPILER": "/opt/rh/devtoolset-9/root/usr/bin/c++", "CXX_FLAGS": "-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-aligned-allocation-unavailable -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow", "LAPACK_INFO": "mkl", "PERF_WITH_AVX": "1", "PERF_WITH_AVX2": "1", "PERF_WITH_AVX512": "1", "TORCH_DISABLE_GPU_ASSERTS": "ON", "TORCH_VERSION": "2.1.0", "USE_CUDA": "ON", "USE_CUDNN": "ON", "USE_EXCEPTION_PTR": "1", "USE_GFLAGS": "OFF", "USE_GLOG": "OFF", "USE_MKL": "ON", "USE_MKLDNN": "ON", "USE_MPI": "OFF", "USE_NCCL": "1", "USE_NNPACK": "ON", "USE_OPENMP": "ON", "USE_ROCM": "OFF"}}}, "pipe": null} | ||
{"event": "start", "data": {"command": ["voir", "--config", "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf/voirconf-bert-fp16.D0-fb5679c624c0e6290d39628373b49ebc.json", "-m", "bench", "--precision", "fp16", "--num-workers", "8", "--model", "Bert", "--batch-size", "32"], "time": 1712255818.8367445}, "pipe": null} | ||
{"event": "phase", "data": {"name": "init"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "parse_args"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "load_script"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "run_script"}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "train", "progress": [0, 100000]}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "train", "loss": 10.480685234069824}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "train", "progress": [1, 100000]}, "pipe": "data"} | ||
{"event": "data", "data": {"task": "main", "gpudata": {"0": {"memory": [23999.8125, 24512.0], "load": 0.17, "temperature": null, "power": null}}}, "pipe": "data"} | ||
{"event": "error", "data": {"type": "OutOfMemoryError", "message": "CUDA out of memory. Tried to allocate 1.86 GiB. GPU 0 has a total capacty of 23.73 GiB of which 512.19 MiB is free. Including non-PyTorch memory, this process has 21.08 GiB memory in use. Of the allocated memory 19.58 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"}, "pipe": "data"} | ||
{"event": "phase", "data": {"name": "finalize"}, "pipe": "data"} | ||
{"event": "line", "data": "Traceback (most recent call last):\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/bin/voir\", line 8, in <module>\n", "pipe": "stderr"} | ||
{"event": "line", "data": " sys.exit(main())\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/cli.py\", line 124, in main\n", "pipe": "stderr"} | ||
{"event": "line", "data": " ov(sys.argv[1:] if argv is None else argv)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/phase.py\", line 334, in __call__\n", "pipe": "stderr"} | ||
{"event": "line", "data": " self._run(*args, **kwargs)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/overseer.py\", line 242, in _run\n", "pipe": "stderr"} | ||
{"event": "line", "data": " set_value(func())\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/scriptutils.py\", line 37, in <lambda>\n", "pipe": "stderr"} | ||
{"event": "line", "data": " return lambda: exec(mainsection, glb, glb)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 156, in <module>\n", "pipe": "stderr"} | ||
{"event": "line", "data": " main()\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 152, in main\n", "pipe": "stderr"} | ||
{"event": "line", "data": " runner.train()\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 70, in train\n", "pipe": "stderr"} | ||
{"event": "line", "data": " self.step(data)\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 59, in step\n", "pipe": "stderr"} | ||
{"event": "line", "data": " self.amp_scaler.scale(loss).backward()\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/torch/_tensor.py\", line 492, in backward\n", "pipe": "stderr"} | ||
{"event": "line", "data": " torch.autograd.backward(\n", "pipe": "stderr"} | ||
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/torch/autograd/__init__.py\", line 251, in backward\n", "pipe": "stderr"} | ||
{"event": "line", "data": " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", "pipe": "stderr"} | ||
{"event": "line", "data": "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.86 GiB. GPU 0 has a total capacty of 23.73 GiB of which 512.19 MiB is free. Including non-PyTorch memory, this process has 21.08 GiB memory in use. Of the allocated memory 19.58 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n", "pipe": "stderr"} | ||
{"event": "end", "data": {"command": ["voir", "--config", "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf/voirconf-bert-fp16.D0-fb5679c624c0e6290d39628373b49ebc.json", "-m", "bench", "--precision", "fp16", "--num-workers", "8", "--model", "Bert", "--batch-size", "32"], "time": 1712255823.3500848, "return_code": 1}, "pipe": null} |
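
What killed the run is visible in the log: `backward()` requested 1.86 GiB with only 512.19 MiB free on the 24 GiB A10-24Q slice, while 1.22 GiB sat reserved but unallocated. The error text itself suggests capping `max_split_size_mb`; a sketch of that knob follows (the 128 MiB value is illustrative, not taken from this run):

```python
import os

# PYTORCH_CUDA_ALLOC_CONF must be set before CUDA is first initialized,
# i.e. before the first CUDA tensor is created (in practice, export it
# before launching the benchmark). Capping max_split_size_mb limits block
# splitting in PyTorch's caching allocator, reducing the fragmentation
# the OOM message points at. 128 is an illustrative value.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
```

Given that 19.58 GiB was already allocated by PyTorch when the 1.86 GiB request failed, lowering the batch size from the 32 used in this run is the more reliable fix on this GPU profile.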