Commit
feature_cloud-ci-010135f53e9664ae61b596149e569230d8b45f44
satyaog committed Apr 9, 2024
1 parent 758f702 commit 8248260
Showing 263 changed files with 71,525 additions and 0 deletions.
@@ -0,0 +1,38 @@
```
=================
Benchmark results
=================
                          fail  n       perf   sem%   std%  peak_memory          score  weight
bert-fp16                    4  4        NaN    NaN    NaN        24000            NaN    0.00
bert-fp32                    4  4        NaN    NaN    NaN        23304            NaN    0.00
bert-tf32                    4  4        NaN    NaN    NaN        23304            NaN    0.00
bert-tf32-fp16               4  4        NaN    NaN    NaN        24000            NaN    3.00
bf16                         0  4      91.87   0.1%   1.4%         3098     183.777391    0.00
convnext_large-fp16          4  4        NaN    NaN    NaN        24394            NaN    0.00
convnext_large-fp32          4  4        NaN    NaN    NaN        24430            NaN    0.00
convnext_large-tf32          4  4        NaN    NaN    NaN        24430            NaN    0.00
convnext_large-tf32-fp16     4  4        NaN    NaN    NaN        24470            NaN    3.00
davit_large                  4  4        NaN    NaN    NaN        24438            NaN    1.00
davit_large-multi            2  2        NaN    NaN    NaN        24366            NaN    5.00
dlrm                         0  2  376081.29   0.1%   1.4%         5996  376081.290012    1.00
focalnet                     0  4     146.78   1.0%  15.0%        24468     293.712272    2.00
fp16                         0  4      92.92   0.1%   1.1%         3098     185.826273    0.00
fp32                         0  4      15.61   0.1%   1.4%         3476      31.219423    0.00
llama                        4  4        NaN    NaN    NaN           -1            NaN    1.00
reformer                     4  4        NaN    NaN    NaN        23556            NaN    1.00
regnet_y_128gf               4  4        NaN    NaN    NaN        24450            NaN    2.00
resnet152                    4  4        NaN    NaN    NaN        24458            NaN    1.00
resnet152-multi              2  2        NaN    NaN    NaN        24470            NaN    5.00
resnet50                     0  4     546.80   0.5%   8.1%         5838    1094.496142    1.00
rwkv                         4  4        NaN    NaN    NaN         3976            NaN    1.00
stargan                      4  4        NaN    NaN    NaN        24384            NaN    1.00
super-slomo                  4  4        NaN    NaN    NaN        24458            NaN    1.00
t5                           4  4        NaN    NaN    NaN        24098            NaN    2.00
tf32                         0  4      44.61   0.1%   1.0%         3476      89.225443    0.00
whisper                      4  4        NaN    NaN    NaN        23124            NaN    1.00
Scores
------
Failure rate: 74.51% (FAIL)
Score: 2.65
```
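Note on the summary above: the reported failure rate is consistent with dividing the total number of failed runs by the total number of runs across all rows, i.e. 76 / 102 ≈ 74.51%. The sketch below only illustrates that arithmetic under this assumption; it is not milabench's reporting code, and the (fail, n) pairs are copied directly from the table.

```python
# Minimal sketch (not milabench's reporting code): recompute the failure rate
# from the (fail, n) columns of the table above, assuming
#   failure rate = total failed runs / total runs.
fail_n = [
    (4, 4), (4, 4), (4, 4), (4, 4),          # bert-fp16 / fp32 / tf32 / tf32-fp16
    (0, 4),                                  # bf16
    (4, 4), (4, 4), (4, 4), (4, 4),          # convnext_large variants
    (4, 4), (2, 2),                          # davit_large, davit_large-multi
    (0, 2), (0, 4), (0, 4), (0, 4),          # dlrm, focalnet, fp16, fp32
    (4, 4), (4, 4), (4, 4), (4, 4), (2, 2),  # llama, reformer, regnet_y_128gf, resnet152, resnet152-multi
    (0, 4), (4, 4), (4, 4), (4, 4), (4, 4),  # resnet50, rwkv, stargan, super-slomo, t5
    (0, 4), (4, 4),                          # tf32, whisper
]
failed = sum(f for f, _ in fail_n)   # 76
total = sum(n for _, n in fail_n)    # 102
print(f"failure rate: {failed / total:.2%}")  # -> failure rate: 74.51%
```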
@@ -0,0 +1,38 @@
{"event": "config", "data": {"system": {"arch": "cuda", "sshkey": null, "nodes": [{"aliaslist": [], "hostname": "decentoriole.eastus2.cloudapp.azure.com", "ip": "decentoriole.eastus2.cloudapp.azure.com", "ipaddrlist": ["::1", "127.0.0.1", "fe80::20d:3aff:fee5:4875%eth0", "00:0d:3a:e5:48:75", "10.0.1.4", "00:00:00:00:00:00"], "key": "/Users/satyaortiz-gagne/.ssh/covalent-azure-task-a10_x2-50899b94affdf1b88f99eb0f84c745cf/id_rsa.covalent.decentoriole.pem", "local": true, "main": true, "name": "manager", "user": "ubuntu"}], "cloud_profiles": {"azure__a100": {"location": "eastus2", "size": "Standard_NC24ads_A100_v4", "username": "ubuntu"}, "azure__a10_x2": {"location": "eastus2", "size": "Standard_NV72ads_A10_v5", "username": "ubuntu"}}, "self": {"aliaslist": [], "hostname": "decentoriole.eastus2.cloudapp.azure.com", "ip": "decentoriole.eastus2.cloudapp.azure.com", "ipaddrlist": ["::1", "127.0.0.1", "fe80::20d:3aff:fee5:4875%eth0", "00:0d:3a:e5:48:75", "10.0.1.4", "00:00:00:00:00:00"], "key": "/Users/satyaortiz-gagne/.ssh/covalent-azure-task-a10_x2-50899b94affdf1b88f99eb0f84c745cf/id_rsa.covalent.decentoriole.pem", "local": true, "main": true, "name": "manager", "user": "ubuntu"}}, "dirs": {"base": "/Users/satyaortiz-gagne/travail/mila/milabench", "venv": "/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch", "data": "/Users/satyaortiz-gagne/travail/mila/milabench/data", "runs": "/Users/satyaortiz-gagne/travail/mila/milabench/runs", "extra": "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf", "cache": "/Users/satyaortiz-gagne/travail/mila/milabench/cache"}, "group": "hf", "install_group": "torch", "install_variant": "cuda", "run_name": "tuzazolu.2024-04-04_18:28:45.589863", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/config", "config_file": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/config/standard.yaml", "hash": "019421f815bb7f0a373a2409d1d1f1d1", "definition": "/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface", "argv": {"--precision": "fp16", "--num-workers": 8, "--model": "Bert", "--batch-size": 32}, "plan": {"method": "per_gpu"}, "tags": ["huggingface", "language-modeling", "nlp", "precision-showcase", "transformer"], "weight": 0.0, "name": "bert-fp16", "tag": ["bert-fp16", "D0"], "device": "0", "devices": ["0"], "env": {"CUDA_VISIBLE_DEVICES": "0"}}, "pipe": null}
{"event": "meta", "data": {"cpu": {"count": 72, "brand": "AMD EPYC 74F3 24-Core Processor"}, "os": {"sysname": "Linux", "nodename": "decentoriole", "release": "6.5.0-1017-azure", "version": "#17~22.04.1-Ubuntu SMP Sat Mar 9 04:50:38 UTC 2024", "machine": "x86_64"}, "accelerators": {"arch": "cuda", "gpus": {"GPU-76b68080-f2ad-11ee-aef8-8f495b42d5c8": {"device": "0", "product": "NVIDIA A10-24Q", "memory": {"used": 2409.8125, "total": 24512.0}, "utilization": {"compute": 0, "memory": 0.09831154128590078}, "temperature": null, "power": null, "selection_variable": "CUDA_VISIBLE_DEVICES"}, "GPU-774f1700-f2ad-11ee-8e49-e4b9b9bf87ea": {"device": "1", "product": "NVIDIA A10-24Q", "memory": {"used": 2409.8125, "total": 24512.0}, "utilization": {"compute": 0, "memory": 0.09831154128590078}, "temperature": null, "power": null, "selection_variable": "CUDA_VISIBLE_DEVICES"}}}, "date": 1712255816.478905, "milabench": {"tag": "paice-v1-11-g010135f", "commit": "010135f53e9664ae61b596149e569230d8b45f44", "date": "2024-04-03 00:41:57 -0400"}, "pytorch": {"torch": "2.1.0+cu118", "compiler": "GCC 9.3", "cpp": "C++ Version: 201703", "intel": "Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications", "mkl": "OpenMP 201511 (a.k.a. OpenMP 4.5)", "openmp": "OpenMP 201511 (a.k.a. OpenMP 4.5)", "lapack": "LAPACK is enabled (usually provided by MKL)", "nnpack": "NNPACK is enabled", "cpu": "CPU capability usage: AVX2", "build_settings": {"BLAS_INFO": "mkl", "BUILD_TYPE": "Release", "CUDA_VERSION": "11.8", "CUDNN_VERSION": "8.7.0", "CXX_COMPILER": "/opt/rh/devtoolset-9/root/usr/bin/c++", "CXX_FLAGS": "-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-aligned-allocation-unavailable -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow", "LAPACK_INFO": "mkl", "PERF_WITH_AVX": "1", "PERF_WITH_AVX2": "1", "PERF_WITH_AVX512": "1", "TORCH_DISABLE_GPU_ASSERTS": "ON", "TORCH_VERSION": "2.1.0", "USE_CUDA": "ON", "USE_CUDNN": "ON", "USE_EXCEPTION_PTR": "1", "USE_GFLAGS": "OFF", "USE_GLOG": "OFF", "USE_MKL": "ON", "USE_MKLDNN": "ON", "USE_MPI": "OFF", "USE_NCCL": "1", "USE_NNPACK": "ON", "USE_OPENMP": "ON", "USE_ROCM": "OFF"}}}, "pipe": null}
{"event": "start", "data": {"command": ["voir", "--config", "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf/voirconf-bert-fp16.D0-fb5679c624c0e6290d39628373b49ebc.json", "-m", "bench", "--precision", "fp16", "--num-workers", "8", "--model", "Bert", "--batch-size", "32"], "time": 1712255818.8367445}, "pipe": null}
{"event": "phase", "data": {"name": "init"}, "pipe": "data"}
{"event": "phase", "data": {"name": "parse_args"}, "pipe": "data"}
{"event": "phase", "data": {"name": "load_script"}, "pipe": "data"}
{"event": "phase", "data": {"name": "run_script"}, "pipe": "data"}
{"event": "data", "data": {"task": "train", "progress": [0, 100000]}, "pipe": "data"}
{"event": "data", "data": {"task": "train", "loss": 10.480685234069824}, "pipe": "data"}
{"event": "data", "data": {"task": "train", "progress": [1, 100000]}, "pipe": "data"}
{"event": "data", "data": {"task": "main", "gpudata": {"0": {"memory": [23999.8125, 24512.0], "load": 0.17, "temperature": null, "power": null}}}, "pipe": "data"}
{"event": "error", "data": {"type": "OutOfMemoryError", "message": "CUDA out of memory. Tried to allocate 1.86 GiB. GPU 0 has a total capacty of 23.73 GiB of which 512.19 MiB is free. Including non-PyTorch memory, this process has 21.08 GiB memory in use. Of the allocated memory 19.58 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"}, "pipe": "data"}
{"event": "phase", "data": {"name": "finalize"}, "pipe": "data"}
{"event": "line", "data": "Traceback (most recent call last):\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/bin/voir\", line 8, in <module>\n", "pipe": "stderr"}
{"event": "line", "data": " sys.exit(main())\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/cli.py\", line 124, in main\n", "pipe": "stderr"}
{"event": "line", "data": " ov(sys.argv[1:] if argv is None else argv)\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/phase.py\", line 334, in __call__\n", "pipe": "stderr"}
{"event": "line", "data": " self._run(*args, **kwargs)\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/overseer.py\", line 242, in _run\n", "pipe": "stderr"}
{"event": "line", "data": " set_value(func())\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/voir/scriptutils.py\", line 37, in <lambda>\n", "pipe": "stderr"}
{"event": "line", "data": " return lambda: exec(mainsection, glb, glb)\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 156, in <module>\n", "pipe": "stderr"}
{"event": "line", "data": " main()\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 152, in main\n", "pipe": "stderr"}
{"event": "line", "data": " runner.train()\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 70, in train\n", "pipe": "stderr"}
{"event": "line", "data": " self.step(data)\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/CODE/milabench/benchmarks/huggingface/bench/__main__.py\", line 59, in step\n", "pipe": "stderr"}
{"event": "line", "data": " self.amp_scaler.scale(loss).backward()\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/torch/_tensor.py\", line 492, in backward\n", "pipe": "stderr"}
{"event": "line", "data": " torch.autograd.backward(\n", "pipe": "stderr"}
{"event": "line", "data": " File \"/mnt/Users/satyaortiz-gagne/travail/mila/milabench/venv/torch/lib/python3.10/site-packages/torch/autograd/__init__.py\", line 251, in backward\n", "pipe": "stderr"}
{"event": "line", "data": " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", "pipe": "stderr"}
{"event": "line", "data": "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.86 GiB. GPU 0 has a total capacty of 23.73 GiB of which 512.19 MiB is free. Including non-PyTorch memory, this process has 21.08 GiB memory in use. Of the allocated memory 19.58 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n", "pipe": "stderr"}
{"event": "end", "data": {"command": ["voir", "--config", "/Users/satyaortiz-gagne/travail/mila/milabench/extra/hf/voirconf-bert-fp16.D0-fb5679c624c0e6290d39628373b49ebc.json", "-m", "bench", "--precision", "fp16", "--num-workers", "8", "--model", "Bert", "--batch-size", "32"], "time": 1712255823.3500848, "return_code": 1}, "pipe": null}